def future_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    Do an efficient update of the weights given the two spike-trains.
    This isn't actually implemented as an efficient update, but it will produce the identical result as if it were.
    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    rx = kd_x/as_floatx(kp_x+kd_x)
    re = kd_e/as_floatx(kp_e+kd_e)
    scale = (1./as_floatx(kp_x*kp_e + kp_x*kd_e + kd_x*kp_e))
    n_samples, n_in, n_out = shapes
    x_past_var = create_shared_variable(np.zeros((n_samples, n_in)))
    e_past_var = create_shared_variable(np.zeros((n_samples, n_out)))
    x_past = x_past_var*rx
    e_past = e_past_var*re
    w_grad = scale * (xs.T.dot(e_past+es) + x_past.T.dot(es))
    add_update(x_past_var, x_past + xs)
    add_update(e_past_var, e_past + es)
    return w_grad
def __init__(self, w, b=0, normalize_minibatch=False, scale=False, use_bias=True):
    """
    :param w: Initial weight value.  Can be:
        - A numpy array, in which case a shared variable is instantiated from this data.
        - A symbolic variable that is either a shared variable or descended from a shared variable.
          This is used when there are shared parameters.
    :param b: Can be:
        - A numpy vector representing the initial bias on the hidden layer, where len(b) = w.shape[1]
        - A scalar, which just initializes the full vector to this value
    :param normalize_minibatch: Set to True to normalize over the minibatch.  This has been shown to cause better optimization
    :param scale: Set to True to include a scale term (per output).  Generally this only makes sense if normalize_minibatch is True.
    :param use_bias: Use a bias term?  Generally, the answer is "True": a bias term helps.
    """
    self.w = create_shared_variable(w, name='w')
    self.b = create_shared_variable(b, shape=w.shape[1] if w.ndim == 2 else (w.shape[0], w.shape[2]) if w.ndim == 3 else bad_value(w.shape), name='b')
    self.log_scale = create_shared_variable(0 if scale else None, shape=w.shape[1], name='log_scale') if scale else None
    self.normalizer = \
        batch_normalize if normalize_minibatch is True else \
        None if normalize_minibatch is False else \
        normalize_minibatch
    self._use_bias = use_bias
def matrix_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes, epsilon=1e-7):
    """
    :param xs:
    :param es:
    :param kp_x:
    :param kd_x:
    :param kp_e:
    :param kd_e:
    :param shapes:
    :param epsilon:
    :return:
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    v1 = create_shared_variable(np.zeros((n_samples, n_in, n_out)))
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))
    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    xr_decayed = xr*rx
    er_decayed = er*re
    spikes = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])
    v2 = xr_decayed[:, :, None]*er_decayed[:, None, :]
    dws = (spikes*(v2-v1))/(rx*re-1)
    new_xr = xr_decayed + xs/(kp_x+kd_x)
    new_er = er_decayed + es/(kp_e+kd_e)
    add_update(v1, tt.switch(spikes, new_xr[:, :, None]*new_er[:, None, :], v1))
    add_update(xr, new_xr)
    add_update(er, new_er)
    return dws.sum(axis=0)
def future_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    Do an efficient update of the weights given the two spike-trains.
    This isn't actually implemented as an efficient update, but it will produce the identical result as if it were.
    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    rx = kd_x / as_floatx(kp_x + kd_x)
    re = kd_e / as_floatx(kp_e + kd_e)
    scale = (1. / as_floatx(kp_x * kp_e + kp_x * kd_e + kd_x * kp_e))
    n_samples, n_in, n_out = shapes
    x_past_var = create_shared_variable(np.zeros((n_samples, n_in)))
    e_past_var = create_shared_variable(np.zeros((n_samples, n_out)))
    x_past = x_past_var * rx
    e_past = e_past_var * re
    w_grad = scale * (xs.T.dot(e_past + es) + x_past.T.dot(es))
    add_update(x_past_var, x_past + xs)
    add_update(e_past_var, e_past + es)
    return w_grad
def past_weight_grad_step(xs, es, kp_x, kd_x, kp_e, kd_e, shape, dws=None):
    """
    Do an efficient update of the weights given the two spike-trains.  (This still runs FING SLOWLY!)
    :param xs: An (n_in) vector
    :param es: An (n_out) vector
    :param kp_x:
    :param kd_x:
    :param kp_e:
    :param kd_e:
    :param shape: (n_in, n_out)
    :param dws: Optionally, an existing (n_in, n_out) gradient accumulator to add into.
    :return:
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_in, n_out = shape
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)
    tx_last = create_shared_variable(np.zeros(n_in)+1)
    te_last = create_shared_variable(np.zeros(n_out)+1)
    x_last = create_shared_variable(np.zeros(n_in))
    e_last = create_shared_variable(np.zeros(n_out))
    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    x_spike_ixs, = tt.nonzero(x_spikes)
    e_spike_ixs, = tt.nonzero(e_spikes)
    if dws is None:
        dws = tt.zeros(shape)
    t_last = tt.minimum(tx_last[x_spike_ixs, None], te_last)  # (n_x_spikes, n_out)
    dws = tt.inc_subtensor(dws[x_spike_ixs, :], x_last[x_spike_ixs, None]*e_last
        * rx**(tx_last[x_spike_ixs, None]-t_last)
        * re**(te_last[None, :]-t_last)
        * geoseries_sum(re*rx, t_end=t_last, t_start=1)
        )
    new_x_last = tt.set_subtensor(x_last[x_spike_ixs], x_last[x_spike_ixs]*rx**tx_last[x_spike_ixs] + xs[x_spike_ixs]/as_floatx(kd_x))
    new_tx_last = tt.switch(x_spikes, 0, tx_last)
    t_last = tt.minimum(new_tx_last[:, None], te_last[e_spike_ixs])  # (n_in, n_e_spikes)
    dws = tt.inc_subtensor(dws[:, e_spike_ixs], new_x_last[:, None]*e_last[e_spike_ixs]
        * rx**(new_tx_last[:, None]-t_last)
        * re**(te_last[None, e_spike_ixs]-t_last)
        * geoseries_sum(re*rx, t_end=t_last, t_start=1)
        )
    add_update(x_last, new_x_last)
    add_update(e_last, tt.set_subtensor(e_last[e_spike_ixs], e_last[e_spike_ixs]*re**te_last[e_spike_ixs] + es[e_spike_ixs]/as_floatx(kd_e)))
    add_update(tx_last, new_tx_last+1)
    add_update(te_last, tt.switch(e_spikes, 1, te_last+1))
    return dws
def __call__(self, x):
    # x should have a minibatch dimension of size 1.
    assert x.tag.test_value.shape[0] == 1, "This method only works for minibatches of size 1, but you used a minibatch of size: %s" % (x.tag.test_value.shape[0])
    running_mean = create_shared_variable(np.zeros(x.tag.test_value.shape[1:]))
    running_mean_sq = create_shared_variable(np.zeros(x.tag.test_value.shape[1:]))
    new_running_mean = running_mean * self.decay_constant + x[0] * (1-self.decay_constant).astype(theano.config.floatX)
    new_running_mean_sq = running_mean_sq * self.decay_constant + (x[0]**2) * (1-self.decay_constant).astype(theano.config.floatX)
    add_update(running_mean, new_running_mean)
    add_update(running_mean_sq, new_running_mean_sq)
    running_std = tt.sqrt((new_running_mean_sq - new_running_mean**2))
    return (x - running_mean)/(running_std+1e-7)
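# --- Illustrative sketch (not part of the original code): the same exponential running
# normalization in plain numpy.  The generator name and the decay value are assumptions
# for the demo; like the Theano method above, it normalizes by the pre-update mean and
# the post-update standard deviation.
import numpy as np

def running_normalize_demo(samples, decay=0.99, eps=1e-7):
    """samples: an iterable of equally-shaped arrays.  Yields each sample standardized by
    exponential moving estimates of the mean and second moment."""
    mean = mean_sq = None
    for x in samples:
        if mean is None:
            mean = np.zeros_like(x)
            mean_sq = np.zeros_like(x)
        new_mean = decay * mean + (1 - decay) * x
        new_mean_sq = decay * mean_sq + (1 - decay) * x**2
        std = np.sqrt(new_mean_sq - new_mean**2)
        out = (x - mean) / (std + eps)  # mirror the method above: pre-update mean, post-update std
        mean, mean_sq = new_mean, new_mean_sq
        yield out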
def __init__(self, kp, kd, shapes):
    """
    :param kp:
    :param kd:
    :param shapes: A tuple that specifies (minibatch_size, n_in, n_out)
    """
    self.kp = kp
    self.kd = kd
    self.r = kd/as_floatx(kp+kd)
    self.scale = (1./as_floatx(kp**2 + 2*kp*kd))
    self.x_past = create_shared_variable(np.zeros((shapes[0], shapes[1])))
    self.e_past = create_shared_variable(np.zeros((shapes[0], shapes[2])))
def __init__(self, w, b, force_shared_parameters = True, border_mode = 'valid', filter_flip = True):
    """
    w is the kernel, an ndarray of shape (n_output_maps, n_input_maps, w_size_y, w_size_x)
    b is the bias, an ndarray of shape (n_output_maps, )
    force_shared_parameters: Set to true if you want to make the parameters shared variables.  If False,
        the parameters will be constants (which allows for certain optimizations)
    :param border_mode: {'valid', 'full', 'half', int, (int1, int2)}.  Default is 'valid'.
        See theano.tensor.nnet.conv2d docstring for details.
    """
    self.w = create_shared_variable(w) if force_shared_parameters else tt.constant(w)
    self.b = create_shared_variable(b) if force_shared_parameters else tt.constant(b)
    self.border_mode = border_mode
    self.filter_flip = filter_flip
def __init__(self, w, b, force_shared_parameters = True, border_mode = 'valid', filter_flip = True):
    """
    w is the kernel, an ndarray of shape (n_output_maps, n_input_maps, w_size_y, w_size_x)
    b is the bias, an ndarray of shape (n_output_maps, ).  Can also be False, meaning: don't use biases.
    force_shared_parameters: Set to true if you want to make the parameters shared variables.  If False,
        the parameters will be constants (which allows for certain optimizations)
    :param border_mode: {'valid', 'full', 'half', int, (int1, int2)}.  Default is 'valid'.
        See theano.tensor.nnet.conv2d docstring for details.
    """
    self.w = create_shared_variable(w) if force_shared_parameters else tt.constant(w)
    self.b = False if b is False else create_shared_variable(b) if force_shared_parameters else tt.constant(b)
    self.border_mode = border_mode
    self.filter_flip = filter_flip
def __init__(self, w, b, nonlinearity, encdec, encdec_back, grad_calc='xx', minibatch_size=1):
    self.n_in, self.n_out = w.shape
    self.w = create_shared_variable(w)
    self.b = create_shared_variable(b)
    assert isinstance(encdec, IEncoderDecoder)
    assert isinstance(encdec_back, IEncoderDecoder)
    self.encdec = encdec
    self.encdec_back = encdec_back
    self.nonlinearity = nonlinearity
    self.minibatch_size = minibatch_size
    self.grad_calc = grad_calc
    self.fwd_op_count = create_shared_variable(0, name='fwd_op_count')
    self.back_op_count = create_shared_variable(0, name='back_op_count')
    self.update_op_count = create_shared_variable(0, name='update_op_count')
def encode(self, x, shape=None):
    if shape is None:
        xp = create_shared_variable(np.zeros((0, )*x.ndim), name='xp')
        delta = ifelse(xp.size>0, x-xp, x)
    else:
        xp = create_shared_variable(np.zeros(shape), name='xp{}'.format(shape))
        delta = x - xp
    add_update(xp, x)
    y = self.kp*x + self.kd*delta
    if self.quantization is None:
        return y
    elif self.quantization=='herd':
        return herd(y, shape=shape)
    else:
        raise Exception('No quantizer: {}'.format(self.quantization))
def encode(self, x, shape=None):
    if shape is None:
        xp = create_shared_variable(np.zeros((0, ) * x.ndim), name='xp')
        delta = ifelse(xp.size > 0, x - xp, x)
    else:
        xp = create_shared_variable(np.zeros(shape), name='xp{}'.format(shape))
        delta = x - xp
    add_update(xp, x)
    y = self.kp * x + self.kd * delta
    if self.quantization is None:
        return y
    elif self.quantization == 'herd':
        return herd(y, shape=shape)
    else:
        raise Exception('No quantizer: {}'.format(self.quantization))
def get_sampling_fcn(self, initial_vis, n_steps):
    initial_vis = \
        create_shared_variable(initial_vis) if isinstance(initial_vis, np.ndarray) else \
        initial_vis if isinstance(initial_vis, SharedVariable) else \
        create_shared_variable(initial_vis.tag.test_value)

    @symbolic_multi
    def sample():
        vis = initial_vis
        for i in xrange(n_steps):
            hid = self.propup(vis)
            vis = self.propdown(hid)
        add_update(initial_vis, vis)
        return vis, hid

    return sample
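# --- Illustrative sketch (not part of the original code): the same alternating Gibbs chain
# in plain numpy for a generic binary RBM with sigmoid units.  The propup/propdown methods
# used above may differ in detail; this only shows the bounce-between-layers pattern, and
# the function name and arguments are assumptions for the demo.
import numpy as np

def gibbs_chain_demo(vis, w, b_vis, b_hid, n_steps, rng=np.random):
    """vis: (n_samples, n_vis) binary array; w: (n_vis, n_hid) weights.
    Returns the final (vis, hid) samples after n_steps >= 1 alternations."""
    sigmoid = lambda a: 1. / (1. + np.exp(-a))
    hid = None
    for _ in range(n_steps):
        hid = (rng.rand(vis.shape[0], w.shape[1]) < sigmoid(vis.dot(w) + b_hid)).astype(float)
        vis = (rng.rand(*vis.shape) < sigmoid(hid.dot(w.T) + b_vis)).astype(float)
    return vis, hid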
def past_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    Do an efficient update of the weights given the two spike-trains.
    This isn't actually implemented as an efficient update, but it will produce the identical result as if it were.
    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    # TODO: Make this actually use sparsity, one of these days.
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    rx = kd_x / (kp_x + kd_x)
    re = kd_e / (kp_e + kd_e)
    tx_last = create_shared_variable(np.zeros((n_samples, n_in)) + 1)
    te_last = create_shared_variable(np.zeros((n_samples, n_out)) + 1)
    x_last = create_shared_variable(np.zeros((n_samples, n_in)))
    e_last = create_shared_variable(np.zeros((n_samples, n_out)))
    t_last = tt.minimum(tx_last[:, :, None], te_last[:, None, :])
    x_spikes = tt.neq(xs, 0)
    dw_potentials = x_last[:, :, None] * e_last[:, None, :] * \
        rx**(tx_last[:, :, None]-t_last) \
        * re**(te_last[:, None, :]-t_last) \
        * geoseries_sum(rx*re, t_end=t_last, t_start=1)
    e_spikes = tt.neq(es, 0)
    dws = (x_spikes[:, :, None] + e_spikes[:, None, :] - x_spikes[:, :, None] * e_spikes[:, None, :]) * dw_potentials  # (n_samples, n_in, n_out)
    add_update(x_last, tt.switch(x_spikes, x_last * rx**tx_last + xs / as_floatx(kd_x), x_last))
    add_update(e_last, tt.switch(e_spikes, e_last * re**te_last + es / as_floatx(kd_e), e_last))  # e_last decays by re since its last spike
    add_update(tx_last, tt.switch(x_spikes, 1, tx_last + 1))
    add_update(te_last, tt.switch(e_spikes, 1, te_last + 1))
    return dws.sum(axis=0)
def past_weight_grad_calculator_reloaded(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    Do an efficient update of the weights given the two spike-trains.
    This isn't actually implemented as an efficient update, but it will produce the identical result as if it were.
    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    # TODO: RESOLVE INSTABILITY ISSUE
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    rx = kd_x / (kp_x + kd_x)
    re = kd_e / (kp_e + kd_e)
    tx_last = create_shared_variable(np.zeros((n_samples, n_in)))
    te_last = create_shared_variable(np.zeros((n_samples, n_out)))
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))
    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    t_last = tt.maximum(tx_last[:, :, None], te_last[:, None, :])
    sum_to_last = geoseries_sum(rx * re, t_start=t_last, t_end=0)  # Wasteful, since most of this is multiplied by zeros later, but for now it doesn't matter
    spikes = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])
    dw_es = (xr[:, :, None] * er[:, None, :] * spikes) * sum_to_last  # PROBLEM HERE!!!!  Can be a very small number times a very large number.
    # dw_es = (xr[:, :, None]*(x_spikes[:, :, None]-x_spikes[:, :, None]*e_spikes[:, None, :]) * er[:, None, :] + xr[:, :, None] * (er*e_spikes)[:, None, :]) * sum_to_last
    add_update(xr, xr * rx + xs / (kp_x + kd_x))
    add_update(er, er * re + es / (kp_e + kd_e))
    add_update(tx_last, tt.switch(x_spikes, 0, tx_last - 1))
    add_update(te_last, tt.switch(e_spikes, 0, te_last - 1))
    return dw_es.sum(axis=0)
def decode(self, y, shape=None):
    xp = shared_like(y, name='xp') if shape is None else create_shared_variable(np.zeros(shape), name='xp{}'.format(shape))
    div = (self.kp + self.kd)
    x = (y + self.kd * xp) / div
    add_update(xp, x)
    return x
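# --- Illustrative sketch (not part of the original code): the PD encode/decode pair in plain
# numpy, without quantization.  The function name and parameter values are assumptions for the
# demo.  Decoding reconstructs the signal exactly because
# y_t = kp*x_t + kd*(x_t - x_{t-1})  implies  x_t = (y_t + kd*x_{t-1}) / (kp + kd).
import numpy as np

def pd_roundtrip_demo(kp=0.1, kd=1.0, n_steps=50):
    signal = np.cumsum(np.random.randn(n_steps))  # arbitrary test signal
    x_prev_enc = 0.
    x_prev_dec = 0.
    recon = np.empty(n_steps)
    for t, x in enumerate(signal):
        y = kp * x + kd * (x - x_prev_enc)              # encode
        x_prev_enc = x
        x_prev_dec = (y + kd * x_prev_dec) / (kp + kd)  # decode
        recon[t] = x_prev_dec
    assert np.allclose(recon, signal)
    return recon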
def herd(x, shape=None):
    phi = shared_like(x, name='phi') if shape is None else create_shared_variable(np.zeros(shape), name='phi{}'.format(shape))
    phi_ = phi + x
    s = tt.round(phi_)
    add_update(phi, phi_ - s)
    return s
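# --- Illustrative sketch (not part of the original code): deterministic "herding" quantization
# in plain numpy.  The accumulated residual phi carries the rounding error forward, so the
# running sum of the integer outputs tracks the running sum of the real-valued inputs to
# within 0.5.  The function name is an assumption for the demo.
import numpy as np

def herd_sequence_demo(values):
    """values: iterable of equally-shaped float arrays.  Returns the list of quantized outputs."""
    phi = np.zeros_like(np.asarray(values[0], dtype=float))
    outputs = []
    for v in values:
        phi = phi + v
        s = np.round(phi)
        phi = phi - s
        outputs.append(s)
    return outputs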
def __init__(self, w, b = 0, b_rev = None, use_bias = True):
    """
    :param w: Initial weight value.  Can be:
        - A numpy array, in which case a shared variable is instantiated from this data.
        - A symbolic variable that is either a shared variable or descended from a shared variable.
          This is used when there are shared parameters.
    :param b: Can be:
        - A numpy vector representing the initial bias on the hidden layer, where len(b) = w.shape[1]
        - A scalar, which just initializes the full vector to this value
    :param b_rev: Can be:
        - A numpy vector representing the initial bias on the visible layer, where len(b) = w.shape[0]
        - A scalar, which just initializes the full vector to this value
        - None, in which case b_rev is not created (for instance in an MLP).
    """
    self.w = create_shared_variable(w, name = 'w')
    self.b = create_shared_variable(b, shape = w.shape[1], name = 'b') if use_bias else None
    self.b_rev = create_shared_variable(b_rev, shape = w.shape[0], name = 'b_rev') if use_bias else None
    self._use_bias = use_bias
def __init__(self, n_input, n_hidden, initializer_fcn, input_layer_type='softmax', hidden_layer_type='tanh'):
    self.lstm = LSTMLayer.from_initializer(n_input=n_input, n_hidden=n_hidden, initializer_fcn=initializer_fcn, hidden_layer_type=hidden_layer_type)
    self.w_hz = create_shared_variable(initializer_fcn, (n_hidden, n_input))
    self.b_z = create_shared_variable(0, n_input)
    self.output_activation = mysoftmax if input_layer_type == 'softmax' else get_named_activation_function(input_layer_type)
def matrix_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes, epsilon=1e-7):
    """
    :param xs:
    :param es:
    :param kp_x:
    :param kd_x:
    :param kp_e:
    :param kd_e:
    :param shapes:
    :param epsilon:
    :return:
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    v1 = create_shared_variable(np.zeros((n_samples, n_in, n_out)))
    rx = kd_x / (kp_x + kd_x)
    re = kd_e / (kp_e + kd_e)
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))
    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    xr_decayed = xr * rx
    er_decayed = er * re
    spikes = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])
    v2 = xr_decayed[:, :, None] * er_decayed[:, None, :]
    dws = (spikes * (v2 - v1)) / (rx * re - 1)
    new_xr = xr_decayed + xs / (kp_x + kd_x)
    new_er = er_decayed + es / (kp_e + kd_e)
    add_update(v1, tt.switch(spikes, new_xr[:, :, None] * new_er[:, None, :], v1))
    add_update(xr, new_xr)
    add_update(er, new_er)
    return dws.sum(axis=0)
def past_weight_grad_calculator2(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    This attempt never really got off the ground.  It doesn't work.
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))
    # xr_new = xr*rx + xs/(kp_x+kd_x)
    # er_new = er*re + es/(kp_e+kd_e)
    arr = rx*re/(1-rx*re)
    xr_new = xr*arr + xs/(kp_x+kd_x)
    er_new = er*arr + es/(kp_e+kd_e)
    xsum = create_shared_variable(np.zeros((n_samples, n_in)))
    esum = create_shared_variable(np.zeros((n_samples, n_out)))
    xsum_new = xsum+xr_new
    esum_new = esum+er_new
    x_nospikes = tt.eq(xs, 0)
    e_nospikes = tt.eq(es, 0)
    dw = xs.T.dot(esum_new) + xsum_new.T.dot(es)
    add_update(xr, xr_new)
    add_update(er, er_new)
    add_update(xsum, xsum_new*x_nospikes)
    add_update(esum, esum_new*e_nospikes)
    return xs.T.dot(er) + xr.T.dot(es)
    # return xr.T.dot(er)
    # return dw
def __init__(self, ws, bs=None, comp_weight=1e-6, optimizer=None, layerwise_scales=False, parametrization='log',
             hidden_activations='relu', output_activation='softmax', rng=None):
    """
    Learns how to rescale the units to be an optimal rounding network.
    :param ws: A list of (n_in, n_out) weight matrices
    :param bs: A list of bias vectors (same length as ws)
    :param comp_weight: The weight (lambda in the paper) given to computation
    :param optimizer: The optimizer (an IGradientOptimizer object)
    :param layerwise_scales: Make scales layerwise (as opposed to unitwise)
    :param parametrization: What space to parametrize in ('log', 'direct', or 'softplus')
    :param hidden_activations: Hidden activation functions (as a string, eg 'relu')
    :param output_activation: Output activation function
    :param rng: Random number generator or seed.
    """
    if optimizer is None:
        optimizer = get_named_optimizer('sgd', 0.01)
    if bs is None:
        bs = [np.zeros(w.shape[1]) for w in ws]
    self.ws = [create_shared_variable(w) for w in ws]
    self.bs = [create_shared_variable(b) for b in bs]
    self.comp_weight = tt.constant(comp_weight, dtype=theano.config.floatX)
    self.optimizer = optimizer
    self.hidden_activations = hidden_activations
    self.output_activation = output_activation
    scale_dims = [()] * len(ws) if layerwise_scales else [ws[0].shape[0]] + [w.shape[1] for w in ws[:-1]]
    self.k_params = \
        [create_shared_variable(np.ones(d)) for d in scale_dims] if parametrization == 'direct' else \
        [create_shared_variable(np.zeros(d)) for d in scale_dims] if parametrization == 'log' else \
        [create_shared_variable(np.zeros(d)+np.exp(1)-1) for d in scale_dims] if parametrization == 'softplus' else \
        bad_value(parametrization)
    self.parametrization = parametrization
    self.rng = get_theano_rng(rng)
def __init__(self, w, b=0, b_rev=None, use_bias=True):
    """
    :param w: Initial weight value.  Can be:
        - A numpy array, in which case a shared variable is instantiated from this data.
        - A symbolic variable that is either a shared variable or descended from a shared variable.
          This is used when there are shared parameters.
    :param b: Can be:
        - A numpy vector representing the initial bias on the hidden layer, where len(b) = w.shape[1]
        - A scalar, which just initializes the full vector to this value
    :param b_rev: Can be:
        - A numpy vector representing the initial bias on the visible layer, where len(b) = w.shape[0]
        - A scalar, which just initializes the full vector to this value
        - None, in which case b_rev is not created (for instance in an MLP).
    """
    self.w = create_shared_variable(w, name='w')
    self.b = create_shared_variable(b, shape=w.shape[1], name='b') if use_bias else None
    self.b_rev = create_shared_variable(b_rev, shape=w.shape[0], name='b_rev') if use_bias else None
    self._use_bias = use_bias
def past_weight_grad_calculator_reloaded(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    Do an efficient update of the weights given the two spike-trains.
    This isn't actually implemented as an efficient update, but it will produce the identical result as if it were.
    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    # TODO: RESOLVE INSTABILITY ISSUE
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)
    tx_last = create_shared_variable(np.zeros((n_samples, n_in)))
    te_last = create_shared_variable(np.zeros((n_samples, n_out)))
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))
    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    t_last = tt.maximum(tx_last[:, :, None], te_last[:, None, :])
    sum_to_last = geoseries_sum(rx*re, t_start=t_last, t_end=0)  # Wasteful, since most of this is multiplied by zeros later, but for now it doesn't matter
    spikes = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])
    dw_es = (xr[:, :, None]*er[:, None, :]*spikes)*sum_to_last  # PROBLEM HERE!!!!  Can be a very small number times a very large number.
    # dw_es = (xr[:, :, None]*(x_spikes[:, :, None]-x_spikes[:, :, None]*e_spikes[:, None, :]) * er[:, None, :] + xr[:, :, None] * (er*e_spikes)[:, None, :]) * sum_to_last
    add_update(xr, xr*rx + xs/(kp_x+kd_x))
    add_update(er, er*re + es/(kp_e+kd_e))
    add_update(tx_last, tt.switch(x_spikes, 0, tx_last-1))
    add_update(te_last, tt.switch(e_spikes, 0, te_last-1))
    return dw_es.sum(axis=0)
def past_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    Do an efficient update of the weights given the two spike-trains.
    This isn't actually implemented as an efficient update, but it will produce the identical result as if it were.
    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    # TODO: Make this actually use sparsity, one of these days.
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)
    tx_last = create_shared_variable(np.zeros((n_samples, n_in))+1)
    te_last = create_shared_variable(np.zeros((n_samples, n_out))+1)
    x_last = create_shared_variable(np.zeros((n_samples, n_in)))
    e_last = create_shared_variable(np.zeros((n_samples, n_out)))
    t_last = tt.minimum(tx_last[:, :, None], te_last[:, None, :])
    x_spikes = tt.neq(xs, 0)
    dw_potentials = x_last[:, :, None] * e_last[:, None, :] * \
        rx**(tx_last[:, :, None]-t_last) \
        * re**(te_last[:, None, :]-t_last) \
        * geoseries_sum(rx*re, t_end=t_last, t_start=1)
    e_spikes = tt.neq(es, 0)
    dws = (x_spikes[:, :, None]+e_spikes[:, None, :]-x_spikes[:, :, None]*e_spikes[:, None, :])*dw_potentials  # (n_samples, n_in, n_out)
    add_update(x_last, tt.switch(x_spikes, x_last*rx**tx_last + xs/as_floatx(kd_x), x_last))
    add_update(e_last, tt.switch(e_spikes, e_last*re**te_last + es/as_floatx(kd_e), e_last))  # e_last decays by re since its last spike
    add_update(tx_last, tt.switch(x_spikes, 1, tx_last+1))
    add_update(te_last, tt.switch(e_spikes, 1, te_last+1))
    return dws.sum(axis=0)
def encode(self, x, shape=None):
    running_mag = create_shared_variable(1.)
    add_update(running_mag, (1 - self.adaptation_rate) * running_mag + self.adaptation_rate * abs(x).mean())
    target_k_beta = self.k_beta_init * running_mag
    add_update(self.k_beta, self.k_beta + self.adaptation_rate * (target_k_beta - self.k_beta))
    return pd_encode(x, kp=self.kp, kd=self.kd, quantization=self.quantization, shape=shape)
def __init__(self, w, b = 0, normalize_minibatch = False, scale = False, use_bias = True):
    """
    :param w: Initial weight value.  Can be:
        - A numpy array, in which case a shared variable is instantiated from this data.
        - A symbolic variable that is either a shared variable or descended from a shared variable.
          This is used when there are shared parameters.
    :param b: Can be:
        - A numpy vector representing the initial bias on the hidden layer, where len(b) = w.shape[1]
        - A scalar, which just initializes the full vector to this value
    :param normalize_minibatch: Set to True to normalize over the minibatch.  This has been shown to cause better optimization
    :param scale: Set to True to include a scale term (per output).  Generally this only makes sense if normalize_minibatch is True.
    :param use_bias: Use a bias term?  Generally, the answer is "True": a bias term helps.
    """
    self.w = create_shared_variable(w, name = 'w')
    self.b = create_shared_variable(b, shape = w.shape[1] if w.ndim==2 else (w.shape[0], w.shape[2]) if w.ndim==3 else bad_value(w.shape), name = 'b')
    self.log_scale = create_shared_variable(0 if scale else None, shape = w.shape[1], name = 'log_scale') if scale else None
    self.normalizer = \
        batch_normalize if normalize_minibatch is True else \
        None if normalize_minibatch is False else \
        normalize_minibatch
    self._use_bias = use_bias
def __init__(self, w, b, nonlinearity, encdec, encdec_back, grad_calc='xx', minibatch_size=1):
    self.n_in, self.n_out = w.shape
    self.w = create_shared_variable(w)
    self.b = create_shared_variable(b)
    assert isinstance(encdec, IEncoderDecoder)
    assert isinstance(encdec_back, IEncoderDecoder)
    self.encdec = encdec
    self.encdec_back = encdec_back
    self.nonlinearity = nonlinearity
    self.minibatch_size = minibatch_size
    self.grad_calc = grad_calc
    self.fwd_op_count = create_shared_variable(0, name='fwd_op_count')
    self.back_op_count = create_shared_variable(0, name='back_op_count')
    self.update_op_count = create_shared_variable(0, name='update_op_count')
def train(wake_visible):
    wake_hidden = self.propup(wake_visible)
    persistent_state = sleep_hidden = create_shared_variable(np.zeros(wake_hidden.tag.test_value.shape), name = 'persistend_hidden_state') if persistent else wake_hidden
    for _ in xrange(n_gibbs):
        sleep_visible = self.propdown(sleep_hidden)
        sleep_hidden = self.propup(sleep_visible)
    wake_energy = self.energy(wake_visible)
    sleep_energy = self.energy(sleep_visible)
    cost = wake_energy - sleep_energy
    optimizer(cost = cost, parameters = self.parameters, constants = [wake_visible, sleep_visible])
    if persistent:
        add_update(persistent_state, sleep_hidden)
def get_sampling_fcn(self, initial_vis, n_steps):
    """
    :param initial_vis: An (n_samples, n_input_dims) array representing the initial visible samples
    :param n_steps: Number of steps to bounce on each call.
    :return: A function that returns an (n_samples, n_input_dims) tensor of samples.
    """
    initial_vis = create_shared_variable(initial_vis)
    initial_top_vis = self.propup(initial_vis, to_layer=-1)
    top_sampling_fcn = self.rbms[-1].get_sampling_fcn(initial_vis=initial_top_vis, n_steps=n_steps)

    @symbolic_simple
    def sample():
        top_sample, _ = top_sampling_fcn()
        bottom_sample = self.propdown(top_sample, stochastic = True, from_layer = -2)
        return bottom_sample

    return sample
def __init__(self, kp, kd, adaptation_rate = 0.0001, quantization = None):
    """
    :param kp_over_kd: The ratio of kp/kd.  0.01 might be a normal value.
    :param relative_scale: Try to maintain a scale of
    :param adaptation_rate:
    """
    self.k_alpha = kd/float(kp+kd)
    self.k_beta_init = 1/float(kp+kd)  # The scale
    self.k_beta = self.k_beta_init
    assert np.allclose(self.kp, kp)
    assert np.allclose(self.kd, kd)
    self.k_beta = create_shared_variable(self.k_beta_init)
    self.adaptation_rate = adaptation_rate
    self.quantization = quantization
def __init__(self, kp, kd, adaptation_rate=0.0001, quantization=None):
    """
    :param kp_over_kd: The ratio of kp/kd.  0.01 might be a normal value.
    :param relative_scale: Try to maintain a scale of
    :param adaptation_rate:
    """
    self.k_alpha = kd / float(kp + kd)
    self.k_beta_init = 1 / float(kp + kd)  # The scale
    self.k_beta = self.k_beta_init
    assert np.allclose(self.kp, kp)
    assert np.allclose(self.kd, kd)
    self.k_beta = create_shared_variable(self.k_beta_init)
    self.adaptation_rate = adaptation_rate
    self.quantization = quantization
def _update_param(self, param, gradient):
    # Initialize variables
    i = create_shared_variable(0.)
    m = theano.shared(param.get_value() * 0.)
    v = theano.shared(param.get_value() * 0.)
    # Recompute values
    i_t = i + 1.
    fix1 = 1. - (1. - self.beta_1)**i_t
    fix2 = 1. - (1. - self.beta_2)**i_t
    lr_t = self.alpha * (tt.sqrt(fix2) / fix1)
    m_t = (self.beta_1 * gradient) + ((1. - self.beta_1) * m)
    v_t = (self.beta_2 * tt.sqr(gradient)) + ((1. - self.beta_2) * v)
    g_t = m_t / (tt.sqrt(v_t) + self.eps)
    p_t = param - (lr_t * g_t)
    add_update(param, p_t)
    add_update(m, m_t)
    add_update(v, v_t)
    add_update(i, i_t)
def _update_param(self, param, gradient):
    # Initialize variables
    i = create_shared_variable(0.)
    m = theano.shared(param.get_value() * 0.)
    v = theano.shared(param.get_value() * 0.)
    # Recompute values
    i_t = i + 1.
    fix1 = 1. - (1. - self.beta_1)**i_t
    fix2 = 1. - (1. - self.beta_2)**i_t
    lr_t = self.alpha * (tt.sqrt(fix2) / fix1)
    m_t = (self.beta_1 * gradient) + ((1. - self.beta_1) * m)
    v_t = (self.beta_2 * tt.sqr(gradient)) + ((1. - self.beta_2) * v)
    g_t = m_t / (tt.sqrt(v_t) + self.eps)
    p_t = param - (lr_t * g_t)
    add_update(param, p_t)
    add_update(m, m_t)
    add_update(v, v_t)
    add_update(i, i_t)
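# --- Illustrative sketch (not part of the original code): the same update in plain numpy.
# The formulas above match the Adam optimizer (Kingma & Ba, 2014), with the caveat that
# beta_1 and beta_2 here play the role of one minus the usual decay rates (the first moment
# is mixed as beta_1*g + (1-beta_1)*m).  The default values below are illustrative assumptions.
import numpy as np

def adam_step_demo(param, grad, state, alpha=0.001, beta_1=0.1, beta_2=0.001, eps=1e-8):
    """state: dict holding 'i' (step count), 'm' and 'v' (arrays shaped like param)."""
    state['i'] += 1.
    fix1 = 1. - (1. - beta_1) ** state['i']
    fix2 = 1. - (1. - beta_2) ** state['i']
    lr_t = alpha * np.sqrt(fix2) / fix1
    state['m'] = beta_1 * grad + (1. - beta_1) * state['m']
    state['v'] = beta_2 * grad**2 + (1. - beta_2) * state['v']
    return param - lr_t * state['m'] / (np.sqrt(state['v']) + eps)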
def __init__(self, w, b=0, stride = (1, 1)):
    self.w = create_shared_variable(w, name = 'w')
    self.b = create_shared_variable(b, name = 'b')
    self._stride = stride
def __init__(self, w, b=0, stride=(1, 1)):
    self.w = create_shared_variable(w, name='w')
    self.b = create_shared_variable(b, name='b')
    self._stride = stride
def decode(self, y, shape=None):
    xp = shared_like(y, name='xp') if shape is None else create_shared_variable(np.zeros(shape), name='xp{}'.format(shape))
    div = (self.kp+self.kd)
    x = (y+self.kd*xp)/div
    add_update(xp, x)
    return x
def __init__(self, shape, scale_shape = None):
    self.phi = create_shared_variable(np.zeros(shape))
    self.log_scales = create_shared_variable(0. if scale_shape is None else np.zeros(scale_shape))
def past_weight_grad_step(xs, es, kp_x, kd_x, kp_e, kd_e, shape, dws=None):
    """
    Do an efficient update of the weights given the two spike-trains.  (This still runs FING SLOWLY!)
    :param xs: An (n_in) vector
    :param es: An (n_out) vector
    :param kp_x:
    :param kd_x:
    :param kp_e:
    :param kd_e:
    :param shape: (n_in, n_out)
    :param dws: Optionally, an existing (n_in, n_out) gradient accumulator to add into.
    :return:
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_in, n_out = shape
    rx = kd_x / (kp_x + kd_x)
    re = kd_e / (kp_e + kd_e)
    tx_last = create_shared_variable(np.zeros(n_in) + 1)
    te_last = create_shared_variable(np.zeros(n_out) + 1)
    x_last = create_shared_variable(np.zeros(n_in))
    e_last = create_shared_variable(np.zeros(n_out))
    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    x_spike_ixs, = tt.nonzero(x_spikes)
    e_spike_ixs, = tt.nonzero(e_spikes)
    if dws is None:
        dws = tt.zeros(shape)
    t_last = tt.minimum(tx_last[x_spike_ixs, None], te_last)  # (n_x_spikes, n_out)
    dws = tt.inc_subtensor(
        dws[x_spike_ixs, :],
        x_last[x_spike_ixs, None] * e_last
        * rx**(tx_last[x_spike_ixs, None] - t_last)
        * re**(te_last[None, :] - t_last)
        * geoseries_sum(re * rx, t_end=t_last, t_start=1))
    new_x_last = tt.set_subtensor(
        x_last[x_spike_ixs],
        x_last[x_spike_ixs] * rx**tx_last[x_spike_ixs] + xs[x_spike_ixs] / as_floatx(kd_x))
    new_tx_last = tt.switch(x_spikes, 0, tx_last)
    t_last = tt.minimum(new_tx_last[:, None], te_last[e_spike_ixs])  # (n_in, n_e_spikes)
    dws = tt.inc_subtensor(
        dws[:, e_spike_ixs],
        new_x_last[:, None] * e_last[e_spike_ixs]
        * rx**(new_tx_last[:, None] - t_last)
        * re**(te_last[None, e_spike_ixs] - t_last)
        * geoseries_sum(re * rx, t_end=t_last, t_start=1))
    add_update(x_last, new_x_last)
    add_update(
        e_last,
        tt.set_subtensor(
            e_last[e_spike_ixs],
            e_last[e_spike_ixs] * re**te_last[e_spike_ixs] + es[e_spike_ixs] / as_floatx(kd_e)))
    add_update(tx_last, new_tx_last + 1)
    add_update(te_last, tt.switch(e_spikes, 1, te_last + 1))
    return dws
def encode(self, x, shape=None):
    running_mag = create_shared_variable(1.)
    add_update(running_mag, (1-self.adaptation_rate)*running_mag + self.adaptation_rate*abs(x).mean())
    target_k_beta = self.k_beta_init*running_mag
    add_update(self.k_beta, self.k_beta + self.adaptation_rate*(target_k_beta - self.k_beta))
    return pd_encode(x, kp=self.kp, kd=self.kd, quantization=self.quantization, shape=shape)
def get_initial_state(self, h_init=None, c_init=None):
    if h_init is None:
        h_init = create_shared_variable(0, shape=self.n_hidden, name='h')
    if c_init is None:
        c_init = create_shared_variable(0, shape=self.n_hidden, name='c')
    return h_init, c_init
def __init__(self, shape, scale = 1):
    self.phi = create_shared_variable(np.zeros(shape))
    self.scale = scale
def get_generation_function(self, maintain_state=True, stochastic=True, rng=None):
    """
    Return a symbolic function that generates a sequence (and updates its internal state).
    :param maintain_state: True to carry the generator's state (last output, hidden state, memory cell) over between calls.
    :param stochastic: True to sample a onehot-vector from the output.  False to simply reinsert the distribution vector.
    :param rng: A seed, numpy or theano random number generator
    :return: A symbolic function of the form:
        (outputs, updates) = generate(primer, n_steps)
    """
    h_init, c_init = self.lstm.get_initial_state()
    x_init = create_shared_variable(0, shape=self.lstm.n_inputs)
    rng = get_theano_rng(rng)

    @symbolic_multi
    def generate(primer, n_steps):
        """
        Generate a sequence of outputs, and update the internal state.
        primer: A sequence to prime on.  This will overwrite the OUTPUT at each time step.
            Note: this means the first iteration will run off the last output from the previous call to generate.
        n_steps: Number of steps (after the primer) to run.
        return: A sequence of length n_steps.
        """
        n_primer_steps = primer.shape[0]
        n_total_steps = n_primer_steps + n_steps

        def do_step(i, x_, h_, c_):
            """
            i: The step number (int)
            x_: An input vector
            h_: A hidden state vector
            c_: A memory cell vector
            """
            y_prob, h, c = self.step(x_, h_, c_)
            y_candidate = ifelse(int(stochastic), rng.multinomial(n=1, pvals=y_prob[None, :])[0].astype(theano.config.floatX), y_prob)
            # y_candidate = ifelse(int(stochastic), rng.multinomial(n=1, pvals=y_prob.dimshuffle('x', 1))[0].astype(theano.config.floatX), y_prob)
            y = ifelse(i < n_primer_steps, primer[i], y_candidate)  # Note: If you get an error here, you just need to prime with something on the first call.
            return y, h, c

        (x_gen, h_gen, c_gen), updates = theano.scan(
            do_step,
            sequences=[tt.arange(n_total_steps)],
            outputs_info=[x_init, h_init, c_init],
            )
        if maintain_state:
            updates += [(x_init, x_gen[-1]), (h_init, h_gen[-1]), (c_init, c_gen[-1])]
        for var, val in updates.items():
            add_update(var, val)
        return x_gen[n_primer_steps:],

    return generate
def __init__(self, *args, **kwargs):
    ConvLayer.__init__(self, *args, **kwargs)
    self.bias_switch = create_shared_variable(1.)
def __init__(self, w, b_vis, b_hid, rng):
    self.rng = get_theano_rng(rng)
    self.w = create_shared_variable(w)
    self.b_vis = create_shared_variable(b_vis)
    self.b_hid = create_shared_variable(b_hid)
def __init__(self, shape):
    self.sum = create_shared_variable(np.zeros(shape))
def from_initializer(cls, n_input, n_hidden, initializer_fcn, hidden_layer_type='tanh'):
    """
    :param n_input: Number of inputs
    :param n_hidden: Number of hiddens
    :param initializer_fcn: Function taking a shape and returning parameters.
    :return: An LSTMLayer
    """
    return LSTMLayer(
        w_xi=create_shared_variable(initializer_fcn, shape=(n_input, n_hidden)),
        w_xf=create_shared_variable(initializer_fcn, shape=(n_input, n_hidden)),
        w_xc=create_shared_variable(initializer_fcn, shape=(n_input, n_hidden)),
        w_xo=create_shared_variable(initializer_fcn, shape=(n_input, n_hidden)),
        w_hi=create_shared_variable(initializer_fcn, shape=(n_hidden, n_hidden)),
        w_hf=create_shared_variable(initializer_fcn, shape=(n_hidden, n_hidden)),
        w_hc=create_shared_variable(initializer_fcn, shape=(n_hidden, n_hidden)),
        w_ho=create_shared_variable(initializer_fcn, shape=(n_hidden, n_hidden)),
        w_co=create_shared_variable(initializer_fcn, shape=(n_hidden, n_hidden)),
        b_i=create_shared_variable(0, shape=n_hidden),
        b_f=create_shared_variable(0, shape=n_hidden),
        b_c=create_shared_variable(0, shape=n_hidden),
        b_o=create_shared_variable(0, shape=n_hidden),
        hidden_layer_type=hidden_layer_type)
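# --- Illustrative usage sketch (not part of the original code): constructing a layer with a
# hypothetical Gaussian initializer.  Per the docstring above, the initializer is just a
# function mapping a shape to an array; the layer sizes below are arbitrary example values.
import numpy as np

def gaussian_init(shape):
    # Hypothetical initializer: zero-mean Gaussian entries with std 0.01.
    return 0.01 * np.random.standard_normal(shape)

example_lstm_layer = LSTMLayer.from_initializer(n_input=20, n_hidden=50, initializer_fcn=gaussian_init, hidden_layer_type='tanh')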
def herd(x, shape = None):
    phi = shared_like(x, name='phi') if shape is None else create_shared_variable(np.zeros(shape), name='phi{}'.format(shape))
    phi_ = phi + x
    s = tt.round(phi_)
    add_update(phi, phi_ - s)
    return s