def sample():
    vis = initial_vis
    for i in xrange(n_steps):
        hid = self.propup(vis)
        vis = self.propdown(hid)
    add_update(initial_vis, vis)
    return vis, hid
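# The propup/propdown calls above are the usual RBM conditionals.  Below is a minimal
# numpy sketch of one Gibbs step for a binary RBM (illustrative only: W, b_vis, b_hid
# and the binary sampling are assumptions, not part of the code above).
import numpy as np

def sigmoid(a):
    return 1. / (1. + np.exp(-a))

def gibbs_step(vis, W, b_vis, b_hid, rng=np.random):
    p_hid = sigmoid(vis.dot(W) + b_hid)            # propup: p(h=1 | v)
    hid = (rng.rand(*p_hid.shape) < p_hid) * 1.0   # sample hidden units
    p_vis = sigmoid(hid.dot(W.T) + b_vis)          # propdown: p(v=1 | h)
    vis = (rng.rand(*p_vis.shape) < p_vis) * 1.0   # sample visible units
    return vis, hid

v, h = gibbs_step(np.zeros((1, 6)), W=np.zeros((6, 4)), b_vis=np.zeros(6), b_hid=np.zeros(4))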
def cd_function(*input_signals):
    wake_visible = input_signals if input_layers is None else up_path(*input_signals)
    wake_hidden = propup(*wake_visible)
    initial_hidden = [theano.shared(np.zeros(wh.tag.test_value.shape, dtype=theano.config.floatX), name='persistent_hidden_state') for wh in wake_hidden] \
        if persistent else wake_hidden
    gibbs_path = [(hidden_layers, visible_layers)] + [(visible_layers, hidden_layers), (hidden_layers, visible_layers)] * (n_gibbs-1)
    sleep_visible = self.get_inference_function(hidden_layers, visible_layers, gibbs_path)(*initial_hidden)
    sleep_hidden = propup(*sleep_visible)
    all_params = sum([x.parameters for x in (
        [self.layers[i] for i in visible_layers]
        + [self.layers[i] for i in hidden_layers]
        + [self.bridges[i, j] for i in visible_layers for j in hidden_layers])], [])
    if method == 'free_energy':
        cost = free_energy(*wake_visible).mean() - free_energy(*sleep_visible).mean()
    elif method == 'energy':
        cost = tt.mean(wake_visible.T.dot(wake_hidden) - sleep_visible.T.dot(sleep_hidden))
    else:
        bad_value(method)
    optimizer(cost=cost, parameters=all_params, constants=wake_visible+sleep_visible)
    if persistent:
        for p, s in zip(initial_hidden, sleep_hidden):
            add_update(p, s)
def future_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    Do an efficient update of the weights given the two spike-trains.

    This isn't actually implemented as an efficient update, but it will produce the
    identical result as if it were.

    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    rx = kd_x/as_floatx(kp_x+kd_x)
    re = kd_e/as_floatx(kp_e+kd_e)
    scale = (1./as_floatx(kp_x*kp_e + kp_x*kd_e + kd_x*kp_e))
    n_samples, n_in, n_out = shapes
    x_past_var = create_shared_variable(np.zeros((n_samples, n_in)))
    e_past_var = create_shared_variable(np.zeros((n_samples, n_out)))
    x_past = x_past_var*rx
    e_past = e_past_var*re
    w_grad = scale * (xs.T.dot(e_past+es) + x_past.T.dot(es))
    add_update(x_past_var, x_past + xs)
    add_update(e_past_var, e_past + es)
    return w_grad
def running_average(data):
    n_points = theano.shared(np.array(1).astype(int))
    avg = theano.shared(np.zeros_like(data.tag.test_value).astype(theano.config.floatX))
    new_avg = data*(1./n_points) + avg*(n_points-1.)/n_points
    add_update(avg, new_avg)
    add_update(n_points, n_points+1)
    return new_avg
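# For comparison, a minimal self-contained sketch of the same running average written
# without add_update: the state update is passed explicitly to theano.function.  All
# names here are illustrative and not part of the snippets above.
import numpy as np
import theano
import theano.tensor as tt

data = tt.vector('data')
n_points = theano.shared(np.array(1.0, dtype=theano.config.floatX))
avg = theano.shared(np.zeros(3, dtype=theano.config.floatX))
new_avg = data / n_points + avg * (n_points - 1.) / n_points
f_running_average = theano.function(
    [data], new_avg,
    updates=[(avg, new_avg), (n_points, n_points + 1.)])
print(f_running_average(np.ones(3, dtype=theano.config.floatX)))  # -> [1. 1. 1.]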
def train(wake_visible):
    wake_hidden = propup(wake_visible)
    persistent_state = sleep_hidden = theano.shared(
        np.zeros(wake_hidden.tag.test_value.shape, dtype=theano.config.floatX),
        name='persistent_hidden_state') if persistent else wake_hidden
    for _ in xrange(n_gibbs):
        sleep_visible = propdown(sleep_hidden)
        sleep_hidden = propup(sleep_visible)
    wake_energy = bridge.free_energy(wake_visible) + hidden_layer.free_energy(bridge(wake_visible))
    sleep_energy = bridge.free_energy(sleep_visible) + hidden_layer.free_energy(bridge(sleep_visible))
    cost = tt.mean(wake_energy - sleep_energy)
    params = visible_layer.parameters + bridge.parameters + hidden_layer.parameters
    optimizer(cost=cost, parameters=params, constants=[wake_visible, sleep_visible])
    if persistent:
        add_update(persistent_state, sleep_hidden)
def __call__(self, x):
    # x should have shape (1, ...): a minibatch containing a single sample.
    assert x.ishape[0] == 1, "This method only works for minibatches of size 1, but you used a minibatch of size: %s" % (x.tag.test_value.shape[0])
    running_mean = create_shared_variable(np.zeros(x.tag.test_value.shape[1:]))
    new_running_mean = running_mean * self.decay_constant + x[0] * (1-self.decay_constant).astype(theano.config.floatX)
    add_update(running_mean, new_running_mean)
    return x - running_mean
def __call__(self):
    (vector_ixs, ), updates = self._get_vector_indices_and_updates()
    full_indices = (vector_ixs, ) if isinstance(self._size, int) else ind2sub(vector_ixs, self._size)
    for var, val in updates:
        add_update(var, val)
    return full_indices
def herd(x, shape=None):
    phi = shared_like(x, name='phi') if shape is None else \
        create_shared_variable(np.zeros(shape), name='phi{}'.format(shape))
    phi_ = phi + x
    s = tt.round(phi_)
    add_update(phi, phi_ - s)
    return s
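# A small numpy sketch (illustrative, not part of the library above) of what herd()
# computes over time: the accumulated residual phi makes the rounded outputs average
# out to the real-valued input.
import numpy as np

def herd_sequence(xs):
    phi = np.zeros_like(xs[0])
    outputs = []
    for x in xs:
        phi = phi + x
        s = np.round(phi)
        phi = phi - s
        outputs.append(s)
    return outputs

spikes = herd_sequence([np.array([0.3])] * 10)
print(np.mean(spikes))  # ~0.3: the spike rate tracks the input value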
def decode(self, y, shape=None):
    xp = shared_like(y, name='xp') if shape is None else \
        create_shared_variable(np.zeros(shape), name='xp{}'.format(shape))
    div = (self.kp + self.kd)
    x = (y + self.kd * xp) / div
    add_update(xp, x)
    return x
def _update_param(self, param, gradient):
    mean_squared_grad = theano.shared(np.zeros_like(param.get_value()))
    new_mean_squared_grad = self.decay * mean_squared_grad + (1 - self.decay) * gradient**2
    delta_p = -self.learning_rate * gradient / tt.maximum(tt.sqrt(new_mean_squared_grad), self.epsilon)
    add_update(param, param + delta_p)
    add_update(mean_squared_grad, new_mean_squared_grad)
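# For reference, the same RMSProp-style step in plain numpy (a sketch with made-up
# hyperparameter values; decay, learning_rate and epsilon mirror the attributes used above).
import numpy as np

def rmsprop_step(param, gradient, mean_squared_grad, decay=0.9, learning_rate=0.01, epsilon=1e-7):
    mean_squared_grad = decay * mean_squared_grad + (1 - decay) * gradient ** 2
    param = param - learning_rate * gradient / np.maximum(np.sqrt(mean_squared_grad), epsilon)
    return param, mean_squared_grad

p, ms = np.zeros(4), np.zeros(4)
p, ms = rmsprop_step(p, gradient=np.ones(4), mean_squared_grad=ms)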
def free_sample():
    (visible_state, hidden_state), _ = get_bounce_fcn(
        start_from=start_from,
        n_steps=n_steps,
        return_smooth_visible=return_smooth_visible)(persistent_state)
    add_update(persistent_state, visible_state if start_from == 'visible' else hidden_state)
    return visible_state, hidden_state
def __call__(self, inputs):
    if self.scale != 1:
        import theano
        inputs = inputs * np.array(self.scale, dtype=theano.config.floatX)
    inc_phi = self.phi + inputs
    spikes = tt.round(inc_phi)
    new_phi = inc_phi - spikes
    add_update(self.phi, new_phi)
    return spikes
def __call__(self, x):
    """
    :param x: A (n_samples, n_input_maps, size_y, size_x) image/feature tensor
    :return: A (n_samples, n_output_maps, size_y-w_size_y+1, size_x-w_size_x+1) tensor
    """
    result = tt.nnet.conv2d(input=x, filters=self.w, border_mode=self.border_mode, filter_flip=self.filter_flip) \
        + self.bias_switch * (self.b[:, None, None] if self.b is not False else 0)
    if self.b is not False:
        add_update(self.bias_switch, 0)
    return result
def _update_param(self, param, gradient):
    if self.momentum != 0:
        mom = theano.shared(np.zeros_like(param.get_value()))
        new_mom = self.momentum * mom + gradient
        add_update(mom, new_mom)
        direction = new_mom  # Or mom, something about Nesterov...
    else:
        direction = gradient
    add_update(param, param - self.eta*direction - self.decay*param)
def train(x, y):
    w_0 = tt.set_subtensor(w[alpha], 0)  # (n_dim_in, n_dim_out)
    w_1 = tt.set_subtensor(w[alpha], 1)  # (n_dim_in, n_dim_out)
    z_0 = tt.nnet.sigmoid(x.dot(w_0))  # (n_samples, n_dim_out)
    z_1 = tt.nnet.sigmoid(x.dot(w_1))  # (n_samples, n_dim_out)
    log_likelihood_ratio = tt.sum(tt.log(bernoulli(y, z_1)) - tt.log(bernoulli(y, z_0)), axis=0)  # (n_dim_out, )
    p_wa = tt.nnet.sigmoid(log_likelihood_ratio)  # (n_dim_out, )
    w_sample = rng.binomial(p=p_wa)  # (n_dim_out, )
    w_new = tt.set_subtensor(w[alpha], w_sample)  # (n_dim_in, n_dim_out)
    add_update(w, w_new)
    add_update(alpha, (alpha+1) % n_dim_in)
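# Here bernoulli(y, z) is presumably the elementwise Bernoulli likelihood of the
# labels y under probabilities z.  A numpy sketch of that assumption:
import numpy as np

def bernoulli(y, z):
    # p(y | z) elementwise, for y in {0, 1} and z in (0, 1)
    return z ** y * (1 - z) ** (1 - y)

print(bernoulli(np.array([1., 0.]), np.array([0.9, 0.9])))  # [0.9 0.1]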
def train(self, x, y):
    p_wa = self.compute_p_wa(self._w, x, y, self._alpha, self._possible_ws)  # (n_alpha, n_dim_out, n_possible_ws)
    w_sample = sample_categorical(self._rng, p_wa, values=self._possible_ws)
    w_new = tt.set_subtensor(self._w[self._alpha], w_sample)  # (n_dim_in, n_dim_out)
    add_update(self._w, w_new)
    self._add_alpha_update()
def train(self, x, y):
    p_wa = self.compute_p_wa(self._w, x, y, self._alpha, self._possible_ws)
    phi_alpha = self._phi[self._alpha] + p_wa  # (n_alpha, n_dim_out, n_possible_ws)
    k_chosen = tt.argmax(phi_alpha, axis=2)  # (n_alpha, n_dim_out)
    selected_phi_indices = (tt.arange(self._alpha.shape[0])[:, None], tt.arange(y.shape[1])[None, :], k_chosen)
    new_phi_alpha = tt.set_subtensor(phi_alpha[selected_phi_indices], phi_alpha[selected_phi_indices]-1)  # (n_alpha, n_dim_out, n_possible_ws)
    w_sample = self._possible_ws[k_chosen]  # (n_alpha, n_dim_out)
    new_phi = tt.set_subtensor(self._phi[self._alpha], new_phi_alpha)  # (n_dim_in, n_dim_out, n_possible_ws)
    w_new = tt.set_subtensor(self._w[self._alpha], w_sample)  # (n_dim_in, n_dim_out)
    add_update(self._w, w_new)
    add_update(self._phi, new_phi)
    self._add_alpha_update()
def encode(self, x, shape=None):
    running_mag = create_shared_variable(1.)
    add_update(running_mag, (1 - self.adaptation_rate) * running_mag + self.adaptation_rate * abs(x).mean())
    target_k_beta = self.k_beta_init * running_mag
    add_update(self.k_beta, self.k_beta + self.adaptation_rate * (target_k_beta - self.k_beta))
    return pd_encode(x, kp=self.kp, kd=self.kd, quantization=self.quantization, shape=shape)
def train(wake_visible):
    wake_hidden = self.propup(wake_visible)
    persistent_state = sleep_hidden = create_shared_variable(
        np.zeros(wake_hidden.tag.test_value.shape),
        name='persistent_hidden_state') if persistent else wake_hidden
    for _ in xrange(n_gibbs):
        sleep_visible = self.propdown(sleep_hidden)
        sleep_hidden = self.propup(sleep_visible)
    wake_energy = self.energy(wake_visible)
    sleep_energy = self.energy(sleep_visible)
    cost = wake_energy - sleep_energy
    optimizer(cost=cost, parameters=self.parameters, constants=[wake_visible, sleep_visible])
    if persistent:
        add_update(persistent_state, sleep_hidden)
def encode(self, x, shape=None):
    if shape is None:
        xp = create_shared_variable(np.zeros((0, )*x.ndim), name='xp')
        delta = ifelse(xp.size > 0, x-xp, x)
    else:
        xp = create_shared_variable(np.zeros(shape), name='xp{}'.format(shape))
        delta = x - xp
    add_update(xp, x)
    y = self.kp*x + self.kd*delta
    if self.quantization is None:
        return y
    elif self.quantization == 'herd':
        return herd(y, shape=shape)
    else:
        raise Exception('No quantizer: {}'.format(self.quantization))
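# A small numpy sketch (illustrative only) of the proportional-derivative coding that
# encode()/decode() above implement: y = kp*x + kd*(x - x_prev) on the encoder side,
# x = (y + kd*x_prev) / (kp + kd) on the decoder side, which reconstructs x exactly
# when no quantization is applied.
import numpy as np

def pd_round_trip(xs, kp=0.1, kd=1.0):
    x_prev_enc, x_prev_dec = np.zeros_like(xs[0]), np.zeros_like(xs[0])
    recons = []
    for x in xs:
        y = kp * x + kd * (x - x_prev_enc)          # encode
        x_prev_enc = x
        x_rec = (y + kd * x_prev_dec) / (kp + kd)   # decode
        x_prev_dec = x_rec
        recons.append(x_rec)
    return recons

print(pd_round_trip([np.array([0.5]), np.array([0.7]), np.array([0.7])]))  # reconstructs the inputs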
def compute_grad(self, xc, ec, x_true=None, e_true=None):
    """
    :param xc:
    :param ec:
    :param x_true:
    :param e_true:
    :return:
    """
    x_past = self.x_past*self.r if x_true is None else x_true*(self.kp+self.kd)-xc
    e_past = self.e_past*self.r if e_true is None else e_true*(self.kp+self.kd)-ec
    w_grad = self.scale * (xc.T.dot(e_past+ec) + x_past.T.dot(ec))
    if x_true is None:
        add_update(self.x_past, x_past + xc)
    if e_true is None:
        add_update(self.e_past, e_past + ec)
    return w_grad
def forward_pass_and_state(self, x, count_ops=False):
    # s = quantize(x, mode=self.fwd_quantizer, shape=(self.minibatch_size, self.n_in))
    # s = pd_encode(x, kp=self.kp, kd=self.kd, quantization=self.fwd_quantizer, shape=(self.minibatch_size, self.n_in))
    s = self.encdec.encode(x, shape=(self.minibatch_size, self.n_in))
    # pre_act = pd_decode(s.dot(self.w), kp=self.kp, kd=self.kd, shape=(self.minibatch_size, self.n_out)) + self.b
    pre_act = self.encdec.decode(s.dot(self.w), shape=(self.minibatch_size, self.n_out)) + self.b
    if count_ops:
        add_update(self.fwd_op_count, self.fwd_op_count + abs(s).sum().astype('int64')*self.n_out, accumulate=True)
    # pre_act = s.dot(self.w) + self.b
    out = compute_activation(pre_act, activation_name=self.nonlinearity)
    return out, (x, s, pre_act)
def multi_step(self, inputs, h_init=None, c_init=None, update_states=True):
    """
    Do a chain of steps and update the internal states.

    inputs is a symbolic (n_frames, ...) array
    outputs is a symbolic (n_frames, ...) array
    """
    h_init, c_init = self.get_initial_state(h_init, c_init)
    all_states, updates = theano.scan(
        self.step,
        sequences=[inputs],
        outputs_info=[h_init, c_init],
        )
    h_sequence, c_sequence = all_states
    if update_states:
        add_update(h_init, h_sequence[-1])
        add_update(c_init, c_sequence[-1])
    return h_sequence
def generate(primer, n_steps):
    """
    Generate a sequence of outputs, and update the internal state.

    primer: A sequence to prime on.  This will overwrite the OUTPUT at each time step.
        Note: this means the first iteration will run off the last output from the
        previous call to generate.
    n_steps: Number of steps (after the primer) to run.
    return: A sequence of length n_steps.
    """
    n_primer_steps = primer.shape[0]
    n_total_steps = n_primer_steps + n_steps

    def do_step(i, x_, h_, c_):
        """
        i: The step number (int)
        x_: An input vector
        h_: A hidden state vector
        c_: A memory cell vector
        """
        y_prob, h, c = self.step(x_, h_, c_)
        y_candidate = ifelse(
            int(stochastic),
            rng.multinomial(n=1, pvals=y_prob[None, :])[0].astype(theano.config.floatX),
            y_prob)
        # y_candidate = ifelse(int(stochastic), rng.multinomial(n=1, pvals=y_prob.dimshuffle('x', 1))[0].astype(theano.config.floatX), y_prob)
        y = ifelse(i < n_primer_steps, primer[i], y_candidate)  # Note: If you get an error here, you just need to prime with something on the first call.
        return y, h, c

    (x_gen, h_gen, c_gen), updates = theano.scan(
        do_step,
        sequences=[tt.arange(n_total_steps)],
        outputs_info=[x_init, h_init, c_init],
        )
    updates = list(updates.items())
    if maintain_state:
        updates += [(x_init, x_gen[-1]), (h_init, h_gen[-1]), (c_init, c_gen[-1])]
    for var, val in updates:
        add_update(var, val)
    return x_gen[n_primer_steps:],
def past_weight_grad_step(xs, es, kp_x, kd_x, kp_e, kd_e, shape, dws=None):
    """
    Do an efficient update of the weights given the two spike-trains.

    (This still runs FING SLOWLY!)

    :param xs: An (n_in) vector
    :param es: An (n_out) vector
    :param kp_x:
    :param kd_x:
    :param kp_e:
    :param kd_e:
    :param shape: (n_in, n_out)
    :return: An (n_in, n_out) weight gradient update.
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_in, n_out = shape
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)

    tx_last = create_shared_variable(np.zeros(n_in)+1)
    te_last = create_shared_variable(np.zeros(n_out)+1)
    x_last = create_shared_variable(np.zeros(n_in))
    e_last = create_shared_variable(np.zeros(n_out))

    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    x_spike_ixs, = tt.nonzero(x_spikes)
    e_spike_ixs, = tt.nonzero(e_spikes)

    if dws is None:
        dws = tt.zeros(shape)

    t_last = tt.minimum(tx_last[x_spike_ixs, None], te_last)  # (n_x_spikes, n_out)
    dws = tt.inc_subtensor(dws[x_spike_ixs, :], x_last[x_spike_ixs, None]*e_last
        * rx**(tx_last[x_spike_ixs, None]-t_last)
        * re**(te_last[None, :]-t_last)
        * geoseries_sum(re*rx, t_end=t_last, t_start=1)
        )

    new_x_last = tt.set_subtensor(x_last[x_spike_ixs], x_last[x_spike_ixs]*rx**tx_last[x_spike_ixs] + xs[x_spike_ixs]/as_floatx(kd_x))
    new_tx_last = tt.switch(x_spikes, 0, tx_last)

    t_last = tt.minimum(new_tx_last[:, None], te_last[e_spike_ixs])  # (n_in, n_e_spikes)
    dws = tt.inc_subtensor(dws[:, e_spike_ixs], new_x_last[:, None]*e_last[e_spike_ixs]
        * rx**(new_tx_last[:, None]-t_last)
        * re**(te_last[None, e_spike_ixs]-t_last)
        * geoseries_sum(re*rx, t_end=t_last, t_start=1)
        )

    add_update(x_last, new_x_last)
    add_update(e_last, tt.set_subtensor(e_last[e_spike_ixs], e_last[e_spike_ixs]*re**te_last[e_spike_ixs] + es[e_spike_ixs]/as_floatx(kd_e)))
    add_update(tx_last, new_tx_last+1)
    add_update(te_last, tt.switch(e_spikes, 1, te_last+1))
    return dws
def get_all_signals(self, input_):
    scale = self.get_scale()
    scaled_input = input_*scale
    inc_phi = self.phi + scaled_input
    epsilon = tt.round(inc_phi) - inc_phi
    spikes = inc_phi + epsilon
    # spikes = tt.round(inc_phi)
    new_phi = inc_phi - spikes
    output = spikes / scale
    signals = dict(
        input=input_,
        scaled_input=scaled_input,
        spikes=spikes,
        epsilon=epsilon,
        output=output,
        )
    add_update(self.phi, new_phi)
    return signals
def matrix_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes, epsilon=1e-7):
    """
    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :param epsilon:
    :return: An (n_in, n_out) approximate weight gradient.
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    v1 = create_shared_variable(np.zeros((n_samples, n_in, n_out)))
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))
    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    xr_decayed = xr*rx
    er_decayed = er*re
    spikes = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])
    v2 = xr_decayed[:, :, None]*er_decayed[:, None, :]
    dws = (spikes*(v2-v1))/(rx*re-1)
    new_xr = xr_decayed + xs/(kp_x+kd_x)
    new_er = er_decayed + es/(kp_e+kd_e)
    add_update(v1, tt.switch(spikes, new_xr[:, :, None]*new_er[:, None, :], v1))
    add_update(xr, new_xr)
    add_update(er, new_er)
    return dws.sum(axis=0)
def train(self, x, target):
    out = self.predict(x)
    delta_w = x.T.dot(target - out)
    delta_b = (target - out).sum(axis=0)
    recon = self.backward(out)
    delta_w_rev = out.T.dot(x - recon)
    delta_b_rev = (x - recon).sum(axis=0)
    add_update(self.w, self.w + delta_w)
    add_update(self.w_rev, self.w_rev + delta_w_rev)
    add_update(self.b, self.b + delta_b)
    add_update(self.b_rev, self.b_rev + delta_b_rev)
def past_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    Do an efficient update of the weights given the two spike-trains.

    This isn't actually implemented as an efficient update, but it will produce the
    identical result as if it were.

    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    # TODO: Make this actually use sparsity, one of these days.
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    rx = kd_x / (kp_x + kd_x)
    re = kd_e / (kp_e + kd_e)

    tx_last = create_shared_variable(np.zeros((n_samples, n_in)) + 1)
    te_last = create_shared_variable(np.zeros((n_samples, n_out)) + 1)
    x_last = create_shared_variable(np.zeros((n_samples, n_in)))
    e_last = create_shared_variable(np.zeros((n_samples, n_out)))

    t_last = tt.minimum(tx_last[:, :, None], te_last[:, None, :])
    x_spikes = tt.neq(xs, 0)
    dw_potentials = x_last[:, :, None] * e_last[:, None, :] * \
        rx**(tx_last[:, :, None]-t_last) \
        * re**(te_last[:, None, :]-t_last) \
        * geoseries_sum(rx*re, t_end=t_last, t_start=1)
    e_spikes = tt.neq(es, 0)
    dws = (x_spikes[:, :, None] + e_spikes[:, None, :] - x_spikes[:, :, None] * e_spikes[:, None, :]) * dw_potentials  # (n_samples, n_in, n_out)

    add_update(x_last, tt.switch(x_spikes, x_last * rx**tx_last + xs / as_floatx(kd_x), x_last))
    add_update(e_last, tt.switch(e_spikes, e_last * rx**te_last + es / as_floatx(kd_e), e_last))
    add_update(tx_last, tt.switch(x_spikes, 1, tx_last + 1))
    add_update(te_last, tt.switch(e_spikes, 1, te_last + 1))
    return dws.sum(axis=0)
def _update_param(self, param, gradient):
    mom1 = theano.shared(np.zeros_like(param.get_value()))
    mom2 = theano.shared(np.zeros_like(param.get_value()))
    mom1_new = mom1 + self._beta_1 * (gradient - mom1)
    mom2_new = tt.maximum(abs(gradient) + self._eps, (1. - self._beta_2) * mom2)
    new_param = param - self._alpha * mom1_new / mom2_new
    add_update(param, new_param)
    add_update(mom1, mom1_new)
    add_update(mom2, mom2_new)
def past_weight_grad_calculator_reloaded(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    Do an efficient update of the weights given the two spike-trains.

    This isn't actually implemented as an efficient update, but it will produce the
    identical result as if it were.

    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    # TODO: RESOLVE INSTABILITY ISSUE
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    rx = kd_x / (kp_x + kd_x)
    re = kd_e / (kp_e + kd_e)

    tx_last = create_shared_variable(np.zeros((n_samples, n_in)))
    te_last = create_shared_variable(np.zeros((n_samples, n_out)))
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))

    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)

    t_last = tt.maximum(tx_last[:, :, None], te_last[:, None, :])
    sum_to_last = geoseries_sum(rx * re, t_start=t_last, t_end=0)  # Wasteful, since most of this is multiplied by zeros later, but for now it doesn't matter.

    spikes = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])
    dw_es = (xr[:, :, None] * er[:, None, :] * spikes) * sum_to_last  # PROBLEM HERE!!!! Can be a very small number times a very large number.
    # dw_es = (xr[:, :, None]*(x_spikes[:, :, None]-x_spikes[:, :, None]*e_spikes[:, None, :]) * er[:, None, :] + xr[:, :, None] * (er*e_spikes)[:, None, :]) * sum_to_last

    add_update(xr, xr * rx + xs / (kp_x + kd_x))
    add_update(er, er * re + es / (kp_e + kd_e))
    add_update(tx_last, tt.switch(x_spikes, 0, tx_last - 1))
    add_update(te_last, tt.switch(e_spikes, 0, te_last - 1))
    return dw_es.sum(axis=0)
def train(x, y):
    p_wa = compute_p_wa(w, x, y, alpha)

    # Now, the herding part... here're the 3 lines from the minipaper
    phi_alpha = phi[alpha] + p_wa
    w_sample = phi_alpha > 0.5
    new_phi_alpha = phi_alpha - w_sample

    add_update(w, tt.set_subtensor(w[alpha], w_sample))
    add_update(phi, tt.set_subtensor(phi[alpha], new_phi_alpha))
    add_update(alpha, (alpha+1) % n_dim_in)
def _update_param(self, param, gradient):
    # Initialize variables
    i = create_shared_variable(0.)
    m = theano.shared(param.get_value() * 0.)
    v = theano.shared(param.get_value() * 0.)
    # Recompute values
    i_t = i + 1.
    fix1 = 1. - (1. - self.beta_1)**i_t
    fix2 = 1. - (1. - self.beta_2)**i_t
    lr_t = self.alpha * (tt.sqrt(fix2) / fix1)
    m_t = (self.beta_1 * gradient) + ((1. - self.beta_1) * m)
    v_t = (self.beta_2 * tt.sqr(gradient)) + ((1. - self.beta_2) * v)
    g_t = m_t / (tt.sqrt(v_t) + self.eps)
    p_t = param - (lr_t * g_t)
    add_update(param, p_t)
    add_update(m, m_t)
    add_update(v, v_t)
    add_update(i, i_t)
def past_weight_grad_calculator2(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    This attempt never really got off the ground.  It doesn't work.
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)

    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))
    # xr_new = xr*rx + xs/(kp_x+kd_x)
    # er_new = er*re + es/(kp_e+kd_e)
    arr = rx*re/(1-rx*re)
    xr_new = xr*arr + xs/(kp_x+kd_x)
    er_new = er*arr + es/(kp_e+kd_e)

    xsum = create_shared_variable(np.zeros((n_samples, n_in)))
    esum = create_shared_variable(np.zeros((n_samples, n_out)))
    xsum_new = xsum + xr_new
    esum_new = esum + er_new
    x_nospikes = tt.eq(xs, 0)
    e_nospikes = tt.eq(es, 0)

    dw = xs.T.dot(esum_new) + xsum_new.T.dot(es)

    add_update(xr, xr_new)
    add_update(er, er_new)
    add_update(xsum, xsum_new*x_nospikes)
    add_update(esum, esum_new*e_nospikes)

    return xs.T.dot(er) + xr.T.dot(es)
    # return xr.T.dot(er)
    # return dw
def backward_pass(self, state, grad, cost=None, count_ops=False):
    """
    :param grad: An integer (n_samples, n_dim_out) gradient estimate
    :return: (delta, param_gradients)
        Where:
            delta: A (n_samples, n_dim_in) integer gradient estimate
    """
    assert (grad is None) != (cost is None), "You can either pass grad or cost"
    ap, ap_q, z = state
    if cost is None:
        filters = tt.grad(compute_activation(z, activation_name=self.nonlinearity).sum(), wrt=z)
        grad_z = filters*grad
    elif grad is None:
        grad_z = tt.grad(cost, wrt=z)
    # sb = quantize(pre_act_grad, mode=self.back_quantizer, shape=(self.minibatch_size, self.n_out))
    # grad_z_q = pd_encode(grad_z, kp=self.kp_back, kd=self.kd_back, quantization=self.back_quantizer, shape=(self.minibatch_size, self.n_out))
    grad_z_q = self.encdec_back.encode(grad_z, shape=(self.minibatch_size, self.n_out))
    if count_ops:
        add_update(self.back_op_count, self.back_op_count + abs(grad_z_q).sum().astype('int64')*self.n_in, accumulate=True)
    # grad_ap = pd_decode(grad_z_q.dot(self.w.T), kp=self.kp_back, kd=self.kd_back, shape=(self.minibatch_size, self.n_in))
    grad_ap = self.encdec_back.decode(grad_z_q.dot(self.w.T), shape=(self.minibatch_size, self.n_in))
    if self.grad_calc in ('true', 'xx', 'recon'):
        # Dense op count
        add_update(self.update_op_count, self.back_op_count + self.minibatch_size*self.n_in*self.n_out)
    elif self.grad_calc in ('future', 'future-true', 'past', 'past_step', 'past_reloaded', 'past_matrix'):
        # Sparse op count
        add_update(self.update_op_count, self.update_op_count + abs(ap_q).sum().astype('int64')*self.n_out + abs(grad_z_q).sum().astype('int64')*self.n_in)
    else:
        raise NotImplementedError('No op-count method for {}'.format(self.grad_calc))
    w_grad = self._get_past_gradient(ap, grad_z, ap_q, grad_z_q, grad_calc=self.grad_calc)
    b_grad = grad_z_q.sum(axis=0) if self.grad_calc[-1] == 's' else grad_z.sum(axis=0)
    return grad_ap, [w_grad, b_grad]
def _update_param(self, param, gradient):
    add_update(param, param - self._eta*gradient + 2*tt.sqrt(self._eta)*self._rng.normal(size=param.ishape))
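# This looks like a stochastic-gradient-Langevin-style update: a plain gradient step
# plus Gaussian noise scaled by the step size.  A numpy sketch of one such step
# (illustrative only; the noise scaling mirrors the snippet above, not any particular
# paper).
import numpy as np

def noisy_gradient_step(param, gradient, eta=1e-3, rng=np.random):
    return param - eta * gradient + 2 * np.sqrt(eta) * rng.normal(size=param.shape)

p = noisy_gradient_step(np.zeros(5), gradient=np.ones(5))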
def update(self):
    add_update(self._var, self._var+1)
def __call__(self):
    counter = theano.shared(np.zeros((), dtype='int') + self._initial_value)
    add_update(counter, counter+1)
    return counter
def lying_function_that_says_its_stateless_but_has_state():
    add_update(var, var+1)
    return var+1
def honest_function_that_actually_updates():
    add_update(var, var+1)
def running_sum(x):
    s = create_shared_variable(0.)
    new_s = s + x
    add_update(s, new_s)
    return new_s
def count(self):
    add_update(self._count_var, self._count_var+1)
    return self._count_var
def _add_alpha_update(self):
    new_alpha = (self._alpha + self._n_alpha) % self._w.shape[0] \
        if self._alpha_update_policy == 'sequential' else \
        self._rng.choice(a=self._w.shape[0], size=(self._n_alpha, ), replace=False).reshape([-1])  # Reshape is for some reason necessary when n_alpha=1
    add_update(self._alpha, new_alpha)
def _update_param(self, param, gradient):
    add_update(param, param - gradient)
def _update_param(self, param, gradient):
    add_update(param, param - self._eta*gradient)