def logp(self, x):
    """Build the symbolic log-density of ``x``.

    The returned expression is the sum of four terms, in this order:
    the constant from ``_lkj_normalizing_constant(eta, n)``, a weighted sum
    of the logs of the rescaled diagonal entries, the log-density of the
    derived standard deviations under ``self.sd_dist``, and a log-Jacobian
    correction.

    ``x`` is indexed as a flat vector and ``self.diag_idxs`` selects entries
    from it (presumably the diagonal of a packed triangular factor --
    confirm against the class docstring).
    """
    dim = self.n
    shape_param = self.eta
    diag_pos = self.diag_idxs
    row_index = tt.arange(dim)

    # Sum of squares per segment, obtained as differences of the running
    # sum of squares taken at consecutive diagonal positions.
    running_sq = tt.cumsum(x ** 2)
    segment_sq = tt.zeros(dim)
    segment_sq = tt.inc_subtensor(segment_sq[0], x[0] ** 2)
    segment_sq = tt.inc_subtensor(
        segment_sq[1:],
        running_sq[diag_pos[1:]] - running_sq[diag_pos[:-1]],
    )
    sd_vals = tt.sqrt(segment_sq)

    sd_term = self.sd_dist.logp(sd_vals).sum()

    corr_diag = x[diag_pos] / sd_vals
    lkj_term = tt.sum(
        (2 * shape_param - 3 + dim - row_index) * tt.log(corr_diag)
    )

    # Log-determinant of the Jacobian of the second transformation (the
    # original comment points at the class docstring for its derivation).
    jacobian_term = (tt.log(corr_diag) - row_index * tt.log(sd_vals)).sum()

    constant = _lkj_normalizing_constant(shape_param, dim)
    return constant + lkj_term + sd_term + jacobian_term
def __init__(self, vocab_size, dim, lr=0.5):
    """Create the two embedding tables and compile ``self.cost_fn``.

    Parameters
    ----------
    vocab_size : int
        Number of rows of each embedding table.
    dim : int
        Number of columns of each embedding table.
    lr : float
        Step-size numerator used in the per-row scaled updates.

    ``self.cost_fn(ind_W, ind_W1, X, fX)`` returns the weighted squared
    error ``sum(fX * (<W[ind_W], W1[ind_W1]> - X) ** 2)`` and applies the
    updates built below.
    """
    float_type = theano.config.floatX
    table_shape = (vocab_size, dim)

    # The two random draws happen in this order (W first, then W1); keep
    # it so results under a fixed NumPy seed do not change.
    initial_w = np.asarray(np.random.rand(*table_shape), dtype=float_type) / float(dim)
    initial_w1 = np.asarray(np.random.rand(*table_shape), dtype=float_type) / float(dim)
    self.W = theano.shared(initial_w, name='W', borrow=True)
    self.W1 = theano.shared(initial_w1, name='W1', borrow=True)

    # Squared-gradient accumulators, initialised to ones.
    self.gW = theano.shared(
        np.asarray(np.ones(table_shape), dtype=float_type), name='gW', borrow=True)
    self.gW1 = theano.shared(
        np.asarray(np.ones(table_shape), dtype=float_type), name='gW1', borrow=True)

    X = T.vector()
    fX = T.vector()
    ind_W = T.ivector()
    ind_W1 = T.ivector()

    w = self.W[ind_W, :]
    w1 = self.W1[ind_W1, :]
    prediction = T.sum(w * w1, axis=1)
    cost = T.sum(fX * ((prediction - X) ** 2))

    # Both gradients are clipped element-wise to [-5, 5] in a single call.
    grad = T.clip(T.grad(cost, [w, w1]), -5.0, 5.0)
    grad_w, grad_w1 = grad[0], grad[1]

    # The parameter steps read the accumulator values from *before* this
    # call's accumulator update (all updates see pre-update shared values).
    updates = [
        (self.gW, T.inc_subtensor(self.gW[ind_W, :], grad_w ** 2)),
        (self.gW1, T.inc_subtensor(self.gW1[ind_W1, :], grad_w1 ** 2)),
        (self.W, T.inc_subtensor(
            self.W[ind_W, :], - (lr / T.sqrt(self.gW[ind_W, :])) * grad_w)),
        (self.W1, T.inc_subtensor(
            self.W1[ind_W1, :], - (lr / T.sqrt(self.gW1[ind_W1, :])) * grad_w1)),
    ]

    self.cost_fn = theano.function(
        inputs=[ind_W, ind_W1, X, fX], outputs=cost, updates=updates)
def recurrence(log_p_curr, log_p_prev, skip_mask=None):
    """Advance a log-space forward recursion by one step.

    ``log_p_prev`` is rescaled by its row-wise maximum and exponentiated,
    then each position accumulates (a) its own previous value, (b) the
    previous value one position to the left, and (c) for positions
    ``3, 5, ...`` the previous value two positions to the left, gated by
    ``skip_mask``.  The sum is mapped back to log space and added to
    ``log_p_curr``.

    When ``skip_mask`` is ``None`` every skip transition is enabled.
    """
    if skip_mask is None:
        skip_mask = T.ones_like(log_p_curr[:, 1:-2:2])

    # Normalise by the row maximum and move to probability space;
    # -inf entries become exactly 0.
    row_max = T.max(log_p_prev, axis=1, keepdims=True)
    prev_probs = T.switch(
        T.isinf(log_p_prev), 0, T.exp(log_p_prev - row_max))

    # Stay transition.
    summed = prev_probs
    # Shift-by-one transition.
    summed = T.inc_subtensor(summed[:, 1:], prev_probs[:, :-1])
    # Skip-by-two transition, only where the mask allows it.
    gated_skips = T.switch(skip_mask, prev_probs[:, 1:-2:2], 0)
    summed = T.inc_subtensor(summed[:, 3::2], gated_skips)

    # Positions that received no mass stay at -inf instead of log(0).
    return T.switch(
        T.eq(summed, 0),
        -np.inf,
        log_p_curr + T.log(summed) + row_max,
    )
def log_likelihood(self):
    """Return the negated, L2-regularised log-likelihood as a Theano expression.

    The last column of ``self.L`` and of ``self.R`` is treated as a bias
    term; the remaining columns are the factors.  With
    ``A = L_f . R_f^T + user_bias + item_bias^T`` the value is

        sum(A * counts) - sum((counts + 1) * log(1 + exp(A)))
        - 0.5 * reg_param * (||L_f||^2 + ||R_f||^2)

    and the negation of that is returned so it can be minimised.
    """
    user_factors = self.L[:, :-1]
    item_factors = self.R[:, :-1]
    user_bias = self.L[:, -1].reshape((-1, 1))
    item_bias = self.R[:, -1].reshape((-1, 1))

    # Scores: factor product plus broadcast row and column biases.
    scores = T.dot(user_factors, item_factors.T)
    scores = T.inc_subtensor(scores[:, :], user_bias)
    scores = T.inc_subtensor(scores[:, :], item_bias.T)

    loglik = T.sum(scores * self.counts)
    softplus = T.log(T.exp(scores) + 1)
    loglik = loglik - T.sum((self.counts + 1) * softplus)

    # L2 regularisation on the factor columns only (biases excluded).
    loglik = loglik - 0.5 * self.reg_param * T.sum(T.square(user_factors))
    loglik = loglik - 0.5 * self.reg_param * T.sum(T.square(item_factors))

    # Negated because the caller minimises a cost.
    return -loglik
def power_pool_2d(x, ds, p=3, b=0):
    """Power-average pooling over non-overlapping ``ds`` windows.

    For every ``d0 x d1`` window the result is
    ``(mean(|x| ** p) + b ** p) ** (1 / p)``, where the mean is taken over
    the elements that actually fall inside the input, so partial windows
    at the border are divided by their true element count.

    Parameters
    ----------
    x : symbolic tensor unpacked as ``(n_batch, n_ch, s0, s1)``
    ds : pair ``(d0, d1)`` of Python ints
        Window size; it is passed to ``range`` so it must be concrete.
    p : number
        Exponent of the power mean.
    b : number
        Offset added (as ``b ** p``) before the root is taken.
    """
    n_batch, n_ch, s0, s1 = x.shape
    d0, d1 = ds
    c = tt.ones((s0, s1))

    # Start from the top-left element of every window ...
    y = tt.abs_(x[:, :, 0::d0, 0::d1])**p
    d = c[0::d0, 0::d1].copy()

    # ... then add the remaining offsets of the window one at a time.
    for i in range(0, d0):
        for j in range(0, d1):
            if i != 0 or j != 0:
                # Number of windows that contain offset (i, j).  Floor
                # division keeps these integer-typed: with `/` they become
                # floats on Python 3 and cannot be used as slice bounds.
                ni = (s0 - i - 1) // d0 + 1
                nj = (s1 - j - 1) // d1 + 1
                xij = tt.abs_(x[:, :, i::d0, j::d1])**p
                y = tt.inc_subtensor(y[:, :, :ni, :nj], xij)
                d = tt.inc_subtensor(d[:ni, :nj], c[i::d0, j::d1])

    # Divide by the number of elements that contributed to each window.
    y /= d
    y += b**p
    # Take the p-th root.
    y = y**(1. / p)
    return y
def update_log_p(skip_idxs, zeros, active, log_p_curr, log_p_prev):
    """Advance a log-space recursion restricted to the first ``active`` states.

    Returns ``(active_next, log_p_next)``.  ``active_next`` is the new
    number of active states: at least ``active + 1``, far enough to cover
    the target of the furthest usable skip transition, and never more
    than ``log_p_curr.shape[0]``.  ``log_p_next`` is ``zeros`` with its
    first ``active_next`` entries overwritten by the updated log values.

    ``zeros`` is used as a scratch buffer (presumably all zeros, as the
    name suggests -- confirm at the call site).
    """
    # Skip sources that lie inside the currently active prefix.
    usable_skips = skip_idxs[(skip_idxs < active).nonzero()]

    # The appended -1 keeps the max defined when there are no usable skips.
    furthest_skip = T.max(T.concatenate([usable_skips, [-1]]))
    wanted = T.maximum(active + 1, furthest_skip + 2 + 1)
    active_next = T.cast(T.minimum(wanted, log_p_curr.shape[0]), 'int32')

    # Rescale by the maximum before exponentiating.
    common_factor = T.max(log_p_prev[:active])
    p_prev = T.exp(log_p_prev[:active] - common_factor)

    # Copy the active probabilities into a buffer of the new width.
    spread = zeros[:active_next]
    spread = T.set_subtensor(spread[:active], p_prev)
    # Shift-by-one transitions.
    spread = T.inc_subtensor(spread[1:], spread[:-1])
    # Skip transitions: source index -> source index + 2.
    spread = T.inc_subtensor(spread[usable_skips + 2], p_prev[usable_skips])

    updated_log_p_prev = T.log(spread) + common_factor
    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev,
    )
    return active_next, log_p_next
def log_likelihood(self):
    """Return the negated, regularised log-likelihood as a Theano expression.

    The last column of ``self.U`` and the last row of ``self.V`` are
    treated as bias terms, and ``self.S[-1, -1]`` scales both of them
    (through its square root).  With

        A = U_f . S[:-1, :-1] . V_f
            + sqrt(S[-1, -1]) * (user_bias + item_bias^T)

    the value is ``sum(A * counts) - sum((counts + 1) * log(1 + exp(A)))``
    minus an L2 penalty on the diagonal of ``S`` without its last entry.
    The negation is returned so it can be minimised.
    """
    user_factors = self.U[:, :-1]
    core = self.S[:-1, :-1]
    item_factors = self.V[:-1, :]
    user_bias = self.U[:, -1].reshape((-1, 1))
    item_bias = self.V[-1, :].reshape((-1, 1))
    bias_scale = T.sqrt(self.S[-1, -1])

    scores = T.dot(T.dot(user_factors, core), item_factors)
    scores = T.inc_subtensor(scores[:, :], user_bias * bias_scale)
    scores = T.inc_subtensor(scores[:, :], item_bias.T * bias_scale)

    loglik = T.sum(scores * self.counts)
    softplus = T.log(T.exp(scores) + 1)
    loglik = loglik - T.sum((self.counts + 1) * softplus)

    # L2 regularisation on diag(S), excluding the bias-scale entry.
    loglik = loglik - 0.5 * self.reg_param * T.sum(T.square(T.diag(self.S)[:-1]))

    # Negated because the caller minimises a cost.
    return -loglik
def adadelta(self, param, grad, updates, sample_idx=None, epsilon=1e-6):
    """Register Adadelta accumulators for ``param`` and return the scaled gradient.

    Two zero-initialised shared accumulators shaped like ``param`` are
    created and their update expressions are stored in ``updates``.
    When ``sample_idx`` is given, ``grad`` refers only to the rows
    ``param[sample_idx]`` and only those rows of the accumulators change.

    Side effect: if ``self.learning_rate`` is not 1.0 a warning is printed
    and it is reset to 1.0.

    Returns ``grad * sqrt(grad_scaling)`` cast to ``floatX``.
    """
    decay = np.float32(self.adapt_params[0])
    keep = np.float32(1.0 - self.adapt_params[0])
    acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
    upd = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
    grad_sq = grad**2

    if sample_idx is not None:
        # Decay the touched rows with set_subtensor, then *add* the new
        # contribution with inc_subtensor: slower than a single
        # set_subtensor, but accurate when an index occurs multiple times.
        acc_rows = acc[sample_idx]
        updates[acc] = T.inc_subtensor(
            T.set_subtensor(acc_rows, acc_rows * decay)[sample_idx],
            keep * grad_sq)
        acc_new = updates[acc][sample_idx]

        upd_rows = upd[sample_idx]
        grad_scaling = (upd_rows + epsilon) / (acc_new + epsilon)
        updates[upd] = T.inc_subtensor(
            T.set_subtensor(upd_rows, upd_rows * decay)[sample_idx],
            keep * grad_scaling * grad_sq)
    else:
        acc_new = decay * acc + keep * grad_sq
        updates[acc] = acc_new
        grad_scaling = (upd + epsilon) / (acc_new + epsilon)
        updates[upd] = decay * upd + keep * grad_scaling * grad_sq

    gradient_scaling = T.cast(T.sqrt(grad_scaling), theano.config.floatX)

    if self.learning_rate != 1.0:
        print('Warn: learning_rate is not 1.0 while using adadelta. Setting learning_rate to 1.0')
        self.learning_rate = 1.0

    return grad * gradient_scaling
def adam(self, param, grad, updates, sample_idx=None, epsilon=1e-6):
    """Register Adam accumulators for ``param`` and return the step direction.

    Three zero-initialised shared variables shaped like ``param`` are
    created: the running mean of the gradient (decay
    ``self.adapt_params[0]``, i.e. beta1), the running mean of the squared
    gradient (decay ``self.adapt_params[1]``, i.e. beta2) and a step
    counter.  Their update expressions are stored in ``updates``.

    When ``sample_idx`` is given, ``grad`` refers only to the rows
    ``param[sample_idx]`` and only those rows of the accumulators change.

    Returns the bias-corrected ratio
    ``m_hat / (sqrt(v_hat) + epsilon)`` with
    ``m_hat = m / (1 - beta1 ** t)`` and ``v_hat = v / (1 - beta2 ** t)``.

    Fixes relative to the previous version:

    * the sampled branch accumulated ``grad ** 2`` into the first moment;
      it now accumulates ``grad``, matching the dense branch;
    * the second moment was bias-corrected with beta1; it now uses beta2.
    """
    beta1 = np.float32(self.adapt_params[0])
    one_minus_beta1 = np.float32(1.0 - self.adapt_params[0])
    beta2 = np.float32(self.adapt_params[1])
    one_minus_beta2 = np.float32(1.0 - self.adapt_params[1])

    acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
    meang = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
    countt = theano.shared(param.get_value(borrow=False) * 0., borrow=True)

    if sample_idx is None:
        acc_new = beta2 * acc + one_minus_beta2 * (grad**2)
        meang_new = beta1 * meang + one_minus_beta1 * grad
        countt_new = countt + 1
        updates[acc] = acc_new
        updates[meang] = meang_new
        updates[countt] = countt_new
    else:
        acc_s = acc[sample_idx]
        meang_s = meang[sample_idx]
        countt_s = countt[sample_idx]

        # Decay the touched rows with set_subtensor, then add the new
        # contribution with inc_subtensor: slower than a single
        # set_subtensor, but accurate when an index occurs multiple times.
        updates[acc] = T.inc_subtensor(
            T.set_subtensor(acc_s, acc_s * beta2)[sample_idx],
            one_minus_beta2 * (grad**2))
        acc_new = updates[acc][sample_idx]

        # First moment: running mean of the gradient itself (not squared).
        updates[meang] = T.inc_subtensor(
            T.set_subtensor(meang_s, meang_s * beta1)[sample_idx],
            one_minus_beta1 * grad)
        meang_new = updates[meang][sample_idx]

        countt_new = countt_s + 1.0
        updates[countt] = T.set_subtensor(countt_s, countt_new)

    first_moment_hat = meang_new / (1 - beta1**countt_new)
    second_moment_hat = acc_new / (1 - beta2**countt_new)
    return first_moment_hat / (T.sqrt(second_moment_hat) + epsilon)
def fac_vis(self, x_phid, x_shid):
    """Compute the factors of the visible units for every view.

    Returns a list with one tensor per view, indexed as
    ``fac_vis[view][node, sample, statistic]``.  Each statistic slice
    starts from the visible bias (or from the fixed bias value when
    ``fixed_bias[statistic]`` is set) and then receives the contributions
    of the view's private hidden units and of the shared hidden units.
    """
    facv_vis = [
        T.zeros(
            (self.n_vis_nodes[view], self.n_samples, self.vis[view].n_statistics),
            dtype=theano.config.floatX)
        for view in range(self.n_views)
    ]

    shared_feats = self.shid.f(x_shid)

    for view in range(self.n_views):
        private_feats = self.phid[view].f(x_phid[view])
        factors = facv_vis[view]

        for statistic in range(self.vis[view].n_statistics):
            # Learned bias, broadcast over the sample axis.
            factors = T.set_subtensor(
                factors[:, :, statistic],
                self.bias_vis[view][:, statistic].dimshuffle(0, 'x'))

            # A fixed bias overrides the learned one.
            if self.vis[view].fixed_bias[statistic]:
                factors = T.set_subtensor(
                    factors[:, :, statistic],
                    self.vis[view].fixed_bias_value[statistic])

            # Contribution of the private hidden units of this view.
            for from_statistic in range(self.phid[view].n_statistics):
                factors = T.inc_subtensor(
                    factors[:, :, statistic],
                    T.dot(self.weights_priv[view][:, statistic, :, from_statistic].T,
                          private_feats[:, :, from_statistic]))

            # Contribution of the shared hidden units.
            for from_statistic in range(self.shid.n_statistics):
                factors = T.inc_subtensor(
                    factors[:, :, statistic],
                    T.dot(self.weights_shrd[view][:, statistic, :, from_statistic].T,
                          shared_feats[:, :, from_statistic]))

        facv_vis[view] = factors

    return facv_vis
def scan(self, x, z, non_sequences, i, outputs_info, W_re, W_in, b, go_backwards=False, truncate_gradient=-1):
    """Run the bidirectional op on the two halves of ``z``.

    ``z`` is split along its last axis: the first half feeds the forward
    direction, the second half (reversed along axis 0) feeds the backward
    direction.  A second recurrent weight matrix for the backward
    direction is created and registered on ``self.parent``.  The initial
    state ``outputs_info[0]`` is folded into the first step of both halves.

    ``x``, ``non_sequences``, ``W_in``, ``b``, ``go_backwards`` and
    ``truncate_gradient`` are accepted for interface compatibility and not
    used here.

    Returns a two-element list: the concatenation of ``result[0]`` with
    the re-reversed ``result[1]`` along axis 2, and the concatenation of
    ``result[4]`` with the reversed ``result[5]`` along axis 1 with a
    leading broadcastable axis added.
    """
    W_re_b = self.parent.add_param(
        self.parent.create_recurrent_weights(
            self.n_units, self.n_re, name="W_re_b_%s" % self.parent.name))

    # Floor division: the split point is used as a slice bound and must be
    # an integer.  `/` is true division on Python 3 and gives a float.
    half = z.shape[2] // 2
    z_f = z[:, :, :half]
    z_b = z[::-1, :, half:]

    z_f = T.inc_subtensor(z_f[0], T.dot(outputs_info[0], W_re))
    z_b = T.inc_subtensor(z_b[0], T.dot(outputs_info[0], W_re_b))

    result = BLSTMOpInstance(
        z_f, z_b, W_re, W_re_b,
        outputs_info[1], T.zeros_like(outputs_info[1]),
        i, i[::-1])

    return [
        T.concatenate([result[0], result[1][::-1]], axis=2),
        T.concatenate([result[4], result[5][::-1]], axis=1).dimshuffle('x', 0, 1),
    ]
def gs_recurrence(p_curr, p_prev):
    """One step of the forward recursion in probability space.

    Every position accumulates its own previous value, the previous value
    one position to the left and, for positions ``3, 5, ...``, the previous
    value two positions to the left.  The sum is multiplied by ``p_curr``.
    """
    # Stay transition, then shift-by-one transition.
    reachable = T.inc_subtensor(p_prev[1:], p_prev[:-1])
    # Skip-by-two transition.
    reachable = T.inc_subtensor(reachable[3::2], p_prev[1:-2:2])
    # Weight by the current step.
    return reachable * p_curr
def add_synap_post_inp(i, po, p, s, q):
    """Scan step: bump the post trace and the weights of column ``i``.

    Arguments (from the original notes): ``i`` sequence index, ``po`` post
    trace, ``p`` pre trace, ``s`` dA increment, ``q`` weight matrix ``W``.

    ``s`` is added to ``po[., i]`` at the positions where the first
    ``self.Ne`` entries of ``q[:, i]`` are non-zero; ``p[:, i]`` is added
    to ``q[:, i]`` and the result is clipped to ``[0, self.wmax]``.

    Returns a dict mapping ``po`` and ``q`` to their new values.
    ``self`` comes from the enclosing scope.
    """
    connected = T.nonzero(q[:self.Ne, i])
    new_post = T.inc_subtensor(po[connected, i], s)
    new_weights = T.clip(T.inc_subtensor(q[:, i], p[:, i]), 0, self.wmax)
    return {po: new_post, q: new_weights}
def past_weight_grad_step(xs, es, kp_x, kd_x, kp_e, kd_e, shape, dws=None):
    """
    Accumulate the weight update for one step, touching only the rows and
    columns whose input or error unit spiked (non-zero entry).

    Four shared variables are created and updated through ``add_update``:
    the last value and the "steps since last spike" counter for both
    inputs and errors.

    :param xs: An (n_in) vector
    :param es: An (n_out) vector
    :param kp_x: combined with kd_x into the decay kd_x/(kp_x+kd_x)
    :param kd_x: see kp_x; new input spikes are also divided by it
    :param kp_e: combined with kd_e into the decay kd_e/(kp_e+kd_e)
    :param kd_e: see kp_e; new error spikes are also divided by it
    :param shape: (n_in, n_out)
    :param dws: Optional tensor to accumulate into; zeros when None
    :return: dws with the contributions of this step added
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_in, n_out = shape
    decay_x = kd_x/(kp_x+kd_x)
    decay_e = kd_e/(kp_e+kd_e)

    # Persistent state; creation order is kept as in the original.
    tx_last = create_shared_variable(np.zeros(n_in)+1)
    te_last = create_shared_variable(np.zeros(n_out)+1)
    x_last = create_shared_variable(np.zeros(n_in))
    e_last = create_shared_variable(np.zeros(n_out))

    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    x_spike_ixs, = tt.nonzero(x_spikes)
    e_spike_ixs, = tt.nonzero(e_spikes)

    if dws is None:
        dws = tt.zeros(shape)

    # --- Rows of inputs that spiked now: (n_x_spikes, n_out) -------------
    tx_rows = tx_last[x_spike_ixs, None]
    t_last = tt.minimum(tx_rows, te_last)
    row_update = (
        x_last[x_spike_ixs, None]*e_last
        * decay_x**(tx_rows-t_last)
        * decay_e**(te_last[None, :]-t_last)
        * geoseries_sum(decay_e*decay_x, t_end=t_last, t_start=1)
    )
    dws = tt.inc_subtensor(dws[x_spike_ixs, :], row_update)

    # Refresh the input state before handling the error spikes, which
    # must see the post-spike input values and counters.
    new_x_last = tt.set_subtensor(
        x_last[x_spike_ixs],
        x_last[x_spike_ixs]*decay_x**tx_last[x_spike_ixs]
        + xs[x_spike_ixs]/as_floatx(kd_x))
    new_tx_last = tt.switch(x_spikes, 0, tx_last)

    # --- Columns of errors that spiked now: (n_in, n_e_spikes) -----------
    tx_cols = new_tx_last[:, None]
    t_last = tt.minimum(tx_cols, te_last[e_spike_ixs])
    col_update = (
        new_x_last[:, None]*e_last[e_spike_ixs]
        * decay_x**(tx_cols-t_last)
        * decay_e**(te_last[None, e_spike_ixs]-t_last)
        * geoseries_sum(decay_e*decay_x, t_end=t_last, t_start=1)
    )
    dws = tt.inc_subtensor(dws[:, e_spike_ixs], col_update)

    new_e_last = tt.set_subtensor(
        e_last[e_spike_ixs],
        e_last[e_spike_ixs]*decay_e**te_last[e_spike_ixs]
        + es[e_spike_ixs]/as_floatx(kd_e))

    add_update(x_last, new_x_last)
    add_update(e_last, new_e_last)
    add_update(tx_last, new_tx_last+1)
    add_update(te_last, tt.switch(e_spikes, 1, te_last+1))
    return dws
def RMSprop(self, cost, params, full_params, sampled_params, sidxs, epsilon=1e-6):
    """Build the update dictionary for dense and row-sampled parameters.

    Parameters
    ----------
    cost : symbolic scalar to differentiate.
    params : list of lists of shared variables updated densely.
    full_params : shared variables of which only some rows are used.
    sampled_params : the row selections ``full_params[k][sidxs[k]]`` that
        appear in ``cost``.
    sidxs : the row indices matching ``sampled_params``.
    epsilon : unused here; kept for interface compatibility.

    Despite the name, the adaptive rule is chosen by ``self.adapt``
    ('adagrad', 'rmsprop', 'adadelta' or 'adam'); any other value leaves
    the gradient unscaled.  Optional gradient-norm capping
    (``self.grad_cap``), weight decay (``self.lmbd``) and momentum
    (``self.momentum``) are applied as well.

    Returns an ``OrderedDict`` of shared variable -> new value.
    """
    grads = [T.grad(cost=cost, wrt=param) for param in params]
    sgrads = [T.grad(cost=cost, wrt=sparam) for sparam in sampled_params]
    updates = OrderedDict()

    if self.grad_cap > 0:
        # Global L2 norm over every dense and sampled gradient.
        dense_sq = T.sum([T.sum([T.sum(g**2) for g in g_list]) for g_list in grads])
        sampled_sq = T.sum([T.sum(g**2) for g in sgrads])
        norm = T.cast(T.sqrt(dense_sq + sampled_sq), theano.config.floatX)

        def cap(g):
            # Rescale only when the global norm reaches the cap.
            return T.switch(T.ge(norm, self.grad_cap), g*self.grad_cap/norm, g)

        grads = [[cap(g) for g in g_list] for g_list in grads]
        sgrads = [cap(g) for g in sgrads]

    # ---- dense parameters ------------------------------------------------
    for p_list, g_list in zip(params, grads):
        for p, g in zip(p_list, g_list):
            if self.adapt == 'adagrad':
                g = self.adagrad(p, g, updates)
            elif self.adapt == 'rmsprop':
                g = self.rmsprop(p, g, updates)
            elif self.adapt == 'adadelta':
                g = self.adadelta(p, g, updates)
            elif self.adapt == 'adam':
                g = self.adam(p, g, updates)

            # self.learning_rate is read only after the adaptive call
            # because adadelta may reset it.
            if self.momentum > 0:
                velocity = theano.shared(p.get_value(borrow=False) * 0., borrow=True)
                velocity2 = self.momentum * velocity - np.float32(self.learning_rate) * (g + self.lmbd * p)
                updates[velocity] = velocity2
                updates[p] = p + velocity2
            else:
                updates[p] = p * np.float32(1.0 - self.learning_rate * self.lmbd) - np.float32(self.learning_rate) * g

    # ---- row-sampled parameters ------------------------------------------
    for i, g in enumerate(sgrads):
        fullP = full_params[i]
        sample_idx = sidxs[i]
        sparam = sampled_params[i]

        if self.adapt == 'adagrad':
            g = self.adagrad(fullP, g, updates, sample_idx)
        elif self.adapt == 'rmsprop':
            g = self.rmsprop(fullP, g, updates, sample_idx)
        elif self.adapt == 'adadelta':
            g = self.adadelta(fullP, g, updates, sample_idx)
        elif self.adapt == 'adam':
            g = self.adam(fullP, g, updates, sample_idx)

        if self.lmbd > 0:
            delta = np.float32(self.learning_rate) * (g + self.lmbd * sparam)
        else:
            delta = np.float32(self.learning_rate) * g

        if self.momentum > 0:
            velocity = theano.shared(fullP.get_value(borrow=False) * 0., borrow=True)
            vs = velocity[sample_idx]
            velocity2 = self.momentum * vs - delta
            updates[velocity] = T.set_subtensor(vs, velocity2)
            updates[fullP] = T.inc_subtensor(sparam, velocity2)
        else:
            updates[fullP] = T.inc_subtensor(sparam, - delta)

    return updates
def _pyramid_step(self, x_h, x_zr, x_m, t, h_tm1):
    '''
    One gated recurrent step applied to the whole state buffer.

    x_h, x_zr: input projections at time t
               shape=[batch, hid] or [hid]
    x_m: mask of x_t
               shape=[batch] or [1]
    h_tm1: previous state
               shape=[batch, t+1 or n_steps, hid] or [t+1 or n_steps, hid]

    The input projections are added only to the first ``t+1`` positions
    of the step axis.  Masked-out entries keep their previous state.
    Raises NotImplementedError for any other combination of ranks.
    '''
    unbatched = x_h.ndim == 1 and h_tm1.ndim == 2
    batched = x_h.ndim == 2 and h_tm1.ndim == 3

    if self.with_begin_tag:
        # Overwrite position t of the step axis with the begin tag.
        if unbatched:
            h_tm1 = T.set_subtensor(h_tm1[t, :], self.struct_begin_tag)
        elif batched:
            h_tm1 = T.set_subtensor(h_tm1[:, t, :], self.struct_begin_tag[None, :])
        else:
            raise NotImplementedError

    gate_pre = T.dot(h_tm1, self.W_hzr)
    cand_pre = T.dot(h_tm1, self.W_hh)

    if unbatched:
        mask = x_m[:, None]
        gate_pre = T.inc_subtensor(gate_pre[:t+1], x_zr[None, :])
    elif batched:
        mask = x_m[:, None, None]
        gate_pre = T.inc_subtensor(gate_pre[:, :t+1], x_zr[:, None, :])
    else:
        raise NotImplementedError

    gates = T.nnet.sigmoid(gate_pre)
    z_t = _slice(gates, 0, self.n_hids)
    r_t = _slice(gates, 1, self.n_hids)

    cand_pre = cand_pre * r_t
    # The rank combination was validated above, so only two cases remain.
    if unbatched:
        cand_pre = T.inc_subtensor(cand_pre[:t+1], x_h[None, :])
    else:
        cand_pre = T.inc_subtensor(cand_pre[:, :t+1], x_h[:, None, :])
    candidate = T.tanh(cand_pre)

    h_t = z_t * h_tm1 + (1 - z_t) * candidate
    # Where the mask is 0 the previous state is carried over unchanged.
    return mask * h_t + (1. - mask) * h_tm1
def __init__(self, n_from, n_to, de, seed=1692, init_params=None):
    """
    Build the two embedding tables and compile the Theano functions.

    n_from :: number of from embeddings in the vocabulary
    n_to :: number of to embeddings in the vocabulary
    de :: dimension of the word embeddings
    seed :: seed passed to ``np.random.seed``
    init_params :: when not None, the tables are loaded from
        'data/case_embeddings.pkl'; the value itself is not otherwise used

    Compiles ``self.calculate_loss(idxs, y)``, ``self.classify(idxs)`` and
    ``self.train(idxs, y)``.  The prediction is the cosine similarity of
    the selected row pairs and the loss is the mean squared error.
    """
    float_type = theano.config.floatX
    np.random.seed(seed)

    # parameters of the model
    if init_params is None:
        # Win is drawn before Wout; keep the order for reproducibility.
        self.Win = theano.shared(
            0.2 * np.random.uniform(-1.0, 1.0, (n_from, de)).astype(float_type))
        self.Wout = theano.shared(
            0.2 * np.random.uniform(-1.0, 1.0, (n_to, de)).astype(float_type))
    else:
        # NOTE(review): unpickling executes arbitrary code; only load
        # files from a trusted source.
        with open('data/case_embeddings.pkl', 'rb') as f:
            temp = cPickle.load(f)
        self.Win = theano.shared(temp.Win.get_value().astype(float_type))
        self.Wout = theano.shared(temp.Wout.get_value().astype(float_type))

    # adagrad accumulators, initialised to 0.1
    self.cumulative_gradients_in = theano.shared(
        0.1 * np.ones((n_from, de)).astype(float_type))
    self.cumulative_gradients_out = theano.shared(
        0.1 * np.ones((n_to, de)).astype(float_type))

    idxs = TT.imatrix()
    from_ids = idxs[:, 0]
    to_ids = idxs[:, 1]

    x_in = self.Win[from_ids, :]
    x_out = self.Wout[to_ids, :]
    norms_in = TT.sqrt(TT.sum(x_in ** 2, axis=1))
    norms_out = TT.sqrt(TT.sum(x_out ** 2, axis=1))
    norms = norms_in * norms_out

    y = TT.vector('y')  # label
    y_predictions = TT.sum(x_in * x_out, axis=1) / norms

    # cost and gradients and learning rate
    loss = TT.mean(TT.sqr(y_predictions - y))
    grad_in, grad_out = TT.grad(loss, [x_in, x_out])

    acc_in = self.cumulative_gradients_in
    acc_out = self.cumulative_gradients_out
    updates = [
        (acc_in, TT.inc_subtensor(acc_in[from_ids], grad_in ** 2)),
        (acc_out, TT.inc_subtensor(acc_out[to_ids], grad_out ** 2)),
        (self.Win, TT.inc_subtensor(
            self.Win[from_ids], - (0.5 / TT.sqrt(acc_in[from_ids])) * grad_in)),
        (self.Wout, TT.inc_subtensor(
            self.Wout[to_ids], - (0.5 / TT.sqrt(acc_out[to_ids])) * grad_out)),
    ]

    # theano functions
    self.calculate_loss = theano.function(inputs=[idxs, y], outputs=loss)
    self.classify = theano.function(inputs=[idxs], outputs=y_predictions)
    self.train = theano.function(
        inputs=[idxs, y],
        outputs=loss,
        updates=updates,
        name='training_fn'
    )
def indexed_train_func(self, arc, learning_rate, prealloc_x, batch_size, apply_x=identity):
    ''' Train function with indexed restriction.

    Compiles a function ``f(idx, layer_idx)`` that performs one gradient
    step on mini-batch ``idx`` of ``prealloc_x`` for layer ``arc``.  Only
    the ``layer_idx`` slice of ``W`` and ``b`` is updated; ``b_prime`` is
    updated in full.  ``apply_x`` is applied to the model input before
    the cost is evaluated.

    Fix: the ``b_prime`` update used to *replace* the parameter with
    ``-learning_rate * grad``.  An update pair gives the new value of the
    shared variable, so the step is now subtracted from the current value.
    '''
    nnlayer = self.layers[arc]
    applied_cost = theano.clone(self.cost, replace={self._x: apply_x(self._x)})

    grad_W = T.grad(applied_cost, nnlayer.W)
    grad_b = T.grad(applied_cost, nnlayer.b)
    grad_b_prime = T.grad(applied_cost, nnlayer.b_prime)

    updates = [
        # NOTE(review): the transpose only matches the shape of
        # W[:, idx] when idx selects a single column -- confirm.
        (nnlayer.W, T.inc_subtensor(nnlayer.W[:, nnlayer.idx],
                                    - learning_rate * grad_W[:, nnlayer.idx].T)),
        (nnlayer.b, T.inc_subtensor(nnlayer.b[nnlayer.idx],
                                    - learning_rate * grad_b[nnlayer.idx])),
        (nnlayer.b_prime, nnlayer.b_prime - learning_rate * grad_b_prime),
    ]

    idx = T.iscalar('idx')
    givens = {
        self._x: prealloc_x[idx * batch_size:(idx+1) * batch_size]
    }
    return theano.function([idx, nnlayer.idx], None, updates=updates, givens=givens)
def add_synap_pre_inp(i, p, po, s, q):
    """Scan step: bump the trace and the weights of row ``i``.

    Arguments (from the original notes): ``i`` sequence index, ``p`` pre |
    post trace, ``s`` dApre | dApost increment, ``q`` weight matrix ``W``.

    ``s`` is added to ``p[i, .]`` at the positions where the first
    ``self.Ne`` entries of ``q[i, :]`` are non-zero; ``po[i, :]`` is added
    to ``q[i, :]`` and the result is clipped to ``[0, self.wmax]``.

    Returns a dict mapping ``p`` and ``q`` to their new values.
    ``self`` comes from the enclosing scope.
    """
    connected = T.nonzero(q[i, :self.Ne])
    new_trace = T.inc_subtensor(p[i, connected], s)
    new_weights = T.clip(T.inc_subtensor(q[i, :], po[i, :]), 0, self.wmax)
    return {p: new_trace, q: new_weights}
def test_context_manager():
    """Unify two ``inc_subtensor`` graphs with logic variables declared via ``variables``.

    The pattern graph uses ``1234``, ``xp`` and ``yp`` as placeholders;
    unifying it with the concrete graph must bind them to ``3``, ``x``
    and ``y``.
    """
    x = tensor.vector()
    y = tensor.vector()
    concrete = tensor.inc_subtensor(x[1:3], y)

    xp = tensor.vector()
    yp = tensor.vector()
    pattern = tensor.inc_subtensor(xp[1:1234], yp)

    placeholders = (1234, xp, yp)
    with variables(*placeholders):
        match, = run(0, placeholders, (eq, concrete, pattern))

    assert match == (3, x, y)
def compute_numerical_gradient(v, i, X, Y):
    """Central-difference estimate of the loss gradient w.r.t. ``v[i]``.

    ``v`` is the flat parameter vector; it is perturbed by ``+/- self.eps``
    at position ``i``, unrolled into weights and biases with
    ``nu.t_reroll`` and fed to ``nnet.compute_loss``.  ``self``, ``nu``
    and ``nnet`` come from the enclosing scope.

    Returns ``(E(v[i] + eps) - E(v[i] - eps)) / (2 * eps)``.
    """
    def loss_with_offset(offset):
        # Perturb the input, roll it back into the weight matrices and
        # bias vectors, then evaluate the loss.
        shifted = T.inc_subtensor(v[i], offset)
        weights, biases = nu.t_reroll(shifted, nnet.num_nodes)
        return weights, biases

    wts_plus, bs_plus = loss_with_offset(self.eps)
    wts_minus, bs_minus = loss_with_offset(-1.0*self.eps)

    loss_plus = nnet.compute_loss(X, Y, wts_plus, bs_plus)
    loss_minus = nnet.compute_loss(X, Y, wts_minus, bs_minus)

    return 1.0*(loss_plus-loss_minus)/(2*self.eps)
def step(i_, j_, Rij_, _U, _V):
    """Scan step: one gradient update for a single rating ``R[i, j]``.

    The first element of ``i_``, ``j_`` and ``Rij_`` is used.  Column
    ``i`` of ``_U`` is moved along the residual times column ``j`` of
    ``_V``; the residual is then recomputed with the updated ``_U`` and
    column ``j`` of ``_V`` is moved accordingly.

    Returns a dict mapping ``_U`` and ``_V`` to their new values.
    """
    # Attach test values to the three sequence inputs.
    for symbolic_input in (i_, j_, Rij_):
        cftools.test_value(symbolic_input, np.array([0.5]))

    i, j, Rij = i_[0], j_[0], Rij_[0]
    item_col = _V[:, j]

    residual = Rij - T.dot(_U[:, i].T, item_col)
    new_U = T.inc_subtensor(_U[:, i], config.lr * residual * item_col)

    # The item update uses the already-updated user column.
    user_col = new_U[:, i]
    residual = Rij - T.dot(user_col.T, item_col)
    new_V = T.inc_subtensor(_V[:, j], config.lr * residual * user_col)

    return {_U: new_U, _V: new_V}
def call(self, inputs, mask=None):
    """Add the normalised speech vectors to the memory rows of their speakers.

    ``inputs`` must be a list of at least two tensors: ``inputs[0]`` holds
    the speaker index per sample (noted as ``(batch, 1)`` in the original)
    and ``inputs[1]`` the speech vectors (noted as ``(batch, embed_dim)``).

    Each vector is divided by its L2 norm, added to
    ``self.life_long_mem`` at the speaker's row, and every memory row is
    then L2-normalised again.  Exact zeros are replaced by
    ``np.spacing(1)`` when the norms are computed to avoid dividing by
    zero.  Returns the normalised memory expression; ``mask`` is unused.

    Raises TypeError when ``inputs`` is not a list with 2+ elements.
    """
    if not isinstance(inputs, list) or len(inputs) <= 1:
        raise TypeError('SpkLifeLongMemory must be called on a list of tensors '
                        '(at least 2). Got: ' + str(inputs))

    def unit_rows(matrix):
        # Row-wise L2 normalisation with zeros nudged away from 0.
        safe = K.switch(K.equal(matrix, 0.), np.spacing(1), matrix)
        row_norm = K.sqrt(K.sum(safe**2, axis=1))
        row_norm = row_norm.dimshuffle((0, 'x'))
        return T.true_div(
            matrix, K.repeat_elements(row_norm, self.vec_dim, axis=1))

    # Speaker indices, flattened to a vector and forced to int32.
    speaker_ids = inputs[0]
    speaker_ids = K.reshape(speaker_ids, (speaker_ids.shape[0], ))
    if K.dtype(speaker_ids) != 'int32':
        speaker_ids = K.cast(speaker_ids, 'int32')

    # Normalise the learned speech vectors.
    speech_vectors = unit_rows(inputs[1])

    # Store them in the memory according to the speaker identity.
    memory = T.inc_subtensor(
        self.life_long_mem[speaker_ids, :], speech_vectors)

    # Normalise the memory rows.
    return unit_rows(memory)
def __init__(self, n_out, x_out=None, delay=0, sparse=False, name="", network=None, eval_flag=False,
             data_key=None,  # if we don't want to use "data" but something else. via y_in
             # These will be given if we initialize via JSON.
             sources=None, dropout=0, train_flag=None, mask=None, index=None, y_in=None, dtype=None):
    """Create the source layer and pick its output tensor.

    The output is, in order of precedence: ``network.y[data_key]`` when
    ``data_key`` is given (``x_out`` must then be None, and ``network``
    and ``dtype`` are required), the explicit ``x_out``, or
    ``network.x``.  With a non-zero ``delay`` the output is shifted by
    ``delay`` steps along the first axis and zero-padded at the start.

    ``sources`` must be empty and ``dropout`` must be 0.  ``train_flag``,
    ``mask`` and ``y_in`` are accepted but not used here.
    """
    super(SourceLayer, self).__init__(layer_class=self.layer_class, name=name)

    if data_key is not None:
        assert x_out is None
        assert network
        assert dtype
        network.use_target(target=data_key, dtype=dtype)
        x_out = network.y[data_key]
    if x_out is None:
        assert network is not None
        x_out = network.x

    assert not sources, 'specify `"from": "null"` in json'  # or just ignore?
    assert dropout == 0

    # Attach test values where none are set yet.
    if getattr(x_out.tag, "test_value", None) is None and not sparse:
        x_out.tag.test_value = numpy.random.rand(3, 2, n_out).astype('float32')
    if index and getattr(index.tag, "test_value", None) is None:
        index.tag.test_value = numpy.ones((3, 2), dtype='int8')

    if delay:
        # Shift by `delay` steps; the first `delay` steps stay zero.
        self.output = T.inc_subtensor(T.zeros_like(x_out)[delay:], x_out[:-delay])
    else:
        self.output = x_out

    for attr_name, attr_value in (('n_out', n_out), ('sparse', sparse), ('delay', delay)):
        self.set_attr(attr_name, attr_value)

    self.index = index
    self.device = 'cpu'
    self.eval_flag = eval_flag
def step_fn(current_input_to_state, prev_c, prev_h):
    """One LSTM step whose state-to-state map is a width-2 1-D convolution.

    All arguments have shape (batch size, output_dim, height) according to
    the original note.  ``batch_size``, ``output_dim``, ``height``,
    ``name`` and ``lib`` come from the enclosing scope.

    Returns ``(new_c, new_h)``.
    """
    # Zero-pad the previous hidden state with one leading position along
    # the last axis.
    # TODO consider learning this padding
    padded_h = T.zeros(
        (batch_size, output_dim, 1+height), dtype=theano.config.floatX)
    padded_h = T.inc_subtensor(padded_h[:, :, 1:], prev_h)

    state_to_state = lib.ops.conv1d.Conv1D(
        name+'.StateToState',
        output_dim,
        4*output_dim,
        2,
        padded_h,
        biases=False
    )
    gates = current_input_to_state + state_to_state

    # First three channel blocks are sigmoid gates (o, f, i); the fourth
    # is the tanh candidate.
    sigmoid_gates = T.nnet.sigmoid(gates[:, :3*output_dim, :])
    out_gate = sigmoid_gates[:, 0*output_dim:1*output_dim, :]
    forget_gate = sigmoid_gates[:, 1*output_dim:2*output_dim, :]
    in_gate = sigmoid_gates[:, 2*output_dim:3*output_dim, :]
    candidate = T.tanh(gates[:, 3*output_dim:4*output_dim, :])

    new_c = (forget_gate * prev_c) + (in_gate * candidate)
    new_h = out_gate * T.tanh(new_c)
    return (new_c, new_h)
def fprop_step_mask(self, state_below, mask, state_before, U):
    """
    Scan function for case using masks

    Parameters
    ----------
    state_below : TheanoTensor
        Input pre-activations; columns ``[:dim]`` feed the candidate,
        ``[dim:2*dim]`` the reset gate and ``[2*dim:]`` the update gate.
    mask : TheanoTensor
        Per-row mask; rows with mask 0 keep ``state_before``.
    state_before : TheanoTensor
        Previous hidden state.
    U : TheanoTensor
        Recurrent weights with the same column layout as ``state_below``.
    """
    dim = self.dim

    # Add the recurrent contribution to both gate blocks at once.
    pre_act = tensor.inc_subtensor(
        state_below[:, dim:],
        tensor.dot(state_before, U[:, dim:])
    )
    reset = tensor.nnet.sigmoid(pre_act[:, dim:2*dim])
    update = tensor.nnet.sigmoid(pre_act[:, 2*dim:])

    candidate = tensor.tanh(
        pre_act[:, :dim] +
        tensor.dot(reset * state_before, U[:, :dim])
    )
    new_state = update * state_before + (1. - update) * candidate

    # Masked rows carry the previous state over unchanged.
    return mask[:, None] * new_state + (1 - mask[:, None]) * state_before
def test_simple_3d(self):
    """Increments or sets part of a tensor by a scalar using full slice and
    a partial slice depending on a scalar.
    """
    a = tt.dtensor3()
    increment = tt.dscalar()
    sl1 = slice(None)
    sl2_end = tt.lscalar()
    sl2 = slice(sl2_end)
    sl3 = 2

    for do_set in [True, False]:
        # %-formatting prints the same text ("Set True" / "Set False") on
        # Python 2 and 3; the old print statement is a SyntaxError on 3.
        print("Set %s" % do_set)

        if do_set:
            result_var = tt.set_subtensor(a[sl1, sl3, sl2], increment)
        else:
            result_var = tt.inc_subtensor(a[sl1, sl3, sl2], increment)

        f = theano.function([a, increment, sl2_end], result_var)

        val_a = numpy.ones((5, 3, 4))
        val_inc = 2.3
        val_sl2_end = 2

        # Build the reference with plain NumPy indexing.
        expected_result = numpy.copy(val_a)
        result = f(val_a, val_inc, val_sl2_end)

        if do_set:
            expected_result[:, sl3, :val_sl2_end] = val_inc
        else:
            expected_result[:, sl3, :val_sl2_end] += val_inc

        self.assertTrue(numpy.array_equal(result, expected_result))
def fprop(self, XH):
    """Compute the next hidden state of the gated recurrent layer.

    ``XH`` is a pair ``[state_belows, state_befores]``.  According to the
    original notes each state vector is ``[state_before; cell_before]``,
    which is why only the first ``recout`` columns of every recurrent
    input enter the recurrent term.  The self-recurrence is ``H[0]``.

    Raises AttributeError when the number of inputs does not match the
    number of parents or recurrent connections.
    """
    X, H = XH
    if len(X) != len(self.parent):
        raise AttributeError("The number of inputs doesn't match "
                             "with the number of parents.")
    if len(H) != len(self.recurrent):
        raise AttributeError("The number of inputs doesn't match "
                             "with the number of recurrents.")

    nout = self.nout
    # The index of self recurrence is 0
    z_tm1 = H[0]

    # Pre-activations: columns [0, nout) candidate, [nout, 2*nout) reset
    # gate, [2*nout, 3*nout) update gate.
    pre_act = T.zeros((X[0].shape[0], 3 * nout))
    for x, (parname, parout) in izip(X, self.parent.items()):
        W = self.params['W_' + parname + '__' + self.name]
        pre_act += T.dot(x[:, :parout], W)

    # Recurrent inputs contribute to the two gate blocks here; their
    # candidate contribution is added below, after the reset gate.
    for h, (recname, recout) in izip(H, self.recurrent.items()):
        U = self.params['U_' + recname + '__' + self.name]
        pre_act = T.inc_subtensor(
            pre_act[:, nout:],
            T.dot(h[:, :recout], U[:, nout:])
        )
    pre_act += self.params['b_' + self.name]

    # Compute activations of gating units
    r_on = T.nnet.sigmoid(pre_act[:, nout:2 * nout])
    u_on = T.nnet.sigmoid(pre_act[:, 2 * nout:])

    # Update hidden & cell states
    recurrent_term = T.zeros_like(z_tm1)
    for h, (recname, recout) in izip(H, self.recurrent.items()):
        U = self.params['U_' + recname + '__' + self.name]
        recurrent_term += T.dot(h[:, :recout], U[:, :nout])

    candidate = T.tanh(pre_act[:, :nout] + r_on * recurrent_term)
    z_t = u_on * z_tm1 + (1. - u_on) * candidate
    z_t.name = self.name
    return z_t
def test_incsub_f16():
    """Check float16 ``inc_subtensor`` on the GPU, advanced and basic indexing.

    For both forms the compiled graph must contain exactly one node of the
    expected GPU op and the result must match the NumPy reference.
    """
    def count_ops(fn, op_type):
        # Number of nodes in the compiled graph whose op is `op_type`.
        return sum([isinstance(node.op, op_type)
                    for node in fn.maker.fgraph.toposort()])

    shp = (3, 3)
    shared = gpuarray_shared_constructor
    xval = np.arange(np.prod(shp), dtype='float16').reshape(shp) + 1
    yval = np.empty((2,) + shp[1:], dtype='float16')
    yval[:] = 2
    x = shared(xval, name='x')
    y = tensor.tensor(dtype='float16',
                      broadcastable=(False,) * len(shp),
                      name='y')

    # Advanced indexing with an explicit row list.
    f = theano.function(
        [y], tensor.advanced_inc_subtensor1(x, y, [0, 2]), mode=mode_with_gpu)
    assert count_ops(f, GpuAdvancedIncSubtensor1) == 1
    rval = f(yval)
    rep = xval.copy()
    np.add.at(rep, [[0, 2]], yval)
    assert np.allclose(rval, rep)

    # Basic slicing.
    f = theano.function(
        [y], tensor.inc_subtensor(x[1:], y), mode=mode_with_gpu)
    assert count_ops(f, GpuIncSubtensor) == 1
    rval = f(yval)
    rep = xval.copy()
    rep[1:] += yval
    assert np.allclose(rval, rep)
def create_adam_updates(updates, params, gparams, gsums, xsums, lr, eps, beta1, beta2):
    """Fill ``updates`` in place with Adam updates for every parameter.

    ``gsums`` and ``xsums`` hold the first- and second-moment accumulators
    matching ``params``.  A parameter that is itself a subtensor of a
    shared variable (as detected by ``is_subtensor_op``) only updates the
    matching slice of its origin and of its accumulators.

    A shared step counter is created, incremented once per call of the
    compiled function and used for the bias-corrected step size
    ``lr * sqrt(1 - beta2**t) / (1 - beta1**t)``.  Nothing is returned.
    """
    step = theano.shared(np.float64(0.0).astype(theano.config.floatX))
    next_step = step + 1.0
    first_correction = 1.0 - beta1**next_step
    second_correction = 1.0 - beta2**next_step
    step_size = lr * (T.sqrt(second_correction) / first_correction)

    for p, g, m, v in zip(params, gparams, gsums, xsums):
        sliced = is_subtensor_op(p)
        if sliced:
            origin, indexes = get_subtensor_op_inputs(p)
            m_prev = m[indexes]
            v_prev = v[indexes]
        else:
            m_prev = m
            v_prev = v

        m_t = beta1*m_prev + (1.0-beta1)*g
        v_t = beta2*v_prev + (1.0-beta2)*T.sqr(g)
        g_t = m_t / (T.sqrt(v_t) + eps)

        if sliced:
            # Write back only the rows that were touched.
            updates[m] = T.set_subtensor(m_prev, m_t)
            updates[v] = T.set_subtensor(v_prev, v_t)
            updates[origin] = T.inc_subtensor(p, -step_size*g_t)
        else:
            updates[m] = m_t
            updates[v] = v_t
            updates[p] = p - step_size*g_t

    updates[step] = next_step
def forward(self, x):
    """Map ``x`` to its first entry followed by the logs of successive differences.

    Along the last axis the result holds ``x[..., 0]`` at position 0 and
    ``log(x[..., k] - x[..., k-1])`` at every position ``k >= 1``.
    """
    first = x[..., 0]
    log_gaps = tt.log(x[..., 1:] - x[..., :-1])

    transformed = tt.zeros(x.shape)
    transformed = tt.inc_subtensor(transformed[..., 0], first)
    transformed = tt.inc_subtensor(transformed[..., 1:], log_gaps)
    return transformed
def test_jax_basic():
    """Compare the JAX and Python backends on a set of basic graphs.

    Covers an elementwise expression combined with ``set_subtensor`` /
    ``inc_subtensor`` / slicing, ``clip``, ``diagonal``, Cholesky (lower
    and upper), ``solve``, ``alloc_diag``, ``det`` and ``matrix_inverse``.
    """
    float_type = tt.config.floatX

    def check(inputs, output, values):
        # Wrap one output in a FunctionGraph and compare both backends.
        return compare_jax_and_py(
            theano.gof.FunctionGraph(inputs, [output]), values)

    def ramp_matrix():
        return np.arange(10 * 10).reshape((10, 10)).astype(float_type)

    def near_identity():
        return (np.eye(10) + np.random.randn(10, 10) * 0.01).astype(float_type)

    x = tt.matrix("x")
    y = tt.matrix("y")
    b = tt.vector("b")

    # `ScalarOp`
    z = tt.cosh(x**2 + y / 3.0)

    # `[Inc]Subtensor`
    out = tt.set_subtensor(z[0], -10.0)
    out = tt.inc_subtensor(out[0, 1], 2.0)
    out = out[:5, :3]

    test_input_vals = [
        np.tile(np.arange(10), (10, 1)).astype(float_type),
        np.tile(np.arange(10, 20), (10, 1)).astype(float_type),
    ]
    (jax_res, ) = check([x, y], out, test_input_vals)

    # Confirm that the `Subtensor` slice operations are correct
    assert jax_res.shape == (5, 3)

    # Confirm that the `IncSubtensor` operations are correct
    assert jax_res[0, 0] == -10.0
    assert jax_res[0, 1] == -8.0

    check([x, y], tt.clip(x, y, 5), test_input_vals)

    check([x], tt.diagonal(x, 0), [ramp_matrix()])

    check([x], tt.slinalg.cholesky(x), [near_identity()])

    # not sure why this isn't working yet with lower=False
    check([x], tt.slinalg.Cholesky(lower=False)(x), [near_identity()])

    check(
        [x, b],
        tt.slinalg.solve(x, b),
        [
            np.eye(10).astype(float_type),
            np.arange(10).astype(float_type),
        ],
    )

    check([b], tt.nlinalg.alloc_diag(b), [np.arange(10).astype(float_type)])

    check([x], tt.nlinalg.det(x), [ramp_matrix()])

    check([x], tt.nlinalg.matrix_inverse(x), [near_identity()])
def get_pseudograd(loss, params, srng=None, eps_sigma=1.0, grad_prior=1.0, r = 1.0e-1):
    """Sketch of a perturbation-based ("pseudo") gradient estimator.

    For every parameter, one coordinate ``i`` of the flattened parameter is
    perturbed by ``+/- r`` and the resulting change in ``loss`` is
    accumulated into two shared vectors, ``eta`` and ``lam_diag``, whose
    ratio is used as the gradient estimate.

    NOTE(review): this function looks unfinished.  It references names
    that are not defined in the visible code (``rho``, ``learning_rate``,
    ``input``, ``output``), both nested functions build an update dict but
    return nothing, and the function itself has no return statement.  The
    nesting below was reconstructed from a whitespace-mangled source and
    should be checked against the original.
    """
    srng = get_srng(srng)
    # Inverse of the assumed noise scale.
    eps = 1.0 / eps_sigma

    # NOTE(review): never called in the visible code; builds `upd` and `n`
    # but does not return them.
    def step(i, param, eta, lam_diag, dx):
        upd = OrderedDict()
        upd[dx] = dx
        n = param.get_value(borrow=True).shape[0]

    one = T.constant(1.0)
    zero = T.constant(0.0)
    dx = T.fvector()
    pgrads = []

    for param in params:
        value = param.get_value(borrow=True)
        shape=value.shape
        # Number of scalar entries of this parameter.
        n = np.prod(shape)
        # Index of the coordinate to perturb.
        i = T.iscalar()
        zeros = T.zeros(shape=n, dtype=param.dtype)
        # Random sign times the perturbation radius r.
        delta = (2 * srng.binomial() - 1) * r
        # One-hot perturbation, reshaped back to the parameter's shape.
        inc = T.set_subtensor(zeros[i], delta).reshape(shape)
        # Loss evaluated at the perturbed parameter.
        new_loss = theano.clone( loss, replace={param: param + inc} )
        dloss = new_loss - loss
        # Accumulators: eta starts at zero, lam_diag at grad_prior.
        eta = theano.shared(np.zeros(shape=n, dtype='float32'))
        lam_diag = theano.shared(np.ones(n, dtype='float32') * grad_prior)

        # NOTE(review): returns None, so the scan below receives no
        # outputs or updates from it as written.
        def u(i):
            upd = OrderedDict()
            upd[eta] = T.inc_subtensor(eta[i], dloss * eps * delta)
            upd[lam_diag] = T.inc_subtensor(lam_diag[i], eps * (r ** 2))

        _, upd = theano.scan( u, sequences=T.arange(n) )
        dloss = new_loss - loss
        # NOTE(review): `rho` is not defined in the visible code.
        upd[eta] = rho * T.inc_subtensor(eta[i], dloss * eps * T.sum(dx)) + (one - rho) * T.zeros(n)
        upd[lam_diag] = rho * T.inc_subtensor(lam_diag[i], eps * T.sum(dx) ** 2) + (one - rho) * T.ones(n)
        # Gradient estimate: accumulated signal over accumulated precision.
        pgrad = eta / lam_diag
        # NOTE(review): `learning_rate`, `input` and `output` are not
        # defined in the visible code.
        upd[param] = param - learning_rate * pgrad
        t = theano.function([dx, i] + input, output, updates=upd)
        dx_ = np.zeros(n, dtype='float32')
def test_incsubtensor1(self):
    """Run `check_rop_lop` on `self.x` with its first three entries incremented.

    The increment is a shared variable holding three uniform samples drawn
    from `self.rng`.
    """
    increment_values = numpy.asarray(
        self.rng.uniform(size=(3, )), theano.config.floatX)
    increment = theano.shared(increment_values)
    leading_slice = self.x[:3]
    incremented = tensor.inc_subtensor(leading_slice, increment)
    self.check_rop_lop(incremented, self.in_shape)
import numpy as np
import theano
import theano.tensor as T

# Small demo: increment rows of a shared (10, 1) array of ones by the square
# of their current value, using an index list that names row 1 twice.
# NOTE(review): presumably this exists to show how `inc_subtensor` treats the
# repeated index -- confirm against the printed output.
fX = theano.config.floatX
s = theano.shared(np.ones((10, 1), dtype=fX))
idxs = [0, 1, 1]

# The compiled function has no inputs or outputs; calling it only applies the
# update to `s`.
fn = theano.function([], updates=[(s, T.inc_subtensor(s[idxs], s[idxs]**2))])
fn()

# Use the function-call form of print so the script also runs on Python 3.
print(s.get_value())
# Evaluate `z` with a 2x2 diagonal matrix for `a` and the scalar 3 for `b`.
# (`z`, `a` and `b` are bound earlier in the file, outside this fragment.)
print(z.eval({a: np.diag((3, 3)).astype(theano.config.floatX), b : 3}))

# Elementwise selection: take from `x` where `cond` is non-zero, else from `y`.
cond = T.vector('cond')
x, y = T.vectors('x', 'y')
z = T.switch(cond, x, y)
print(z.eval({ cond: [1, 0], x: [10, 10], y: [3, 2]}))

# Maximum over the whole matrix, then along axis 0 and along axis 1.
a = T.matrix('a')
print(T.max(a).eval({a: [[1, 2], [3, 4]]}))
print(T.max(a, axis=0).eval({a: [[1, 2], [3, 4]]}))
print(T.max(a, axis=1).eval({a: [[1, 2], [3, 4]]}))

# Joining a (5, 2) range with its row-reversed copy: concatenate along the
# default axis, concatenate along axis 1, and stack.
a = T.arange(10).reshape((5, 2))
b = a[::-1]
print(b.eval())
print(T.concatenate([a, b]).eval())
print(T.concatenate([a, b], axis=1).eval())
print(T.stack([a, b]).eval())

# set_subtensor overwrites rows 3: with [-1, -1]; inc_subtensor adds [-1, -1]
# to them. Both build a new expression from `a`.
a = T.arange(10).reshape((5, 2))
print(T.set_subtensor(a[3:], [-1, -1]).eval())
print(T.inc_subtensor(a[3:], [-1, -1]).eval())
def RMSprop(self, cost, params, full_params, sampled_params, sidxs, epsilon=1e-6):
    """Build the update dictionary for dense and sampled parameters.

    Despite its name, the adaptive rule applied to each gradient is chosen
    by `self.adapt` ('adagrad', 'rmsprop', 'adadelta' or 'adam'); any other
    value leaves the gradient unchanged.

    Parameters
    ----------
    cost : expression that is differentiated.
    params : iterable of parameter lists; each parameter is updated in full.
    full_params : full shared variables behind `sampled_params`, same order.
    sampled_params : sub-tensors of `full_params` that `cost` depends on.
    sidxs : index expressions that select each sampled sub-tensor (used to
        slice the momentum buffer and passed to the adaptive rule).
    epsilon : accepted for interface compatibility; not used in this method.

    Returns
    -------
    OrderedDict mapping shared variables to their new symbolic values.
    """
    updates = OrderedDict()
    lr = np.float32(self.learning_rate)

    def capped(gradient, norm):
        # Rescale to the cap when the global norm reaches it.
        return T.switch(T.ge(norm, self.grad_cap),
                        gradient * self.grad_cap / norm, gradient)

    def adapt(target, gradient, *extra):
        # Route the gradient through the scheme named by self.adapt; an
        # unrecognised name returns the gradient untouched.
        scheme = self.adapt
        if scheme == 'adagrad':
            return self.adagrad(target, gradient, updates, *extra)
        if scheme == 'rmsprop':
            return self.rmsprop(target, gradient, updates, *extra)
        if scheme == 'adadelta':
            return self.adadelta(target, gradient, updates, *extra)
        if scheme == 'adam':
            return self.adam(target, gradient, updates, *extra)
        return gradient

    dense_grads = [T.grad(cost=cost, wrt=group) for group in params]
    sampled_grads = [T.grad(cost=cost, wrt=sub) for sub in sampled_params]

    if self.grad_cap > 0:
        # One global norm over every dense and sampled gradient.
        dense_sq = T.sum([
            T.sum([T.sum(g**2) for g in group]) for group in dense_grads
        ])
        sampled_sq = T.sum([T.sum(g**2) for g in sampled_grads])
        norm = T.cast(T.sqrt(dense_sq + sampled_sq), theano.config.floatX)
        dense_grads = [[capped(g, norm) for g in group]
                       for group in dense_grads]
        sampled_grads = [capped(g, norm) for g in sampled_grads]

    # Dense parameters: every entry is updated.
    for group, group_grads in zip(params, dense_grads):
        for p, g in zip(group, group_grads):
            g = adapt(p, g)
            if self.momentum > 0:
                velocity = theano.shared(p.get_value(borrow=False) * 0.,
                                         borrow=True)
                new_velocity = (self.momentum * velocity
                                - lr * (g + self.lmbd * p))
                updates[velocity] = new_velocity
                updates[p] = p + new_velocity
            else:
                decay = np.float32(1.0 - self.learning_rate * self.lmbd)
                updates[p] = p * decay - lr * g

    # Sampled parameters: only the selected sub-tensor of the full variable
    # is touched.
    for pos, g in enumerate(sampled_grads):
        full = full_params[pos]
        where = sidxs[pos]
        sub = sampled_params[pos]
        g = adapt(full, g, where)
        if self.lmbd > 0:
            delta = lr * (g + self.lmbd * sub)
        else:
            delta = lr * g
        if self.momentum > 0:
            velocity = theano.shared(full.get_value(borrow=False) * 0.,
                                     borrow=True)
            sub_velocity = velocity[where]
            new_velocity = self.momentum * sub_velocity - delta
            updates[velocity] = T.set_subtensor(sub_velocity, new_velocity)
            updates[full] = T.inc_subtensor(sub, new_velocity)
        else:
            updates[full] = T.inc_subtensor(sub, -delta)

    return updates
def scan_step(index, prev_res, y_labeling, y_):
    """Add row `index` of `y_labeling` into `prev_res`, one entry per column.

    For every column `b` in `range(batch_size)` the entry
    `prev_res[y_[index, b], b]` is incremented by `y_labeling[index, b]`.
    `batch_size` is taken from the enclosing scope.
    """
    columns = T.arange(batch_size)
    rows = y_[index, columns]
    increments = y_labeling[index, columns]
    return T.inc_subtensor(prev_res[rows, columns], increments)
def backward(self, y):
    """Return the cumulative sum of `[y[0], exp(y[1]), exp(y[2]), ...]`.

    Every increment after the first is an exponential, so for a 1-D `y`
    the result is strictly increasing.
    """
    first = y[0]
    positive_steps = tt.exp(y[1:])
    increments = tt.zeros(y.shape)
    increments = tt.inc_subtensor(increments[0], first)
    increments = tt.inc_subtensor(increments[1:], positive_steps)
    return tt.cumsum(increments)
def _update_std(nnet, layer, dW, db, loss, idx=None):
    """
    Build the update pairs for `layer` using standard (i.e. non-compressed)
    feature vectors.

    `dW`, `db` and `loss` are scaled by `1 / nnet.data.n_train` (the first
    two also by `K = C * D / (C + D)`), compared with the values stored for
    this step (`layer.*_i[idx]` when `Cfg.store_on_gpu`, otherwise
    `layer.*_i_buffer`), and a step size clipped to [0, 1] is applied along
    the difference.

    Returns a tuple of `(shared variable, new value)` pairs covering the
    layer's weights, bias, running averages, counter `k`, `l`, the stored
    per-step values and `gamma`.
    """
    assert layer.isconv or layer.issvm

    on_gpu = Cfg.store_on_gpu
    if on_gpu:
        assert idx is not None

    K = (Cfg.C * Cfg.D) / (Cfg.C + Cfg.D)
    inv_n_train = T.cast(1. / nnet.data.n_train, 'floatX')

    scaled_W = dW * K * inv_n_train
    scaled_b = db * K * inv_n_train
    scaled_l = loss * inv_n_train

    # Previously stored values that the new ones are compared against.
    if on_gpu:
        old_W, old_b, old_l = layer.W_i[idx], layer.b_i[idx], layer.l_i[idx]
    else:
        old_W, old_b, old_l = (layer.W_i_buffer, layer.b_i_buffer,
                               layer.l_i_buffer)
    delta_W = scaled_W - old_W
    delta_b = scaled_b - old_b
    delta_l = scaled_l - old_l

    # Step size, clipped to [0, 1].
    numerator = (K * delta_l + T.sum(delta_W * layer.W)
                 + T.sum(delta_b * layer.b))
    denominator = Cfg.eps + T.sum(delta_W ** 2) + T.sum(delta_b ** 2)
    step_size = (numerator / denominator).clip(0, 1)

    new_W = layer.W - step_size * delta_W
    new_b = layer.b - step_size * delta_b
    new_l = layer.l + step_size * delta_l

    # New stored values and the shared variables that receive them.
    if on_gpu:
        stored_vars = (layer.W_i, layer.b_i, layer.l_i)
        stored_vals = (
            T.inc_subtensor(layer.W_i[idx], step_size * delta_W),
            T.inc_subtensor(layer.b_i[idx], step_size * delta_b),
            T.inc_subtensor(layer.l_i[idx], step_size * delta_l),
        )
    else:
        stored_vars = (layer.W_i_buffer, layer.b_i_buffer, layer.l_i_buffer)
        stored_vals = (
            layer.W_i_buffer + step_size * delta_W,
            layer.b_i_buffer + step_size * delta_b,
            layer.l_i_buffer + step_size * delta_l,
        )

    # Running averages weighted by k / (k + 2) and 2 / (k + 2).
    k = layer.k
    keep = T.cast((k * 1. / (k + 2)), 'floatX')
    mix = T.cast((2. / (k + 2)), 'floatX')
    avg_W = keep * layer.W_avg + mix * new_W
    avg_b = keep * layer.b_avg + mix * new_b

    return ((layer.W, new_W),
            (layer.b, new_b),
            (layer.W_avg, avg_W),
            (layer.b_avg, avg_b),
            (layer.k, k + 1),
            (layer.l, new_l),
            (stored_vars[0], stored_vals[0]),
            (stored_vars[1], stored_vals[1]),
            (stored_vars[2], stored_vals[2]),
            (layer.gamma, step_size))
def build_network(self, K, vocab_size, doc_var, query_var, docmask_var,
                  qmask_var, candmask_var, feat_var, W_init):
    """Build the document/query network and its two output expressions.

    The document and query share one embedding matrix. The query is encoded
    by a bidirectional GRU; then, `K - 1` times, the document and the query
    are each re-encoded by bidirectional GRUs, a per-document-token
    attention over query tokens is computed, and the document encoding is
    multiplied elementwise by the attended query representation. A final
    bidirectional GRU over the document (with the feature embedding
    concatenated) is scored against the query vector, masked by
    `candmask_var`, and the per-token probabilities are added into a
    `(batch, vocab_size)` array at the positions of the document's token
    ids.

    The shape comments (B, N, Q, DE, D) are the original author's notation.

    Returns
    -------
    final : output expression with dropout active.
    final_v : the same expression built with `deterministic=True`.
    l_doc : the final concatenated document layer.
    l_qs : list of the query layers created along the way.
    """
    l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
    l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
    l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
    l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)
    l_featin = L.InputLayer(shape=(None, None), input_var=feat_var)
    l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size,
                                  output_size=EMBED_DIM, W=W_init)  # B x N x 1 x DE
    l_doce = L.ReshapeLayer(
        l_docembed, (doc_var.shape[0], doc_var.shape[1], EMBED_DIM))  # B x N x DE
    # The query embedding reuses the document embedding matrix.
    l_qembed = L.EmbeddingLayer(l_qin, input_size=vocab_size,
                                output_size=EMBED_DIM, W=l_docembed.W)
    l_fembed = L.EmbeddingLayer(l_featin, input_size=2, output_size=2)  # B x N x 2

    # Drop the 'trainable' tag from the shared embedding matrix when
    # EMB_TRAIN is false.
    if not EMB_TRAIN:
        l_docembed.params[l_docembed.W].remove('trainable')

    # Query vector: slice index -1 of the forward GRU output joined with
    # slice index 0 of the backward GRU output, both along axis 1.
    l_fwd_q = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                         mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                         precompute_input=True)
    l_bkd_q = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                         mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                         precompute_input=True, backwards=True)
    l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1)
    l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1)
    l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice])  # B x 2D
    q = L.get_output(l_q)  # B x 2D

    l_qs = [l_q]
    for i in range(K - 1):
        # Bidirectional encoding of the current document representation.
        l_fwd_doc_1 = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True)
        l_bkd_doc_1 = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True, backwards=True)
        l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)  # B x N x DE

        # Fresh bidirectional encoding of the query for this iteration.
        l_fwd_q_1 = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                               mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_q_1 = L.GRULayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                               mask_input=l_qmask, gradient_steps=GRAD_STEPS,
                               precompute_input=True, backwards=True)
        l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1], axis=2)  # B x Q x DE
        l_qs.append(l_q_c_1)

        qd = L.get_output(l_q_c_1)  # B x Q x DE
        dd = L.get_output(l_doc_1)  # B x N x DE
        # Similarity of every document token with every query token.
        M = T.batched_dot(dd, qd.dimshuffle((0, 2, 1)))  # B x N x Q
        # Softmax over query tokens, then mask with qmask_var and
        # renormalise over the last axis.
        alphas = T.nnet.softmax(
            T.reshape(M, (M.shape[0] * M.shape[1], M.shape[2])))
        alphas_r = T.reshape(alphas, (M.shape[0],M.shape[1],M.shape[2]))* \
            qmask_var[:,np.newaxis,:]  # B x N x Q
        alphas_r = alphas_r / alphas_r.sum(axis=2)[:, :, np.newaxis]  # B x N x Q
        q_rep = T.batched_dot(alphas_r, qd)  # B x N x DE

        # Multiply the document encoding by the attended query
        # representation, then apply dropout.
        l_q_rep_in = L.InputLayer(shape=(None, None, 2 * NUM_HIDDEN),
                                  input_var=q_rep)
        l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)
        l_doce = L.dropout(l_doc_2_in, p=DROPOUT_RATE)  # B x N x DE

    # Append the feature embedding before the final document encoder.
    l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2
    l_fwd_doc = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                           mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True)
    l_bkd_doc = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                           mask_input=l_docmask, gradient_steps=GRAD_STEPS,
                           precompute_input=True, \
                           backwards=True)
    l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

    # Training-time output: score each document token against the query,
    # mask with candmask_var, renormalise, and add the probabilities into a
    # (batch, vocab_size) array at the document's token ids.
    d = L.get_output(l_doc)  # B x N x 2D
    p = T.batched_dot(d, q)  # B x N
    pm = T.nnet.softmax(p) * candmask_var
    pm = pm / pm.sum(axis=1)[:, np.newaxis]
    # Row index of every (batch, position) entry.
    index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
    final = T.inc_subtensor(T.alloc(0.,p.shape[0],vocab_size)[index,T.flatten(doc_var,outdim=2)],\
        pm)

    # Same computation with deterministic=True.
    dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
    p = T.batched_dot(dv, q)  # B x N
    pm = T.nnet.softmax(p) * candmask_var
    pm = pm / pm.sum(axis=1)[:, np.newaxis]
    index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
    final_v = T.inc_subtensor(T.alloc(0.,p.shape[0],vocab_size)[index,\
        T.flatten(doc_var,outdim=2)],pm)

    return final, final_v, l_doc, l_qs
def backward(self, y):
    """Cumulatively sum `[y[..., 0], exp(y[..., 1:])]` along the last axis.

    Every increment after the first is an exponential, so the result is
    strictly increasing along the last axis.
    """
    first = y[..., 0]
    positive_steps = tt.exp(y[..., 1:])
    increments = tt.zeros(y.shape)
    increments = tt.inc_subtensor(increments[..., 0], first)
    increments = tt.inc_subtensor(increments[..., 1:], positive_steps)
    return tt.cumsum(increments, axis=-1)
def __init__(self, hidden_size, num_labels, num_features, embedding_size, \
             fixed_embeddings, activation='logistic'):
    '''
    Build the symbolic graph of a sentence classifier and compile
    `self.classify`.

    hidden_size :: dimension of the hidden layer
    num_labels :: number of labels
    num_features :: number of word embeddings in the vocabulary
    embedding_size :: dimension of the word embeddings
    fixed_embeddings :: 2-D array copied into the leading columns of
        self.emb; its shape[0] is stored as original_embedding_size and
        its shape[1] as num_fixed_embeddings
    activation :: logistic or tanh (anything else raises
        NotImplementedError)

    The compiled classify function takes (idxs, positions_upd,
    positions_nonupd) and returns the predicted label and the label
    probabilities.
    '''
    self.hidden_size = hidden_size
    self.num_labels = num_labels
    self.num_features = num_features
    self.embedding_size = embedding_size
    self.original_embedding_size = fixed_embeddings.shape[0]
    self.bidirectional = True

    if activation == 'logistic':
        self.activation_function = T.nnet.sigmoid
    elif activation == 'tanh':
        self.activation_function = T.tanh
    else:
        raise NotImplementedError

    self.create_parameters()
    self.initialize_parameters()

    # Copy the fixed embeddings to self.emb.
    num_fixed_embeddings = fixed_embeddings.shape[1]
    self.num_fixed_embeddings = num_fixed_embeddings
    E = self.emb.get_value()
    E[:, :num_fixed_embeddings] = fixed_embeddings.astype(
        theano.config.floatX)
    self.emb.set_value(E)
    #T.set_subtensor(self.emb[:, :num_fixed_embeddings], \
    #                fixed_embeddings.astype(theano.config.floatX))

    # As many elements as words in the sentence.
    self.idxs = T.ivector()
    idxs = self.idxs

    # The sentence positions are supplied by the caller as two index
    # vectors rather than derived from idxs (see the commented-out lines).
    # Gradients are later taken only with respect to the "upd" columns.
    #positions_nonupd = (idxs < num_fixed_embeddings).nonzero()[0]
    #positions_upd = (idxs >= num_fixed_embeddings).nonzero()[0]
    self.positions_nonupd = T.ivector()
    self.positions_upd = T.ivector()
    positions_nonupd = self.positions_nonupd
    positions_upd = self.positions_upd
    idxs_nonupd = idxs[positions_nonupd]
    idxs_upd = idxs[positions_upd]
    emb_nonupd = self.emb[:, idxs_nonupd]
    emb_upd = self.emb[:, idxs_upd]

    # Join the two groups of columns, then write each column back to the
    # sentence position it was taken from.
    positions = T.concatenate([positions_nonupd, positions_upd])
    emb = T.concatenate([emb_nonupd, emb_upd], axis=1)
    emb = T.set_subtensor(emb[:, positions], emb)
    #emb = self.emb[:, idxs]
    #x = emb.T
    # Project the embeddings with Wx; after the transpose there is one row
    # per column of emb.
    x = T.dot(self.Wx, emb).T

    #self.positions_to_update = T.ivector()
    #positions_to_update = self.positions_to_update
    #emb_to_update = emb[:, positions_to_update]

    self.y = T.iscalar('y') # label.
    y = self.y

    #[h, s], _ = theano.scan(fn=self.recurrence_old,
    #                        sequences=x,
    #                        outputs_info=[self.h0, None],
    #                        n_steps=x.shape[0])
    # Left-to-right pass over the rows of x.
    h, _ = theano.scan(fn=self.recurrence,
                       sequences=x,
                       outputs_info=self.h0,
                       n_steps=x.shape[0])
    if self.bidirectional:
        # Right-to-left pass: run on the reversed sequence, then flip the
        # outputs back into the original order.
        l, _ = theano.scan(fn=self.recurrence_right_to_left,
                           sequences=x[::-1, :],
                           outputs_info=self.l0,
                           n_steps=x.shape[0])
        l = l[::-1, :]
        #s = T.nnet.softmax(T.dot(self.Why, h.T).T +
        #                   T.dot(self.Wly, l.T).T + self.by)
        # Classify from the last left-to-right state and the first entry of
        # the re-ordered right-to-left states.
        s = T.nnet.softmax(
            T.dot(self.Why, h[-1, :]) + T.dot(self.Wly, l[0, :]) + self.by)
    else:
        #s = T.nnet.softmax(T.dot(self.Why, h.T).T + self.by)
        s = T.nnet.softmax(T.dot(self.Why, h[-1, :]) + self.by)

    p_y_given_x_sentence = s[0] # check.
    self.y_pred = T.argmax(p_y_given_x_sentence)
    y_pred = self.y_pred
    # 0 when the prediction equals the label, 1 otherwise.
    self.num_mistakes = 1 - T.eq(y, y_pred)

    # cost and gradients and learning rate
    self.lr = T.scalar('lr')
    lr = self.lr
    #self.sentence_nll = -T.mean(T.log(p_y_given_x_sentence)
    #                            [T.arange(x.shape[0]), y])
    self.sentence_nll = -T.log(p_y_given_x_sentence[y])

    # The first entry of self.params is excluded here; the embedding matrix
    # gets its own sparse update below.
    # NOTE(review): presumably self.params[0] is self.emb -- confirm in
    # create_parameters.
    params_to_update = self.params[1:]
    sentence_gradients = T.grad(self.sentence_nll, params_to_update)
    sentence_gradient_emb = T.grad(self.sentence_nll, emb_upd)
    # Only the columns selected through positions_upd are changed.
    sentence_update_emb = [(self.emb,
                            T.inc_subtensor(emb_upd,
                                            -lr * sentence_gradient_emb))]
    self.sentence_updates = OrderedDict(
        (p, p - lr * g) for p, g in zip(params_to_update, sentence_gradients))
    self.sentence_updates.update(sentence_update_emb)

    self.classify = theano.function(
        inputs=[idxs, positions_upd, positions_nonupd],
        outputs=[y_pred, p_y_given_x_sentence])
def fprop(self, XH, tparams):
    """Compute one step of the gated recurrent unit and return the new state.

    XH is a list of inputs: [state_belows, state_befores]. Each state
    vector is laid out as [state_before; cell_before], so the first
    `self.nout` columns hold the hidden state and the remaining columns
    the cell.

    `tparams` is looked up under the keys 'W_<parent>__<name>',
    'U_<recurrent>__<name>' and 'b_<name>'.

    Raises AttributeError when the number of inputs differs from the number
    of parents, or the number of previous states differs from the number of
    recurrent connections.
    """
    X, H = XH
    if len(X) != len(self.parent):
        raise AttributeError("The number of inputs doesn't match "
                             "with the number of parents.")
    if len(H) != len(self.recurrent):
        raise AttributeError("The number of inputs doesn't match "
                             "with the number of recurrents.")

    nout = self.nout
    n_recurrent = len(self.recurrent)

    def weight(prefix, source):
        # Parameter connecting `source` to this layer.
        return tparams[prefix + source + '__' + self.name]

    # The index of self recurrence is 0
    state = H[0]

    # Pre-activations: candidate, input/forget/output gates, then one
    # extra gate per recurrent connection.
    pre = T.zeros((X[0].shape[0], 4 * nout + n_recurrent),
                  dtype=theano.config.floatX)

    for below, (parname, parout) in izip(X, self.parent.items()):
        W = weight('W_', parname)
        if below.ndim != 1:
            pre += T.dot(below[:, :parout], W)
            continue
        # A 1-D input is used as row indices into W.
        if 'int' not in below.dtype:
            below = T.cast(below, 'int64')
        pre += W[below]

    # Recurrent contributions go to everything except the candidate block.
    for before, (recname, recout) in izip(H, self.recurrent.items()):
        U = weight('U_', recname)
        pre = T.inc_subtensor(pre[:, nout:],
                              T.dot(before[:, :recout], U[:, nout:]))
    pre += tparams['b_' + self.name]

    # Compute activations of gating units
    in_gate = T.nnet.sigmoid(pre[:, nout:2 * nout])
    forget_gate = T.nnet.sigmoid(pre[:, 2 * nout:3 * nout])
    out_gate = T.nnet.sigmoid(pre[:, 3 * nout:4 * nout])
    recurrent_gates = T.nnet.sigmoid(pre[:, 4 * nout:])

    # Candidate: each recurrent state is scaled by its own gate before it
    # is projected.
    candidate = pre[:, :nout]
    for pos, (before, (recname, recout)) in \
            enumerate(izip(H, self.recurrent.items())):
        gated = before[:, :recout] * \
            recurrent_gates[:, pos].dimshuffle(0, 'x')
        U = weight('U_', recname)
        candidate += T.dot(gated, U[:, :nout])

    # Update hidden & cell states
    new_cell = forget_gate * state[:, nout:] + \
        in_gate * self.nonlin(candidate)
    state = T.set_subtensor(state[:, nout:], new_cell)
    state = T.set_subtensor(state[:, :nout],
                            out_gate * self.nonlin(state[:, nout:]))
    state.name = self.name
    return state
def set_subtensor(subtensor, amnt):
    """Return the expression with `subtensor` overwritten by `amnt`.

    Thin wrapper around `T.inc_subtensor` with `set_instead_of_inc=True`.
    """
    overwritten = T.inc_subtensor(subtensor, amnt, set_instead_of_inc=True)
    return overwritten
def get_output_for(self, input, deterministic=False, **kwargs):
    """Rearrange channels into an `upscale`-times larger spatial grid.

    With `r = self.upscale`, output positions `[y::r, x::r]` on the last
    two axes are filled from input channels `r * y + x`, `r * y + x + r*r`,
    and so on. `deterministic` and `kwargs` are accepted but not used.
    """
    result = T.zeros(self.get_output_shape_for(input.shape))
    factor = self.upscale
    for row_offset in range(factor):
        for col_offset in range(factor):
            first_channel = factor * row_offset + col_offset
            source = input[:, first_channel::factor * factor, :, :]
            target = result[:, :, row_offset::factor, col_offset::factor]
            result = T.inc_subtensor(target, source)
    return result
def test_incsubtensor2(self):
    """Run `check_rop_lop` on a shared vector incremented by part of `self.x`.

    The shared vector holds ten uniform samples from `self.rng`; its first
    four entries are incremented by the first four entries of `self.x`.
    """
    base_values = numpy.asarray(
        self.rng.uniform(size=(10, )), theano.config.floatX)
    base = theano.shared(base_values)
    increment = self.x[:4]
    incremented = tensor.inc_subtensor(base[:4], increment)
    self.check_rop_lop(incremented, (10, ))
def costs(self, application_call, prediction, prediction_mask,
          groundtruth, groundtruth_mask, **inputs):
    """Build the combined critic/actor training cost.

    The method computes rewards from `self.reward_brick`, obtains
    per-action outputs either from `self.critic` or from `self.merge`,
    turns them into value estimates, builds the critic cost (L2 or Huber),
    an optional value penalty and, when a critic is present, the actor cost
    with entropy / cross-entropy terms. A number of intermediate quantities
    are attached to `application_call` as auxiliary variables.

    `inputs` must contain 'attended' and 'attended_mask'; both are popped
    before the remaining entries are forwarded to `self.scores` (and to
    `self.merge` when there is no critic).

    NOTE(review): `prediction` is indexed as a 2-D array and the outputs as
    3-D; axis 0 is presumably time, axis 1 the batch and axis 2 the action
    -- confirm against the caller.

    Returns the accumulated cost, summed over axis 0.

    Raises ValueError for an unsupported `solve_bellman`, `critic_loss`,
    `value_penalty_type` or `actor_grad_estimate` setting.
    """
    def _prediction_subtensor(data):
        # For every (step, sequence) pair pick the entry of `data` that
        # belongs to the predicted label, giving an array shaped like
        # `prediction`.
        if data.ndim != 3:
            raise ValueError
        flat_data = data.reshape(
            (data.shape[0] * data.shape[1], data.shape[2]))
        flat_data = flat_data[tensor.arange(flat_data.shape[0]),
                              prediction.flatten()]
        return flat_data.reshape(
            (prediction.shape[0], prediction.shape[1]))

    attended = disconnected_grad(inputs.pop('attended'))
    attended_mask = disconnected_grad(inputs.pop('attended_mask'))

    # Compute the rewards
    rewards = self.reward_brick.apply(prediction, prediction_mask,
                                      groundtruth, groundtruth_mask)[:, :, 0]
    # Suffix sums along axis 0: the reward from each step to the end.
    future_rewards = rewards[::-1].cumsum(axis=0)[::-1]

    # Compute the critic outputs
    if self.critic:
        # Prepend one step filled with self.bos_token (and mask value 1).
        padding = tensor.repeat(tensor.fill(prediction[0:1], self.bos_token),
                                1, axis=0)
        mask_padding = tensor.repeat(tensor.fill(prediction_mask[0:1], 1.),
                                     1, axis=0)
        padded_prediction = tensor.concatenate([padding, prediction])
        padded_prediction_mask = tensor.concatenate(
            [mask_padding, prediction_mask])
        if self.critic_uses_groundtruth:
            critic_context = groundtruth
            critic_context_mask = groundtruth_mask
        else:
            critic_context = tensor.zeros_like(groundtruth[0:1])
            critic_context_mask = tensor.zeros_like(groundtruth_mask[0:1])
        critic_kwargs = dict(prediction=padded_prediction,
                             prediction_mask=padded_prediction_mask,
                             groundtruth=critic_context,
                             groundtruth_mask=critic_context_mask,
                             inputs=critic_context,
                             inputs_mask=critic_context_mask)
        if self.critic_uses_actor_states:
            extra_inputs = disconnected_grad(inputs['states'])
            # We don't need the very last hidden state of the actor
            # in extra_inputs. We have to add something instead for the shapes
            # to match. It doesn't matter at all, what exactly we add.
            critic_kwargs['extra_inputs'] = tensor.concatenate(
                [extra_inputs, tensor.zeros_like(extra_inputs[0:1])])
        critic_cg = ComputationGraph(self.critic.costs(**critic_kwargs))
        outputs, = VariableFilter(
            applications=[self.critic.generator.readout.all_outputs],
            roles=[OUTPUT])(critic_cg)
        # The first subtensor should be discarded, because it was outputted
        # for the padding. In addition to that Q-values from the first
        # 'critic_burnin_steps' will be ignored, see later in the code.
        outputs = outputs[1:]
    else:
        outputs = self.merge(**dict_subset(inputs, self.merge_names))
    prediction_outputs = _prediction_subtensor(outputs)

    # Compute Q adjustments
    adjustments = outputs
    prediction_adjustments = prediction_outputs
    if self.accumulate_outputs:
        # Each step's outputs are offset by the running sum of the outputs
        # chosen at earlier steps.
        prediction_adjustments = prediction_outputs.cumsum(axis=0)
        adjustments = tensor.inc_subtensor(
            adjustments[1:], prediction_adjustments[:-1][:, :, None])

    # Compute shared additive biases for all Q values
    if self.use_value_biases:
        value_biases = (self.value_summand.apply(attended)[:, :, 0] *
                        attended_mask).sum(axis=0)
    else:
        value_biases = tensor.zeros_like(adjustments[0, :, 0])
    values = adjustments + value_biases[None, :, None]
    prediction_values = prediction_adjustments + value_biases[None, :]

    # Mask shifted one step towards the start, with the last step zeroed.
    rolled_prediction_mask = tensor.roll(prediction_mask, -1, axis=0)
    rolled_prediction_mask = tensor.set_subtensor(
        rolled_prediction_mask[-1], 0)

    # Compute probabilities
    logs = self.scores(use_epsilon=False, **inputs)
    probs = tensor.exp(logs)
    if self.trpo_coef:
        logger.debug("Using TRPO coefficient of {}".format(self.trpo_coef))
        # New free input variable for the old probabilities.
        old_probs = tensor.tensor3('probs')
    else:
        old_probs = tensor.zeros_like(probs)
    prediction_logs = _prediction_subtensor(logs)

    # Compute value targets
    value_targets = (disconnected_grad(probs) * values).sum(axis=-1)
    value_targets = tensor.roll(value_targets, -1, axis=0)
    value_targets = (
        self.discount * value_targets * rolled_prediction_mask + rewards)
    value_targets = value_targets.astype(theano.config.floatX)

    total_costs = 0

    # Compute critic cost
    if not self.compute_targets:
        logger.debug("Using given targets")
        # Replace the computed targets with a new free input variable.
        value_targets = tensor.matrix('value_targets')
    if self.solve_bellman == 'no':
        logger.debug("Not solving Bellman, just predicting the rewards")
        value_targets = rewards.copy(name='value_targets')
    elif self.solve_bellman == 'without_dp':
        future_rewards = rewards[::-1].cumsum(axis=0)[::-1]
        logger.debug("Solving Bellman, but without DP")
        value_targets = future_rewards
    elif self.solve_bellman is not True:
        raise ValueError()
    critic_errors = prediction_values - value_targets
    if self.critic_loss == 'L2':
        logger.debug("L2 loss for the critic")
        critic_costs_per_char = critic_errors**2 * prediction_mask
    elif self.critic_loss == 'huber':
        logger.debug("Huber loss for the critic")
        # Squared error below 0.5 in magnitude, absolute error otherwise.
        use_L2 = tensor.lt(abs(critic_errors), 0.5)
        critic_costs_per_char = (
            use_L2 * critic_errors**2 +
            (1 - use_L2) * abs(critic_errors)) * prediction_mask
    else:
        raise ValueError()
    critic_costs = critic_costs_per_char[self.critic_burnin_steps:].sum(
        axis=0)
    if not self.freeze_critic:
        total_costs += critic_costs

    # Compute critic Monte-Carlo cost
    critic_monte_carlo_costs = (
        (((prediction_values - future_rewards)**2) *
         prediction_mask)[self.critic_burnin_steps:].sum(axis=0))

    # Value penalty
    if self.value_penalty:
        logger.debug("Use value penalty")
        if self.value_penalty_type == 'L2':
            value_deviations = (values -
                                values.mean(axis=-1, keepdims=True))**2
        elif self.value_penalty_type == 'L1':
            value_deviations = abs(values -
                                   values.mean(axis=-1, keepdims=True))
        else:
            raise ValueError("unknown value penalty type {}".format(
                self.value_penalty_type))
        if not self.freeze_critic:
            total_costs += (
                self.value_penalty *
                (value_deviations.sum(axis=-1) *
                 prediction_mask)[self.critic_burnin_steps:].sum(axis=0))

    # Compute actor cost
    if self.critic:
        # The actor cost will be minimized, that's why values
        # must be negated.
        est_name = self.actor_grad_estimate
        if est_name == 'all_actions':
            disadvantages = disconnected_grad(
                values.max(axis=-1)[:, :, None] - values)
            actor_costs = ((probs * disadvantages).sum(axis=-1) *
                           prediction_mask)
            actor_costs = actor_costs[self.critic_burnin_steps:]
        elif est_name.startswith('1_action'):
            # Here we do not provide a target for the first step for
            # the reason we lack an estimate of the value of the initial state.
            # This is how our critic works.
            # Hopefully the network won't unlearn
            # to produce a BOS first.
            future_reward_estimate = (future_rewards
                                      if est_name.endswith('unbiased')
                                      else prediction_values)
            weights = -disconnected_grad(future_reward_estimate[1:] +
                                         rewards[:-1] -
                                         prediction_values[:-1])
            actor_costs = ((prediction_logs[1:] * weights) *
                           prediction_mask[1:])
            actor_costs = actor_costs[self.critic_burnin_steps + 1:]
        else:
            raise ValueError
        actor_costs = actor_costs.sum(axis=0)

        actor_entropies = (probs * -logs).sum(axis=-1) * prediction_mask
        actor_entropies = actor_entropies[self.critic_burnin_steps:].sum(
            axis=0)
        old_actor_cross_entropies = (old_probs *
                                     -logs).sum(axis=-1) * prediction_mask
        old_actor_cross_entropies = old_actor_cross_entropies[
            self.critic_burnin_steps:].sum(axis=0)
        critic_policy = disconnected_grad(
            self.softmax.apply(self.critic_policy_t * values, extra_ndim=1))
        critic_cross_entropies = ((critic_policy * -logs).sum(axis=-1) *
                                  prediction_mask)
        critic_cross_entropies = critic_cross_entropies[
            self.critic_burnin_steps:].sum(axis=0)
        actor_costs_with_penalties = (
            actor_costs - self.entropy_reward_coof * actor_entropies
            # But really, should it be minus here, below?
            - self.cross_entropy_reward_coof * critic_cross_entropies +
            self.trpo_coef * old_actor_cross_entropies)
        if not self.freeze_actor:
            total_costs += actor_costs_with_penalties
        else:
            # Still added to the total, but wrapped in disconnected_grad.
            total_costs += disconnected_grad(actor_costs_with_penalties)

    # Add auxiliary variables for intermediate steps of the computation
    application_call.add_auxiliary_variable(rewards, name='rewards')
    application_call.add_auxiliary_variable(value_biases,
                                            name='value_biases')
    application_call.add_auxiliary_variable(values.copy(), name='values')
    application_call.add_auxiliary_variable(outputs.copy(), name='outputs')
    application_call.add_auxiliary_variable(prediction_values,
                                            name='prediction_values')
    application_call.add_auxiliary_variable(prediction_outputs,
                                            name='prediction_outputs')
    application_call.add_auxiliary_variable(value_targets.copy(),
                                            name='value_targets')
    application_call.add_auxiliary_variable(probs.copy(), name='probs')
    application_call.add_auxiliary_variable(prediction_logs,
                                            name='prediction_log_probs')

    # Compute some statistics for debugging
    # Non-zero only where the mask differs from its shifted copy.
    last_character_mask = prediction_mask - rolled_prediction_mask
    last_character_costs = (critic_costs_per_char *
                            last_character_mask).sum(axis=0)
    mean2_output = (((prediction_outputs**2) * prediction_mask).sum() /
                    prediction_mask.sum())**0.5
    max_output = abs(prediction_outputs * prediction_mask).max()
    expected_reward = (probs[0] * values[0]).sum(axis=-1)
    application_call.add_auxiliary_variable(last_character_costs,
                                            name='last_character_costs')
    application_call.add_auxiliary_variable(critic_costs.mean(),
                                            name='mean_critic_cost')
    application_call.add_auxiliary_variable(
        critic_monte_carlo_costs.mean(), name='mean_critic_monte_carlo_cost')
    if self.critic:
        application_call.add_auxiliary_variable(actor_costs.mean(),
                                                name='mean_actor_cost')
        application_call.add_auxiliary_variable(actor_entropies.mean(),
                                                name='mean_actor_entropy')
    application_call.add_auxiliary_variable(expected_reward.mean(),
                                            name='mean_expected_reward')
    application_call.add_auxiliary_variable(mean2_output,
                                            name='mean2_output')
    application_call.add_auxiliary_variable(max_output, name='max_output')
    return total_costs
def _create_update_fun(self):
    """
    Compile the prediction, gradient and update functions of the model.

    Given examples of the form:

        ( [1100, 1200, 12], [1, 2, 0, 0, 1, 0] )

    corresponding to a sequence of words 1100, 1200 and an object 12,
    with labels for class 1, 1, class 2, 2, and for the sigmoid classes
    the third class active, we can do regression.

    The word embeddings are either concatenated with the object embedding
    (`self.concatenate`) or summed over the words and added to it.

    Sets the attributes `predict_proba`, `predict`, `predict_vector`,
    `predict_vector_proba`, `gradient_fun` and `update_fun`.
    """
    word_indices = T.ivector('input')
    object_index = T.iscalar('input_object_index')
    labels = T.ivector('labels')
    sigmoid_labels = T.ivector('sigmoid_labels')

    # One row per word.
    embeddings = self.model_matrix[word_indices]
    object_embedding = self.object_matrix[object_index]

    if self.concatenate:
        # or we concatenate all the words and add the object to it
        merged_embeddings = T.concatenate(
            [embeddings.ravel(), object_embedding])
    else:
        # or we sum all the words and add the object to it.
        # The words lie along axis 0, so that is the axis to reduce; summing
        # over axis 1 would collapse the embedding dimension and leave one
        # number per word, which cannot be added to the object embedding.
        merged_embeddings = embeddings.sum(axis=0) + object_embedding

    preds, prediction, error = self.projection_fun(merged_embeddings,
                                                   labels, sigmoid_labels)

    updates = OrderedDict()
    gparams = T.grad(error, self.params)
    for gparam, param in zip(gparams, self.params):
        # The embedding matrices only change in the rows that were looked
        # up; every other parameter takes a full gradient step.
        if param is self.model_matrix:
            updates[param] = T.inc_subtensor(
                param[word_indices], -self.alpha * gparam[word_indices])
        elif param is self.object_matrix:
            updates[param] = T.inc_subtensor(
                param[object_index], -self.alpha * gparam[object_index])
        else:
            updates[param] = param - self.alpha * gparam

    self.predict_proba = theano.function([word_indices, object_index],
                                         preds + [prediction],
                                         mode=self.theano_mode)
    self.predict = theano.function([word_indices, object_index],
                                   [pred.argmax() for pred in preds] +
                                   [prediction.round()],
                                   mode=self.theano_mode)

    # Variants that take an already merged embedding vector as input.
    input_vector = T.vector()
    alt_preds, alt_prediction, alt_error = self.projection_fun(
        input_vector, labels, sigmoid_labels)
    self.predict_vector = theano.function(
        [input_vector],
        [pred.argmax() for pred in alt_preds] + [alt_prediction.round()],
        mode=self.theano_mode)
    self.predict_vector_proba = theano.function([input_vector],
                                                alt_preds + [alt_prediction],
                                                mode=self.theano_mode)

    # Only pass the label inputs that the model actually has outputs for.
    training_inputs = []
    if len(self.output_classes) > 0:
        training_inputs.append(labels)
    if self.output_sigmoid_classes > 0:
        training_inputs.append(sigmoid_labels)

    self.gradient_fun = theano.function(
        [word_indices, object_index] + training_inputs,
        gparams,
        mode=self.theano_mode)
    self.update_fun = theano.function(
        [word_indices, object_index] + training_inputs,
        error,
        updates=updates,
        mode=self.theano_mode)
def step(x_t, M_tm1, c_tm1, h_tm1, r_tm1, wr_tm1, wu_tm1):
    """Advance the memory-augmented controller by one step.

    Takes the current input `x_t` and the previous memory `M_tm1`, cell
    state `c_tm1`, hidden state `h_tm1`, read vector `r_tm1`, read weights
    `wr_tm1` and usage weights `wu_tm1`; returns the updated
    `(M_t, c_t, h_t, r_t, wr_t, wu_t)` in that order.

    The weights (`W_*`, `b_*`), `gamma`, `batch_size`, `nb_reads`,
    `controller_size` and `memory_shape` come from the enclosing scope.
    The shape comments are the original author's.
    """
    # Feed Forward controller
    # h_t = lasagne.nonlinearities.tanh(T.dot(x_t, W_h) + b_h)
    # LSTM controller
    # p.3: "This memory is used by the controller as the input to a classifier,
    #       such as a softmax output layer, and as an additional
    #       input for the next controller state." -> T.dot(r_tm1, W_rh)
    preactivations = T.dot(x_t, W_xh) + T.dot(r_tm1, W_rh) + T.dot(
        h_tm1, W_hh) + b_h
    # Split into forget gate, input gate, output gate and candidate update.
    gf_, gi_, go_, u_ = slice_equally(preactivations, controller_size, 4)
    gf = lasagne.nonlinearities.sigmoid(gf_)
    gi = lasagne.nonlinearities.sigmoid(gi_)
    go = lasagne.nonlinearities.sigmoid(go_)
    u = lasagne.nonlinearities.tanh(u_)

    c_t = gf * c_tm1 + gi * u
    h_t = go * lasagne.nonlinearities.tanh(c_t)  # (batch_size, num_units)

    # Key, add vector and interpolation gate, all computed from h_t.
    k_t = lasagne.nonlinearities.tanh(
        T.dot(h_t, W_key) + b_key)  # (batch_size, nb_reads, memory_size[1])
    a_t = lasagne.nonlinearities.tanh(
        T.dot(h_t, W_add) + b_add)  # (batch_size, nb_reads, memory_size[1])
    sigma_t = lasagne.nonlinearities.sigmoid(
        T.dot(h_t, W_sigma) + b_sigma)  # (batch_size, nb_reads, 1)
    sigma_t = T.addbroadcast(sigma_t, 2)

    # Indices of the nb_reads smallest usage weights in every row.
    wlu_tm1 = T.argsort(wu_tm1, axis=1)[:, :nb_reads]  # (batch_size, nb_reads)

    # ww_t = sigma_t * wr_tm1 + (1. - sigma_t) * wlu_tm1
    # The second term is one-hot in the least-used locations, so it is
    # added by incrementing only those positions.
    ww_t = (sigma_t * wr_tm1).reshape(
        (batch_size * nb_reads, memory_shape[0]))
    ww_t = T.inc_subtensor(
        ww_t[T.arange(batch_size * nb_reads), wlu_tm1.flatten()],
        1. - sigma_t.flatten())  # (batch_size * nb_reads, memory_size[0])
    ww_t = ww_t.reshape(
        (batch_size, nb_reads, memory_shape[0]))  # (batch_size, nb_reads, memory_size[0])

    # p.4: "Prior to writing to memory, the least used memory location is
    #       computed from wu_tm1 and is set to zero"
    M_t = T.set_subtensor(M_tm1[T.arange(batch_size), wlu_tm1[:, 0]], 0.)
    M_t = M_t + T.batched_dot(ww_t.dimshuffle(
        0, 2, 1), a_t)  # (batch_size, memory_size[0], memory_size[1])

    # Read weights: softmax over memory locations of the key/memory
    # similarity.
    K_t = cosine_similarity(k_t, M_t)  # (batch_size, nb_reads, memory_size[0])
    wr_t = lasagne.nonlinearities.softmax(
        K_t.reshape((batch_size * nb_reads, memory_shape[0])))
    wr_t = wr_t.reshape(
        (batch_size, nb_reads, memory_shape[0]))  # (batch_size, nb_reads, memory_size[0])
    if batch_size == 1:
        # NOTE(review): presumably keeps the broadcast pattern consistent
        # with the scan state when the batch axis has length 1 -- confirm.
        wr_t = T.unbroadcast(wr_t, 0)

    # Usage weights: decayed previous usage plus this step's reads and
    # writes.
    wu_t = gamma * wu_tm1 + T.sum(wr_t, axis=1) + T.sum(
        ww_t, axis=1)  # (batch_size, memory_size[0])
    r_t = T.batched_dot(wr_t, M_t).flatten(
        ndim=2)  # (batch_size, nb_reads * memory_size[1])
    return M_t, c_t, h_t, r_t, wr_t, wu_t
def optimization_sgd(trainvec, testvec, n_epochs, batch_size, alpha=0.01, beta=0.05):
    """Train a batched matrix-factorisation model with SGD and print errors.

    Parameters
    ----------
    trainvec, testvec : 2-D arrays whose columns 0, 1 and 2 hold a 1-based
        user id, a 1-based item id and the rating.
    n_epochs : number of passes over `trainvec`.
    batch_size : number of training rows per update.
    alpha : step size applied to the gradient rows.
    beta : passed to `MF_Batch.errors` (presumably the regularisation
        weight -- confirm in `MF_Batch`).

    After every epoch the training cost and the test error are printed;
    after the last epoch the test error is printed once more. Nothing is
    returned.

    Every row is visited, including a final batch shorter than the batch
    size. The test set is always evaluated in batches of 1000, and this no
    longer overwrites the training `batch_size` after the first epoch.
    """
    user_idx = T.lvector('i')
    item_idx = T.lvector('j')
    ratings_var = T.dvector('x')

    # Model size and initialisation.
    # NOTE(review): the user/item counts look like a fixed dataset size --
    # confirm before using other data.
    num_user = 6040
    num_item = 3952
    factors = 20
    init_mean = 0
    init_stdev = 0.02
    test_batch_size = 1000

    mfobj = MF_Batch(user_idx, item_idx, num_user, num_item, factors,
                     init_mean, init_stdev)
    regcost, error = mfobj.errors(ratings_var, beta)
    gp, gq = T.grad(cost=regcost, wrt=[mfobj.P, mfobj.Q])

    # Only the rows of P and Q that take part in the batch are moved.
    updates = [
        (mfobj.P,
         T.inc_subtensor(mfobj.P[user_idx, :], -gp[user_idx, :] * alpha)),
        (mfobj.Q,
         T.inc_subtensor(mfobj.Q[item_idx, :], -gq[item_idx, :] * alpha)),
    ]
    train_model = theano.function(
        inputs=[user_idx, item_idx, ratings_var],
        outputs=regcost,
        updates=updates)
    test_model = theano.function(
        inputs=[user_idx, item_idx, ratings_var],
        outputs=error)

    # Ratings are centred on the training mean for both sets.
    mean_rating = np.mean(trainvec[:, 2])

    def run_batches(model, data, size):
        # Feed `data` to `model` in consecutive batches of at most `size`
        # rows and return the sum of the model's outputs.
        n_rows = len(data)
        total = 0
        for start in range(0, n_rows, size):
            batch = np.arange(start, min(n_rows, start + size))
            total += model(data[batch, 0] - 1,
                           data[batch, 1] - 1,
                           data[batch, 2] - mean_rating)
        return total

    n_train = len(trainvec)
    n_test = len(testvec)

    for epoch in range(1, n_epochs + 1):
        train_total = run_batches(train_model, trainvec, batch_size)
        test_total = run_batches(test_model, testvec, test_batch_size)
        print(
            "the training cost at epoch {} is {}, and the testing error is {}".
            format(epoch, np.sqrt(train_total / n_train),
                   np.sqrt(test_total / n_test)))

    # test it on the test dataset
    diff = run_batches(test_model, testvec, test_batch_size)
    print("Total average test error for {} instances is {}".format(
        n_test, np.sqrt(diff / n_test)))
def _reflection_vector_derivs(dstep, step, v_re, v_im, n_hidden):
    """Per-example derivative contributions for one reflection vector.

    ``dstep`` and ``step`` are 2-D tensors whose columns ``[:n_hidden]`` and
    ``[n_hidden:]`` are treated as two halves (the names used by the caller
    suggest real and imaginary parts).  ``v_re`` / ``v_im`` are the matching
    halves of the reflection vector.

    Returns the pair ``(dv_re_contribution, dv_im_contribution)``.
    """
    vstarv = (v_re**2 + v_im**2).sum()

    dstep_re = dstep[:, :n_hidden]
    dstep_im = dstep[:, n_hidden:]
    step_re = step[:, :n_hidden]
    step_im = step[:, n_hidden:]

    def column_dot(mat, vec):
        # Row-wise dot product, kept as an (n_b x 1) column for broadcasting.
        return T.dot(mat, vec.T).dimshuffle(0, 'x')

    dstep_re_dot_v_re = column_dot(dstep_re, v_re)
    dstep_re_dot_v_im = column_dot(dstep_re, v_im)
    dstep_im_dot_v_re = column_dot(dstep_im, v_re)
    dstep_im_dot_v_im = column_dot(dstep_im, v_im)
    step_re_dot_v_re = column_dot(step_re, v_re)
    step_re_dot_v_im = column_dot(step_re, v_im)
    step_im_dot_v_re = column_dot(step_im, v_re)
    step_im_dot_v_im = column_dot(step_im, v_im)

    # These two sums are shared by the dv_re and dv_im second-order terms.
    outer_sum_re = (T.outer(step_re_dot_v_re, v_re) +
                    T.outer(step_re_dot_v_im, v_im) -
                    T.outer(step_im_dot_v_im, v_re) +
                    T.outer(step_im_dot_v_re, v_im))
    outer_sum_im = (T.outer(step_im_dot_v_re, v_re) +
                    T.outer(step_im_dot_v_im, v_im) +
                    T.outer(step_re_dot_v_im, v_re) -
                    T.outer(step_re_dot_v_re, v_im))
    weight_re = (dstep_re * outer_sum_re).sum(axis=1)
    weight_im = (dstep_im * outer_sum_im).sum(axis=1)

    # ---- contribution to the first half of v ----
    re_dv_re_term1 = -2. / vstarv * (
        dstep_re * step_re_dot_v_re + dstep_re_dot_v_re * step_re -
        dstep_re * step_im_dot_v_im + dstep_re_dot_v_im * step_im)
    re_dv_re_term2 = 4. / (vstarv**2) * T.outer(weight_re, v_re)
    im_dv_re_term1 = -2. / vstarv * (
        dstep_im * step_im_dot_v_re + dstep_im_dot_v_re * step_im +
        dstep_im * step_re_dot_v_im - dstep_im_dot_v_im * step_re)
    im_dv_re_term2 = 4. / (vstarv**2) * T.outer(weight_im, v_re)
    dv_re_contribution = (re_dv_re_term1 + re_dv_re_term2 +
                          im_dv_re_term1 + im_dv_re_term2)

    # ---- contribution to the second half of v ----
    re_dv_im_term1 = -2. / vstarv * (
        dstep_re * step_re_dot_v_im + dstep_re_dot_v_im * step_re -
        dstep_re_dot_v_re * step_im + dstep_re * step_im_dot_v_re)
    re_dv_im_term2 = 4. / (vstarv**2) * T.outer(weight_re, v_im)
    im_dv_im_term1 = -2. / vstarv * (
        dstep_im * step_im_dot_v_im + dstep_im_dot_v_im * step_im +
        dstep_im_dot_v_re * step_re - dstep_im * step_re_dot_v_re)
    im_dv_im_term2 = 4. / (vstarv**2) * T.outer(weight_im, v_im)
    dv_im_contribution = (re_dv_im_term1 + re_dv_im_term2 +
                          im_dv_im_term1 + im_dv_im_term2)

    return dv_re_contribution, dv_im_contribution


def gradient_recurrence(x_t_plus_1, y_t_plus_1, y_t, isend_t, dh_t_plus_1,
                        h_t_plus_1, dV_re_t_plus_1, dV_im_t_plus_1,
                        dhidden_bias_t_plus_1, dtheta_t_plus_1,
                        dreflection_t_plus_1, dscale_t_plus_1, dU_t_plus_1,
                        dout_bias_t_plus_1, V_re, V_im, hidden_bias, theta,
                        reflection, scale, U, out_bias):
    """One backward step: rebuild ``h_t`` from ``h_{t+1}`` and accumulate gradients.

    The hidden state at time ``t`` is reconstructed by inverting the
    non-linearity and undoing the chain of diagonal / reflection / FFT /
    permutation operations in reverse order.  The incoming ``d*_t_plus_1``
    accumulators are then extended with this step's contributions.

    ``n_hidden`` and ``reverse_index_permute`` are read from the enclosing
    scope.  The argument order looks like a ``theano.scan`` step signature
    (sequences, recurrent outputs, non-sequences) -- confirm at the call site.

    Returns
    -------
    list
        ``[dh_t, h_t, dV_re_t, dV_im_t, dhidden_bias_t, dtheta_t,
        dreflection_t, dscale_t, dU_t, dout_bias_t]``.

    The duplicated reflection-derivative code now lives in
    ``_reflection_vector_derivs`` and unused intermediates were dropped; the
    symbolic graph that is returned is unchanged.
    """
    dV_re_t = dV_re_t_plus_1
    dV_im_t = dV_im_t_plus_1
    dhidden_bias_t = dhidden_bias_t_plus_1
    dtheta_t = dtheta_t_plus_1
    dreflection_t = dreflection_t_plus_1
    dscale_t = dscale_t_plus_1
    dU_t = dU_t_plus_1
    dout_bias_t = dout_bias_t_plus_1

    # Compute h_t -----------------------------------------------------------
    data_linoutput_re = T.dot(x_t_plus_1, V_re)
    data_linoutput_im = T.dot(x_t_plus_1, V_im)
    data_linoutput = T.concatenate([data_linoutput_re, data_linoutput_im],
                                   axis=1)

    total_linoutput = apply_nonlinearity_inverse(h_t_plus_1, hidden_bias)
    hidden_linoutput = total_linoutput - data_linoutput

    # Undo the hidden-to-hidden operator one factor at a time.
    step8 = scale_diag(hidden_linoutput, n_hidden, 1. / scale)
    step7 = times_diag(step8, n_hidden, -theta[2, :])
    step6 = times_reflection(step7, n_hidden, reflection[1, :])
    step5 = do_fft(step6, n_hidden)
    step4 = times_diag(step5, n_hidden, -theta[1, :])
    step3 = vec_permutation(step4, n_hidden, reverse_index_permute)
    step2 = times_reflection(step3, n_hidden, reflection[0, :])
    step1 = do_ifft(step2, n_hidden)
    step0 = times_diag(step1, n_hidden, -theta[0, :])

    h_t = step0

    # Compute deriv contributions to hidden to output params ------------------
    dU_contribution, dout_bias_contribution = \
        hidden_output_derivs(h_t_plus_1, U, out_bias, y_t_plus_1)

    dU_t = dU_t + dU_contribution
    dout_bias_t = dout_bias_t + dout_bias_contribution

    # Compute derivative of linoutputs ----------------------------------------
    deriv, rescale, modTL = compute_nonlinearity_deriv(
        total_linoutput, hidden_bias)

    dh_t_plus_1_TL = dh_t_plus_1 * total_linoutput
    matrix = dh_t_plus_1_TL[:, :n_hidden] + dh_t_plus_1_TL[:, n_hidden:]
    matrix = matrix * (deriv - rescale) / (modTL**2)

    dtotal_linoutput = dh_t_plus_1 * T.tile(rescale, [1, 2]) \
        + T.tile(matrix, [1, 2]) * total_linoutput

    # Both the hidden and the data pre-activations receive the same gradient.
    dhidden_linoutput = dtotal_linoutput
    ddata_linoutput = dtotal_linoutput

    # Compute deriv contributions to hidden bias ------------------------------
    dhidden_bias_contribution = dh_t_plus_1_TL * T.tile(
        deriv / modTL, [1, 2])
    dhidden_bias_t = dhidden_bias_t + dhidden_bias_contribution[:, :n_hidden] \
        + dhidden_bias_contribution[:, n_hidden:]

    # Compute derivative of h_t -----------------------------------------------
    # use transpose conjugate operations
    dstep8 = scale_diag(dhidden_linoutput, n_hidden, scale)
    dstep7 = times_diag(dstep8, n_hidden, -theta[2, :])
    dstep6 = times_reflection(dstep7, n_hidden, reflection[1, :])
    dstep5 = do_fft(dstep6, n_hidden)
    dstep4 = times_diag(dstep5, n_hidden, -theta[1, :])
    dstep3 = vec_permutation(dstep4, n_hidden, reverse_index_permute)
    dstep2 = times_reflection(dstep3, n_hidden, reflection[0, :])
    dstep1 = do_ifft(dstep2, n_hidden)
    dstep0 = times_diag(dstep1, n_hidden, -theta[0, :])

    dh_t = dstep0
    dh_t_contribution = compute_dctdht(h_t, U, out_bias, y_t)
    # The cost term at time t is added only when isend_t equals 0.
    dh_t = theano.ifelse.ifelse(T.eq(isend_t, 0), dh_t + dh_t_contribution,
                                dh_t)

    # Compute deriv contributions to Unitary parameters -----------------------

    # scale
    dscale_contribution = dhidden_linoutput * step8
    dscale_t = dscale_t + dscale_contribution[:, :n_hidden] \
        + dscale_contribution[:, n_hidden:]

    # theta2
    dtheta2_contribution = dstep8 * times_diag(step7, n_hidden,
                                               theta[2, :] + 0.5 * np.pi)
    dtheta_t = T.inc_subtensor(
        dtheta_t[:, 2, :], dtheta2_contribution[:, :n_hidden] +
        dtheta2_contribution[:, n_hidden:])

    # reflection1
    dv_re_contribution, dv_im_contribution = _reflection_vector_derivs(
        dstep7, step6, reflection[1, :n_hidden], reflection[1, n_hidden:],
        n_hidden)
    dreflection_t = T.inc_subtensor(dreflection_t[:, 1, :n_hidden],
                                    dv_re_contribution)
    dreflection_t = T.inc_subtensor(dreflection_t[:, 1, n_hidden:],
                                    dv_im_contribution)

    # theta1
    dtheta1_contribution = dstep5 * times_diag(step4, n_hidden,
                                               theta[1, :] + 0.5 * np.pi)
    dtheta_t = T.inc_subtensor(
        dtheta_t[:, 1, :], dtheta1_contribution[:, :n_hidden] +
        dtheta1_contribution[:, n_hidden:])

    # reflection0
    dv_re_contribution, dv_im_contribution = _reflection_vector_derivs(
        dstep3, step2, reflection[0, :n_hidden], reflection[0, n_hidden:],
        n_hidden)
    dreflection_t = T.inc_subtensor(dreflection_t[:, 0, :n_hidden],
                                    dv_re_contribution)
    dreflection_t = T.inc_subtensor(dreflection_t[:, 0, n_hidden:],
                                    dv_im_contribution)

    # theta0
    dtheta0_contribution = dstep1 * times_diag(step0, n_hidden,
                                               theta[0, :] + 0.5 * np.pi)
    dtheta_t = T.inc_subtensor(
        dtheta_t[:, 0, :], dtheta0_contribution[:, :n_hidden] +
        dtheta0_contribution[:, n_hidden:])

    # Compute deriv contributions to V ----------------------------------------
    ddata_linoutput_re = ddata_linoutput[:, :n_hidden]
    ddata_linoutput_im = ddata_linoutput[:, n_hidden:]
    # Per-example outer product of the input with the pre-activation gradient.
    dV_re_contribution = T.batched_dot(
        x_t_plus_1.dimshuffle(0, 1, 'x'),
        ddata_linoutput_re.dimshuffle(0, 'x', 1))
    dV_im_contribution = T.batched_dot(
        x_t_plus_1.dimshuffle(0, 1, 'x'),
        ddata_linoutput_im.dimshuffle(0, 'x', 1))

    dV_re_t = dV_re_t + dV_re_contribution
    dV_im_t = dV_im_t + dV_im_contribution

    return [
        dh_t, h_t, dV_re_t, dV_im_t, dhidden_bias_t, dtheta_t, dreflection_t,
        dscale_t, dU_t, dout_bias_t
    ]
def fitt(self, X, num_neg_samples=10, learning_rate=10e-4, mu=0.99, reg=0.1, epochs=10): N = len(X) V = self.V D = self.D self._get_pnw(X) W1 = init_weights((V, D)) W2 = init_weights((D, V)) W1 = theano.shared(W1) W2 = theano.shared(W2) thInput = T.iscalar('input_word') thContext = T.ivector('context') thNegSamples = T.ivector('negative') W1_subset = W1[thInput] W2_psubset = W2[:, thContext] W2_nsubset = W2[:, thNegSamples] p_activation = W1_subset.dot(W2_psubset) pos_pY = T.nnet.sigmoid(p_activation) n_activation = W1_subset.dot(W2_nsubset) neg_pY = T.nnet.sigmoid(-n_activation) cost = -T.log(pos_pY).sum() - T.log(neg_pY).sum() W1_grad = T.grad(cost, W1_subset) W2_pgrad = T.grad(cost, W2_psubset) W2_ngrad = T.grad(cost, W2_nsubset) W1_update = T.inc_subtensor(W1_subset, -learning_rate * W1_grad) W2_update = T.inc_subtensor( T.inc_subtensor(W2_psubset, -learning_rate * W2_pgrad)[:, thNegSamples], -learning_rate * W2_ngrad) updates = [(W1, W1_update), (W2, W2_update)] train_op = theano.function( inputs=[thInput, thContext, thNegSamples], outputs=cost, updates=updates, allow_input_downcast=True, ) costs = [] cost_per_epoch = [] sample_indices = range(N) for i in xrange(epochs): t0 = datetime.now() sample_indices = shuffle(sample_indices) cost_per_epoch_i = [] for it in xrange(N): j = sample_indices[it] x = X[j] if len(x) < 2 * self.context_sz + 1: continue cj = [] n = len(x) for jj in xrange(n): start = max(0, jj - self.context_sz) end = min(n, jj + 1 + self.context_sz) context = np.concatenate([x[start:jj], x[(jj + 1):end]]) context = np.array(list(set(context)), dtype=np.int32) neg_samples = self._get_negative_samples( context, num_neg_samples) c = train_op(x[jj], context, neg_samples) cj.append(c / (num_neg_samples + len(context))) ########## try one random window per sentence ########### # jj = np.random.choice(n) # start = max(0, jj - self.context_sz) # end = min(n, jj + 1 + self.context_sz) # context = np.concatenate([x[start:jj], x[(jj+1):end]]) # # NOTE: 
context can contain DUPLICATES! # # e.g. "<UNKOWN> <UNKOWN> cats and dogs" # context = np.array(list(set(context)), dtype=np.int32) # neg_samples = self._get_negative_samples(context, num_neg_samples) # c = train_op(x[jj], context, neg_samples) # cj.append(c / (num_neg_samples + len(context))) ######################################################### cj = np.mean(cj) cost_per_epoch_i.append(cj) costs.append(cj) if it % 100 == 0: sys.stdout.write('epoch:%d\tj:%d/%d\tcost:%f\r' % (i, it, N, cj)) sys.stdout.flush() epoch_cost = np.mean(cost_per_epoch_i) cost_per_epoch.append(epoch_cost) print "time to complete epoch %d:" % i, datetime.now( ) - t0, 'cost:', epoch_cost self.W1 = W1.get_value() self.W2 = W2.get_value() plt.plot(costs) plt.title('Theano costs') plt.show() plt.plot(cost_per_epoch) plt.title('Theano cost at each epoch') plt.show()
def test_jax_IncSubtensor():
    """Compare JAX and Python results for ``set_subtensor`` / ``inc_subtensor``.

    Each case builds an input-free ``FunctionGraph`` around one subtensor
    assignment and hands it to ``compare_jax_and_py``.  Basic, advanced and
    boolean-mask indices are covered for both operations.

    Two cases in the "Increment" half previously called ``set_subtensor``
    (copy-paste from the "Set" half), so incrementing through a basic slice
    and through a boolean mask was never exercised; they now call
    ``inc_subtensor``.
    """
    x_np = np.random.uniform(-1, 1, size=(3, 4, 5)).astype(tt.config.floatX)
    x_tt = tt.arange(3 * 4 * 5).reshape((3, 4, 5)).astype(tt.config.floatX)

    def check(out_tt):
        # Graph has no inputs: every operand is a constant.
        out_fg = theano.gof.FunctionGraph([], [out_tt])
        compare_jax_and_py(out_fg, [])

    # "Set" basic indices
    st_tt = tt.as_tensor_variable(np.array(-10.0, dtype=tt.config.floatX))
    check(tt.set_subtensor(x_tt[1, 2, 3], st_tt))

    st_tt = tt.as_tensor_variable(np.r_[-1.0, 0.0].astype(tt.config.floatX))
    check(tt.set_subtensor(x_tt[:2, 0, 0], st_tt))
    check(tt.set_subtensor(x_tt[0, 1:3, 0], st_tt))

    # "Set" advanced indices
    st_tt = tt.as_tensor_variable(np.r_[-1.0, 0.0].astype(tt.config.floatX))
    check(tt.set_subtensor(x_tt[[0, 2], 0, 0], st_tt))

    st_tt = tt.as_tensor_variable(x_np[[0, 2], 0, :3])
    check(tt.set_subtensor(x_tt[[0, 2], 0, :3], st_tt))

    # "Set" boolean indices
    mask_tt = tt.as_tensor_variable(x_np) > 0
    check(tt.set_subtensor(x_tt[mask_tt], 0.0))

    # "Increment" basic indices
    st_tt = tt.as_tensor_variable(np.array(-10.0, dtype=tt.config.floatX))
    check(tt.inc_subtensor(x_tt[1, 2, 3], st_tt))

    st_tt = tt.as_tensor_variable(np.r_[-1.0, 0.0].astype(tt.config.floatX))
    check(tt.inc_subtensor(x_tt[:2, 0, 0], st_tt))
    check(tt.inc_subtensor(x_tt[0, 1:3, 0], st_tt))

    # "Increment" advanced indices
    st_tt = tt.as_tensor_variable(np.r_[-1.0, 0.0].astype(tt.config.floatX))
    check(tt.inc_subtensor(x_tt[[0, 2], 0, 0], st_tt))

    st_tt = tt.as_tensor_variable(x_np[[0, 2], 0, :3])
    check(tt.inc_subtensor(x_tt[[0, 2], 0, :3], st_tt))

    # "Increment" boolean indices
    mask_tt = tt.as_tensor_variable(x_np) > 0
    check(tt.inc_subtensor(x_tt[mask_tt], 1.0))
def create_optimization_updates(cost, params, method="sgd", max_norm=10,
                                updates=None, gradients=None, lr=0.01,
                                eps=1e-8, rho=0.95, beta1=0.9, beta2=0.999,
                                gsums=None, xsums=None):
    """Build the Theano update dictionary for the chosen optimisation method.

    Parameters
    ----------
    cost : symbolic scalar differentiated when ``gradients`` is None.
    params : parameters to update; entries may be subtensor expressions.
    method : one of ``"sgd"``, ``"adagrad"``, ``"adadelta"``, ``"adam"``.
    max_norm : global gradient-norm cap; honoured only for ``"sgd"`` and
        skipped when ``None`` or ``False``.
    updates : existing update mapping to extend (a new ``OrderedDict`` is
        created when None).
    gradients : precomputed gradients matching ``params``.
    lr, eps, rho, beta1, beta2 : optimiser hyper-parameters, cast to floatX.
    gsums, xsums : optional accumulators to reuse.

    Returns
    -------
    tuple
        ``(updates, lr, g_norm, gsums, xsums, max_norm)``.  For adadelta the
        returned ``lr`` is the shared ``rho``.

    Raises
    ------
    Exception
        If ``method`` is not one of the supported names.
    """
    float_type = theano.config.floatX

    def to_floatX(value):
        return np.float64(value).astype(float_type)

    lr = theano.shared(to_floatX(lr))
    eps = to_floatX(eps)
    rho = theano.shared(to_floatX(rho))
    beta1 = theano.shared(to_floatX(beta1))
    beta2 = theano.shared(to_floatX(beta2))

    gparams = gradients if gradients is not None else T.grad(cost, params)

    # Global L2 norm over all gradients.
    squared_norm = 0
    for grad in gparams:
        squared_norm = squared_norm + grad.norm(2)**2
    g_norm = T.sqrt(squared_norm)
    g_norm_list = g_norm

    # max_norm is useful for sgd
    if method != "sgd":
        max_norm = None

    if not (max_norm is None or max_norm is False):
        max_norm = theano.shared(to_floatX(max_norm))
        # Rescale every gradient so the global norm never exceeds max_norm.
        shrink_factor = T.minimum(max_norm, g_norm + eps) / (g_norm + eps)
        gparams = [shrink_factor * grad for grad in gparams]

    if updates is None:
        updates = OrderedDict()

    # Accumulators are created before dispatch, even for an unknown method.
    if gsums is None:
        gsums = None if method == "sgd" else create_accumulators(params)
    if xsums is None:
        if method != "sgd" and method != "adagrad":
            xsums = create_accumulators(params)
        else:
            xsums = None

    if method == "sgd":
        for param, grad in zip(params, gparams):
            if not is_subtensor_op(param):
                updates[param] = param - lr * grad
                continue
            # Sparse parameter: update the tensor the slice was taken from.
            origin, _ = get_subtensor_op_inputs(param)
            updates[origin] = T.inc_subtensor(param, -lr * grad)
    elif method == "adagrad":
        create_adagrad_updates(updates, params, gparams, gsums, lr, eps)
    elif method == "adadelta":
        create_adadelta_updates(updates, params, gparams, gsums, xsums, lr,
                                eps, rho)
    elif method == "adam":
        create_adam_updates(updates, params, gparams, gsums, xsums, lr, eps,
                            beta1, beta2)
    else:
        raise Exception("Unknown optim method: {}\n".format(method))

    if method == "adadelta":
        lr = rho

    return updates, lr, g_norm_list, gsums, xsums, max_norm
def makeResidualConnectionBetweenLayersAndReturnOutput(
        myLogger, deeperLayerOutputImagesTrValTest,
        deeperLayerOutputImageShapesTrValTest,
        earlierLayerOutputImagesTrValTest,
        earlierLayerOutputImageShapesTrValTest):
    """Add an earlier layer's feature maps onto a deeper layer's output.

    Every ``*TrValTest`` argument is a 3-tuple (train, val, test).  Shapes are
    laid out as ``[batchSize, FMs, r, c, z]``.  The earlier feature maps are
    cropped with ``getMiddlePartOfFms`` to the deeper layer's spatial size and
    then summed in.  The result has the deeper layer's shape.

    If any spatial dimension of the deeper layer exceeds the earlier layer's
    (which would need upsampling, not implemented), the problem is logged
    through ``myLogger.print3`` and the process exits with status 1.

    Returns the 3-tuple (train, val, test) of summed outputs.
    """
    # Tuple-unpack each argument so a wrong-length input still fails here.
    (deeperTrain, deeperVal, deeperTest) = deeperLayerOutputImagesTrValTest
    (deeperShapeTrain, deeperShapeVal,
     deeperShapeTest) = deeperLayerOutputImageShapesTrValTest
    (earlierTrain, earlierVal, earlierTest) = earlierLayerOutputImagesTrValTest
    (earlierShapeTrain, earlierShapeVal,
     earlierShapeTest) = earlierLayerOutputImageShapesTrValTest

    deeperImages = [deeperTrain, deeperVal, deeperTest]
    deeperShapes = [deeperShapeTrain, deeperShapeVal, deeperShapeTest]
    earlierImages = [earlierTrain, earlierVal, earlierTest]
    earlierShapes = [earlierShapeTrain, earlierShapeVal, earlierShapeTest]

    # The deeper FMs can be greater only when there is upsampling. But then,
    # to do residuals, the earlier FMs would need upsampling. Not implemented.
    deeperIsLarger = any(
        np.any(np.asarray(deeperShape[2:]) > np.asarray(earlierShape[2:]))
        for deeperShape, earlierShape in zip(deeperShapes, earlierShapes))
    if deeperIsLarger:
        myLogger.print3(
            "ERROR: In function [makeResidualConnectionBetweenLayersAndReturnOutput] the RCZ-dimensions of a deeper layer FMs were found greater than the earlier layers. Not implemented functionality. Exiting!"
        )
        detailPrefixes = ("\t (train) Dimensions of Deeper Layer=",
                          "\t (val) Dimensions of Deeper Layer=",
                          "\t (test) Dimensions of Deeper Layer=")
        for prefix, deeperShape, earlierShape in zip(
                detailPrefixes, deeperShapes, earlierShapes):
            myLogger.print3(prefix + str(deeperShape) +
                            ". Dimensions of Earlier Layer=" +
                            str(earlierShape))
        exit(1)

    # Central crop of the earlier FMs, matching the deeper spatial dims.
    croppedEarlier = [
        getMiddlePartOfFms(earlierImage, deeperShape[2:])
        for earlierImage, deeperShape in zip(earlierImages, deeperShapes)
    ]

    # The channel counts of the training shapes decide the branch for all three.
    numFMsDeeper = deeperShapeTrain[1]
    numFMsEarlier = earlierShapeTrain[1]

    if numFMsDeeper < numFMsEarlier:
        # Deeper FMs are fewer than earlier: only the first numFMsDeeper
        # earlier channels are added.
        summed = [
            deeperImage + cropped[:, :numFMsDeeper, :, :, :]
            for deeperImage, cropped in zip(deeperImages, croppedEarlier)
        ]
    else:
        # Deeper layer has at least as many FMs: add into its first
        # numFMsEarlier channels and leave the remaining ones untouched.
        summed = [
            T.inc_subtensor(deeperImage[:, :numFMsEarlier, :, :, :],
                            cropped,
                            inplace=False)
            for deeperImage, cropped in zip(deeperImages, croppedEarlier)
        ]

    # Dimensions of output are the same as those of the deeperLayer
    return (summed[0], summed[1], summed[2])
def just_numeric_args(a, b):
    """Return ``a`` with ``b`` added into the region ``a[s]``.

    The index ``s`` is not a parameter; it is read from the enclosing scope.
    """
    selected = a[s]
    return tt.inc_subtensor(selected, b)
def u(i):
    """Build the update mapping that increments entry ``i`` of ``eta`` and ``lam_diag``.

    ``eta``, ``lam_diag``, ``dloss``, ``eps``, ``delta`` and ``r`` are read
    from the enclosing scope.

    Returns
    -------
    OrderedDict
        ``{eta: <eta with eta[i] += dloss * eps * delta>,
        lam_diag: <lam_diag with lam_diag[i] += eps * r ** 2>}``.

    The mapping was previously built and then discarded because the function
    fell off the end without returning it.
    NOTE(review): confirm no caller relied on the old ``None`` result.
    """
    upd = OrderedDict()
    upd[eta] = T.inc_subtensor(eta[i], dloss * eps * delta)
    upd[lam_diag] = T.inc_subtensor(lam_diag[i], eps * (r ** 2))
    return upd
def forward(self, x):
    """Map ``x`` to its first entry followed by the log of successive differences.

    Along the first axis: ``out[0] = x[0]`` and
    ``out[k] = log(x[k] - x[k - 1])`` for ``k >= 1``.
    """
    first = x[0]
    log_gaps = tt.log(x[1:] - x[:-1])

    # Start from zeros of the input's shape and fill the two regions.
    filled = tt.inc_subtensor(tt.zeros(x.shape)[0], first)
    return tt.inc_subtensor(filled[1:], log_gaps)
def _update_cps(nnet, layer, X, dW, db, loss, idx=None):
    """
    Build the update tuple for one step using compressed feature vectors.

    The per-sample statistics (``dW``, ``db``, ``loss``) are scaled, compared
    with the values stored for the same samples, and the difference is applied
    with a step size ``gamma`` clipped to ``[0, 1]``.  Stored per-sample
    values live in ``layer.W_i / b_i / l_i`` (indexed by ``idx``) when
    ``Cfg.store_on_gpu`` is set, and in the ``*_buffer`` variables otherwise.

    Returns a tuple of ``(shared_variable, new_value)`` pairs.
    """
    assert layer.isdense or layer.issvm

    on_gpu = Cfg.store_on_gpu
    if on_gpu:
        assert idx is not None

    C = Cfg.C
    D = Cfg.D
    eps = Cfg.eps
    k = layer.k

    K = (C * D) / (C + D)
    inv_n_train = T.cast(1. / nnet.data.n_train, 'floatX')

    W_s = dW * K * inv_n_train
    b_s = db * K * inv_n_train
    l_s = loss * inv_n_train

    # Difference between the fresh statistics and the stored ones.
    if on_gpu:
        stored_W, stored_b, stored_l = (layer.W_i[idx], layer.b_i[idx],
                                        layer.l_i[idx])
    else:
        stored_W, stored_b, stored_l = (layer.W_i_buffer, layer.b_i_buffer,
                                        layer.l_i_buffer)
    Deltaw = W_s - stored_W
    Deltab = b_s - stored_b
    Deltal = l_s - stored_l

    # Uncompress feature vectors and sum over the mini-batch.  Routing Deltaw
    # through the gradient of X.dot(W) accumulates the result without first
    # allocating the full per-sample outer products.
    dummy = T.dot(X, layer.W)
    DeltaW = T.grad(cost=None, wrt=layer.W, known_grads={dummy: Deltaw})

    numerator = (K * Deltal + T.sum(DeltaW * layer.W) +
                 T.sum(Deltab * layer.b))
    denominator = eps + T.sum(DeltaW ** 2) + T.sum(Deltab ** 2)
    gamma = (numerator / denominator).clip(0, 1)

    new_W = layer.W - gamma * DeltaW
    new_b = layer.b - gamma * Deltab
    new_l = layer.l + gamma * Deltal

    if on_gpu:
        # shared variables to update, and the new values to assign
        layer_W_i, layer_b_i, layer_l_i = layer.W_i, layer.b_i, layer.l_i
        W_i = T.inc_subtensor(layer.W_i[idx], gamma * Deltaw)
        b_i = T.inc_subtensor(layer.b_i[idx], gamma * Deltab)
        l_i = T.inc_subtensor(layer.l_i[idx], gamma * Deltal)
    else:
        # shared variables to update, and the new values to assign
        layer_W_i, layer_b_i, layer_l_i = (layer.W_i_buffer,
                                           layer.b_i_buffer,
                                           layer.l_i_buffer)
        W_i = layer.W_i_buffer + gamma * Deltaw
        b_i = layer.b_i_buffer + gamma * Deltab
        l_i = layer.l_i_buffer + gamma * Deltal

    # Running average with weights k / (k + 2) and 2 / (k + 2).
    keep_weight = T.cast((k * 1. / (k + 2)), 'floatX')
    new_weight = T.cast((2. / (k + 2)), 'floatX')
    W_avg = keep_weight * layer.W_avg + new_weight * new_W
    b_avg = keep_weight * layer.b_avg + new_weight * new_b
    k = k + 1

    updates = ((layer.W, new_W),
               (layer.b, new_b),
               (layer.W_avg, W_avg),
               (layer.b_avg, b_avg),
               (layer.k, k),
               (layer.l, new_l),
               (layer_W_i, W_i),
               (layer_b_i, b_i),
               (layer_l_i, l_i),
               (layer.gamma, gamma))

    return updates