def stack_and_shared(_input):
    """
    This will take a list of input variables, turn them into theano shared
    variables, and return them stacked in a single tensor.

    :param _input: list of input variables
    :type _input: list, object, or None
    :return: symbolic tensor of the input variables stacked, or None
    :rtype: Tensor or None
    """
    if _input is None:
        return None
    elif isinstance(_input, list):
        shared_ins = []
        for _in in _input:
            try:
                shared_ins.append(theano.shared(_in))
            except TypeError as _:
                shared_ins.append(_in)
        return T.stack(shared_ins)
    else:
        try:
            _output = [theano.shared(_input)]
        except TypeError as _:
            _output = [_input]
        return T.stack(_output)
def _step(x_, h_, c_, pred_, prob_):
    h_a = []
    c_a = []
    for it in range(self.n_levels):
        preact = T.dot(h_[it], self.U[it])
        preact += T.dot(x_, self.W[it]) + self.b[it]

        i = T.nnet.sigmoid(_slice(preact, 0, self.n_dim))
        f = T.nnet.sigmoid(_slice(preact, 1, self.n_dim))
        o = T.nnet.sigmoid(_slice(preact, 2, self.n_dim))
        c = T.tanh(_slice(preact, 3, self.n_dim))

        c = f * c_[it] + i * c
        h = o * T.tanh(c)

        h_a.append(h)
        c_a.append(c)

        x_ = h

    q = T.dot(h, self.L) + self.b0
    prob = T.nnet.softmax(q)
    pred = T.argmax(prob, axis=1)

    return T.stack(h_a).squeeze(), T.stack(c_a).squeeze(), pred, prob
def predict_K(self, x, z, params):
    # s_mean, s_x for computing mean from s_x
    Ks = []
    Ks_new = []
    offset = 0
    for kern, slice_k in zip(self.kernels, self.slices):
        params_k = params[offset: offset + kern.n_params]
        K_k, K_new_k = kern.predict_K(x[:, slice_k], z[:, slice_k], params_k)
        Ks.append(K_k)
        Ks_new.append(K_new_k)
        offset += kern.n_params

    log_weights = TT.concatenate(
        (np.asarray([0]), params[offset:offset + self.n_my_params]))
    weights = TT.exp(log_weights) / TT.exp(log_weights).sum()

    if len(self.kernels) == 1:
        return Ks[0], Ks_new[0]
    else:
        # XXX: log_K, should be logadd here (#11)
        wK = TT.sum(weights[:, None, None] * TT.stack(*Ks), axis=0)
        wK_new = TT.sum(weights[:, None, None] * TT.stack(*Ks_new), axis=0)
        return wK, wK_new
def _for_step(self,
              xi_t, xf_t, xo_t, xc_t, mask_t,
              h_tm1, c_tm1,
              context, context_mask, context_att_trans,
              hist_h, hist_h_att_trans,
              b_u):
    # context: (batch_size, context_size, context_dim)

    # (batch_size, att_layer1_dim)
    h_tm1_att_trans = T.dot(h_tm1, self.att_h_W1)

    # (batch_size, context_size, att_layer1_dim)
    att_hidden = T.tanh(context_att_trans + h_tm1_att_trans[:, None, :])

    # (batch_size, context_size, 1)
    att_raw = T.dot(att_hidden, self.att_W2) + self.att_b2

    # (batch_size, context_size)
    ctx_att = T.exp(att_raw).reshape((att_raw.shape[0], att_raw.shape[1]))

    if context_mask:
        ctx_att = ctx_att * context_mask

    ctx_att = ctx_att / T.sum(ctx_att, axis=-1, keepdims=True)

    # (batch_size, context_dim)
    ctx_vec = T.sum(context * ctx_att[:, :, None], axis=1)

    ##### attention over history #####
    if hist_h:
        hist_h = T.stack(hist_h).dimshuffle((1, 0, 2))
        hist_h_att_trans = T.stack(hist_h_att_trans).dimshuffle((1, 0, 2))

        h_tm1_hatt_trans = T.dot(h_tm1, self.hatt_h_W1)
        hatt_hidden = T.tanh(hist_h_att_trans + h_tm1_hatt_trans[:, None, :])
        hatt_raw = T.dot(hatt_hidden, self.hatt_W2) + self.hatt_b2
        hatt_raw = hatt_raw.flatten(2)

        h_att_weights = T.nnet.softmax(hatt_raw)

        # (batch_size, output_dim)
        h_ctx_vec = T.sum(hist_h * h_att_weights[:, :, None], axis=1)
    else:
        h_ctx_vec = T.zeros_like(h_tm1)
    ##### attention over history #####

    i_t = self.inner_activation(
        xi_t + T.dot(h_tm1 * b_u[0], self.U_i)
        + T.dot(ctx_vec, self.C_i) + T.dot(h_ctx_vec, self.H_i))
    f_t = self.inner_activation(
        xf_t + T.dot(h_tm1 * b_u[1], self.U_f)
        + T.dot(ctx_vec, self.C_f) + T.dot(h_ctx_vec, self.H_f))
    c_t = f_t * c_tm1 + i_t * self.activation(
        xc_t + T.dot(h_tm1 * b_u[2], self.U_c)
        + T.dot(ctx_vec, self.C_c) + T.dot(h_ctx_vec, self.H_c))
    o_t = self.inner_activation(
        xo_t + T.dot(h_tm1 * b_u[3], self.U_o)
        + T.dot(ctx_vec, self.C_o) + T.dot(h_ctx_vec, self.H_o))
    h_t = o_t * self.activation(c_t)

    h_t = (1 - mask_t) * h_tm1 + mask_t * h_t
    c_t = (1 - mask_t) * c_tm1 + mask_t * c_t

    # ctx_vec = theano.printing.Print('ctx_vec')(ctx_vec)

    return h_t, c_t, ctx_vec
def stack_and_shared(input):
    """
    This will take a list of input variables, turn them into theano shared
    variables, and return them stacked in a single tensor.

    Parameters
    ----------
    input : list or object
        List of input variables to stack into a single shared tensor.

    Returns
    -------
    tensor
        Symbolic tensor of the input variables stacked, or None if input
        was None.
    """
    if input is None:
        return None
    elif isinstance(input, list):
        shared_ins = []
        for _in in input:
            try:
                shared_ins.append(theano.shared(_in))
            except TypeError as _:
                shared_ins.append(_in)
        return T.stack(shared_ins)
    else:
        try:
            _output = [theano.shared(input)]
        except TypeError as _:
            _output = [input]
        return T.stack(_output)
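# A minimal usage sketch (not part of the original source): stacking two NumPy
# arrays through stack_and_shared above and evaluating the resulting symbolic
# tensor. Assumes theano and theano.tensor as T are imported, as the function
# itself expects.
import numpy as np
import theano
import theano.tensor as T

a = np.ones((3, 4), dtype=theano.config.floatX)
b = np.zeros((3, 4), dtype=theano.config.floatX)

stacked = stack_and_shared([a, b])   # symbolic tensor of shape (2, 3, 4)
print(stacked.eval().shape)          # -> (2, 3, 4)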
def forward_prop_step_stack(x_t, masks, h_prevs, c_prevs, stack_prevs,
                            ptrs_to_top_prevs):
    # determine, for all layers, if this input was a push/pop
    is_push, is_pop = map_push_pop(x_t, self.PUSH, self.POP)
    is_null = get_is_null(x_t, self.NULL)

    nonsymbolic_hs = []
    nonsymbolic_cs = []
    nonsymbolic_stacks = []
    nonsymbolic_ptrs_to_tops = []

    h = x_t
    for i, layer in enumerate(self.layers):
        h, c, stack, ptrs_to_top = layer.forward_prop_stack(
            h, h_prevs[i, :, :], c_prevs[i, :, :], stack_prevs[i, :, :, :],
            ptrs_to_top_prevs[i, :, :, :], is_push, is_pop, is_null)
        h = h * masks[:, :, i] / self.dropout  # inverted dropout for scaling

        nonsymbolic_hs.append(h)
        nonsymbolic_cs.append(c)
        nonsymbolic_stacks.append(stack)
        nonsymbolic_ptrs_to_tops.append(ptrs_to_top)

    h_s = T.stack(nonsymbolic_hs)
    c_s = T.stack(nonsymbolic_cs)
    stack_s = T.stack(nonsymbolic_stacks)
    ptrs_to_top_s = T.stack(nonsymbolic_ptrs_to_tops)

    o_t = self.W_hy.dot(h)

    return o_t, h_s, c_s, stack_s, ptrs_to_top_s
def tangent2ambient(self, X, Z):
    U = tensor.stack((X.U.dot(Z.M) + Z.Up, X.U), 0).reshape((-1, X.U.shape[1]))
    #U = np.hstack((X.U.dot(Z.M) + Z.Up, X.U))
    S = tensor.eye(2*self._k)
    V = tensor.stack((X.V, Z.Vp), 1).reshape((X.V.shape[0], -1))
    #V = np.vstack((X.V, Z.Vp))
    return ManifoldElementShared.from_vars((U, S, V),
                                           shape=(self._m, self._n), r=self._k)
def tangent2ambient(self, X, Z):
    U = tensor.stack((X.U.dot(Z.M) + Z.Up, X.U), 0).reshape((-1, X.U.shape[1]))
    #U = np.hstack((X.U.dot(Z.M) + Z.Up, X.U))
    S = tensor.eye(2*self._k)
    V = tensor.stack((X.V, Z.Vp), 1).reshape((X.V.shape[0], -1))
    #V = np.vstack((X.V, Z.Vp))
    return (U, S, V)
def retr(self, X, Z, t=None):
    U, S, V = X
    Up, M, Vp = Z
    if t is None:
        t = 1.0
    Qu, Ru = tensor.nlinalg.qr(Up)

    # we need rq decomposition here
    Qv, Rv = tensor.nlinalg.qr(Vp[::-1].T)
    Rv = Rv.T[::-1]
    Rv = Rv[:, ::-1]
    Qv = Qv.T[::-1]

    # now we have rq decomposition (Rv @ Qv = Z.Vp)
    #Rv, Qv = rq(Z.Vp, mode='economic')

    zero_block = tensor.zeros((Ru.shape[0], Rv.shape[1]))
    block_mat = tensor.stack(
        (
            tensor.stack((S + t * M, t * Rv), 1).reshape((Rv.shape[0], -1)),
            tensor.stack((t * Ru, zero_block), 1).reshape((Ru.shape[0], -1))
        )
    ).reshape((-1, Ru.shape[1] + Rv.shape[1]))

    Ut, St, Vt = tensor.nlinalg.svd(block_mat, full_matrices=False)

    U_res = tensor.stack((U, Qu), 1).reshape((Qu.shape[0], -1)).dot(Ut[:, :self._k])
    V_res = Vt[:self._k, :].dot(tensor.stack((V, Qv), 0).reshape((-1, Qv.shape[1])))
    # add some machinery eps to get a slightly perturbed element of a manifold
    # even if we have some zeros in S
    S_res = tensor.diag(St[:self._k]) + tensor.diag(np.spacing(1) * tensor.ones(self._k))
    return (U_res, S_res, V_res)
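# Shape check (illustrative sketch, not from the original module): for equally
# sized blocks, tensor.stack((A, B), 1).reshape((A.shape[0], -1)) concatenates
# the blocks horizontally, and the outer stack(...).reshape((-1, ...)) stacks
# the two block-rows vertically, which is how block_mat above is assembled.
import numpy as np
import theano.tensor as tensor

A = tensor.matrix('A')
B = tensor.matrix('B')
row = tensor.stack((A, B), 1).reshape((A.shape[0], -1))

a = np.arange(4, dtype='float64').reshape(2, 2)
b = np.arange(4, 8, dtype='float64').reshape(2, 2)
print(row.eval({A: a, B: b}))
# [[0. 1. 4. 5.]
#  [2. 3. 6. 7.]]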
def generate(self, h_, c_, x_):
    h_a = []
    c_a = []
    for it in range(self.n_levels):
        preact = T.dot(x_, self.W[it])
        preact += T.dot(h_[it], self.U[it]) + self.b[it]

        i = T.nnet.sigmoid(self.slice(preact, 0, self.n_dim))
        f = T.nnet.sigmoid(self.slice(preact, 1, self.n_dim))
        o = T.nnet.sigmoid(self.slice(preact, 2, self.n_dim))
        c = T.tanh(self.slice(preact, 3, self.n_dim))

        c = f * c_[it] + i * c
        h = o * T.tanh(c)

        h_a.append(h)
        c_a.append(c)

        x_ = h

    q = T.dot(h, self.L) + self.b0
    # mask = T.concatenate([T.alloc(np_floatX(1.), q.shape[0] - 1),
    #                       T.alloc(np_floatX(0.), 1)])
    prob = T.nnet.softmax(q / 1)

    return prob, T.stack(h_a).squeeze(), T.stack(c_a)[0].squeeze()
def func(chol_vec, delta):
    chol = tt.stack([
        tt.stack([tt.exp(0.1 * chol_vec[0]), 0]),
        tt.stack([chol_vec[1], 2 * tt.exp(chol_vec[2])]),
    ])
    cov = tt.dot(chol, chol.T)
    return MvNormalLogp()(cov, delta)
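# Illustration only (assumed imports; not part of the original test module):
# the nested tt.stack calls above build a 2x2 lower-triangular Cholesky factor
# from the 3-vector chol_vec. A quick numeric check of that construction:
import numpy as np
import theano
import theano.tensor as tt

chol_vec = tt.vector('chol_vec')
chol = tt.stack([
    tt.stack([tt.exp(0.1 * chol_vec[0]), 0]),
    tt.stack([chol_vec[1], 2 * tt.exp(chol_vec[2])]),
])
f = theano.function([chol_vec], chol)
print(f(np.array([0.1, 2.0, 3.0], dtype=theano.config.floatX)))
# [[exp(0.01)  0.        ]
#  [2.         2*exp(3.0)]]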
def finetune_cost_updates(self, center, mu, learning_rate):
    """ This function computes the cost and the updates."""

    # note : we sum over the size of a datapoint; if we are using
    #        minibatches, L will be a vector, with one entry per
    #        example in minibatch
    network_output = self.get_output()
    temp = T.pow(center - network_output, 2)

    L = T.sum(temp, axis=1)
    # Add the network reconstruction error
    z = self.get_network_reconst()
    reconst_err = T.sum(T.pow(self.x - z, 2), axis=1)
    L = self.beta*L + self.lbd*reconst_err

    cost1 = T.mean(L)
    cost2 = self.lbd*T.mean(reconst_err)
    cost3 = cost1 - cost2

    # compute the gradients of the cost of the `dA` with respect
    # to its parameters
    gparams = T.grad(cost1, self.params)
    # generate the list of updates
    updates = []
    grad_values = []
    param_norm = []
    for param, delta, gparam in zip(self.params, self.delta, gparams):
        updates.append((delta, mu*delta - learning_rate * gparam))
        updates.append((param, param + mu*mu*delta - (1+mu)*learning_rate*gparam))
        grad_values.append(gparam.norm(L=2))
        param_norm.append(param.norm(L=2))

    grad_ = T.stack(*grad_values)
    param_ = T.stack(*param_norm)

    return ((cost1, cost2, cost3, grad_, param_), updates)
def _setOutputs(self):
    inps = []
    for l in self.network.inConnections[self]:
        inps.append(l.outputs)

    self.outputs = tt.stack(inps).reshape((-1, self.nbChannels, self.height, self.width))
    self.testOutputs = tt.stack(inps).reshape((-1, self.nbChannels, self.height, self.width))
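# Rough shape sketch (hypothetical channel/size values, not from the original
# network): tt.stack adds a leading axis over the incoming connections, and the
# reshape((-1, nbChannels, height, width)) merges that axis into the batch axis.
import numpy as np
import theano
import theano.tensor as tt

a = tt.tensor4('a')
b = tt.tensor4('b')
merged = tt.stack([a, b]).reshape((-1, 3, 8, 8))

va = np.zeros((2, 3, 8, 8), dtype=theano.config.floatX)
vb = np.ones((2, 3, 8, 8), dtype=theano.config.floatX)
print(merged.eval({a: va, b: vb}).shape)   # -> (4, 3, 8, 8)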
def _batch_vectorization(self, **args):
    fun_in = args["fun"]
    symbolic_X_list = args["symbolic_X_list"]
    if "symbolic_c_inp_list" in args and "t" in args:
        t = args["t"]
        symbolic_c_inp_list = args["symbolic_c_inp_list"]
        fun = lambda x, y: fun_in(x, y, t)
    elif "symbolic_c_inp_list" in args and "t" not in args:
        symbolic_c_inp_list = args["symbolic_c_inp_list"]
        fun = fun_in
    elif "symbolic_c_inp_list" not in args and "t" in args:
        t = args["t"]
        symbolic_c_inp_list = []
        fun = lambda x, y: fun_in(x, t)

    fun_list = []
    for i in np.arange(self.number_of_rollouts):
        symbolic_X_list_i = [a[i] for a in symbolic_X_list]
        symbolic_c_inp_list_i = [a[i] for a in symbolic_c_inp_list]
        # pass the per-rollout slices to fun (the original passed the full
        # symbolic_c_inp_list here and left symbolic_c_inp_list_i unused)
        out_list = fun(symbolic_X_list_i, symbolic_c_inp_list_i)
        fun_list.append(out_list)

    if type(fun_list[0]) != list:
        return T.stack(fun_list, axis=0)
    else:
        ziped_list = [list(a) for a in zip(*fun_list)]
        return [T.stack(a, axis=0) for a in ziped_list]
def retr(self, X, Z, t=None):
    if t is None:
        t = 1.0
    # use the qr helper from theano.tensor.nlinalg, as in the tuple-based retr
    # implementation above (QRFull is the op class, not a callable decomposition)
    Qu, Ru = tensor.nlinalg.qr(Z.Up)

    # we need rq decomposition here
    Qv, Rv = tensor.nlinalg.qr(Z.Vp[::-1].T)
    Rv = Rv.T[::-1]
    Rv = Rv[:, ::-1]   # symbolic tensors do not support in-place assignment
    Qv = Qv.T[::-1]

    # now we have rq decomposition (Rv @ Qv = Z.Vp)
    #Rv, Qv = rq(Z.Vp, mode='economic')

    zero_block = tensor.zeros((Ru.shape[0], Rv.shape[1]))
    block_mat = tensor.stack(
        (
            tensor.stack((X.S + t * Z.M, t * Rv), 1).reshape((Rv.shape[0], -1)),
            tensor.stack((t * Ru, zero_block), 1).reshape((Ru.shape[0], -1))
        )
    ).reshape((-1, Ru.shape[1] + Rv.shape[1]))

    Ut, St, Vt = tensor.nlinalg.svd(block_mat, full_matrices=False)

    U = tensor.stack((X.U, Qu), 1).reshape((Qu.shape[0], -1)).dot(Ut[:, :self._k])
    V = Vt[:self._k, :].dot(tensor.stack((X.V, Qv), 0).reshape((-1, Qv.shape[1])))
    # add some machinery eps to get a slightly perturbed element of a manifold
    # even if we have some zeros in S
    S = tensor.diag(St[:self._k]) + tensor.diag(np.spacing(1) * tensor.ones(self._k))
    return ManifoldElementShared.from_vars((U, S, V),
                                           shape=(self._m, self._n), r=self._k)
def local_gpu_sum(node): if isinstance(node.op, tensor.elemwise.CAReduce): if node.op.scalar_op == scal.add: x, = node.inputs if x.owner and x.owner.op == host_from_gpu: if node.op.axis is None: reduce_mask = [1] * x.type.ndim else: reduce_mask = [0] * x.type.ndim for a in node.op.axis: assert reduce_mask[a] == 0 reduce_mask[a] = 1 gsum = GpuSum(reduce_mask) pattern = "".join(str(i) for i in reduce_mask) if hasattr(gsum, "c_code_reduce_%s" % pattern): rval = host_from_gpu(gsum(gpu_from_host(x))) if rval.type == node.outputs[0].type: return [rval] else: print >>sys.stderr, "WARNING: local_gpu_sum got type wrong" return None else: # Try to make a simpler pattern based on reshaping # The principle is that if two adjacent dimensions have the same value in # the reduce_mask, then we can reshape to make them a single dimension, do # the sum, and then reshape to get them back. shape_of = node.env.shape_feature.shape_of x_shape = shape_of[x] new_in_shp = [x_shape[0]] new_mask = [reduce_mask[0]] for i in range(1, x.type.ndim): if reduce_mask[i] == reduce_mask[i - 1]: new_in_shp[-1] *= x_shape[i] else: new_mask.append(reduce_mask[i]) new_in_shp.append(x_shape[i]) pattern = "".join(str(i) for i in new_mask) new_gsum = GpuSum(new_mask) if hasattr(new_gsum, "c_code_reduce_%s" % pattern): reshaped_x = x.reshape(tensor.stack(*new_in_shp)) sum_reshaped_x = host_from_gpu(new_gsum(gpu_from_host(reshaped_x))) if sum_reshaped_x.ndim != node.outputs[0].ndim: unreshaped_sum = sum_reshaped_x.reshape(tensor.stack(*shape_of[node.outputs[0]])) else: unreshaped_sum = sum_reshaped_x if unreshaped_sum.type == node.outputs[0].type: return [unreshaped_sum] else: print >>sys.stderr, "WARNING: local_gpu_sum got type wrong" return None raise Exception("GpuSum don't have implemented the pattern", pattern) return False
def new_attn_step(self, c_t, g_tm1, m_im1, q):
    cWq = T.stack([T.dot(T.dot(c_t, self.Wb), q)])
    cWm = T.stack([T.dot(T.dot(c_t, self.Wb), m_im1)])
    z = T.concatenate([c_t, m_im1, q, c_t*q, c_t*m_im1,
                       T.abs_(c_t-q), T.abs_(c_t-m_im1), cWq, cWm], axis=0)
    l_1 = T.dot(self.W1, z) + self.b1
    l_1 = T.tanh(l_1)
    l_2 = T.dot(self.W2, l_1) + self.b2
    return l_2[0]
def new_attention_step(self, ct, prev_g, mem, q_q):
    cWq = T.stack([T.dot(T.dot(ct, self.W_b), q_q)])
    cWm = T.stack([T.dot(T.dot(ct, self.W_b), mem)])
    z = T.concatenate([ct, mem, q_q, ct * q_q, ct * mem,
                       (ct - q_q) ** 2, (ct - mem) ** 2, cWq, cWm])

    l_1 = T.dot(self.W_1, z) + self.b_1
    l_1 = T.tanh(l_1)
    l_2 = T.dot(self.W_2, l_1) + self.b_2
    G = T.nnet.sigmoid(l_2)[0]
    return G
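# Side note (illustrative sketch, not from the original class): wrapping the
# 0-d similarity score in T.stack([...]) turns it into a length-1 vector, which
# is what allows it to be concatenated with the other 1-d feature vectors above.
import theano.tensor as T

c = T.vector('c')
W = T.matrix('W')
q = T.vector('q')
score = T.dot(T.dot(c, W), q)      # 0-d scalar
print(T.stack([score]).ndim)       # -> 1, so it can join T.concatenate([...])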
def max_pool(images, imgshp, maxpoolshp): """Implements a max pooling layer Takes as input a 2D tensor of shape batch_size x img_size and performs max pooling. Max pooling downsamples by taking the max value in a given area, here defined by maxpoolshp. Outputs a 2D tensor of shape batch_size x output_size. :param images: 2D tensor containing images on which to apply convolution. Assumed to be of shape batch_size x img_size :param imgshp: tuple containing image dimensions :param maxpoolshp: tuple containing shape of area to max pool over :return: out1, symbolic result (2D tensor) :return: out2, logical shape of the output """ N = numpy poolsize = N.int64(N.prod(maxpoolshp)) # imgshp contains either 2 entries (height,width) or 3 (nfeatures,h,w) # in the first case, default nfeatures to 1 if N.size(imgshp) == 2: imgshp = (1,) + imgshp # construct indices and index pointers for sparse matrix, which, # when multiplied with input images will generate a stack of image # patches indices, indptr, spmat_shape, sptype, outshp = \ convolution_indices.conv_eval(imgshp, maxpoolshp, maxpoolshp, mode='valid') # print 'XXXXXXXXXXXXXXXX MAX POOLING LAYER XXXXXXXXXXXXXXXXXXXX' # print 'imgshp = ', imgshp # print 'maxpoolshp = ', maxpoolshp # print 'outshp = ', outshp # build sparse matrix, then generate stack of image patches csc = theano.sparse.CSM(sptype)(N.ones(indices.size), indices, indptr, spmat_shape) patches = sparse.structured_dot(csc, images.T).T pshape = tensor.stack([images.shape[0] *\ tensor.as_tensor(N.prod(outshp)), tensor.as_tensor(imgshp[0]), tensor.as_tensor(poolsize)]) patch_stack = tensor.reshape(patches, pshape, ndim=3) out1 = tensor.max(patch_stack, axis=2) pshape = tensor.stack([images.shape[0], tensor.as_tensor(N.prod(outshp)), tensor.as_tensor(imgshp[0])]) out2 = tensor.reshape(out1, pshape, ndim=3) out3 = tensor.DimShuffle(out2.broadcastable, (0, 2, 1))(out2) return tensor.flatten(out3, 2), outshp
def cost(self, readouts, outputs): # initial state state = self.frnn_initial_state.apply(self.mlp.apply(readouts)) inputs = outputs mus = [] sigmas = [] coeffs = [] for i in range(self.number_of_steps): last_iteration = i == self.number_of_steps - 1 # First generating distribution parameters and sampling. freq_mu = self.mu.apply(state) freq_sigma = self.sigma.apply(state) + self.const freq_coeff = self.coeff2.apply(self.coeff.apply(state), extra_ndim=state.ndim - 2) + self.const freq_mu = freq_mu.reshape((-1, self.frnn_step_size, self.k)) freq_sigma = freq_sigma.reshape((-1, self.frnn_step_size, self.k)) freq_coeff = freq_coeff.reshape((-1, self.k)) # mu,sigma: shape (-1,fs,k) # coeff: shape (-1,k) mus.append(freq_mu) sigmas.append(freq_sigma) coeffs.append(freq_coeff) index = self.frnn_step_size freq_inputs = inputs[ tuple([slice(0, None)] * (inputs.ndim - 1) + [slice(index, index + self.frnn_step_size)]) ] if not last_iteration: state = self.frnn_activation.apply( self.frnn_linear_transition_state.apply(state) + self.frnn_linear_transition_input.apply(freq_inputs) ) mus = tensor.stack(mus, axis=-2) sigmas = tensor.stack(sigmas, axis=-2) coeffs = tensor.stack(coeffs, axis=-2) mus = mus.reshape((-1, self.frnn_step_size * self.number_of_steps, self.k)) sigmas = sigmas.reshape((-1, self.frnn_step_size * self.number_of_steps, self.k)) coeffs = coeffs.repeat(self.frnn_step_size, axis=-2) mus = mus[tuple([slice(0, None)] * (mus.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])] sigmas = sigmas[tuple([slice(0, None)] * (sigmas.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])] coeffs = coeffs[tuple([slice(0, None)] * (coeffs.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])] # actually prob not necessary mu = mus.reshape((-1, self.target_size)) sigma = sigmas.reshape((-1, self.target_size)) coeff = coeffs.reshape((-1, self.target_size)) return FRNN_NLL(y=outputs, mu=mu, sig=sigma, coeff=coeff, frame_size=self.frame_size, k=self.k)
def _grad_single(self, ct, s, lnC2, GAMMI2): lnC = lnC2 GAMMI = GAMMI2 v = self.v#T.as_tensor(self.v)[:,ct:] v0 = T.as_tensor(v[v[:,0]==0, :]) v1 = T.as_tensor(v[v[:,0]==1, :]) cnp = v.shape[0] # Gradient of fE wrt the priors over final state [ofE, oxS], upd_fE_single = th.scan(fn=self._free_energy, sequences=v, non_sequences=[s,self.h,lnC,self.b]) ofE0 = ofE[v0].sum() ofE1 = ofE[v1].sum() dFE0dlnC = T.jacobian(ofE0, lnC) dFE1dlnC = T.jacobian(ofE1, lnC) dFEdlnC = T.jacobian(ofE, lnC) ofE_ = T.vector() ofE_.tag.test_value = ofE.tag.test_value # Gradient of Gamma with respect to its initial condition: GAMMA, upd_GAMMA = th.scan(fn=self._upd_gamma, outputs_info=[GAMMI], non_sequences=[ofE, self.lambd, self.alpha, self.beta, cnp], n_steps=4) dGdg = T.grad(GAMMA[-1], GAMMI) dGdfE = T.jacobian(GAMMA[-1], ofE) dGdlnC = dGdfE.dot(dFEdlnC) out1 = ofE0 out2 = ofE1 maxout = T.max([out1, out2]) exp_out1 = T.exp(GAMMA[-1]*(out1 - maxout)) exp_out2 = T.exp(GAMMA[-1]*(out2 - maxout)) norm_const = exp_out1 + exp_out2 # Derivative wrt the second output (gammi): Jac1_gammi = (-(out1-out2)*dGdg* T.exp(GAMMA[-1]*(out1+out2 - 2*maxout))/(norm_const**2)) Jac2_gammi = -Jac1_gammi # dfd1_tZ = Jac1_gammi*dCdf[1][0]+ Jac2_gammi*dCdf[1][1] # Derivative wrt first input (lnc) Jac1_lnC = (T.exp(GAMMA[-1]*(out1 + out2 - 2*maxout))/(norm_const**2)* (-dGdlnC*(out1 - out2) - GAMMA[-1]*(dFE0dlnC - dFE1dlnC))) Jac2_lnC = -Jac1_lnC Jac1 = T.concatenate([T.stack(Jac1_gammi), Jac1_lnC]) Jac2 = T.concatenate([T.stack(Jac2_gammi), Jac2_lnC]) self.debug = [Jac1_lnC, Jac2_lnC, Jac2_gammi, Jac1_gammi, dFE0dlnC, dFE1dlnC, dGdg, out1, out2, v0, v1, v, ct] return Jac1, Jac2
def underdamped():
    Q = self.Q
    f = tt.sqrt(tt.maximum(1.0 - 4.0*Q**2, self.eps))
    return (
        0.5*self.S0*self.w0*Q*tt.stack([1.0+1.0/f, 1.0-1.0/f]),
        0.5*self.w0/Q*tt.stack([1.0-f, 1.0+f]),
        tt.zeros(0, dtype=self.dtype),
        tt.zeros(0, dtype=self.dtype),
        tt.zeros(0, dtype=self.dtype),
        tt.zeros(0, dtype=self.dtype),
    )
def gen_stats(params, infos, what_stats):
    if not what_stats:
        return []
    results = []
    for stat in what_stats:
        print len(params), len(infos)
        res = stat.comp_all(params, infos)
        print stat
        print res
        results.append(T.stack(*res))
    return T.stack(*results)
def local_gpua_careduce(node): if (isinstance(node.op.scalar_op, scalar.basic.Add) or isinstance(node.op.scalar_op, scalar.basic.Mul)): x, = node.inputs greduce = GpuCAReduceCuda(node.op.scalar_op, axis=node.op.axis) if x.dtype != "float32": return gvar = greduce(x) #We need to have the make node called, otherwise the mask can #be None if gvar.owner.op.supports_c_code([gpu_from_host(x)]): return greduce else: # Try to make a simpler pattern based on reshaping # The principle is that if two adjacent dimensions have # the same value in the reduce_mask, then we can reshape # to make them a single dimension, do the reduction, and # then reshape to get them back. if node.op.axis is None: reduce_mask = [1] * x.type.ndim else: reduce_mask = [0] * x.type.ndim for a in node.op.axis: assert reduce_mask[a] == 0 reduce_mask[a] = 1 shape_of = node.fgraph.shape_feature.shape_of x_shape = shape_of[x] new_in_shp = [x_shape[0]] new_mask = [reduce_mask[0]] for i in xrange(1, x.type.ndim): if reduce_mask[i] == reduce_mask[i - 1]: new_in_shp[-1] *= x_shape[i] else: new_mask.append(reduce_mask[i]) new_in_shp.append(x_shape[i]) new_greduce = GpuCAReduceCuda(new_mask, scalar_op) reshaped_x = x.reshape(tensor.stack(*new_in_shp)) gpu_reshaped_x = gpu_from_host(reshaped_x) reshaped_gpu_inputs = [gpu_reshaped_x] if new_greduce.supports_c_code(reshaped_gpu_inputs): reduce_reshaped_x = host_from_gpu( new_greduce(gpu_reshaped_x)) if reduce_reshaped_x.ndim != node.outputs[0].ndim: unreshaped_reduce = reduce_reshaped_x.reshape( tensor.stack(*shape_of[node.outputs[0]])) else: unreshaped_reduce = reduce_reshaped_x return [unreshaped_reduce]
def test_hessian(self):
    chol_vec = tt.vector('chol_vec')
    chol_vec.tag.test_value = np.array([0.1, 2, 3])
    chol = tt.stack([
        tt.stack([tt.exp(0.1 * chol_vec[0]), 0]),
        tt.stack([chol_vec[1], 2 * tt.exp(chol_vec[2])]),
    ])
    cov = tt.dot(chol, chol.T)
    delta = tt.matrix('delta')
    delta.tag.test_value = np.ones((5, 2))
    logp = MvNormalLogp()(cov, delta)
    g_cov, g_delta = tt.grad(logp, [cov, delta])
    tt.grad(g_delta.sum() + g_cov.sum(), [delta, cov])
def get_output_for(self, input, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    cores = tuple(getattr(self, attr_name) for attr_name in self.attr_names)
    unitary_input = tensor.reshape(input, (input.shape[0], 2, self.num_inputs))
    IR, II = unitary_input[:, 0, :], unitary_input[:, 1, :]
    I = tensor.stack([IR, II], axis=0)
    output = comp_wtt_image(I, cores, self.nd, self.ranks)
    output = tensor.stack([output[0, ...], output[1, ...]], axis=1)
    output = output.reshape((input.shape[0], -1))
    return output
def t_mk_pool_ready(t_pool_input, t_pool_shape): """ Prepare pooling input :param t_pool_input: 4D theano tensor batch_sz x channels x height x width :param t_pool_shape: theano lvector pool_ch x pool_h x pool_w :return: aux. sizes and input reshaped for pooling """ # sizes # input t_batch_sz = t_pool_input.shape[0] t_in_ch = t_pool_input.shape[1] t_in_h = t_pool_input.shape[2] t_in_w = t_pool_input.shape[3] # pooling t_pool_ch = t_pool_shape[0] t_pool_h = t_pool_shape[1] t_pool_w = t_pool_shape[2] # output t_out_ch = (t_in_ch + t_pool_ch - 1) // t_pool_ch t_out_h = (t_in_h + t_pool_h - 1) // t_pool_h t_out_w = (t_in_w + t_pool_w - 1) // t_pool_w # we will need to pad input (probably), so here's the padded shape: t_padded_ch = t_out_ch * t_pool_ch t_padded_h = t_out_h * t_pool_h t_padded_w = t_out_w * t_pool_w t_padded_pool_in_z = T.zeros(T.stack([t_batch_sz, t_padded_ch, t_padded_h, t_padded_w])) t_padded_pool_in = T.inc_subtensor(t_padded_pool_in_z[:t_batch_sz, :t_in_ch, :t_in_h, :t_in_w], t_pool_input) # below is all computed # spatial pooling t_sp_pooled = images2neibs(t_padded_pool_in, T.stack([t_pool_h, t_pool_w])) # spatial pooling output shape # has size (B * C * H/h * W/w) x (h*w) t_sp_pooled_dims = t_sp_pooled.shape # lines per channel # H*W / (h*w) t_lpc = (t_padded_h * t_padded_w) // (t_pool_h * t_pool_w) # shape to collect channels t_ch_pool_prep_dims_1 = T.stack([t_sp_pooled_dims[0] // t_lpc, t_lpc, t_sp_pooled_dims[1]]) # preparing pooling by channels # reshape to collect channels in a separate dimension t_ch_pool_prep_1 = T.reshape(t_sp_pooled, t_ch_pool_prep_dims_1) t_ch_pool_prep_2 = T.shape_padleft(T.transpose(t_ch_pool_prep_1, [1, 0, 2])) # prepare for channel pooling t_ch_pool_dims = T.stack([t_pool_ch, t_ch_pool_prep_dims_1[-1]]) t_pool_ready = images2neibs(t_ch_pool_prep_2, t_ch_pool_dims) return t_batch_sz, t_in_ch, t_in_h, t_in_w, t_out_ch, t_out_h, t_out_w, t_pool_ready
def symbolic_call(self,x,u): u = TT.clip(u, -self.max_force, self.max_force) #pylint: disable=E1111 dt = self.dt z = TT.take(x,0,axis=x.ndim-1) zdot = TT.take(x,1,axis=x.ndim-1) th = TT.take(x,2,axis=x.ndim-1) thdot = TT.take(x,3,axis=x.ndim-1) u0 = TT.take(u,0,axis=u.ndim-1) th1 = np.pi - th g = 10. mc = 1. # mass of cart mp = .1 # mass of pole muc = .0005 # coeff friction of cart mup = .000002 # coeff friction of pole l = 1. # length of pole def sign(x): return TT.switch(x>0, 1, -1) thddot = -(-g*TT.sin(th1) + TT.cos(th1) * (-u0 - mp * l *thdot**2 * TT.sin(th1) + muc*sign(zdot))/(mc+mp) - mup*thdot / (mp*l)) \ / (l*(4/3. - mp*TT.cos(th1)**2 / (mc + mp))) zddot = (u0 + mp*l*(thdot**2 * TT.sin(th1) - thddot * TT.cos(th1)) - muc*sign(zdot)) \ / (mc+mp) newzdot = zdot + dt*zddot newz = z + dt*newzdot newthdot = thdot + dt*thddot newth = th + dt*newthdot done = (z > self.max_cart_pos) | (z < -self.max_cart_pos) | (th > self.max_pole_angle) | (th < -self.max_pole_angle) ucost = 1e-5*(u**2).sum(axis=u.ndim-1) xcost = 1-TT.cos(th) # notdone = TT.neg(done) #pylint: disable=W0612,E1111 notdone = 1-done costs = TT.stack((done-1)*10., notdone*xcost, notdone*ucost).T #pylint: disable=E1103 newx = TT.stack(newz, newzdot, newth, newthdot).T #pylint: disable=E1103 return [newx,newx,costs,done]
def normout_actfun(input, pool_size, filt_count):
    """Apply (L2) normout over non-overlapping sets of values."""
    l_start = filt_count - pool_size
    relu_vals = T.stack(
        *[input[:, i:(l_start+i+1):pool_size] for i in range(pool_size)])
    pooled_vals = T.sqrt(T.mean(relu_vals**2.0, axis=0))
    return pooled_vals
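# Hypothetical shape check (names and sizes are illustrative, not from the
# original code): with pool_size=2 and filt_count=4, adjacent columns {0, 1}
# and {2, 3} are L2-pooled together, halving the number of feature columns.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
pooled = normout_actfun(x, pool_size=2, filt_count=4)

val = np.arange(8, dtype=theano.config.floatX).reshape(2, 4)
print(pooled.eval({x: val}).shape)   # -> (2, 2)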
def get_reconstructed_input(self, hidden):
    """ Computes the reconstructed input given the values of the hidden layer """
    repeated_conv = conv.conv2d(input=hidden, filters=self.W_prime,
                                border_mode='full')
    multiple_conv_out = [repeated_conv.flatten()] * numpy.prod(self.poolsize)
    stacked_conv_neibs = T.stack(*multiple_conv_out).T
    stretch_unpooling_out = neibs2images(stacked_conv_neibs, self.pl,
                                         self.x.shape)
    return ReLU(stretch_unpooling_out + self.b_prime.dimshuffle('x', 0, 'x', 'x'))
def convolve(kerns, kshp, nkern, images, imgshp, step=(1, 1), bias=None, mode='valid', flatten=True): """Convolution implementation by sparse matrix multiplication. :note: For best speed, put the matrix which you expect to be smaller as the 'kernel' argument "images" is assumed to be a matrix of shape batch_size x img_size, where the second dimension represents each image in raster order If flatten is "False", the output feature map will have shape: .. code-block:: python batch_size x number of kernels x output_size If flatten is "True", the output feature map will have shape: .. code-block:: python batch_size x number of kernels * output_size .. note:: IMPORTANT: note that this means that each feature map (image generate by each kernel) is contiguous in memory. The memory layout will therefore be: [ <feature_map_0> <feature_map_1> ... <feature_map_n>], where <feature_map> represents a "feature map" in raster order kerns is a 2D tensor of shape nkern x N.prod(kshp) :param kerns: 2D tensor containing kernels which are applied at every pixel :param kshp: tuple containing actual dimensions of kernel (not symbolic) :param nkern: number of kernels/filters to apply. nkern=1 will apply one common filter to all input pixels :param images: tensor containing images on which to apply convolution :param imgshp: tuple containing image dimensions :param step: determines number of pixels between adjacent receptive fields (tuple containing dx,dy values) :param mode: 'full', 'valid' see CSM.evaluate function for details :param sumdims: dimensions over which to sum for the tensordot operation. By default ((2,),(1,)) assumes kerns is a nkern x kernsize matrix and images is a batchsize x imgsize matrix containing flattened images in raster order :param flatten: flatten the last 2 dimensions of the output. By default, instead of generating a batchsize x outsize x nkern tensor, will flatten to batchsize x outsize*nkern :return: out1, symbolic result :return: out2, logical shape of the output img (nkern,heigt,width) :TODO: test for 1D and think of how to do n-d convolutions """ N = numpy # start by computing output dimensions, size, etc kern_size = N.int64(N.prod(kshp)) # inshp contains either 2 entries (height,width) or 3 (nfeatures,h,w) # in the first case, default nfeatures to 1 if N.size(imgshp) == 2: imgshp = (1, ) + imgshp # construct indices and index pointers for sparse matrix, which, # when multiplied with input images will generate a stack of image # patches indices, indptr, spmat_shape, sptype, outshp = \ convolution_indices.conv_eval(imgshp, kshp, step, mode) # build sparse matrix, then generate stack of image patches csc = theano.sparse.CSM(sptype)(N.ones(indices.size), indices, indptr, spmat_shape) patches = (sparse.structured_dot(csc, images.T)).T # compute output of linear classifier pshape = tensor.stack(images.shape[0] * tensor.as_tensor(N.prod(outshp)),\ tensor.as_tensor(imgshp[0] * kern_size)) patch_stack = tensor.reshape(patches, pshape, ndim=2) # kern is of shape: nkern x ksize*number_of_input_features # output is thus of shape: bsize*outshp x nkern output = tensor.dot(patch_stack, kerns.T) # add bias across each feature map (more efficient to do it now) if bias is not None: output += bias # now to have feature maps in raster order ... 
# go from bsize*outshp x nkern to bsize x nkern*outshp newshp = tensor.stack(images.shape[0],\ tensor.as_tensor(N.prod(outshp)),\ tensor.as_tensor(nkern)) tensout = tensor.reshape(output, newshp, ndim=3) output = tensor.DimShuffle((False, ) * tensout.ndim, (0, 2, 1))(tensout) if flatten: output = tensor.flatten(output, 2) return output, N.hstack((nkern, outshp))
def build_graph(FLAGS): """Define training graph. """ tparams = OrderedDict() trng = RandomStreams( np.random.RandomState(np.random.randint(1024)).randint( np.iinfo(np.int32).max)) print("Building the computational graph") # Define bunch of shared variables init_state = np.zeros((3, 2, FLAGS.batch_size, FLAGS.n_hidden), dtype=np.float32) tstate = sharedX(init_state, name='rnn_state') # Graph input inp = tensor.matrix('inp', dtype='int64') inp_mask = tensor.matrix('inp_mask', dtype='float32') inp.tag.test_value = np.zeros((FLAGS.max_seq_len, FLAGS.batch_size), dtype='int64') inp_mask.tag.test_value = np.ones((FLAGS.max_seq_len, FLAGS.batch_size), dtype='float32') x, y = inp[:-1], inp[1:] y_mask = inp_mask[1:] # Define input embedding layer _i_embed = LinearCell(FLAGS.n_class, FLAGS.n_input_embed, prefix='i_embed', bias=False, input_is_int=True) tparams = merge_dict(tparams, _i_embed._params) # Call input embedding layer h_i_emb_3d = _i_embed(x) # Define the first LSTM module _rnn_1 = LSTMModule(FLAGS.n_input_embed, FLAGS.n_hidden, prefix='lstm_1') tparams = merge_dict(tparams, _rnn_1._params) # Call the first LSTM module (h_rnn_1_3d, c_rnn_1_3d), last_state_1 = _rnn_1(h_i_emb_3d, tstate[0]) # Define the second LSTM module _rnn_2 = LSTMModule(FLAGS.n_hidden, FLAGS.n_hidden, prefix='lstm_2') tparams = merge_dict(tparams, _rnn_1._params) # Call the second LSTM module (h_rnn_2_3d, c_rnn_2_3d), last_state_2 = _rnn_2(h_rnn_1_3d, tstate[1]) # Define the third LSTM module _rnn_3 = LSTMModule(FLAGS.n_hidden, FLAGS.n_hidden, prefix='lstm_3') tparams = merge_dict(tparams, _rnn_3._params) # Call the third LSTM module (h_rnn_3_3d, c_rnn_3_3d), last_state_3 = _rnn_3(h_rnn_2_3d, tstate[2]) # Define output gating layer _o_gate = LinearCell([FLAGS.n_hidden] * 3, 3, prefix='o_gate', activation=tensor.nnet.sigmoid) tparams = merge_dict(tparams, _o_gate._params) # Call output gating layer h_o_gate = _o_gate([h_rnn_1_3d, h_rnn_2_3d, h_rnn_3_3d]) # Define output embedding layer _o_embed = LinearCell([FLAGS.n_hidden] * 3, FLAGS.n_output_embed, prefix='o_embed', activation=tensor.nnet.relu) tparams = merge_dict(tparams, _o_embed._params) # Call output embedding layer h_o_embed = _o_embed([ h_rnn_1_3d * h_o_gate[:, :, 0][:, :, None], h_rnn_2_3d * h_o_gate[:, :, 1][:, :, None], h_rnn_3_3d * h_o_gate[:, :, 2][:, :, None] ]) # Define output layer _output = LinearCell(FLAGS.n_output_embed, FLAGS.n_class, prefix='output') tparams = merge_dict(tparams, _output._params) # Call output layer h_logit = _output([h_o_embed]) logit_shape = h_logit.shape logit = h_logit.reshape([logit_shape[0] * logit_shape[1], logit_shape[2]]) logit = logit - logit.max(axis=1).dimshuffle(0, 'x') probs = logit - tensor.log( tensor.exp(logit).sum(axis=1).dimshuffle(0, 'x')) # Compute the cost y_flat = y.flatten() y_flat_idx = tensor.arange(y_flat.shape[0]) * FLAGS.n_class + y_flat cost = -probs.flatten()[y_flat_idx] cost = cost.reshape([y.shape[0], y.shape[1]]) cost = (cost * y_mask).sum(0) cost_len = y_mask.sum(0) last_state = tensor.stack([last_state_1, last_state_2, last_state_3], axis=0) f_prop_updates = OrderedDict() f_prop_updates[tstate] = last_state states = [tstate] # Later use for visualization inps = [inp, inp_mask] print("Building f_log_prob function") f_log_prob = theano.function(inps, [cost, cost_len], updates=f_prop_updates) cost = cost.mean() # If the flag is on, apply L2 regularization on weights if FLAGS.weight_decay > 0.: weights_norm = 0. 
for k, v in tparams.iteritems(): weights_norm += (v**2).sum() cost += weights_norm * FLAGS.weight_decay #print("Computing the gradients") grads = tensor.grad(cost, wrt=itemlist(tparams)) grads = gradient_clipping(grads, tparams, 1.) # Compile the optimizer, the actual computational graph learning_rate = tensor.scalar(name='learning_rate') gshared = [ theano.shared(p.get_value() * 0., name='%s_grad' % k) for k, p in tparams.iteritems() ] gsup = OrderedDict(izip(gshared, grads)) print("Building f_prop function") f_prop = theano.function(inps, [cost], updates=merge_dict(gsup, f_prop_updates)) opt_updates, opt_tparams = adam(learning_rate, tparams, gshared) if FLAGS.start_from_ckpt and os.path.exists(opt_file_name): opt_params = np.load(opt_file_name) zipp(opt_params, opt_tparams) print("Building f_update function") f_update = theano.function([learning_rate], [], updates=opt_updates, on_unused_input='ignore') #print("Building f_debug function") f_debug = theano.function(inps, [h_rnn_1_3d, h_rnn_2_3d, h_rnn_3_3d], updates=f_prop_updates, on_unused_input='ignore') return f_prop, f_update, f_log_prob, f_debug, tparams, opt_tparams, states, None
def RHNLayer(self, inputs, depth, batch_size, hidden_size, drop_i, drop_s, init_T_bias, init_H_bias, tied_noise): """Variational Recurrent Highway Layer (Theano implementation). References: Zilly, J, Srivastava, R, Koutnik, J, Schmidhuber, J., "Recurrent Highway Networks", 2016 Args: inputs: Theano variable, shape (num_steps, batch_size, hidden_size). depth: int, the number of RHN inner layers i.e. the number of micro-timesteps per timestep. drop_i: float, probability of dropout over inputs. drop_s: float, probability of dropout over recurrent hidden state. init_T_bias: a valid bias_init argument for linear(), initialization of bias of transform gate T. init_H_bias: a valid bias_init argument for linear(), initialization of bias of non-linearity H. tied_noise: boolean, whether to use the same dropout masks when calculating H and when calculating T. Returns: y: Theano variable, recurrent hidden states at each timestep. Shape (num_steps, batch_size, hidden_size). sticky_state_updates: a list of (shared variable, new shared variable value). """ # We first compute the linear transformation of the inputs over all timesteps. # This is done outside of scan() in order to speed up computation. # The result is then fed into scan()'s step function, one timestep at a time. noise_i_for_H = self.get_dropout_noise((batch_size, hidden_size), drop_i) noise_i_for_T = self.get_dropout_noise( (batch_size, hidden_size), drop_i) if not tied_noise else noise_i_for_H i_for_H = self.apply_dropout(inputs, noise_i_for_H) i_for_T = self.apply_dropout(inputs, noise_i_for_T) i_for_H = self.linear(i_for_H, in_size=hidden_size, out_size=hidden_size, bias=True, bias_init=init_H_bias) i_for_T = self.linear(i_for_T, in_size=hidden_size, out_size=hidden_size, bias=True, bias_init=init_T_bias) # Dropout noise for recurrent hidden state. noise_s = self.get_dropout_noise((batch_size, hidden_size), drop_s) if not tied_noise: noise_s = tt.stack( noise_s, self.get_dropout_noise((batch_size, hidden_size), drop_s)) def step_fn(i_for_H_t, i_for_T_t, y_tm1, noise_s): """ Args: Elements of sequences given to scan(): i_for_H_t: linear trans. of inputs for calculating non-linearity H at timestep t. Shape (batch_size, hidden_size). i_for_T_t: linear trans. of inputs for calculating transform gate T at timestep t. Shape (batch_size, hidden_size). Result of previous step function invocation (equals the outputs_info given to scan() on first timestep): y_tm1: Shape (batch_size, hidden_size). Non-sequences given to scan() (these are the same at all timesteps): noise_s: (batch_size, hidden_size) or (2, batch_size, hidden_size), depending on value of tied_noise. """ tanh, sigm = tt.tanh, tt.nnet.sigmoid noise_s_for_H = noise_s if tied_noise else noise_s[0] noise_s_for_T = noise_s if tied_noise else noise_s[1] s_lm1 = y_tm1 for l in range(depth): s_lm1_for_H = self.apply_dropout(s_lm1, noise_s_for_H) s_lm1_for_T = self.apply_dropout(s_lm1, noise_s_for_T) if l == 0: # On the first micro-timestep of each timestep we already have bias # terms summed into i_for_H_t and into i_for_T_t. 
H = tanh(i_for_H_t + self.linear(s_lm1_for_H, in_size=hidden_size, out_size=hidden_size, bias=False)) T = sigm(i_for_T_t + self.linear(s_lm1_for_T, in_size=hidden_size, out_size=hidden_size, bias=False)) else: H = tanh( self.linear(s_lm1_for_H, in_size=hidden_size, out_size=hidden_size, bias=True, bias_init=init_H_bias)) T = sigm( self.linear(s_lm1_for_T, in_size=hidden_size, out_size=hidden_size, bias=True, bias_init=init_T_bias)) s_l = (H - s_lm1) * T + s_lm1 s_lm1 = s_l y_t = s_l return y_t # The recurrent hidden state of the RHN is sticky (the last hidden state of one batch is carried over to the next batch, # to be used as an initial hidden state). These states are kept in a shared variable. y_0 = theano.shared(np.zeros((batch_size, hidden_size), floatX)) self.reset_hidden_state = lambda: y_0.set_value( np.zeros_like(y_0.get_value())) # invoked before every epoch. y, _ = theano.scan(step_fn, sequences=[i_for_H, i_for_T], outputs_info=[y_0], non_sequences=[noise_s]) y_last = y[-1] sticky_state_updates = [(y_0, y_last)] return y, sticky_state_updates
def init_opt(self): obs_var = ext.new_tensor( 'obs', ndim=2, dtype=theano.config.floatX) # todo: check the dtype manager_obs_var = ext.new_tensor('manager_obs', ndim=2, dtype=theano.config.floatX) action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1, ) # this will have to be the advantage every time the manager makes a decision manager_advantage_var = ext.new_tensor('manager_advantage', ndim=1, dtype=theano.config.floatX) skill_advantage_var = ext.new_tensor('skill_advantage', ndim=1, dtype=theano.config.floatX) latent_var_sparse = ext.new_tensor('sparse_latent', ndim=2, dtype=theano.config.floatX) latent_var = ext.new_tensor('latents', ndim=2, dtype=theano.config.floatX) assert isinstance(self.policy, HierarchicalPolicy) ############################################################# ### calculating the manager portion of the surrogate loss ### ############################################################# # i, j should contain the probability of latent j at time step self.period*i # should be a len(obs)//self.period by len(self.latent) tensor latent_probs = self.policy.manager.dist_info_sym( manager_obs_var)['prob'] old_latent_probs = self.old_policy.manager.dist_info_sym( manager_obs_var)['prob'] actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1) old_actual_latent_probs = TT.sum(old_latent_probs * latent_var_sparse, axis=1) lr = TT.exp( TT.log(actual_latent_probs) - TT.log(old_actual_latent_probs)) manager_surr_loss_vector = TT.minimum( lr * manager_advantage_var, TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) * manager_advantage_var) manager_surr_loss = -TT.mean(manager_surr_loss_vector) ############################################################ ### calculating the skills portion of the surrogate loss ### ############################################################ dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents( obs_var) probs = TT.stack([ self.diagonal.log_likelihood_sym(action_var, dist_info) for dist_info in dist_info_vars ], axis=1) actual_action_log_probs = TT.sum( probs * latent_var, axis=1) # todo: verify that dist_info_vars is in order # old policy stuff old_dist_info_vars = self.old_policy.low_policy.dist_info_sym_all_latents( obs_var) old_probs = TT.stack([ self.diagonal.log_likelihood_sym(action_var, dist_info) for dist_info in old_dist_info_vars ], axis=1) old_actual_action_log_probs = TT.sum(old_probs * latent_var, axis=1) skill_lr = TT.exp(actual_action_log_probs - old_actual_action_log_probs) skill_surr_loss_vector = TT.minimum( skill_lr * skill_advantage_var, TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) * skill_advantage_var) skill_surr_loss = -TT.mean(skill_surr_loss_vector) surr_loss = manager_surr_loss / self.average_period + skill_surr_loss input_list = [ obs_var, manager_obs_var, action_var, manager_advantage_var, skill_advantage_var, latent_var, latent_var_sparse ] self.optimizer.update_opt(loss=surr_loss, target=self.policy, inputs=input_list) return dict()
def get_output(self, train):
    X = self.get_input(train)
    tensors = [T.roll(X, off, axis=self.axis) for off in self.offsets]
    return T.stack(tensors, axis=self.offset_axis)
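# A standalone sketch of the same idea (variable names here are illustrative,
# not from the original layer): stack several rolled copies of a tensor along a
# new axis, as the offsets/offset_axis configuration above does.
import numpy as np
import theano
import theano.tensor as T

X = T.matrix('X')
offsets = [0, 1, 2]
shifted = T.stack([T.roll(X, off, axis=1) for off in offsets], axis=1)

val = np.arange(6, dtype=theano.config.floatX).reshape(2, 3)
print(shifted.eval({X: val}).shape)   # -> (2, 3, 3)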
def any_to_tensor_and_labels(x, labels=None): """Util for converting input x to tensor trying to create labels for columns if they are not provided. Default names for columns are ['x0', 'x1', ...], for mappable arrays (e.g. pd.DataFrame) their names are treated as labels. You can override them with `labels` argument. If you have tensor input you should provide labels as we cannot get their shape directly If you pass dict input we cannot rely on labels order thus dict keys are treated as labels anyway Parameters ---------- x : np.ndarray | pd.DataFrame | tt.Variable | dict | list labels : list - names for columns of output tensor Returns ------- (x, labels) - tensor and labels for its columns """ if isinstance(labels, six.string_types): labels = [labels] # pandas.DataFrame # labels can come from here # we can override them if isinstance(x, pd.DataFrame): if not labels: labels = x.columns x = x.as_matrix() # pandas.Series # there can still be a label # we can override labels elif isinstance(x, pd.Series): if not labels: labels = [x.name] x = x.as_matrix()[:, None] # dict # labels are keys, # cannot override them elif isinstance(x, dict): # try to do it via pandas try: x = pd.DataFrame.from_dict(x) labels = x.columns x = x.as_matrix() # some types fail there # another approach is to construct # variable by hand except (PandasError, TypeError): res = [] labels = [] for k, v in x.items(): res.append(v) labels.append(k) x = tt.stack(res, axis=1) if x.ndim == 1: x = x[:, None] # case when it can appear to be some # array like value like lists of lists # numpy deals with it elif not isinstance(x, tt.Variable): x = np.asarray(x) if x.ndim == 0: raise ValueError('Cannot use scalars') elif x.ndim == 1: x = x[:, None] # something really strange goes here, # but user passes labels trusting seems # to be a good option elif labels is not None: x = tt.as_tensor_variable(x) if x.ndim == 0: raise ValueError('Cannot use scalars') elif x.ndim == 1: x = x[:, None] else: # trust input pass # we should check that we can extract labels if labels is None and not isinstance(x, tt.Variable): labels = ['x%d' % i for i in range(x.shape[1])] # for theano variables we should have labels from user elif labels is None: raise ValueError('Please provide labels as ' 'we cannot infer shape of input') else: # trust labels, user knows what he is doing pass # it's time to check shapes if we can if not isinstance(x, tt.Variable): if not len(labels) == x.shape[1]: raise ValueError('Please provide full list ' 'of labels for coefficients, ' 'got len(labels)=%d instead of %d' % (len(labels), x.shape[1])) else: # trust labels, as we raised an # error in bad case, we have labels pass # convert labels to list if isinstance(labels, pd.RangeIndex): labels = ['x%d' % i for i in labels] # maybe it was a tuple ot whatever elif not isinstance(labels, list): labels = list(labels) # as output we need tensor if not isinstance(x, tt.Variable): x = tt.as_tensor_variable(x) # finally check dimensions if x.ndim == 0: raise ValueError('Cannot use scalars') elif x.ndim == 1: x = x[:, None] return x, labels
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, l2, normalize_attention, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input( babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.answer_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') print "==> building input module" self.W_inp_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.input_var, outputs_info=T.zeros_like( self.b_inp_hid)) self.inp_c = inp_c_history.take(self.input_mask_var, axis=0) self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, 
self.b_mem_hid)) last_mem = memory[-1] print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # TODO: add conditional ending dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX)) results, updates = theano.scan( fn=answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], n_steps=1) self.prediction = results[1][-1] else: raise Exception("invalid answer_module") print "==> collecting all parameters" self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] if self.answer_module == 'recurrent': self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy( self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], outputs=[self.prediction, self.loss])
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, l2, normalize_attention, answer_vec, debug, sentEmbdLoadState, sentEmbdType="basic", **kwargs): self.vocab = {} self.ivocab = {} self.debug = debug self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.answer_vec = answer_vec self.sentEmbdType = sentEmbdType if (self.mode != 'deploy'): self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input( babi_test_raw) self.vocab_size = len(self.vocab) print(self.vocab_size) elif self.mode == 'deploy': self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input( babi_train_raw) self.vocab_size = len(self.vocab) print(self.vocab_size) # print(self.train_input.shape) # print(self.train_q.shape) # print(self.train_input_mask.shape) #Setting up pre-trained Sentence Embedder for question and input module: if self.mode != 'deploy': print("==> Setting up pre-trained Sentence Embedder") if self.sentEmbdType == "basic": self.sent_embd = SentEmbd.SentEmbd_basic(self.word_vector_size, self.dim) else: dep_tags = utils.load_dep_tags self.sent_embd = SentEmbd.SentEmbd_syntactic( 50, hid_dim, len(dep_tags)) #TODO: Dependency Tags self.sent_embd.load_params(sentEmbdLoadState) self.input_var = T.matrix('input_var') self.q_var = T.vector('question_var') if self.answer_vec == 'word2vec': self.answer_var = T.vector('answer_var') else: self.answer_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') if self.answer_vec == 'one_hot' or self.answer_vec == 'index': self.answer_size = self.vocab_size elif self.answer_vec == 'word2vec': self.answer_size = self.word_vector_size else: raise Exception("Invalid answer_vec type") #Setting up Untrained Memory module if self.mode != 'deploy': print("==> Creating parameters for memory module") self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 2)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) if self.mode != 'deploy': print( "==> Building episodic memory module (fixed number of steps: %d)" % self.memory_hops) memory = [self.q_var.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update(memory[iter - 1], 
current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem = memory[-1] if self.mode != 'deploy': print("==> Building answer module") self.W_a = nn_utils.normal_param(std=0.1, shape=(self.answer_size, self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) # elif self.answer_module == 'recurrent': # self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.answer_size)) # self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) # self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) # self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.answer_size)) # self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) # self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) # self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.answer_size)) # self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) # self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) # def answer_step(prev_a, prev_y): # a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), # self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, # self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, # self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) # y = T.dot(self.W_a, a) # if self.answer_vec == 'one_hot' or self.answer_vec == 'index': # y = nn_utils.softmax(y) # return [a, y] # # TODO: add conditional ending # dummy = theano.shared(np.zeros((self.answer_size, ), dtype=floatX)) # results, updates = theano.scan(fn=answer_step, # outputs_info=[last_mem, T.zeros_like(dummy)], # n_steps=1) # self.prediction = results[1][-1] else: raise Exception("invalid answer_module") if self.mode != 'deploy': print("==> Collecting all parameters to be trained") self.params = [ self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] # if self.answer_module == 'recurrent': # self.params = self.params + [self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, # self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, # self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid] if self.mode != 'deploy': print("==> Building loss layer and computing updates") if debug: print('Prediction dim:', self.prediction.dimshuffle('x', 0).ndim) print('Answer dim:', self.answer_var.ndim) if self.answer_vec == 'word2vec': self.loss_ce = nn_utils.cosine_proximity_loss( self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0][0] else: self.loss_ce = T.nnet.categorical_crossentropy( self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 if debug: print(self.loss.ndim) # if self.debug: print(self.loss.eval({self.input_var:self.train_input,self.q_var:self.train_q,self.answer_var:self.train_answer,self.input_mask_var:self.train_input_mask})) updates = lasagne.updates.adadelta(self.loss, self.params) if self.mode == 'deploy': self.deploy_fn = theano.function( inputs=[self.input_var, self.q_var], outputs=[self.prediction]) else: if self.mode == 'train': print("==> 
compiling train_fn") self.train_fn = theano.function( inputs=[self.input_var, self.q_var, self.answer_var], outputs=[self.prediction, self.loss], updates=updates) print("==> compiling test_fn") self.test_fn = theano.function( inputs=[self.input_var, self.q_var, self.answer_var], outputs=[ self.prediction, self.loss, self.input_var, self.q_var, last_mem ]) if self.mode == 'train': print("==> computing gradients (for debugging)") gradient = T.grad(self.loss, self.params) self.get_gradient_fn = theano.function( inputs=[self.input_var, self.q_var, self.answer_var], outputs=gradient)
def get_output(self, input_, mask_, hidden_init): """ This function overrides the parent's implementation. Creates a symbolic expression that computes the output from an input and the previous output (hidden). Math Expression ------------------- Y[t] = out_activation(dot(X[t], W) + dot(Y[t-1], U) + b) If precompute is True, dot(X[t], W) is computed for all steps first. If a mask is given and is 0 at a step, Y[t] = Y[t-1]. Parameters ---------- input_: TensorVariable mask_: TensorVariable hidden_init: TensorVariable Returns ------- TensorVariable """ # input_ is (n_batch, n_timesteps, n_features) # change to (n_timesteps, n_batch, n_features) input_ = input_.dimshuffle(1, 0, 2) # mask_ is (n_batch, n_timesteps) mask_ = mask_.dimshuffle(1, 0, 'x') sequence_length = input_.shape[0] batch_num = input_.shape[1] # precompute input if self.precompute: additional_dims = tuple( input_.shape[k] for k in range(2, input_.ndim)) # (output_dim,) input_ = T.reshape(input_, (sequence_length * batch_num, ) + additional_dims) input_ = T.dot(input_, self.W) additional_dims = tuple( input_.shape[k] for k in range(1, input_.ndim)) # (output_dim,) input_ = T.reshape(input_, ( sequence_length, batch_num, ) + additional_dims) # step function def step(input_, hidden): if self.precompute: return self.out_activation.get_output(input_ + T.dot(hidden, self.U) + self.b) else: return self.out_activation.get_output( T.dot(input_, self.W) + T.dot(hidden, self.U) + self.b) # step function, with mask def step_masked(input_, mask_, hidden): hidden_computed = step(input_, hidden) return T.switch(mask_, hidden_computed, hidden) # main operation if self.unroll: counter = range(self.gradient_steps) if self.backward: counter = counter[::-1] # reversed index iter_output = [] outputs_info = [hidden_init] for index in counter: step_input = [input_[index], mask_[index]] + outputs_info step_output = step_masked(*step_input) iter_output.append(step_output) outputs_info = [iter_output[-1]] hidden_output = T.stack(iter_output, axis=0) else: hidden_output = theano.scan( fn=step_masked, sequences=[input_, mask_], outputs_info=[hidden_init], go_backwards=self.backward, n_steps=None, truncate_gradient=self.gradient_steps)[ 0] # only need outputs, not updates # computed outputs are (n_timesteps, n_batch, n_features) # select only the required time steps if self.output_return_index is None: hidden_output_return = hidden_output else: hidden_output_return = hidden_output[self.output_return_index] # change to (n_batch, n_timesteps, n_features) hidden_output_return = hidden_output_return.dimshuffle( 1, 0, *range(2, hidden_output_return.ndim)) # put backward outputs back into forward order if self.backward: hidden_output_return = hidden_output_return[:, ::-1] return hidden_output_return
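# A small NumPy sketch (illustrative, not part of the layer above) of the recurrence in the
# docstring: Y[t] = act(dot(X[t], W) + dot(Y[t-1], U) + b), with masked steps (mask == 0)
# copying the previous output, mirroring the T.switch in step_masked.
import numpy as np

def simple_rnn_sketch(X, mask, W, U, b, h0, act=np.tanh):
    # X: (n_timesteps, n_batch, n_in), mask: (n_timesteps, n_batch), h0: (n_batch, n_out)
    h = h0
    outputs = []
    for t in range(X.shape[0]):
        h_new = act(X[t].dot(W) + h.dot(U) + b)
        m = mask[t][:, None]              # broadcast the mask over the feature axis
        h = m * h_new + (1 - m) * h       # mask == 0 keeps the previous output
        outputs.append(h)
    return np.stack(outputs)              # (n_timesteps, n_batch, n_out)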
def neibs2images(neibs, neib_shape, original_shape, mode='valid'): """ Function :func:`neibs2images <theano.sandbox.neighbours.neibs2images>` performs the inverse operation of :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`. It inputs the output of :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` and reconstructs its input. :param neibs: matrix like the one obtained by :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` :param neib_shape: `neib_shape` that was used in :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` :param original_shape: original shape of the 4d tensor given to :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` :return: Reconstructs the input of :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`, a 4d tensor of shape `original_shape`. .. note:: Currently, the function doesn't support tensors created with `neib_step` different from default value. This means that it may be impossible to compute the gradient of a variable gained by :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` w.r.t. its inputs in this case, because it uses :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` for gradient computation. Example, which uses a tensor gained in example for :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`: .. code-block:: python im_new = neibs2images(neibs, (5, 5), im_val.shape) # Theano function definition inv_window = theano.function([neibs], im_new) # Function application im_new_val = inv_window(neibs_val) .. note:: The code will output the initial image array. """ neibs = T.as_tensor_variable(neibs) neib_shape = T.as_tensor_variable(neib_shape) original_shape = T.as_tensor_variable(original_shape) new_neib_shape = T.stack(original_shape[-1] // neib_shape[1], neib_shape[1]) output_2d = images2neibs(neibs.dimshuffle('x', 'x', 0, 1), new_neib_shape, mode=mode) if mode == 'ignore_borders': valid_shape = list(original_shape) valid_shape[2] = (valid_shape[2] // neib_shape[0]) * neib_shape[0] valid_shape[3] = (valid_shape[3] // neib_shape[1]) * neib_shape[1] output_4d = output_2d.reshape(valid_shape) # padding the borders with zeros for d in [2, 3]: pad_shape = list(output_4d.shape) pad_shape[d] = original_shape[d] - valid_shape[d] output_4d = T.concatenate([output_4d, T.zeros(pad_shape)], axis=d) elif mode == 'valid': # TODO: we do not implement all mode with this code. # Add a check for the good cases. output_4d = output_2d.reshape(original_shape) else: raise NotImplementedError("neibs2images do not support mode=%s" % mode) return output_4d
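# Hedged round-trip example for the neibs2images defined above. The import location of
# images2neibs varies across Theano versions (theano.tensor.nnet.neighbours in later releases,
# theano.sandbox.neighbours in older ones), so adjust the import as needed.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet.neighbours import images2neibs  # or theano.sandbox.neighbours

images = T.tensor4('images')
neibs = images2neibs(images, neib_shape=(5, 5))          # cut the image into 5x5 patches
window = theano.function([images], neibs)

im_val = np.arange(100, dtype=theano.config.floatX).reshape((1, 1, 10, 10))
neibs_val = window(im_val)                               # shape (4, 25): four flattened patches

im_new = neibs2images(neibs, (5, 5), images.shape)       # rebuild the original 4d tensor
inv_window = theano.function([images], im_new)
# inv_window(im_val) should reproduce im_val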
def __init__(self, cooccurrence, z_k, opt, initializer, initial_pz_weight=None, initial_b=None, pz_regularizer=None, tao0=5., tao_min=0.25, tao_decay=1e-6, eps=1e-9): cooccurrence = cooccurrence.astype(np.float32) self.cooccurrence = cooccurrence self.z_k = z_k self.opt = opt x_k = cooccurrence.shape[0] self.x_k = x_k # cooccurrence matrix n = np.sum(cooccurrence, axis=None) _co = cooccurrence / n co = T.constant(_co, name="co") # (x_k, x_k) _co_m = np.sum(_co, axis=1, keepdims=True) co_m = T.constant(_co_m, name="co_m") # (x_k,1) _co_c = _co / (eps + _co_m) _co_h = np.sum(_co * -np.log(eps + _co_c), axis=1, keepdims=True) # (x_k, 1) print "COh: {}".format(np.sum(_co_h)) co_h = T.constant(_co_h, name="co_h") if initial_pz_weight is None: initial_pz_weight = initializer((x_k, z_k)) pz_weight = K.variable(initial_pz_weight) pz = softmax_nd(pz_weight) srng = RandomStreams(123) rnd = srng.uniform(low=0., high=1., dtype='float32', size=(x_k, z_k)) gumbel = -T.log(eps + T.nnet.relu(-T.log(eps + rnd))) iteration = K.variable(0, dtype='int32') temp = T.max(T.stack((tao_min, tao0 / (1. + (tao_decay * iteration))))) z = softmax_nd((pz_weight + gumbel) / (eps + temp)) # z = pz w = K.variable(initializer((z_k, x_k))) if initial_b is None: initial_b = initializer((x_k,)) b = K.variable(initial_b) y = softmax_nd(T.dot(z, w) + b) self.params = [pz_weight, w, b] nll_loss = -T.sum(co * T.log(eps + y), axis=None) reg_loss = T.constant(0.) if pz_regularizer: reg_loss = pz_regularizer(pz) total_loss = nll_loss + reg_loss decay_updates = [(iteration, iteration + 1)] encoding = T.argmax(pz, axis=1) one_hot_encoding = tensor_one_hot(encoding, z_k) # (x_k, z_k) pb = T.dot(T.transpose(one_hot_encoding, (1, 0)), co) m = T.sum(pb, axis=1, keepdims=True) c = pb / (m + eps) validation_nll = -T.sum(pb * T.log(eps + c), axis=None) utilization = T.sum(T.gt(T.sum(one_hot_encoding, axis=0), 0), axis=0) updates = opt.get_updates(loss=total_loss, params=self.params) self.val_fun = theano.function([], validation_nll) self.encodings_fun = theano.function([], encoding) self.train_fun = theano.function([], [reg_loss, nll_loss, utilization, temp], updates=updates + decay_updates) self.weights = self.params + opt.weights + [iteration]
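# A NumPy sketch (illustrative only) of the Gumbel-softmax sampling used above: add Gumbel
# noise -log(-log(u)) to the unnormalized logits and take a softmax at temperature `temp`;
# as temp anneals down toward tao_min the samples approach one-hot vectors.
import numpy as np

def gumbel_softmax_sketch(logits, temp, rng=np.random, eps=1e-9):
    u = rng.uniform(low=0.0, high=1.0, size=logits.shape)
    gumbel = -np.log(eps + np.maximum(0.0, -np.log(eps + u)))  # ~ -log(-log(u)), as in the model above
    z = logits + gumbel
    z = z - z.max(axis=-1, keepdims=True)                      # shift for numerical stability
    e = np.exp(z / (eps + temp))
    return e / e.sum(axis=-1, keepdims=True)

# e.g. gumbel_softmax_sketch(np.zeros((4, 3)), temp=0.5) draws 4 relaxed categorical samples
# over 3 classes; lowering temp sharpens them toward one-hot.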
def _infer_ndim_bcast(ndim, shape, *args): """ Infer the number of dimensions from the shape or the other arguments. :rtype: (int, variable, tuple) triple, where the variable is an integer vector, and the tuple contains Booleans. :returns: the first element returned is the inferred number of dimensions. The second element is the shape inferred (combining symbolic and constant information from shape and args). The third element is a broadcasting pattern corresponding to that shape. """ # Find the minimum value of ndim required by the *args if args: args_ndim = max(arg.ndim for arg in args) else: args_ndim = 0 if isinstance(shape, (tuple, list)): # there is a convention that -1 means the corresponding shape of a # potentially-broadcasted symbolic arg # # This case combines together symbolic and non-symbolic shape # information shape_ndim = len(shape) if ndim is None: ndim = shape_ndim else: if shape_ndim != ndim: raise ValueError( 'ndim should be equal to len(shape), but\n', 'ndim = %s, len(shape) = %s, shape = %s' % (ndim, shape_ndim, shape)) bcast = [] pre_v_shape = [] for i, s in enumerate(shape): if hasattr(s, 'type'): # s is symbolic bcast.append(False) # todo - introspect further pre_v_shape.append(s) else: if s >= 0: pre_v_shape.append(tensor.as_tensor_variable(s)) bcast.append((s == 1)) elif s == -1: n_a_i = 0 for a in args: # ndim: _ _ _ _ _ _ # ashp: s0 s1 s2 s3 # i if i >= ndim - a.ndim: n_a_i += 1 a_i = i + a.ndim - ndim if not a.broadcastable[a_i]: pre_v_shape.append(a.shape[a_i]) bcast.append(False) break else: if n_a_i == 0: raise ValueError( ('Auto-shape of -1 must overlap ' 'with the shape of one of the broadcastable ' 'inputs')) else: pre_v_shape.append(tensor.as_tensor_variable(1)) bcast.append(True) else: raise ValueError('negative shape', s) # post-condition: shape may still contain both symbolic and # non-symbolic things if len(pre_v_shape) == 0: v_shape = tensor.constant([], dtype='int32') else: v_shape = tensor.stack(*pre_v_shape) elif shape is None: # The number of drawn samples will be determined automatically, # but we need to know ndim if not args: raise TypeError(('_infer_ndim_bcast cannot infer shape without' ' either shape or args')) template = reduce(lambda a, b: a + b, args) v_shape = template.shape bcast = template.broadcastable ndim = template.ndim else: v_shape = tensor.as_tensor_variable(shape) if ndim is None: ndim = tensor.get_vector_length(v_shape) bcast = [False] * ndim if (not (v_shape.dtype.startswith('int') or v_shape.dtype.startswith('uint'))): raise TypeError('shape must be an integer vector or list', v_shape.dtype) if args_ndim > ndim: raise ValueError( 'ndim should be at least as big as required by args value', (ndim, args_ndim), args) assert ndim == len(bcast) return ndim, tensor.cast(v_shape, 'int32'), tuple(bcast)
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, l2, normalize_attention, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops #self.batch_size = 1 self.l2 = l2 self.normalize_attention = normalize_attention self.train_input, self.train_q, self.train_answer, self.train_choices, self.train_input_mask = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_choices, self.test_input_mask = self._process_input( babi_test_raw) self.vocab_size = 2 # number of answer choices self.inp_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.ca_var = T.matrix('ca_var') self.cb_var = T.matrix('cb_var') #self.cc_var = T.matrix('cc_var') #self.cd_var = T.matrix('cd_var') self.ans_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') print "==> building input module" self.W_inp_res_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.word_vector_size)), borrow=True) self.W_inp_res_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_inp_res = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.W_inp_upd_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.word_vector_size)), borrow=True) self.W_inp_upd_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_inp_upd = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.W_inp_hid_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.word_vector_size)), borrow=True) self.W_inp_hid_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_inp_hid = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.inp_var, outputs_info=T.zeros_like( self.b_inp_hid)) self.inp_c = inp_c_history.take(self.input_mask_var, axis=0) self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] self.c_vecs = [] #for choice in [self.ca_var, self.cb_var, self.cc_var, self.cd_var]: for choice in [self.ca_var, self.cb_var]: history, _ = theano.scan(fn=self.input_gru_step, sequences=choice, outputs_info=T.zeros_like(self.b_inp_hid)) self.c_vecs.append(history[-1]) self.c_vecs = T.stack(self.c_vecs).transpose((1, 0)) # (dim, 4) self.inp_c = T.stack([self.inp_c] * 2).transpose( (1, 2, 0)) # (fact_cnt, dim, 4) self.q_q = T.stack([self.q_q] * 2).transpose((1, 0)) # (dim, 4) print "==> creating parameters for memory module" self.W_mem_res_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.W_mem_res_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_mem_res = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.W_mem_upd_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.W_mem_upd_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_mem_upd = theano.shared(lasagne.init.Constant(0.0).sample( 
(self.dim, )), borrow=True) self.W_mem_hid_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.W_mem_hid_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_mem_hid = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.W_b = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.W_1 = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, 10 * self.dim + 3)), borrow=True) self.W_2 = theano.shared(lasagne.init.Normal(0.1).sample( (1, self.dim)), borrow=True) self.b_1 = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.b_2 = theano.shared(lasagne.init.Constant(0.0).sample((1, )), borrow=True) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] # (dim, 4) for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update_batch(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem = memory[-1].flatten() print "==> building answer module" self.W_a = theano.shared(lasagne.init.Normal(0.1).sample( (self.vocab_size, 2 * self.dim)), borrow=True) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # TODO: add conditional ending dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX)) results, updates = theano.scan( fn=answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], n_steps=1) self.prediction = results[1][-1] print "==> collecting all parameters" self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] if self.answer_module == 'recurrent': self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid ] print "==> 
building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy( self.prediction.dimshuffle('x', 0), T.stack([self.ans_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.inp_var, self.q_var, self.ans_var, self.ca_var, self.cb_var, # self.cc_var, self.cd_var, self.input_mask_var ], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function( inputs=[ self.inp_var, self.q_var, self.ans_var, self.ca_var, self.cb_var, # self.cc_var, self.cd_var, self.input_mask_var ], outputs=[ self.prediction, self.loss, self.inp_c, self.q_q, last_mem ]) if self.mode == 'train': print "==> computing gradients (for debugging)" gradient = T.grad(self.loss, self.params) self.get_gradient_fn = theano.function( inputs=[ self.inp_var, self.q_var, self.ans_var, self.ca_var, self.cb_var, # self.cc_var, self.cd_var, self.input_mask_var ], outputs=gradient)
def generate_hierarchical_model_parameters(parameter, n_subjects, design, mu_lower, mu_upper, sd_lower, sd_upper, bound_lower, bound_upper, val, testval): if (design['conditions'] is not None): if val is None: mu = tt.stack([ pm.Uniform('{}_{}_mu'.format(parameter, condition), mu_lower, mu_upper, testval=testval) for condition in design['conditions'] ]) sd = tt.stack([ pm.Uniform('{}_{}_sd'.format(parameter, condition), sd_lower, sd_upper, testval=testval) for condition in design['conditions'] ]) bounded = pm.Bound(pm.Normal, bound_lower, bound_upper) parms = [] n_subjects_per_condition = [] for c, condition in enumerate(design['conditions']): n_subjects_in_condition = np.unique(design['subject_index'][ design['condition_index'] == c]).size n_subjects_per_condition.append(n_subjects_in_condition) parms_tmp = bounded('{}_{}'.format(parameter, condition), mu=mu[c], sd=sd[c], shape=(n_subjects_in_condition)) parms_tmp = tt.concatenate([tt.zeros(1), parms_tmp]) parms.append(parms_tmp[design['D'][:, c]][:, None]) parms = tt.concatenate(parms, axis=1) else: parms = [] n_subjects_per_condition = [] for c, condition in enumerate(design['conditions']): n_subjects_in_condition = np.unique(design['subject_index'][ design['condition_index'] == c]).size n_subjects_per_condition.append(n_subjects_in_condition) if len(val) == len(design['conditions']): parms.append( pm.Deterministic( '{}_{}'.format(parameter, condition), tt.ones(n_subjects_in_condition, 1) * val[c])) else: raise ValueError( 'Number of values in {}_val does not match the number of specified {}-conditions.' .format(parameter, parameter)) # make sure all elements in parms have same size for set_i, parm_set in enumerate(parms): if n_subjects_per_condition[set_i] < n_subjects: parms[set_i] = tt.concatenate([ parm_set, tt.zeros( (n_subjects - n_subjects_per_condition[set_i], 1)) ], axis=0) parms = tt.concatenate(parms, axis=1) else: if val is None: mu = pm.Uniform('{}_mu'.format(parameter), mu_lower, mu_upper, testval=testval) sd = pm.Uniform('{}_sd'.format(parameter), sd_lower, sd_upper, testval=testval) bounded = pm.Bound(pm.Normal, bound_lower, bound_upper) parms = bounded(parameter, mu=mu, sd=sd, shape=(n_subjects, 1)) else: parms = pm.Deterministic(parameter, tt.ones((n_subjects, 1)) * val) return parms
# The number of spindles per epoch: num_spindles_per_epoch = pm.Categorical('num_spindles_per_epoch', p=pm.Dirichlet( 'spindle_num_prior', a=spindle_number_prior), testval=1) # ----Tracking whether a rater's spindle marker is real or a contaminant----- # if the number of spindles in an epoch (z) is greater than 0, then use conf to determine whether a spindle is real or not #spindle_chance = data['conf'] # pm.math.switch(num_spindles_per_epoch[data['epoch_i']] > 0, data['conf'], 0) spindle_chance_prior = pm.Beta('spindle_chance_prior', alpha=2, beta=1) marker_is_from_real_spindle = pm.Bernoulli('marker_is_from_real_spindle', p=spindle_chance_prior, shape=n_data) marker_is_from_real_spindle_stacked = tt.stack( [marker_is_from_real_spindle, 1 - marker_is_from_real_spindle], axis=1) # stack theta for use in the mixture model # ----Mapping between a rater's spindles and real spindles (w)---- ## Handy matrix to compare z to compare = np.arange(0, max_spindles_per_epoch + 1) # [0 1 2 3 4 5]*epochs # Actual prior for "mapping_marker_to_true_spindle" # shape=[n_epochs, max_spindles_per_epoch + 1], # e.g. mapping_marker_to_true_spindle_prior for a single epoch will look like [1 1 1 0 0 0], # so the mapping_marker_to_true_spindle values can only be drawn from [0-2]-1 = [-1, 0, 1], where -1 = no mapping mapping_marker_to_true_spindle_prior = pm.math.where( compare - num_spindles_per_epoch <= 0, 1, 0) # no_spindles_prior = np.zeros((n_data, 6)) # no_spindles_prior[:, 0] = 1 # mapping_prior = tt.switch(marker_is_from_real_spindle.reshape((n_data, 1)), mapping_marker_to_true_spindle_prior, no_spindles_prior)
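# Small NumPy illustration (not model code) of the mapping prior above: comparing the index
# vector [0..max_spindles_per_epoch] against a sampled spindle count gives a 0/1 mask over
# the allowed mappings for each epoch.
import numpy as np

max_spindles_per_epoch = 5
compare = np.arange(0, max_spindles_per_epoch + 1)   # [0 1 2 3 4 5]
num_spindles = np.array([[2], [0]])                  # two epochs, spindle counts as a column
prior = np.where(compare - num_spindles <= 0, 1, 0)
# -> [[1 1 1 0 0 0],  epoch with 2 spindles: mappings (index - 1) in {-1, 0, 1} are allowed
#     [1 0 0 0 0 0]]  epoch with 0 spindles: only the "no mapping" slot remains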
def __init__(self, model, algo='fisher', c_lambd_inv=1e-3, rate=1.05, over_sampling=1, rescale='momentum'): """ Init self. Args: model, algo, c_lambd_inv: Start value of \lambda regularizer (used in matrix inversion and in F*v computation). rate: Change per iteration for \lambda. over_sampling: For Fisher-like methods, use multiple random vectors per one sample from dataset. rescale: Can be either False, True or 'momentum'. Implemented algos: 'gn' - Gauss-Newton matrix, 'fisher' - Fisher matrix, 'kr' - Khatri-Rao matrix, 'kr_diag' - block-diagonal KR matrix. """ self.model = model self.algo = algo self.x = self.model.x self.y = T.ivector('y') self.outc = T.matrix('outc') # due to theano bugs self.x_d = shared_empty(2) self.y_d = shared_empty(1, dtype='int32') self.outc_d = shared_empty(2) self.rand_outc_d = shared_empty(3) # --- self.rand_outc = T.tensor3('rand_outc') self.lambd_inv = T.scalar('lambd_inv') self.c_lambd_inv = c_lambd_inv self.rate = rate self.over_sampling = over_sampling self.rescale = rescale # -- target def -- self.f_loss = 0 self.f_loss_samples = 0 for i in range(self.over_sampling): self.f_loss += get_loss(self.model.a, self.rand_outc[i] + my_consider_constant(self.model.a)) * scalar_floatX(self.model.a.shape[0]) self.f_loss_samples += get_loss_samples(self.model.a, self.rand_outc[i] + my_consider_constant(self.model.a)) self.loss = get_loss(self.model.a, self.outc) self.err = get_error(get_pred(self.model.a), self.y) self.updates = OrderedDict() self.grad = sum(([T.grad(self.loss, p)] for p in self.model.params), []) self.grad_vec = T.concatenate([g.flatten() for g in self.grad]) def get_fisher_mat(): grad2d = [] for p in self.model.params: grad2d += [T.jacobian(self.f_loss_samples, p)] if grad2d[-1].ndim == 2: grad2d[-1] = grad2d[-1].dimshuffle(0, 1, 'x') grad2d_vec = T.concatenate([g.flatten(2).T for g in grad2d]).T # tensor wise: F_p,i,j = sum_k grad2d[p,i,k]*grad2d[p,k,j] # just a slow reference implementation of what is below # F = T.mean(T.batched_dot(grad2d_vec.dimshuffle(0, 1, 'x'), grad2d_vec.dimshuffle(0, 'x', 1)), 0)/self.over_sampling F = T.dot(grad2d_vec.T, grad2d_vec)/T.cast(grad2d_vec.shape[0], theano.config.floatX)/self.over_sampling return F if self.algo == 'fisher': self.grad2d = [] for p in self.model.params: self.grad2d += [T.jacobian(self.f_loss_samples, p)] if self.grad2d[-1].ndim == 2: self.grad2d[-1] = self.grad2d[-1].dimshuffle(0, 1, 'x') self.grad2d_vec = T.concatenate([g.flatten(2).T for g in self.grad2d]).T # tensor wise: F_p,i,j = sum_k grad2d[p,i,k]*grad2d[p,k,j] # just a slow reference implementation of what is below # F = T.mean(T.batched_dot(grad2d_vec.dimshuffle(0, 1, 'x'), grad2d_vec.dimshuffle(0, 'x', 1)), 0)/self.over_sampling self.F = T.dot(self.grad2d_vec.T, self.grad2d_vec)/T.cast(self.grad2d_vec.shape[0], theano.config.floatX)/self.over_sampling elif self.algo == 'gn': self.grad2d = [] for p in self.model.params: self.grad2d += [T.jacobian(self.model.a.flatten(), p)] new_shape = (self.model.a.shape[0], self.model.a.shape[1], -1) self.grad2d[-1] = self.grad2d[-1].reshape(new_shape) self.grad2d_vec = T.concatenate([g.flatten(3) for g in self.grad2d], 2) # just a slow reference implementation of what is below # self.F = T.mean(T.batched_dot(self.grad2d_vec.dimshuffle(0, 2, 1), # self.grad2d_vec.dimshuffle(0, 1, 2)), axis=0) self.F = T.tensordot(self.grad2d_vec.dimshuffle(0, 2, 1), self.grad2d_vec.dimshuffle(0, 1, 2), [(0, 2), (0, 1)])/T.cast(self.grad2d_vec.shape[0], theano.config.floatX) elif self.algo.startswith('kr'): 
self.grads = [] # self.acts = [T.concatenate([self.model.x, T.ones((self.model.x.shape[0], 1))], axis=1)] self.acts = [self.model.x] for l in self.model.layers: cg = T.grad(self.f_loss, l.s) self.grads.append(cg) # self.acts.append(T.concatenate([l.a, T.ones((l.a.shape[0], 1))], axis=1)) self.acts.append(l.a) self.G = [] self.A = [] self.F_block = [] self.F = [] cnt = T.cast(self.grads[0].shape[0], theano.config.floatX) for i in range(len(self.grads)): self.G += [[]] self.A += [[]] for j in range(len(self.grads)): # self.G[-1] += [T.mean(T.batched_dot(self.grads[i].dimshuffle(0, 1, 'x'), self.grads[j].dimshuffle(0, 'x', 1)), 0).dimshuffle('x', 0, 1)] # self.A[-1] += [T.mean(T.batched_dot(self.acts[i].dimshuffle(0, 1, 'x'), self.acts[j].dimshuffle(0, 'x', 1)), 0).dimshuffle('x', 0, 1)] # self.G[-1] += [T.batched_dot(self.grads[i].dimshuffle(0, 1, 'x'), self.grads[j].dimshuffle(0, 'x', 1))] # self.A[-1] += [T.batched_dot(self.acts[i].dimshuffle(0, 1, 'x'), self.acts[j].dimshuffle(0, 'x', 1))] self.G[-1] += [self.grads[i].T.dot(self.grads[j]).dimshuffle('x', 0, 1)/cnt] self.A[-1] += [self.acts[i].T.dot(self.acts[j]).dimshuffle('x', 0, 1)/cnt] if self.algo.endswith('diag'): self.G[-1][-1] *= float(i==j) self.A[-1][-1] *= float(i==j) for i in range(len(self.grads)): self.F_block += [[]] for j in range(len(self.grads)): # depends on whether you want to compute the real fisher with this or the kr approximation # since numpy-base fast_kron somehow computes 3d tensors faster than theano # cblock = fast_kron(self.A[i][j], self.G[i][j]) cblock = native_kron(self.A[i][j], self.G[i][j]) cblock = cblock.reshape(cblock.shape[1:], ndim=2) self.F_block[i] += [cblock] self.F.append(T.concatenate(self.F_block[-1], axis=1)) self.F = T.concatenate(self.F, axis=0) self.F = (self.F+self.F.T)/2 self.Fdamp = self.F+T.identity_like(self.F)*self.lambd_inv # There're 3+ different ways of computing F^-1*v in theano, # and it seems like solve_sym_pos is quite neutral in terms # of performance + it throws an exception if the provided matrix # is singular. 
# self.new_grad_vec = theano.tensor.slinalg.solve(self.Fdamp, self.grad_vec.dimshuffle(0, 'x')) self.new_grad_vec = solve_sym_pos(self.Fdamp, self.grad_vec) # self.new_grad_vec = gpu_solve(self.Fdamp, self.grad_vec.dimshuffle(0, 'x')) pcount = sum(p.get_value().size for p in self.model.params) self.ch_history = theano.shared(np.zeros((pcount,), dtype=theano.config.floatX)) if self.rescale == 'momentum': self.real_fish = get_fisher_mat() + T.identity_like(self.F)*self.lambd_inv FT = self.real_fish.dot(self.new_grad_vec) FM = self.real_fish.dot(self.ch_history) TFT = self.new_grad_vec.T.dot(FT) MFT = self.ch_history.T.dot(FT) MFM = self.ch_history.T.dot(FM) GT = self.grad_vec.T.dot(self.new_grad_vec) GM = self.grad_vec.T.dot(self.ch_history) tmp1 = T.stack([TFT.reshape(()), MFT.reshape(())], 0).dimshuffle('x', 0) tmp2 = T.stack([MFT.reshape(()), MFM.reshape(())], 0).dimshuffle('x', 0) A = T.concatenate([tmp1, tmp2], 0) A_pinv = T.nlinalg.MatrixPinv()(A) b = T.stack([GT.reshape(()), GM.reshape(())], 0).dimshuffle(0, 'x') res = A_pinv.dot(b).flatten() alpha = res[0] beta = res[1] self.new_grad_vec = self.new_grad_vec * alpha.reshape(()) + self.ch_history * beta.reshape(()) self.F = self.real_fish self.updates[self.ch_history] = self.new_grad_vec elif self.rescale: self.real_fish = get_fisher_mat() + T.identity_like(self.F)*self.lambd_inv lin_fac = self.grad_vec.T.dot(self.new_grad_vec) quad_fac = self.new_grad_vec.T.dot(self.real_fish.dot(self.new_grad_vec)) alpha = lin_fac/quad_fac beta = 0 * alpha self.new_grad_vec *= alpha.reshape(()) self.F = self.real_fish # self.Fdamp = self.F+T.identity_like(self.F)*self.lambd_inv # alpha = T.as_tensor_variable(1) def _apply_gradient_vec(params, new_grad_vec, updates): new_grad = [] offset = 0 for p in params: pval = p.get_value() new_grad += [new_grad_vec[offset:offset+pval.size].reshape(pval.shape)] offset += pval.size updates[p] = p - new_grad[-1] return new_grad self.new_grad = _apply_gradient_vec(self.model.params, self.new_grad_vec, self.updates) self.get_params = theano.function( inputs=[], outputs=self.model.params, on_unused_input='warn' ) self.quad_est_loss = self.new_grad_vec.T.dot(self.F.dot(self.new_grad_vec))/2 self.est_loss = self.quad_est_loss + self.grad_vec.dot(self.new_grad_vec) self.print_pls = {} self.print_pls.update({'shape': self.F.shape[0], 'rank': rank(self.F*10000)}) self.print_pls.update({'grad_mean': T.mean(self.grad_vec**2)**0.5}) self.print_pls.update({'alpha': alpha, 'beta': beta}) # self.print_pls += [self.F] # self.print_pls += [self.real_fish] self.train = theano.function( inputs=[self.lambd_inv], outputs=[self.est_loss, self.loss, self.err] + list(self.print_pls.values()), updates=self.updates, givens={ self.x: self.x_d, self.y: self.y_d, self.outc: self.outc_d, self.rand_outc: self.rand_outc_d }, on_unused_input='warn', allow_input_downcast=True, # profile=True ) self.eva = theano.function( inputs=[], outputs=[self.loss, self.err], givens={ self.x: self.x_d, self.y: self.y_d, self.outc: self.outc_d }, on_unused_input='warn', allow_input_downcast=True ) def step(self, X, y, outc): """Perform single train iteration. Args: X: input vectors y: target labels. outc: target vectors. Returns: Dict consisting of 'loss', 'err', 'est_loss', 'rho', 'delta_ll' and parameters from self.print_pls. 
""" self.x_d.set_value(X) self.y_d.set_value(y) self.outc_d.set_value(outc) self.rand_outc_d.set_value(floatX(nprng.randn(self.over_sampling, *outc.shape))) old_params = self.get_params() while True: # reset params to saved for op, p in zip(old_params, self.model.params): p.set_value(op) try: t_r = self.train(self.c_lambd_inv) print_pls_vals = t_r[-len(self.print_pls):] self.print_pls_res = {k: v for k, v in zip(self.print_pls.keys(), print_pls_vals)} except numpy.linalg.linalg.LinAlgError: t_r = [1e20, 1e10, 10] + [None] * len(self.print_pls) self.print_pls_res = {k: None for k in self.print_pls.keys()} e_v = self.eva() delta_ll = t_r[1] - e_v[0] rho = delta_ll/float(t_r[0]) print() print('lambda:', round(self.c_lambd_inv, 7), 'rho:', round(rho, 2), 'old loss:', t_r[1], 'new loss:', e_v[0]) if rho < 0: self.c_lambd_inv *= self.rate * 2 continue elif rho < 0.5: self.c_lambd_inv *= self.rate # self.c_lambd_inv = min(self.c_lambd_inv, 0.02) elif rho > 0.5: self.c_lambd_inv /= self.rate else: pass break # self.train.profiler.print_summary() res = {'rho': rho, 'est_loss': t_r[0], 'loss': t_r[1], 'err': t_r[2], 'delta_ll': delta_ll} res.update(self.print_pls_res) return res def evaluate(X_test, y_test, outc_test): """Return loss and error for provided dataset. Args: X_test: input vectors, y_test: target labels, outc_test: target vectors. Returns: Dict consisting of 'test_loss', 'test_err'. """ self.x_d.set_value(X_test) self.y_d.set_value(y_test) self.outc_d.set_value(outc_test) te_v = self.eva() test_loss = te_v[0] test_err = te_v[1] return {'test_loss': test_loss, 'test_err': test_err} def _check_gv_matrix_correctness(self): v = T.vector('v') get_Fv = theano.function( inputs=[v], outputs=[self.F.dot(v)], givens={ self.x: self.x_d, self.outc: self.outc_d }, allow_input_downcast=True ) grad_at = theano.function( inputs=[], outputs=sum(([T.grad(self.loss, p)] for p in self.model.params), []), givens={ self.x: self.x_d, self.outc: self.outc_d }, allow_input_downcast=True ) grads0 = grad_at() vec = [] EPS = 1e-5 for p in self.model.params: vec += [nprng.randn(*p.get_value().shape).astype(theano.config.floatX)] p.set_value(p.get_value()+vec[-1]*EPS) grads1 = grad_at() vec_vec = np.concatenate([p.flatten() for p in vec]) F_vec = get_Fv(vec_vec) F_vec_vec = np.concatenate([f.flatten() for f in F_vec]) grads0_vec = np.concatenate([p.flatten() for p in grads0]) grads1_vec = np.concatenate([p.flatten() for p in grads1]) F_vec_emp = (grads1_vec-grads0_vec)/EPS print(np.mean(F_vec_emp**2)**0.5, np.mean(F_vec_vec**2)**0.5) print(np.max(np.abs(F_vec_emp-F_vec_vec))) exit(0)
def conv2d( input, filters, image_shape=None, filter_shape=None, border_mode="valid", subsample=(1, 1), **kargs, ): """ signal.conv.conv2d performs a basic 2D convolution of the input with the given filters. The input parameter can be a single 2D image or a 3D tensor, containing a set of images. Similarly, filters can be a single 2D filter or a 3D tensor, corresponding to a set of 2D filters. Shape parameters are optional and will result in faster execution. Parameters ---------- input : Symbolic theano tensor for images to be filtered. Dimensions: ([num_images], image height, image width) filters : Symbolic theano tensor for convolution filter(s). Dimensions: ([num_filters], filter height, filter width) border_mode: {'valid', 'full'} See scipy.signal.convolve2d. subsample Factor by which to subsample output. image_shape : tuple of length 2 or 3 ([num_images,] image height, image width). filter_shape : tuple of length 2 or 3 ([num_filters,] filter height, filter width). kwargs See theano.tensor.nnet.conv.conv2d. Returns ------- symbolic 2D,3D or 4D tensor Tensor of filtered images, with shape ([number images,] [number filters,] image height, image width). """ assert input.ndim in (2, 3) assert filters.ndim in (2, 3) # use shape information if it is given to us ### if filter_shape and image_shape: if input.ndim == 3: bsize = image_shape[0] else: bsize = 1 imshp = (1, ) + tuple(image_shape[-2:]) if filters.ndim == 3: nkern = filter_shape[0] else: nkern = 1 kshp = filter_shape[-2:] else: nkern, kshp = None, None bsize, imshp = None, None # reshape tensors to 4D, for compatibility with ConvOp ### if input.ndim == 3: sym_bsize = input.shape[0] else: sym_bsize = 1 if filters.ndim == 3: sym_nkern = filters.shape[0] else: sym_nkern = 1 new_input_shape = tensor.join(0, tensor.stack([sym_bsize, 1]), input.shape[-2:]) input4D = tensor.reshape(input, new_input_shape, ndim=4) new_filter_shape = tensor.join(0, tensor.stack([sym_nkern, 1]), filters.shape[-2:]) filters4D = tensor.reshape(filters, new_filter_shape, ndim=4) # perform actual convolution ### op = conv.ConvOp( output_mode=border_mode, dx=subsample[0], dy=subsample[1], imshp=imshp, kshp=kshp, nkern=nkern, bsize=bsize, **kargs, ) output = op(input4D, filters4D) # flatten to 3D tensor if convolving with single filter or single image if input.ndim == 2 and filters.ndim == 2: if config.warn__signal_conv2d_interface: warnings.warn( "theano.tensor.signal.conv2d() now outputs a 2d tensor when both" " inputs are 2d. To disable this warning, set the Theano flag" " warn__signal_conv2d_interface to False", stacklevel=3, ) output = tensor.flatten(output.T, ndim=2).T elif input.ndim == 2 or filters.ndim == 2: output = tensor.flatten(output.T, ndim=3).T return output
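# Hedged usage example for the conv2d defined above (the theano.tensor.signal.conv interface),
# assuming it is in scope: convolve one 2D image with one 2D filter, shapes as in the docstring.
import numpy as np
import theano
import theano.tensor as T

image = T.matrix('image')     # (image height, image width)
kernel = T.matrix('kernel')   # (filter height, filter width)
out = conv2d(image, kernel, border_mode='valid')
f = theano.function([image, kernel], out)

im = np.random.randn(8, 8).astype(theano.config.floatX)
k = np.ones((3, 3), dtype=theano.config.floatX) / 9.0   # simple box filter
blurred = f(im, k)                                      # shape (6, 6) with 'valid' mode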
def rnn(step_function, inputs, initial_states, go_backwards=False, mask=None, constants=None, unroll=False, input_length=None): '''Iterates over the time dimension of a tensor. # Arguments inputs: tensor of temporal data of shape (samples, time, ...) (at least 3D). step_function: Parameters: input: tensor with shape (samples, ...) (no time dimension), representing input for the batch of samples at a certain time step. states: list of tensors. Returns: output: tensor with shape (samples, ...) (no time dimension), new_states: list of tensors, same length and shapes as 'states'. initial_states: tensor with shape (samples, ...) (no time dimension), containing the initial values for the states used in the step function. go_backwards: boolean. If True, do the iteration over the time dimension in reverse order. mask: binary tensor with shape (samples, time), with a zero for every element that is masked. constants: a list of constant values passed at each step. unroll: whether to unroll the RNN or to use a symbolic loop (`scan`). input_length: must be specified if using `unroll`. # Returns A tuple (last_output, outputs, new_states). last_output: the latest output of the rnn, of shape (samples, ...) outputs: tensor with shape (samples, time, ...) where each entry outputs[s, t] is the output of the step function at time t for sample s. new_states: list of tensors, latest states returned by the step function, of shape (samples, ...). ''' ndim = inputs.ndim assert ndim >= 3, 'Input should be at least 3D.' if unroll: if input_length is None: raise Exception('When specifying `unroll=True`, an `input_length` ' 'must be provided to `rnn`.') axes = [1, 0] + list(range(2, ndim)) inputs = inputs.dimshuffle(axes) if constants is None: constants = [] if mask is not None: if mask.ndim == ndim - 1: mask = expand_dims(mask) assert mask.ndim == ndim mask = mask.dimshuffle(axes) if unroll: indices = list(range(input_length)) if go_backwards: indices = indices[::-1] successive_outputs = [] successive_states = [] states = initial_states for i in indices: output, new_states = step_function(inputs[i], states + constants) if len(successive_outputs) == 0: prev_output = zeros_like(output) else: prev_output = successive_outputs[-1] output = T.switch(mask[i], output, prev_output) kept_states = [] for state, new_state in zip(states, new_states): kept_states.append(T.switch(mask[i], new_state, state)) states = kept_states successive_outputs.append(output) successive_states.append(states) outputs = T.stack(*successive_outputs) states = [] for i in range(len(successive_states[-1])): states.append( T.stack(*[ states_at_step[i] for states_at_step in successive_states ])) else: # build an all-zero tensor of shape (samples, output_dim) initial_output = step_function(inputs[0], initial_states + constants)[0] * 0 # Theano gets confused by broadcasting patterns in the scan op initial_output = T.unbroadcast(initial_output, 0, 1) def _step(input, mask, output_tm1, *states): output, new_states = step_function(input, states) # output previous output if masked. 
output = T.switch(mask, output, output_tm1) return_states = [] for state, new_state in zip(states, new_states): return_states.append(T.switch(mask, new_state, state)) return [output] + return_states results, _ = theano.scan(_step, sequences=[inputs, mask], outputs_info=[initial_output] + initial_states, non_sequences=constants, go_backwards=go_backwards) # deal with Theano API inconsistency if type(results) is list: outputs = results[0] states = results[1:] else: outputs = results states = [] else: if unroll: indices = list(range(input_length)) if go_backwards: indices = indices[::-1] successive_outputs = [] successive_states = [] states = initial_states for i in indices: output, states = step_function(inputs[i], states + constants) successive_outputs.append(output) successive_states.append(states) outputs = T.stack(*successive_outputs) states = [] for i in range(len(successive_states[-1])): states.append( T.stack(*[ states_at_step[i] for states_at_step in successive_states ])) else: def _step(input, *states): output, new_states = step_function(input, states) return [output] + new_states results, _ = theano.scan(_step, sequences=inputs, outputs_info=[None] + initial_states, non_sequences=constants, go_backwards=go_backwards) # deal with Theano API inconsistency if type(results) is list: outputs = results[0] states = results[1:] else: outputs = results states = [] outputs = T.squeeze(outputs) last_output = outputs[-1] axes = [1, 0] + list(range(2, outputs.ndim)) outputs = outputs.dimshuffle(axes) states = [T.squeeze(state[-1]) for state in states] return last_output, outputs, states
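# Hedged usage sketch for the rnn() defined above: accumulate a running sum over the time axis
# with a trivial step function (the step just adds the current input to the single state).
import numpy as np
import theano
import theano.tensor as T

x = T.tensor3('x')     # (samples, time, features)
s0 = T.matrix('s0')    # (samples, features)

def step_fn(x_t, states):
    s_new = states[0] + x_t
    return s_new, [s_new]

last_output, outputs, new_states = rnn(step_fn, x, [s0])
f = theano.function([x, s0], [last_output, outputs])

xv = np.ones((2, 4, 3), dtype=theano.config.floatX)
s0v = np.zeros((2, 3), dtype=theano.config.floatX)
last, seq = f(xv, s0v)   # last == 4.0 everywhere; seq holds the running sums 1, 2, 3, 4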
def get_output_mask(self, train=False): X = self.get_input_mask(train) if X is None: return None tensors = [T.roll(X, off, axis=self.axis) for off in self.offsets] return T.stack(tensors, axis=self.offset_axis)
def identity(n): return tensor.stack([tensor.eye(n), tensor.zeros((n, n))], axis=0)
def __init__(self, train_raw, test_raw, dim, mode, l2, l1, batch_norm, dropout, batch_size, ihm_C, los_C, ph_C, decomp_C, partition, nbins, **kwargs): print "==> not used params in network class:", kwargs.keys() self.train_raw = train_raw self.test_raw = test_raw self.dim = dim self.mode = mode self.l2 = l2 self.l1 = l1 self.batch_norm = batch_norm self.dropout = dropout self.batch_size = batch_size self.ihm_C = ihm_C self.los_C = los_C self.ph_C = ph_C self.decomp_C = decomp_C self.nbins = nbins if (partition == 'log'): self.get_bin = metrics.get_bin_log self.get_estimate = metrics.get_estimate_log else: assert self.nbins == 10 self.get_bin = metrics.get_bin_custom self.get_estimate = metrics.get_estimate_custom self.train_batch_gen = self.get_batch_gen(self.train_raw) self.test_batch_gen = self.get_batch_gen(self.test_raw) self.input_var = T.tensor3('X') self.input_lens = T.ivector('L') self.ihm_pos = T.ivector('ihm_pos') self.ihm_mask = T.ivector('ihm_mask') self.ihm_label = T.ivector('ihm_label') self.los_mask = T.imatrix('los_mask') self.los_label = T.matrix('los_label') # for regression #self.los_label = T.imatrix('los_label') self.ph_label = T.imatrix('ph_label') self.decomp_mask = T.imatrix('decomp_mask') self.decomp_label = T.imatrix('decomp_label') print "==> Building neural network" # common network network = layers.InputLayer((None, None, self.train_raw[0][0].shape[1]), input_var=self.input_var) if (self.dropout > 0): network = layers.DropoutLayer(network, p=self.dropout) network = layers.LSTMLayer(incoming=network, num_units=dim, only_return_final=False, grad_clipping=10, ingate=lasagne.layers.Gate( W_in=Orthogonal(), W_hid=Orthogonal(), W_cell=Normal(0.1)), forgetgate=lasagne.layers.Gate( W_in=Orthogonal(), W_hid=Orthogonal(), W_cell=Normal(0.1)), cell=lasagne.layers.Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh, W_in=Orthogonal(), W_hid=Orthogonal()), outgate=lasagne.layers.Gate( W_in=Orthogonal(), W_hid=Orthogonal(), W_cell=Normal(0.1))) if (self.dropout > 0): network = layers.DropoutLayer(network, p=self.dropout) lstm_output = layers.get_output(network) self.params = layers.get_all_params(network, trainable=True) self.reg_params = layers.get_all_params(network, regularizable=True) # for each example in minibatch take the last output last_outputs = [] for index in range(self.batch_size): last_outputs.append(lstm_output[index, self.input_lens[index]-1, :]) last_outputs = T.stack(last_outputs) # take 48h outputs for fixed mortality task mid_outputs = [] for index in range(self.batch_size): mid_outputs.append(lstm_output[index, self.ihm_pos[index], :]) mid_outputs = T.stack(mid_outputs) # in-hospital mortality related network ihm_network = layers.InputLayer((None, dim), input_var=mid_outputs) ihm_network = layers.DenseLayer(incoming=ihm_network, num_units=2, nonlinearity=softmax) self.ihm_prediction = layers.get_output(ihm_network) self.ihm_det_prediction = layers.get_output(ihm_network, deterministic=True) self.params += layers.get_all_params(ihm_network, trainable=True) self.reg_params += layers.get_all_params(ihm_network, regularizable=True) self.ihm_loss = (self.ihm_mask * categorical_crossentropy(self.ihm_prediction, self.ihm_label)).mean() # length of stay related network # Regression los_network = layers.InputLayer((None, None, dim), input_var=lstm_output) los_network = layers.ReshapeLayer(los_network, (-1, dim)) los_network = layers.DenseLayer(incoming=los_network, num_units=1, nonlinearity=rectify) los_network = layers.ReshapeLayer(los_network, 
(lstm_output.shape[0], -1)) self.los_prediction = layers.get_output(los_network) self.los_det_prediction = layers.get_output(los_network, deterministic=True) self.params += layers.get_all_params(los_network, trainable=True) self.reg_params += layers.get_all_params(los_network, regularizable=True) self.los_loss = (self.los_mask * squared_error(self.los_prediction, self.los_label)).mean(axis=1).mean(axis=0) # phenotype related network ph_network = layers.InputLayer((None, dim), input_var=last_outputs) ph_network = layers.DenseLayer(incoming=ph_network, num_units=25, nonlinearity=sigmoid) self.ph_prediction = layers.get_output(ph_network) self.ph_det_prediction = layers.get_output(ph_network, deterministic=True) self.params += layers.get_all_params(ph_network, trainable=True) self.reg_params += layers.get_all_params(ph_network, regularizable=True) self.ph_loss = nn_utils.multilabel_loss(self.ph_prediction, self.ph_label) # decompensation related network decomp_network = layers.InputLayer((None, None, dim), input_var=lstm_output) decomp_network = layers.ReshapeLayer(decomp_network, (-1, dim)) decomp_network = layers.DenseLayer(incoming=decomp_network, num_units=2, nonlinearity=softmax) decomp_network = layers.ReshapeLayer(decomp_network, (lstm_output.shape[0], -1, 2)) self.decomp_prediction = layers.get_output(decomp_network)[:, :, 1] self.decomp_det_prediction = layers.get_output(decomp_network, deterministic=True)[:, :, 1] self.params += layers.get_all_params(decomp_network, trainable=True) self.reg_params += layers.get_all_params(decomp_network, regularizable=True) self.decomp_loss = nn_utils.multilabel_loss_with_mask(self.decomp_prediction, self.decomp_label, self.decomp_mask) """ data = next(self.train_batch_gen) print max(data[1]) print lstm_output.eval({self.input_var:data[0]}).shape exit() """ if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.reg_params) else: self.loss_l2 = T.constant(0) if self.l1 > 0: self.loss_l1 = self.l1 * nn_utils.l1_reg(self.reg_params) else: self.loss_l1 = T.constant(0) self.reg_loss = self.loss_l1 + self.loss_l2 self.loss = (ihm_C * self.ihm_loss + los_C * self.los_loss + ph_C * self.ph_loss + decomp_C * self.decomp_loss + self.reg_loss) #updates = lasagne.updates.adadelta(self.loss, self.params, # learning_rate=0.001) #updates = lasagne.updates.momentum(self.loss, self.params, # learning_rate=0.00003) #updates = lasagne.updates.adam(self.loss, self.params) updates = lasagne.updates.adam(self.loss, self.params, beta1=0.5, learning_rate=0.0001) # from DCGAN paper #updates = lasagne.updates.nesterov_momentum(loss, params, momentum=0.9, # learning_rate=0.001, all_inputs = [self.input_var, self.input_lens, self.ihm_pos, self.ihm_mask, self.ihm_label, self.los_mask, self.los_label, self.ph_label, self.decomp_mask, self.decomp_label] train_outputs = [self.ihm_prediction, self.los_prediction, self.ph_prediction, self.decomp_prediction, self.loss, self.ihm_loss, self.los_loss, self.ph_loss, self.decomp_loss, self.reg_loss] test_outputs = [self.ihm_det_prediction, self.los_det_prediction, self.ph_det_prediction, self.decomp_det_prediction, self.loss, self.ihm_loss, self.los_loss, self.ph_loss, self.decomp_loss, self.reg_loss] ## compiling theano functions if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function(inputs=all_inputs, outputs=train_outputs, updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=all_inputs, outputs=test_outputs)
def __init__(self, n_out, collapse_output=False, directions=4, projection='average', base=None, **kwargs): if base is None: base = [] super(TwoDLSTMLayer, self).__init__(n_out, **kwargs) assert len(self.sources) == 1 source = self.sources[0] n_in = source.attrs['n_out'] X = source.output assert X.ndim == 4 sizes = source.output_sizes self.output_sizes = sizes assert directions in [1, 2, 4], "only 1, 2 or 4 directions are supported" assert projection in ['average', 'concat'], "invalid projection" if base: self.b1 = self.add_param(base[0].b1) self.b2 = self.add_param(base[0].b2) if directions >= 1: self.b3 = self.add_param(base[0].b3) self.b4 = self.add_param(base[0].b4) self.W1, self.V_h1, self.V_v1 = self.add_param( base[0].W1), self.add_param(base[0].V_h1), self.add_param( base[0].V_v1) self.W2, self.V_h2, self.V_v2 = self.add_param( base[0].W2), self.add_param(base[0].V_h2), self.add_param( base[0].V_v2) if directions >= 1: self.W3, self.V_h3, self.V_v3 = self.add_param( base[0].W3), self.add_param(base[0].V_h3), self.add_param( base[0].V_v3) self.W4, self.V_h4, self.V_v4 = self.add_param( base[0].W4), self.add_param(base[0].V_h4), self.add_param( base[0].V_v4) #self.mass = base[0].mass #self.masks = base[0].masks #self.b1 = base[0].b1 #self.b2 = base[0].b2 #if directions >= 1: # self.b3 = base[0].b3 # self.b4 = base[0].b4 #self.W1, self.V_h1, self.V_v1 = base[0].W1, base[0].V_h1, base[0].V_v1 #self.W2, self.V_h2, self.V_v2 = base[0].W2, base[0].V_h2, base[0].V_v2 #if directions >= 1: # self.W3, self.V_h3, self.V_v3 = base[0].W3, base[0].V_h3, base[0].V_v3 # self.W4, self.V_h4, self.V_v4 = base[0].W4, base[0].V_h4, base[0].V_v4 self.mass = base[0].mass self.masks = base[0].masks else: self.b1 = self.create_and_add_bias(n_out, "1") self.b2 = self.create_and_add_bias(n_out, "2") if directions >= 1: self.b3 = self.create_and_add_bias(n_out, "3") self.b4 = self.create_and_add_bias(n_out, "4") self.W1, self.V_h1, self.V_v1 = self.create_and_add_2d_lstm_weights( n_in, n_out, "1") self.W2, self.V_h2, self.V_v2 = self.create_and_add_2d_lstm_weights( n_in, n_out, "2") if directions >= 1: self.W3, self.V_h3, self.V_v3 = self.create_and_add_2d_lstm_weights( n_in, n_out, "3") self.W4, self.V_h4, self.V_v4 = self.create_and_add_2d_lstm_weights( n_in, n_out, "4") # dropout assert len(self.masks) == 1 mask = self.masks[0] if mask is not None: X = self.mass * mask * X if str(theano.config.device).startswith('cpu'): Y = T.zeros_like(X) if projection == 'concat': Y = Y.repeat(directions, axis=-1) n_out *= directions else: if directions <= 2: Y = BidirectionalTwoDLSTMOpInstance(X, self.W1, self.W2, self.V_h1, self.V_h2, self.V_v1, self.V_v2, self.b1, self.b2, sizes) else: Y = MultiDirectionalTwoDLSTMOpInstance( X, self.W1, self.W2, self.W3, self.W4, self.V_h1, self.V_h2, self.V_h3, self.V_h4, self.V_v1, self.V_v2, self.V_v3, self.V_v4, self.b1, self.b2, self.b3, self.b4, sizes) if directions > 1: Y = T.stack(Y[:directions], axis=-1) if projection == 'average': Y = Y.mean(axis=-1) elif projection == 'concat': Y = Y.reshape((Y.shape[0], Y.shape[1], Y.shape[2], Y.shape[3] * Y.shape[4])) n_out *= directions else: Y = Y[0] Y.name = 'Y' self.set_attr('n_out', n_out) self.set_attr('collapse_output', collapse_output) self.set_attr('directions', directions) self.set_attr('projection', projection) #index handling def index_fn(index, size): return T.set_subtensor(index[:size], numpy.cast['int8'](1)) index_init = T.zeros((Y.shape[2], Y.shape[1]), dtype='int8') self.index, _ = theano.scan( index_fn, [index_init, 
T.cast(sizes[:, 1], "int32")]) self.index = self.index.dimshuffle(1, 0) if collapse_output == 'sum' or collapse_output == True: Y = Y.sum(axis=0) elif collapse_output == 'mean': Y = Y.mean(axis=0) elif collapse_output == 'conv': from TheanoUtil import circular_convolution Y, _ = theano.scan(lambda x_i, x_p: circular_convolution(x_i, x_p), Y, Y[0]) Y = Y[-1] elif collapse_output == 'flatten': self.index = T.ones((Y.shape[0] * Y.shape[1], Y.shape[2]), dtype='int8') Y = Y.reshape((Y.shape[0] * Y.shape[1], Y.shape[2], Y.shape[3])) elif str(collapse_output).startswith('pad_'): pad = numpy.int32(collapse_output.split('_')[-1]) Y = ifelse( T.lt(Y.shape[0], pad), T.concatenate([ Y, T.zeros( (pad - Y.shape[0], Y.shape[1], Y.shape[2], Y.shape[3]), 'float32') ], axis=0), ifelse(T.gt(Y.shape[0], pad), Y[:pad], Y)) Y = Y.dimshuffle(1, 2, 3, 0).reshape( (Y.shape[1], Y.shape[2], Y.shape[3] * Y.shape[0])) self.attrs['n_out'] *= pad elif collapse_output != False: assert False, "invalid collapse mode" if self.attrs['batch_norm']: Y = self.batch_norm( Y, self.attrs['n_out'], index=sizes if not collapse_output else self.index, force_sample=False) self.output = Y
def __init__(self, config): self._params = [] self._np_rng = np.random.RandomState(config.seed // 2 + 123) self._theano_rng = RandomStreams( config.seed // 2 + 321) # generates random numbers directly on GPU self._init_scale = config.init_scale self._is_training = tt.iscalar('is_training') self._lr = theano.shared(cast_floatX(config.learning_rate), 'lr') input_data = tt.imatrix('input_data') # (batch_size, num_steps) targets = tt.imatrix('targets') # (batch_size, num_steps) noise_x = tt.matrix('noise_x') # (batch_size, num_steps) # Embed input words and apply variational dropout (for each sample, the embedding of # a dropped word-type consists of all zeros at all occurrences of word-type in sample). embedding = self.make_param((config.vocab_size, config.hidden_size), 'uniform') inputs = embedding[ input_data.T] # (num_steps, batch_size, hidden_size) inputs = self.apply_dropout(inputs, tt.shape_padright(noise_x.T)) rhn_updates = [] for _ in range(config.num_layers): # y shape: (num_steps, batch_size, hidden_size) y, sticky_state_updates = self.RHNLayer( inputs, config.depth, config.batch_size, config.hidden_size, config.drop_i, config.drop_s, config.init_T_bias, config.init_other_bias, config.tied_noise) rhn_updates += sticky_state_updates inputs = y noise_o = self.get_dropout_noise( (config.batch_size, config.hidden_size), config.drop_o) outputs = self.apply_dropout( y, tt.shape_padleft(noise_o)) # (num_steps, batch_size, hidden_size) # logits softmax_w = embedding.T if config.tied_embeddings else self.make_param( (config.hidden_size, config.vocab_size), 'uniform') softmax_b = self.make_param((config.vocab_size, ), config.init_other_bias) logits = tt.dot( outputs, softmax_w) + softmax_b # (num_steps, batch_size, vocab_size) # probabilities and prediction loss flat_logits = logits.reshape( (config.batch_size * config.num_steps, config.vocab_size)) flat_probs = tt.nnet.softmax(flat_logits) flat_targets = targets.T.flatten() # (batch_size * num_steps,) xentropies = tt.nnet.categorical_crossentropy( flat_probs, flat_targets) # (batch_size * num_steps,) pred_loss = xentropies.sum() / config.batch_size # weight decay l2_loss = 0.5 * tt.sum(tt.stack([tt.sum(p**2) for p in self._params])) loss = pred_loss + config.weight_decay * l2_loss grads = theano.grad(loss, self._params) # gradient clipping global_grad_norm = tt.sqrt( tt.sum(tt.stack([tt.sum(g**2) for g in grads]))) clip_factor = ifelse( global_grad_norm < config.max_grad_norm, cast_floatX(1), tt.cast(config.max_grad_norm / global_grad_norm, floatX)) param_updates = [(p, p - self._lr * clip_factor * g) for p, g in zip(self._params, grads)] self.train = theano.function([input_data, targets, noise_x], loss, givens={self._is_training: np.int32(1)}, updates=rhn_updates + param_updates) self.evaluate = theano.function( [input_data, targets], loss, # Note that noise_x is unused in computation graph of this function since _is_training is false. givens={ self._is_training: np.int32(0), noise_x: tt.zeros((config.batch_size, config.num_steps)) }, updates=rhn_updates) self._num_params = np.sum( [param.get_value().size for param in self._params]) if config.load_model: self.load(config.load_model)
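# NumPy sketch (illustrative) of the global-norm gradient clipping used above: if the joint
# norm of all gradients exceeds max_grad_norm, every gradient is scaled by the same factor.
import numpy as np

def clip_by_global_norm_sketch(grads, max_grad_norm):
    global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    clip_factor = 1.0 if global_norm < max_grad_norm else max_grad_norm / global_norm
    return [g * clip_factor for g in grads], global_norm

# e.g. clip_by_global_norm_sketch([np.array([3.0, 4.0])], max_grad_norm=1.0) scales the
# gradient by 1/5 so its norm becomes exactly 1.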
def e_hat(y, X, *es):
    e = tt.stack(es[:-2])
    return y - tt.dot(X, es[-2]) - tt.dot(e, es[-1])
def Unskew(padded):
    """
    input.shape: (batch size, HEIGHT, 2*WIDTH - 1, dim)
    """
    return T.stack([padded[:, i, i:i + WIDTH, :] for i in range(HEIGHT)], axis=1)
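# Hedged sketch: numpy illustration of the diagonal skew/unskew trick the
# function above relies on (HEIGHT and WIDTH are module-level constants there).
# `skew_np` and `unskew_np` are hypothetical helpers, not from the original code.
import numpy as np

HEIGHT, WIDTH, DIM = 3, 4, 1

def skew_np(x):
    # (batch, HEIGHT, WIDTH, dim) -> (batch, HEIGHT, 2*WIDTH - 1, dim), row i shifted right by i
    out = np.zeros((x.shape[0], HEIGHT, 2 * WIDTH - 1, x.shape[-1]), dtype=x.dtype)
    for i in range(HEIGHT):
        out[:, i, i:i + WIDTH, :] = x[:, i, :, :]
    return out

def unskew_np(padded):
    # same slicing as Unskew above, with numpy instead of T.stack
    return np.stack([padded[:, i, i:i + WIDTH, :] for i in range(HEIGHT)], axis=1)

x = np.arange(HEIGHT * WIDTH, dtype='float32').reshape(1, HEIGHT, WIDTH, DIM)
assert np.array_equal(unskew_np(skew_np(x)), x)  # unskew inverts skew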
def local_gpua_careduce(node, context_name):
    if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
                                      scalar.Maximum, scalar.Minimum)):
        ctx = get_context(context_name)
        if ctx.kind == b'opencl':
            op = GpuCAReduceCPY
            if node.op.scalar_op not in [scalar.add, scalar.mul]:
                # We don't support yet all reduction with cpy code.
                return
        elif ctx.kind == b'cuda':
            op = GpuCAReduceCuda
        else:
            return False
        x, = node.inputs
        greduce = op(
            node.op.scalar_op, axis=node.op.axis,
            dtype=getattr(node.op, 'dtype', None),
            acc_dtype=getattr(node.op, 'acc_dtype', None))
        gvar = greduce(x)
        # We need to have the make node called, otherwise the mask can
        # be None
        if (op is GpuCAReduceCPY or
                gvar.owner.op.supports_c_code([as_gpuarray_variable(x, context_name)])):
            return greduce
        else:
            # Try to make a simpler pattern based on reshaping
            # The principle is that if two adjacent dimensions have
            # the same value in the reduce_mask, then we can reshape
            # to make them a single dimension, do the reduction, and
            # then reshape to get them back.
            if node.op.axis is None:
                reduce_mask = [1] * x.type.ndim
            else:
                reduce_mask = [0] * x.type.ndim
                for a in node.op.axis:
                    assert reduce_mask[a] == 0
                    reduce_mask[a] = 1

            shape_of = node.fgraph.shape_feature.shape_of

            x_shape = shape_of[x]
            new_in_shp = [x_shape[0]]
            new_mask = [reduce_mask[0]]
            for i in xrange(1, x.type.ndim):
                if reduce_mask[i] == reduce_mask[i - 1]:
                    new_in_shp[-1] *= x_shape[i]
                else:
                    new_mask.append(reduce_mask[i])
                    new_in_shp.append(x_shape[i])
            new_axis = []
            for idx, m in enumerate(new_mask):
                if m == 1:
                    new_axis.append(idx)
            greduce = op(
                node.op.scalar_op,
                axis=new_axis, reduce_mask=new_mask,
                dtype=getattr(node.op, 'dtype', None),
                acc_dtype=getattr(node.op, 'acc_dtype', None))

            reshaped_x = x.reshape(tensor.stack(new_in_shp))
            gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
            gvar = greduce(gpu_reshaped_x)
            # We need to have the make node called, otherwise the mask can
            # be None
            reshaped_gpu_inputs = [gpu_reshaped_x]
            if greduce.supports_c_code(reshaped_gpu_inputs):
                reduce_reshaped_x = host_from_gpu(greduce(gpu_reshaped_x))

                if reduce_reshaped_x.ndim != node.outputs[0].ndim:
                    unreshaped_reduce = reduce_reshaped_x.reshape(
                        tensor.stack(shape_of[node.outputs[0]]))
                else:
                    unreshaped_reduce = reduce_reshaped_x
                return [unreshaped_reduce]
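# Hedged sketch: a plain-numpy check of the reshape trick used above --
# adjacent axes that share the same reduce mask can be merged into a single
# axis before reducing without changing the result. Purely illustrative,
# not part of the optimizer.
import numpy as np

x = np.random.randn(2, 3, 4, 5)
# reduce_mask = [0, 1, 1, 0]: sum over axes 1 and 2
direct = x.sum(axis=(1, 2))
merged = x.reshape(2, 3 * 4, 5).sum(axis=1)  # merge the two reduced axes, then reduce once
assert np.allclose(direct, merged)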
def pack(x):
    return T.stack(*x)
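# Hedged note: `T.stack(*x)` is the legacy calling convention; newer Theano
# versions also accept a list plus an explicit axis, which reads more clearly.
# The variables below are only for illustration.
import theano.tensor as T

a, b = T.vectors('a', 'b')
legacy = T.stack(a, b)               # same behaviour as pack([a, b]) above
explicit = T.stack([a, b], axis=0)   # equivalent, with the axis spelled out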
def build_model(prepared_data, clamp_L0=0.4, eeg_column_i=None, **kwargs):
    # ##########
    # STEP1: order the data properly so that we can read from it sequentially
    # when training the model
    (subject_x, skill_x, correct_y, start_x, eeg_x, eeg_table,
     stim_pairs, train_idx, valid_idx) = prepared_data
    N = len(correct_y)
    train_mask = idx_to_mask(train_idx, N)
    valid_mask = idx_to_mask(valid_idx, N)

    # sort data by subject and skill
    sorted_i = sorted(xrange(N), key=lambda i: (subject_x[i], skill_x[i], start_x[i]))
    skill_x = skill_x[sorted_i]
    subject_x = subject_x[sorted_i]
    correct_y = correct_y[sorted_i]
    start_x = start_x[sorted_i]
    train_mask = train_mask[sorted_i]
    valid_mask = valid_mask[sorted_i]
    train_idx = np.nonzero(train_mask)[0]
    valid_idx = np.nonzero(valid_mask)[0]

    n_skills = np.max(skill_x) + 1
    n_subjects = np.max(subject_x) + 1

    # binarize eeg
    eeg_single_x = np.zeros(N)
    if eeg_column_i is not None:
        eeg_column = eeg_table[eeg_x, eeg_column_i]
        above_median = np.greater(eeg_column, np.median(eeg_column))
        eeg_single_x[above_median] = 1

    # prepare parameters
    p_T = 0.5
    p_G = 0.1
    p_S = 0.2
    p_L0 = 0.7 if clamp_L0 is None else clamp_L0
    # eeg_single_x = np.zeros(N)
    parameter_base = np.ones(n_skills)
    tp_L0, t_L0 = make_probability(parameter_base * p_L0, name='L0')
    tp_T, t_T = make_probability(np.ones((n_skills, 2)) * p_T, name='p(T)')
    tp_G, t_G = make_probability(p_G, name='p(G)')
    tp_S, t_S = make_probability(p_S, name='p(S)')

    # declare and prepare variables for theano
    i = T.ivector('i')
    dummy_float = make_shared(0, name='dummy')
    skill_i, subject_i = T.iscalars('skill_i', 'subject_i')
    correct_y = make_shared(correct_y, to_int=True)
    eeg_single_x = make_shared(eeg_single_x, to_int=True)

    # P_T, P_G, P_S are received positionally from non_sequences below
    # (tp_T, tp_G, tp_S), so the parameter order must match that order.
    def step(correct_i, eeg, prev_L, prev_p_C, P_T, P_G, P_S):
        Ln = prev_L + (1 - prev_L) * P_T[eeg]
        p_C = prev_L * (1 - P_S) + (1 - prev_L) * P_G
        return Ln, p_C

    # set up theano functions
    ((results, p_C), updates) = theano.scan(
        fn=step,
        sequences=[correct_y[i], eeg_single_x[i]],
        outputs_info=[tp_L0[skill_i], dummy_float],
        non_sequences=[tp_T[skill_i], tp_G, tp_S])
    p_y = T.stack(1 - p_C, p_C)
    loss = neg_log_loss(p_y, correct_y[i])

    learning_rate = T.fscalar('learning_rate')
    if clamp_L0 is None:
        params = [t_T, t_L0]
    else:
        params = [t_T]
    update_parameters = [(param, param - learning_rate * T.grad(loss, param))
                         for param in params]

    tf_train = theano.function(inputs=[i, skill_i, learning_rate],
                               updates=update_parameters,
                               outputs=[loss, results, i],
                               allow_input_downcast=True)
    tf_valid = theano.function(inputs=[i, skill_i],
                               outputs=[loss, results, i],
                               allow_input_downcast=True)

    def f_train((i, (subject_i, skill_i)), learning_rate):
        return tf_train(i, skill_i, learning_rate)
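# Hedged sketch (hypothetical helper, not from build_model): the same Bayesian
# Knowledge Tracing forward recursion as `step` above, in plain numpy, with
# fixed parameters and without the EEG-conditioned transition.
import numpy as np

def bkt_forward(n_steps, p_L0=0.7, p_T=0.5, p_G=0.1, p_S=0.2):
    """Return final p(known) and the per-step p(correct) of a fixed-parameter BKT chain."""
    L = p_L0
    p_correct = []
    for _ in range(n_steps):
        # emission from the current knowledge state (matches p_C in step above)
        p_correct.append(L * (1.0 - p_S) + (1.0 - L) * p_G)
        # learning transition (matches Ln in step above)
        L = L + (1.0 - L) * p_T
    return L, np.array(p_correct)

L_final, p_c = bkt_forward(5)
print(L_final, p_c)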
u0 = T.dscalar("u0")
u1 = T.dscalar("u1")
u2 = T.dscalar("u2")

x_inputs = [x0_v, x1_v, x0_h, x1_h, u0, u1]
u_inputs = [u2]

# Discrete dynamics model definition.
f = T.stack([
    x0_v + (x1_v * dt),
    x1_v + (((U0 / (T0**2)) * alpha_v * u2 +
             (U0 / T0) * beta_v * u1 +
             gamma_v * U0 * u0 -
             phi_v * (X0 / T0) * x1_v -
             xi_v * X0 * x0_v) / (X0 / (T0**2))) * dt,
    # x2_v + ,
    x0_h + (x1_h * dt),
    x1_h + (((U0 / (T0**2)) * alpha_h * u2 +
             (U0 / T0) * beta_h * u1 +
             gamma_h * U0 * u0 -
             phi_h * (X0 / T0) * x1_h -
             xi_h * X0 * x0_h) / (X0 / (T0**2))) * dt,
    # x2_h + ,
    u0 + (u1 * dt),
    u1 + (u2 * dt)
])

dynamics = AutoDiffDynamics(f, x_inputs, u_inputs)
# dynamics = FiniteDiffDynamics(f, 6, 1)
# dynamics = BatchAutoDiffDynamics(f, state_size, action_size)

# Q = np.eye(dynamics.state_size)  # state error
# cost = transpose(x) * Q * x + transpose(u) * R * u
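# Hedged sketch of how a dynamics model like the one above is typically fed to
# an iLQR solver, assuming the `ilqr` package (anassinator/ilqr); exact class
# and method signatures may differ in other versions. The cost weights,
# horizon, and initial state below are made up for illustration.
import numpy as np
from ilqr import iLQR
from ilqr.cost import QRCost

Q = np.eye(dynamics.state_size)          # state error weight (hypothetical)
R = 0.1 * np.eye(dynamics.action_size)   # control effort weight (hypothetical)
cost = QRCost(Q, R)

N = 100                                  # horizon length (hypothetical)
x0 = np.zeros(dynamics.state_size)       # initial state (hypothetical)
us_init = np.zeros((N, dynamics.action_size))

controller = iLQR(dynamics, cost, N)
xs, us = controller.fit(x0, us_init)     # optimized state and control trajectories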
def neibs2images(neibs, neib_shape, original_shape, mode="valid"):
    """
    Function :func:`neibs2images <theano.sandbox.neighbours.neibs2images>`
    performs the inverse operation of
    :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`. It takes
    the output of :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`
    and reconstructs its input.

    Parameters
    ----------
    neibs : 2d tensor
        Like the one obtained by
        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`.
    neib_shape
        `neib_shape` that was used in
        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`.
    original_shape
        Original shape of the 4d tensor given to
        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`.

    Returns
    -------
    object
        Reconstructs the input of
        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`,
        a 4d tensor of shape `original_shape`.

    Notes
    -----
    Currently, the function doesn't support tensors created with
    `neib_step` different from the default value. This means that it may be
    impossible to compute the gradient of a variable gained by
    :func:`images2neibs <theano.sandbox.neighbours.images2neibs>` w.r.t.
    its inputs in this case, because it uses
    :func:`images2neibs <theano.sandbox.neighbours.images2neibs>` for
    gradient computation.

    Examples
    --------
    Example, which uses a tensor gained in the example for
    :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`:

    .. code-block:: python

        im_new = neibs2images(neibs, (5, 5), im_val.shape)
        # Theano function definition
        inv_window = theano.function([neibs], im_new)
        # Function application
        im_new_val = inv_window(neibs_val)

    .. note:: The code will output the initial image array.

    """
    neibs = tt.as_tensor_variable(neibs)
    neib_shape = tt.as_tensor_variable(neib_shape)
    original_shape = tt.as_tensor_variable(original_shape)

    new_neib_shape = tt.stack(
        [original_shape[-1] // neib_shape[1], neib_shape[1]])
    output_2d = images2neibs(
        neibs.dimshuffle("x", "x", 0, 1), new_neib_shape, mode=mode)

    if mode == "ignore_borders":
        # We use set_subtensor so we can accept an original_shape whose value
        # we can't infer, and still raise an error when it doesn't have the
        # right shape.
        valid_shape = original_shape
        valid_shape = tt.set_subtensor(
            valid_shape[2], (valid_shape[2] // neib_shape[0]) * neib_shape[0])
        valid_shape = tt.set_subtensor(
            valid_shape[3], (valid_shape[3] // neib_shape[1]) * neib_shape[1])
        output_4d = output_2d.reshape(valid_shape, ndim=4)
        # pad the borders with zeros
        for d in [2, 3]:
            pad_shape = list(output_4d.shape)
            pad_shape[d] = original_shape[d] - valid_shape[d]
            output_4d = tt.concatenate([output_4d, tt.zeros(pad_shape)], axis=d)
    elif mode == "valid":
        # TODO: we do not implement all modes with this code.
        # Add a check for the good cases.
        output_4d = output_2d.reshape(original_shape, ndim=4)
    else:
        raise NotImplementedError(f"neibs2images does not support mode={mode}")

    return output_4d
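# Hedged usage sketch: a full images2neibs -> neibs2images round trip,
# mirroring the docstring example above. Shapes and variable names are
# illustrative; the import path assumes Theano's theano.tensor.nnet.neighbours
# module (adjust for Aesara or sandbox variants).
import numpy as np
import theano
import theano.tensor as tt
from theano.tensor.nnet.neighbours import images2neibs, neibs2images

images = tt.tensor4('images')
neibs = images2neibs(images, neib_shape=(5, 5))
window = theano.function([images], neibs)

im_val = np.arange(100., dtype='float32').reshape((1, 1, 10, 10))
neibs_val = window(im_val)                         # (4, 25): four 5x5 patches

neibs_in = tt.matrix('neibs')
im_new = neibs2images(neibs_in, (5, 5), im_val.shape)
inv_window = theano.function([neibs_in], im_new)
assert np.allclose(inv_window(neibs_val), im_val)  # reconstructs the original image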