def masked_softmax2(x, mask=None):
    """
    softmax over axis=1
    deal with the case where mask may be all 0
    :param x:
    :param mask:
    :return:
    """
    if x.ndim not in (1, 2) \
            or x.dtype not in T.float_dtypes:
        raise ValueError('x must be 1-d or 2-d tensor of floats. Got %s with ndim=%d' % (x.type, x.ndim))
    if mask is not None and mask.ndim != x.ndim:
        raise ValueError('mask must have the same ndim as x. Got x=%d-d and mask=%d-d' % (x.ndim, mask.ndim))
    if x.ndim == 1:
        x = T.shape_padleft(x, n_ones=1)
    if mask is not None and mask.ndim == 1:
        mask = T.shape_padleft(mask, n_ones=1)
    e_x = T.exp(x - x.max(axis=1)[:, None])
    if mask is not None:
        e_x *= mask
        # avoid 0-division: rows whose mask is all zeros get denominator 1
        denorm = e_x.sum(axis=1) + 1.0 - T.max(mask, axis=1)
    else:
        denorm = e_x.sum(axis=1)
    sm = e_x / denorm[:, None]
    return sm
def masked_softmax(x, mask=None):
    """
    softmax over axis=1
    there must be at least one 1 in mask
    :param x:
    :param mask:
    :return:
    """
    if x.ndim not in (1, 2) \
            or x.dtype not in T.float_dtypes:
        raise ValueError('x must be 1-d or 2-d tensor of floats. Got %s with ndim=%d' % (x.type, x.ndim))
    if mask is not None and mask.ndim != x.ndim:
        raise ValueError('mask must have the same ndim as x. Got x.ndim=%d and mask.ndim=%d' % (x.ndim, mask.ndim))
    if x.ndim == 1:
        x = T.shape_padleft(x, n_ones=1)
    if mask is not None and mask.ndim == 1:
        mask = T.shape_padleft(mask, n_ones=1)
    e_x = T.exp(x - x.max(axis=1)[:, None])
    if mask is not None:
        e_x *= mask
    sm = e_x / e_x.sum(axis=1)[:, None]
    return sm
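# --- Hedged usage sketch (not part of the original code). Assumes the
# masked_softmax2 above is in scope and Theano/NumPy are available. It shows
# the point of the extra `denorm` term: a row whose mask is all zeros yields an
# all-zero distribution instead of NaNs. ---
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
m = T.matrix('m')
f = theano.function([x, m], masked_softmax2(x, m))

xv = np.random.randn(3, 4).astype(theano.config.floatX)
mv = np.array([[1, 1, 0, 0],
               [1, 1, 1, 1],
               [0, 0, 0, 0]], dtype=theano.config.floatX)
print(f(xv, mv))  # rows sum to 1 where the mask has ones; the last row is all zeros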
def bn_ff(x, mean, var, gamma=1., beta=0., prefix="", axis=0, use_popstats=False): assert x.ndim == 2 if not use_popstats: mean = x.mean(axis=axis) var = x.var(axis=axis) mean.tag.bn_statistic = True mean.tag.bn_label = prefix + "_mean" var.tag.bn_statistic = True var.tag.bn_label = prefix + "_var" var_corrected = var + 1e-7 y = theano.tensor.nnet.bn.batch_normalization(inputs=x, gamma=gamma, beta=beta, mean=T.shape_padleft(mean), std=T.shape_padleft( T.sqrt(var_corrected))) assert y.ndim == 2 return y, mean, var
def create_prediction(self):  # method that performs one round of prediction
    gfs = self.gfs
    pm25in = self.pm25in
    # initial (first) forward pass
    x = T.concatenate([gfs[:, 0], gfs[:, 1], gfs[:, 2], pm25in[:, 0], pm25in[:, 1], self.cnt[:, :, 0]], axis=1)
    if self.celltype == RNN:
        init_hiddens = [(T.repeat(T.shape_padleft(create_shared(layer.hidden_size, name="RNN.initial_hidden_state")),
                                  x.shape[0], axis=0)
                         if x.ndim > 1 else create_shared(layer.hidden_size, name="RNN.initial_hidden_state"))
                        if hasattr(layer, 'initial_hidden_state') else None
                        for layer in self.model.layers]
    if self.celltype == LSTM:
        init_hiddens = [(T.repeat(T.shape_padleft(create_shared(layer.hidden_size * 2, name="LSTM.initial_hidden_state")),
                                  x.shape[0], axis=0)
                         if x.ndim > 1 else create_shared(layer.hidden_size * 2, name="LSTM.initial_hidden_state"))
                        if hasattr(layer, 'initial_hidden_state') else None
                        for layer in self.model.layers]
    self.layerstatus = self.model.forward(x, init_hiddens)
    # results.shape? 40*1
    self.results = self.layerstatus[-1]
    if self.steps > 1:
        self.layerstatus = self.model.forward(
            T.concatenate([gfs[:, 1], gfs[:, 2], gfs[:, 3], pm25in[:, 1], self.results, self.cnt[:, :, 1]], axis=1),
            self.layerstatus)
        self.results = T.concatenate([self.results, self.layerstatus[-1]], axis=1)
    # forward passes for the remaining steps-2 steps
    for i in xrange(2, self.steps):
        self.layerstatus = self.model.forward(
            T.concatenate([gfs[:, i], gfs[:, i + 1], gfs[:, i + 2],
                           T.shape_padright(self.results[:, i - 2]),
                           T.shape_padright(self.results[:, i - 1]),
                           self.cnt[:, :, i]], axis=1),
            self.layerstatus)  # need T.shape_padright???
        self.results = T.concatenate([self.results, self.layerstatus[-1]], axis=1)
    return self.results
def sym_mask_logdensity_estimator_intermediate(self, x, mask): non_linearity_name = self.parameters["nonlinearity"].get_name() assert non_linearity_name == "sigmoid" or non_linearity_name == "RLU" x = x.T # BxD mask = mask.T # BxD output_mask = constantX(1) - mask # BxD D = constantX(self.n_visible) d = mask.sum(1) # d is the 1-based index of the dimension whose value to infer (not the size of the context) masked_input = x * mask # BxD h = self.nonlinearity(T.dot(masked_input, self.W1) + T.dot(mask, self.Wflags) + self.b1) # BxH for l in xrange(self.n_layers - 1): h = self.nonlinearity(T.dot(h, self.Ws[l]) + self.bs[l]) # BxH z_alpha = T.tensordot(h, self.V_alpha, [[1], [1]]) + T.shape_padleft(self.b_alpha) z_mu = T.tensordot(h, self.V_mu, [[1], [1]]) + T.shape_padleft(self.b_mu) z_sigma = T.tensordot(h, self.V_sigma, [[1], [1]]) + T.shape_padleft(self.b_sigma) temp = T.exp(z_alpha) # + 1e-6 # temp += T.shape_padright(temp.sum(2)/1e-3) Alpha = temp / T.shape_padright(temp.sum(2)) # BxDxC Mu = z_mu # BxDxC Sigma = T.exp(z_sigma) # + 1e-6 #BxDxC # Alpha = Alpha * T.shape_padright(output_mask) + T.shape_padright(mask) # Mu = Mu * T.shape_padright(output_mask) # Sigma = Sigma * T.shape_padright(output_mask) + T.shape_padright(mask) # Phi = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x*output_mask)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2*np.pi)) #BxDxC Phi = ( -constantX(0.5) * T.sqr((Mu - T.shape_padright(x)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi)) ) # BxDxC logdensity = (log_sum_exp(Phi + T.log(Alpha), axis=2) * output_mask).sum(1) * D / (D - d) return (logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h)
def generic_compute_Lx_batches(samples, weights, biases, bs, cbs): tsamples = [x.reshape((bs//cbs, cbs, x.shape[1])) for x in samples] final_ws = [T.unbroadcast(T.shape_padleft(T.zeros_like(x)),0) for x in weights] final_bs = [T.unbroadcast(T.shape_padleft(T.zeros_like(x)),0) for x in biases] n_samples = len(samples) n_weights = len(weights) n_biases = len(biases) def comp_step(*args): lsamples = args[:n_samples] terms1 = generic_compute_Lx_term1(lsamples, weights, biases) rval = [] for (term1, acc) in zip(terms1, args[n_samples:]): rval += [acc + term1] return rval rvals,_ = theano.sandbox.scan.scan( comp_step, sequences=tsamples, states=final_ws + final_bs, n_steps=bs // cbs, profile=0, mode=theano.Mode(linker='cvm_nogc'), flags=['no_optimization'] ) accs1 = [x[0]/numpy.float32(bs//cbs) for x in rvals] accs2 = generic_compute_Lx_term2(samples,weights,biases) return [x - y for x, y in zip(accs1, accs2)]
def pos_phase(self, v, init_state, n_steps=1, eps=1e-3): """ Mixed mean-field + sampling inference in positive phase. :param v: input being conditioned on :param init: dictionary of initial values :param n_steps: number of Gibbs updates to perform afterwards. """ def pos_mf_iteration(g1, h1, v, pos_counter): h2 = self.h_hat(g1, v) s2_1 = self.s1_hat(g1, v) s2_0 = self.s0_hat(g1, v) g2 = self.g_hat(h2, s2_1, s2_0) # stopping criterion dl_dghat = T.max(abs(self.dlbound_dg(g2, h2, s2_1, s2_0, v))) dl_dhhat = T.max(abs(self.dlbound_dh(g2, h2, s2_1, s2_0, v))) stop = T.maximum(dl_dghat, dl_dhhat) return [g2, h2, s2_1, s2_0, v, pos_counter + 1], theano.scan_module.until(stop < eps) states = [T.unbroadcast(T.shape_padleft(init_state['g'])), T.unbroadcast(T.shape_padleft(init_state['h'])), {'steps': 1}, {'steps': 1}, T.unbroadcast(T.shape_padleft(v)), T.unbroadcast(T.shape_padleft(0.))] rvals, updates = scan( pos_mf_iteration, states = states, n_steps=n_steps) return [rval[0] for rval in rvals]
def compute_Lx_batches(v, g, h, xw_mat, xv_mat, xa, xb, xc, bs, cbs): xw = xw_mat.flatten() xv = xv_mat.flatten() tv = v.reshape((bs // cbs, cbs, v.shape[1])) tg = g.reshape((bs // cbs, cbs, g.shape[1])) th = h.reshape((bs // cbs, cbs, h.shape[1])) final_w1 = T.unbroadcast(T.shape_padleft(T.zeros_like(xw_mat)),0) final_v1 = T.unbroadcast(T.shape_padleft(T.zeros_like(xv_mat)),0) final_a1 = T.unbroadcast(T.shape_padleft(T.zeros_like(xa)),0) final_b1 = T.unbroadcast(T.shape_padleft(T.zeros_like(xb)),0) final_c1 = T.unbroadcast(T.shape_padleft(T.zeros_like(xc)),0) def comp_step(lv, lg, lh, acc_w1, acc_v1, acc_a1, acc_b1, acc_c1): terms1 = compute_Lx_term1(lv, lg, lh, xw, xv, xa, xb, xc) accs1 = [acc_w1, acc_v1, acc_a1, acc_b1, acc_c1] rval = [] for (term1, acc) in zip(terms1,accs1): rval += [acc + term1] return rval rvals,_ = theano.sandbox.scan.scan( comp_step, sequences=[tv,tg,th], states=[ final_w1, final_v1, final_a1, final_b1, final_c1], n_steps=bs // cbs, profile=0, mode=theano.Mode(linker='cvm_nogc'), flags=['no_optimization'] ) accs1 = [x[0]/numpy.float32(bs//cbs) for x in rvals] accs2 = compute_Lx_term2(v,g,h,xw,xv,xa,xb,xc) return [x - y for x, y in zip(accs1, accs2)]
def bn(x, gammas, betas, mean, var, args): assert mean.ndim == 1 assert var.ndim == 1 assert x.ndim == 2 if not args.use_population_statistics: mean = x.mean(axis=0) var = x.var(axis=0) #var = T.maximum(var, args.epsilon) #var = var + args.epsilon if args.baseline: y = x + betas else: var_corrected = var + args.epsilon y = theano.tensor.nnet.bn.batch_normalization( inputs=x, gamma=gammas, beta=betas, mean=T.shape_padleft(mean), std=T.shape_padleft(T.sqrt(var_corrected)), mode="high_mem") assert mean.ndim == 1 assert var.ndim == 1 return y, mean, var
def NVIL(discriminator, log_Z, g_output_logit, n_samples, trng, batch_size=64): R = trng.uniform(size=(n_samples, batch_size, DIM_C, DIM_X, DIM_Y), dtype=floatX_) g_output = T.nnet.sigmoid(g_output_logit) samples = (R <= T.shape_padleft(g_output)).astype(floatX_) D_r = lasagne.layers.get_output(discriminator) D_f = lasagne.layers.get_output(discriminator, samples.reshape((-1, DIM_C, DIM_X, DIM_Y))) D_f_ = D_f.reshape((n_samples, batch_size)) log_w = D_f_ log_g = -( (1. - samples) * T.shape_padleft(g_output_logit) + T.shape_padleft(T.nnet.softplus(-g_output_logit))).sum(axis=(2, 3, 4)) log_N = T.log(log_w.shape[0]).astype(floatX_) log_Z_est = log_sum_exp(log_w - log_N, axis=0) log_w_tilde = log_w - T.shape_padleft(log_Z_est) - log_N w_tilde = T.exp(log_w_tilde) r = theano.gradient.disconnected_grad((log_w - log_Z - 1)) generator_loss = -(r * log_g).mean() discriminator_loss = (T.nnet.softplus(-D_r)).mean() + ( T.nnet.softplus(-D_f)).mean() + D_f.mean() return generator_loss, discriminator_loss, D_r, D_f, log_Z_est, log_w, w_tilde, {}
def Step_decoding(self): ## if i_y = None, then it means decoding the first word i_y = tensor.matrix('i_y', dtype='int64') i_y_mask = tensor.matrix('i_y_mask', dtype=config.floatX) h_ = tensor.matrix('h', dtype=config.floatX) c_ = tensor.matrix('c', dtype=config.floatX) flag = tensor.scalar('flag', dtype=config.floatX) state_below = tensor.alloc(numpy_floatX(0.), i_y_mask.shape[0], i_y_mask.shape[1], self.dim) ## shape=(1,n,d) shape = (i_y.shape[0], i_y.shape[1], self.dim) #i_y_repr = self.emb.emb_W[i_y.flatten()].reshape(shape) i_y_repr = self.emb.fprop(i_y) state_below = ifelse(tensor.gt(flag, 0.5), i_y_repr, state_below) #state_below = tensor.switch(tensor.gt(flag, 0.5), self.emb.fprop(i_y), state_below) proj_h, proj_c = self.fprop(state_below, i_y_mask, h_, c_) proj_h, proj_c = proj_h[0], proj_c[0] final_layer_h = _slice(proj_h, self.layers - 1, self.dim) proj_xx = tensor.dot(final_layer_h, self.U) proj_x = proj_xx + self.b assert proj_h.ndim == 2, 'ndim error' self.dbg = theano.function([i_y,i_y_mask,h_,c_,flag],\ [proj_h, self.U.shape, self.b.shape],on_unused_input='ignore') prob = softmax(proj_x) self.comp_next_probs_hc = theano.function([i_y,i_y_mask,h_,c_,flag], \ [tensor.shape_padleft(prob), tensor.shape_padleft(proj_h), tensor.shape_padleft(proj_c), proj_x, ])
def set_output(self): self._output = tensor.sum( tensor.shape_padright(self._prev_layer.output) * tensor.shape_padleft(self.W.val), axis=-2) if self._bias: self._output += tensor.shape_padleft(self.b.val)
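# --- Illustrative sketch (not from the original layer). The broadcasting idiom
# in set_output above: padding a batch of activations on the right and the
# weight matrix on the left turns elementwise product + sum over the shared
# axis into an ordinary matrix product. ---
import numpy as np
import theano
import theano.tensor as T

X = T.matrix('X')  # (batch, d_in)
W = T.matrix('W')  # (d_in, d_out)
y_pad = T.sum(T.shape_padright(X) * T.shape_padleft(W), axis=-2)
y_dot = T.dot(X, W)

f = theano.function([X, W], [y_pad, y_dot])
a = np.random.randn(5, 3).astype(theano.config.floatX)
b = np.random.randn(3, 4).astype(theano.config.floatX)
out_pad, out_dot = f(a, b)
assert np.allclose(out_pad, out_dot, atol=1e-5)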
def bn(x, gammas, betas, mean, var, args, axis=0, prefix="", normalize=True, use_popstats=False): assert x.ndim == 2 if not normalize or not use_popstats: ### FIXME do we want to share statistics across space as well? mean = x.mean(axis=axis) var = x.var(axis=axis) mean.tag.bn_statistic = True mean.tag.bn_label = prefix + "_mean" var.tag.bn_statistic = True var.tag.bn_label = prefix + "_var" if not args.use_bn: y = x + betas else: var_corrected = var + 1e-7 y = theano.tensor.nnet.bn.batch_normalization( inputs=x, gamma=gammas, beta=betas, mean=T.shape_padleft(mean), std=T.shape_padleft(T.sqrt(var_corrected)), mode="low_mem") return y, mean, var
def eval(r, s, coord='xy'):
    if coord == 'uv':
        return self.eval(r, s, coord) * other.eval(r, s, coord)
    else:  # xy
        m1 = self.eval(r, s, coord)
        m2 = other.eval(r, s, coord)
        # a try/except doesn't work here: sometimes fftconvolve is perfectly happy with a Theano object
        # note that for this to work, coord must be a uniform grid
        if all(isinstance(obj, (int, long, float, complex, np.ndarray)) for obj in (m1, m2)):
            from scipy.signal import fftconvolve
            dv = (r[0, 1] - r[0, 0]) * (s[1, 0] - s[0, 0])  # must be from e.g. meshgrid
            ret = fftconvolve(m1, m2, mode='same') * dv
            print("numpy path (convolve)")
            return ret
        else:
            print("theano path (convolve)")
            import theano.tensor as T
            dv = (r[0, 1] - r[0, 0]) * (s[1, 0] - s[0, 0])  # must be from e.g. meshgrid
            # ret = fftconvolve(m1, m2, mode='same') * dv
            m1pad = T.shape_padleft(m1, 2)
            m2pad = T.shape_padleft(m2, 2)
            ret = T.nnet.conv2d(
                m1pad, m2pad, border_mode='half', filter_flip=False)[0, 0] / dv
            return ret
def filter_and_prob(inpt, transition, emission, visible_noise_mean, visible_noise_cov, hidden_noise_mean, hidden_noise_cov, initial_hidden, initial_hidden_cov): step = forward_step( transition, emission, visible_noise_mean, visible_noise_cov, hidden_noise_mean, hidden_noise_cov) hidden_mean_0 = T.zeros_like(hidden_noise_mean).dimshuffle('x', 0) hidden_cov_0 = T.zeros_like(hidden_noise_cov).dimshuffle('x', 0, 1) f0, F0, ll0 = step(inpt[0], hidden_mean_0, hidden_cov_0) replace = {hidden_noise_mean: initial_hidden, hidden_noise_cov: initial_hidden_cov} f0 = theano.clone(f0, replace) F0 = theano.clone(F0, replace) ll0 = theano.clone(ll0, replace) (f, F, ll), _ = theano.scan( step, sequences=inpt[1:], outputs_info=[f0, F0, None]) ll = ll.sum(axis=0) f = T.concatenate([T.shape_padleft(f0), f]) F = T.concatenate([T.shape_padleft(F0), F]) ll += ll0 return f, F, ll
def BGAN(discriminator, g_output_logit, n_samples, trng, batch_size=64): d = OrderedDict() R = trng.uniform(size=(n_samples, batch_size, DIM_C, DIM_X, DIM_Y), dtype=floatX_) g_output = T.nnet.sigmoid(g_output_logit) samples = (R <= T.shape_padleft(g_output)).astype(floatX_) # Create expression for passing real data through the discriminator D_r = lasagne.layers.get_output(discriminator) D_f = lasagne.layers.get_output(discriminator, samples.reshape((-1, DIM_C, DIM_X, DIM_Y))) D_f_ = D_f.reshape((n_samples, batch_size)) log_d1 = -T.nnet.softplus(-D_f_) log_d0 = -(D_f_ + T.nnet.softplus(-D_f_)) log_w = log_d1 - log_d0 log_g = -( (1. - samples) * T.shape_padleft(g_output_logit) + T.shape_padleft(T.nnet.softplus(-g_output_logit))).sum(axis=(2, 3, 4)) log_N = T.log(log_w.shape[0]).astype(floatX_) log_Z_est = log_sum_exp(log_w - log_N, axis=0) log_w_tilde = log_w - T.shape_padleft(log_Z_est) - log_N w_tilde = T.exp(log_w_tilde) w_tilde_ = theano.gradient.disconnected_grad(w_tilde) generator_loss = -(w_tilde_ * log_g).sum(0).mean() discriminator_loss = (T.nnet.softplus(-D_r)).mean() + ( T.nnet.softplus(-D_f)).mean() + D_f.mean() return generator_loss, discriminator_loss, D_r, D_f, log_Z_est, log_w, w_tilde, d
def attention_gate(self, facts, memory, question): # TODO: for the first iteration question and memory are the same so # we can speedup the computation # facts is (num_batch * fact_length * memory_dim) # questions is (num_batch * memory_dim) # memory is (num_batch * memory_dim) # attention_gates must be (fact_length * nb_batch * 1) # Compute z (num_batch * fact_length * (7*memory_dim + 2)) # Dimshuffle facts to get a shape of # (fact_length * num_batch * memory_dim) facts = facts.dimshuffle(1, 0, 2) # Pad questions and memory to be of shape # (_ * num_batch * memory_dim) memory = T.shape_padleft(memory) question = T.shape_padleft(question) to_concatenate = list() to_concatenate.extend([facts, memory, question]) to_concatenate.extend([facts * question, facts * memory]) to_concatenate.extend([T.abs_(facts - question), T.abs_(facts - memory)]) # z = concatenate(to_concatenate, axis=2) # TODO: to be continued for the moment just return ones return T.ones((facts.shape[1], facts.shape[0], 1))
def im_to_col(im, psize, n_channels=3): """Similar to MATLAB's im2col function. Args: im - a Theano tensor3, of the form <n_channels, height, width>. psize - an int specifying the (square) block size to use n_channels - the number of channels in im Returns: a 5-tensor of the form <patch_id_i, patch_id_j, n_channels, psize, psize>. """ assert im.ndim == 3, "im must have dimension 3." im = im[:, ::-1, ::-1] res = T.zeros((n_channels, psize * psize, im.shape[1] - psize + 1, im.shape[2] - psize + 1)) filts = T.reshape(T.eye(psize * psize, psize * psize), (psize * psize, psize, psize)) filts = T.shape_padleft(filts).dimshuffle((1, 0, 2, 3)) for i in range(n_channels): cur_slice = T.shape_padleft(im[i], n_ones=2) res = T.set_subtensor(res[i], T.nnet.conv.conv2d(cur_slice, filts)[0]) return res.dimshuffle((0, 2, 3, 1)).reshape( (n_channels, im.shape[1] - psize + 1, im.shape[2] - psize + 1, psize, psize)).dimshuffle((1, 2, 0, 3, 4))
def density_given_previous_a_and_x( x, w, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma, activation_factor, p_prev, a_prev, x_prev, ): a = a_prev + x_prev * w h = self.nonlinearity(a * activation_factor) # BxH Alpha = T.nnet.softmax( T.dot(h, V_alpha) + T.shape_padleft(b_alpha)) # BxC Mu = T.dot(h, V_mu) + T.shape_padleft(b_mu) # BxC Sigma = T.exp( (T.dot(h, V_sigma) + T.shape_padleft(b_sigma))) # BxC p = p_prev + log_sum_exp( -constantX(0.5) * T.sqr((Mu - x) / Sigma) - T.log(Sigma) - constantX(0.5 * numpy.log(2 * numpy.pi)) + T.log(Alpha)) return (p, a, x)
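# --- Minimal sketch of the bias-broadcast idiom used above (assumed shapes,
# not part of the original model). T.shape_padleft turns a (C,) bias into a
# (1, C) row so it broadcasts over the batch axis, i.e.
# T.dot(h, V) + T.shape_padleft(b) adds b to every row of the BxC matrix. ---
import numpy as np
import theano
import theano.tensor as T

h = T.matrix('h')  # (B, H)
V = T.matrix('V')  # (H, C)
b = T.vector('b')  # (C,)
f = theano.function([h, V, b], T.dot(h, V) + T.shape_padleft(b))

B, H, C = 2, 3, 4
out = f(np.ones((B, H), dtype=theano.config.floatX),
        np.ones((H, C), dtype=theano.config.floatX),
        np.arange(C, dtype=theano.config.floatX))
assert np.allclose(out, 3.0 + np.arange(C))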
def __init__(self, n, p, *args, **kwargs): super(Multinomial, self).__init__(*args, **kwargs) p = p / tt.sum(p, axis=-1, keepdims=True) n = np.squeeze(n) # works also if n is a tensor if len(self.shape) > 1: m = self.shape[-2] try: assert n.shape == (m,) except (AttributeError, AssertionError): n = n * tt.ones(m) self.n = tt.shape_padright(n) self.p = p if p.ndim > 1 else tt.shape_padleft(p) elif n.ndim == 1: self.n = tt.shape_padright(n) self.p = p if p.ndim > 1 else tt.shape_padleft(p) else: # n is a scalar, p is a 1d array self.n = tt.as_tensor_variable(n) self.p = tt.as_tensor_variable(p) self.mean = self.n * self.p mode = tt.cast(tt.round(self.mean), 'int32') diff = self.n - tt.sum(mode, axis=-1, keepdims=True) inc_bool_arr = tt.abs_(diff) > 0 mode = tt.inc_subtensor(mode[inc_bool_arr.nonzero()], diff[inc_bool_arr.nonzero()]) self.mode = mode
def __init__(self, n, p, *args, **kwargs): super(Multinomial, self).__init__(*args, **kwargs) p = p / tt.sum(p, axis=-1, keepdims=True) n = np.squeeze(n) # works also if n is a tensor if len(self.shape) > 1: m = self.shape[-2] try: assert n.shape == (m, ) except (AttributeError, AssertionError): n = n * tt.ones(m) self.n = tt.shape_padright(n) self.p = p if p.ndim > 1 else tt.shape_padleft(p) elif n.ndim == 1: self.n = tt.shape_padright(n) self.p = p if p.ndim > 1 else tt.shape_padleft(p) else: # n is a scalar, p is a 1d array self.n = tt.as_tensor_variable(n) self.p = tt.as_tensor_variable(p) self.mean = self.n * self.p mode = tt.cast(tt.round(self.mean), 'int32') diff = self.n - tt.sum(mode, axis=-1, keepdims=True) inc_bool_arr = tt.abs_(diff) > 0 mode = tt.inc_subtensor(mode[inc_bool_arr.nonzero()], diff[inc_bool_arr.nonzero()]) self.mode = mode
def _span_sums(stt, end, p_lens, max_p_len, batch_size, dim, max_ans_len): # Sum of every start element and corresponding max_ans_len end elements. # # stt (max_p_len, batch_size, dim) # end (max_p_len, batch_size, dim) # p_lens (batch_size,) max_ans_len_range = tt.shape_padleft(tt.arange(max_ans_len)) # (1, max_ans_len) offsets = tt.shape_padright(tt.arange(max_p_len)) # (max_p_len, 1) end_idxs = max_ans_len_range + offsets # (max_p_len, max_ans_len) end_idxs_flat = end_idxs.flatten() # (max_p_len*max_ans_len,) end_padded = tt.concatenate( # (max_p_len+max_ans_len-1, batch_size, dim) [end, tt.zeros((max_ans_len-1, batch_size, dim))], axis=0) end_structured = end_padded[end_idxs_flat] # (max_p_len*max_ans_len, batch_size, dim) end_structured = end_structured.reshape( # (max_p_len, max_ans_len, batch_size, dim) (max_p_len, max_ans_len, batch_size, dim)) stt_shuffled = stt.dimshuffle((0,'x',1,2)) # (max_p_len, 1, batch_size, dim) span_sums = stt_shuffled + end_structured # (max_p_len, max_ans_len, batch_size, dim) span_sums_reshaped = span_sums.dimshuffle((2,0,1,3)).reshape( # (batch_size, max_p_len*max_ans_len, dim) (batch_size, max_p_len*max_ans_len, dim)) p_lens_shuffled = tt.shape_padright(p_lens) # (batch_size, 1) end_idxs_flat_shuffled = tt.shape_padleft(end_idxs_flat) # (1, max_p_len*max_ans_len) span_masks_reshaped = tt.lt(end_idxs_flat_shuffled, p_lens_shuffled) # (batch_size, max_p_len*max_ans_len) span_masks_reshaped = cast_floatX(span_masks_reshaped) # (batch_size, max_p_len*max_ans_len, dim), (batch_size, max_p_len*max_ans_len) return span_sums_reshaped, span_masks_reshaped
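# --- Small illustrative sketch (not from the original code) of the index
# arithmetic in _span_sums: padding arange(max_ans_len) on the left and
# arange(max_p_len) on the right broadcasts into a matrix with
# end_idxs[i, j] = i + j, the candidate answer end positions for each start i. ---
import theano.tensor as T

max_ans_len, max_p_len = 3, 4
end_idxs = T.shape_padleft(T.arange(max_ans_len)) + T.shape_padright(T.arange(max_p_len))
print(end_idxs.eval())
# [[0 1 2]
#  [1 2 3]
#  [2 3 4]
#  [3 4 5]]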
def get_output_for(self, input, **kwargs): if self.tied_feamap: return input * T.gt(input, 0) + input * T.le(input, 0) \ * T.shape_padleft(T.shape_padright(self.W[seg], n_ones = len(input_dim) - 2)) else: return input * T.gt(input, 0) + input * T.le(input, 0) \ * T.shape_padleft(self.W)
def bn(x, gammas, betas, mean, var, args): assert mean.ndim == 1 assert var.ndim == 1 assert x.ndim == 2 if not args.use_population_statistics: mean = x.mean(axis=0) var = x.var(axis=0) #var = T.maximum(var, args.epsilon) #var = var + args.epsilon if args.baseline: y = x + betas else: #var_corrected = var.zeros_like() + 1.0 if args.clipvar: var_corrected = theano.tensor.switch(theano.tensor.eq(var, 0.), 1.0, var + args.epsilon) else: var_corrected = var + args.epsilon y = theano.tensor.nnet.bn.batch_normalization( inputs=x, gamma=gammas, beta=betas, mean=T.shape_padleft(mean), std=T.shape_padleft(T.sqrt(var_corrected)), mode="high_mem") assert mean.ndim == 1 assert var.ndim == 1 return y, mean, var
def sample(self, x0=None, h0s=None, n_samples=10, n_steps=10, condition_on=None, debug=False): '''Samples from an initial state. Args: x0 (Optional[T.tensor]): initial input state. h0 (Optional[T.tensor]): initial recurrent state. n_samples (Optional[int]): if no x0 or h0, used to initial batch. Number of chains. n_steps (int): number of sampling steps. Returns: OrderedDict: dictionary of results. hiddens, probabilities, and preacts. theano.OrderedUpdates: updates. ''' if x0 is None: x0, _ = self.output_net.sample( p=T.constant(0.5).astype(floatX), size=(n_samples, self.output_net.dim_out)).astype(floatX) if h0s is None: h0s = [ T.alloc(0., x.shape[1], dim_h).astype(floatX) for dim_h in self.dim_hs ] seqs = [] outputs_info = h0s + [x0, None, None] non_seqs = [] non_seqs += self.get_sample_params() if n_steps == 1: inps = outputs_info[:-2] + non_seqs outs = self.step_sample(*inps) updates = theano.OrderedUpdates() hs = outs[:self.n_layers] x, p, z = outs[-3:] x = T.shape_padleft(x) p = T.shape_padleft(p) z = T.shape_padleft(z) hs = [T.shape_padleft(h) for h in hs] else: outs, updates = scan(self.step_sample, seqs, outputs_info, non_seqs, n_steps, name=self.name + '_sampling', strict=False) hs = outs[:self.n_layers] x, p, z = outs[-3:] return OrderedDict(x=x, p=p, z=z, hs=hs), updates
def scalar_armijo_search(phi, phi0, derphi0, c1=constant(1e-4), n_iters=10, profile=0): """ .. todo:: WRITEME """ alpha0 = one phi_a0 = phi(alpha0) alpha1 = -(derphi0) * alpha0 ** 2 / 2.0 /\ (phi_a0 - phi0 - derphi0 * alpha0) phi_a1 = phi(alpha1) csol1 = phi_a0 <= phi0 + c1 * derphi0 csol2 = phi_a1 <= phi0 + c1 * alpha1 * derphi0 def armijo(alpha0, alpha1, phi_a0, phi_a1): factor = alpha0**2 * alpha1**2 * (alpha1 - alpha0) a = alpha0 ** 2 * (phi_a1 - phi0 - derphi0 * alpha1) - \ alpha1 ** 2 * (phi_a0 - phi0 - derphi0 * alpha0) a = a / factor b = -alpha0 ** 3 * (phi_a1 - phi0 - derphi0 * alpha1) + \ alpha1 ** 3 * (phi_a0 - phi0 - derphi0 * alpha0) b = b / factor alpha2 = (-b + TT.sqrt(abs(b**2 - 3 * a * derphi0))) / (3.0 * a) phi_a2 = phi(alpha2) end_condition = phi_a2 <= phi0 + c1 * alpha2 * derphi0 end_condition = TT.bitwise_or(TT.isnan(alpha2), end_condition) end_condition = TT.bitwise_or(TT.isinf(alpha2), end_condition) alpha2 = TT.switch( TT.bitwise_or(alpha1 - alpha2 > alpha1 / constant(2.), one - alpha2 / alpha1 < 0.96), alpha1 / constant(2.), alpha2) return [alpha1, alpha2, phi_a1, phi_a2], \ theano.scan_module.until(end_condition) states = [] states += [TT.unbroadcast(TT.shape_padleft(alpha0), 0)] states += [TT.unbroadcast(TT.shape_padleft(alpha1), 0)] states += [TT.unbroadcast(TT.shape_padleft(phi_a0), 0)] states += [TT.unbroadcast(TT.shape_padleft(phi_a1), 0)] # print 'armijo' rvals, _ = scan(armijo, states=states, n_steps=n_iters, name='armijo', mode=theano.Mode(linker='cvm'), profile=profile) sol_scan = rvals[1][0] a_opt = ifelse(csol1, one, ifelse(csol2, alpha1, sol_scan)) score = ifelse(csol1, phi_a0, ifelse(csol2, phi_a1, rvals[2][0])) return a_opt, score
def get_output(self, train=False):
    X = self.get_input(train=train)
    c0 = self.c0[None, :] * T.ones((X.shape[0], self.context_dim))
    cn = self.cn[None, :] * T.ones((X.shape[0], self.context_dim))
    X = T.concatenate(
        [
            T.shape_padleft(self.e0, 2) * T.ones((X.shape[0], 1, X.shape[2])),
            X,
            T.shape_padleft(self.en, 2) * T.ones((X.shape[0], 1, X.shape[2])),
        ], axis=1)
    X = X.dimshuffle(1, 0, 2)  # move the timestep axis to the front
    # The int32 mask must be cast to float32, otherwise mask_t[:, None] * cl_t
    # inside scan gets upcast to float64.
    mask = T.cast(self.get_output_mask(train=train), T.config.floatX)
    mask = mask.dimshuffle(1, 0)  # move the timestep axis to the front
    #theano.printing.debugprint([mask], print_type=True)

    def _forward_step(e_t, e_tm1, mask_t, cl_tm1):
        #print 'e_t:', e_t.type.ndim
        #print 'cl_t:', cl_tm1.type.ndim
        cl_t = T.nnet.sigmoid(T.dot(cl_tm1, self.Wl) + T.dot(e_tm1, self.Wsl))
        cl_t = mask_t[:, None] * cl_t + (1. - mask_t[:, None]) * cl_tm1  # if this position is masked, carry the previous context over
        #theano.printing.debugprint([mask_t], print_type=True)
        #theano.printing.debugprint([cl_t], print_type=True)
        return cl_t

    def _backward_step(e_t, e_tp1, mask_t, cr_tp1):
        cr_t = T.nnet.sigmoid(T.dot(cr_tp1, self.Wr) + T.dot(e_tp1, self.Wsr))
        cr_t = mask_t[:, None] * cr_t + (1. - mask_t[:, None]) * cr_tp1  # if this position is masked, carry the previous context over
        return cr_t

    Cl, _ = theano.scan(_forward_step,
                        sequences=[dict(input=X, taps=[0, -1]), mask],
                        outputs_info=[
                            dict(initial=c0, taps=[-1])  # note: not c0!!!
                        ],
                        )
    Cr, _ = theano.scan(_backward_step,
                        sequences=[dict(input=X, taps=[0, -1]), mask],
                        outputs_info=[
                            dict(initial=cn, taps=[-1])
                        ],
                        go_backwards=True,
                        )
    Cr = Cr[::-1]  # reverse Cr

    def _concatenate_activation_step(e_t, mask_t, cl_t, cr_t):
        #print theano.printing.debugprint(cr_t, print_type=True)
        h_t = T.tanh(T.dot(T.concatenate([e_t, cl_t, cr_t], axis=1), self.W2) + self.b2)
        h_t = mask_t[:, None] * h_t + (1. - mask_t[:, None]) * (-10000000000.)  # set masked positions to a very negative value
        return h_t

    Y, _ = theano.scan(_concatenate_activation_step,
                       sequences=[X, mask, Cl, Cr],
                       outputs_info=None,
                       )
    return Y.dimshuffle(1, 0, 2)  # put the samples back on the first axis
def transform_targets(targets): """Transform targets into a format suitable for passing to cost().""" reshaped = T.shape_padleft(targets) blanks = T.fill(reshaped, _BLANK) result = T.concatenate([blanks, reshaped]).dimshuffle(1, 0, 2).reshape((2*targets.shape[0], targets.shape[1])) result = T.concatenate([result, T.shape_padleft(result[0])]) return result
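# --- Hedged usage sketch for transform_targets above. Assumes the function is
# defined in the same scope as this snippet and that the module constant
# _BLANK equals 0 (it is not shown in the original). The transform interleaves
# a blank before every label and appends a trailing blank, the standard CTC
# target layout. ---
import numpy as np
import theano
import theano.tensor as T

_BLANK = np.int32(0)  # assumed blank index

targets = T.imatrix('targets')  # (target_len, batch)
f = theano.function([targets], transform_targets(targets))
y = np.array([[1, 4],
              [2, 5],
              [3, 6]], dtype='int32')  # two label sequences of length 3
print(f(y).T)
# [[0 1 0 2 0 3 0]
#  [0 4 0 5 0 6 0]]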
def scalar_armijo_search(phi, phi0, derphi0, c1=constant(1e-4), n_iters=10, profile=0): alpha0 = one phi_a0 = phi(alpha0) alpha1 = -(derphi0) * alpha0 ** 2 / 2.0 /\ (phi_a0 - phi0 - derphi0 * alpha0) phi_a1 = phi(alpha1) csol1 = phi_a0 <= phi0 + c1 * derphi0 csol2 = phi_a1 <= phi0 + c1 * alpha1 * derphi0 def armijo(alpha0, alpha1, phi_a0, phi_a1): factor = alpha0 ** 2 * alpha1 ** 2 * (alpha1 - alpha0) a = alpha0 ** 2 * (phi_a1 - phi0 - derphi0 * alpha1) - \ alpha1 ** 2 * (phi_a0 - phi0 - derphi0 * alpha0) a = a / factor b = -alpha0 ** 3 * (phi_a1 - phi0 - derphi0 * alpha1) + \ alpha1 ** 3 * (phi_a0 - phi0 - derphi0 * alpha0) b = b / factor alpha2 = (-b + TT.sqrt(abs(b ** 2 - 3 * a * derphi0))) / (3.0 * a) phi_a2 = phi(alpha2) end_condition = phi_a2 <= phi0 + c1 * alpha2 * derphi0 end_condition = TT.bitwise_or( TT.isnan(alpha2), end_condition) end_condition = TT.bitwise_or( TT.isinf(alpha2), end_condition) alpha2 = TT.switch( TT.bitwise_or(alpha1 - alpha2 > alpha1 / constant(2.), one - alpha2 / alpha1 < 0.96), alpha1 / constant(2.), alpha2) return [alpha1, alpha2, phi_a1, phi_a2], \ theano.scan_module.until(end_condition) states = [] states += [TT.unbroadcast(TT.shape_padleft(alpha0), 0)] states += [TT.unbroadcast(TT.shape_padleft(alpha1), 0)] states += [TT.unbroadcast(TT.shape_padleft(phi_a0), 0)] states += [TT.unbroadcast(TT.shape_padleft(phi_a1), 0)] # print 'armijo' rvals, _ = scan( armijo, states=states, n_steps=n_iters, name='armijo', mode=theano.Mode(linker='cvm'), profile=profile) sol_scan = rvals[1][0] a_opt = ifelse(csol1, one, ifelse(csol2, alpha1, sol_scan)) score = ifelse(csol1, phi_a0, ifelse(csol2, phi_a1, rvals[2][0])) return a_opt, score
def transform_targets(targets): """Transform targets into a format suitable for passing to cost().""" reshaped = T.shape_padleft(targets) blanks = T.fill(reshaped, _BLANK) result = T.concatenate([blanks, reshaped]).dimshuffle(1, 0, 2).reshape( (2 * targets.shape[0], targets.shape[1])) result = T.concatenate([result, T.shape_padleft(result[0])]) return result
def density_given_previous_a_and_x(x, w, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma,activation_factor, p_prev, a_prev, x_prev,): a = a_prev + x_prev * w h = self.nonlinearity(a * activation_factor) # BxH Alpha = T.nnet.softmax(T.dot(h, V_alpha) + T.shape_padleft(b_alpha)) # BxC Mu = T.dot(h, V_mu) + T.shape_padleft(b_mu) # BxC Sigma = T.exp((T.dot(h, V_sigma) + T.shape_padleft(b_sigma))) # BxC p = p_prev + log_sum_exp(-constantX(0.5) * T.sqr((Mu - x) / Sigma) - T.log(Sigma) - constantX(0.5 * numpy.log(2 * numpy.pi)) + T.log(Alpha)) return (p, a, x)
def density_given_previous_a_and_x(x, w, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma, activations_factor, p_prev, a_prev, x_prev): a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1)) h = self.nonlinearity(a * activations_factor) # BxH Alpha = T.nnet.softmax(T.dot(h, V_alpha) + T.shape_padleft(b_alpha)) # BxC Mu = T.dot(h, V_mu) + T.shape_padleft(b_mu) # BxC Sigma = T.exp((T.dot(h, V_sigma) + T.shape_padleft(b_sigma))) # BxC p = p_prev + log_sum_exp(T.log(Alpha) - T.log(2 * Sigma) - T.abs_(Mu - T.shape_padright(x, 1)) / Sigma) return (p, a, x)
def density_and_gradients(x_i, x_im1, w_i, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma, activation_factor, a_i, lp_accum, dP_da_ip1): B = T.cast(x_i.shape[0], theano.config.floatX) pot = a_i * activation_factor h = self.nonlinearity(pot) # BxH z_alpha = T.dot(h, V_alpha) + T.shape_padleft(b_alpha) z_mu = T.dot(h, V_mu) + T.shape_padleft(b_mu) z_sigma = T.dot(h, V_sigma) + T.shape_padleft(b_sigma) Alpha = T.nnet.softmax(z_alpha) # BxC Mu = z_mu # BxC Sigma = T.exp(z_sigma) # BxC Phi = -T.log( 2 * Sigma) - T.abs_(Mu - T.shape_padright(x_i, 1)) / Sigma wPhi = T.maximum(Phi + T.log(Alpha), constantX(-100.0)) lp_current = log_sum_exp(wPhi) # lp_current_sum = T.sum(lp_current) Pi = T.exp(wPhi - T.shape_padright(lp_current, 1)) # # dp_dz_alpha = Pi - Alpha # BxC # dp_dz_alpha = T.grad(lp_current_sum, z_alpha) gb_alpha = dp_dz_alpha.mean(0, dtype=theano.config.floatX) # C gV_alpha = T.dot(h.T, dp_dz_alpha) / B # HxC # dp_dz_mu = T.grad(lp_current_sum, z_mu) dp_dz_mu = Pi * T.sgn(T.shape_padright(x_i, 1) - Mu) / Sigma # dp_dz_mu = dp_dz_mu * Sigma gb_mu = dp_dz_mu.mean(0, dtype=theano.config.floatX) gV_mu = T.dot(h.T, dp_dz_mu) / B # dp_dz_sigma = T.grad(lp_current_sum, z_sigma) dp_dz_sigma = Pi * (T.abs_(T.shape_padright(x_i, 1) - Mu) / Sigma - 1) gb_sigma = dp_dz_sigma.mean(0, dtype=theano.config.floatX) gV_sigma = T.dot(h.T, dp_dz_sigma) / B dp_dh = T.dot(dp_dz_alpha, V_alpha.T) + T.dot( dp_dz_mu, V_mu.T) + T.dot(dp_dz_sigma, V_sigma.T) # BxH if non_linearity_name == "sigmoid": dp_dpot = dp_dh * h * (1 - h) elif non_linearity_name == "RLU": dp_dpot = dp_dh * (pot > 0) gfact = (dp_dpot * a_i).sum(1).mean( 0, dtype=theano.config.floatX) # 1 dP_da_i = dP_da_ip1 + dp_dpot * activation_factor # BxH gW = T.dot(T.shape_padleft(x_im1, 1), dP_da_i).flatten() / B return (a_i - T.dot(T.shape_padright(x_im1, 1), T.shape_padleft(w_i, 1)), lp_accum + lp_current, dP_da_i, gW, gb_alpha, gV_alpha, gb_mu, gV_mu, gb_sigma, gV_sigma, gfact)
def map_fn(image, image_shape, a, b, location, scale): # apply_inner expects a batch axis image = T.shape_padleft(image) location = T.shape_padleft(location) scale = T.shape_padleft(scale) patch = self.apply_inner(image, location, scale, a, b) # return without batch axis return patch[0]
def map_fn(image, a, b, location, scale): # apply_inner expects a batch axis image = T.shape_padleft(image) location = T.shape_padleft(location) scale = T.shape_padleft(scale) patch = self.apply_inner(image, location, scale, a, b) # return without batch axis return patch[0]
def viterbi(trans_probs): """ :param trans_probs: N * max(L) * D * D tensor """ N = trans_probs.shape[0] D = trans_probs.shape[-1] T1_0 = trans_probs[:, 0, 0] # N * D matrix T2_0 = T.zeros((N, D)) # N * D matrix def step_forward(trans_probs_l, T1_lm1): T1_l = T.max(T.shape_padright(T1_lm1) * trans_probs_l, axis=1) # N * D matrix T2_l = T.argmax(T.shape_padright(T1_lm1) * trans_probs_l, axis=1) # N * D matrix return T.cast(T1_l, 'float32'), T.cast(T2_l, 'float32') ([T1, T2], _) = theano.scan( step_forward, sequences=trans_probs[:, 1:].dimshuffle((1, 0, 2, 3)), outputs_info=[T1_0, None], ) # (max(L)-1) * N * D tensors T1 = T.concatenate([T.shape_padleft(T1_0), T1], axis=0) # max(L) * N * D T2 = T.concatenate([T.shape_padleft(T2_0), T2], axis=0) # max(L) * N * D char_L = T.cast(T.argmax(T1[-1], axis=1), 'float32') # N def step_backward(T2_lp1, char_lp1): char_l = T2_lp1[T.arange(N), T.cast(char_lp1, 'int32')] # N return T.cast(char_l, 'float32') chars, _ = theano.scan( step_backward, sequences=T2[1:][::-1], outputs_info=[char_L], ) # (max(L)-1) * N chars = chars[::-1] # (max(L)-1) * N chars = T.concatenate([chars, T.shape_padleft(char_L)], axis=0).T # N * max(L) probs = get_probs(chars, trans_probs) # N * max(L) return chars, probs # N * max(L) and N * max(L)
def BGAN(discriminator, g_output_logit, n_samples, trng, batch_size=64): d = OrderedDict() d['g_output_logit'] = g_output_logit g_output_logit_ = g_output_logit.reshape((-1, N_WORDS)) d['g_output_logit_'] = g_output_logit_ g_output = T.nnet.softmax(g_output_logit_) g_output = g_output.reshape((batch_size, L_GEN, N_WORDS)) d['g_output'] = g_output p_t = T.tile(T.shape_padleft(g_output), (n_samples, 1, 1, 1)) d['p_t'] = p_t p = p_t.reshape((-1, N_WORDS)) d['p'] = p samples = trng.multinomial(pvals=p).astype(floatX) samples = theano.gradient.disconnected_grad(samples) samples = samples.reshape((n_samples, batch_size, L_GEN, N_WORDS)) d['samples'] = samples D_r = lasagne.layers.get_output(discriminator) D_f = lasagne.layers.get_output( discriminator, samples.reshape((-1, L_GEN, N_WORDS))) D_f_ = D_f.reshape((n_samples, -1, L_GEN)) d.update(D_r=D_r, D_f=D_f, D_f_=D_f_) log_d1 = -T.nnet.softplus(-D_f_) log_d0 = -(D_f_ + T.nnet.softplus(-D_f_)) log_w = D_f_ d.update(log_d1=log_d1, log_d0=log_d0, log_w=log_w) log_N = T.log(log_w.shape[0]).astype(log_w.dtype) log_Z_est = log_sum_exp(log_w - log_N, axis=0) log_Z_est = theano.gradient.disconnected_grad(log_Z_est) d['log_Z_est'] = log_Z_est log_g = (samples * (g_output_logit - log_sum_exp2( g_output_logit, axis=2))[None, :, :, :]).sum(axis=3) d['log_g'] = log_g log_N = T.log(log_w.shape[0]).astype(floatX) log_Z_est = log_sum_exp(log_w - log_N, axis=0) log_w_tilde = log_w - T.shape_padleft(log_Z_est) - log_N w_tilde = T.exp(log_w_tilde) w_tilde_ = theano.gradient.disconnected_grad(w_tilde) d.update(log_w_tilde=log_w_tilde, w_tilde=w_tilde) generator_loss = -(w_tilde_ * log_g).sum(0).mean() discriminator_loss = (T.nnet.softplus(-D_r)).mean() + ( T.nnet.softplus(-D_f)).mean() + D_f.mean() d.update(generator_loss=generator_loss, discriminator_loss=discriminator_loss) return generator_loss, discriminator_loss, D_r, D_f, log_Z_est, d
def get_output_for(self, input, **kwargs): output = T.zeros_like(input) for seg in range(0, self.num_segs): if self.tied_feamap: output += self.basisf(input, self.x_start + self.x_step * seg, self.x_start + self.x_step * (seg + 1)) \ * T.shape_padleft(T.shape_padright(self.W[seg], n_ones = len(input_dim) - 2)) else: output += self.basisf(input, self.x_start + self.x_step * seg, self.x_start + self.x_step * (seg + 1)) \ * T.shape_padleft(self.W[seg]) return output
def multinomial_BGAN(discriminator, g_output_logit, n_samples=None, log_Z=None, batch_size=None, dim_c=None, dim_x=None, dim_y=None): g_output_logit_ = g_output_logit.transpose(0, 2, 3, 1) g_output_logit_ = g_output_logit_.reshape((-1, dim_c)) g_output = T.nnet.softmax(g_output_logit_) g_output = g_output.reshape((batch_size, dim_x, dim_y, dim_c)) p_t = T.tile(T.shape_padleft(g_output), (n_samples, 1, 1, 1, 1)) p = p_t.reshape((-1, dim_c)) samples = trng.multinomial(pvals=p).astype(floatX) samples = theano.gradient.disconnected_grad(samples) samples = samples.reshape((n_samples, batch_size, dim_x, dim_y, dim_c)) samples = samples.transpose(0, 1, 4, 2, 3) real_out = lasagne.layers.get_output(discriminator) fake_out = lasagne.layers.get_output( discriminator, samples.reshape((-1, dim_c, dim_x, dim_y))) log_w = fake_out.reshape((n_samples, -1)) log_g = ((samples * (g_output_logit - log_sum_exp( g_output_logit, axis=1, keepdims=True))[None, :, :, :, :]) .sum(axis=(2, 3, 4))) log_N = T.log(log_w.shape[0]).astype(floatX) log_Z_est = log_sum_exp(log_w - log_N, axis=0) log_w_tilde = log_w - T.shape_padleft(log_Z_est) - log_N w_tilde = T.exp(log_w_tilde) w_tilde_ = theano.gradient.disconnected_grad(w_tilde) generator_loss = -(w_tilde_ * log_g).sum(0).mean() discriminator_loss = (T.nnet.softplus(-real_out)).mean() + ( T.nnet.softplus(-fake_out)).mean() + fake_out.mean() g_results = { 'g loss': generator_loss, 'p(fake==0)': 1. - T.nnet.sigmoid(fake_out).mean(), 'E[logit(fake)]': fake_out.mean(), 'log w': log_w.mean(), 'log w var': log_w.std() ** 2, 'norm w': w_tilde.mean(), 'norm w var': w_tilde.std() ** 2, 'ESS': (1. / (w_tilde ** 2).sum(0)).mean(), } d_results = { 'd loss': discriminator_loss, 'p(real==1)': T.nnet.sigmoid(real_out).mean(), 'E[logit(real)]': real_out.mean(), } return g_results, d_results, log_Z_est
def get_output_for(self, input, **kwargs): output = input * T.gt(input, 0) for seg in range(0, self.num_segs): if self.tied_feamap: output += self.basisf(input, T.shape_padleft(T.shape_padright(self.P[seg], n_ones = len(input_dim) - 2))) \ * T.shape_padleft(T.shape_padright(self.W[seg], n_ones = len(input_dim) - 2)) else: output += self.basisf(input, T.shape_padleft(self.P[seg])) \ * T.shape_padleft(self.W[seg]) return output
def conv1d(vector, kernel, flip=False): """ One-dimensional linear convolution of a vector with a 1d kernel. """ return tt.nnet.conv2d( tt.shape_padleft(vector, 3), tt.shape_padleft(kernel, 3), input_shape=(1, 1, 1, -1), filter_shape=(1, 1, 1, -1), filter_flip=flip, )[0, 0, 0, :]
def __init__(self, rng, input, filter_shape, image_shape, zero_pad=True, poolsize=(2, 2), read_file=False, W_input=None, b_input=None): assert image_shape[1] == filter_shape[1] fan_in = numpy.prod(filter_shape[1:]) fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize)) W_bound = numpy.sqrt(6. / (fan_in + fan_out)) self.W = theano.shared( numpy.asarray( rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), dtype=theano.config.floatX ), borrow=True ) b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) self.b = theano.shared(value=b_values, borrow=True) if read_file==True: self.W = W_input self.b = b_input if zero_pad==True: input=input.transpose(2, 0, 1, 3) input=T.concatenate([T.shape_padleft(T.zeros_like(input[0]), 1), input, T.shape_padleft(T.zeros_like(input[0]), 1)], axis=0) input=input.transpose(1, 2, 0, 3) input=input.transpose(3, 0, 1, 2) input=T.concatenate([T.shape_padleft(T.zeros_like(input[0]), 1), input, T.shape_padleft(T.zeros_like(input[0]), 1)], axis=0) input=input.transpose(1, 2, 3, 0) self.input = input image_shape = (image_shape[0], image_shape[1], image_shape[2]+2, image_shape[3]+2) conv_out = conv.conv2d( input=self.input, filters=self.W, filter_shape=filter_shape, image_shape=image_shape, border_mode='valid' ) pooled_out = downsample.max_pool_2d( input=conv_out, ds=poolsize, ignore_border=True ) self.switch = T.abs_(1 - T.sgn(T.abs_(conv_out - pooled_out.repeat(2, axis=2).repeat(2, axis=3)))) self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) self.params = [self.W, self.b]
def binary_BGAN(discriminator, g_output_logit, n_samples=None, log_Z=None, batch_size=None, dim_c=None, dim_x=None, dim_y=None): '''Discrete BGAN for discrete binary variables. ''' # Sample from a uniform distribution and generate samples over input. R = trng.uniform(size=(n_samples, batch_size, dim_c, dim_x, dim_y), dtype=floatX) g_output = T.nnet.sigmoid(g_output_logit) samples = (R <= T.shape_padleft(g_output)).astype(floatX) # Feed samples through the discriminator. real_out = lasagne.layers.get_output(discriminator) fake_out = lasagne.layers.get_output( discriminator, samples.reshape((-1, dim_c, dim_x, dim_y))) log_w = fake_out.reshape((n_samples, batch_size)) # Get the log probabilities of the samples. log_g = -((1. - samples) * T.shape_padleft(g_output_logit) + T.shape_padleft(T.nnet.softplus(-g_output_logit))).sum( axis=(2, 3, 4)) # Get the normalized weights. log_N = T.log(log_w.shape[0]).astype(floatX) log_Z_est = log_sum_exp(log_w - log_N, axis=0) log_w_tilde = log_w - T.shape_padleft(log_Z_est) - log_N w_tilde = T.exp(log_w_tilde) w_tilde_ = theano.gradient.disconnected_grad(w_tilde) # Losses. generator_loss = -(w_tilde_ * log_g).sum(0).mean() discriminator_loss = (T.nnet.softplus(-real_out)).mean() + ( T.nnet.softplus(-fake_out)).mean() + fake_out.mean() g_results = { 'g loss': generator_loss, 'p(fake==0)': 1. - T.nnet.sigmoid(fake_out).mean(), 'E[logit(fake)]': fake_out.mean(), 'log w': log_w.mean(), 'log w var': log_w.std() ** 2, 'norm w': w_tilde.mean(), 'norm w var': w_tilde.std() ** 2, 'ESS': (1. / (w_tilde ** 2).sum(0)).mean(), } d_results = { 'd loss': discriminator_loss, 'p(real==1)': T.nnet.sigmoid(real_out).mean(), 'E[logit(real)]': real_out.mean(), } return g_results, d_results, log_Z_est
def density_and_gradients(x_i, x_im1, w_i, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma, activation_factor, a_i, lp_accum, dP_da_ip1): B = T.cast(x_i.shape[0], theano.config.floatX) pot = a_i * activation_factor h = self.nonlinearity(pot) # BxH z_alpha = T.dot(h, V_alpha) + T.shape_padleft(b_alpha) z_mu = T.dot(h, V_mu) + T.shape_padleft(b_mu) z_sigma = T.dot(h, V_sigma) + T.shape_padleft(b_sigma) Alpha = T.nnet.softmax(z_alpha) # BxC Mu = z_mu # BxC Sigma = T.exp(z_sigma) # BxC Phi = -T.log(2 * Sigma) - T.abs_(Mu - T.shape_padright(x_i, 1)) / Sigma wPhi = T.maximum(Phi + T.log(Alpha), constantX(-100.0)) lp_current = log_sum_exp(wPhi) # lp_current_sum = T.sum(lp_current) Pi = T.exp(wPhi - T.shape_padright(lp_current, 1)) # # dp_dz_alpha = Pi - Alpha # BxC # dp_dz_alpha = T.grad(lp_current_sum, z_alpha) gb_alpha = dp_dz_alpha.mean(0, dtype=theano.config.floatX) # C gV_alpha = T.dot(h.T, dp_dz_alpha) / B # HxC # dp_dz_mu = T.grad(lp_current_sum, z_mu) dp_dz_mu = Pi * T.sgn(T.shape_padright(x_i, 1) - Mu) / Sigma # dp_dz_mu = dp_dz_mu * Sigma gb_mu = dp_dz_mu.mean(0, dtype=theano.config.floatX) gV_mu = T.dot(h.T, dp_dz_mu) / B # dp_dz_sigma = T.grad(lp_current_sum, z_sigma) dp_dz_sigma = Pi * (T.abs_(T.shape_padright(x_i, 1) - Mu) / Sigma - 1) gb_sigma = dp_dz_sigma.mean(0, dtype=theano.config.floatX) gV_sigma = T.dot(h.T, dp_dz_sigma) / B dp_dh = T.dot(dp_dz_alpha, V_alpha.T) + T.dot(dp_dz_mu, V_mu.T) + T.dot(dp_dz_sigma, V_sigma.T) # BxH if non_linearity_name == "sigmoid": dp_dpot = dp_dh * h * (1 - h) elif non_linearity_name == "RLU": dp_dpot = dp_dh * (pot > 0) gfact = (dp_dpot * a_i).sum(1).mean(0, dtype=theano.config.floatX) # 1 dP_da_i = dP_da_ip1 + dp_dpot * activation_factor # BxH gW = T.dot(T.shape_padleft(x_im1, 1), dP_da_i).flatten() / B return (a_i - T.dot(T.shape_padright(x_im1, 1), T.shape_padleft(w_i, 1)), lp_accum + lp_current, dP_da_i, gW, gb_alpha, gV_alpha, gb_mu, gV_mu, gb_sigma, gV_sigma, gfact)
def density_and_gradients(x_i, x_im1, w_i, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma, activation_factor, a_i, lp_accum, dP_da_ip1): B = T.cast(x_i.shape[0], floatX) pot = a_i * activation_factor h = self.nonlinearity(pot) # BxH z_alpha = T.dot(h, V_alpha) + T.shape_padleft(b_alpha) z_mu = T.dot(h, V_mu) + T.shape_padleft(b_mu) z_sigma = T.dot(h, V_sigma) + T.shape_padleft(b_sigma) Alpha = T.nnet.softmax(z_alpha) # BxC Mu = z_mu # BxC Sigma = T.exp(z_sigma) # BxC Phi = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x_i, 1)) / Sigma) - T.log(Sigma) - constantX(0.5 * numpy.log(2 * numpy.pi)) wPhi = T.maximum(Phi + T.log(Alpha), constantX(-100.0)) lp_current = -log_sum_exp(wPhi) # negative log likelihood # lp_current_sum = T.sum(lp_current) Pi = T.exp(wPhi - T.shape_padright(lp_current, 1)) # # dp_dz_alpha = Pi - Alpha # BxC # dp_dz_alpha = T.grad(lp_current_sum, z_alpha) gb_alpha = dp_dz_alpha.mean(0, dtype=floatX) # C gV_alpha = T.dot(h.T, dp_dz_alpha) / B # HxC dp_dz_mu = -Pi * (Mu - T.shape_padright(x_i, 1)) / T.sqr(Sigma) # dp_dz_mu = T.grad(lp_current_sum, z_mu) dp_dz_mu = dp_dz_mu * Sigma # Heuristic gb_mu = dp_dz_mu.mean(0, dtype=floatX) gV_mu = T.dot(h.T, dp_dz_mu) / B dp_dz_sigma = Pi * (T.sqr(T.shape_padright(x_i, 1) - Mu) / T.sqr(Sigma) - 1) # dp_dz_sigma = T.grad(lp_current_sum, z_sigma) gb_sigma = dp_dz_sigma.mean(0, dtype=floatX) gV_sigma = T.dot(h.T, dp_dz_sigma) / B dp_dh = T.dot(dp_dz_alpha, V_alpha.T) + T.dot(dp_dz_mu, V_mu.T) + T.dot(dp_dz_sigma, V_sigma.T) # BxH if self.hidden_act == "sigmoid": dp_dpot = dp_dh * h * (1 - h) elif self.hidden_act == "ReLU": dp_dpot = dp_dh * (pot > 0) gfact = (dp_dpot * a_i).sum(1).mean(0, dtype=floatX) # 1 dP_da_i = dP_da_ip1 + dp_dpot * activation_factor # BxH gW = T.dot(T.shape_padleft(x_im1, 1), dP_da_i).flatten() / B return (a_i - T.dot(T.shape_padright(x_im1, 1), T.shape_padleft(w_i, 1)), lp_accum + lp_current, dP_da_i, gW, gb_alpha, gV_alpha, gb_mu, gV_mu, gb_sigma, gV_sigma, gfact)
def smooth(filtered_mean, filtered_cov, transition, hidden_noise_mean, hidden_noise_cov): step = backward_step(transition, hidden_noise_mean, hidden_noise_cov) (g, G), _ = theano.scan( step, sequences=[filtered_mean[:-1], filtered_cov[:-1]], outputs_info=[filtered_mean[-1], filtered_cov[-1]], go_backwards=True) g = T.concatenate([T.shape_padleft(filtered_mean[-1]), g]) G = T.concatenate([T.shape_padleft(filtered_cov[-1]), G]) return g[::-1], G[::-1]
def _warp_times(self, t): delta = tt.shape_padleft(t) / tt.shape_padright(self.period, t.ndim) delta += tt.shape_padright(self._base_time, t.ndim) ind = tt.cast(tt.floor(delta), "int64") dt = tt.stack([ttv[tt.clip(ind[i], 0, ttv.shape[0]-1)] for i, ttv in enumerate(self.ttvs)], -1) return tt.shape_padright(t) + dt
def build_NADE(self,v, W, V, b, c): a = T.shape_padright(v) * T.shape_padleft(W) a = a.dimshuffle(1, 0, 2) c_init = c if c.ndim == 1: c_init = T.dot(T.ones((v.shape[0], 1)), T.shape_padleft(c)) (activations, s), updates = theano.scan(lambda V_i, a_i, partial_im1: (a_i + partial_im1, T.dot(V_i, T.nnet.sigmoid(partial_im1.T))), sequences=[V.T, a], outputs_info=[c_init, None]) s = s.T + b y = T.nnet.sigmoid(s) cost = -v*T.log(y) - (1-v)*T.log(1-y) cost = cost.sum() / v.shape[0] return s, y, cost
def _forward_vars(activations, ttargets): """Calculate the CTC forward variables: for each example, a matrix of shape (sequence length, target length) where entry (t,u) corresponds to the log-probability of the network predicting the target sequence prefix [0:u] by time t.""" ttargets = T.cast(ttargets, 'int32') activations = T.log(activations) # For each example, a matrix of shape (seq len, target len) with values # corresponding to activations at each time and sequence position. probs = activations[:, T.shape_padleft(T.arange(activations.shape[1])).T, ttargets] initial_probs = _initial_probabilities( probs.shape[1], ttargets.shape[1]) skip_allowed = _skip_allowed(ttargets) def step(p_curr, p_prev): no_change = p_prev next_label = helpers.right_shift_rows(p_prev, 1, _LOG_ZERO) skip = helpers.right_shift_rows(p_prev + skip_allowed, 2, _LOG_ZERO) return p_curr + _log_add_3(no_change, next_label, skip) probabilities, _ = theano.scan( step, sequences=[probs], outputs_info=[initial_probs] ) return probabilities
def one_hot(self,t, r=None): if r is None: r = T.max(t) + 1 ranges = T.shape_padleft(T.arange(r), t.ndim) return T.cast(T.eq(ranges, T.shape_padright(t, 1)) ,dtype =theano.config.floatX)
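# --- Hedged usage sketch (not part of the original class). one_hot above is a
# method, so a standalone copy of its Theano-only body is reproduced here for
# illustration. ---
import numpy as np
import theano
import theano.tensor as T

def one_hot(t, r=None):
    if r is None:
        r = T.max(t) + 1
    ranges = T.shape_padleft(T.arange(r), t.ndim)
    return T.cast(T.eq(ranges, T.shape_padright(t, 1)), dtype=theano.config.floatX)

t = T.ivector('t')
f = theano.function([t], one_hot(t, r=4))
print(f(np.array([0, 2, 3], dtype='int32')))
# [[1. 0. 0. 0.]
#  [0. 0. 1. 0.]
#  [0. 0. 0. 1.]]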
def predict_sequence(x, lengths, return_all=False, return_memory=False): if x.ndim > 1: outputs_info = [None] + [dict(initial=T.repeat(T.shape_padleft(layer.initial_hidden_state), x.shape[0], axis=0), taps=[-1]) for layer in model.layers if hasattr(layer, 'initial_hidden_state')] else: outputs_info = [None] + [dict(initial=layer.initial_hidden_state, taps=[-1]) for layer in model.layers if hasattr(layer, 'initial_hidden_state')] outputs_info = outputs_info + [None] result, updates = theano.scan(step, sequences = [x.T if x.ndim > 1 else x], outputs_info = outputs_info) if return_all: return result else: res = result[-1].dimshuffle(1, 0, 2) if x.ndim > 1 else result[-1] price_preds = self.price_model.forward( self.model.layers[-2].postprocess_activation( result[-2][lengths, T.arange(0, lengths.shape[0])] ), None, [] )[-1][:,0] if x.ndim > 1 else \ self.price_model.forward( self.model.layers[-2].postprocess_activation( result[-2][-1] ), None, [])[-1][0] # gate values can be obtained by asking for them from the stacked cells if return_memory: return result[0], res, price_preds else: return res, price_preds
def blk_tridag_chol(A, B): ''' Compute the cholesky decompoisition of a symmetric, positive definite block-tridiagonal matrix. Inputs: A - [T x n x n] tensor, where each A[i,:,:] is the ith block diagonal matrix B - [T-1 x n x n] tensor, where each B[i,:,:] is the ith (upper) 1st block off-diagonal matrix Outputs: R - python list with two elements * R[0] - [T x n x n] tensor of block diagonal elements of Cholesky decomposition * R[1] - [T-1 x n x n] tensor of (lower) 1st block off-diagonal elements of Cholesky ''' # Code for computing the cholesky decomposition of a symmetric block tridiagonal matrix def compute_chol(Aip1, Bi, Li, Ci): Ci = T.dot(Bi.T, Tla.matrix_inverse(Li).T) Dii = Aip1 - T.dot(Ci, Ci.T) Lii = Tsla.cholesky(Dii) return [Lii,Ci] L1 = Tsla.cholesky(A[0]) C1 = T.zeros_like(B[0]) # this scan returns the diagonal and off-diagonal blocks of the cholesky decomposition mat, updates = theano.scan(fn=compute_chol, sequences=[A[1:], B], outputs_info=[L1,C1]) mat[0] = T.concatenate([T.shape_padleft(L1), mat[0]]) return mat
def _e_step(psamples, W_list, b_list, n_steps=100, eps=1e-5): """ Performs 'n_steps' of mean-field inference (used to compute positive phase statistics) Parameters ---------- psamples : array-like object of theano shared variables State of each layer of the DBM (during the inference process). psamples[0] points to the input n_steps : integer Number of iterations of mean-field to perform """ depth = len(psamples) new_psamples = [T.unbroadcast(T.shape_padleft(psample)) for psample in psamples] # now alternate mean-field inference for even/odd layers def mf_iteration(*psamples): new_psamples = [p for p in psamples] for i in xrange(1, depth, 2): new_psamples[i] = hi_given(psamples, i, W_list, b_list) for i in xrange(2, depth, 2): new_psamples[i] = hi_given(psamples, i, W_list, b_list) score = 0.0 for i in xrange(1, depth): score = T.maximum(T.mean(abs(new_psamples[i] - psamples[i])), score) return new_psamples, theano.scan_module.until(score < eps) new_psamples, updates = scan(mf_iteration, states=new_psamples, n_steps=n_steps) return [x[0] for x in new_psamples]
def cost(self):
  """
  :rtype: (theano.Variable | None, dict[theano.Variable,theano.Variable] | None)
  :returns: cost, known_grads
  """
  known_grads = None
  if self.loss == 'ce' or self.loss == 'priori':
    if self.attrs.get("target", "").endswith("[sparse:coo]"):
      assert isinstance(self.y, tuple)
      assert len(self.y) == 3
      from NativeOp import crossentropy_softmax_and_gradient_z_sparse
      y_mask = self.network.j[self.attrs.get("target", "").replace("[sparse:coo]", "[sparse:coo:2:0]")]
      ce, grad_z = crossentropy_softmax_and_gradient_z_sparse(
        self.z, self.index, self.y[0], self.y[1], self.y[2], y_mask)
      return self.norm * T.sum(ce), {self.z: grad_z}
    if self.y_data_flat.type == T.ivector().type:
      # Use crossentropy_softmax_1hot to have a more stable and more optimized gradient calculation.
      # Theano fails to use it automatically; I guess our self.i indexing is too confusing.
      #idx = self.index.flatten().dimshuffle(0,'x').repeat(self.y_m.shape[1],axis=1)  # faster than line below
      #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m * idx, y_idx=self.y_data_flat * self.index.flatten())
      nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
      #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)
      #nll = -T.log(T.nnet.softmax(self.y_m)[self.i,self.y_data_flat[self.i]])
      #z_c = T.exp(self.z[:,self.y])
      #nll = -T.log(z_c / T.sum(z_c,axis=2,keepdims=True))
      #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)
      #nll = T.set_subtensor(nll[self.j], T.constant(0.0))
    else:
      nll = -T.dot(T.log(T.clip(self.p_y_given_x[self.i], 1.e-38, 1.e20)), self.y_data_flat[self.i].T)
    return self.norm * T.sum(nll), known_grads
  elif self.loss == 'entropy':
    h_e = T.exp(self.y_m)  # (TB)
    pcx = T.clip((h_e / T.sum(h_e, axis=1, keepdims=True)).reshape((self.index.shape[0], self.index.shape[1], self.attrs['n_out'])), 1.e-6, 1.e6)  # TBD
    ee = -T.sum(pcx[self.i] * T.log(pcx[self.i]))  # TB
    #nll, pcxs = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y[self.i])
    nll, _ = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)  # TB
    ce = nll.reshape(self.index.shape) * self.index  # TB
    y = self.y_data_flat.reshape(self.index.shape) * self.index  # TB
    f = T.any(T.gt(y, 0), axis=0)  # B
    return T.sum(f * T.sum(ce, axis=0) + (1 - f) * T.sum(ee, axis=0)), known_grads
    #return T.sum(T.switch(T.gt(T.sum(y,axis=0),0), T.sum(ce, axis=0), -T.sum(ee, axis=0))), known_grads
    #return T.switch(T.gt(T.sum(self.y_m[self.i]),0), T.sum(nll), -T.sum(pcx * T.log(pcx))), known_grads
  elif self.loss == 'priori':
    pcx = self.p_y_given_x[self.i, self.y_data_flat[self.i]]
    pcx = T.clip(pcx, 1.e-38, 1.e20)  # For pcx near zero, the gradient will likely explode.
    return -T.sum(T.log(pcx)), known_grads
  elif self.loss == 'sse':
    if self.y_data_flat.dtype.startswith('int'):
      y_f = T.cast(T.reshape(self.y_data_flat, (self.y_data_flat.shape[0] * self.y_data_flat.shape[1]), ndim=1), 'int32')
      y_oh = T.eq(T.shape_padleft(T.arange(self.attrs['n_out']), y_f.ndim), T.shape_padright(y_f, 1))
      return T.mean(T.sqr(self.p_y_given_x[self.i] - y_oh[self.i])), known_grads
    else:
      #return T.sum(T.sum(T.sqr(self.y_m - self.y.reshape(self.y_m.shape)), axis=1)[self.i]), known_grads
      return T.sum(T.sqr(self.y_m[self.i] - self.y_data_flat.reshape(self.y_m.shape)[self.i])), known_grads
      #return T.sum(T.sum(T.sqr(self.z - (self.y.reshape((self.index.shape[0], self.index.shape[1], self.attrs['n_out']))[:self.z.shape[0]])), axis=2).flatten()[self.i]), known_grads
      #y_z = T.set_subtensor(T.zeros((self.index.shape[0],self.index.shape[1],self.attrs['n_out']), dtype='float32')[:self.z.shape[0]], self.z).flatten()
      #return T.sum(T.sqr(y_z[self.i] - self.y[self.i])), known_grads
      #return T.sum(T.sqr(self.y_m - self.y[:self.z.shape[0]*self.index.shape[1]]).flatten()[self.i]), known_grads
  else:
    assert False, "unknown loss: %s" % self.loss
def project_into_l2_ball(arr, radius=1): """Return ``arr`` projected into the L2 ball. Parameters ---------- arr : Theano variable Array of shape either ``(n, d)`` or ``(d,)``. If the former, all rows are projected individually. radius : float, optional [default: 1] Returns ------- res : Theano variable Projected result of the same shape as ``arr``. """ # Distinguish whether we are given a single or many vectors to work upon. batch = arr.ndim == 2 if not batch: arr = T.shape_padleft(arr) lengths = T.sqrt((arr ** 2).sum(axis=1)).dimshuffle(0, 'x') arr = T.switch(lengths > T.sqrt(radius), arr / lengths * radius, arr) if not batch: arr = arr[0] return arr
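# --- Hedged usage sketch for project_into_l2_ball above (default radius=1,
# not part of the original module). Rows longer than the radius are rescaled
# onto the ball; shorter rows pass through unchanged. ---
import numpy as np
import theano
import theano.tensor as T

a = T.matrix('a')
f = theano.function([a], project_into_l2_ball(a))
v = np.array([[3., 4.],
              [0.3, 0.4]], dtype=theano.config.floatX)
print(f(v))  # first row becomes [0.6, 0.8]; second row is unchanged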
def local_gpualloc(node):
    replace = False
    if node.op == tensor.alloc:
        if node.inputs[0].owner and node.inputs[0].owner.op == host_from_gpu:
            replace = True
        elif all([c != 'output' and c.op == gpu_from_host
                  for c, idx in node.outputs[0].clients]):
            replace = True
        elif all([c != 'output' and
                  c.op == tensor.join and
                  all([i.owner and
                       i.owner.op in [host_from_gpu, tensor.alloc]
                       for i in c.inputs[1:]])
                  for c, idx in node.outputs[0].clients]):
            replace = True
    if replace:
        val = node.inputs[0]
        shp = node.inputs[1:]
        old_out = node.outputs[0]
        val2 = tensor.shape_padleft(val, len(shp) - val.ndim)
        new_out = host_from_gpu(gpu_alloc(val, *shp))
        if new_out.type != old_out.type:
            assert new_out.type.ndim == old_out.type.ndim
            assert new_out.type.dtype == old_out.type.dtype
            for b_old, b_new in zip(old_out.type.broadcastable,
                                    new_out.type.broadcastable):
                assert b_new or (not b_old)
            new_out = tensor.patternbroadcast(new_out, old_out.broadcastable)
        return [new_out]