def KS_distance(expected, observed, mode='D'):
    """
    A symbolic Theano expression for the Kolmogorov-Smirnov statistical distance.

    Note: this implementation uses `cumsum`. The Theano implementation of
    `cumsum` falls back to the NumPy one, so no accelerated code is generated.

    :param expected: 1D tensor, the expected (canonical) distribution.
    :param observed: 2D tensor; the first dimension is spatial, the second
        holds the probabilities of the empirical distribution.
    :param mode: 'D' for the two-sided KS distance, 'D+' and 'D-' for the
        one-sided variants.
    :return: symbolic expression for the Kolmogorov-Smirnov distance.
    """
    expected_ecpf = T.cumsum(expected)
    observed_ecpf = T.cumsum(observed, axis=1)
    if mode == 'D':
        difference = abs(observed_ecpf - expected_ecpf[None, :])
    elif mode == 'D+':
        difference = T.maximum(observed_ecpf - expected_ecpf[None, :], np.float32(0.0))
    elif mode == 'D-':
        difference = T.maximum(expected_ecpf[None, :] - observed_ecpf, np.float32(0.0))
    else:
        raise ValueError('Unknown mode for KS distance: %s' % mode)
    return T.max(difference, axis=1)
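# A minimal usage sketch for KS_distance (the names below are illustrative,
# not from the original source): compile the expression and evaluate one
# empirical row against a uniform expectation. Assumes `import numpy as np`,
# `import theano`, and `import theano.tensor as T`, as the function above does.
expected = T.vector('expected')
observed = T.matrix('observed')
ks = theano.function([expected, observed], KS_distance(expected, observed, mode='D'))
p_exp = np.full(4, 0.25).astype(theano.config.floatX)
p_obs = np.array([[0.1, 0.2, 0.3, 0.4]], dtype=theano.config.floatX)
print(ks(p_exp, p_obs))  # one KS distance per row of `observed`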
def findalpha2(D, W):
    from theano.ifelse import ifelse
    W = T.flatten(W)
    D = T.flatten(D)
    # the positive part
    n1 = T.sum(T.gt(W, 0.))
    ind1 = T.argsort(W)[::-1]
    cum_DW1 = T.cumsum(T.abs_(D * W)[ind1])
    cum_D1 = T.cumsum(D[ind1])
    c1 = cum_DW1 / cum_D1 / 2
    # tmp = W[ind] - cum_DW_D
    mask1 = T.lt(
        (W[ind1][0:n1 - 1] - c1[0:n1 - 1]) * (W[ind1][1:n1] - c1[0:n1 - 1]), 0)
    thr1 = c1[mask1.nonzero()][T.argmax(
        c1[mask1.nonzero()] * c1[mask1.nonzero()] * cum_D1[mask1.nonzero()])]
    thres1 = ifelse(T.gt(mask1.nonzero()[0].shape[0], 0), thr1, 0.7 * c1[n1 - 1])
    # the negative part
    n2 = T.sum(T.lt(W, 0.))
    ind2 = ind1[::-1]
    cum_DW2 = T.cumsum(T.abs_(D * W)[ind2])
    cum_D2 = T.cumsum(D[ind2])
    c2 = cum_DW2 / cum_D2 / 2
    mask2 = T.lt(
        (-W[ind2][0:n2 - 1] - c2[0:n2 - 1]) * (-W[ind2][1:n2] - c2[0:n2 - 1]), 0)
    thr2 = c2[mask2.nonzero()][T.argmax(
        c2[mask2.nonzero()] * c2[mask2.nonzero()] * cum_D2[mask2.nonzero()])]
    thres2 = ifelse(T.gt(mask2.nonzero()[0].shape[0], 0), thr2, 0.7 * c2[n2 - 1])
    return thres1, thres2
def config_beta_updates(self, beta_lr=1e-3):
    # determine fraction of up-moving particles at each temperature
    fup = self.nup / (self.nup + self.ndown)
    self.get_fup = theano.function([], fup)

    # cost function: C = \sum_i (fup_{i+1} - fup_i)^2
    # \frac{dC}{d\lambda_i} = \sum_{j >= i}
    #     \frac{dC}{df_j}
    #     \frac{df_j}{d\beta_j}
    #     \frac{d\beta_j}{d\delta\beta_i}    (= -1)
    #     \frac{d\delta\beta_i}{d\lambda_i}  (= exp(\lambda_i))

    # vectors of length n_beta-2
    f_i = fup[1:-1]
    f_im1 = fup[:-2]
    f_ip1 = fup[2:]

    ## \frac{dC}{df_j} ##
    # vector of length n_beta-2
    dc_df = 2*(f_i - f_im1) - 2*(f_ip1 - f_i)

    ## \frac{df_j}{d\beta_j}: estimate it from empirical data ##
    # vector of length n_beta-1
    df_db = (fup[1:] - fup[:-1]) / (self.betas[1:] - self.betas[:-1] + 1e-3)
    # vector of length n_beta-2
    df_db_avg = (df_db[1:] + df_db[:-1]) / 2.

    dc_dlambda = T.cumsum(dc_df * df_db_avg * -1 * T.exp(self.lambdas))

    # gradient-based beta update
    new_lambdas = self.lambdas - beta_lr * dc_dlambda
    updates = {self._lambdas: T.set_subtensor(self._lambdas[:self.n_beta-2], new_lambdas)}
    self.grad_update_betas = theano.function([], new_lambdas, updates=updates)
def ewma(series, axis=None, span=VOL_WINDOW_SPAN, adjust=True, initial=None):
    """
    Exponentially-weighted moving average.
    """
    if axis is None:
        if series.ndim == 1:
            axis = 0
        else:
            raise ValueError("Please specify which axis to compute ewma over (usually the time axis)")
    assert span >= 1
    alpha = 2. / (span + 1)
    series = T.swapaxes(series, axis, 0)
    if adjust:
        assert initial is None
        initial = T.zeros_like(series[0])
    else:
        if initial is None:
            initial = series[0]
        initial /= alpha

    def ewma_numerator_step(a_i, prev_ewma):
        return a_i + (1. - alpha) * prev_ewma

    ewma_numerators, _ = theano.scan(ewma_numerator_step, series,
                                     outputs_info=initial, strict=True)
    if adjust:
        ewma_denominators = T.cumsum((1 - alpha) ** T.arange(ewma_numerators.shape[0]))
        series_ewma = ewma_numerators / ewma_denominators.reshape(
            (-1,) + (1,) * (ewma_numerators.ndim - 1))
    else:
        series_ewma = ewma_numerators * alpha
    series_ewma = T.swapaxes(series_ewma, 0, axis)
    return series_ewma
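# Hedged usage sketch for ewma on a 1-D series. VOL_WINDOW_SPAN is defined
# wherever the original function lives, so `span` is passed explicitly here;
# numpy/theano imports are assumed as above.
x = T.vector('x')
smooth = theano.function([x], ewma(x, span=10))
print(smooth(np.arange(20).astype(theano.config.floatX)))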
def sample_from_distribution(p, srng):
    # p: (batch, n_classes) rows of categorical probabilities
    assert p.ndim == 2
    cs = T.cumsum(p, axis=1)
    rnd = srng.uniform(low=0., high=1., dtype='float32', size=(p.shape[0],))
    # inverse-CDF sampling: count cumulative probabilities below the uniform draw
    sel = T.sum(T.gt(rnd.dimshuffle((0, 'x')), cs), axis=1)
    sel = T.clip(sel, 0, p.shape[1] - 1)
    return T.cast(sel, 'int32')
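# Sketch of drawing one categorical sample per row (assumes rows of `p` are
# already normalized; MRG_RandomStreams is one possible choice of `srng`).
from theano.sandbox.rng_mrg import MRG_RandomStreams
srng = MRG_RandomStreams(seed=42)
p = T.fmatrix('p')
draw = theano.function([p], sample_from_distribution(p, srng))
print(draw(np.array([[0.2, 0.5, 0.3]], dtype='float32')))  # index in {0, 1, 2}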
def log_likelihood_sym_1traj_GPOMDP(self, x_var, dist_info_vars):
    means = dist_info_vars["mean"]
    log_stds = dist_info_vars["log_std"]
    zs = (x_var - means) / TT.exp(log_stds)
    # cumulative log-likelihood of a diagonal Gaussian along the trajectory
    return TT.cumsum(- TT.sum(log_stds, axis=-1)
                     - 0.5 * TT.sum(TT.square(zs), axis=-1)
                     - 0.5 * means.shape[-1] * np.log(2 * np.pi))
def logp(self, x):
    n = self.n
    eta = self.eta
    diag_idxs = self.diag_idxs
    cumsum = tt.cumsum(x ** 2)
    variance = tt.zeros(n)
    variance = tt.inc_subtensor(variance[0], x[0] ** 2)
    variance = tt.inc_subtensor(
        variance[1:],
        cumsum[diag_idxs[1:]] - cumsum[diag_idxs[:-1]])
    sd_vals = tt.sqrt(variance)

    logp_sd = self.sd_dist.logp(sd_vals).sum()
    corr_diag = x[diag_idxs] / sd_vals

    logp_lkj = (2 * eta - 3 + n - tt.arange(n)) * tt.log(corr_diag)
    logp_lkj = tt.sum(logp_lkj)

    # Compute the log det jacobian of the second transformation
    # described in the docstring.
    idx = tt.arange(n)
    det_invjac = tt.log(corr_diag) - idx * tt.log(sd_vals)
    det_invjac = det_invjac.sum()

    norm = _lkj_normalizing_constant(eta, n)
    return norm + logp_lkj + logp_sd + det_invjac
def scan_gen(self, rng, x0, *params):
    # unpack params: per-LSTM initial states, input embeddings, output weights/bias
    idx = 0
    h0s = params[idx:idx + len(self.lstms)]
    idx += len(self.lstms)
    xembed = params[idx]
    idx += 1
    yw = params[idx]
    idx += 1
    yb = params[idx]
    idx += 1
    xe = xembed[x0, :]
    y0 = xe
    h1s = []
    for i, lstm in enumerate(self.lstms):
        p = params[idx:idx + len(lstm.recurrent_params)]
        idx += len(lstm.recurrent_params)
        h1, y1 = lstm.step(xs=[y0], h0=h0s[i], params=p)
        h1s.append(h1)
        y0 = y1
    p1 = softmax_nd(T.dot(y0, yw) + yb)
    # sample the next token by inverse-CDF: count cumulative probabilities
    # below the uniform draw
    cs = T.cumsum(p1, axis=1)
    x1 = T.sum(T.gt(rng.dimshuffle((0, 'x')), cs), axis=1)
    x1 = T.clip(x1, 0, cs.shape[1] - 1)
    x1 = T.cast(x1 + 1, 'int32')
    assert idx == len(params)
    return [x1] + h1s
def reportDelayDistFunc(cases, mu1, sig1, mu2, sig2, r, n):
    m1 = tt.cast(mu1, 'float64')
    s1 = tt.cast(sig1, 'float64')
    m2 = tt.cast(mu2, 'float64')
    s2 = tt.cast(sig2, 'float64')
    sr = tt.cast(r, 'float64')
    n = tt.cast(n, 'int64')
    x = tt.arange(1, n + 1)

    # Prepare the distributions; keep the mixture weight away from 0 and 1
    sr = tt.clip(sr, 1e-12, 1 - 1e-12)
    d1 = tt_lognormal(x, tt.log(m1), s1)
    d2 = tt_lognormal(x, tt.log(m2), s2)
    d1 = tt.alloc(d1, 1, d1.shape[0])
    d2 = tt.alloc(d2, 1, d2.shape[0])

    # Prepare cases as the diagonal of a matrix
    cin = tt.cast(cases, 'float64')
    c2d = tt.nlinalg.alloc_diag(cin)

    # Convolve each day's cases with the two delay kernels and mix them
    cf1 = tt.signal.conv.conv2d(c2d, d1, border_mode='full')
    cf2 = tt.signal.conv.conv2d(c2d, d2, border_mode='full')
    cfo = (sr * cf1.T + (tt.ones_like(sr) - sr) * cf2.T).T

    reported = tt.cumsum(cfo, axis=1)
    return reported, cfo  # , dists
def stick_breaking_log(u):
    """Return log of weights from a stick-breaking process."""
    lu = tns.concatenate((tns.log(u), [0.0]))
    cs = tns.concatenate(([0.0], tns.cumsum(tns.log1p(-u))))
    lw = lu + cs
    return lw
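# Quick property check for stick_breaking_log (illustrative; assumes `tns`
# is theano.tensor and `theano`/`np` are imported): exponentiating the
# log-weights yields a proper weight vector, since the last entry absorbs
# the leftover stick.
u = tns.vector('u')
weights = theano.function([u], tns.exp(stick_breaking_log(u)))
w = weights(np.array([0.5, 0.5, 0.5], dtype=theano.config.floatX))
print(w, w.sum())  # [0.5, 0.25, 0.125, 0.125], sums to 1.0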
def __init__(self, n_classes, n_features):
    # online learning, one row at a time
    X = T.dvector('x')
    # the output class is an integer
    Y = T.iscalar('y')
    W = theano.shared(np.zeros((n_classes, n_features)))
    self.params = [W]
    z = T.dot(X, W.T)          # (n_classes,)
    scores = T.cumsum(z)       # cumulative scores over classes
    output = T.argmax(scores)  # predicted class index
    self.pred = theano.function([X], output)
    # loss function
    L = T.sum(scores[output] - scores[Y])
    # error count
    err = T.sum(T.neq(output, Y))
    # compute gradient
    gW = T.grad(L, W)
    # update
    updates = [(W, W - gW)]
    self.train = theano.function([X, Y], [L, err], updates=updates)
    self.err = theano.function([X, Y], err)
def get_output_for(self, policy, greedy=False, **kwargs):
    if greedy:
        # greedy branch
        chosen_action_ids = T.argmax(policy, axis=-1).astype(self.output_dtype)
    else:
        if self.assume_normalized:
            probas = policy
        else:
            probas = policy / T.sum(policy, axis=-1, keepdims=True)

        # p1, p1+p2, p1+p2+p3, ..., 1
        cum_probas = T.cumsum(probas, axis=-1)

        rnd_shape = T.stack([*policy.shape[:-1], 1])
        batch_randomness = self.rng.uniform(low=0., high=1., size=rnd_shape)
        batch_randomness = T.repeat(batch_randomness, policy.shape[-1] - 1, axis=-1)

        chosen_action_ids = T.sum(
            (batch_randomness > cum_probas[:, :, :-1]),
            axis=-1, dtype=self.output_dtype)
    return chosen_action_ids
def compute_output(self, network, in_vw):
    axis = network.find_hyperparameter(["axis"])
    network.create_vw(
        "default",
        variable=T.cumsum(in_vw.variable, axis=axis),
        shape=in_vw.shape,
        tags={"output"},
    )
def mask_for_prediction(self, prediction):
    # keep steps up to and including the first EOS label, zero out the rest
    prediction_mask = tensor.lt(
        tensor.cumsum(tensor.eq(prediction, self.eos_label)
                      .astype(theano.config.floatX), axis=0),
        1).astype(theano.config.floatX)
    prediction_mask = tensor.roll(prediction_mask, 1, 0)
    prediction_mask = tensor.set_subtensor(
        prediction_mask[0, :], tensor.ones_like(prediction_mask[0, :]))
    return prediction_mask
def get_mask_by_eos(is_eos):
    """Takes an indicator of "it ends now" and returns a mask.
    Ignores everything after the first end.

    :param is_eos: indicator that is 1 at end-of-sequence positions and 0 elsewhere
    :type is_eos: theano.matrix
    """
    assert is_eos.ndim == 2
    is_right_after_eos = T.concatenate([T.zeros_like(is_eos[:, :1]), is_eos[:, :-1]], -1)
    is_after_eos = T.eq(T.cumsum(is_right_after_eos, axis=-1), 0).astype('uint8')
    return is_after_eos
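# Toy check for get_mask_by_eos (imports assumed as above): positions up to
# and including the first end-of-sequence marker stay 1, everything after is 0.
is_eos = T.imatrix('is_eos')
mask_fn = theano.function([is_eos], get_mask_by_eos(is_eos))
print(mask_fn(np.array([[0, 0, 1, 0, 1]], dtype='int32')))  # [[1 1 1 0 0]]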
def cumulative_sum(tensor, axis=-1):
    """Keras' backend does not have tf.cumsum(), so we add it here."""
    if K.backend() == 'tensorflow':
        import tensorflow as tf
        return tf.cumsum(tensor, axis=axis)
    else:
        import theano.tensor as T
        return T.cumsum(tensor, axis=axis)
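# Illustrative use of the backend shim inside a Keras model (the Lambda layer
# here is hypothetical glue, not part of the original snippet): a running
# total along the time axis of a (batch, time, features) tensor, regardless
# of whether Theano or TensorFlow is the backend.
from keras.layers import Lambda
running_total = Lambda(lambda t: cumulative_sum(t, axis=1))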
def mixed_generate(self, return_initial_states=True, **kwargs):
    critic = self.generator.readout.critic
    groundtruth = kwargs.pop('groundtruth')
    groundtruth_mask = kwargs.pop('groundtruth_mask')
    step = kwargs.pop('step')

    sampling_inputs = dict_subset(
        kwargs, self.generator.readout.sample.inputs)
    actor_scores = self.generator.readout.scores(**sampling_inputs)

    critic_inputs = {
        name: kwargs['critic_' + name]
        for name in critic.generator.readout.merge_names}
    critic_outputs = critic.generator.readout.outputs(
        groundtruth, groundtruth_mask, **critic_inputs)

    epsilon = numpy.array(self.generator.readout.epsilon,
                          dtype=theano.config.floatX)
    actor_probs = tensor.exp(actor_scores)
    # This is a poor man's 1-hot argmax
    critic_probs = self.softmax.apply(critic_outputs * 1000)
    probs = (actor_probs * (tensor.constant(1) - epsilon)
             + critic_probs * epsilon)

    x = self.theano_rng.uniform(size=(probs.shape[0],))
    samples = (tensor.gt(x[:, None], tensor.cumsum(probs, axis=1))
               .astype(theano.config.floatX)
               .sum(axis=1)
               .astype('int64'))
    samples = tensor.minimum(samples, probs.shape[1] - 1)

    actor_feedback = self.generator.feedback.apply(samples, as_dict=True)
    actor_states_contexts = dict_subset(
        kwargs,
        self.generator.recurrent.apply.states
        + self.generator.recurrent.apply.contexts)
    actor_states_outputs = self.generator.recurrent.apply(
        as_dict=True, iterate=False,
        **dict_union(actor_feedback, actor_states_contexts))

    critic_feedback = critic.generator.feedback.apply(samples, as_dict=True)
    critic_states_contexts = {
        name: kwargs['critic_' + name]
        for name in critic.generator.recurrent.apply.states
        + critic.generator.recurrent.apply.contexts}
    critic_apply_kwargs = dict(
        as_dict=True, iterate=False,
        **dict_union(critic_feedback, critic_states_contexts))
    if self.generator.readout.critic_uses_actor_states:
        critic_apply_kwargs['extra_inputs'] = actor_states_outputs['states']
    critic_states_outputs = critic.generator.recurrent.apply(**critic_apply_kwargs)
    return ([samples, step + 1]
            + actor_states_outputs.values()
            + critic_states_outputs.values())
def _get_hidden_layer_connectivity(self, layerIdx):
    layer_size = self._hidden_sizes[layerIdx]
    if layerIdx == 0:
        p_vals = self._get_p(T.min(self.layers_connectivity[layerIdx]))
    else:
        p_vals = self._get_p(T.min(self.layers_connectivity_updates[layerIdx - 1]))

    # Implementations of np.choose on the theano GPU:
    # return T.nonzero(self._mrng.multinomial(pvals=[self._p_vals] * layer_size, dtype=theano.config.floatX))[1].astype(dtype=theano.config.floatX)
    # return T.argmax(self._mrng.multinomial(pvals=[self._p_vals] * layer_size, dtype=theano.config.floatX), axis=1)
    return T.sum(T.cumsum(self._mrng.multinomial(
        pvals=T.tile(p_vals[::-1][None, :], (layer_size, 1)),
        dtype=theano.config.floatX), axis=1), axis=1)
def _calc_rewards(self, symbolic_batch):
    assert symbolic_batch.ndim == 2
    rewards = T.eq(self.target_idxs_shared[None, None, :],
                   symbolic_batch[:, :, None]).any(-1)
    rewards = T.cast(rewards, 'int32')
    assert rewards.ndim == 2

    # Find EOS_ix in batch
    done_mask = T.eq(symbolic_batch, self.vocab.EOS_ix)
    # Set done == True for all words after EOS_ix
    done_mask = T.concatenate(
        [T.zeros_like(done_mask[:, :1]), done_mask[:, :-1]], axis=1)
    is_alive = T.eq(T.cumsum(done_mask, axis=1), 0).astype('uint8')
    return -rewards, is_alive
def pool(self, inputs):
    '''Convert the inputs into a fractionally max-pooled output tensor.

    Implementation adapted from ebenolson:
    https://github.com/Lasagne/Lasagne/pull/171
    '''
    _, _, n_in0, n_in1 = self.input_shape
    n_out0 = fractional_conv_output_length(n_in0, self.pool_size[0])
    n_out1 = fractional_conv_output_length(n_in1, self.pool_size[1])

    # Variable stride across the input creates fractional reduction.
    a = theano.shared(
        np.array([2] * (n_in0 - n_out0) + [1] * (2 * n_out0 - n_in0)))
    b = theano.shared(
        np.array([2] * (n_in1 - n_out1) + [1] * (2 * n_out1 - n_in1)))

    # Randomize the input strides.
    a = theano_shuffled(a)
    b = theano_shuffled(b)

    # Convert to input positions, starting at 0.
    a = T.concatenate(([0], a[:-1]))
    b = T.concatenate(([0], b[:-1]))
    a = T.cumsum(a)
    b = T.cumsum(b)

    # Positions of the other corners.
    c = T.clip(a + 1, 0, n_in0 - 1)
    d = T.clip(b + 1, 0, n_in1 - 1)

    # Index the four positions in the pooling window and stack them.
    temp = T.stack(inputs[:, :, a, :][:, :, :, b],
                   inputs[:, :, c, :][:, :, :, b],
                   inputs[:, :, a, :][:, :, :, d],
                   inputs[:, :, c, :][:, :, :, d])
    out = T.max(temp, axis=0)
    return out
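# The cumsum stride trick above in plain numpy, for intuition (sizes are
# illustrative): a shuffled mix of strides 2 and 1 becomes the top-left
# corners of the pooling windows.
strides = np.random.permutation([2] * 3 + [1] * 2)   # n_in=8 -> n_out=5
starts = np.cumsum(np.concatenate(([0], strides[:-1])))
print(starts)  # e.g. [0 2 3 5 6]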
def sample_categorical(rng, p, axis=-1, values=None):
    """
    p is an n-d array whose final dimension is a discrete distribution (it
    does not need to be normalized). Sample from that distribution.
    This will return an array of shape p.shape[:-1] with values in range
    [0, p.shape[-1]).

    :param rng: A theano shared_randomstreams.RandomStream object
    :param p: An ndarray of arbitrary shape, where the values along (axis) are
        interpreted as an unnormalized discrete probability distribution (so
        if p.shape[2]==5, the variable can take on 5 possible values).
    :param axis: The axis which we consider to be the distribution (only -1
        (the last axis) is supported for now).
    :param values: The values of the variable. len(values) must equal
        p.shape[axis]. If not included, the values will be considered to be
        integers in range(0, p.shape[axis]).
    """
    # TODO: assert no negative values in p / assert p normalized along axis instead of dividing
    # TODO: assert len(values) == p.shape[axis]
    assert axis == -1, 'Currently you can only sample along the last axis.'
    p = p / tt.sum(p, axis=axis, keepdims=True)
    # TODO: Check that different RNGs are doing the same thing!
    if isinstance(rng, TensorVariable):
        # Externally generated random numbers - we receive the maximum number
        # of uniform random numbers we could need, and then generate samples
        # from those.
        old_p_shape = p.shape
        random_numbers = rng[:p.size].reshape((p.size, 1))
        cumulative_prob_mass = tt.cumsum(p.reshape((-1, p.shape[-1])), axis=1)
        samples = random_numbers < cumulative_prob_mass
        samples = samples.reshape(old_p_shape)
    elif isinstance(rng, MRG_RandomStreams):
        # MRG_RandomStreams is faster but only works for 2-d pvals, so we have
        # to reshape and then unreshape.
        old_p_shape = p.shape
        samples = rng.multinomial(n=1, pvals=p.reshape((-1, p.shape[-1])))
        samples = samples.reshape(old_p_shape)
    elif isinstance(rng, CURAND_RandomStreams):
        # TODO: Make this work if possible - problem now is it needs to know shape in advance
        raise NotImplementedError("Curand doesn't work yet.")
        cumulative_prob_mass = np.cumsum(p, axis=axis)
        samples = rng.uniform(
            size=tt.set_subtensor(p.shape[axis], 1)) > cumulative_prob_mass
    else:
        samples = tt.switch(
            tt.eq(p.size, 0),
            tt.zeros(p.shape),
            rng.multinomial(n=1, pvals=tt.switch(tt.eq(p.size, 0), 1, p)))
    # Argmax is just a way to find the location of the only element that is 1.
    indices = tt.argmax(samples, axis=-1)
    if values is not None:
        return values[indices]
    return indices
def mask_for_prediction(self, prediction, groundtruth_mask=None,
                        extra_generation_steps=None):
    prediction_mask = tensor.lt(
        tensor.cumsum(tensor.eq(prediction, self.eos_label)
                      .astype(theano.config.floatX), axis=0),
        1).astype(theano.config.floatX)
    prediction_mask = tensor.roll(prediction_mask, 1, 0)
    prediction_mask = tensor.set_subtensor(
        prediction_mask[0, :], tensor.ones_like(prediction_mask[0, :]))
    if groundtruth_mask is not None:
        max_lengths = groundtruth_mask.sum(axis=0) + extra_generation_steps
        prediction_mask *= tensor.lt(
            tensor.arange(prediction.shape[0])[:, None],
            max_lengths[None, :])
    return prediction_mask
def log_z_given_v(self, v):
    Wx_plusb = T.dot(v, self.W.T) + self.b
    energies = T.nnet.softplus(Wx_plusb)  # Sum over h'
    if self.penalty == "softplus_bi":
        energies -= self.beta * T.nnet.softplus(self.b)  # Add penalty term
    elif self.penalty == "softplus0":
        energies -= self.beta * T.nnet.softplus(0)  # Add penalty term
    else:
        raise NameError("Invalid penalty term")
    energies = T.cumsum(energies, axis=1)  # Cumsum over z
    return energies
def findalpha(D, W):
    from theano.ifelse import ifelse
    W = T.abs_(T.flatten(W))
    D = T.flatten(D)
    # sorted_W = T.sort(W)[::-1]
    ind = T.argsort(W)[::-1]
    cum_DW = T.cumsum(T.abs_(D * W)[ind])
    cum_D = T.cumsum(D[ind])
    cum_DW_D = cum_DW / cum_D / 2
    # tmp = W[ind] - cum_DW_D
    tmp = W[ind][:-1] - cum_DW_D[:-1]
    tmp1 = W[ind][1:] - cum_DW_D[:-1]
    tmp3 = tmp * tmp1
    mask = T.lt(tmp3, 0)
    tmp4 = mask.nonzero()[0].shape[0]
    tmp5 = cum_DW_D[mask.nonzero()] * cum_DW_D[mask.nonzero()] * cum_D[mask.nonzero()]
    bb = cum_DW_D[mask.nonzero()][T.argmax(tmp5)]
    thres = ifelse(T.gt(tmp4, 0), bb, 0.7 * cum_DW_D[-1])
    return thres
def get_output_for(self, input, **kwargs):
    # _, _, n_in0, n_in1 = self.input_shape
    # _, _, n_out0, n_out1 = self.get_output_shape()
    # Variable stride across the input creates fractional reduction
    # a = theano.shared(
    #     np.array([2] * (n_in0 - n_out0) + [1] * (2 * n_out0 - n_in0), dtype=np.int8),
    #     borrow=True)
    # b = theano.shared(
    #     np.array([2] * (n_in1 - n_out1) + [1] * (2 * n_out1 - n_in1), dtype=np.int8),
    #     borrow=True)
    self.a_shared.set_value(self.a_init, borrow=True)
    self.b_shared.set_value(self.b_init, borrow=True)
    a, b = self.a_shared, self.b_shared

    # Randomize the input strides
    a = self._theano_shuffled(a)
    b = self._theano_shuffled(b)

    # Convert to input positions, starting at 0
    a = T.concatenate(([0], a[:-1]))
    b = T.concatenate(([0], b[:-1]))
    a = T.cumsum(a)
    b = T.cumsum(b)

    # Positions of the other corners
    c = T.clip(a + 1, 0, self.input_shape[2] - 1)
    d = T.clip(b + 1, 0, self.input_shape[3] - 1)

    # Index the four positions in the pooling window and stack them.
    # Note: this intermediate may not fit in GPU memory.
    temp = T.stack(input[:, :, a, :][:, :, :, b],
                   input[:, :, c, :][:, :, :, b],
                   input[:, :, a, :][:, :, :, d],
                   input[:, :, c, :][:, :, :, d])
    return self.pool_function(temp, axis=0)
def get_nll(self, input):
    input_times_W = input.T[:, :, None] * self.W[:, None, :]

    # acc_input_times_W = T.concatenate([T.zeros_like(input_times_W[[0]]), T.cumsum(input_times_W, axis=0)[:-1]], axis=0)
    # Hack for no GPUSplit: cumsum, then shift everything down by one row
    acc_input_times_W = T.cumsum(input_times_W, axis=0)
    # acc_input_times_W = T.roll(acc_input_times_W, 1, axis=1???)  # uses Join internally too
    acc_input_times_W = T.set_subtensor(acc_input_times_W[1:], acc_input_times_W[:-1])
    acc_input_times_W = T.set_subtensor(acc_input_times_W[0, :], 0.0)

    acc_input_times_W += self.b[None, None, :]
    h = self.hidden_activation(acc_input_times_W)

    pre_output = T.sum(h * self.W_prime[:, None, :], axis=2) + self.b_prime[:, None]
    output = T.nnet.sigmoid(pre_output)
    nll = T.sum(T.nnet.softplus(-input.T * pre_output + (1 - input.T) * pre_output), axis=0)
    return nll, output
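# The shift-after-cumsum pattern above is an exclusive prefix sum: position i
# accumulates contributions of inputs strictly before i. Plain numpy sketch:
contrib = np.array([1, 2, 3, 4])
acc = np.cumsum(contrib)   # [1, 3, 6, 10]
acc[1:] = acc[:-1]
acc[0] = 0
print(acc)                 # [0 1 3 6]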
def CTR_AUC(self, y):
    # ll = T.ones((y.shape[0] * 1), 'int8')
    py = self.p_y_given_x[T.arange(y.shape[0]), 1]
    py_si = T.argsort(-py)
    py_s = T.cumsum(y[py_si])
    # for each negative example, count the positives ranked above it
    score = T.sum(T.dot(py_s, 1 - y[py_si]))
    score = score * 1.0 / T.sum(y)
    score = score / (y.shape[0] - T.sum(y))
    # score is the pairwise ranking accuracy (AUC), so this returns 1 - AUC
    return 1 - score
def __init__(self, rng, x, topic_num=100):
    # input
    L2_input = sparse.csr_matrix("x", dtype=theano.config.floatX)

    # params
    vocab_size = x.shape[1]
    mu, sigma = x.data.mean(), x.data.var()**0.5
    rng = numpy.random.RandomState(numpy.random.randint(2**32 - 1)) if rng is None else rng
    self.L2_w = theano.shared(
        numpy.asarray(
            rng.normal(loc=mu, scale=sigma, size=(vocab_size, topic_num)),
            dtype=theano.config.floatX),
        borrow=True)
    self.L2_b = theano.shared(numpy.zeros(topic_num, dtype=theano.config.floatX), borrow=True)
    self.params = [self.L2_w, self.L2_b]

    # stick-breaking: sticks -> orthogonal sticks
    L2_stick = sparse.dot(L2_input, self.L2_w) + self.L2_b - \
        0.5 * (L2_input.size / vocab_size * tensor.sum(self.L2_w**2, 0) + self.L2_b**2)
    zero_space = tensor.zeros((L2_input.shape[0], 1), dtype=theano.config.floatX)
    L2_orth_stick = tensor.join(1, L2_stick, zero_space) \
        - tensor.join(1, zero_space, tensor.cumsum(L2_stick, 1))
    Pasterik_orth_stick = tensor.log(1 + tensor.exp(L2_orth_stick))

    # training model definition
    Likelihood = tensor.mean(Pasterik_orth_stick)
    grads = theano.grad(Likelihood, self.params)  # gradient w.r.t. params
    eta = tensor.scalar("eta")
    updates = [(param, param + eta * grad) for param, grad in zip(self.params, grads)]
    self._fit = theano.function(
        inputs=[L2_input, eta],
        outputs=Likelihood,
        updates=updates)

    # predict model definition
    self._predict = theano.function(
        inputs=[L2_input],
        outputs=tensor.argmax(L2_stick, axis=-1))
    self._codec = theano.function(
        inputs=[L2_input],
        outputs=L2_stick > 0)
def DYNAMICS(self, STATE, ACTION):
    OLD_ANGLES = STATE[0:self.n]
    OLD_VELOCITY = STATE[self.n:-2]
    FRICTIONLESS = self.inertia * OLD_VELOCITY + (1 - self.inertia) * ACTION
    NEW_VELOCITY = (1 - self.friction) * FRICTIONLESS
    # NEW_ANGLES = OLD_ANGLES + NEW_VELOCITY
    NEW_ANGLES = OLD_ANGLES + OLD_VELOCITY
    # relative joint angles -> absolute angles; the end effector is the sum of link vectors
    ABSOLUTE_ANGLES = tns.cumsum(NEW_ANGLES)
    X = tns.sum(self.lengths * np.cos(ABSOLUTE_ANGLES))
    Y = tns.sum(self.lengths * np.sin(ABSOLUTE_ANGLES))
    return tns.concatenate([NEW_ANGLES, NEW_VELOCITY, [X, Y]])
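# Forward-kinematics intuition for the cumsum above, in plain numpy (link
# lengths and angles are illustrative): absolute joint angles are running
# sums of relative ones.
lengths = np.array([1.0, 1.0])
rel_angles = np.array([np.pi / 2, -np.pi / 2])  # up, then bend back to horizontal
abs_angles = np.cumsum(rel_angles)              # [pi/2, 0]
x = np.sum(lengths * np.cos(abs_angles))
y = np.sum(lengths * np.sin(abs_angles))
print(x, y)  # (1.0, 1.0)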
def test_sample_z_given_v(self):
    v = T.matrix('v')
    h = T.matrix('h')
    z = T.iscalar('z')
    E = theano.function([v, h, z], logsumexp(-self.model.E(v, h, z)))

    v1 = np.random.rand(1, self.input_size).astype(config.floatX)
    H = cartesian([(0, 1)] * self.hidden_size, dtype=config.floatX)

    energies = []
    for z in range(1, self.hidden_size + 1):
        h = np.array(H[::2**(self.hidden_size - z)])
        energies.append(E(v1, h, z))

    probs = T.nnet.softmax(T.stack(energies))
    expected_icdf = T.cumsum(probs[:, ::-1], axis=1)[:, ::-1].eval()

    # Test inverse cdf
    v = T.matrix('v')
    icdf_z_given_v = theano.function([v], self.model.icdf_z_given_v(v))
    assert_array_almost_equal(icdf_z_given_v(v1), expected_icdf)

    batch_size = 500000
    self.model.batch_size = batch_size
    sample_zmask_given_v = theano.function([v], self.model.sample_zmask_given_v(v))
    v2 = np.tile(v1, (self.model.batch_size, 1))
    # theano.printing.pydotprint(sample_zmask_given_v)
    z_mask = sample_zmask_given_v(v2)

    # The first hidden unit should always be considered, i.e. z_mask[:, 0] == 1
    assert_equal(np.sum(z_mask[:, 0] == 0, axis=0), 0)

    # Test that sampled masks are as expected, i.e. equal expected_icdf
    freq_per_z = np.sum(z_mask, axis=0) / self.model.batch_size
    assert_array_almost_equal(
        freq_per_z, expected_icdf[0], decimal=3,
        err_msg="Tested using MC sampling; rerun to be certain this is an "
                "error, or increase 'batch_size'.")
def scan(self, x, rnd, h0, wph, wpx, wpb, whh, whx, whz, whb, z_embeddings, *params):
    assert len(params) == len(self.mlp_p.params) + len(self.mlp_h.params)
    params_p = params[:len(self.mlp_p.params)]
    params_h = params[len(self.mlp_p.params):]

    ctx_p = self.activation(T.dot(h0, wph) + T.dot(x, wpx) + wpb)
    pz = self.mlp_p.call_on_params(ctx_p, params_p)
    assert pz.ndim == 1

    # inverse-CDF sampling of the discrete latent z
    cs = T.cumsum(pz, axis=0)
    sel = T.sum(T.gt(rnd, cs))
    sel = T.clip(sel, 0, self.z_k - 1)
    pzs = pz[sel]
    ze = z_embeddings[sel, :]

    ctx_h = self.activation(
        T.dot(h0, whh) + T.dot(x, whx) + T.dot(ze, whz) + whb)
    hd = self.mlp_h.call_on_params(ctx_h, params_h)
    h1 = h0 + hd
    return h1, sel, pzs
def get_output_for(self, policy, greedy=False, **kwargs):
    """
    Picks actions with probabilities taken from the policy.

    :param policy: probabilities for all actions (e.g. an a2c actor policy
        or standardized Q-values)
    :type policy: tensor of float[batch_id, action_id]

    :returns: ids of the picked actions
    :rtype: vector of int[batch_id]
    """
    if greedy:
        # greedy branch
        chosen_action_ids = T.argmax(policy, axis=-1).astype(self.action_dtype)
    else:
        # probabilistic branch
        batch_size, n_actions = policy.shape
        if self.assume_normalized:
            probas = policy
        else:
            probas = policy / T.sum(policy, axis=1, keepdims=True)

        # p1, p1+p2, p1+p2+p3, ..., 1
        cum_probas = T.cumsum(probas, axis=1)

        batch_randomness = self.rng.uniform(low=0., high=1., size=[batch_size, 1])

        # idea: to compute the chosen action we count how many cumulative
        # probabilities are less than the random number in [0, 1].
        # We deliberately exclude the LAST cumulative probability because it
        # has to equal 1 by definition (never being less than random[0,1]),
        # but it can come out smaller due to inaccurate float32 computation,
        # causing the algorithm to pick action id = n_actions + 1, which
        # results in an IndexError.
        chosen_action_ids = T.sum((batch_randomness > cum_probas[:, :-1]),
                                  axis=1, dtype=self.action_dtype)
    return chosen_action_ids
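# The counting trick from the probabilistic branch in plain numpy: the chosen
# index equals the number of cumulative probabilities below the uniform draw.
probas = np.array([0.2, 0.5, 0.3])
cum = np.cumsum(probas)      # [0.2, 0.7, 1.0]
r = 0.65
print(np.sum(r > cum[:-1]))  # 1 -> picks the second action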
def add_exploration(recognizer, data, train_conf):
    prediction = None
    prediction_mask = None
    explore_conf = train_conf.get('exploration', 'imitative')
    if explore_conf in ['greedy', 'mixed']:
        length_expand = 10
        prediction = recognizer.get_generate_graph(
            n_steps=recognizer.labels.shape[0] + length_expand)['outputs']
        prediction_mask = tensor.lt(
            tensor.cumsum(tensor.eq(prediction, data.eos_label), axis=0),
            1).astype(floatX)
        prediction_mask = tensor.roll(prediction_mask, 1, 0)
        prediction_mask = tensor.set_subtensor(
            prediction_mask[0, :], tensor.ones_like(prediction_mask[0, :]))

        if explore_conf == 'mixed':
            batch_size = recognizer.labels.shape[1]
            targets = tensor.concatenate([
                recognizer.labels,
                tensor.zeros((length_expand, batch_size), dtype='int64')])
            targets_mask = tensor.concatenate([
                recognizer.labels_mask,
                tensor.zeros((length_expand, batch_size), dtype=floatX)])
            rng = MRG_RandomStreams()
            generate = rng.binomial((batch_size,), p=0.5, dtype='int64')
            prediction = (generate[None, :] * prediction
                          + (1 - generate[None, :]) * targets)
            prediction_mask = (
                tensor.cast(generate[None, :] * prediction_mask, floatX)
                + tensor.cast((1 - generate[None, :]) * targets_mask, floatX))

        prediction_mask = theano.gradient.disconnected_grad(prediction_mask)
    elif explore_conf != 'imitative':
        raise ValueError("unknown exploration mode: %s" % explore_conf)
    return prediction, prediction_mask
def sample_zmask_given_v(self, v):
    # draw z ~ p(z|v) as a one-hot vector, then turn it into a
    # "first z hidden units" mask via a reversed cumulative sum
    p = self.theano_rng.multinomial(pvals=self.pdf_z_given_v(v),
                                    dtype=theano.config.floatX)
    return T.cumsum(p[:, ::-1], axis=1)[:, ::-1]
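# Plain numpy view of the reversed-cumsum trick above: a one-hot draw of z
# becomes a mask over the first z hidden units.
one_hot = np.array([[0., 0., 1., 0.]])  # sampled z = 3
mask = np.cumsum(one_hot[:, ::-1], axis=1)[:, ::-1]
print(mask)  # [[1. 1. 1. 0.]]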
def __init__(self, input=None, n_visible=784, n_hidden=500,
             W=None, hbias=None, vbias=None, numpy_rng=None, theano_rng=None,
             batch_size=1, n_beta=10, beta_lbound=0., n_swaps=10,
             n_rtime=1, rtime_a=1, rtime_b=100, tau=None):
    """
    RBM constructor. Defines the parameters of the model along with basic
    operations for inferring hidden from visible (and vice-versa), as well
    as for performing CD updates.

    :param input: None for standalone RBMs or symbolic variable if RBM is
        part of a larger graph.
    :param n_visible: number of visible units
    :param n_hidden: number of hidden units
    :param W: None for standalone RBMs or symbolic variable pointing to a
        shared weight matrix in case RBM is part of a DBN network; in a DBN,
        the weights are shared between RBMs and layers of a MLP
    :param hbias: None for standalone RBMs or symbolic variable pointing to
        a shared hidden units bias vector in case RBM is part of a different
        network
    :param vbias: None for standalone RBMs or a symbolic variable pointing
        to a shared visible units bias
    :param n_rtime: time constant is inversely proportional to
        n_rtime x <return time>
    :param tau: optional fixed time constant (overrides n_rtime)
    """
    self.n_visible = n_visible
    self.n_hidden = n_hidden

    # deal with random number generation
    self.numpy_rng = numpy_rng if numpy_rng is not None else numpy.random.RandomState(123)
    self.theano_rng = theano_rng if theano_rng is not None else RandomStreams(self.numpy_rng.randint(2**30))

    if W is None:
        # W is initialized with small Gaussian noise; the output is converted
        # using asarray to dtype theano.config.floatX so that the code is
        # runnable on GPU
        initial_W = 0.01 * self.numpy_rng.randn(n_visible, n_hidden)
        initial_W = numpy.asarray(initial_W, dtype=theano.config.floatX)
        # theano shared variables for weights and biases
        W = theano.shared(value=initial_W, name='W')

    if hbias is None:
        # create shared variable for hidden units bias
        hbias = theano.shared(value=numpy.zeros(n_hidden, dtype=theano.config.floatX), name='hbias')

    if vbias is None:
        # create shared variable for visible units bias
        vbias = theano.shared(value=numpy.zeros(n_visible, dtype=theano.config.floatX), name='vbias')

    # initialize input layer for standalone RBM or layer0 of DBN
    self.input = input
    if not input:
        self.input = T.matrix('input')

    self.W = W
    self.hbias = hbias
    self.vbias = vbias

    bufsize = 100

    #########################################################################
    # Fields indexed by mixstat:    nvis, E, beta, labels, rtime
    # Fields indexed by temp index: mixstat, fup_target, nup, ndown, swapstat
    #########################################################################

    ### initialize tempering stuff ###
    self.batch_size = batch_size  # size of negative minibatch
    # number of temperatures in system
    self.n_beta = theano.shared(n_beta, name='n_beta')
    # number of active chains in nvis array
    self.n_chain = theano.shared(batch_size * n_beta, name='n_chain')
    self._nvis = theano.shared(self.numpy_rng.randint(0, 2, size=(batch_size*bufsize, n_visible)), name='nvis')
    self.nvis = self._nvis[:self.n_chain]

    # vectors containing energy and free-energy of current negative particles (at T=1)
    self._E = theano.shared(numpy.zeros(batch_size*bufsize), name='E')
    self.E = self._E[:self.n_chain]

    ## Betas are parametrized as delta_bi = exp(\lambda_i) ##
    # Resulting betas are linearly spaced between 1 and 0
    # shared parameters are the lambda_i
    lambdas = numpy.zeros(bufsize)  # leave room to grow ...
    lambdas[:n_beta-2] = numpy.log((1.0 - beta_lbound)/(n_beta-1))
    self._lambdas = theano.shared(lambdas, name='lambdas')
    self.lambdas = self._lambdas[:n_beta-2]

    # initialize data structure to map nhid/nvis rows to a given temperature
    mixstat = numpy.zeros((batch_size, bufsize), dtype='int32')
    mixstat[:, :n_beta] = numpy.arange(batch_size*n_beta).reshape(batch_size, n_beta)
    self._mixstat = theano.shared(mixstat, name='mixstat')
    self.mixstat = self._mixstat[:, :self.n_beta]

    # convert lambdas to actual beta values
    _betas1 = 1 - T.cumsum(T.exp(self.lambdas))
    _betas2 = T.join(0, T.shape_padright(1.0), _betas1)
    _betas3 = T.join(0, _betas2, T.shape_padright(beta_lbound))
    self.betas = _betas3
    self.mixed_betas = pt_mix(self.betas, self.mixstat)
    self.mixed_betas_matrix = T.shape_padright(self.mixed_betas)
    self.get_betas = theano.function([], self.betas)

    # labels: 1 means going up in temperature, 0 going down in temperature
    labels = LBL_NONE * numpy.ones(batch_size*bufsize, dtype='int32')
    labels[mixstat[:, 0]] = LBL_UP
    self.labels = theano.shared(labels, name='labels')

    # configure histogram of up moving particles
    _nup = numpy.zeros(bufsize)
    _nup[:n_beta] = numpy.linspace(1, 0, n_beta)
    self._nup = theano.shared(_nup, name='nup')
    self.nup = self._nup[:self.n_beta]

    # configure histogram of down moving particles
    _ndown = numpy.zeros(bufsize)
    _ndown[:n_beta] = numpy.linspace(0, 1, n_beta)
    self._ndown = theano.shared(_ndown, name='ndown')
    self.ndown = self._ndown[:self.n_beta]

    # return time
    rtime = numpy.zeros(batch_size*bufsize, dtype='int32')
    self.rtime = theano.shared(rtime, name='rtime')
    self.avg_rtime = theano.shared(
        numpy.asarray(rtime_deo(0.4, n_beta), dtype=theano.config.floatX),
        name='avg_rtime')

    # use return time as the time constant for all moving averages
    if not tau:
        self.tau = rtime_a/(n_rtime*self.avg_rtime + rtime_b)
    else:
        self.tau = T.as_tensor(tau)
    self.get_tau = theano.function([], self.tau)

    # create PT Op
    self.n_swaps = n_swaps
    self._swapstat = theano.shared(numpy.zeros(bufsize), name='swapstat')
    self.swapstat = self._swapstat[:self.n_beta]
    self.pt_swaps = PT_Swaps(n_swaps=self.n_swaps, seed=self.numpy_rng.randint(1 << 32))
def backward(self, y):
    # inverse of the "ordered" transform: the first element is free, the rest
    # are positive increments, so the cumulative sum is strictly increasing
    x = tt.zeros(y.shape)
    x = tt.inc_subtensor(x[..., 0], y[..., 0])
    x = tt.inc_subtensor(x[..., 1:], tt.exp(y[..., 1:]))
    return tt.cumsum(x, axis=-1)
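# Sanity sketch for the ordered transform (a standalone rewrite of the method
# body above, assuming `tt` is theano.tensor and `theano`/`np` are imported):
# any real vector maps to a strictly increasing one.
y = tt.vector('y')
x = tt.zeros(y.shape)
x = tt.inc_subtensor(x[0], y[0])
x = tt.inc_subtensor(x[1:], tt.exp(y[1:]))
fwd = theano.function([y], tt.cumsum(x, axis=-1))
print(fwd(np.array([0.3, -1.0, 2.0], dtype=theano.config.floatX)))  # increasing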
def compute_output(self, network, in_vw):
    if network.find_hyperparameter(["deterministic"]):
        import warnings
        warnings.warn("OverlappingRandomFractionalMaxPool2DNode has "
                      "no deterministic implementation")

    pool_size = network.find_hyperparameter(["pool_size"])
    assert len(pool_size) == 2
    for alpha in pool_size:
        assert 1 < alpha < 2
    pool_fn = network.find_hyperparameter(["pool_function"], T.max)

    # NOTE: MRG_RandomStreams doesn't have "permutation"
    srng = T.shared_randomstreams.RandomStreams()

    def theano_shuffled(in_vw):
        n = in_vw.shape[0]
        shuffled = T.permute_row_elements(in_vw.T, srng.permutation(n=n)).T
        return shuffled

    out_shape = list(in_vw.shape)
    for axis, alpha in zip([2, 3], pool_size):
        out_shape[axis] = int(np.ceil(float(out_shape[axis]) / alpha))
    out_shape = tuple(out_shape)

    n_in0, n_in1 = in_vw.shape[2:]
    n_out0, n_out1 = out_shape[2:]

    # Variable stride across the input creates fractional reduction
    a = theano.shared(
        np.array([2] * (n_in0 - n_out0) + [1] * (2 * n_out0 - n_in0)))
    b = theano.shared(
        np.array([2] * (n_in1 - n_out1) + [1] * (2 * n_out1 - n_in1)))

    # Randomize the input strides
    a = theano_shuffled(a)
    b = theano_shuffled(b)

    # Convert to input positions, starting at 0
    a = T.concatenate(([0], a[:-1]))
    b = T.concatenate(([0], b[:-1]))
    a = T.cumsum(a)
    b = T.cumsum(b)

    # Positions of the other corners
    c = T.clip(a + 1, 0, n_in0 - 1)
    d = T.clip(b + 1, 0, n_in1 - 1)

    # Index the four positions in the pooling window and stack them
    in_var = in_vw.variable
    temp = T.stack(in_var[:, :, a, :][:, :, :, b],
                   in_var[:, :, c, :][:, :, :, b],
                   in_var[:, :, a, :][:, :, :, d],
                   in_var[:, :, c, :][:, :, :, d])
    out_var = pool_fn(temp, axis=0)

    network.create_vw(
        "default",
        variable=out_var,
        shape=out_shape,
        tags={"output"},
    )
def __init__(self, n_out = None, n_units = None, direction = 1, truncation = -1, sampling = 1, encoder = None, unit = 'lstm', n_dec = 0, attention = "none", recurrent_transform = "none", recurrent_transform_attribs = "{}", attention_template = 128, attention_distance = 'l2', attention_step = "linear", attention_beam = 0, attention_norm = "exp", attention_momentum = "none", attention_sharpening = 1.0, attention_nbest = 0, attention_store = False, attention_smooth = False, attention_glimpse = 1, attention_filters = 1, attention_accumulator = 'sum', attention_loss = 0, attention_bn = 0, attention_lm = 'none', attention_ndec = 1, attention_memory = 0, attention_alnpts = 0, attention_epoch = 1, attention_segstep=0.01, attention_offset=0.95, attention_method="epoch", attention_scale=10, context=-1, base = None, aligner = None, lm = False, force_lm = False, droplm = 1.0, forward_weights_init=None, bias_random_init_forget_shift=0.0, copy_weights_from_base=False, segment_input=False, join_states=False, sample_segment=None, **kwargs): """ :param n_out: number of cells :param n_units: used when initialized via Network.from_hdf_model_topology :param direction: process sequence in forward (1) or backward (-1) direction :param truncation: gradient truncation :param sampling: scan every nth frame only :param encoder: list of encoder layers used as initalization for the hidden state :param unit: cell type (one of 'lstm', 'vanilla', 'gru', 'sru') :param n_dec: absolute number of steps to unfold the network if integer, else relative number of steps from encoder :param recurrent_transform: name of recurrent transform :param recurrent_transform_attribs: dictionary containing parameters for a recurrent transform :param attention_template: :param attention_distance: :param attention_step: :param attention_beam: :param attention_norm: :param attention_sharpening: :param attention_nbest: :param attention_store: :param attention_align: :param attention_glimpse: :param attention_lm: :param base: list of layers which outputs are considered as based during attention mechanisms :param lm: activate RNNLM :param force_lm: expect previous labels to be given during testing :param droplm: probability to take the expected output as predecessor instead of the real one when LM=true :param bias_random_init_forget_shift: initialize forget gate bias of lstm networks with this value """ source_index = None if len(kwargs['sources']) == 1 and (kwargs['sources'][0].layer_class.endswith('length') or kwargs['sources'][0].layer_class.startswith('length')): kwargs['sources'] = [] source_index = kwargs['index'] unit_given = unit from Device import is_using_gpu if unit == 'lstm': # auto selection if not is_using_gpu(): unit = 'lstme' elif recurrent_transform == 'none' and (not lm or droplm == 0.0): unit = 'lstmp' else: unit = 'lstmc' elif unit in ("lstmc", "lstmp") and not is_using_gpu(): unit = "lstme" if segment_input: if is_using_gpu(): unit = "lstmps" else: unit = "lstms" if n_out is None: assert encoder n_out = sum([enc.attrs['n_out'] for enc in encoder]) kwargs.setdefault("n_out", n_out) if n_units is not None: assert n_units == n_out self.attention_weight = T.constant(1.,'float32') if len(kwargs['sources']) == 1 and kwargs['sources'][0].layer_class.startswith('length'): kwargs['sources'] = [] elif len(kwargs['sources']) == 1 and kwargs['sources'][0].layer_class.startswith('signal'): kwargs['sources'] = [] super(RecurrentUnitLayer, self).__init__(**kwargs) self.set_attr('from', ",".join([s.name for s in self.sources]) if 
self.sources else "null") self.set_attr('n_out', n_out) self.set_attr('unit', unit_given.encode("utf8")) self.set_attr('truncation', truncation) self.set_attr('sampling', sampling) self.set_attr('direction', direction) self.set_attr('lm', lm) self.set_attr('force_lm', force_lm) self.set_attr('droplm', droplm) if bias_random_init_forget_shift: self.set_attr("bias_random_init_forget_shift", bias_random_init_forget_shift) self.set_attr('attention_beam', attention_beam) self.set_attr('recurrent_transform', recurrent_transform.encode("utf8")) if isinstance(recurrent_transform_attribs, str): recurrent_transform_attribs = json.loads(recurrent_transform_attribs) if attention_template is not None: self.set_attr('attention_template', attention_template) self.set_attr('recurrent_transform_attribs', recurrent_transform_attribs) self.set_attr('attention_distance', attention_distance.encode("utf8")) self.set_attr('attention_step', attention_step.encode("utf8")) self.set_attr('attention_norm', attention_norm.encode("utf8")) self.set_attr('attention_sharpening', attention_sharpening) self.set_attr('attention_nbest', attention_nbest) attention_store = attention_store or attention_smooth or attention_momentum != 'none' self.set_attr('attention_store', attention_store) self.set_attr('attention_smooth', attention_smooth) self.set_attr('attention_momentum', attention_momentum.encode('utf8')) self.set_attr('attention_glimpse', attention_glimpse) self.set_attr('attention_filters', attention_filters) self.set_attr('attention_lm', attention_lm) self.set_attr('attention_bn', attention_bn) self.set_attr('attention_accumulator', attention_accumulator) self.set_attr('attention_ndec', attention_ndec) self.set_attr('attention_memory', attention_memory) self.set_attr('attention_loss', attention_loss) self.set_attr('n_dec', n_dec) self.set_attr('segment_input', segment_input) self.set_attr('attention_alnpts', attention_alnpts) self.set_attr('attention_epoch', attention_epoch) self.set_attr('attention_segstep', attention_segstep) self.set_attr('attention_offset', attention_offset) self.set_attr('attention_method', attention_method) self.set_attr('attention_scale', attention_scale) if segment_input: if not self.eval_flag: #if self.eval_flag: if isinstance(self.sources[0],RecurrentUnitLayer): self.inv_att = self.sources[0].inv_att #NBT else: if not join_states: self.inv_att = self.sources[0].attention #NBT else: assert hasattr(self.sources[0], "nstates"), "source does not have number of states!" 
ns = self.sources[0].nstates self.inv_att = self.sources[0].attention[(ns-1)::ns] inv_att = T.roll(self.inv_att.dimshuffle(2, 1, 0),1,axis=0)#TBN inv_att = T.set_subtensor(inv_att[0],T.zeros((inv_att.shape[1],inv_att.shape[2]))) inv_att = T.max(inv_att,axis=-1) else: inv_att = T.zeros((self.sources[0].output.shape[0],self.sources[0].output.shape[1])) if encoder and hasattr(encoder[0],'act'): self.set_attr('encoder', ",".join([e.name for e in encoder])) if base: self.set_attr('base', ",".join([b.name for b in base])) else: base = encoder self.base = base self.encoder = encoder if aligner: self.aligner = aligner self.set_attr('n_units', n_out) unit = eval(unit.upper())(**self.attrs) assert isinstance(unit, Unit) self.unit = unit kwargs.setdefault("n_out", unit.n_out) n_out = unit.n_out self.set_attr('n_out', unit.n_out) if n_dec < 0: source_index = self.index n_dec *= -1 if n_dec != 0: self.target_index = self.index if isinstance(n_dec,float): if not source_index: source_index = encoder[0].index if encoder else base[0].index lengths = T.cast(T.ceil(T.sum(T.cast(source_index,'float32'),axis=0) * n_dec), 'int32') idx, _ = theano.map(lambda l_i, l_m:T.concatenate([T.ones((l_i,),'int8'),T.zeros((l_m-l_i,),'int8')]), [lengths], [T.max(lengths)+1]) self.index = idx.dimshuffle(1,0)[:-1] n_dec = T.cast(T.ceil(T.cast(source_index.shape[0],'float32') * numpy.float32(n_dec)),'int32') else: if encoder: self.index = encoder[0].index self.index = T.ones((n_dec,self.index.shape[1]),'int8') else: n_dec = self.index.shape[0] # initialize recurrent weights self.W_re = None if unit.n_re > 0: self.W_re = self.add_param(self.create_recurrent_weights(unit.n_units, unit.n_re, name="W_re_%s" % self.name)) # initialize forward weights bias_init_value = self.create_bias(unit.n_in).get_value() if bias_random_init_forget_shift: assert unit.n_units * 4 == unit.n_in # (input gate, forget gate, output gate, net input) bias_init_value[unit.n_units:2 * unit.n_units] += bias_random_init_forget_shift self.b.set_value(bias_init_value) if not forward_weights_init: forward_weights_init = "random_uniform(p_add=%i)" % unit.n_re else: self.set_attr('forward_weights_init', forward_weights_init) self.forward_weights_init = forward_weights_init self.W_in = [] sample_mean, gamma = None, None if copy_weights_from_base: self.params = {} #self.W_re = self.add_param(base[0].W_re) #self.W_in = [ self.add_param(W) for W in base[0].W_in ] #self.b = self.add_param(base[0].b) self.W_re = base[0].W_re self.W_in = base[0].W_in self.b = base[0].b if self.attrs.get('batch_norm', False): sample_mean = base[0].sample_mean gamma = base[0].gamma #self.masks = base[0].masks #self.mass = base[0].mass else: for s in self.sources: W = self.create_forward_weights(s.attrs['n_out'], unit.n_in, name="W_in_%s_%s" % (s.name, self.name)) self.W_in.append(self.add_param(W)) # make input z = self.b for x_t, m, W in zip(self.sources, self.masks, self.W_in): if x_t.attrs['sparse']: if x_t.output.ndim == 3: out_dim = x_t.output.shape[2] elif x_t.output.ndim == 2: out_dim = 1 else: assert False, x_t.output.ndim if x_t.output.ndim == 3: z += W[T.cast(x_t.output[:,:,0], 'int32')] elif x_t.output.ndim == 2: z += W[T.cast(x_t.output, 'int32')] else: assert False, x_t.output.ndim elif m is None: z += T.dot(x_t.output, W) else: z += self.dot(self.mass * m * x_t.output, W) #if self.attrs['batch_norm']: # z = self.batch_norm(z, unit.n_in) num_batches = self.index.shape[1] self.num_batches = num_batches non_sequences = [] if self.attrs['lm'] or attention_lm != 'none': if not 
if 'target' not in self.attrs:
    self.attrs['target'] = 'classes'
if self.attrs['droplm'] > 0.0 or not (self.train_flag or force_lm):
    if copy_weights_from_base:
        self.W_lm_in = base[0].W_lm_in
        self.b_lm_in = base[0].b_lm_in
    else:
        l = sqrt(6.) / sqrt(unit.n_out + self.y_in[self.attrs['target']].n_out)
        values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(unit.n_out, self.y_in[self.attrs['target']].n_out)), dtype=theano.config.floatX)
        self.W_lm_in = self.add_param(self.shared(value=values, borrow=True, name="W_lm_in_" + self.name))
        self.b_lm_in = self.create_bias(self.y_in[self.attrs['target']].n_out, 'b_lm_in')
l = sqrt(6.) / sqrt(unit.n_in + self.y_in[self.attrs['target']].n_out)
values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(self.y_in[self.attrs['target']].n_out, unit.n_in)), dtype=theano.config.floatX)
if copy_weights_from_base:
    self.W_lm_out = base[0].W_lm_out
else:
    self.W_lm_out = self.add_param(self.shared(value=values, borrow=True, name="W_lm_out_" + self.name))

if self.attrs['droplm'] == 0.0 and (self.train_flag or force_lm):
    self.lmmask = 1
    #if recurrent_transform != 'none':
    #  recurrent_transform = recurrent_transform[:-3]
elif self.attrs['droplm'] < 1.0 and (self.train_flag or force_lm):
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    srng = RandomStreams(self.rng.randint(1234) + 1)
    self.lmmask = T.cast(srng.binomial(n=1, p=1.0 - self.attrs['droplm'], size=self.index.shape), theano.config.floatX).dimshuffle(0, 1, 'x').repeat(unit.n_in, axis=2)
else:
    self.lmmask = T.zeros_like(self.index, dtype='float32').dimshuffle(0, 1, 'x').repeat(unit.n_in, axis=2)

if recurrent_transform == 'input':
    # attention is just a sequence dependent bias (lstmp compatible)
    src = []
    src_names = []
    n_in = 0
    for e in base:
        #src_base = [s for s in e.sources if s.name not in src_names]
        #src_names += [s.name for s in e.sources]
        src_base = [e]
        src_names += [e.name]
        src += [s.output for s in src_base]
        n_in += sum([s.attrs['n_out'] for s in src_base])
    self.xc = T.concatenate(src, axis=2)
    l = sqrt(6.) / sqrt(self.attrs['n_out'] + n_in)
    values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(n_in, 1)), dtype=theano.config.floatX)
    self.W_att_xc = self.add_param(self.shared(value=values, borrow=True, name="W_att_xc"))
    values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(n_in, self.attrs['n_out'] * 4)), dtype=theano.config.floatX)
    self.W_att_in = self.add_param(self.shared(value=values, borrow=True, name="W_att_in"))
    zz = T.exp(T.tanh(T.dot(self.xc, self.W_att_xc)))  # TB1
    self.zc = T.dot(T.sum(self.xc * (zz / T.sum(zz, axis=0, keepdims=True)).repeat(self.xc.shape[2], axis=2), axis=0, keepdims=True), self.W_att_in)
    recurrent_transform = 'none'
elif recurrent_transform == 'attention_align':
    max_skip = base[0].attrs['max_skip']
    values = numpy.zeros((max_skip,), dtype=theano.config.floatX)
    self.T_b = self.add_param(self.shared(value=values, borrow=True, name="T_b"), name="T_b")
    l = sqrt(6.) / sqrt(self.attrs['n_out'] + max_skip)
    values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(self.attrs['n_out'], max_skip)), dtype=theano.config.floatX)
    self.T_W = self.add_param(self.shared(value=values, borrow=True, name="T_W"), name="T_W")
    y_t = T.dot(self.base[0].attention, T.arange(self.base[0].output.shape[0], dtype='float32'))  # NB
    y_t = T.concatenate([T.zeros_like(y_t[:1]), y_t], axis=0)  # (N+1)B
    y_t = y_t[1:] - y_t[:-1]  # NB
    self.y_t = y_t  # T.clip(y_t, numpy.float32(0), numpy.float32(max_skip - 1))
    self.y_t = T.cast(self.base[0].backtrace, 'float32')  # overrides the value just above
elif recurrent_transform == 'attention_segment':
    assert aligner.attention, "Segment-wise attention requires attention points!"

recurrent_transform_inst = RecurrentTransform.transform_classes[recurrent_transform](layer=self)
assert isinstance(recurrent_transform_inst, RecurrentTransform.RecurrentTransformBase)
unit.recurrent_transform = recurrent_transform_inst
self.recurrent_transform = recurrent_transform_inst

# scan over sequence
for s in range(self.attrs['sampling']):
    index = self.index[s::self.attrs['sampling']]
    if context > 0:
        from TheanoUtil import context_batched
        n_batches = z.shape[1]
        time, batch, dim = z.shape[0], z.shape[1], z.shape[2]
        #z = context_batched(z[::direction or 1], window=context)[::direction or 1]  # TB(CD)
        from theano.ifelse import ifelse

        def context_window(idx, x_in, i_in):
            x_out = x_in[idx:idx + context]
            x_out = x_out.dimshuffle('x', 1, 0, 2).reshape((1, batch, dim * context))
            i_out = i_in[idx:idx + 1].repeat(context, axis=0)
            i_out = ifelse(T.lt(idx, context), T.set_subtensor(i_out[:context - idx], numpy.int8(0)), i_out).reshape((1, batch * context))
            return x_out, i_out

        z = z[::direction or 1]
        i = index[::direction or 1]
        out, _ = theano.map(context_window,
                            sequences=[T.arange(z.shape[0])],
                            non_sequences=[T.concatenate([T.zeros((context - 1, z.shape[1], z.shape[2]), dtype='float32'), z], axis=0), i])
        z = out[0][::direction or 1]
        i = out[1][::direction or 1]  # T(BC)
        direction = 1
        z = z.reshape((time * batch, context * dim))  # (TB)(CD)
        z = z.reshape((time * batch, context, dim)).dimshuffle(1, 0, 2)  # C(TB)D
        i = i.reshape((time, context, batch)).dimshuffle(1, 0, 2).reshape((context, time * batch))
        index = i
        num_batches = time * batch
    sequences = z
    sources = self.sources
    if encoder:
        if recurrent_transform == "attention_segment":
            if hasattr(encoder[0], 'act'):
                outputs_info = [T.concatenate([e.act[i][-1] for e in encoder], axis=1) for i in range(unit.n_act)]
            else:
                #outputs_info = [T.concatenate([e[i] for e in encoder], axis=1) for i in range(unit.n_act)]
                outputs_info[0] = self.aligner.output[-1]
        elif hasattr(encoder[0], 'act'):
            outputs_info = [T.concatenate([e.act[i][-1] for e in encoder], axis=1) for i in range(unit.n_act)]
        else:
            outputs_info = [T.concatenate([e[i] for e in encoder], axis=1) for i in range(unit.n_act)]
        sequences += T.alloc(numpy.cast[theano.config.floatX](0), n_dec, num_batches, unit.n_in) + (self.zc if self.attrs['recurrent_transform'] == 'input' else numpy.float32(0))
    else:
        outputs_info = [T.alloc(numpy.cast[theano.config.floatX](0), num_batches, unit.n_units) for a in range(unit.n_act)]

    if self.attrs['lm'] and self.attrs['droplm'] == 0.0 and (self.train_flag or force_lm):
        if self.network.y[self.attrs['target']].ndim == 3:
            sequences += T.dot(self.network.y[self.attrs['target']], self.W_lm_out)
        else:
            y = self.y_in[self.attrs['target']].flatten()
            sequences += self.W_lm_out[y].reshape((index.shape[0], index.shape[1], unit.n_in))

    if sequences == self.b:  # `==` is object identity for Theano variables, i.e. "sequences was never modified"
        sequences += T.alloc(numpy.cast[theano.config.floatX](0), n_dec, num_batches, unit.n_in) + (self.zc if self.attrs['recurrent_transform'] == 'input' else numpy.float32(0))

    if unit.recurrent_transform:
        outputs_info += unit.recurrent_transform.get_sorted_state_vars_initial()

    index_f = T.cast(index, theano.config.floatX)
    unit.set_parent(self)
    if segment_input:
        outputs = unit.scan_seg(x=sources,
                                z=sequences[s::self.attrs['sampling']],
                                att=inv_att,
                                non_sequences=non_sequences,
                                i=index_f,
                                outputs_info=outputs_info,
                                W_re=self.W_re,
                                W_in=self.W_in,
                                b=self.b,
                                go_backwards=direction == -1,
                                truncate_gradient=self.attrs['truncation'])
    else:
        outputs = unit.scan(x=sources,
                            z=sequences[s::self.attrs['sampling']],
                            non_sequences=non_sequences,
                            i=index_f,
                            outputs_info=outputs_info,
                            W_re=self.W_re,
                            W_in=self.W_in,
                            b=self.b,
                            go_backwards=direction == -1,
                            truncate_gradient=self.attrs['truncation'])

    if not isinstance(outputs, list):
        outputs = [outputs]
    if outputs:
        outputs[0].name = "%s.act[0]" % self.name
    if context > 0:
        for i in range(len(outputs)):
            outputs[i] = outputs[i][-1].reshape((outputs[i].shape[1] // n_batches, n_batches, outputs[i].shape[2]))
    if unit.recurrent_transform:
        unit.recurrent_transform_state_var_seqs = outputs[-len(unit.recurrent_transform.state_vars):]

    if self.attrs['sampling'] > 1:
        if s == 0:
            self.act = [T.alloc(numpy.cast['float32'](0), self.index.shape[0], self.index.shape[1], n_out) for act in outputs]
        self.act = [T.set_subtensor(tot[s::self.attrs['sampling']], act) for tot, act in zip(self.act, outputs)]
    else:
        self.act = outputs[:unit.n_act]
        if len(outputs) > unit.n_act:
            self.aux = outputs[unit.n_act:]

if self.attrs['attention_store']:
    self.attention = [self.aux[i].dimshuffle(0, 2, 1) for i, v in enumerate(sorted(unit.recurrent_transform.state_vars.keys())) if v.startswith('att_')]  # NBT
    for i in range(len(self.attention)):
        vec = T.eye(self.attention[i].shape[2], 1, -direction * (self.attention[i].shape[2] - 1))
        last = vec.dimshuffle(1, 'x', 0).repeat(self.index.shape[1], axis=1)
        self.attention[i] = T.concatenate([self.attention[i][1:], last], axis=0)[::direction]

self.cost_val = numpy.float32(0)
if recurrent_transform == 'attention_align':
    back = T.ceil(self.aux[sorted(unit.recurrent_transform.state_vars.keys()).index('t')])

    def make_output(base, yout, trace, length):
        length = T.cast(length, 'int32')
        idx = T.cast(trace[:length][::-1], 'int32')
        x_out = T.concatenate([base[idx], T.zeros((self.index.shape[0] + 1 - length, base.shape[1]), 'float32')], axis=0)
        y_out = T.concatenate([yout[idx, T.arange(length)], T.zeros((self.index.shape[0] + 1 - length,), 'float32')], axis=0)
        return x_out, y_out

    output, _ = theano.map(make_output,
                           sequences=[base[0].output.dimshuffle(1, 0, 2),
                                      self.y_t.dimshuffle(1, 2, 0),
                                      back.dimshuffle(1, 0),
                                      T.sum(self.index, axis=0, dtype='float32')])
    self.attrs['n_out'] = base[0].attrs['n_out']
    self.params.update(unit.params)
    self.output = output[0].dimshuffle(1, 0, 2)[:-1]

    z = T.dot(self.act[0], self.T_W)[:-1] + self.T_b
    z = z.reshape((z.shape[0] * z.shape[1], z.shape[2]))
    idx = (self.index[1:].flatten() > 0).nonzero()
    idy = (self.index[1:][::-1].flatten() > 0).nonzero()
    y_out = T.cast(output[1], 'int32').dimshuffle(1, 0)[:-1].flatten()
    nll, _ = T.nnet.crossentropy_softmax_1hot(x=z[idx], y_idx=y_out[idy])
    self.cost_val = T.sum(nll)
    recog = T.argmax(z[idx], axis=1)
    real = y_out[idy]
    self.errors = lambda: T.sum(T.neq(recog, real))
    return

    # NOTE: the two blocks below are unreachable (the return above always
    # fires); they are kept verbatim from the original source.
    back += T.arange(self.index.shape[1], dtype='float32') * T.cast(self.base[0].index.shape[0], 'float32')
    idx = (self.index[:-1].flatten() > 0).nonzero()
    idx = T.cast(back[::-1].flatten()[idx], 'int32')
    x_out = base[0].output
    #x_out = x_out.dimshuffle(1,0,2).reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
    #x_out = x_out.reshape((self.index.shape[1], self.index.shape[0] - 1, x_out.shape[1])).dimshuffle(1,0,2)
    x_out = x_out.reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
    x_out = x_out.reshape((self.index.shape[0] - 1, self.index.shape[1], x_out.shape[1]))
    self.output = T.concatenate([x_out, base[0].output[1:]], axis=0)
    self.attrs['n_out'] = base[0].attrs['n_out']
    self.params.update(unit.params)
    return

    skips = T.dot(T.nnet.softmax(z), T.arange(z.shape[1], dtype='float32')).reshape(self.index[1:].shape)
    shift = T.arange(self.index.shape[1], dtype='float32') * T.cast(self.base[0].index.shape[0], 'float32')
    skips = T.concatenate([T.zeros_like(self.y_t[:1]), self.y_t[:-1]], axis=0)
    idx = shift + T.cumsum(skips, axis=0)
    idx = T.cast(idx[:-1].flatten(), 'int32')
    #idx = (idx.flatten() > 0).nonzero()
    #idx = base[0].attention.flatten()
    x_out = base[0].output[::-1]
    x_out = x_out.reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
    x_out = x_out.reshape((self.index.shape[0], self.index.shape[1], x_out.shape[1]))
    self.output = T.concatenate([base[0].output[-1:], x_out], axis=0)[::-1]
    self.attrs['n_out'] = base[0].attrs['n_out']
    self.params.update(unit.params)
    return

if recurrent_transform == 'batch_norm':
    self.params['sample_mean_batch_norm'].custom_update = T.dot(T.mean(self.act[0], axis=[0, 1]), self.W_re)
    self.params['sample_mean_batch_norm'].custom_update_normalized = True

self.make_output(self.act[0][::direction or 1], sample_mean=sample_mean, gamma=gamma)
self.params.update(unit.params)
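# Hedged sketch of the `droplm` feedback mask used in the layer above: a
# per-timestep binomial draw decides where ground-truth labels are fed back
# (mask == 1) and where the layer must rely on its own predictions
# (mask == 0). `make_lm_mask` is an illustrative helper, not part of the
# original layer.
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

def make_lm_mask(index, n_in, droplm=0.3, seed=1234):
    # index: (time, batch) index matrix; result: (time, batch, n_in) float mask
    srng = RandomStreams(seed)
    keep = srng.binomial(n=1, p=1.0 - droplm, size=index.shape)
    return T.cast(keep, theano.config.floatX).dimshuffle(0, 1, 'x').repeat(n_in, axis=2)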
def optimize_expert_weights(expert_predictions,
                            average_distribution,
                            mask_matrix=None,
                            targets=None,
                            num_cross_validation_masks=2,
                            num_folds=1,
                            eps=1e-14,
                            cutoff=0.01,
                            do_optimization=True,
                            expert_weights=None,
                            optimal_params=None,
                            special_average=False,
                            *args, **kwargs):
    """
    :param expert_predictions: experts x validation_samples x 600
    :param mask_matrix: experts x validation_samples
    :param targets: validation_samples x 600
    :param average_distribution: 600
    :param eps:
    :return:
    """
    if expert_weights is not None:
        mask_matrix = mask_matrix[expert_weights > cutoff, :]  # remove
        expert_predictions = expert_predictions[expert_weights > cutoff, :, :]  # remove

    NUM_EXPERTS = expert_predictions.shape[0]
    NUM_FILTER_PARAMETERS = 2
    WINDOW_SIZE = 599

    # optimizing weights
    X = theano.shared(expert_predictions.astype('float32'))  # source predictions = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
    x_coor = theano.shared(np.linspace(-(WINDOW_SIZE - 1) / 2, (WINDOW_SIZE - 1) / 2, num=WINDOW_SIZE, dtype='float32'))

    NUM_VALIDATIONS = expert_predictions.shape[1]
    ind = theano.shared(np.zeros((NUM_VALIDATIONS,), dtype='int32'))  # targets = (NUM_VALIDATIONS, 600)

    if optimal_params is None:
        params_init = np.concatenate([
            np.ones((NUM_EXPERTS,), dtype='float32'),
            np.ones((NUM_FILTER_PARAMETERS,), dtype='float32')
        ])
    else:
        params_init = optimal_params.astype('float32')

    params = theano.shared(params_init.astype('float32'))
    #params = T.vector('params', dtype='float32')  # expert weights = (NUM_EXPERTS,)
    C = 0.0001

    if not special_average:
        # Create theano expression
        # inputs:
        W = params[:NUM_EXPERTS]
        weights = T.nnet.softmax(W.dimshuffle('x', 0)).dimshuffle(1, 0)
        preds = X.take(ind, axis=1)
        mask = theano.shared(mask_matrix.astype('float32')).take(ind, axis=1)
        # expression
        masked_weights = mask * weights
        tot_masked_weights = T.clip(masked_weights.sum(axis=0), 1e-7, utils.maxfloat)
        preds_weighted_masked = preds * masked_weights.dimshuffle(0, 1, 'x')
        cumulative_distribution = preds_weighted_masked.sum(axis=0) / tot_masked_weights.dimshuffle(0, 'x')
        # loss
        l1_loss = weights.sum()
    else:
        # calculate the weighted average for each of these experts
        weights = generate_information_weight_matrix(expert_predictions, average_distribution)  # = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
        weight_matrix = theano.shared((mask_matrix[:, :, None] * weights).astype('float32'))
        pdf = utils.cdf_to_pdf(expert_predictions)
        x_log = np.log(pdf)
        x_log[pdf <= 0] = np.log(eps)
        # Compute the mean
        X_log = theano.shared(x_log.astype('float32'))  # source predictions = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
        X_log_i = X_log.take(ind, axis=1)
        w_i = weight_matrix.take(ind, axis=1)
        W = params[:NUM_EXPERTS]
        w_i = w_i * T.nnet.softmax(W.dimshuffle('x', 0)).dimshuffle(1, 0, 'x')
        # the different predictions are the experts
        geom_av_log = T.sum(X_log_i * w_i, axis=0) / (T.sum(w_i, axis=0) + eps)
        geom_av_log = geom_av_log - T.max(geom_av_log, axis=-1).dimshuffle(0, 'x')  # stabilizes rounding errors?
        geom_av = T.exp(geom_av_log)
        geom_pdf = geom_av / T.sum(geom_av, axis=-1).dimshuffle(0, 'x')
        l1_loss = 0
        cumulative_distribution = T.cumsum(geom_pdf, axis=-1)

    if not do_optimization:
        ind.set_value(range(NUM_VALIDATIONS))
        f_eval = theano.function([], cumulative_distribution)
        cumulative_distribution = f_eval()
        return cumulative_distribution[0]
    else:
        # convert to theano_values (for regularization)
        t_valid = theano.shared(targets.astype('float32'))  # targets = (NUM_VALIDATIONS, 600)
        t_train = theano.shared(targets.astype('float32'))  # targets = (NUM_VALIDATIONS, 600)

        CRPS_train = T.mean((cumulative_distribution - t_train.take(ind, axis=0))**2) + C * l1_loss
        CRPS_valid = T.mean((cumulative_distribution - t_valid.take(ind, axis=0))**2)

        iter_optimize = theano.function([], CRPS_train, on_unused_input="ignore",
                                        updates=lasagne.updates.adam(CRPS_train, [params], 1.0))
        f_val = theano.function([], CRPS_valid)

        def optimize_my_params():
            for _ in xrange(40 if special_average else 100):  # early stopping
                score = iter_optimize()
            result = params.get_value()
            return result, score

        if num_cross_validation_masks == 0:
            ind.set_value(range(NUM_VALIDATIONS))
            params.set_value(params_init)
            optimal_params, train_score = optimize_my_params()
            final_weights = -1e10 * np.ones(expert_weights.shape)
            final_weights[np.where(expert_weights > cutoff)] = optimal_params[:NUM_EXPERTS]
            final_params = np.concatenate((final_weights, optimal_params[NUM_EXPERTS:]))
            return softmax(final_weights), train_score, final_params
        else:
            final_params = []
            final_losses = []
            print
            print
            print
            for fold in xrange(num_folds):
                for i_cross_validation in xrange(num_cross_validation_masks):
                    print "\r\033[F\033[F\033[Fcross_validation %d/%d" % (fold * num_cross_validation_masks + i_cross_validation + 1, num_folds * num_cross_validation_masks)
                    val_indices = get_cross_validation_indices(range(NUM_VALIDATIONS),
                                                               validation_index=i_cross_validation,
                                                               number_of_splits=num_cross_validation_masks,
                                                               rng_seed=fold,
                                                               )
                    indices = [i for i in range(NUM_VALIDATIONS) if i not in val_indices]
                    #out, crps, d = scipy.optimize.fmin_l_bfgs_b(f, w_init, fprime=g, pgtol=1e-09, epsilon=1e-08, maxfun=10000)
                    ind.set_value(indices)
                    params.set_value(params_init)
                    result, train_score = optimize_my_params()
                    final_params.append(result)
                    ind.set_value(val_indices)
                    validation_score = f_val()
                    print "  Current train value: %.6f" % train_score
                    print "  Current validation value: %.6f" % validation_score
                    final_losses.append(validation_score)

            optimal_params = np.mean(final_params, axis=0)
            average_loss = np.mean(final_losses)
            expert_weights_result = softmax(optimal_params[:NUM_EXPERTS])
            filter_param_result = optimal_params[NUM_EXPERTS:NUM_EXPERTS + NUM_FILTER_PARAMETERS]
            #print "filter param result:", filter_param_result
            return expert_weights_result, average_loss, optimal_params  # (NUM_EXPERTS,)
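# Hedged, minimal sketch of the core objective optimized above (the
# non-`special_average` path): per-expert CDFs are blended with softmax
# weights, masked per sample, and scored with CRPS (mean squared difference
# between blended and target CDFs). All names here are illustrative.
import theano
import theano.tensor as T

preds = T.tensor3('preds')    # (n_experts, n_samples, 600) per-expert CDFs
mask = T.matrix('mask')       # (n_experts, n_samples) availability mask
target = T.matrix('target')   # (n_samples, 600) target CDFs
w = T.vector('w')             # (n_experts,) unnormalized expert weights

weights = T.nnet.softmax(w.dimshuffle('x', 0)).dimshuffle(1, 0)  # (n_experts, 1)
mw = mask * weights                                              # per-sample masked weights
denom = T.clip(mw.sum(axis=0), 1e-7, 1e30)                       # avoid division by zero
blend = (preds * mw.dimshuffle(0, 1, 'x')).sum(axis=0) / denom.dimshuffle(0, 'x')
crps = T.mean((blend - target) ** 2)
crps_fn = theano.function([preds, mask, target, w], crps)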
def backward(self, seq_inputs, seq_h, seq_updates, grad_seq_att, cov,
             query=None, extra_grad_seq_h=None, seq_mask=None):
    """
    Parameters
    ----------
    seq_inputs: (length_seq, bs, n_features)
    seq_mask: (length_seq, bs)
    seq_h: (length_seq, bs, n_hidden)
    seq_updates: (length_seq, bs, n_hidden)
        The update vectors that have been added to the covariance matrix at
        each timestep in the step pass.
    query: (bs, n_hidden) or None
    grad_seq_att: (length_seq, bs, n_hidden)
        Gradient of the loss with respect to T.dot(cov, query)
    cov: (bs, n_hidden, n_hidden)
        Covariance matrix computed in the step pass
    """
    # Truth-testing a symbolic tensor raises an error in Theano, so the
    # defaults are detected with explicit `is None` checks.
    if seq_mask is None:
        seq_mask = T.ones(seq_inputs.shape[:-1])
    if extra_grad_seq_h is None:
        extra_grad_seq_h = T.zeros(grad_seq_att.shape)

    # Build backward graph of the recurrence
    (back_input, back_mask, back_h_pre, back_grad_h,
     back_grad_input, back_grad_h_pre, back_grad_params) = self.build_backward_rec_graph()

    # Build backward graph of the attention update rule
    (u_h, u_mask, u_C_pre, u_grad_att, u_query,
     (u_grad_h, u_grad_params)) = self.attention_update_rule.build_backward_graph()

    cumsum_grad_seq_att = T.cumsum(grad_seq_att[::-1], axis=0)

    def step(input, mask, cumsum_grad_att, extra_grad_h, h, h_pre, update,
             grad_h, C, *prev_grad_params):
        """
        A single timestep of the backward pass.

        Parameters
        ----------
        input: (batch_size, n_in)
        mask: (batch_size,)
        cumsum_grad_att: (batch_size, n_hidden)
        h: (batch_size, n_hidden)
        h_pre: (batch_size, n_hidden)
        update: (batch_size, n_hidden)
        grad_h: (batch_size, n_hidden)
        C: (batch_size, n_hidden, n_hidden)
        *prev_grad_params

        Returns
        -------
        grad_input: (batch_size, n_in)
        grad_h_pre: (batch_size, n_hidden)
        C_pre: (batch_size, n_hidden, n_hidden)
        gradients with respect to the params (both of the recurrence and
        of the update rule)
        """
        C_pre = self.attention_update_rule.restore_previous_matrix(C, update)

        att_grads = theano.clone(
            output=[u_grad_h] + u_grad_params,
            replace={u_h: h, u_mask: mask, u_C_pre: C_pre,
                     u_grad_att: cumsum_grad_att, u_query: h})
        grad_h_att = att_grads[0]
        grad_params_att = att_grads[1:]

        # rescale by the inverse sequence length (constant factor kept from
        # the original) and zero out masked positions
        grad_h_att *= 1000 / T.sum(seq_mask, axis=0)[:, None]
        grad_h_att = T.switch(mask[:, None], grad_h_att, .0)

        rec_grads = theano.clone(
            output=[back_grad_input, back_grad_h_pre] + back_grad_params,
            replace={back_input: input, back_mask: mask, back_h_pre: h_pre,
                     back_grad_h: extra_grad_h + grad_h + grad_h_att})
        grad_input = rec_grads[0]
        grad_h_pre = rec_grads[1]
        grad_params_rec = rec_grads[2:]

        grad_params = grad_params_att + grad_params_rec

        scan_outputs = [grad_input, grad_h_pre, C_pre]
        for prev_grad, grad in zip(prev_grad_params, grad_params):
            scan_outputs.append(prev_grad + grad)

        return tuple(scan_outputs)

    seq_h = T.concatenate([T.zeros_like(seq_h[0:1]), seq_h])
    params = self.attention_update_rule.params + self.rec_params
    grads, _ = theano.scan(
        fn=step,
        sequences=[seq_inputs[::-1], seq_mask[::-1], cumsum_grad_seq_att,
                   extra_grad_seq_h[::-1],
                   dict(input=seq_h[::-1], taps=[0, 1]),
                   seq_updates[::-1]],
        outputs_info=([None, T.zeros_like(seq_h[0]), cov] +
                      [T.zeros_like(m) for m in params]),
        name='backward')

    grads_input = grads[0][::-1]
    grads_param = [g[-1] for g in grads[3:]]

    return grads_input, params, grads_param
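# Why the reversed cumsum above works: at backward timestep t the attention
# gradient needed is the sum of grad_seq_att over all timesteps >= t, and
# scanning the sequence back to front makes that exactly
# T.cumsum(grad_seq_att[::-1], axis=0). A quick numpy check:
import numpy as np

g = np.arange(5.0)               # toy per-timestep gradients
rev_cumsum = np.cumsum(g[::-1])  # k-th entry sums the last k+1 timesteps
assert rev_cumsum[2] == g[2:].sum()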
def icdf_z_given_v(self, v):
    # cumulate the pdf right-to-left, then restore the original column order
    return T.cumsum(self.pdf_z_given_v(v)[:, ::-1], axis=1)[:, ::-1]
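# Numpy check of the reversed-cumsum identity used by icdf_z_given_v:
# cumulating a pdf right-to-left yields the upper CDF P(Z >= z_k) per column.
import numpy as np

pdf = np.array([[0.1, 0.2, 0.3, 0.4]])
upper_cdf = np.cumsum(pdf[:, ::-1], axis=1)[:, ::-1]
assert np.allclose(upper_cdf, [[1.0, 0.9, 0.7, 0.4]])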
def monotonicity_penalty(weights, mask_x=None):
    cumsums = tensor.cumsum(weights, axis=2)
    penalties = tensor.maximum(cumsums[1:] - cumsums[:-1], 0).sum(axis=2)
    if mask_x is not None:  # truth-testing a symbolic tensor raises in Theano
        penalties *= mask_x[1:]
    return penalties.sum()
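# Hedged usage sketch for monotonicity_penalty: `weights` is assumed to be
# (decoder_steps, batch, source_len) alignment weights. The penalty is zero
# when cumulative attention mass never moves back toward earlier source
# positions between consecutive decoder steps.
import numpy as np
import theano
from theano import tensor

w = tensor.ftensor3('w')
penalty_fn = theano.function([w], monotonicity_penalty(w))
aligned = np.eye(3, dtype='float32')[:, None, :]  # strictly left-to-right attention
assert penalty_fn(aligned) == 0.0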
def _update_neural_stack(self, V_tm1, s_tm1, d_t, u_t, v_t, time, stack=True):
    ############################################################
    # Equation 1
    V_t = V_tm1
    V_t = T.set_subtensor(V_t[:, time, :], v_t)

    ############################################################
    # Equation 2
    if stack:
        s_op = T.cumsum(s_tm1[:, 1:time][:, ::-1], axis=1)  # size t-2
        s_op = s_op[:, ::-1]
        # padding
        input_shape = s_op.shape
        output_shape = (input_shape[0], input_shape[1] + 1)
        output = T.zeros(output_shape)
        s_op = T.set_subtensor(output[:, :input_shape[1]], s_op)
    else:
        s_op = T.cumsum(s_tm1[:, :time - 1], axis=1)  # size t-2
        # padding
        input_shape = s_op.shape
        output_shape = (input_shape[0], input_shape[1] + 1)
        output = T.zeros(output_shape)
        s_op = T.set_subtensor(output[:, 1:input_shape[1] + 1], s_op)

    s_op = u_t.dimshuffle(0, 'x') - s_op
    s_op = T.maximum(s_op, 0)

    # ifelse to deal with time == 0
    #m = T.max()  # ifelse(T.ge(time, 1), time, T.cast(1, "int32"))

    s_op = s_tm1[:, :time] - s_op
    s_op = T.maximum(s_op, 0)

    s_t = s_tm1
    s_t = T.set_subtensor(s_t[:, :time], s_op)
    s_t = T.set_subtensor(s_t[:, time], d_t)

    ############################################################
    # Equation 3
    if stack:
        s_op = T.cumsum(s_t[:, 1:time + 1][:, ::-1], axis=1)  # size t-1
        s_op = s_op[:, ::-1]
        # left padding
        input_shape = s_op.shape
        output_shape = (input_shape[0], input_shape[1] + 1)
        output = T.zeros(output_shape)
        s_op = T.set_subtensor(output[:, :input_shape[1]], s_op)
    else:
        s_op = T.cumsum(s_t[:, :time], axis=1)  # size t-1
        # left padding
        input_shape = s_op.shape
        output_shape = (input_shape[0], input_shape[1] + 1)
        output = T.zeros(output_shape)
        s_op = T.set_subtensor(output[:, 1:1 + input_shape[1]], s_op)

    # max operation
    s_op = 1 - s_op
    s_op = T.maximum(s_op, 0)
    # min operation
    s_op = T.minimum(s_t[:, :time + 1], s_op)

    r_t = T.sum(s_op[:, :time + 1].dimshuffle(0, 1, 'x') * V_t[:, :time + 1, :], axis=1)
    return V_t, s_t, r_t
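# Toy numpy illustration of the strength update in _update_neural_stack
# (the stack=True branch of equation 2): popping with strength u removes
# mass from the most recent (topmost) entries first.
import numpy as np

s = np.array([0.5, 0.4, 0.3])  # strengths, index 2 = top of the stack
u = 0.6                        # pop strength
above = np.concatenate([np.cumsum(s[1:][::-1])[::-1], [0.0]])  # strength mass above each slot
s_new = np.maximum(s - np.maximum(u - above, 0.0), 0.0)
assert np.allclose(s_new, [0.5, 0.1, 0.0])  # top emptied, next reduced, bottom untouched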