def gradients(cost, parameters, lr=0.001):
    updates = []
    c = 0
    for param in parameters:
        update = param - lr * theano.grad(cost, param)
        if c == 1 or c == 3:
            # update = t.minimum(t.abs_(update), np.pi) * (update / abs(update))
            #
            # update = t.maximum(update, 0)
            # update = t.minimum(update, np.pi)
            update = ifelse(t.lt(update, 0), np.pi * 2 - 0.001, update)
            update = ifelse(t.gt(update, np.pi * 2), 0.001, update)
        if c == 2:
            update = ifelse(t.lt(update, 2), float(20), update)
        elif c == 5 or c == 6:
            update = t.maximum(update, -5)
            update = t.minimum(update, 5)
        updates.append((param, update))
        c += 1
    return updates
def theano_digitize(x, bins):
    """
    Equivalent to numpy digitize.

    Parameters
    ----------
    x : Theano tensor or array_like
        The array or matrix to be digitized
    bins : array_like
        The bins with which x should be digitized

    Returns
    -------
    A Theano tensor
        The indices of the bins to which each value in input array belongs.
    """
    binned = T.zeros_like(x) + len(bins)
    for i in range(len(bins)):
        bin = bins[i]
        if i == 0:
            binned = T.switch(T.lt(x, bin), i, binned)
        else:
            ineq = T.and_(T.ge(x, bins[i - 1]), T.lt(x, bin))
            binned = T.switch(ineq, i, binned)
    binned = T.switch(T.isnan(x), len(bins), binned)
    return binned
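# Usage sketch for theano_digitize above (illustrative inputs; assumes the
# module-level `T = theano.tensor` alias used by the function).
import numpy as np
import theano
import theano.tensor as T

x = T.dvector('x')
bins = [0.0, 1.0, 2.0, 3.0]
digitize_fn = theano.function([x], theano_digitize(x, bins))

data = np.array([-0.5, 0.2, 1.7, 2.9, 5.0])
print(digitize_fn(data))        # bin index per element (returned as floats here)
print(np.digitize(data, bins))  # indices should agree for these finite inputs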
def irprop_minus_updates(params, grads):
    # IRPROP- parameters
    updates = []
    positiveStep = 1.2
    negativeStep = 0.5
    maxStep = 50.
    minStep = math.exp(-6)
    for param, gparam in zip(params, grads):
        # Per-parameter state (previous gradient and step size) must live in
        # shared variables so it persists between calls of the compiled function.
        last_gparam = theano.shared(numpy.zeros_like(param.get_value()))
        delta = theano.shared(0.1 * numpy.ones_like(param.get_value()))

        # calculate change; use elementwise T.switch instead of Python `if`,
        # which cannot branch on symbolic comparisons
        change = T.sgn(gparam * last_gparam)
        new_delta = T.clip(
            T.switch(T.gt(change, 0), delta * positiveStep,
                     T.switch(T.lt(change, 0), delta * negativeStep, delta)),
            minStep, maxStep)
        # IRPROP-: forget the previous gradient where the sign flipped
        new_last_gparam = T.switch(T.lt(change, 0), 0. * gparam, gparam)

        # update the weights
        updates.append((param, param - T.sgn(gparam) * new_delta))
        # store the step size and the old gradient
        updates.append((delta, new_delta))
        updates.append((last_gparam, new_last_gparam))
    return updates
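# Minimal usage sketch for irprop_minus_updates above (target values are
# illustrative; assumes `theano`, `theano.tensor as T` and `numpy` as in this module).
import numpy
import theano
import theano.tensor as T

w = theano.shared(numpy.array([3.0, -2.0]), name='w')
cost = T.sum((w - numpy.array([1.0, 1.0])) ** 2)
grads = T.grad(cost, [w])
train = theano.function([], cost, updates=irprop_minus_updates([w], grads))
for _ in range(100):
    train()
print(w.get_value())  # should end up close to [1., 1.]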
def _backward_negative_z(inputs, weights, normed_relevances, bias=None): inputs_plus = inputs * T.gt(inputs, 0) weights_plus = weights * T.gt(weights, 0) inputs_minus = inputs * T.lt(inputs, 0) weights_minus = weights * T.lt(weights, 0) # Compute weights+ * inputs- and weights- * inputs+ negative_part_a = conv2d( normed_relevances, weights_plus.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1], border_mode="full" ) negative_part_a *= inputs_minus negative_part_b = conv2d( normed_relevances, weights_minus.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1], border_mode="full" ) negative_part_b *= inputs_plus together = negative_part_a + negative_part_b if bias is not None: bias_negative = bias * T.lt(bias, 0) bias_relevance = bias_negative.dimshuffle("x", 0, "x", "x") * normed_relevances # Divide bias by weight size before convolving back # mean across channel, 0, 1 dims (hope this is correct?) fraction_bias = bias_relevance / T.prod(weights.shape[1:]).astype(theano.config.floatX) bias_rel_in = conv2d( fraction_bias, T.ones_like(weights).dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1], border_mode="full" ) together += bias_rel_in return together
def build_update(self, alpha=0.01, beta=0.0): W = self.W lambda_mult=self.lambda_mult y=self.y C = self.C lower_bound = theano.shared(np.float32(0.0)) updates = build_gradDescent_step(W, lambda_mult, alpha,beta) updatelambda_mult = updates[1] # \Longleftrightarrow <<===>> \lambda_i'(t+1) updatelambda_mult = updatelambda_mult - T.dot(y,updatelambda_mult)/T.dot(y,y) * y # Longleftrightarrow <<===>> \lambda_i''(t+1) # use theano.tensor.switch because we need an elementwise comparison # if \lambda_I''(t+1)> C, C updatelambda_mult = T.switch( T.lt( C , updatelambda_mult), C, updatelambda_mult) updatelambda_mult = T.switch( T.lt( updatelambda_mult,lower_bound), lower_bound, updatelambda_mult) updatelambda_mult = sandbox.cuda.basic_ops.gpu_from_host( updatelambda_mult) updatefunction = theano.function(inputs=[], outputs = W, updates=[(lambda_mult, updatelambda_mult)]) self._update_lambda_mult_graph = updatelambda_mult self.update_function = updatefunction return updatelambda_mult, updatefunction
def __init__(self, x, lower, upper, *args, **kwargs):
    super(Uniform, self).__init__(*args, **kwargs)
    # log-density: -inf outside [lower, upper], log(1 / (upper - lower)) inside
    self._logp = T.log(T.switch(T.gt(x, upper), 0,
                                T.switch(T.lt(x, lower), 0, 1 / (upper - lower))))
    # CDF: 0 below `lower`, 1 above `upper`, linear in between
    self._cdf = T.switch(T.gt(x, upper), 1,
                         T.switch(T.lt(x, lower), 0, (x - lower) / (upper - lower)))
    self._add_expr('x', x)
    self._add_expr('lower', lower)
    self._add_expr('upper', upper)
def _recursive_step(self, i, regs, tokens, seqs, back_routes, back_lens): seq = seqs[i] # Encoding left, right, target = seq[0], seq[1], seq[2] left_rep = ifelse(T.lt(left, 0), tokens[-left], regs[left]) right_rep = ifelse(T.lt(right, 0), tokens[-right], regs[right]) rep = self._encode_computation(left_rep, right_rep) if self.deep: inter_rep = rep rep = self._deep_encode(inter_rep) else: inter_rep = T.constant(0) new_regs = T.set_subtensor(regs[target], rep) back_len = back_lens[i] back_reps, lefts, rights = self._unfold(back_routes[i], new_regs, back_len) gf_W_d1, gf_W_d2, gf_B_d1, gf_B_d2, distance, rep_gradient = self._unfold_gradients(back_reps, lefts, rights, back_routes[i], tokens, back_len) return ([rep, inter_rep, left_rep, right_rep, new_regs, rep_gradient, distance], self.decode_optimizer.setup([self.W_d1, self.W_d2, self.B_d1, self.B_d2], [gf_W_d1, gf_W_d2, gf_B_d1, gf_B_d2], method=self.optimization, beta=self.beta))
def tnormal_icdf(size, avg, std, lbound, ubound, theano_rng, dtype):
    """
    Alternative Method:
    sample = -Phi_inv(Phi(-lbound)*(1-u) + Phi(-ubound)*u)
    """

    def Phi(x):
        erfarg = (x - avg) / (std * SQRT2)
        rval = 0.5 * (1. + T.erf(erfarg))
        return rval.astype(dtype)

    def Phi_inv(y, eps=3e-8):
        """ eps was calibrated for cublas.erfinv using float32 """
        temp = 2. * y - 1.
        erfinv_input = T.clip(temp, -1 + eps, 1 - eps)
        rval = avg + std * SQRT2 * T.erfinv(erfinv_input)
        return rval.astype(dtype)

    # center lower and upper bounds based on mean
    u = theano_rng.uniform(size=size, dtype=dtype)

    # Inverse CDF method. When the method becomes numerically unstable, we simply
    # return the bounds based on whether avg < lbound, or ubound < avg.
    cdf_range = Phi(ubound) - Phi(lbound)
    sample = T.switch(
        T.or_(
            T.lt(cdf_range, 3e-8),
            T.gt(cdf_range, 1 - 3e-8)),
        T.switch(T.lt(avg, lbound), lbound, ubound),
        Phi_inv(Phi(lbound) + u * cdf_range))
    return sample
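# Sampling sketch for tnormal_icdf above (assumes the module defines
# SQRT2 = numpy.sqrt(2.) as the function requires; sizes and bounds are illustrative).
import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

rng = MRG_RandomStreams(seed=1234)
samples = tnormal_icdf(size=(1000,), avg=0., std=1., lbound=-1., ubound=2.,
                       theano_rng=rng, dtype='float32')
draw = theano.function([], samples)
vals = draw()
print(vals.min(), vals.max())  # both should fall inside the truncation interval [-1, 2]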
def generate_subpop_input(r_E, r_I, n_pairs): c = T.scalar("c", dtype='float32') h = T.matrix("h", dtype='float32') W_EE = T.tensor3("W_EE", dtype='float32') W_EI = T.tensor3("W_EI", dtype='float32') W_IE = T.tensor3("W_IE", dtype='float32') W_II = T.tensor3("W_II", dtype='float32') r_e = T.matrix("r_e", dtype='float32') r_i = T.matrix("r_i", dtype='float32') I_E = T.matrix('I_E', dtype='float32') I_I = T.matrix('I_I', dtype='float32') I_thresh_E = T.matrix('I_thresh_E', dtype='float32') I_thresh_I = T.matrix('I_thresh_I', dtype='float32') # Compile functions: I_E = c*h + T.sum(T.sum(W_EE*r_e,1),1).reshape((n_pairs, n_pairs)).T - T.sum(T.sum(W_EI*r_i,1),1).reshape((n_pairs, n_pairs)).T I_I = c*h + T.sum(T.sum(W_IE*r_e,1),1).reshape((n_pairs, n_pairs)).T - T.sum(T.sum(W_II*r_i,1),1).reshape((n_pairs, n_pairs)).T I_thresh_E = T.switch(T.lt(I_E,0), 0, I_E) I_thresh_I = T.switch(T.lt(I_I,0), 0, I_I) inputs = theano.function(inputs=[c,h,W_EE,W_EI,W_IE,W_II], outputs=[I_thresh_E, I_thresh_I], givens={r_e:r_E, r_i:r_I}, allow_input_downcast=True) return inputs
def __init__(self, random_state=None, low=0.0, high=1.0): super(Uniform, self).__init__(low=low, high=high, random_state=random_state, optimizer=None) # pdf self.pdf_ = T.switch( T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)), 0., 1. / (self.high - self.low)).ravel() self.make_(self.pdf_, "pdf") # -log pdf self.nnlf_ = T.switch( T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)), np.inf, T.log(self.high - self.low)).ravel() self.make_(self.nnlf_, "nnlf") # cdf self.cdf_ = T.switch( T.lt(self.X, self.low), 0., T.switch( T.lt(self.X, self.high), (self.X - self.low) / (self.high - self.low), 1.)).ravel() self.make_(self.cdf_, "cdf") # ppf self.ppf_ = self.p * (self.high - self.low) + self.low self.make_(self.ppf_, "ppf", args=[self.p])
def interval_reduction(a, b, c, d, tol):
    fc = f(c)
    fd = f(d)

    a, b, c, d = ifelse(T.lt(fc, fd),
                        [a, d, d - golden_ratio * (d - a), c],
                        [c, b, d, c + golden_ratio * (b - c)])

    stoprule = theano.scan_module.until(T.lt(T.abs_(c - d), tol))

    return [a, b, c, d], stoprule
def rprop(param,learning_rate,gparam,mask,updates,current_cost,previous_cost, eta_plus=1.2,eta_minus=0.5,max_delta=50, min_delta=10e-6): previous_grad = sharedX(numpy.ones(param.shape.eval()),borrow=True) delta = sharedX(learning_rate * numpy.ones(param.shape.eval()),borrow=True) previous_inc = sharedX(numpy.zeros(param.shape.eval()),borrow=True) zero = T.zeros_like(param) one = T.ones_like(param) change = previous_grad * gparam new_delta = T.clip( T.switch( T.eq(gparam,0.), delta, T.switch( T.gt(change,0.), delta*eta_plus, T.switch( T.lt(change,0.), delta*eta_minus, delta ) ) ), min_delta, max_delta ) new_previous_grad = T.switch( T.eq(mask * gparam,0.), previous_grad, T.switch( T.gt(change,0.), gparam, T.switch( T.lt(change,0.), zero, gparam ) ) ) inc = T.switch( T.eq(mask * gparam,0.), zero, T.switch( T.gt(change,0.), - T.sgn(gparam) * new_delta, T.switch( T.lt(change,0.), zero, - T.sgn(gparam) * new_delta ) ) ) updates.append((previous_grad,new_previous_grad)) updates.append((delta,new_delta)) updates.append((previous_inc,inc)) return param + inc * mask
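# Usage sketch for rprop above (illustrative; assumes sharedX wraps numpy arrays
# in shared variables of dtype theano.config.floatX, as is conventional).
import numpy
import theano
import theano.tensor as T

w = theano.shared(numpy.ones(5, dtype=theano.config.floatX), name='w')
cost = T.sum(w ** 2)
g = T.grad(cost, w)
updates = []
new_w = rprop(w, 0.01, g, T.ones_like(w), updates, cost, cost)
updates.append((w, new_w))
train = theano.function([], cost, updates=updates)
for _ in range(50):
    train()
print(w.get_value())  # entries should have moved towards zero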
def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic or self.rate == 0:
        return input
    else:
        drop = self._srng.uniform(input.shape)
        # Drop units to 0 with probability rate/2 and to 1 with probability rate/2.
        z = T.lt(drop, 0.5 * self.rate)
        o = T.lt(T.abs_(drop - 0.75 * self.rate), 0.25 * self.rate)
        input = T.set_subtensor(input[z.nonzero()], 0.)
        input = T.set_subtensor(input[o.nonzero()], 1.)
        return input
def berhu(predictions, targets, s=0.2, l=0.5, m=1.2):
    # Compute mask
    mask = T.gt(targets, l) * T.lt(targets, m)
    # Compute n of valid pixels
    n_valid = T.sum(mask)
    # Redundant mult here
    r = (predictions - targets) * mask
    c = s * T.max(T.abs_(r))
    a_r = T.abs_(r)
    b = T.switch(T.lt(a_r, c), a_r, ((r ** 2) + (c ** 2)) / (2 * c))
    return T.sum(b) / n_valid
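# Quick evaluation sketch for the berHu loss above (shapes are illustrative; with
# the default thresholds only targets inside (l, m) = (0.5, 1.2) count as valid).
import numpy as np
import theano
import theano.tensor as T

pred = T.dmatrix('pred')
target = T.dmatrix('target')
berhu_fn = theano.function([pred, target], berhu(pred, target))

rng = np.random.RandomState(0)
print(berhu_fn(rng.rand(4, 4), rng.rand(4, 4)))  # scalar loss averaged over the valid pixels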
def _bpts_step(self, i, gradient_reg, seqs, reps, inter_reps, left_subreps, right_subreps, rep_gradients): # BPTS seq = seqs[i] left, right, target = seq[0], seq[1], seq[2] left_is_token = T.lt(left, 0) right_is_token = T.lt(right, 0) bpts_gradient = gradient_reg[target] rep_gradient = rep_gradients[i] + bpts_gradient if self.deep: # Implementation note: # As the gradient of deep encoding func wrt W_ee includes the input representation. # If we let T.grad to find that input representation directly, it will stuck in an infinite loop. # So we must use SRG in this case. _fake_input_rep, = make_float_vectors("_fake_input_rep") deep_rep = self._deep_encode(_fake_input_rep) node_map = {deep_rep: reps[i], _fake_input_rep: inter_reps[i]} g_wee = SRG(T.grad(T.sum(deep_rep), self.W_ee), node_map) * rep_gradient g_bee = SRG(T.grad(T.sum(deep_rep), self.B_ee), node_map) * rep_gradient g_inter_rep = SRG(T.grad(T.sum(deep_rep), _fake_input_rep), node_map) * rep_gradient inter_rep = inter_reps[i] else: g_wee = T.constant(0) g_bee = T.constant(0) g_inter_rep = rep_gradient inter_rep = reps[i] # Accelerate computation by using saved internal values. # For the limitation of SRG, known_grads can not be used here. _fake_left_rep, _fake_right_rep = make_float_vectors("_fake_left_rep", "_fake_right_rep") rep_node = self._encode_computation(_fake_left_rep, _fake_right_rep) if self.deep: rep_node = self._deep_encode(rep_node) node_map = {_fake_left_rep: left_subreps[i], _fake_right_rep: right_subreps[i], rep_node: inter_rep} g_we1 = SRG(T.grad(T.sum(rep_node), self.W_e1), node_map) * g_inter_rep g_we2 = SRG(T.grad(T.sum(rep_node), self.W_e2), node_map) * g_inter_rep g_be = SRG(T.grad(T.sum(rep_node), self.B_e), node_map) * g_inter_rep g_left_p = SRG(T.grad(T.sum(rep_node), _fake_left_rep), node_map) * g_inter_rep g_right_p = SRG(T.grad(T.sum(rep_node), _fake_right_rep), node_map) * g_inter_rep gradient_reg = ifelse(left_is_token, gradient_reg, T.set_subtensor(gradient_reg[left], g_left_p)) gradient_reg = ifelse(right_is_token, gradient_reg, T.set_subtensor(gradient_reg[right], g_right_p)) return g_we1, g_we2, g_be, g_wee, g_bee, gradient_reg
def _step( i, pkm1, pkm2, qkm1, qkm2, k1, k2, k3, k4, k5, k6, k7, k8, r ): xk = -(x * k1 * k2) / (k3 * k4) pk = pkm1 + pkm2 * xk qk = qkm1 + qkm2 * xk pkm2 = pkm1 pkm1 = pk qkm2 = qkm1 qkm1 = qk xk = (x * k5 * k6) / (k7 * k8) pk = pkm1 + pkm2 * xk qk = qkm1 + qkm2 * xk pkm2 = pkm1 pkm1 = pk qkm2 = qkm1 qkm1 = qk old_r = r r = tt.switch(tt.eq(qk, zero), r, pk/qk) k1 += one k2 += k26update k3 += two k4 += two k5 += one k6 -= k26update k7 += two k8 += two big_cond = tt.gt(tt.abs_(qk) + tt.abs_(pk), BIG) biginv_cond = tt.or_( tt.lt(tt.abs_(qk), BIGINV), tt.lt(tt.abs_(pk), BIGINV) ) pkm2 = tt.switch(big_cond, pkm2 * BIGINV, pkm2) pkm1 = tt.switch(big_cond, pkm1 * BIGINV, pkm1) qkm2 = tt.switch(big_cond, qkm2 * BIGINV, qkm2) qkm1 = tt.switch(big_cond, qkm1 * BIGINV, qkm1) pkm2 = tt.switch(biginv_cond, pkm2 * BIG, pkm2) pkm1 = tt.switch(biginv_cond, pkm1 * BIG, pkm1) qkm2 = tt.switch(biginv_cond, qkm2 * BIG, qkm2) qkm1 = tt.switch(biginv_cond, qkm1 * BIG, qkm1) return ((pkm1, pkm2, qkm1, qkm2, k1, k2, k3, k4, k5, k6, k7, k8, r), until(tt.abs_(old_r - r) < (THRESH * tt.abs_(r))))
def rebuild(self): for i, (inputs, f) in enumerate(self.wiring): if not inputs: continue lin_comb = T.dot(T.concatenate([self._vlayers[j] for j in inputs], axis=1), self._vweights[i]) add_biases = lin_comb + self._vbiases[i] self._vlayers[i] = f(add_biases) self._output = T.concatenate([self._vlayers[j] for j in self.output_layers], axis=1) self._targets = [T.matrix() for j in self.output_layers] crossentropy = sum([(T.nnet.categorical_crossentropy(self._vlayers[j], self._targets[i]) if self.wiring[j][1] == SOFTMAX_FUN else ((self._vlayers[j] - self._targets[i]) ** 2 / (1+self._targets[i].max())**2).sum()) for i, j in enumerate(self.output_layers) ]) self._cost = (crossentropy.sum() + self.L2REG/(self.layers[i]) * sum((weight**2).sum() for weight in self._vweights if weight is not None)+ # + # L2 regularization 0.01* self.L2REG/math.sqrt(self.layers[i]) * sum((bias**2).sum() for j, bias in enumerate(self._vbiases) if bias is not None and self.wiring[j][1] != LINEAR_FUN)) # L2 regularization self._costnoreg = crossentropy.sum() self._derivatives = [None] * len(self.layers) self._updates = [] MAX_DERIV = 1000 for i, (inputs, f) in enumerate(self.wiring): if not inputs: continue deriv1 = T.grad(self._cost, self._vweights[i]) deriv1p = T.switch(T.lt(deriv1, MAX_DERIV), deriv1, MAX_DERIV) deriv1pp = T.switch(T.gt(deriv1p, -MAX_DERIV), deriv1p, -MAX_DERIV) #deriv1ppp = T.switch(T.isnan(deriv1pp), 0, deriv1pp) deriv2 = T.grad(self._cost, self._vbiases[i]) deriv2p = T.switch(T.lt(deriv2, MAX_DERIV), deriv2, MAX_DERIV) deriv2pp = T.switch(T.gt(deriv2p, -MAX_DERIV), deriv2p, -MAX_DERIV) #deriv2ppp = T.switch(T.isnan(deriv2pp), 0, deriv2pp) self._derivatives[i] = (deriv1pp, deriv2pp) self._updates.append((self._vweights[i], self._vweights[i] - self.learning_rate * self._derivatives[i][0])) self._updates.append((self._vbiases[i], self._vbiases[i] - self.learning_rate * self._derivatives[i][1])) self._prediction = theano.function(inputs=[self._vlayers[i] for i in self.input_layers], outputs=self._output) self._train = theano.function(inputs=self._targets+[self._vlayers[i] for i in self.input_layers], outputs=self._cost, updates=self._updates, allow_input_downcast=True) #mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) # debug NaN self._costfun = theano.function(inputs=self._targets+[self._vlayers[i] for i in self.input_layers], outputs=self._costnoreg, allow_input_downcast=True)
def _forward_negative_z(inputs, weights, bias=None):
    inputs_plus = inputs * T.gt(inputs, 0)
    weights_plus = weights * T.gt(weights, 0)
    inputs_minus = inputs * T.lt(inputs, 0)
    weights_minus = weights * T.lt(weights, 0)
    negative_part_a = conv2d(inputs_plus, weights_minus)
    negative_part_b = conv2d(inputs_minus, weights_plus)
    together = negative_part_a + negative_part_b
    if bias is not None:
        bias_negative = bias * T.lt(bias, 0)
        together += bias_negative.dimshuffle("x", 0, "x", "x")
    return together
def relevance_conv_a_b_sign_switch(inputs, weights, out_relevances, a, b, bias=None): assert a is not None assert b is not None assert a - b == 1 # For each input, determine what outputs = conv2d(inputs, weights) if bias is not None: outputs += bias.dimshuffle("x", 0, "x", "x") # do not use bias further, only to determine direction of outputs bias = None # stabilize # prevent division by 0 and division by small numbers eps = 1e-4 outputs += T.sgn(outputs) * eps outputs += T.eq(outputs, 0) * eps positive_forward = _forward_positive_z(inputs, weights, bias) negative_forward = _forward_negative_z(inputs, weights, bias) rel_for_positive_outputs = out_relevances * T.gt(outputs, 0) rel_for_negative_outputs = out_relevances * T.lt(outputs, 0) positive_norm_with_trend = positive_forward * T.gt(outputs, 0) negative_norm_with_trend = negative_forward * T.lt(outputs, 0) # minus to make overall norm positive norm_with_trend = positive_norm_with_trend - negative_norm_with_trend # stabilize also norm_with_trend += T.eq(norm_with_trend, 0) * eps in_positive_with_trend = _backward_positive_z(inputs, weights, rel_for_positive_outputs / norm_with_trend, bias) in_negative_with_trend = _backward_negative_z(inputs, weights, rel_for_negative_outputs / norm_with_trend, bias) # Minus in_negative since in_with_trend should not switch signs in_with_trend = in_positive_with_trend - in_negative_with_trend positive_norm_against_trend = positive_forward * T.lt(outputs, 0) negative_norm_against_trend = negative_forward * T.gt(outputs, 0) # minus to make overall norm positive norm_against_trend = positive_norm_against_trend - negative_norm_against_trend # stabilize also norm_against_trend += T.eq(norm_against_trend, 0) * eps in_positive_against_trend = _backward_positive_z( inputs, weights, rel_for_negative_outputs / norm_against_trend, bias ) in_negative_against_trend = _backward_negative_z( inputs, weights, rel_for_positive_outputs / norm_against_trend, bias ) # Minus in_negative since switching signs is done below in_against_trend = in_positive_against_trend - in_negative_against_trend in_relevances = a * in_with_trend - b * in_against_trend return in_relevances
def cubicBSpline(self, L):
    b = T.zeros_like(L)

    idx4 = T.ge(L, 0) * T.lt(L, 1)
    idx3 = T.ge(L, 1) * T.lt(L, 2)
    idx2 = T.ge(L, 2) * T.lt(L, 3)
    idx1 = T.ge(L, 3) * T.le(L, 4)

    b = T.switch(T.eq(idx4, 1), T.pow(L, 3) / 6, b)
    b = T.switch(T.eq(idx3, 1), (-3 * T.pow(L - 1, 3) + 3 * T.pow(L - 1, 2) + 3 * (L - 1) + 1) / 6, b)
    b = T.switch(T.eq(idx2, 1), (3 * T.pow(L - 2, 3) - 6 * T.pow(L - 2, 2) + 4) / 6, b)
    b = T.switch(T.eq(idx1, 1), (-T.pow(L - 3, 3) + 3 * T.pow(L - 3, 2) - 3 * (L - 3) + 1) / 6, b)

    return b.T  # b is K x K' and thus, as we multiply from the right with
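# Evaluation sketch for cubicBSpline above (the method never touches `self`, so
# for illustration we call it with None; grid values are arbitrary points in [0, 4]).
import numpy as np
import theano
import theano.tensor as T

L = T.dmatrix('L')
spline = theano.function([L], cubicBSpline(None, L))

grid = np.linspace(0., 4., 9).reshape(1, -1)  # the cubic B-spline is supported on [0, 4]
print(spline(grid))                           # basis values, transposed to shape (9, 1)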
def learning_updates(self): step = self.learning_rate self.grads = [] self.steps = [] for param in self.params: v = param.get_value() n = param.name self.grads.append(theano.shared(np.zeros_like(v), name=n + '_grad')) self.steps.append( theano.shared(np.zeros_like(v) + step, name=n + '_step')) for param, step_tm1, grad_tm1 in zip(self.params, self.steps, self.grads): grad = TT.grad(self.J, param) test = grad * grad_tm1 same = TT.gt(test, 0) diff = TT.lt(test, 0) step = TT.minimum( self.max_step, TT.maximum( self.min_step, step_tm1 * (TT.eq(test, 0) + same * self.step_increase + diff * self.step_decrease))) grad = grad - diff * grad yield param, param - TT.sgn(grad) * step yield grad_tm1, grad yield step_tm1, step
def __init__(self, config, loss, params): self._lr = get_shared_floatX(config.learning_rate, 'lr') self._t = get_shared_floatX(1, 't') self._all_m_tm1 = [] self._all_v_tm1 = [] self._updates = [(self._t, self._t + 1)] if config.lr_decay: lr_coef = tt.pow(config.lr_decay, (self._t - 1) // config.lr_decay_freq) self._updates.append((self._lr, lr_coef * config.learning_rate)) grads = theano.grad(loss, params) self._global_grad_norm = tt.sqrt(tt.sum(tt.stack([tt.sum(g**2.) for g in grads]))) if config.max_grad_norm: global_clip_factor = ifelse(tt.lt(self._global_grad_norm, config.max_grad_norm), cast_floatX_np(1.), cast_floatX(config.max_grad_norm/self._global_grad_norm)) grads = [global_clip_factor * g for g in grads] lr_t = self._lr * \ clip_sqrt(1 - tt.pow(config.adam_beta2, self._t)) / (1 - tt.pow(config.adam_beta1, self._t)) for p, g in zip(params, grads): m_tm1 = get_shared_floatX(np.zeros_like(p.get_value()), 'adam_m_' + p.name) v_tm1 = get_shared_floatX(np.zeros_like(p.get_value()), 'adam_v_' + p.name) self._all_m_tm1.append(m_tm1) self._all_v_tm1.append(v_tm1) m_t = config.adam_beta1 * m_tm1 + (1-config.adam_beta1) * g v_t = config.adam_beta2 * v_tm1 + (1-config.adam_beta2) * tt.sqr(g) delta_t = -lr_t * m_t / (clip_sqrt(v_t) + config.adam_eps) p_t = p + delta_t self._updates += [(m_tm1, m_t), (v_tm1, v_t), (p, p_t)]
def init_train_updates(self): network_output = self.variables.network_output prediction_func = self.variables.train_prediction_func last_error = self.variables.last_error error_func = self.variables.error_func mu = self.variables.mu new_mu = ifelse( T.lt(last_error, error_func), mu * self.mu_update_factor, mu / self.mu_update_factor, ) mse_for_each_sample = T.mean((network_output - prediction_func)**2, axis=1) params = list(iter_parameters(self)) param_vector = parameters2vector(self) J = compute_jaccobian(mse_for_each_sample, params) n_params = J.shape[1] updated_params = param_vector - T.nlinalg.matrix_inverse( J.T.dot(J) + new_mu * T.eye(n_params)).dot( J.T).dot(mse_for_each_sample) updates = [(mu, new_mu)] parameter_updates = setup_parameter_updates(params, updated_params) updates.extend(parameter_updates) return updates
def normal_lcdf(mu, sigma, x):
    z = (x - mu) / sigma
    return tt.switch(
        tt.lt(z, -1.0),
        tt.log(tt.erfcx(-z / tt.sqrt(2.)) / 2.) - tt.sqr(z) / 2.,
        tt.log1p(-tt.erfc(z / tt.sqrt(2.)) / 2.)
    )
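# Numerical check sketch for normal_lcdf above (assumes the `tt = theano.tensor`
# alias used by the function; points are illustrative).
import numpy as np
import theano
import theano.tensor as tt
from scipy import stats

x = tt.dvector('x')
lcdf = theano.function([x], normal_lcdf(0., 1., x))

pts = np.array([-8., -1.5, 0., 2.])
print(lcdf(pts))               # far-left tail handled by the erfcx branch
print(stats.norm.logcdf(pts))  # SciPy reference values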
def incomplete_beta(a, b, value): '''Incomplete beta implementation Power series and continued fraction expansions chosen for best numerical convergence across the board based on inputs. ''' machep = tt.constant(np.MachAr().eps, dtype='float64') one = tt.constant(1, dtype='float64') w = one - value ps = incomplete_beta_ps(a, b, value) flip = tt.gt(value, (a / (a + b))) aa, bb = a, b a = tt.switch(flip, bb, aa) b = tt.switch(flip, aa, bb) xc = tt.switch(flip, value, w) x = tt.switch(flip, w, value) tps = incomplete_beta_ps(a, b, x) tps = tt.switch(tt.le(tps, machep), one - machep, one - tps) # Choose which continued fraction expansion for best convergence. small = tt.lt(x * (a + b - 2.0) - (a - one), 0.0) cfe = incomplete_beta_cfe(a, b, x, small) w = tt.switch(small, cfe, cfe / xc) # Direct incomplete beta accounting for flipped a, b. t = tt.exp(a * tt.log(x) + b * tt.log(xc) + gammaln(a + b) - gammaln(a) - gammaln(b) + tt.log(w / a)) t = tt.switch(flip, tt.switch(tt.le(t, machep), one - machep, one - t), t) return tt.switch( tt.and_(flip, tt.and_(tt.le((b * x), one), tt.le(x, 0.95))), tps, tt.switch(tt.and_(tt.le(b * value, one), tt.le(value, 0.95)), ps, t))
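# Sanity-check sketch for incomplete_beta above (assumes its helpers
# incomplete_beta_ps, incomplete_beta_cfe and gammaln are available in this
# module, as the function requires; the arguments are illustrative).
import theano
import theano.tensor as tt
from scipy import special

a, b, v = tt.dscalars('a', 'b', 'v')
ibeta = theano.function([a, b, v], incomplete_beta(a, b, v))

print(ibeta(2.0, 3.0, 0.4))            # regularized incomplete beta I_0.4(2, 3)
print(special.betainc(2.0, 3.0, 0.4))  # SciPy reference value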
def apply(self, application_call, defs, def_mask): """ Returns vector per each word in sequence using the dictionary based lookup """ # Short listing defs = (T.lt(defs, self._num_input_words) * defs + T.ge(defs, self._num_input_words) * self._vocab.unk) # Memory bottleneck: # For instance (16101,52,300) ~= 32GB. # [(16786, 52, 1), (16786, 52, 100)] # TODO: Measure memory consumption here and check if it is in sensible range # or maybe introduce some control in Retrieval? defs_emb = self._def_lookup.apply(defs) application_call.add_auxiliary_variable( unk_ratio(defs, def_mask, self._vocab.unk), name='def_unk_ratio') if self._translate: logger.info("Translating in MeanPoolReadDefinitions") # Translate. Crucial for recovering useful information from embeddings defs_emb = self._def_translate.apply(defs_emb) def_emb_mask = def_mask[:, :, None] defs_emb = (def_emb_mask * defs_emb).sum(axis=1) if self._normalize: defs_emb = defs_emb / def_emb_mask.sum(axis=1) return defs_emb
def output_probabilistic(self, m_x, v_x): m_linear = T.dot(m_x, self.m_W[ 0, :, : ]) + T.tile(self.m_b[ 0, :, : ], [ m_x.shape[ 0 ], 1 ]) v_linear = T.dot(m_x**2, self.v_W[ 0, :, : ]) + T.dot(v_x, self.m_W[ 0, :, : ]**2) + T.dot(v_x, self.v_W[ 0, :, : ]) + \ T.tile(self.v_b[ 0, :, : ], [ m_x.shape[ 0 ], 1 ]) if not self.output_layer: # We compute the mean and variance after the ReLU activation alpha = m_linear / T.sqrt(v_linear) gamma = Network_layer.gamma(-alpha) gamma_robust = -alpha - 1.0 / alpha + 2.0 / alpha**3 gamma_final = T.switch(T.lt(-alpha, T.fill(alpha, 30)), gamma, gamma_robust) v_aux = m_linear + T.sqrt(v_linear) * gamma_final m_a = Network_layer.n_cdf(alpha) * v_aux v_a = m_a * v_aux * Network_layer.n_cdf(-alpha) + Network_layer.n_cdf(alpha) * v_linear * (1 - gamma_final * (gamma_final + alpha)) return (m_a, v_a) else: return (m_linear, v_linear)
def normal_lcdf(mu, sigma, x):
    z = (x - mu) / sigma
    return tt.switch(
        tt.lt(z, -1.0),
        tt.log(tt.erfcx(-z / tt.sqrt(2.)) / 2.) - tt.sqr(z) / 2,
        tt.log1p(-tt.erfc(z / tt.sqrt(2.)) / 2.)
    )
def apply(self, v, **kwargs): input = v.output z = T.mean(input) stdev = T.std(input) nv = vcopy(v) histogram = [] buckets = self.get_buckets() for beg, end in buckets: a = T.ge(input, beg) b = T.lt(input, end) percent = T.sum(a * b) / T.prod(input.shape).astype(floatX) histogram.append(percent) r = { 'name': self.name, 'mean': z, 'stdev': stdev, 'histogram': histogram } if 'activation_monitoring' in nv: nv.activation_monitoring.append(r) else: nv.activation_monitoring = [r] return self.post_apply(nv, **kwargs)
def get_embed_sampled(embed_tensor, sample=False):
    if not sample:
        randomization = 0.5
    else:
        randomization = theano_rng.uniform(size=embed_tensor.shape)
    return T.switch(T.lt(randomization, embed_tensor), 1.0, 0.0)  # (val, dim)
def piecewisePooling_feed(self, new_input):
    # mentions_batch = n_sentences x 1 x 88 x 60
    # e1i_batch = n_sentences x 1
    mentions_batch, e1i_batch, e2i_batch = new_input

    # conv
    # input  = n_sentences x 1 x 88 x 60
    # filter = 230 x 1 x 3 x 60
    self.conv_out = conv.conv2d(input=mentions_batch, filters=self.W,
                                filter_shape=self.filter_shape,
                                image_shape=self.image_shape)
    # conv_out      = n_sentences x 230 x 86 x 1
    # nonlinear_out = n_sentences x 230 x 86 x 1
    if self.non_linear.lower() == "tanh":
        # b is 0
        self.nonlinear_out = T.tanh(self.conv_out + self.b.dimshuffle('x', 0, 'x', 'x'))
    elif self.non_linear.lower() == "relu":
        self.nonlinear_out = T.nnet.relu(self.conv_out + self.b.dimshuffle('x', 0, 'x', 'x'))
    else:
        raise NotImplementedError

    # pooling
    # filter_h = 3
    filter_h = self.filter_shape[2]
    # n_pad_head = 4
    n_pad_head = conf.getint('settings', 'max_filter_h') - 1
    assert n_pad_head == 4
    # numpy.floor rounds down
    # either way, the indices end up shifted by 3 positions
    idx_shift = n_pad_head - int(numpy.floor(filter_h / 2))
    # how far e1i and e2i (relative to the unpadded sentence) are shifted
    # after padding and convolution
    e1i_conved = e1i_batch + idx_shift
    e2i_conved = e2i_batch + idx_shift
    # for each sentence, get the left entity position + 1 and the right entity position + 1
    [m_seg2_st_batch, m_seg3_st_batch], _ = \
        theano.scan(fn=lambda e1i, e2i: ifelse(T.lt(e1i, e2i),
                                               (e1i + 1, e2i + 1),
                                               (e2i + 1, e1i + 1)),
                    sequences=[e1i_conved, e2i_conved])
    nonlinear_out_3d = self.nonlinear_out.flatten(3)

    def piecewise_pooling(conved_m, m_seg2_st, m_seg3_st):
        seg1_out = T.max(conved_m[:, :m_seg2_st], axis=1)
        seg2_out = T.max(conved_m[:, m_seg2_st:m_seg3_st], axis=1)
        seg3_out = T.max(conved_m[:, m_seg3_st:], axis=1)
        return T.transpose(T.stack((seg1_out, seg2_out, seg3_out))).flatten()

    # for each sentence, return a flattened 230 x 3 vector
    pooling_2d, _ = theano.scan(fn=piecewise_pooling,
                                sequences=[nonlinear_out_3d, m_seg2_st_batch, m_seg3_st_batch])

    self.input = new_input
    self.output = pooling_2d
def irprop_minus_updates(params, grads):
    # IRPROP- parameters
    updates = []
    positiveStep = 1.2
    negativeStep = 0.5
    maxStep = 50  # 1.
    minStep = math.exp(-6)
    for param, gparam in zip(params, grads):
        # Previous gradient and per-parameter step size, kept as shared state
        # so they persist between calls of the compiled update function.
        last_gparam = theano.shared(
            numpy.zeros_like(param.get_value()).astype('float32'))
        delta = theano.shared(
            (0.1 * numpy.ones_like(param.get_value())).astype('float32'))

        # calculate change; T.switch replaces the Python `if`/`elif`, which
        # cannot branch on symbolic comparisons
        change = T.sgn(gparam * last_gparam)
        new_delta = T.switch(
            T.gt(change, 0), T.minimum(delta * positiveStep, maxStep),
            T.switch(T.lt(change, 0), T.maximum(delta * negativeStep, minStep),
                     delta)).astype('float32')
        # forget the previous gradient where the sign flipped
        new_last_gparam = T.switch(T.lt(change, 0), 0. * gparam, gparam).astype('float32')

        # update the weights
        updates.append((param, param - T.sgn(gparam) * new_delta))
        # store the step size and the old gradient
        updates.append((delta, new_delta))
        updates.append((last_gparam, new_last_gparam))
    return updates
def _span_sums(stt, end, p_lens, max_p_len, batch_size, dim, max_ans_len): # Sum of every start element and corresponding max_ans_len end elements. # # stt (max_p_len, batch_size, dim) # end (max_p_len, batch_size, dim) # p_lens (batch_size,) max_ans_len_range = tt.shape_padleft(tt.arange(max_ans_len)) # (1, max_ans_len) offsets = tt.shape_padright(tt.arange(max_p_len)) # (max_p_len, 1) end_idxs = max_ans_len_range + offsets # (max_p_len, max_ans_len) end_idxs_flat = end_idxs.flatten() # (max_p_len*max_ans_len,) end_padded = tt.concatenate( # (max_p_len+max_ans_len-1, batch_size, dim) [end, tt.zeros((max_ans_len-1, batch_size, dim))], axis=0) end_structured = end_padded[end_idxs_flat] # (max_p_len*max_ans_len, batch_size, dim) end_structured = end_structured.reshape( # (max_p_len, max_ans_len, batch_size, dim) (max_p_len, max_ans_len, batch_size, dim)) stt_shuffled = stt.dimshuffle((0,'x',1,2)) # (max_p_len, 1, batch_size, dim) span_sums = stt_shuffled + end_structured # (max_p_len, max_ans_len, batch_size, dim) span_sums_reshaped = span_sums.dimshuffle((2,0,1,3)).reshape( # (batch_size, max_p_len*max_ans_len, dim) (batch_size, max_p_len*max_ans_len, dim)) p_lens_shuffled = tt.shape_padright(p_lens) # (batch_size, 1) end_idxs_flat_shuffled = tt.shape_padleft(end_idxs_flat) # (1, max_p_len*max_ans_len) span_masks_reshaped = tt.lt(end_idxs_flat_shuffled, p_lens_shuffled) # (batch_size, max_p_len*max_ans_len) span_masks_reshaped = cast_floatX(span_masks_reshaped) # (batch_size, max_p_len*max_ans_len, dim), (batch_size, max_p_len*max_ans_len) return span_sums_reshaped, span_masks_reshaped
def _activation(self, Y, L, M, W): """Returns the activation for a given input. Derived from the generative model formulation of hierarchical Poisson mixtures, the formular for the activation in the network reads as follows: I_c = \sum_d \log(W_{cd})y_d + \log(M_{lc}) for labeled data \sum_d \log(W_{cd})y_d + \log(\sum_k M_{kc}) for unlabeled data s_c = softmax(I_c) """ # first: complete inference to find label # Input integration: I = T.tensordot(Y, T.log(W), axes=[1, 1]) # recurrent term: vM = M[L] L_index = T.eq(L, -1).nonzero() vM = T.set_subtensor(vM[L_index], T.sum(M, axis=0)) # numeric trick to prevent overflow in the exp-function max_exponent = 86. - T.ceil(T.log(I.shape[1].astype('float32'))) scale = T.switch(T.gt(T.max(I, axis=1, keepdims=True), max_exponent), T.max(I, axis=1, keepdims=True) - max_exponent, 0.) # numeric approximation to prevent underflow in the exp-function: # map too low values of I to a fixed minimum value min_exponent = -87. + T.ceil(T.log(I.shape[1].astype('float32'))) I = T.switch(T.lt(I - scale, min_exponent), scale + min_exponent, I) # activation: recurrent softmax with overflow protection s = vM * T.exp(I - scale) / T.sum( vM * T.exp(I - scale), axis=1, keepdims=True) return s
def __init__(self, f, θs, α=0.001, β1=0.9, β2=0.999, β3=0.999, k=0.1, K=10., ε=1e-8, dec=0.): α, β1, β2, β3, ε, dec = [ np.cast[floatX](h) for h in [α, β1, β2, β3, ε, dec] ] t = theano.shared(0, name="t") t_u = (t, t + 1) f_prev = theano.shared(np.cast[floatX](0), name="f_prev") ch_fact_lbound = T.switch(T.gt(f, f_prev), 1 + k, 1 / (1 + K)) ch_fact_ubound = T.switch(T.gt(f, f_prev), 1 + K, 1 / (1 + k)) f_ch_fact = f / f_prev f_ch_fact = T.switch(T.lt(f_ch_fact, ch_fact_lbound), ch_fact_lbound, f_ch_fact) f_ch_fact = T.switch(T.gt(f_ch_fact, ch_fact_ubound), ch_fact_ubound, f_ch_fact) f_hat = T.switch(T.gt(t_u[1], 1), f_prev * f_ch_fact, f) f_u = (f_prev, f_hat) self.ms = [ theano.shared(np.zeros(θ.shape.eval(), dtype=floatX), borrow=True, name="m") for θ in θs ] self.vs = [ theano.shared(np.zeros(θ.shape.eval(), dtype=floatX), borrow=True, name="v") for θ in θs ] d = theano.shared(one, name="d") d_den = T.switch(T.gt(f_hat, f_prev), f_prev, f_hat) d_t = (β3 * d) + (one - β3) * T.abs_((f_hat - f_prev) / d_den) d_t = T.switch(T.gt(t_u[1], one), d_t, one) d_u = (d, d_t) gs = T.grad(f, θs) m_us = [(m, β1 * m + (one - β1) * g) for m, g in zip(self.ms, gs)] m_hats = [m_u[1] / (one - T.pow(β1, t_u[1])) for m_u in m_us] v_us = [(v, β2 * v + (one - β2) * T.sqr(g)) for v, g in zip(self.vs, gs)] v_hats = [v_u[1] / (one - T.pow(β2, t_u[1])) for v_u in v_us] θ_us = [(θ, θ - (α / (one + (t_u[1] * dec))) * m_hat / ((T.sqrt(v_hat) * d_t) + ε)) for θ, m_hat, v_hat in zip(θs, m_hats, v_hats)] self.updates = m_us + v_us + [t_u, f_u, d_u] + θ_us
def __init__(self, inverse_scale=1.0): """Constructor. Parameters ---------- * `inverse_scale` [float]: The inverse scale. """ super(Exponential, self).__init__(inverse_scale=inverse_scale) # pdf self.pdf_ = T.switch( T.lt(self.X, 0.), 0., self.inverse_scale * T.exp(-self.inverse_scale * self.X)).ravel() self._make(self.pdf_, "pdf") # -log pdf self.nll_ = bound( -T.log(self.inverse_scale) + self.inverse_scale * self.X, np.inf, self.inverse_scale > 0.).ravel() self._make(self.nll_, "nll") # cdf self.cdf_ = (1. - T.exp(-self.inverse_scale * self.X)).ravel() self._make(self.cdf_, "cdf") # ppf self.ppf_ = -T.log(1. - self.p) / self.inverse_scale self._make(self.ppf_, "ppf", args=[self.p])
def negativeLogLikelihood(self, y, weightPerClass): # Used in training. # param y: y = T.itensor4('y'). Dimensions [batchSize, r, c, z] # weightPerClass is a vector with 1 element per class. #Weighting the cost of the different classes in the cost-function, in order to counter class imbalance. e1 = np.finfo(np.float32).tiny addTinyProbMatrix = T.lt(self.p_y_given_x_train, 4 * e1) * e1 weightPerClassBroadcasted = weightPerClass.dimshuffle( 'x', 0, 'x', 'x', 'x') log_p_y_given_x_train = T.log( self.p_y_given_x_train + addTinyProbMatrix ) #added a tiny so that it does not go to zero and I have problems with nan again... weighted_log_p_y_given_x_train = log_p_y_given_x_train * weightPerClassBroadcasted # return -T.mean( weighted_log_p_y_given_x_train[T.arange(y.shape[0]), y] ) # Not a very elegant way to do the indexing but oh well... indexDim0 = T.arange( weighted_log_p_y_given_x_train.shape[0]).dimshuffle( 0, 'x', 'x', 'x') indexDim2 = T.arange( weighted_log_p_y_given_x_train.shape[2]).dimshuffle( 'x', 0, 'x', 'x') indexDim3 = T.arange( weighted_log_p_y_given_x_train.shape[3]).dimshuffle( 'x', 'x', 0, 'x') indexDim4 = T.arange( weighted_log_p_y_given_x_train.shape[4]).dimshuffle( 'x', 'x', 'x', 0) return -T.mean(weighted_log_p_y_given_x_train[indexDim0, y, indexDim2, indexDim3, indexDim4])
def relevance_conv_a_b_abs(inputs, weights, out_relevances, a, b, bias=None): assert a is not None assert b is not None assert a - b == 1 weights_plus = weights * T.gt(weights, 0) weights_neg = weights * T.lt(weights, 0) plus_norm = conv2d(T.abs_(inputs), weights_plus) # stabilize, prevent division by 0 eps = 1e-4 plus_norm += T.eq(plus_norm, 0) * eps plus_rel_normed = out_relevances / plus_norm in_rel_plus = conv2d(plus_rel_normed, weights_plus.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1], border_mode="full") in_rel_plus *= T.abs_(inputs) # minuses to get positive outputs, since will be subtracted # at end of function neg_norm = -conv2d(T.abs_(inputs), weights_neg) neg_norm += T.eq(neg_norm, 0) * eps neg_rel_normed = out_relevances / neg_norm in_rel_neg = -conv2d(neg_rel_normed, weights_neg.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1], border_mode="full") in_rel_neg *= T.abs_(inputs) in_relevance = a * in_rel_plus - b * in_rel_neg return in_relevance
def get_hid(self, X, X_embedded): """ Get the hidden layers for the recognition RNN given a batch of N sentences :param X: (N x max(L)) matrix representing the text :param X_embedded: (N x max(L) x E) tensor representing the embedded text :return: rnn_depth -dimensional list of hidden states for the recognition RNN """ # If x is less or equal than 0 then return 0, else 1 (exclude unused words) mask = T.switch(T.lt(X, 0), 0, 1) # N x max(L) h_prev = X_embedded # N x max(L) x E all_h = [] for h in range(len(self.rnn)): h_prev = self.rnn[h].get_output_for([h_prev, mask]) # N x max(L) x dim(hid) all_h.append(h_prev[:, -1]) hid = T.concatenate(all_h, axis=-1) return hid
def test_elemwise_composite_float64(): # test that we don't fuse composite elemwise with float64 somewhere inside # nvcc by default downcast them to float32. We would need to tell him not # to do so, but that possible only on some device. a = tensor.fmatrix() b = tensor.fmatrix() av = theano._asarray(numpy.random.rand(4, 4), dtype='float32') bv = numpy.ones((4, 4), dtype='float32') def get_all_basic_scalar(composite_op): l = [] for i in composite_op.fgraph.toposort(): if isinstance(i, theano.scalar.Composite): l += get_all_basic_scalar(i) else: l.append(i) return l for mode in [mode_with_gpu, mode_with_gpu.excluding('gpu_after_fusion'), mode_with_gpu.excluding('elemwise_fusion')]: f = pfunc([a, b], tensor.cast(tensor.lt(tensor.cast(a, 'float64') ** 2, b), 'float32'), mode=mode) out = f(av, bv) assert numpy.all(out == ((av ** 2) < bv)) for node in f.maker.fgraph.toposort(): if isinstance(node.op, cuda.GpuElemwise): if isinstance(node.op.scalar_op, theano.scalar.Composite): scals = get_all_basic_scalar(node.op.scalar_op) for s in scals: assert not any([i.type.dtype == 'float64' for i in s.inputs + s.outputs])
def VINormal(dim, const_str, const_fx, K, nfit=30000): """\ Normal (full-rank) sampling, fit with ADVI to a high-potential probability distribution :input dim: The dimensionality :input const_str: Constraint strings; used to define potentials :input const_fx: Constraint callables, included for API compatibility :input K: Number of points to sample :input nfit: Number of gradient iterations for variational inference :returns: A set of points X drawn from a N(μ,Σ); where the parameters are fit by variational inference to match the potential distribution formed by the potentials -c*g_i; for c=7500 """ with pm.Model() as mod: x = pm.Uniform('x', shape=dim) for i, const in enumerate(const_str): cname = 'g%d' % i g = pm.Deterministic(cname, eval(const, {'__builtins__': None}, {'x': x } )) pname = '%s_pot' % cname pm.Potential(pname, tt.switch(tt.lt(g, 0), 7500*g, 0)) fit_res = pm.fit(nfit, method='fullrank_advi', obj_n_mc=3) trace = fit_res.sample(K) return trace['x']
def init_train_updates(self): network_output = self.variables.network_output prediction_func = self.variables.train_prediction_func last_error = self.variables.last_error error_func = self.variables.error_func mu = self.variables.mu new_mu = ifelse( T.lt(last_error, error_func), mu * self.mu_update_factor, mu / self.mu_update_factor, ) mse_for_each_sample = T.mean( (network_output - prediction_func) ** 2, axis=1 ) params = list(iter_parameters(self)) param_vector = parameters2vector(self) J = compute_jaccobian(mse_for_each_sample, params) n_params = J.shape[1] updated_params = param_vector - T.nlinalg.matrix_inverse( J.T.dot(J) + new_mu * T.eye(n_params) ).dot(J.T).dot(mse_for_each_sample) updates = [(mu, new_mu)] parameter_updates = setup_parameter_updates(params, updated_params) updates.extend(parameter_updates) return updates
def grad(self, args, g_outs):
    return [
        T.switch(
            T.or_(T.lt(g_out, self.lower_bound), T.gt(g_out, self.upper_bound)),
            T.cast(0, dtype=g_out.dtype),
            g_out)
        for g_out in g_outs
    ]
def relevance_conv_a_b_abs(inputs, weights, out_relevances, a,b, bias=None): assert a is not None assert b is not None assert a - b == 1 weights_plus = weights * T.gt(weights, 0) weights_neg = weights * T.lt(weights, 0) plus_norm = conv2d(T.abs_(inputs), weights_plus) # stabilize, prevent division by 0 eps=1e-4 plus_norm += (T.eq(plus_norm,0) * eps) plus_rel_normed = out_relevances / plus_norm in_rel_plus = conv2d(plus_rel_normed, weights_plus.dimshuffle(1,0,2,3)[:,:,::-1,::-1], border_mode='full') in_rel_plus *= T.abs_(inputs) # minuses to get positive outputs, since will be subtracted # at end of function neg_norm = -conv2d(T.abs_(inputs), weights_neg) neg_norm += (T.eq(neg_norm,0) * eps) neg_rel_normed = out_relevances / neg_norm in_rel_neg = -conv2d(neg_rel_normed, weights_neg.dimshuffle(1,0,2,3)[:,:,::-1,::-1], border_mode='full') in_rel_neg *= T.abs_(inputs) in_relevance = a * in_rel_plus - b * in_rel_neg return in_relevance
def test_elemwise_composite_float64(): # test that we don't fuse composite elemwise with float64 somewhere inside # nvcc by default downcast them to float32. We would need to tell him not # to do so, but that possible only on some device. a = tensor.fmatrix() b = tensor.fmatrix() av = theano._asarray(numpy.random.rand(4, 4), dtype='float32') bv = numpy.ones((4, 4), dtype='float32') def get_all_basic_scalar(composite_op): l = [] for i in composite_op.env.toposort(): if isinstance(i, theano.scalar.Composite): l += get_all_basic_scalar(i) else: l.append(i) return l for mode in [mode_with_gpu, mode_with_gpu.excluding('gpu_after_fusion'), mode_with_gpu.excluding('elemwise_fusion')]: f = pfunc([a, b], tensor.cast(tensor.lt(tensor.cast(a, 'float64') ** 2, b), 'float32'), mode=mode) out = f(av, bv) assert numpy.all(out == ((av ** 2) < bv)) for node in f.maker.env.toposort(): if isinstance(node.op, cuda.GpuElemwise): if isinstance(node.op.scalar_op, theano.scalar.Composite): scals = get_all_basic_scalar(node.op.scalar_op) for s in scals: assert not any([i.type.dtype == 'float64' for i in s.inputs + s.outputs])
def angle_axis_to_rotation_matrix(angle_axis):
    n = T.sqrt(T.sum(angle_axis ** 2))

    def aa2R():
        angle_axis_normalized = angle_axis / n
        x = angle_axis_normalized[0]
        y = angle_axis_normalized[1]
        z = angle_axis_normalized[2]
        s, c = T.sin(n), T.cos(n)
        R = T.zeros((3, 3), dtype=angle_axis.dtype)
        R = T.set_subtensor(R[0, 0], x * x + (1 - x * x) * c)
        R = T.set_subtensor(R[0, 1], x * y * (1 - c) - z * s)
        R = T.set_subtensor(R[0, 2], x * z * (1 - c) + y * s)
        R = T.set_subtensor(R[1, 0], x * y * (1 - c) + z * s)
        R = T.set_subtensor(R[1, 1], y * y + (1 - y * y) * c)
        R = T.set_subtensor(R[1, 2], y * z * (1 - c) - x * s)
        R = T.set_subtensor(R[2, 0], x * z * (1 - c) - y * s)
        R = T.set_subtensor(R[2, 1], z * y * (1 - c) + x * s)
        R = T.set_subtensor(R[2, 2], z * z + (1 - z * z) * c)
        return R

    return th.ifelse.ifelse(T.lt(n, .0001), T.eye(3, dtype=angle_axis.dtype), aa2R())
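# Sanity-check sketch for angle_axis_to_rotation_matrix above (assumes the
# `th = theano` and `T = theano.tensor` aliases used by the function).
import numpy as np
import theano as th
import theano.ifelse  # makes th.ifelse available
import theano.tensor as T

aa = T.dvector('aa')
rot = th.function([aa], angle_axis_to_rotation_matrix(aa))

R = rot(np.array([0., 0., np.pi / 2]))  # 90-degree rotation about the z axis
print(np.round(R, 6))
print(np.round(R.dot(R.T), 6))          # should be the identity: R is orthonormal
print(rot(np.zeros(3)))                 # near-zero angle falls back to the identity branch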
def MCMC(dim, const_str, const_fx, K, chains=3, cores=3): """\ MCMC sampling, with potentials lowering the probability of drawing failing points :input dim: The dimensionality :input const_str: Constraint strings; used to define potentials :input const_fx: Constraint callables, included for API compatibility :input K: Number of points to sample :input chains: Number of independent MCMC chains to run :input cores: Number of CPU cores to run for parallelization :returns: A set of points X drawn from the potential -c*g_i; for c=[1, 10, 20]. This involves three successive samplings, which should total K draws. """ lambda_values = [1, 10, 20] k = int(K/(chains*len(lambda_values))) Xvals = list() for lam in lambda_values: with pm.Model() as mod: x = pm.Uniform('x', shape=dim) for i, const in enumerate(const_str): cname = 'g%d' % i g = pm.Deterministic(cname, eval(const, {'__builtins__': None}, {'x': x } )) pname = '%s_pot' % cname pm.Potential(pname, tt.switch(tt.lt(g, 0), lam*g, 0)) trace = pm.sample(k, tune=1000, chains=chains, cores=cores) Xvals.append(trace['x']) return np.vstack(Xvals)
def _best_path_decode(activations): """Calculate the CTC best-path decoding for a given activation sequence. In the returned matrix, shorter sequences are padded with -1s.""" # For each timestep, get the highest output decoding = T.argmax(activations, axis=2) # prev_outputs[time][example] == decoding[time - 1][example] prev_outputs = T.concatenate([T.alloc(_BLANK, 1, decoding.shape[1]), decoding], axis=0)[:-1] # Filter all repetitions to zero (blanks are already zero) decoding = decoding * T.neq(decoding, prev_outputs) # Calculate how many blanks each sequence has relative to longest sequence blank_counts = T.eq(decoding, 0).sum(axis=0) min_blank_count = T.min(blank_counts, axis=0) max_seq_length = decoding.shape[0] - min_blank_count # used later padding_needed = blank_counts - min_blank_count # Generate the padding matrix by ... doing tricky things max_padding_needed = T.max(padding_needed, axis=0) padding_needed = padding_needed.dimshuffle('x',0).repeat(max_padding_needed, axis=0) padding = T.arange(max_padding_needed).dimshuffle(0,'x').repeat(decoding.shape[1],axis=1) padding = PADDING * T.lt(padding, padding_needed) # Apply the padding decoding = T.concatenate([decoding, padding], axis=0) # Remove zero values nonzero_vals = decoding.T.nonzero_values() decoding = T.reshape(nonzero_vals, (decoding.shape[1], max_seq_length)).T return decoding
def softmax(self, D, I): D = D * T.constant(self.attrs['sharpening'], 'float32') if self.attrs['norm'] == 'exp': E = T.exp(-D) * I E = E / T.maximum(T.sum(E,axis=0,keepdims=True),T.constant(1e-20,'float32')) elif self.attrs['norm'] == 'sigmoid': E = (numpy.float32(1) - T.tanh(D)**2) * I elif self.attrs['norm'] == 'lstm': n_out = self.attrs['template'] def lstm(z, i_t, s_p, h_p): z += T.dot(h_p, self.N_re) i = T.outer(i_t, T.alloc(numpy.cast['int8'](1), n_out)) ingate = T.nnet.sigmoid(z[:,n_out: 2 * n_out]) forgetgate = T.nnet.sigmoid(z[:,2 * n_out:3 * n_out]) outgate = T.nnet.sigmoid(z[:,3 * n_out:]) input = T.tanh(z[:,:n_out]) s_t = input * ingate + s_p * forgetgate h_t = T.tanh(s_t) * outgate return theano.gradient.grad_clip(s_t * i, -50, 50), h_t * i E, _ = theano.scan(lstm, sequences=[D,I], outputs_info=[T.zeros((n_out,), 'float32'), T.zeros((n_out,), 'int32')]) E = T.nnet.sigmoid(T.dot(E,self.N_out)) else: raise NotImplementedError() if self.attrs['nbest'] > 1: opt = T.minimum(self.attrs['nbest'], E.shape[0]) score = (T.sort(E, axis=0)[-opt]).dimshuffle('x',0).repeat(E.shape[0],axis=0) E = T.switch(T.lt(E,score), T.zeros_like(E), E) return E
def ALB_softmax_health_weighting(o, t, o2, health, v, alpha_0, beta_0, alpha_1, beta_1, d, tau_p, tau_n, unchosen_p, b, tau_p_w, tau_n_w, decay_w): # Without variance weighting b = 1. / b # Convert inverse temperature to temperature unchosen_0 = T.switch(T.le(v, 0.5), 1, unchosen_p) unchosen_1 = T.switch(T.gt(v, 0.5), 1, unchosen_p) health = T.switch(T.lt(health, 0), 0, health) tau_p = T.switch(T.ge(tau_p, 0), tau_p * (1 - tau_p_w * health), tau_p * (1 - (1 + tau_p_w * health))) tau_n = T.switch(T.ge(tau_n, 0), tau_n * (1 - tau_n_w * health), tau_n * (1 - (1 + tau_n_w * health))) d = T.switch(T.ge(tau_p, 0), d * (1 - decay_w * health), d * (1 - (1 + decay_w * health))) # Only update if outcome isn't missing alpha_0 = T.switch(T.ge(o, 0), (1 - d) * alpha_0 + (o * tau_p * unchosen_0), alpha_0) beta_0 = T.switch(T.ge(o, 0), (1 - d) * beta_0 + ((1 - o) * tau_n * unchosen_0), beta_0) alpha_1 = T.switch(T.ge(o2, 0), (1 - d) * alpha_1 + (o2 * tau_p * unchosen_1), alpha_1) beta_1 = T.switch(T.ge(o2, 0), (1 - d) * beta_1 + ((1 - o2) * tau_n * unchosen_1), beta_1) value_0 = alpha_0 / (alpha_0 + beta_0) value_1 = alpha_1 / (alpha_1 + beta_1) value = ((value_0 - value_1) + 1) / 2. var_0 = (alpha_0 * beta_0) / (T.pow(alpha_0 + beta_0, 2) * (alpha_0 + beta_0 + 1)) var_1 = (alpha_1 * beta_1) / (T.pow(alpha_1 + beta_1, 2) * (alpha_1 + beta_1 + 1)) value = np.exp(b * value) / (np.exp(b * value) + np.exp(b * (1 - value))) return (value, alpha_0, beta_0, alpha_1, beta_1, var_0, var_1, value_0, value_1, o, o2, unchosen_0, unchosen_1)
def relevance_conv_z_b(out_relevances, inputs, weights, min_in, max_in, bias=None): # min in /max in can be symbolic or number, so no way to check # any assertions here if bias is not None: log.warning("Bias not respected for conv z_b") weights_b = T.lt(weights, 0) * weights * -max_in weights_b += T.gt(weights, 0) * weights * -min_in norms_for_relevances = conv2d(inputs, weights) norms_for_relevances += T.sum(weights_b, axis=(1,2,3)).dimshuffle( 'x',0,'x','x') # prevent division by 0... norms_for_relevances += T.eq(norms_for_relevances, 0) * 1 normed_relevances = out_relevances / norms_for_relevances # upconv data in_relevances_data = conv2d(normed_relevances, weights.dimshuffle(1,0,2,3)[:,:,::-1,::-1], border_mode='full') in_relevances_data *= inputs # upconv weight offsets to enforce positivity in_relevances_b = conv2d(normed_relevances, weights_b.dimshuffle(1,0,2,3)[:,:,::-1,::-1], border_mode='full') in_relevances = in_relevances_data + in_relevances_b return in_relevances
def __init__(self, grads, p, b1, b2, alpha, epsilon=10e-8): # Perform Gradient Clipping grad_norm = grads.norm(L=2) grads = T.switch(T.lt(1.0, grad_norm), grads / grad_norm, grads) #self.L = L self.p = p self.b1 = b1 self.b2 = b2 self.alpha = alpha self.t = theano.shared(value=numpy.cast[theano.config.floatX](1.0)) self.t_next = self.t + 1 self.g = grads.astype(dtype=theano.config.floatX) self.m = theano.shared(value=numpy.zeros_like( p.get_value(), dtype=theano.config.floatX), name='m', borrow=True, broadcastable=self.p.broadcastable) self.m_next = self.b1 * self.m + (1 - self.b1) * self.g self.v = theano.shared(value=numpy.zeros_like( p.get_value(), dtype=theano.config.floatX), name='v', borrow=True, broadcastable=self.p.broadcastable) self.v_next = b2 * self.v + (1 - self.b2) * self.g * self.g self.m_ub = self.m / (1 - b1**self.t) self.v_ub = self.v / (1 - b2**self.t) self.update = self.p - alpha * self.m_ub / (T.sqrt(self.v_ub) + epsilon) self.updates = [(self.t, self.t_next), (self.m, self.m_next), (self.v, self.v_next), (self.p, self.update)]
def convert_method(self, method_string):
    if method_string == 'sigmoid':
        return Tensor.nnet.sigmoid
    elif method_string == 'tanh':
        return Tensor.tanh
    elif method_string == 'scaled_tanh':
        return lambda x: 1.7159 * Tensor.tanh(0.66 * x)
    elif method_string == 'soft_sigmoid':
        return soft_sigmoid
    elif method_string == 'relu':
        return lambda x: x * (x > 0)
    elif method_string == 'relu2':
        # clip the activation to [-1, 1]; the switch-product form divided by x
        # is 0/0 at x == 0
        return lambda x: Tensor.clip(x, -1, 1)
    elif method_string == 'leakyrelu':
        return lambda x: x * (x > 0) + 0.01 * x * (x < 0)
    elif method_string == 'shiftedrelu':
        return lambda x: x * (x > -1)
    elif method_string == 'hard_sigmoid':
        return Tensor.nnet.hard_sigmoid
    elif method_string == 'none':
        return lambda x: x
    else:
        raise Exception('method unknown')
def loop(idx):
    # Wrap `idx` into [0, size). A Python `if` cannot branch on a symbolic
    # comparison, so the three cases are expressed with nested T.switch.
    return T.switch(T.lt(idx, 0), size + idx,
                    T.switch(T.ge(idx, size), idx - size, idx))
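# Usage sketch for the wrap-around helper above (`size` is a free variable of
# the function; it is bound to a concrete value here purely for the demo).
import theano
import theano.tensor as T

size = 10
i = T.iscalar('i')
wrap = theano.function([i], loop(i))
print(wrap(-3), wrap(4), wrap(12))  # -> 7 4 2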
def _editdist(s, t):
    """
    Levenshtein's edit distance function
    :param s: vector, source string
    :param t: vector, target string
    :return: edit distance, scalar
    """
    def update(x, previous_row):
        current_row = previous_row + 1
        current_row = tensor.set_subtensor(
            current_row[1:],
            tensor.minimum(
                current_row[1:],
                tensor.add(previous_row[:-1], tensor.neq(target, x))))
        current_row = tensor.set_subtensor(
            current_row[1:],
            tensor.minimum(current_row[1:], current_row[0:-1] + 1))
        return current_row

    source, target = ifelse(tensor.lt(s.shape[0], t.shape[0]), (t, s), (s, t))
    previous_row = tensor.arange(target.size + 1, dtype=theano.config.floatX)
    result, updates = theano.scan(fn=update, sequences=source,
                                  outputs_info=previous_row, name='editdist')
    return result[-1, -1]
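# Usage sketch for _editdist above (assumes the module-level `tensor`, `theano`
# and `ifelse` aliases the function relies on; inputs are illustrative int32 vectors).
import numpy as np
import theano
import theano.tensor as tensor

s = tensor.ivector('s')
t = tensor.ivector('t')
dist = theano.function([s, t], _editdist(s, t))

print(dist(np.array([1, 2, 3, 4], dtype='int32'),
           np.array([1, 3, 4], dtype='int32')))  # -> 1.0 (a single deletion)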
def rprop_core(params, gradients, rprop_increase=1.01, rprop_decrease=0.99, rprop_min_step=0, rprop_max_step=100, learning_rate=0.01): """ Rprop optimizer. See http://sci2s.ugr.es/keel/pdf/algorithm/articulo/2003-Neuro-Igel-IRprop+.pdf. """ for param, grad in zip(params, gradients): grad_tm1 = theano.shared(np.zeros_like(param.get_value()), name=param.name + '_grad') step_tm1 = theano.shared(np.zeros_like(param.get_value()) + learning_rate, name=param.name + '_step') test = grad * grad_tm1 same = T.gt(test, 0) diff = T.lt(test, 0) step = T.minimum( rprop_max_step, T.maximum( rprop_min_step, step_tm1 * (T.eq(test, 0) + same * rprop_increase + diff * rprop_decrease))) grad = grad - diff * grad yield param, param - T.sgn(grad) * step yield grad_tm1, grad yield step_tm1, step
def test_ifelse(self): config1 = theano.config.profile config2 = theano.config.profile_memory try: theano.config.profile = True theano.config.profile_memory = True a, b = T.scalars('a', 'b') x, y = T.scalars('x', 'y') z = ifelse(T.lt(a, b), x * 2, y * 2) p = theano.ProfileStats(False) if theano.config.mode in ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]: m = "FAST_RUN" else: m = None f_ifelse = theano.function([a, b, x, y], z, profile=p, name="test_ifelse", mode=m) val1 = 0. val2 = 1. big_mat1 = 10 big_mat2 = 11 out = f_ifelse(val1, val2, big_mat1, big_mat2) finally: theano.config.profile = config1 theano.config.profile_memory = config2
def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic:
        return self.p * input
    else:
        return theano.ifelse.ifelse(
            T.lt(self._srng.uniform((1, ), 0, 1)[0], self.p),
            input,
            T.zeros(input.shape))
def apply(self , src , mask_length , tgt): """ viterbi algorithm """ result , updates = theano.scan( fn = self.train_step, sequences = src, outputs_info = [self.A_start, None] , non_sequences = self.A , n_steps = mask_length ) # the score of best path best_path_score = result[0][-1].max() idx = T.argmax(result[0][-1]) #backtracking res2 , _ = theano.scan( fn = lambda dps , idx , idx2 : [dps[idx] , idx], sequences = result[1][::-1], outputs_info = [idx , idx], n_steps = mask_length ) # the path of best score best_path = res2[1] #if len(best_path) < seq_len: # best_path.extend((seq_len - len(best_path)) * [2]) # the score of tgt path tgt_score = self.decode(src , mask_length , tgt) # max_margin max_margin = T.sum(T.neq(tgt[:mask_length] , best_path)) cost = best_path_score + max_margin - tgt_score return T.switch(T.lt(cost , T.alloc(numpy.float32(0.))) , T.alloc(numpy.float32(0.)) , cost ),best_path
def __init__(self, config, loss, params): self._lr = get_shared_floatX(config.learning_rate, 'lr') self._t = get_shared_floatX(1, 't') self._all_m_tm1 = [] self._all_v_tm1 = [] self._updates = [(self._t, self._t + 1)] if config.lr_decay: lr_coef = tt.pow(config.lr_decay, (self._t - 1) // config.lr_decay_freq) self._updates.append((self._lr, lr_coef * config.learning_rate)) grads = theano.grad(loss, params) #grads = theano.grad(loss, params, disconnected_inputs='ignore') self._global_grad_norm = tt.sqrt(tt.sum(tt.stack([tt.sum(g**2.) for g in grads]))) if config.max_grad_norm: global_clip_factor = ifelse(tt.lt(self._global_grad_norm, config.max_grad_norm), cast_floatX_np(1.), cast_floatX(config.max_grad_norm/self._global_grad_norm)) # global_clip_factor = tt.minimum(cast_floatX(config.max_grad_norm/self._global_grad_norm), cast_floatX_np(1)) grads = [global_clip_factor * g for g in grads] lr_t = self._lr * \ clip_sqrt(1 - tt.pow(config.adam_beta2, self._t)) / (1 - tt.pow(config.adam_beta1, self._t)) for p, g in zip(params, grads): m_tm1 = get_shared_floatX(np.zeros_like(p.get_value()), 'adam_m_' + p.name) v_tm1 = get_shared_floatX(np.zeros_like(p.get_value()), 'adam_v_' + p.name) self._all_m_tm1.append(m_tm1) self._all_v_tm1.append(v_tm1) m_t = config.adam_beta1 * m_tm1 + (1-config.adam_beta1) * g v_t = config.adam_beta2 * v_tm1 + (1-config.adam_beta2) * tt.sqr(g) delta_t = -lr_t * m_t / (clip_sqrt(v_t) + config.adam_eps) p_t = p + delta_t self._updates += [(m_tm1, m_t), (v_tm1, v_t), (p, p_t)]