def custom_loss(y_true,y_pred):
    '''
    Detection loss summed over 49 cells of 25 values each.

    Args:
        y_true: Ground Truth output
        y_pred: Predicted output
    Both are indexed as [sample, 49 * 25]; each cell is laid out as:
    ######################################
    ## x,y,h,w,p1,p2,...,p20,objectness ##
    ######################################
    Returns:
        The loss caused by y_pred (a scalar: summed over cells and samples)
    '''
    n_cells = 49
    cell_width = 25
    # Weight 2 on the first four entries of a cell, weight 1 on the next 20.
    weights = np.asarray([2] * 4 + [1] * 20).reshape((1, 24))

    total = 0.0
    for cell in range(n_cells):
        first = cell * cell_width
        obj_col = first + cell_width - 1
        true_obj = y_true[:, obj_col]
        weighted_gap = (y_pred[:, first:obj_col] * weights
                        - y_true[:, first:obj_col] * weights)
        # The box/class error only counts where the ground-truth objectness is non-zero.
        total = total + T.sum(T.square(weighted_gap), axis=1) * true_obj
        # The objectness error always counts.
        total = total + T.square(true_obj - y_pred[:, obj_col])
    return T.sum(total)
def log_likelihood(self):
    """Return the negated, L2-regularised log-likelihood built from ``self.L`` and ``self.R``.

    The last column of ``self.L`` / ``self.R`` is used as a bias; the remaining
    columns are the factors. With ``A = factors_L . factors_R^T + biases`` the
    value is ``sum(A * counts) - sum((counts + 1) * log(1 + exp(A)))`` minus the
    L2 penalty on the factor columns, negated so that it can be minimised.
    """
    user_factors = self.L[:, :-1]
    item_factors = self.R[:, :-1]
    user_bias_col = self.L[:, -1].reshape((-1, 1))
    item_bias_row = self.R[:, -1].reshape((-1, 1)).T

    scores = T.dot(user_factors, item_factors.T)
    scores = T.inc_subtensor(scores[:, :], user_bias_col)
    scores = T.inc_subtensor(scores[:, :], item_bias_row)

    loglik = T.sum(scores * self.counts)
    loglik -= T.sum((self.counts + 1) * T.log(T.exp(scores) + 1))

    # L2 regularization (biases are not penalised)
    for factors in (user_factors, item_factors):
        loglik -= 0.5 * self.reg_param * T.sum(T.square(factors))

    # Return negation of LogLikelihood cause we will minimize cost
    return -loglik
def kl_div_p_q(self, p_mean, p_std, q_mean, q_std):
    """KL divergence D_{KL}[p(x)||q(x)] for a fully factorized Gaussian.

    Computes, summed over all entries,
    ``((mu_p - mu_q)^2 + s_p^2 - s_q^2) / (2 s_q^2 + 1e-8) + log s_q - log s_p``.
    The 1e-8 in the denominator guards against division by zero.
    """
    q_var = T.square(q_std)
    mean_gap_sq = T.square(p_mean - q_mean)
    quadratic = (mean_gap_sq + T.square(p_std) - q_var) / (2 * q_var + 1e-8)
    return T.sum(quadratic + T.log(q_std) - T.log(p_std))
def custom_loss(y_true, y_pred):
    """Squared log-error per element plus a penalty on the log of the totals.

    Both inputs are clipped from below at 0.001 before use. The result is the
    mean over the last axis of ``(log(pred + 1) - log(true + 1))^2`` plus
    ``CMC_PENALTY`` times the squared difference of ``log(sum + 1)`` taken
    over all entries.
    """
    epsilon = 0.001
    pred_clipped = T.clip(y_pred, epsilon, np.inf)
    true_clipped = T.clip(y_true, epsilon, np.inf)

    elementwise_gap = T.log(pred_clipped + 1.) - T.log(true_clipped + 1.)
    total_gap = T.log(T.sum(pred_clipped) + 1) - T.log(T.sum(true_clipped) + 1)

    return T.mean(T.square(elementwise_gap), axis=-1) + CMC_PENALTY * T.square(total_gap)
def __init__(self, incoming, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.), W=lasagne.init.Normal(0.05), train_g=False, init_stdv=1., nonlinearity=relu, **kwargs):
    """Set up the layer's ``b``/``g`` parameters and re-express ``incoming.W`` as a normalised weight.

    The incoming layer is modified: its original weight is kept as
    ``incoming.W_param`` and ``incoming.W`` is replaced by that weight divided
    by its L2 norm (plus 1e-6 under the square root), multiplied by ``self.g``
    when ``g`` is given.

    Args:
        incoming: Layer feeding this one; must expose ``W``.
        b: Initialiser for the bias, or None to create no bias.
        g: Initialiser for the scale, or None to create no scale.
        W: Not used by the code in this method.
        train_g: Whether ``g`` is registered as trainable.
        init_stdv: Stored on the instance as ``self.init_stdv``.
        nonlinearity: Stored on the instance as ``self.nonlinearity``.
        **kwargs: Forwarded to the parent constructor.
    """
    super(WeightNormLayer, self).__init__(incoming, **kwargs)
    self.nonlinearity = nonlinearity
    self.init_stdv = init_stdv

    n_channels = self.input_shape[1]
    if b is not None:
        self.b = self.add_param(b, (n_channels,), name="b", regularizable=False)
    if g is not None:
        self.g = self.add_param(g, (n_channels,), name="g", regularizable=False, trainable=train_g)

    # Axes / broadcast pattern for per-channel statistics of this layer's input.
    if len(self.input_shape) == 4:
        self.axes_to_sum = (0, 2, 3)
        self.dimshuffle_args = ['x', 0, 'x', 'x']
    else:
        self.axes_to_sum = 0
        self.dimshuffle_args = ['x', 0]

    # scale weights in layer below
    incoming.W_param = incoming.W
    raw_weight = incoming.W_param
    if raw_weight.ndim != 4:
        norm_axes, broadcast_pattern = 0, ['x', 0]
    elif isinstance(incoming, Deconv2DLayer):
        norm_axes, broadcast_pattern = (0, 2, 3), ['x', 0, 'x', 'x']
    else:
        norm_axes, broadcast_pattern = (1, 2, 3), [0, 'x', 'x', 'x']

    if g is not None:
        scale = self.g / T.sqrt(1e-6 + T.sum(T.square(raw_weight), axis=norm_axes))
        incoming.W = raw_weight * scale.dimshuffle(*broadcast_pattern)
    else:
        incoming.W = raw_weight / T.sqrt(1e-6 + T.sum(T.square(raw_weight), axis=norm_axes, keepdims=True))
def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma, cov_total, mean_total):
    """Build the predictive mean and (co)variance at ``Xnew`` for the inducing-point approximation.

    Args:
        Xnew: Points at which to predict.
        pred_noise: If true, ``sigma**2`` is added to the returned variance / covariance diagonal.
        diag: If true return only the variance vector, otherwise the full covariance.
        X: Training inputs.
        Xu: Inducing inputs.
        y: Training targets.
        sigma: Noise standard deviation (it is squared before use).
        cov_total: Covariance callable used for the training-side terms.
        mean_total: Mean callable used for the training residual.

    Returns:
        ``(mu, var)`` when ``diag`` is true, else ``(mu, stabilize(cov))``.
    """
    noise_var = tt.square(sigma)
    k_uu = cov_total(Xu)
    k_uf = cov_total(Xu, X)
    chol_uu = cholesky(stabilize(k_uu))
    A = solve_lower(chol_uu, k_uf)
    q_diag = tt.sum(A * A, 0)

    if self.approx == "FITC":
        k_diag = cov_total(X, diag=True)
        lam = tt.clip(k_diag - q_diag, 0.0, np.inf) + noise_var
    else:  # VFE or DTC
        lam = tt.ones_like(q_diag) * noise_var

    chol_b = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A / lam, tt.transpose(A)))
    scaled_residual = (y - mean_total(X)) / lam
    c = solve_lower(chol_b, tt.dot(A, scaled_residual))

    k_us = self.cov_func(Xu, Xnew)
    A_new = solve_lower(chol_uu, k_us)
    mu = self.mean_func(Xnew) + tt.dot(tt.transpose(A_new), solve_upper(tt.transpose(chol_b), c))
    C = solve_lower(chol_b, A_new)

    if not diag:
        cov = (self.cov_func(Xnew)
               - tt.dot(tt.transpose(A_new), A_new)
               + tt.dot(tt.transpose(C), C))
        if pred_noise:
            cov += noise_var * tt.identity_like(cov)
        return mu, stabilize(cov)

    k_ss = self.cov_func(Xnew, diag=True)
    var = k_ss - tt.sum(tt.square(A_new), 0) + tt.sum(tt.square(C), 0)
    if pred_noise:
        var += noise_var
    return mu, var
def __init__(self, input, centerbias = None, alpha=1.0):
    """Build a distance-dependent additive term and add it to ``input``.

    A normalised distance is computed for every (row, column) position of
    ``input``, passed through ``nonlinearity`` together with the shared
    variables ``centerbias_xs`` / ``centerbias_ys``, and the result is added to
    ``input`` when ``centerbias_ys`` has more than two entries.

    Args:
        input: Symbolic input; its first two shape entries are used as height and width.
        centerbias: Initial values for ``centerbias_ys``; defaults to twelve ones.
        alpha: Initial weight of the vertical term in the distance.
    """
    float_x = theano.config.floatX
    self.input = input
    if centerbias is None:
        centerbias = np.ones(12)
    n_knots = len(centerbias)

    self.alpha = theano.shared(value=np.array(alpha).astype(float_x), name='alpha')
    self.centerbias_ys = theano.shared(value=np.array(centerbias, dtype=float_x), name='centerbias_ys')
    self.centerbias_xs = theano.shared(value=np.linspace(0, 1, n_knots, dtype=float_x), name='centerbias_xs')

    height = T.cast(input.shape[0], float_x)
    width = T.cast(input.shape[1], float_x)

    x_coords = ((T.arange(width) - 0.5*width) / (0.5*width)).dimshuffle('x', 0)
    # We cannot have zeros in there because of grad
    y_coords = ((T.arange(height) - 0.5*height) / (0.5*height) + 0.0001).dimshuffle(0, 'x')

    raw_dists = T.sqrt(T.square(x_coords) + self.alpha*T.square(y_coords))
    self.max_dist = T.sqrt(1 + self.alpha)
    self.dists = raw_dists/self.max_dist
    self.factors = nonlinearity(self.dists, self.centerbias_xs, self.centerbias_ys, n_knots)

    has_enough_knots = T.gt(self.centerbias_ys.shape[0], 2)
    self.output = ifelse(has_enough_knots, self.input+self.factors, self.input)
    self.params = [self.centerbias_ys, self.alpha]
def __init__(self, xdim, args, dec_nonlin=None):
    """Build the encoder/decoder graph, the AdaGrad-style updates and the compiled functions.

    Args:
        xdim: Input dimensionality handed to the encoder and decoder MLPs.
        args: Settings object; the attributes read here are ``hdim``, ``zdim``,
            ``lmbda``, ``decM``, ``COV``, ``nlayers``, ``batsize`` and ``lr``.
        dec_nonlin: Forwarded as ``activation`` to the Gaussian decoder only.

    Raises:
        RuntimeError: If ``args.decM`` is neither ``'bernoulli'`` nor ``'gaussian'``.
    """
    self.xdim = xdim
    self.hdim = args.hdim
    self.zdim = args.zdim
    self.lmbda = args.lmbda  # weight decay coefficient * 2
    self.x = T.matrix('x', dtype=floatX)
    self.eps = T.matrix('eps', dtype=floatX)
    self.train_i = T.scalar('train_i', dtype=floatX)
    self.dec = args.decM
    self.COV = args.COV

    self.enc_mlp = GaussianMLP(self.x, self.xdim, self.hdim, self.zdim, nlayers=args.nlayers, eps=self.eps, COV=self.COV)
    if self.dec == 'bernoulli':
        # log p(x | z) defined as -CE(x, y) = dec_mlp.cost(y)
        self.dec_mlp = BernoulliMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
    elif self.dec == 'gaussian':
        self.dec_mlp = GaussianMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x, activation=dec_nonlin, COV=self.COV)
    else:
        # The original message used an incomplete '%' format and an undefined
        # name, so this branch could never raise the intended RuntimeError.
        raise RuntimeError('unrecognized decoder %s' % self.dec)

    # encoder part + decoder part
    # NOTE: the comparison is kept as `== False` on purpose; `not self.COV`
    # would also select this branch for None.
    if self.COV == False:
        self.enc_cost = -T.sum(kld_unit_mvn(self.enc_mlp.mu, self.enc_mlp.var))
    else:
        self.enc_cost = -T.sum(kldu_unit_mvn(self.enc_mlp.mu, self.enc_mlp.var, self.enc_mlp.u))

    self.cost = (self.enc_cost + self.dec_mlp.cost) / args.batsize
    self.params = self.enc_mlp.params + self.dec_mlp.params
    self.gparams = [T.grad(self.cost, p) for p in self.params]
    self.gaccums = [shared(value=np.zeros(p.get_value().shape, dtype=floatX)) for p in self.params]
    # Here lmbda acts as a per-step decay of the learning rate.
    self.lr = args.lr * (1-args.lmbda)**self.train_i

    # update params, update sum(grad_params) for adagrade
    self.updates = [
        (param, param - self.lr*gparam/T.sqrt(gaccum+T.square(gparam)+ADAG_EPS))
        for param, gparam, gaccum in zip(self.params, self.gparams, self.gaccums)
    ]
    self.updates += [
        (gaccum, gaccum + T.square(gparam))
        for gaccum, gparam in zip(self.gaccums, self.gparams)
    ]

    self.train = function(
        inputs=[self.x, self.eps, self.train_i],
        outputs=self.cost,
        updates=self.updates
    )
    self.test = function(
        inputs=[self.x, self.eps],
        outputs=self.cost,
        updates=None
    )
    # can be used for semi-supervised learning for example
    self.encode = function(
        inputs=[self.x, self.eps],
        outputs=self.enc_mlp.out
    )
    # use this to sample: the encoder output itself is the input of this function
    self.decode = function(
        inputs=[self.enc_mlp.out],
        outputs=self.dec_mlp.out
    )
def mmd_full(x_t, y_t, alpha=0.5):
    """ Implementation of the full kernel MMD statistic (gaussian kernel)

    Every element of ``x_t`` is paired with every element of ``x_t`` (and
    likewise x-with-y and y-with-y) via ``repeat``/``tile``; the result is
    ``mean k(x, x') - 2 * mean k(x, y) + mean k(y, y')`` with
    ``k(a, b) = exp(-0.5 * (1 / alpha) * (a - b)^2)``.
    The sample counts are read from ``shape[1]``.
    """
    n_x = x_t.shape[1]
    n_y = y_t.shape[1]
    exponent_scale = -0.5 * (1 / alpha)

    def mean_kernel(left, right):
        # Mean Gaussian kernel value over all element pairs.
        return T.mean(T.exp(exponent_scale * T.square(left - right)))

    k_xx = mean_kernel(T.repeat(x_t, n_x), T.tile(x_t, n_x))
    k_xy = mean_kernel(T.repeat(x_t, n_y), T.tile(y_t, n_x))
    k_yy = mean_kernel(T.repeat(y_t, n_y), T.tile(y_t, n_y))
    return k_xx - 2 * k_xy + k_yy
def _do_calc(self, v0, v1):
    """Return ``1 - dot(v0 / ||rows of v0||, v1 / ||columns of v1||)``.

    The Theano function is compiled on first use and cached in ``self._func``.
    ``v0`` is normalised along axis 1 and ``v1`` along axis 0.
    """
    if self._func is None:
        left = T.matrix('v0')
        right = T.matrix('v1')
        left_unit = left / T.sqrt(T.square(left).sum(axis=1, keepdims=True))
        right_unit = right / T.sqrt(T.square(right).sum(axis=0, keepdims=True))
        self._func = theano.function([left, right], 1 - T.dot(left_unit, right_unit))
    return self._func(v0, v1)
def in_transit(self, t, r=0.0, texp=None):
    """Get a list of timestamps that are in transit

    Args:
        t (vector): A vector of timestamps to be evaluated.
        r (Optional): The radii of the planets.
        texp (Optional[float]): The exposure time. When given, the transit
            window is widened by half of it on each side.

    Returns:
        The indices of the timestamps that are in transit. If any entry of
        the contact-point flag is non-zero, all indices are returned instead.

    """
    # `z` is a zero tensor shaped like `self.a`; adding it broadcasts the
    # other quantities to that shape.
    z = tt.zeros_like(self.a)
    r = tt.as_tensor_variable(r) + z
    R = self.r_star + z

    # Wrap the times into time since transit
    hp = 0.5 * self.period
    dt = tt.mod(self._warp_times(t) - self.t0 + hp, self.period) - hp

    if self.ecc is None:
        # Equation 14 from Winn (2010)
        k = r / R
        arg = tt.square(1 + k) - tt.square(self.b)
        factor = R / (self.a * self.sin_incl)
        hdur = hp * tt.arcsin(factor * tt.sqrt(arg)) / np.pi
        t_start = -hdur
        t_end = hdur
        flag = z
    else:
        M_contact = self.contact_points_op(
            self.a, self.ecc, self.cos_omega, self.sin_omega,
            self.cos_incl + z, self.sin_incl + z, R + r)
        # NOTE(review): element 2 of the op output is treated as a status
        # flag (0 meaning usable) -- confirm against contact_points_op.
        flag = M_contact[2]

        # Convert the two contact values to times and wrap them into
        # [-period/2, period/2).
        t_start = (M_contact[0] - self.M0) / self.n
        t_start = tt.mod(t_start + hp, self.period) - hp
        t_end = (M_contact[1] - self.M0) / self.n
        t_end = tt.mod(t_end + hp, self.period) - hp

        # Force the start to be non-positive and the end to be non-negative.
        t_start = tt.switch(tt.gt(t_start, 0.0), t_start - self.period, t_start)
        t_end = tt.switch(tt.lt(t_end, 0.0), t_end + self.period, t_end)

    if texp is not None:
        t_start -= 0.5*texp
        t_end += 0.5*texp

    # A timestamp counts when it falls inside the window along the last axis.
    mask = tt.any(tt.and_(dt >= t_start, dt <= t_end), axis=-1)
    result = ifelse(tt.all(tt.eq(flag, 0)), tt.arange(t.size)[mask], tt.arange(t.size))
    return result
def square_dist(self, X, Xs):
    """Return the matrix of squared Euclidean distances between rows of ``X`` and ``Xs``.

    Uses ``||x||^2 + ||z||^2 - 2 x.z`` and clips the result at zero to remove
    negative rounding residue.

    Args:
        X: Matrix whose rows are points.
        Xs: Second matrix of points, or None to compare ``X`` with itself.

    Returns:
        Matrix with one row per row of ``X`` and one column per row of
        ``Xs`` (or of ``X`` when ``Xs`` is None).
    """
    X2 = tt.sum(tt.square(X), 1)
    if Xs is None:
        cross = tt.dot(X, tt.transpose(X))
        col_sq_norms = X2
    else:
        cross = tt.dot(X, tt.transpose(Xs))
        col_sq_norms = tt.sum(tt.square(Xs), 1)
    # Row term comes from X, column term from Xs. The previous code used the
    # Xs norms for both, which gave wrong distances whenever Xs was supplied.
    sqd = -2.0 * cross + (tt.reshape(X2, (-1, 1)) + tt.reshape(col_sq_norms, (1, -1)))
    return tt.clip(sqd, 0.0, np.inf)
def dynamics_costs_obs(self,x,u):
    """Build the symbolic next state, cost terms and observation for state ``x`` and control ``u``.

    The simulator is wrapped as a Theano op that steps ``self.frame_skip``
    times with the control clipped to ``self.ctrl_bounds()``.

    Returns:
        A list ``[next_state, costs, obs]`` where ``next_state`` keeps ``x``
        for rows flagged done, ``costs`` is
        ``[vel, ctrl, impact, jntpos, done]`` (the first four masked by
        ``1 - done``) and ``obs`` concatenates the selected features.

    Raises:
        ValueError: If ``self.vel_cost_type`` is neither ``"linear"`` nor ``"quadratic"``.
    """
    fmatrix = TT.matrix(dtype=floatX).type
    uvector = TT.vector(dtype='int8').type
    ctrl_lo, ctrl_hi = self.ctrl_bounds()

    @theano.as_op(itypes=[fmatrix,fmatrix,uvector],otypes=[fmatrix,fmatrix,fmatrix,fmatrix,fmatrix])
    def stepmulti2op(x_nd,u_ne,done_n):
        x_nd = x_nd.copy()
        u_ne = np.clip(u_ne, ctrl_lo, ctrl_hi)
        # Shift the selected coordinates to zero before stepping and restore
        # the offset afterwards.
        move_to_origin = self.md["move_to_origin"]
        offset_n2 = x_nd[:,move_to_origin].copy()
        x_nd[:,move_to_origin] -= offset_n2
        x_nd,f,dcom,dist,kin = self.world.StepMulti2(x_nd.astype("float64"),u_ne.astype("float64"),done_n)
        for _ in xrange(self.frame_skip-1):
            x_nd,f1,dcom1,dist,kin = self.world.StepMulti2(x_nd.astype("float64"),u_ne.astype("float64"),done_n)
            dcom += dcom1
            f += f1
        # `f` is averaged over the skipped frames, `dcom` is accumulated.
        f /= self.frame_skip
        dist = np.clip(dist, 0, .1) # XXX clip level ad hoc
        # Consider using nan_to_num here
        x_nd[:,move_to_origin] += offset_n2
        return (x_nd.astype(floatX),f.astype(floatX),dcom.astype(floatX),dist.astype(floatX),kin.astype(floatX))

    done = self.trial_done(x)
    notdone = 1 - done
    y,f,dcom,dist,kin = stepmulti2op(x,u,done)

    if self.vel_cost_type == "linear":
        cost_vel = (-self.vel_cost_coeff/self.world_info["timestep"]) * dcom[:,0]
    elif self.vel_cost_type == "quadratic":
        cost_vel = TT.square(dcom[:,0]/self.world_info["timestep"] - self.vel_cost_target) #pylint: disable=E1111
    else:
        raise ValueError

    cost_ctrl = .5*self.ctrl_cost_coeff*TT.square(u).sum(axis=1)
    cost_impact = .5*self.impact_cost_coeff * TT.square(f).sum(axis=1)
    if self.clip_impact_cost:
        cost_impact = TT.minimum(cost_impact, self.clip_impact_cost) #pylint: disable=E1111

    jntpos_mask = self.world_info["jnt_islimited"]
    if self.jntpos_root_only:
        # Build a new mask instead of `&=`, which would overwrite the array
        # stored in self.world_info for every later reader.
        jntpos_mask = jntpos_mask & (self.world_info["jnt_body_id"]==1)
    jntpos_inds = np.flatnonzero(jntpos_mask)
    jntpos_dofs = np.array([dofidx for (dofidx,jntidx) in enumerate(self.world_info["dof_jnt_id"]) if jntidx in jntpos_inds])
    cost_jntpos = (.5*self.jntpos_cost_coeff) * (TT.abs_ if self.jntpos_use_l1 else TT.square)(y[:,jntpos_dofs]).sum(axis=1)
    cost_done = (done-1)*self.done_cost_coeff

    feats = [y[:,1:],f,dist]
    if self.use_kinematic_features:
        feats.append(kin)
    obs = TT.concatenate(feats,axis=1)

    return [TT.switch(done[:,None], x, y), [notdone*cost_vel, notdone*cost_ctrl, notdone*cost_impact, notdone*cost_jntpos, cost_done] , obs ]
def __init__(self, xdim, args, dec='bernoulli'):
    """Build the encoder/decoder graph, the AdaGrad updates and the compiled functions.

    Args:
        xdim: Input dimensionality handed to the encoder and decoder MLPs.
        args: Settings object; the attributes read here are ``hdim``, ``zdim``,
            ``lmbda``, ``nlayers``, ``batch_size`` and ``lr``.
        dec: Decoder type, ``'bernoulli'`` or ``'gaussian'``.

    Raises:
        RuntimeError: If ``dec`` is not one of the two supported values.
    """
    self.xdim = xdim
    self.hdim = args.hdim
    self.zdim = args.zdim
    self.lmbda = args.lmbda  # weight decay coefficient * 2
    self.x = T.matrix('x', dtype=floatX)
    self.eps = T.matrix('eps', dtype=floatX)

    # XXX make this more general
    self.enc_mlp = GaussianMLP(self.x, self.xdim, self.hdim, self.zdim, nlayers=args.nlayers, eps=self.eps)
    if dec == 'bernoulli':
        # log p(x | z) defined as -CE(x, y) = dec_mlp.cost(y)
        self.dec_mlp = BernoulliMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
    elif dec == 'gaussian':
        self.dec_mlp = GaussianMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
    else:
        # The original format string ended in a bare '%', which raised
        # "ValueError: incomplete format" instead of this RuntimeError.
        raise RuntimeError('unrecognized decoder %s' % dec)

    self.cost = (-T.sum(kld_unit_mvn(self.enc_mlp.mu, self.enc_mlp.var)) + self.dec_mlp.cost) / args.batch_size
    self.params = self.enc_mlp.params + self.dec_mlp.params
    print(self.params)
    # The weight-decay term lmbda * p is added directly to each gradient.
    self.gparams = [T.grad(self.cost, p) + self.lmbda * p for p in self.params]
    self.gaccums = [theano.shared(value=np.zeros(p.get_value().shape, dtype=floatX)) for p in self.params]

    # XXX using adagrad update as described in paper, could try other optimizers
    self.updates = [
        (param, param - args.lr * gparam / T.sqrt(gaccum + T.square(gparam) + ADAGRAD_EPS))
        for param, gparam, gaccum in zip(self.params, self.gparams, self.gaccums)
    ]
    self.updates += [
        (gaccum, gaccum + T.square(gparam))
        for gaccum, gparam in zip(self.gaccums, self.gparams)
    ]

    self.train = theano.function(
        inputs=[self.x, self.eps],
        outputs=self.cost,
        updates=self.updates
    )
    self.test = theano.function(
        inputs=[self.x, self.eps],
        outputs=self.cost,
        updates=None
    )
    # can be used for semi-supervised learning for example
    self.encode = theano.function(
        inputs=[self.x, self.eps],
        outputs=self.enc_mlp.out
    )
    # use this to sample
    self.decode = theano.function(
        inputs=[self.enc_mlp.out],
        outputs=self.dec_mlp.out
    )
def calc(self, y, output):
    """Return the weighted squared-error loss between ``y`` and ``output``.

    For inputs with more than one dimension the squared error is first summed
    over every axis except the first. The per-sample values are then averaged
    when ``self.mode`` is truthy and summed otherwise, and the result is
    multiplied by ``self.weight``.
    """
    squared_error = T.square(y - output)
    if y.ndim != 1:
        squared_error = T.sum(squared_error, axis=tuple(range(1, y.ndim)))
    reduce_samples = T.mean if self.mode else T.sum
    return self.weight * reduce_samples(squared_error)
def rmsprop(cost, params, lr, alpha=0.95, eps=1e-8, max_norm=None, max_norm_elemwise=None):
    """Build RMSProp updates for ``params`` with respect to ``cost``.

    Gradients are passed through ``clip_grads`` first. Each parameter gets a
    zero-initialised accumulator holding a running average of squared
    gradients with decay ``alpha``; the parameter step divides the gradient by
    the square root of the *new* accumulator value plus ``eps``.

    Returns:
        ``(updates, total_norm(grads), total_norm(params))`` where ``updates``
        lists the accumulator updates followed by the parameter updates.
    """
    raw_grads = [T.grad(cost, p) for p in params]
    grads = clip_grads(raw_grads, max_norm=max_norm, max_norm_elemwise=max_norm_elemwise)
    accums = [theano.shared(value=np.zeros(p.get_value().shape, dtype=floatX)) for p in params]

    accum_updates = []
    param_updates = []
    # XXX worth fix to assign square(grad) to accum during first iter?
    for p, g, a in zip(params, grads, accums):
        refreshed = alpha * a + (1 - alpha) * T.square(g)
        accum_updates.append((a, refreshed))
        param_updates.append((p, p - lr * g / (T.sqrt(refreshed) + eps)))

    return accum_updates + param_updates, total_norm(grads), total_norm(params)
def get_stencil(self, t, r=None, texp=None):
    """Return the sorted contact times of every transit covering the span of ``t``.

    Args:
        t: Timestamps; only their minimum and maximum are used to decide
            which periods to cover.
        r: Planet radii. If None, ``t`` is returned with a padded right axis.
        texp: If None, ``t`` is returned with a padded right axis. Otherwise
            it is not used by the code in this method.

    Returns:
        ``(ts, flag)`` with ``ts`` the sorted, concatenated contact times and
        ``flag`` the contact-point status.

    NOTE(review): the early-return path yields a single tensor while the main
    path yields a 2-tuple -- confirm callers handle both.
    """
    if r is None or texp is None:
        return tt.shape_padright(t)

    # `z` is a zero tensor shaped like `self.a`, used for broadcasting.
    z = tt.zeros_like(self.a)
    r = tt.as_tensor_variable(r)
    R = self.r_star + z
    hp = 0.5 * self.period

    if self.ecc is None:
        # Equation 14 from Winn (2010)
        k = r / self.r_star
        arg1 = tt.square(1 + k) - tt.square(self.b)
        arg2 = tt.square(1 - k) - tt.square(self.b)
        factor = R / (self.a * self.sin_incl)
        hdur1 = hp * tt.arcsin(factor * tt.sqrt(arg1)) / np.pi
        hdur2 = hp * tt.arcsin(factor * tt.sqrt(arg2)) / np.pi
        # Outer start, inner start, inner end, outer end.
        ts = [-hdur1, -hdur2, hdur2, hdur1]
        flag = z
    else:
        # Contact points for separations R + r and R - r.
        M_contact1 = self.contact_points_op(
            self.a, self.ecc, self.cos_omega, self.sin_omega,
            self.cos_incl + z, self.sin_incl + z, R + r)
        M_contact2 = self.contact_points_op(
            self.a, self.ecc, self.cos_omega, self.sin_omega,
            self.cos_incl + z, self.sin_incl + z, R - r)
        flag = M_contact1[2] + M_contact2[2]
        # Same ordering as above, each wrapped into [-period/2, period/2).
        ts = [
            tt.mod((M_contact1[0]-self.M0)/self.n+hp, self.period)-hp,
            tt.mod((M_contact2[0]-self.M0)/self.n+hp, self.period)-hp,
            tt.mod((M_contact2[1]-self.M0)/self.n+hp, self.period)-hp,
            tt.mod((M_contact1[1]-self.M0)/self.n+hp, self.period)-hp
        ]

    # Reference times stepping by one period from before min(t) to after max(t).
    start = self.period * tt.floor((tt.min(t) - self.t0) / self.period)
    end = self.period * (tt.ceil((tt.max(t) - self.t0) / self.period) + 1)
    start += self.t0
    end += self.t0

    tout = []
    for i in range(4):
        if z.ndim < 1:
            # Scalar case: a single arange covers all periods.
            tout.append(ts[i] + tt.arange(start, end, self.period))
        else:
            # Vector case: build one arange per entry with scan, then flatten.
            tout.append(theano.scan(
                fn=lambda t0, s0, e0, p0: t0 + tt.arange(s0, e0, p0),
                sequences=[ts[i], start, end, self.period],
            )[0].flatten())

    ts = tt.sort(tt.concatenate(tout))
    return ts, flag
def square_dist(self, X, Z):
    """Return squared Euclidean distances between rows of ``X`` and ``Z`` after lengthscale scaling.

    Both inputs are multiplied by ``1 / self.lengthscales`` first. When ``Z``
    is None the distances are between the rows of ``X`` itself. The result is
    clipped at zero to remove negative rounding residue.
    """
    inverse_scale = 1.0 / self.lengthscales
    left = tt.mul(X, inverse_scale)
    left_sq = tt.sum(tt.square(left), 1)

    if Z is None:
        right, right_sq = left, left_sq
    else:
        right = tt.mul(Z, inverse_scale)
        right_sq = tt.sum(tt.square(right), 1)

    sqd = -2.0 * tt.dot(left, tt.transpose(right)) +\
        (tt.reshape(left_sq, (-1, 1)) + tt.reshape(right_sq, (1, -1)))
    return tt.clip(sqd, 0.0, np.inf)
def square_dist(self, X, Z):
    """Return squared Euclidean distances between rows of ``X`` and ``Z``.

    Inputs are converted with ``tt.as_tensor_variable``. When ``Z`` is None
    the distances are between the rows of ``X`` itself. The result is clipped
    at zero to remove negative rounding residue.
    """
    left = tt.as_tensor_variable(X)
    left_sq = tt.sum(tt.square(left), 1)

    if Z is None:
        right, right_sq = left, left_sq
    else:
        right = tt.as_tensor_variable(Z)
        right_sq = tt.sum(tt.square(right), 1)

    sqd = -2.0 * tt.dot(left, tt.transpose(right)) +\
        (tt.reshape(left_sq, (-1, 1)) + tt.reshape(right_sq, (1, -1)))
    return tt.clip(sqd, 0.0, np.inf)
def log_likelihood(self):
    """Return the negated approximate log-likelihood using low-rank factors.

    The score matrix ``x`` is kept implicitly as ``self.L @ self.R.T``.
    ``ln(exp(x) + 1)`` is replaced by its Taylor expansion
    ``ln(2) + x/2 + x^2/8`` around ``x = 0``, and every term is itself kept as
    a pair ``(left, right)`` meaning ``left @ right.T``.

    Column layout read here: factors are ``[:, :-2]``; ``self.L[:, -2]`` and
    ``self.R[:, -1]`` are the columns that are pushed towards one.

    Returns:
        The negated log-likelihood including the L2 penalty on the factors
        and the penalty keeping the two "outer" columns at one.
    """
    Users = self.L[:, :-2]
    Items = self.R[:, :-2]
    UserOuter = self.L[:, -2]
    ItemOuter = self.R[:, -1]

    # Evaluate the sparse index set and its values once; they are reused below.
    nonzero_idx = self.counts.nonzero()
    nonzero_counts = np.array(self.counts[nonzero_idx]).ravel()

    # A implicitly stored as self.L @ self.R.T
    # loglik = T.sum(A * self.counts) => sum over nonzeros only
    print('nnz size: {}'.format(nonzero_idx[0].size))
    loglik = T.dot(self.evaluate_lowrank(self.L, self.R, nonzero_idx, fast=False), nonzero_counts)

    # There we use Taylor series ln(exp(x) + 1) = ln(2) + x/2 + x^2/8 + O(x^4) at x=0
    # A scalar coefficient must be applied to ONE factor of each pair only:
    # scaling both sides (as the previous code did) yields x/4 and x^2/64.
    # ln(2)
    const_term = (T.ones((self.num_users, 1)) * np.log(2), T.ones((self.num_items, 1)))
    # x/2
    first_order_term = (0.5 * self.L, self.R)
    # x^2/8 -- assumes hadamard() returns factors whose product is the
    # elementwise square of x; TODO confirm against its definition.
    squared_factors = hadamard((self.L, self.R), (self.L, self.R), self.num_factors)
    second_order_term = (squared_factors[0] / 8.0, squared_factors[1])

    A = (T.concatenate((const_term[0], first_order_term[0], second_order_term[0]), axis=1),
         T.concatenate((const_term[1], first_order_term[1], second_order_term[1]), axis=1))

    # Equivalent of: loglik -= T.sum((self.counts + 1) * A)
    loglik -= sum_lowrank(A)
    loglik -= T.dot(self.evaluate_lowrank(A[0], A[1], nonzero_idx, fast=False), nonzero_counts)

    # L2 regularization
    loglik -= 0.5 * self.reg_param * T.sum(T.square(Users))
    loglik -= 0.5 * self.reg_param * T.sum(T.square(Items))

    # we need strictly maintain UserOuter and ItemOuter be ones, just to ensure they properly
    # outer products with biases
    loglik -= self.num_users * T.sum(T.square(UserOuter - 1))
    loglik -= self.num_items * T.sum(T.square(ItemOuter - 1))

    # Return negation of LogLikelihood cause we will minimize cost
    return -loglik
def _do_calc(self, v0, v1):
    """Return pairwise (squared) Euclidean distances between rows of ``v0`` and columns of ``v1``.

    The Theano function is compiled on first use and cached in ``self._func``.
    Distances are computed as ``||a||^2 + ||b||^2 - 2 a.b``; the square root is
    applied only when ``self._do_sqrt`` is truthy.
    """
    if self._func is None:
        xv0 = T.matrix('v0')
        xv1 = T.matrix('v1')
        sqrsum0 = T.square(xv0).sum(axis=1, keepdims=True)
        sqrsum1 = T.square(xv1).sum(axis=0, keepdims=True)
        dot = T.dot(xv0, xv1)
        # Rounding can push the expansion slightly below zero for (nearly)
        # identical vectors; clamp so that sqrt never produces NaN.
        dist = T.maximum(sqrsum0 + sqrsum1 - dot * 2, 0)
        if self._do_sqrt:
            dist = T.sqrt(dist)
        self._func = theano.function([xv0, xv1], dist)
    return self._func(v0, v1)
def full(self, X, Xs=None):
    """Return the full covariance matrix between ``X`` and ``Xs`` (or ``X`` with itself).

    With ``rx = lfunc(X)`` and ``rz = lfunc(Xs)`` the value is
    ``sqrt(2 * outer(rx, rz) / (rx^2 + rz^2)) * exp(-r2 / (rx^2 + rz^2))``
    where ``r2`` is ``self.square_dist`` of the two inputs.
    """
    X, Xs = self._slice(X, Xs)
    other = X if Xs is None else Xs

    rx = self.lfunc(tt.as_tensor_variable(X), self.args)
    rz = self.lfunc(tt.as_tensor_variable(other), self.args)
    r2 = self.square_dist(X, other)

    sq_sum = tt.reshape(tt.square(rx), (-1, 1)) + tt.reshape(tt.square(rz), (1, -1))
    prefactor = tt.sqrt((2.0 * tt.outer(rx, rz)) / sq_sum)
    return (prefactor * tt.exp(-1.0 * r2 / sq_sum))
def get_norms(model, gradients):
    """Compute norm of weights and their gradients divided by the number of elements

    For every entry of ``model.params`` the L2 norm of the parameter, and of
    ``gradients[param]``, is divided by the product of its shape. Each result
    is named ``norm_<name>`` / ``grad_norm_<name>``.

    Returns:
        ``(norms, grad_norms)``: two lists in the iteration order of ``model.params``.
    """
    float_x = theano.config.floatX

    def size_scaled_norm(tensor):
        # L2 norm divided by the element count.
        return T.sqrt(T.sum(T.square(tensor))) / T.prod(tensor.shape.astype(float_x))

    norms, grad_norms = [], []
    for param_name, param in model.params.iteritems():
        weight_norm = size_scaled_norm(param)
        weight_norm.name = 'norm_' + param_name
        norms.append(weight_norm)

        gradient_norm = size_scaled_norm(gradients[param])
        gradient_norm.name = 'grad_norm_' + param_name
        grad_norms.append(gradient_norm)
    return norms, grad_norms
def full(self, X, Xs=None):
    """Return the full covariance matrix between ``X`` and ``Xs`` (or ``X`` with itself).

    With ``rx = lfunc(X)`` and ``rz = lfunc(Xs)`` the value is
    ``sqrt(2 * dot(rx, rz^T) / (rx^2 + rz^2)) * exp(-r2 / (rx^2 + rz^2))``
    where ``r2`` is ``self.square_dist`` of the two inputs.
    """
    X, Xs = self._slice(X, Xs)
    other = X if Xs is None else Xs

    rx = self.lfunc(X, self.args)
    rz = self.lfunc(other, self.args)
    r2 = self.square_dist(X, other)

    sq_sum = tt.reshape(tt.square(rx), (-1, 1)) + tt.reshape(tt.square(rz), (1, -1))
    prefactor = tt.sqrt((2.0 * tt.dot(rx, tt.transpose(rz))) / sq_sum)
    return (prefactor * tt.exp(-1.0 * r2 / sq_sum))
def adagrad(cost, params, lr, eps=1e-8, max_norm=None, max_norm_elemwise=None):
    """Build AdaGrad updates for ``params`` with respect to ``cost``.

    Gradients are passed through ``clip_grads`` first. Each parameter gets a
    zero-initialised accumulator of squared gradients; the parameter step
    divides the gradient by ``sqrt(accumulator + grad^2 + eps)``.

    Returns:
        ``(updates, total_norm(grads), total_norm(params))`` where ``updates``
        lists the parameter updates followed by the accumulator updates.
    """
    raw_grads = [T.grad(cost, p) for p in params]
    grads = clip_grads(raw_grads, max_norm=max_norm, max_norm_elemwise=max_norm_elemwise)
    accums = [theano.shared(value=np.zeros(p.get_value().shape, dtype=floatX)) for p in params]

    param_updates = []
    accum_updates = []
    for p, g, a in zip(params, grads, accums):
        param_updates.append((p, p - lr * g / T.sqrt(a + T.square(g) + eps)))
        accum_updates.append((a, a + T.square(g)))

    return param_updates + accum_updates, total_norm(grads), total_norm(params)
def optimize(self, params, cost):
    """Build AdaDelta updates for ``params`` with respect to ``cost``.

    The cost is wrapped in ``theano.gradient.grad_clip(cost, -10, 10)`` before
    differentiation. For every parameter three updates are emitted, in this
    order: squared-gradient average, parameter, squared-step average. The
    decay is ``self.rho`` and the step is scaled by ``self.lrate``.

    Returns:
        List of ``(shared_variable, new_value)`` pairs.
    """
    clipped_cost = theano.gradient.grad_clip(cost, -10, 10)
    grads = tensor.grad(cost=clipped_cost, wrt=params)
    grad_avgs = [_shared_zeros_like(p.get_value()) for p in params]
    step_avgs = [_shared_zeros_like(p.get_value()) for p in params]

    updates = []
    for p, g, grad_avg, step_avg in zip(params, grads, grad_avgs, step_avgs):
        next_grad_avg = self.rho * grad_avg + (1.0 - self.rho) * tensor.square(g)
        step = g * tensor.sqrt(step_avg + EPS) / tensor.sqrt(next_grad_avg + EPS)
        next_step_avg = self.rho * step_avg + (1.0 - self.rho) * tensor.square(step)
        updates.extend([
            (grad_avg, next_grad_avg),
            (p, p - self.lrate * step),
            (step_avg, next_step_avg),
        ])
    return updates
def adadelta(params, cost, lr=1.0, rho=0.95):
    """Build AdaDelta updates for ``params`` with respect to ``cost``.

    For every parameter three updates are emitted, in this order:
    squared-gradient average, parameter, squared-step average. The module-level
    ``epsilon`` is added under both square roots.

    Returns:
        List of ``(shared_variable, new_value)`` pairs.
    """
    grads = T.grad(cost, params)
    grad_avgs = [shared_zeros_like(p.get_value()) for p in params]
    step_avgs = [shared_zeros_like(p.get_value()) for p in params]

    updates = []
    for p, g, grad_avg, step_avg in zip(params, grads, grad_avgs, step_avgs):
        next_grad_avg = rho * grad_avg + (1.0 - rho) * T.square(g)
        step = g * T.sqrt(step_avg + epsilon) / T.sqrt(next_grad_avg + epsilon)
        next_step_avg = rho * step_avg + (1.0 - rho) * T.square(step)
        updates.extend([
            (grad_avg, next_grad_avg),
            (p, p - lr * step),
            (step_avg, next_step_avg),
        ])
    return updates
def mmd_approx(x_t, y_t, alpha=0.5):
    """ Implementation of the linear time approximation to the gaussian kernel MMD statistic

    Samples are split along axis 1 into the entries at even indices
    (``[:, ::2]``) and odd indices (``[:, 1::2]``); the statistic pairs them as
    ``k(x_a, x_b) + k(y_a, y_b) - k(x_a, y_b) - k(x_b, y_a)`` with each ``k``
    being twice the mean of ``exp(-0.5 * (1 / alpha) * diff^2)``.
    """
    x_first, x_second = x_t[:, ::2], x_t[:, 1::2]
    y_first, y_second = y_t[:, ::2], y_t[:, 1::2]

    def paired_kernel(left, right):
        # Twice the mean Gaussian kernel over aligned pairs.
        return 2 * T.mean(T.exp(-0.5 * (1 / alpha) * T.square(left - right)))

    within_x = paired_kernel(x_first, x_second)   # k(x_{2i-1}, x_{2i})
    within_y = paired_kernel(y_first, y_second)   # k(y_{2i-1}, y_{2i})
    cross_a = paired_kernel(x_first, y_second)    # k(x_{2i-1}, y_{2i})
    cross_b = paired_kernel(x_second, y_first)    # k(x_{2i}, y_{2i-1})
    return within_x + within_y - cross_a - cross_b
def adadelta(params, cost, lr=1.0, rho=0.95):
    """Build AdaDelta updates for ``params`` with respect to ``cost``.

    For every parameter three updates are emitted, in this order:
    squared-gradient average, parameter, squared-step average. The module-level
    ``epsilon`` is added under both square roots.

    Returns:
        List of ``(shared_variable, new_value)`` pairs.
    """
    # from https://github.com/fchollet/keras/blob/master/keras/optimizers.py
    grads = T.grad(cost, params)
    running_sq_grads = [shared_zeros_like(p.get_value()) for p in params]
    running_sq_steps = [shared_zeros_like(p.get_value()) for p in params]

    updates = []
    for p, g, sq_grad, sq_step in zip(params, grads, running_sq_grads, running_sq_steps):
        sq_grad_next = rho * sq_grad + (1.0 - rho) * T.square(g)
        updates.append((sq_grad, sq_grad_next))

        step = g * T.sqrt(sq_step + epsilon) / T.sqrt(sq_grad_next + epsilon)
        updates.append((p, p - lr * step))

        updates.append((sq_step, rho * sq_step + (1.0 - rho) * T.square(step)))
    return updates
def get_updates(self, grads):
    """Build rescaled (and optionally momentum-smoothed) gradient updates.

    Args:
        grads: Iterable of ``(d, dp, g)`` triples: the variable to update, the
            variable receiving the smoothed gradient, and the gradient.

    The sum of squared gradient entries over all triples is compared to
    ``self.threshold`` (no square root is taken); when it is not below the
    threshold each gradient is multiplied by ``threshold / sum``.

    Returns:
        ``(updates, T.sum(norms))`` where ``norms`` is that sum of squares.
    """
    norms = None
    for (_, _, g) in grads:
        squared_total = T.sum(T.square(g))
        norms = squared_total if norms is None else norms + squared_total

    updates = []
    for (d, dp, g) in grads:
        rescale = ifelse(T.lt(norms, self.threshold), 1., self.threshold / norms)
        g = g * rescale
        if self.momentum > 0:
            g = self.momentum * dp + (1 - self.momentum) * g
            updates.append((dp, g))
        updates.append((d, d - self.lr * g))
    return updates, T.sum(norms)
def __init__(self, incoming, num_units, theta=lasagne.init.Normal(0.1), b=lasagne.init.Constant(0.), weight_scale=lasagne.init.Constant(1.), train_scale=False, nonlinearity=relu, **kwargs):
    """Create a dense layer whose weight columns are normalised and rescaled.

    ``self.W`` is ``theta`` with every column divided by its L2 norm and
    multiplied by the matching entry of ``weight_scale``.

    Args:
        incoming: Input layer; all but its first shape entry are flattened.
        num_units: Number of output units.
        theta: Initialiser for the unnormalised weight matrix.
        b: Initialiser for the bias.
        weight_scale: Initialiser for the per-unit scale.
        train_scale: Whether ``weight_scale`` is registered as trainable.
        nonlinearity: Activation; None selects the identity.
        **kwargs: Forwarded to the parent constructor.
    """
    super(DenseLayer, self).__init__(incoming, **kwargs)
    if nonlinearity is None:
        self.nonlinearity = lasagne.nonlinearities.identity
    else:
        self.nonlinearity = nonlinearity
    self.num_units = num_units

    num_inputs = int(np.prod(self.input_shape[1:]))
    self.theta = self.add_param(theta, (num_inputs, num_units), name="theta")
    self.weight_scale = self.add_param(weight_scale, (num_units, ), name="weight_scale", trainable=train_scale)

    column_norms = T.sqrt(T.sum(T.square(self.theta), axis=0))
    self.W = self.theta * (self.weight_scale / column_norms).dimshuffle('x', 0)

    self.b = self.add_param(b, (num_units, ), name="b")
def _build_conditional(self, Xnew, pred_noise, diag, X, y, noise, cov_total, mean_total):
    """Build the predictive mean and (co)variance at ``Xnew``.

    Args:
        Xnew: Points at which to predict.
        pred_noise: If true, the noise evaluated at ``Xnew`` is added to the result.
        diag: If true return only the variance vector, otherwise the full covariance.
        X: Training inputs.
        y: Training targets.
        noise: Callable giving the noise covariance.
        cov_total: Covariance callable used for the training block.
        mean_total: Mean callable used for the training residual.

    Returns:
        ``(mu, var)`` when ``diag`` is true, else ``(mu, stabilize(cov))``.
    """
    k_train = cov_total(X)
    k_cross = self.cov_func(X, Xnew)
    k_noise = noise(X)
    residual = y - mean_total(X)

    chol = cholesky(stabilize(k_train) + k_noise)
    A = solve_lower(chol, k_cross)
    v = solve_lower(chol, residual)
    mu = self.mean_func(Xnew) + tt.dot(tt.transpose(A), v)

    if not diag:
        cov = self.cov_func(Xnew) - tt.dot(tt.transpose(A), A)
        if pred_noise:
            cov += noise(Xnew)
        return mu, stabilize(cov)

    var = self.cov_func(Xnew, diag=True) - tt.sum(tt.square(A), 0)
    if pred_noise:
        var += noise(Xnew, diag=True)
    return mu, var
def _compute_losses(self, model_output):
    """Return the per-sequence loss: Gaussian-mixture NLL plus weighted stopping cross-entropy.

    ``model_output[0]`` supplies the stopping outputs and ``model_output[1]``
    the regression outputs from which the mixture parameters are obtained.
    Per-time-step losses are masked and summed over axis 1; unless
    ``self.sum_over_timestep`` is set they are then divided by the mask sum.
    The per-step and per-sequence losses are also stored on ``self``.
    """
    mask = self.dataset.symb_mask
    symb_targets = self.dataset.symb_targets

    # stopping_outputs.shape : (batch_size, seq_len)
    stopping_outputs = model_output[0][:, :, 0]
    # regression_outputs.shape : (batch_size, seq_len, regression_layer_size)
    regression_outputs = model_output[1]

    # mixture_weights.shape : (batch_size, seq_len, n_gaussians)
    # means.shape : (batch_size, seq_len, n_gaussians, 3)
    # stds.shape : (batch_size, seq_len, n_gaussians, 3)
    mixture_weights, means, stds = self.model.get_mixture_parameters(regression_outputs, ndim=4)

    # targets.shape : (batch_size, seq_len, 1, 3)
    targets = symb_targets[:, :, None, :3]
    # stopping_targets.shape : (batch_size, seq_len)
    stopping_targets = symb_targets[:, :, 3]

    log_prefix = -2 * T.log(mixture_weights) + self.d * np.float32(np.log(2*np.pi)) + 2 * T.sum(T.log(stds), axis=-1)
    square_mahalanobis_dist = T.sum(T.square((targets - means) / stds), axis=-1)
    mixture_nll = -logsumexp(-0.5 * (log_prefix + square_mahalanobis_dist), axis=2)
    stopping_xent = T.nnet.binary_crossentropy(stopping_outputs, stopping_targets)

    # loss_per_time_step.shape : (batch_size, seq_len)
    # self.gamma should be used to balance the two loss terms. Consider tweaking this hyperparameter if training goes wrong.
    self.loss_per_time_step = mixture_nll + self.gamma * stopping_xent

    # loss_per_seq.shape : (batch_size,)
    per_sequence = T.sum(self.loss_per_time_step * mask, axis=1)
    if not self.sum_over_timestep:
        # Average over the unmasked time steps instead of summing.
        per_sequence = per_sequence / T.sum(mask, axis=1)
    self.loss_per_seq = per_sequence
    return self.loss_per_seq
def setup_orbit_model(self, period=None):
    """Add the orbit parameters and the time-delay likelihood to ``self.model``.

    Args:
        period: Centre of the prior on the period. When None (the default) it
            is estimated from a Lomb-Scargle periodogram of ``self.tds[0]``
            over ``self.times``. Previously this argument was accepted but
            silently ignored.
    """
    if period is None:
        # Estimate initial period from TD's
        ls_model = LombScargle(self.times, self.tds[0])
        freq_grid = np.linspace(1e-3, 0.5 / np.median(np.diff(self.times)), 10000)
        power = ls_model.power(freq_grid, method="fast", normalization="psd")
        period_t = 1 / freq_grid[np.argmax(power)]
    else:
        period_t = period

    with self.model as model:
        # Parameters
        self.period = pm.Normal("period", mu=period_t, sd=100)
        self.tref = pm.Uniform("tref", lower=-5000, upper=5000)
        self.varpi = pm.Uniform("varpi", lower=0, upper=50)
        self.eccen = pm.Uniform("eccen", lower=1e-3, upper=0.999)
        self.lighttime = pm.Uniform('lighttime', lower=-2000, upper=2000, shape=(len(self.freqs)))

        # Deterministic transformations
        # Mean anom
        M = 2.0 * np.pi * (self.times - self.tref) / self.period
        # True anom
        f = get_true_anomaly(M, self.eccen + tt.zeros_like(M))

        factor = 1.0 - tt.square(self.eccen)
        factor /= 1.0 + self.eccen * tt.cos(f)
        psi = -factor * tt.sin(f + self.varpi)

        # One row per entry of `lighttime`, one column per time stamp.
        tau = self.lighttime[:, None] * psi[None, :]
        taumodel = pm.Deterministic('taumodel', tau - tt.mean(tau))

        # Condition on the observations
        # NOTE(review): sd=None leaves the likelihood width to the library
        # default -- confirm this is intended.
        pm.Normal("obs", mu=taumodel, sd=None, observed=self.tds)
def _build_conditional(self, Xnew, pred_noise, diag):
    """Build the predictive mean and (co)variance at ``Xnew`` for the Kronecker-structured model.

    Args:
        Xnew: Points at which to predict.
        pred_noise: If true, the noise variance ``sigma**2`` is added to the
            returned variance / covariance diagonal (nothing is added when
            ``sigma`` is None).
        diag: If true the second return value is the variance vector,
            otherwise the full covariance.

    Returns:
        ``(mu, cov)``.
    """
    Xs, y, sigma = self.Xs, self.y, self.sigma

    # Old points
    X = cartesian(*Xs)
    delta = y - self.mean_func(X)
    Kns = [f(x) for f, x in zip(self.cov_funcs, Xs)]
    eigs_sep, Qs = zip(*map(eigh, Kns))  # Unzip
    QTs = list(map(tt.transpose, Qs))
    eigs = kron_diag(*eigs_sep)  # Combine separate eigs
    if sigma is not None:
        eigs += sigma**2

    # New points
    Km = self.cov_func(Xnew, diag=diag)
    Knm = self.cov_func(X, Xnew)
    Kmn = Knm.T

    # Build conditional mu
    alpha = kron_dot(QTs, delta)
    alpha = alpha / eigs[:, None]
    alpha = kron_dot(Qs, alpha)
    mu = tt.dot(Kmn, alpha).ravel() + self.mean_func(Xnew)

    # Build conditional cov
    A = kron_dot(QTs, Knm)
    A = A / tt.sqrt(eigs[:, None])

    # The training noise enters above as sigma**2, so the predictive noise
    # must be the same variance; the previous code added sigma itself.
    add_noise = pred_noise and sigma is not None
    if diag:
        Asq = tt.sum(tt.square(A), 0)
        cov = Km - Asq
        if add_noise:
            cov += sigma**2
    else:
        Asq = tt.dot(A.T, A)
        cov = Km - Asq
        if add_noise:
            cov += sigma**2 * tt.identity_like(cov)
    return mu, cov
def get_output_for(self, input, deterministic=False, set_bn_updates=True, **kwargs):
    """Normalise ``input`` per feature, then apply scale, bias and the nonlinearity.

    Args:
        input: Symbolic input.
        deterministic: If true use the stored running mean/variance,
            otherwise the statistics of the current batch.
        set_bn_updates: In non-deterministic mode, also store the
            running-average updates in ``self.bn_updates``.
        **kwargs: Ignored.

    Returns:
        ``self.nonlinearity`` applied to the normalised input, multiplied by
        ``self.g`` and shifted by ``self.b`` when those attributes exist.
    """
    pattern = self.dimshuffle_args

    if deterministic:
        centered = input - self.avg_batch_mean.dimshuffle(*pattern)
        norm_features = centered / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*pattern)
    else:
        batch_mean = T.mean(input, axis=self.axes_to_sum).flatten()
        centered = input - batch_mean.dimshuffle(*pattern)
        batch_var = T.mean(T.square(centered), axis=self.axes_to_sum).flatten()
        batch_stdv = T.sqrt(1e-6 + batch_var)
        norm_features = centered / batch_stdv.dimshuffle(*pattern)

        # BN updates
        if set_bn_updates:
            n_rows = input.shape[0]
            # The n / (n - 1) factor rescales the batch variance before it
            # enters the running average.
            var_weight = T.cast((0.1 * n_rows) / (n_rows - 1), th.config.floatX)
            new_m = 0.9 * self.avg_batch_mean + 0.1 * batch_mean
            new_v = 0.9 * self.avg_batch_var + var_weight * batch_var
            self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]

    activation = norm_features
    if hasattr(self, 'g'):
        activation = norm_features * self.g.dimshuffle(*pattern)
    if hasattr(self, 'b'):
        activation = activation + self.b.dimshuffle(*pattern)
    return self.nonlinearity(activation)
def __call__(self, X):
    """Return the RBF kernel matrix of the rows of ``X`` and a gradient-like term.

    The bandwidth is derived from the median of all pairwise squared distances:
    ``h = sqrt(0.5 * median / log(n + 1))`` with ``n = X.shape[0]``.

    Returns:
        ``(Kxy, dxkxy)`` where ``Kxy = exp(-H / h^2 / 2)`` and
        ``dxkxy = (-Kxy . X + X * rowsum(Kxy)) / h^2``.
    """
    n_rows = X.shape[0]
    gram = X.dot(X.T)
    sq_norm_col = tt.reshape(tt.sum(tt.square(X), axis=1), (n_rows, 1))
    sq_norm_grid = tt.repeat(sq_norm_col, n_rows, axis=1)
    H = tt.sub(tt.add(sq_norm_grid, sq_norm_grid.T), 2 * gram)

    sorted_dists = tt.sort(H.flatten())
    length = sorted_dists.shape[0]
    middle = length // 2
    # median distance
    median = tt.switch(tt.eq((length % 2), 0),
                       # if even vector
                       tt.mean(sorted_dists[(middle - 1):(middle + 1)]),
                       # if odd vector
                       sorted_dists[middle])

    h = tt.sqrt(0.5 * median / tt.log(n_rows.astype('float32') + 1.0))

    Kxy = tt.exp(-H / h ** 2 / 2.0)
    row_sums = tt.sum(Kxy, axis=1).dimshuffle(0, 'x')
    dxkxy = tt.add(-tt.dot(Kxy, X), tt.mul(X, row_sums)) / (h ** 2)
    return Kxy, dxkxy
def get_update_func(self):
    """Build the RMSProp updates and hand them to ``self.model.get_update_func``.

    One zero-valued shared variable per entry of ``self.model.param`` is
    created and stored in ``self.meansquare``. The learning rate and decay
    rate are symbolic scalars passed on as the extra inputs ``[lr, rho]``.
    """
    print('*** Update Function of Rmsprop ......')
    # opt_log.info("*** Update Function of Rmsprop ......")
    float_x = theano.config.floatX
    lr = TT.scalar(self._s("learning_rate"), dtype=float_x)
    rho = TT.scalar(self._s("decay_rate"), dtype=float_x)
    eps = numpy_floatX(1E-6)

    self.meansquare = [
        theano.shared(p.get_value() * numpy_floatX(0.), name="%s.meansquare" % p.name)
        for p in self.model.param
    ]
    refreshed_ms = [
        rho * old_ms + (1 - rho) * (TT.square(g))
        for g, old_ms in zip(self.grad, self.meansquare)
    ]

    # Accumulator updates first, then parameter steps using the refreshed averages.
    updates = list(zip(self.meansquare, refreshed_ms))
    for p, g, new_ms in zip(self.model.param, self.grad, refreshed_ms):
        updates.append((p, p - lr * g / TT.sqrt(new_ms + eps)))

    return self.model.get_update_func(updates, [lr, rho])
def _log_like(self, X, Y, n_examples):
    """Return the scaled log-likelihood (with priors) and the summed squared error.

    Column 0 of the network output is read as the predictive mean and
    column 1 as the log-variance. The data term is divided by the batch
    size, the two prior terms by ``n_examples``. As a side effect
    ``self.tn_examples`` is set to a shared float32 holding ``n_examples``.

    Raises
    ------
    RuntimeError
        If ``self.out_type`` is not ``'Gaussian'``.
    """
    net_out = lasagne.layers.get_output(self.f_net, X)
    mean = net_out[:, 0].reshape((-1, 1))
    log_var = net_out[:, 1].reshape((-1, 1))
    # Small constant guards against division by zero.
    inv_var = 1. / (T.exp(log_var) + 1e-8)
    sq_err = T.square(Y - mean)

    if self.out_type != 'Gaussian':
        raise RuntimeError('{} not implemented'.format(self.out_type))

    per_example = T.sum(-sq_err * (0.5 * inv_var) - 0.5 * log_var, axis=1)
    # scale by batch size to make this work nicely with the updaters
    data_term = T.sum(per_example) / T.cast(X.shape[0], theano.config.floatX)

    # priors are scaled by the dataset size for the same reason
    self.tn_examples = sharedX(np.float32(n_examples))
    variance_term = (self.variance_prior.log_like(log_var, n_examples)
                     / self.tn_examples)
    weights = lasagne.layers.get_all_params(self.f_net, regularizable=True)
    weight_term = self.weight_prior.log_like(weights) / self.tn_examples

    return data_term + variance_term + weight_term, T.sum(sq_err)
def adam(cost, params, lr, beta1=0.9, beta2=0.999, eps=1e-8, param_grads=None):
    """Build the list of Adam update pairs for ``params``.

    Parameters
    ----------
    cost : symbolic scalar differentiated w.r.t. ``params`` when
        ``param_grads`` is None.
    params : list of shared variables to update.
    lr : base learning rate.
    beta1, beta2 : decay rates of the first / second moment estimates.
    eps : constant added to the denominator.
    param_grads : optional precomputed gradients; when given, ``cost`` is
        not differentiated.

    Returns
    -------
    list of ``(shared_variable, new_expression)`` pairs covering the moment
    buffers, the parameters and the step counter.

    The step is *subtracted* from each parameter, so the updates descend on
    ``cost``.
    """
    updates = []
    if param_grads is None:
        grads = tensor.grad(cost, params)
        assert len(params) == len(grads)
    else:
        # NOTE(review): the whole ``param_grads`` object is wrapped in ONE
        # shared variable, yet it is zipped with ``params`` below. Confirm
        # this is the intended way to pass precomputed gradients.
        grads = theano.shared(param_grads)

    # Shared step counter; ``t`` is the value used for bias correction.
    t0 = theano.shared(np.array(0., dtype=theano.config.floatX))
    t = t0 + 1
    corr1 = (1 - beta1**t)
    corr2 = (1 - beta2**t)
    # Bias-corrected step size.
    alpha = lr * tensor.sqrt(corr2) / corr1

    for p, g in zip(params, grads):
        shape = p.get_value().shape
        m = theano.shared(value=np.zeros(shape, dtype=theano.config.floatX),
                          broadcastable=p.broadcastable)
        v = theano.shared(value=np.zeros(shape, dtype=theano.config.floatX),
                          broadcastable=p.broadcastable)
        m_t = beta1 * m + (1 - beta1) * g
        v_t = beta2 * v + (1 - beta2) * tensor.square(g)
        p_t = p - alpha * m_t / (tensor.sqrt(v_t) + eps)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))

    updates.append((t0, t))
    return updates
def negativ_log_likelihood(self, f_net, X, y, n_examples, weight_prior,
                           variance_prior):
    """Return the negative log-likelihood (with priors) and the mean squared error.

    Column 0 of the network output is read as the predictive mean and
    column 1 as the log-variance.

    Parameters
    ----------
    f_net : lasagne network whose output has (at least) two columns.
    X, y : symbolic inputs and targets.
    n_examples : dataset size used to scale both prior terms.
    weight_prior, variance_prior : objects exposing ``log_like``.

    Returns
    -------
    (nll, mse)
        ``nll`` is the negated sum of the batch-averaged data term and the
        two prior terms, each divided by ``n_examples``; ``mse`` is
        ``mean((y - mean)**2)``.
    """
    f_out = lasagne.layers.get_output(f_net, X)
    f_mean = f_out[:, 0].reshape((-1, 1))
    f_log_var = f_out[:, 1].reshape((-1, 1))
    # Small constant guards against division by zero.
    f_var_inv = 1. / (T.exp(f_log_var) + 1e-16)
    mse = T.square(y - f_mean)
    log_like = T.sum(
        T.sum(-mse * (0.5 * f_var_inv) - 0.5 * f_log_var, axis=1))

    # scale by batch size to make this work nicely with the updaters
    log_like /= T.cast(X.shape[0], theano.config.floatX)

    # scale the priors by the dataset size for the same reason
    tn_examples = T.cast(n_examples, theano.config.floatX)

    # prior for the variance (previously added unscaled, unlike the weight
    # prior below)
    log_like += variance_prior.log_like(f_log_var, n_examples) / tn_examples

    # prior for the weights
    params = lasagne.layers.get_all_params(f_net, trainable=True)
    log_like += weight_prior.log_like(params) / tn_examples

    return -log_like, T.mean(mse)
def _compute_losses(self, model_output):
    """Combine the Gaussian NLL and the stopping cross-entropy per sequence.

    ``model_output[0][:, :, 0]`` is the stopping prediction and
    ``model_output[1]`` the regression output from which ``mu`` and
    ``sigma`` are derived. The first three target channels are regression
    targets, channel 3 is the stopping target.

    Sets ``self.loss_per_time_step`` and ``self.loss_per_seq`` and returns
    the latter: the masked sum over time, divided by the mask length
    unless ``self.sum_over_timestep`` is true.
    """
    symb_targets = self.dataset.symb_targets
    mask = self.dataset.symb_mask

    stop_pred = model_output[0][:, :, 0]
    regression_out = model_output[1]
    # mu, sigma: (batch_size, seq_len, 3)
    mu, sigma = self.model.get_distribution_parameters(regression_out)

    # targets: (batch_size, seq_len, 3)
    targets = symb_targets[:, :, :3]
    stop_target = symb_targets[:, :, 3]

    # Diagonal-Gaussian negative log-likelihood per time step.
    standardized = (targets - mu) / sigma
    sq_mahalanobis = T.sum(T.square(standardized), axis=-1)
    log_det_term = 2 * T.sum(T.log(sigma), axis=-1)
    const_term = self.d * np.float32(np.log(2 * np.pi))
    nll_step = 0.5 * (const_term + log_det_term + sq_mahalanobis)

    stop_xent_step = T.nnet.binary_crossentropy(stop_pred, stop_target)

    # self.gamma balances the two loss terms; tweak it if training goes wrong.
    # loss_per_time_step: (batch_size, seq_len)
    self.loss_per_time_step = nll_step + self.gamma * stop_xent_step

    # loss_per_seq: (batch_size,), masked sum over time
    self.loss_per_seq = T.sum(self.loss_per_time_step * mask, axis=1)
    if not self.sum_over_timestep:
        # turn the sum into an average over the unmasked steps
        self.loss_per_seq = self.loss_per_seq / T.sum(mask, axis=1)

    return self.loss_per_seq
def lmul_sq_T(self, x): raise NotImplementedError( "This method is not yet modified since copy-pasting from pylearn2.linear.conv2d" ) """ Kind of a stupid hacky method used to support convolutional score matching. Ought to find a way to make _filters symbolic rather than shared. """ assert x.dtype == self._filters.dtype op_axes = ('b', 'c', 0, 1) axes = self.output_axes if tuple(axes) != op_axes: x = x.dimshuffle(axes.index('b'), axes.index('c'), axes.index(0), axes.index(1)) # dot(x, sq(A).T) dummy_v = T.tensor4() sqfilt = T.square(self._filters) z_hs = 0. #conv2d(dummy_v, sqfilt, #image_shape=self._img_shape, #filter_shape=self._filters_shape, #kernel_stride=self._kernel_stride, #pad = self.pad #) rval, xdummy = z_hs.owner.op.grad((dummy_v, sqfilt), (x, )) # Format the output based on the input space axes = self.input_space.axes assert len(axes) == 4 if tuple(axes) != op_axes: rval = rval.dimshuffle(op_axes.index(axes[0]), op_axes.index(axes[1]), op_axes.index(axes[2]), op_axes.index(axes[3])) return rval
def rbf_kernel(X):
    """Return the RBF kernel matrix of ``X`` and the matching gradient term.

    The bandwidth follows the median heuristic:
    ``h = sqrt(0.5 * median(H) / log(n + 1))`` where ``H`` holds the
    pairwise squared distances between rows of ``X`` and ``n = X.shape[0]``.

    Returns
    -------
    (Kxy, dxkxy)
        ``Kxy = exp(-H / h**2 / 2)`` and
        ``dxkxy = (-Kxy.dot(X) + X * rowsum(Kxy)) / h**2``.
    """
    # TODO. rbf may not be a good choice for high dimension data
    n_rows = X.shape[0]
    inner = tt.dot(X, X.transpose())
    row_sq = tt.reshape(tt.sum(tt.square(X), axis=1), (n_rows, 1))
    tiled = tt.repeat(row_sq, n_rows, axis=1)
    # ||x_i||^2 + ||x_j||^2 - 2 <x_i, x_j>
    H = tt.sub(tt.add(tiled, tiled.transpose()), 2 * inner)

    # Median of all entries of H.
    flat = H.flatten()
    size = flat.shape[0]
    ranked = tt.sort(flat)
    median = ifelse(
        tt.eq((size % 2), 0),
        # even number of entries: average the two middle ones
        tt.mean(ranked[((size // 2) - 1):((size // 2) + 1)]),
        # odd number of entries: the middle one
        ranked[size // 2])

    h = tt.sqrt(0.5 * median / tt.log(X.shape[0].astype('float32') + 1.0))

    Kxy = tt.exp(-H / h ** 2 / 2.0)
    row_sums = tt.sum(Kxy, axis=1).dimshuffle(0, 'x')
    dxkxy = tt.add(-tt.dot(Kxy, X), tt.mul(X, row_sums)) / (h ** 2)
    return Kxy, dxkxy
def vgd_kernel(X0):
    """Return the RBF kernel of ``X0``, its gradient term and the bandwidth.

    ``H`` holds the pairwise squared distances between rows of ``X0``. The
    bandwidth is half of the median-heuristic value:
    ``h = sqrt(0.5 * median(H) / log(n + 1)) / 2`` with ``n = X0.shape[0]``.

    Returns
    -------
    (Kxy, dxkxy, h)
        ``Kxy = exp(-H / h**2 / 2)``,
        ``dxkxy = (-Kxy.dot(X0) + X0 * rowsum(Kxy)) / h**2`` and ``h``.
    """
    n_rows = X0.shape[0]
    inner = T.dot(X0, X0.transpose())
    row_sq = T.reshape(T.sum(T.square(X0), axis=1), (n_rows, 1))
    tiled = T.repeat(row_sq, n_rows, axis=1)
    # ||x_i||^2 + ||x_j||^2 - 2 <x_i, x_j>
    H = T.sub(T.add(tiled, tiled.transpose()), 2 * inner)

    # Median of all entries of H.
    flat = H.flatten()
    size = flat.shape[0]
    ranked = T.sort(flat)
    median_if_even = T.mean(ranked[((size // 2) - 1):((size // 2) + 1)])
    median_if_odd = ranked[size // 2]
    median = T.switch(T.eq((size % 2), 0), median_if_even, median_if_odd)

    h = T.sqrt(0.5 * median / T.log(X0.shape[0].astype('float32') + 1.0)) / 2.

    Kxy = T.exp(-H / h**2 / 2.0)
    row_sums = T.sum(Kxy, axis=1).dimshuffle(0, 'x')
    dxkxy = T.add(-T.dot(Kxy, X0), T.mul(X0, row_sums)) / (h**2)
    return (Kxy, dxkxy, h)
def second_order_update(loss_or_grads, params, oldparams, step_size):
    """Second-order update method for optimizing loss_last_sample, so basically,
    KL term (new params || old params) + NLL of latest sample. The Hessian is
    evaluated at the origin and provides curvature information to make a more
    informed step in the correct descent direction.

    Parameters
    ----------
    loss_or_grads : symbolic scalar loss. Despite the name it is always
        passed to ``T.grad``, so precomputed gradients are not accepted.
    params : list of parameters to update.
    oldparams : list aligned with ``params``; for a parameter named ``'mu'``
        or ``'b_mu'`` at index ``i`` the entry at ``i + 1`` is read as the
        matching old rho.
    step_size : scalar multiplier of the preconditioned gradient.

    Returns
    -------
    OrderedDict mapping each parameter to its updated expression.
    """
    grads = T.grad(loss_or_grads, params)
    updates = OrderedDict()
    for i, (param, grad) in enumerate(zip(params, grads)):
        if param.name in ('mu', 'b_mu'):
            # Mean parameters: inverse curvature is softplus(old rho)^2.
            oldparam_rho = oldparams[i + 1]
            invH = T.square(T.log(1 + T.exp(oldparam_rho)))
        else:
            # Any other parameter: curvature built from the current value.
            H = 2. * (T.exp(2 * param)) / \
                (1 + T.exp(param))**2 / (T.log(1 + T.exp(param))**2)
            invH = 1. / H
        updates[param] = param - step_size * invH * grad
    return updates
def adam_conditional_updates(
        params, cost, mincost, lr=0.001, mom1=0.9, mom2=0.999):
    """Adam updates that are skipped whenever ``cost < mincost``.

    Every update (moment buffers, parameters and the step counter) is
    wrapped in ``ifelse`` so that all state keeps its old value when the
    cost is already below ``mincost``.

    Returns a list of ``(shared_variable, new_expression)`` pairs.
    """
    to_floatx = np.cast[th.config.floatX]

    def keep_if_low(current, proposed):
        # if cost is less than mincost, don't do update
        return ifelse(cost < mincost, current, proposed)

    updates = []
    grads = T.grad(cost, params)
    t = th.shared(to_floatx(1.))

    for p, g in zip(params, grads):
        first = th.shared(to_floatx(p.get_value() * 0.))
        second = th.shared(to_floatx(p.get_value() * 0.))

        first_new = mom1 * first + (1. - mom1) * g
        second_new = mom2 * second + (1. - mom2) * T.square(g)

        # Bias-corrected moment estimates.
        first_hat = first_new / (1. - mom1**t)
        second_hat = second_new / (1. - mom2**t)

        step = first_hat / T.sqrt(second_hat + 1e-8)
        p_new = p - lr * step

        updates.append((first, keep_if_low(first, first_new)))
        updates.append((second, keep_if_low(second, second_new)))
        updates.append((p, keep_if_low(p, p_new)))

    updates.append((t, keep_if_low(t, t + 1)))
    return updates
def ADAM(lr,
         params,
         grads,
         loss,
         iteration,
         beta_1=0.9,
         beta_2=0.999,
         epsilon=1e-8):
    """Build ADAM update pairs with optional L2 weight decay.

    ``cfg.TRAIN.WEIGHT_DECAY`` times the parameter value is added to the
    gradient of every parameter that is not flagged ``is_bias`` (and only
    when the decay is non-zero). ``loss`` is accepted but not used here.

    Returns a list of ``(shared_variable, new_expression)`` pairs for the
    two moment buffers and the value of each parameter.
    """
    step = iteration
    # Bias-corrected learning rate for the current iteration.
    corrected_lr = lr * T.sqrt(1 - T.pow(beta_2, step)) / (1 - T.pow(beta_1, step))
    decay = cfg.TRAIN.WEIGHT_DECAY

    updates = []
    for param, grad in zip(params, grads):
        # both moment buffers start at zero, shaped like the parameter
        moment = theano.shared(param.val.get_value() * 0.)
        velocity = theano.shared(param.val.get_value() * 0.)

        skip_decay = param.is_bias or decay == 0
        effective_grad = grad if skip_decay else grad + decay * param.val

        moment_new = (beta_1 * moment) + (1 - beta_1) * effective_grad
        velocity_new = (beta_2 * velocity) + (1 - beta_2) * T.square(effective_grad)
        value_new = param.val - corrected_lr * moment_new / (T.sqrt(velocity_new) + epsilon)

        updates.extend([
            (moment, moment_new),
            (velocity, velocity_new),
            (param.val, value_new),
        ])

    return updates
def _apply_gradients(self, grads_and_vars):
    """Create the Adam update operation for ``(gradient, variable)`` pairs.

    Hyper-parameters are read from ``self.args``. Running powers of beta1
    and beta2 live in slots and are multiplied by the respective beta in
    the same update set.

    Returns an ``Operation`` wrapping the ordered update dictionary.
    """
    beta1, beta2 = self.args['beta1'], self.args['beta2']
    epsilon, learning_rate = self.args['epsilon'], self.args['learning_rate']

    beta1_power = self._create_slot(beta1, 'beta1_power')
    beta2_power = self._create_slot(beta2, 'beta2_power')

    # Bias-corrected step size.
    step_size = learning_rate * T.sqrt(1.0 - beta2_power) / (1.0 - beta1_power)

    updates = OrderedDict()
    for grad, var in grads_and_vars:
        first = self._create_slot_var(var, 'm')
        second = self._create_slot_var(var, 'v')

        # Incremental form of the exponential moving averages.
        first_next = first + (1.0 - beta1) * (grad - first)
        second_next = second + (1.0 - beta2) * (T.square(grad) - second)

        updates[first] = first_next
        updates[second] = second_next
        updates[var] = var - (first_next * step_size) / (T.sqrt(second_next) + epsilon)

    updates[beta1_power] = beta1_power * beta1
    updates[beta2_power] = beta2_power * beta2
    return Operation(op=updates)
def lmul_sq_T(self, x):
    """Apply the transpose of the squared-filter convolution to ``x``.

    Kind of a stupid hacky method used to support convolutional score
    matching. Ought to find a way to make _filters symbolic rather than
    shared.

    A convolution of a dummy 4-tensor with the element-wise squared filters
    is built only to reach its op; the op's ``grad`` is then called with
    ``x`` as the output gradient and the first returned gradient is kept.
    ``x`` is expected in ``self.output_axes`` order and the result is
    returned in ``self.input_space.axes`` order.
    """
    assert x.dtype == self._filters.dtype

    conv_axes = ('b', 'c', 0, 1)

    # Bring x into the axis order the convolution op works with.
    out_axes = self.output_axes
    if tuple(out_axes) != conv_axes:
        x = x.dimshuffle(*[out_axes.index(axis) for axis in conv_axes])

    # dot(x, sq(A).T)
    placeholder = T.tensor4()
    squared_filters = T.square(self._filters)
    forward = conv2d(
        placeholder,
        squared_filters,
        image_shape=self._img_shape,
        filter_shape=self._filters_shape,
        subsample=self._subsample,
        border_mode=self._border_mode,
    )
    rval, _unused = forward.owner.op.grad((placeholder, squared_filters), (x, ))

    # Format the output based on the input space
    in_axes = self.input_space.axes
    assert len(in_axes) == 4
    if tuple(in_axes) != conv_axes:
        rval = rval.dimshuffle(*[conv_axes.index(in_axes[k]) for k in range(4)])

    return rval
def log_diff_normal_cdf(mu, sigma, x, y):
    """
    Compute :math:`\\log(\\Phi(\\frac{x - \\mu}{\\sigma}) - \\Phi(\\frac{y - \\mu}{\\sigma}))`
    safely in log space.

    Parameters
    ----------
    mu: float
        mean
    sigma: float
        std
    x: float
    y: float
        must be strictly less than x.

    Returns
    -------
    log (\\Phi(x) - \\Phi(y))
    """
    # Standardize and fold in the 1/sqrt(2) of Phi(z) = (1 + erf(z/sqrt(2))) / 2.
    x = (x - mu) / sigma / tt.sqrt(2.0)
    y = (y - mu) / sigma / tt.sqrt(2.0)

    # To stabilize the computation, consider these three regions:
    # 1) x > y > 0 => Use erf(x) = 1 - e^{-x^2} erfcx(x) and erf(y) =1 - e^{-y^2} erfcx(y)
    # 2) 0 > x > y => Use erf(x) = e^{-x^2} erfcx(-x) and erf(y) = e^{-y^2} erfcx(-y)
    # 3) x > 0 > y => Naive formula log( (erf(x) - erf(y)) / 2 ) works fine.
    return tt.log(0.5) + tt.switch(
        tt.gt(y, 0),
        # x > y > 0
        -tt.square(y)
        + tt.log(tt.erfcx(y) - tt.exp(tt.square(y) - tt.square(x)) * tt.erfcx(x)),
        tt.switch(
            tt.lt(x, 0),
            # 0 > x > y
            -tt.square(x)
            + tt.log(tt.erfcx(-x) - tt.exp(tt.square(x) - tt.square(y)) * tt.erfcx(-y)),
            # x > 0 > y
            tt.log(tt.erf(x) - tt.erf(y)),
        ),
    )
def f(x, u, i, terminal):
    """Return the cost of state ``x`` under control ``u``.

    The cost is ``5 * termination + distance + velocity + ctrl_coef *
    control``, where ``ctrl_coef`` comes from the enclosing scope. The
    control term is zero at the terminal step. ``i`` is not used.
    """
    # Original Gym does not impose a control cost, but does clip it
    # to [-1, 1]. This non-linear dynamics is hard for iLQG to handle,
    # so add a quadratic control penalty instead.
    ctrl_cost = T.zeros_like(x[..., 0]) if terminal else T.square(u).sum(axis=-1)

    # x: (batch_size, 6), concatenation of qpos & qvel
    slider_x = x[..., 0]      # qpos[0]: x axis of the slider
    angle_1 = x[..., 1]       # qpos[1]: angle of the first hinge
    angle_2 = x[..., 2]       # qpos[2]: angle of the second hinge
    angle_total = angle_1 + angle_2

    # Distance cost: needs the Cartesian coordinates of the pole tip.
    # 0 degrees is y=1, x=0; rotates clockwise.
    link1_x, link1_y = T.sin(angle_1), T.cos(angle_1)
    link2_x, link2_y = T.sin(angle_total), T.cos(angle_total)
    tip_x = slider_x + link1_x + link2_x
    tip_y = link1_y + link2_y
    dist_cost = 0.01 * T.square(tip_x) + T.square(tip_y - 2)

    # Velocity cost
    hinge1_vel = x[..., 4]    # qvel[1]
    hinge2_vel = x[..., 5]    # qvel[2]
    vel_cost = 1e-3 * T.square(hinge1_vel) + 5e-3 * T.square(hinge2_vel)

    # TODO: termination penalty? (shouldn't change optimal policy?)
    # Penalize only the part of the tip height that lies below 1.1.
    shortfall = T.max([T.zeros_like(tip_y), 1.1 - tip_y], axis=0)
    termination_cost = T.square(shortfall)

    return (5 * termination_cost + dist_cost + vel_cost +
            ctrl_coef * ctrl_cost)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
import theano.tensor as T
import theano
from theano import function
from theano import shared

# Squared L2 norm of a double-precision vector and its symbolic gradient.
w = T.dvector("w")
w_L2_norm_2 = T.square(w.norm(L=2))
g_w_L2_norm_2 = T.grad(w_L2_norm_2, w)

print("To see the graph, difficult to see the result in math form...")
# theano.pp only returns the pretty-printed expression; print it so the graph
# is actually shown.
print(theano.pp(g_w_L2_norm_2))
def l2_normalize(x, axis):
    """Divide ``x`` by its Euclidean norm taken along ``axis``.

    The norm keeps the reduced axis (``keepdims=True``) so it broadcasts
    against ``x``. No epsilon is added: an all-zero slice is divided by 0.
    """
    sum_of_squares = T.sum(T.square(x), axis=axis, keepdims=True)
    return x / T.sqrt(sum_of_squares)
# NOTE(review): `layer` and `weights` are defined before this fragment; this
# first statement presumably belongs to a loop over the model's layers.
# Confirm its indentation against the preceding code.
weights.append(m.parameters[layer.weights])

# L2 weight decay over the first three weight matrices, divided by the
# number of rows of the input batch.
weight_decay = ((weights[0]**2).sum() + (weights[1]**2).sum() +
                (weights[2]**2).sum())
weight_decay /= m.exprs['inpt'].shape[0]

# Keep the unregularized loss available under its own key.
m.exprs['true_loss'] = m.exprs['loss']
c_wd = 0.1
m.exprs['loss'] = m.exprs['loss'] + c_wd * weight_decay

# Undo the label standardization before comparing with the targets.
output_rescaled = (m.exprs['output'] * np.std(train_labels, axis=0) +
                   np.mean(train_labels, axis=0))

mae = T.abs_(output_rescaled - m.exprs['target']).mean(axis=0)
f_mae = m.function(['inpt', 'target'], mae)

rmse = T.sqrt(T.square(output_rescaled - m.exprs['target']).mean(axis=0))
f_rmse = m.function(['inpt', 'target'], rmse)

start = time.time()

# Set up a nice printout.
keys = '#', 'seconds', 'loss', 'val loss', 'mae_train', 'rmse_train', 'mae_test', 'rmse_test'
max_len = max(len(i) for i in keys)
header = '\t'.join(i for i in keys)
print(header)
print('-' * len(header))

# Append the same header to the results file; `with` closes the file even if
# a write fails.
with open('result.txt', 'a') as results:
    results.write(header + '\n')
    results.write('-' * len(header) + '\n')

EXP_DIR = os.getcwd()
# Show the data
# plt.scatter(x_data,y_data)
# plt.show()

# Define the inputs; the "d" prefix means float64.
x = T.dmatrix('x')
y = T.dmatrix('y')

# Hidden layer: in_size=1 because x has a single feature, out_size=10 is a
# free choice.
l1 = Layer(x, 1, 10, T.nnet.relu)
# Output layer: out_size=1 because y also has a single dimension.
l2 = Layer(l1.outputs, 10, 1, None)

# Cost: mean squared error between the prediction and y.
cost = T.mean(T.square(l2.outputs - y))

# Gradients of the cost w.r.t. every weight and bias (the parameters are
# listed after the cost).
trainable = [l1.W, l1.b, l2.W, l2.b]
gW1, gb1, gW2, gb2 = gradients = T.grad(cost, trainable)

# Apply gradient descent.
learning_rate = 0.05
# x and y are inputs because evaluating the cost needs both of them to
# compute l2.outputs. Each update is a (variable, new value) pair.
train = theano.function(
    inputs=[x, y],
    outputs=cost,
    updates=[(param, param - learning_rate * grad)
             for param, grad in zip(trainable, gradients)])
for p in disc_params ] disc_avg_updates = [(a, a + 0.0001 * (p - a)) for p, a in zip(disc_params, disc_param_avg)] disc_avg_givens = [(p, a) for p, a in zip(disc_params, disc_param_avg) ] # data based initialization train_batch_disc = th.function(inputs=[x_lab, x_unl, lr], outputs=[loss_lab, loss_unl], updates=disc_param_updates) # Theano functions for training the gen net output_unl = ll.get_output(disc_layers[-3], x_unl, deterministic=False) output_gen = ll.get_output(disc_layers[-3], gen_dat, deterministic=False) m1 = T.mean(output_unl, axis=0) m2 = T.mean(output_gen, axis=0) loss_gen = T.mean(T.square(m1 - m2)) # feature matching loss gen_params = ll.get_all_params(gen_layers, trainable=True) gen_param_updates = nn.adam_updates(gen_params, loss_gen, lr=lr, mom1=0.5) train_batch_gen = th.function(inputs=[x_unl, lr], outputs=loss_gen, updates=gen_param_updates) x_temp = T.tensor4() features = ll.get_output(disc_layers[-1], x_temp, deterministic=True) generate_features = th.function(inputs=[x_temp], outputs=features) # //////////// perform training ////////////// for epoch in range(1): begin = time.time() lr = np.cast[th.config.floatX](args.learning_rate * np.minimum(3. - epoch / 400., 1.)) nr_batches_lab = int(txs.shape[0] / args.batch_size)
def compute_y(P, no_dims, max_iter):
    """Run ``max_iter`` gradient steps of a t-SNE style embedding on the GPU/CPU via Theano.

    Parameters
    ----------
    P : 2-D numpy array of pairwise affinities; ``P.shape[0]`` is the number
        of points.
    no_dims : dimensionality of the embedding.
    max_iter : number of update steps.

    Returns
    -------
    numpy array of shape ``(P.shape[0], no_dims)`` holding the embedding.

    Notes
    -----
    ``numpy.random.seed(2)`` is called on every invocation, which reseeds
    NumPy's global generator.
    """
    n, _ = P.shape
    initial_momentum = 0.5
    final_momentum = 0.8
    eta = 500
    min_gain = 0.01
    min_gain_f = tensor.cast(min_gain, FLOATX)

    # Initial embedding: standard normal samples (mean 0, variance 1).
    numpy.random.seed(2)
    Y = numpy.random.randn(n, no_dims).astype(FLOATX)
    iY = numpy.zeros((n, no_dims), dtype=FLOATX)
    gains = numpy.ones((n, no_dims), dtype=FLOATX)

    y_arg = theano.shared(Y)
    iy_arg = theano.shared(iY)
    gains_arg = theano.shared(gains)
    p_arg = theano.shared(P.astype(FLOATX))
    momentum = theano.shared(numpy.float32(initial_momentum))

    # Compute pairwise affinities: 1 / (1 + squared distance), zero diagonal.
    sum_y = tensor.sum(tensor.square(y_arg), 1)
    num = 1 / (1 + tensor.add(
        tensor.add(-2 * tensor.dot(y_arg, y_arg.T), sum_y).T, sum_y))
    num = tensor.set_subtensor(num[range(n), range(n)], 0)

    Q = num / tensor.sum(num)
    Q = tensor.maximum(Q, 1e-12)

    # Gradient of the embedding.
    PQ = p_arg - Q
    A = PQ * num
    dy_arg = (tensor.tile(tensor.sum(A, 0), (no_dims, 1)).T *
              y_arg) - tensor.dot(A.T, y_arg)
    dy_arg = tensor.cast(dy_arg, FLOATX)

    # Adaptive gains: +0.2 where the gradient and the previous step disagree
    # in sign, *0.8 where they agree, clipped from below at min_gain.
    indexsa = tensor.neq((dy_arg > 0), (iy_arg > 0)).nonzero()
    indexsb = tensor.eq((dy_arg > 0), (iy_arg > 0)).nonzero()
    resulta = tensor.set_subtensor(gains_arg[indexsa],
                                   gains_arg[indexsa] + 0.2)
    resultb = tensor.set_subtensor(resulta[indexsb], resulta[indexsb] * 0.8)

    indexs_min = (resultb < min_gain_f).nonzero()
    new_gains_arg = tensor.set_subtensor(resultb[indexs_min], min_gain_f)

    # Momentum step, then re-center the embedding on its column means.
    new_iy_arg = momentum * iy_arg - eta * (new_gains_arg * dy_arg)
    new_y_arg = y_arg + new_iy_arg
    new_y_arg = new_y_arg - tensor.tile(tensor.mean(new_y_arg, 0), (n, 1))

    compute_y_fun = theano.function(inputs=[],
                                    updates=[(y_arg, new_y_arg),
                                             (iy_arg, new_iy_arg),
                                             (gains_arg, new_gains_arg)])

    for cur_step in range(max_iter):
        if cur_step == 20:
            momentum.set_value(numpy.float32(final_momentum))
        compute_y_fun()
        if cur_step == 100:
            # NOTE(review): presumably undoes an early exaggeration (x4)
            # applied to P by the caller; confirm against the call site.
            p_arg.set_value((p_arg.get_value() / 4).astype(FLOATX))

    return y_arg.get_value()
def osl_w_brier_loss(o, f, class_weights):
    """f is the forecast and o is the original outcome.

    ``d`` is the index (along the last axis, dimension kept) of the largest
    entry of ``o * f``. The result is the mean over the last axis of
    ``(f - d)**2`` dotted with ``class_weights``.
    """
    agreement = T.mul(o, f)
    d = T.argmax(agreement, axis=-1, keepdims=True)
    squared_gap = T.square(T.sub(f, d))
    weighted = T.dot(squared_gap, class_weights)
    return T.mean(weighted, axis=-1)
def gan_unlabelled_classif(trainx, trainy, testx, testy, lab_cnt, inp_size,
                           train_ex_cnt):
    """Train a semi-supervised GAN classifier and print per-epoch statistics.

    Parameters
    ----------
    trainx, trainy : training inputs and integer labels; inputs are
        reshaped to ``(-1, inp_size)``.
    testx, testy : test inputs and integer labels.
    lab_cnt : number of classes (size of the classifier output layer).
    inp_size : flattened input dimensionality.
    train_ex_cnt : expected number of training rows (asserted).

    Hyper-parameters (seeds, batch size, labeled examples per class, epoch
    limit, unlabeled loss weight) are read from the command line through
    ``argparse``. Nothing is returned; losses and error rates are printed
    once per epoch.
    """
    trainy = trainy.astype(np.int32)
    testy = testy.astype(np.int32)
    trainx = trainx.reshape((-1, inp_size)).astype(th.config.floatX)
    testx = testx.reshape((-1, inp_size)).astype(th.config.floatX)
    assert train_ex_cnt == trainx.shape[0]

    # settings
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--seed_data', type=int, default=1)
    parser.add_argument('--unlabeled_weight', type=float, default=1.)
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--count', type=int, default=10)
    parser.add_argument('--iter_limit', type=int, default=300)
    args = parser.parse_args()
    print(args)

    # fixed random seeds
    rng = np.random.RandomState(args.seed)
    theano_rng = MRG_RandomStreams(rng.randint(2**15))
    lasagne.random.set_rng(np.random.RandomState(rng.randint(2**15)))
    data_rng = np.random.RandomState(args.seed_data)

    # Two independent copies of the full training set serve as unlabeled data
    # for the discriminator and the generator respectively.
    trainx_unl = trainx.copy()
    trainx_unl2 = trainx.copy()
    nr_batches_train = int(trainx.shape[0] / args.batch_size)
    nr_batches_test = int(testx.shape[0] / args.batch_size)

    # select labeled data: up to `args.count` examples per selected class
    inds = data_rng.permutation(trainx.shape[0])
    trainx = trainx[inds]
    trainy = trainy[inds]
    txs = []
    tys = []
    for _j in range(10):
        j = _j % lab_cnt
        txs.append(trainx[trainy == j][:args.count])
        tys.append(trainy[trainy == j][:args.count])
    txs = np.concatenate(txs, axis=0)
    tys = np.concatenate(tys, axis=0)

    # specify generative model
    noise = theano_rng.uniform(size=(args.batch_size, 100))
    gen_layers = [LL.InputLayer(shape=(args.batch_size, 100), input_var=noise)]
    gen_layers.append(
        nn.batch_norm(LL.DenseLayer(gen_layers[-1],
                                    num_units=500,
                                    nonlinearity=T.nnet.softplus),
                      g=None))
    gen_layers.append(
        nn.batch_norm(LL.DenseLayer(gen_layers[-1],
                                    num_units=500,
                                    nonlinearity=T.nnet.softplus),
                      g=None))
    gen_layers.append(
        nn.l2normalize(
            LL.DenseLayer(gen_layers[-1],
                          num_units=inp_size,
                          nonlinearity=T.nnet.sigmoid)))
    gen_dat = LL.get_output(gen_layers[-1], deterministic=False)

    # specify supervised model
    layers = [LL.InputLayer(shape=(None, inp_size))]
    layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.3))
    layers.append(nn.DenseLayer(layers[-1], num_units=1000))
    layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
    layers.append(nn.DenseLayer(layers[-1], num_units=500))
    layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
    layers.append(nn.DenseLayer(layers[-1], num_units=250))
    layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
    layers.append(nn.DenseLayer(layers[-1], num_units=250))
    layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
    layers.append(nn.DenseLayer(layers[-1], num_units=250))
    layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
    layers.append(
        nn.DenseLayer(layers[-1],
                      num_units=lab_cnt,
                      nonlinearity=None,
                      train_scale=True))

    # costs
    labels = T.ivector()
    x_lab = T.matrix()
    x_unl = T.matrix()

    # The results of these two calls are not needed; they are made with
    # init=True before the layers' `init_updates` are collected below.
    LL.get_output(gen_layers[-1], init=True)
    LL.get_output(layers[-1], x_lab, deterministic=False, init=True)
    init_updates = [
        u for l in gen_layers + layers for u in getattr(l, 'init_updates', [])
    ]

    output_before_softmax_lab = LL.get_output(layers[-1],
                                              x_lab,
                                              deterministic=False)
    output_before_softmax_unl = LL.get_output(layers[-1],
                                              x_unl,
                                              deterministic=False)
    output_before_softmax_fake = LL.get_output(layers[-1],
                                               gen_dat,
                                               deterministic=False)

    z_exp_lab = T.mean(nn.log_sum_exp(output_before_softmax_lab))
    z_exp_unl = T.mean(nn.log_sum_exp(output_before_softmax_unl))
    z_exp_fake = T.mean(nn.log_sum_exp(output_before_softmax_fake))
    l_lab = output_before_softmax_lab[T.arange(args.batch_size), labels]
    l_unl = nn.log_sum_exp(output_before_softmax_unl)
    loss_lab = -T.mean(l_lab) + T.mean(z_exp_lab)
    loss_unl = -0.5 * T.mean(l_unl) + 0.5 * T.mean(
        T.nnet.softplus(
            nn.log_sum_exp(output_before_softmax_unl))) + 0.5 * T.mean(
                T.nnet.softplus(nn.log_sum_exp(output_before_softmax_fake)))

    train_err = T.mean(
        T.neq(T.argmax(output_before_softmax_lab, axis=1), labels))

    # Feature matching: mean activations of layers[-3] on generated vs.
    # unlabeled data.
    mom_gen = T.mean(LL.get_output(layers[-3], gen_dat), axis=0)
    mom_real = T.mean(LL.get_output(layers[-3], x_unl), axis=0)
    loss_gen = T.mean(T.square(mom_gen - mom_real))

    # test error
    output_before_softmax = LL.get_output(layers[-1],
                                          x_lab,
                                          deterministic=True)
    test_err = T.mean(T.neq(T.argmax(output_before_softmax, axis=1), labels))

    # Theano functions for training and testing
    lr = T.scalar()
    disc_params = LL.get_all_params(layers, trainable=True)
    disc_param_updates = nn.adam_updates(disc_params,
                                         loss_lab +
                                         args.unlabeled_weight * loss_unl,
                                         lr=lr,
                                         mom1=0.5)
    # Slow running average of the discriminator parameters, substituted for
    # the live parameters at test time.
    disc_param_avg = [
        th.shared(np.cast[th.config.floatX](0. * p.get_value()))
        for p in disc_params
    ]
    disc_avg_updates = [(a, a + 0.0001 * (p - a))
                        for p, a in zip(disc_params, disc_param_avg)]
    disc_avg_givens = [(p, a) for p, a in zip(disc_params, disc_param_avg)]
    gen_params = LL.get_all_params(gen_layers[-1], trainable=True)
    gen_param_updates = nn.adam_updates(gen_params, loss_gen, lr=lr, mom1=0.5)
    init_param = th.function(inputs=[x_lab],
                             outputs=None,
                             updates=init_updates)
    train_batch_disc = th.function(inputs=[x_lab, labels, x_unl, lr],
                                   outputs=[loss_lab, loss_unl, train_err],
                                   updates=disc_param_updates +
                                   disc_avg_updates)
    train_batch_gen = th.function(inputs=[x_unl, lr],
                                  outputs=[loss_gen],
                                  updates=gen_param_updates)
    test_batch = th.function(inputs=[x_lab, labels],
                             outputs=test_err,
                             givens=disc_avg_givens)

    init_param(trainx[:500])  # data dependent initialization

    # //////////// perform training //////////////
    # From here on the symbolic names (lr, loss_lab, loss_unl, train_err,
    # test_err) are reused for plain Python numbers.
    lr = 0.003
    for epoch in range(args.iter_limit):
        begin = time.time()

        # construct randomly permuted minibatches
        trainx = []
        trainy = []
        # Floor division: range() needs an int (true division yields a float
        # on Python 3).
        for _ in range(trainx_unl.shape[0] // txs.shape[0]):
            inds = rng.permutation(txs.shape[0])
            trainx.append(txs[inds])
            trainy.append(tys[inds])
        trainx = np.concatenate(trainx, axis=0)
        trainy = np.concatenate(trainy, axis=0)
        trainx_unl = trainx_unl[rng.permutation(trainx_unl.shape[0])]
        trainx_unl2 = trainx_unl2[rng.permutation(trainx_unl2.shape[0])]

        # train
        loss_lab = 0.
        loss_unl = 0.
        train_err = 0.
        for t in range(nr_batches_train):
            batch = slice(t * args.batch_size, (t + 1) * args.batch_size)
            ll, lu, te = train_batch_disc(trainx[batch], trainy[batch],
                                          trainx_unl[batch], lr)
            loss_lab += ll
            loss_unl += lu
            train_err += te
            train_batch_gen(trainx_unl2[batch], lr)
        loss_lab /= nr_batches_train
        loss_unl /= nr_batches_train
        train_err /= nr_batches_train

        # test
        test_err = 0.
        for t in range(nr_batches_test):
            batch = slice(t * args.batch_size, (t + 1) * args.batch_size)
            test_err += test_batch(testx[batch], testy[batch])
        test_err /= nr_batches_test

        # report
        print(
            "Iteration %d, time = %ds, loss_lab = %.4f, loss_unl = %.4f, train err = %.4f, test err = %.4f"
            % (epoch, time.time() - begin, loss_lab, loss_unl, train_err,
               test_err))
        sys.stdout.flush()