def e_step(self, n_steps=100, eps=1e-5):
    """
    Performs up to `n_steps` of mean-field inference (used to compute
    positive phase statistics). The state of each DBM layer is read from
    `self.psamples`, with psamples[0] pointing to self.input.

    :param n_steps: maximum number of mean-field iterations to perform.
    :param eps: convergence threshold; iteration stops early once the
        largest mean absolute change across layers falls below it.
    """
    new_psamples = [T.unbroadcast(T.shape_padleft(psample))
                    for psample in self.psamples]

    # now alternate mean-field inference for even/odd layers
    def mf_iteration(*psamples):
        new_psamples = [p for p in psamples]
        for i in xrange(1, self.depth, 2):
            new_psamples[i] = self.hi_given(psamples, i)
        for i in xrange(2, self.depth, 2):
            new_psamples[i] = self.hi_given(psamples, i)

        score = 0.
        for i in xrange(1, self.depth):
            score = T.maximum(T.mean(abs(new_psamples[i] - psamples[i])), score)

        return new_psamples, theano.scan_module.until(score < eps)

    new_psamples, updates = scan(
        mf_iteration,
        states=new_psamples,
        n_steps=n_steps)

    return [x[0] for x in new_psamples]
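# Illustrative only: a minimal NumPy sketch of the same alternating
# mean-field fixed point for a 3-layer DBM (v, h1, h2). The weight/bias
# names (W1, W2, b1, b2) are hypothetical; the real e_step builds this loop
# symbolically through `scan` and exits early via `scan_module.until`.
import numpy as np

def sigmoid(x):
    return 1. / (1. + np.exp(-x))

def mean_field(v, W1, W2, b1, b2, n_steps=100, eps=1e-5):
    h1 = np.zeros((v.shape[0], W1.shape[1]))
    h2 = np.zeros((v.shape[0], W2.shape[1]))
    for _ in range(n_steps):
        # odd layer first, conditioned on both of its neighbours ...
        new_h1 = sigmoid(v.dot(W1) + h2.dot(W2.T) + b1)
        # ... then the even layer, using the freshly updated odd layer
        new_h2 = sigmoid(new_h1.dot(W2) + b2)
        score = max(np.mean(abs(new_h1 - h1)), np.mean(abs(new_h2 - h2)))
        h1, h2 = new_h1, new_h2
        if score < eps:   # same stopping criterion as the `until` above
            break
    return h1, h2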
def pos_sampling(self, n_steps=50):
    """
    Performs `n_steps` of alternating Gibbs sampling in the positive phase
    (used to compute positive phase statistics). The state of each DBM
    layer is read from `self.psamples`, with psamples[0] pointing to
    self.input.

    :param n_steps: number of sampling iterations to perform.
    """
    new_psamples = [T.unbroadcast(T.shape_padleft(psample))
                    for psample in self.psamples]

    # now alternate sampling for even/odd layers
    def sample_iteration(*psamples):
        new_psamples = [p for p in psamples]
        for i in xrange(1, self.depth, 2):
            new_psamples[i] = self.sample_hi_given(psamples, i)
        for i in xrange(2, self.depth, 2):
            new_psamples[i] = self.sample_hi_given(psamples, i)
        return new_psamples

    new_psamples, updates = scan(
        sample_iteration,
        states=new_psamples,
        n_steps=n_steps)

    return [x[0] for x in new_psamples]
def _e_step(psamples, W_list, b_list, n_steps=100, eps=1e-5):
    """
    Performs up to `n_steps` of mean-field inference (used to compute
    positive phase statistics).

    Parameters
    ----------
    psamples : array-like object of theano shared variables
        State of each layer of the DBM (during the inference process).
        psamples[0] points to the input.
    W_list : list of theano shared variables
        Weight matrices connecting consecutive layers.
    b_list : list of theano shared variables
        Bias vectors of each layer.
    n_steps : integer
        Maximum number of iterations of mean-field to perform.
    eps : float
        Convergence threshold on the largest mean absolute change across
        layers; iteration stops early once it is reached.
    """
    depth = len(psamples)

    new_psamples = [T.unbroadcast(T.shape_padleft(psample))
                    for psample in psamples]

    # now alternate mean-field inference for even/odd layers
    def mf_iteration(*psamples):
        new_psamples = [p for p in psamples]
        for i in xrange(1, depth, 2):
            new_psamples[i] = hi_given(psamples, i, W_list, b_list)
        for i in xrange(2, depth, 2):
            new_psamples[i] = hi_given(psamples, i, W_list, b_list)

        score = 0.0
        for i in xrange(1, depth):
            score = T.maximum(T.mean(abs(new_psamples[i] - psamples[i])), score)

        return new_psamples, theano.scan_module.until(score < eps)

    new_psamples, updates = scan(
        mf_iteration,
        states=new_psamples,
        n_steps=n_steps)

    return [x[0] for x in new_psamples]
def compute_Gv(*args):
    idx0 = const([0])
    ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

    def Gv_step(*gv_args):
        idx = TT.cast(gv_args[0], 'int32')
        nw_inps = [x[idx * options['cbs']:(idx + 1) * options['cbs']]
                   for x in loc_inputs]
        replace = dict(zip(model.inputs, nw_inps))
        nw_cost, nw_preactiv_out = safe_clone(
            [model.train_cost, model.preactiv_out], replace)
        nw_gvs = TT.Lop(nw_preactiv_out, model.params,
                        TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                               model.params, args))
        Gvs = [ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
        return [gv_args[0] + const(1)] + Gvs

    states = [idx0] + ep
    n_steps = options['mbs'] // options['cbs']
    rvals, updates = scan(Gv_step,
                          states=states,
                          n_steps=n_steps,
                          mode=theano.Mode(linker='cvm'),
                          name='Gv_step',
                          profile=options['profile'])
    final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
    return final_Gvs, updates
def pos_phase(self, v, init_state, n_steps=1, eps=1e-3):
    """
    Mixed mean-field + sampling inference in positive phase.

    :param v: input being conditioned on
    :param init_state: dictionary of initial values
    :param n_steps: number of Gibbs updates to perform afterwards.
    """
    def pos_mf_iteration(g1, h1, v, pos_counter):
        h2 = self.h_hat(g1, v)
        s2_1 = self.s1_hat(g1, v)
        s2_0 = self.s0_hat(g1, v)
        g2 = self.g_hat(h2, s2_1, s2_0)
        # stopping criterion
        dl_dghat = T.max(abs(self.dlbound_dg(g2, h2, s2_1, s2_0, v)))
        dl_dhhat = T.max(abs(self.dlbound_dh(g2, h2, s2_1, s2_0, v)))
        stop = T.maximum(dl_dghat, dl_dhhat)
        return [g2, h2, s2_1, s2_0, v, pos_counter + 1], \
               theano.scan_module.until(stop < eps)

    states = [T.unbroadcast(T.shape_padleft(init_state['g'])),
              T.unbroadcast(T.shape_padleft(init_state['h'])),
              {'steps': 1},
              {'steps': 1},
              T.unbroadcast(T.shape_padleft(v)),
              T.unbroadcast(T.shape_padleft(0.))]

    rvals, updates = scan(
        pos_mf_iteration,
        states=states,
        n_steps=n_steps)

    return [rval[0] for rval in rvals]
def scalar_armijo_search(phi, phi0, derphi0, c1=constant(1e-4),
                         n_iters=10, profile=0):
    """
    Backtracking line search for a step size `alpha` satisfying the Armijo
    sufficient-decrease condition

        phi(alpha) <= phi(0) + c1 * alpha * phi'(0),

    starting from a unit step and refining it first by quadratic, then by
    cubic, interpolation of `phi`.
    """
    alpha0 = one
    phi_a0 = phi(alpha0)
    alpha1 = -(derphi0) * alpha0 ** 2 / 2.0 / \
        (phi_a0 - phi0 - derphi0 * alpha0)
    phi_a1 = phi(alpha1)

    csol1 = phi_a0 <= phi0 + c1 * derphi0
    csol2 = phi_a1 <= phi0 + c1 * alpha1 * derphi0

    def armijo(alpha0, alpha1, phi_a0, phi_a1):
        factor = alpha0 ** 2 * alpha1 ** 2 * (alpha1 - alpha0)
        a = alpha0 ** 2 * (phi_a1 - phi0 - derphi0 * alpha1) - \
            alpha1 ** 2 * (phi_a0 - phi0 - derphi0 * alpha0)
        a = a / factor
        b = -alpha0 ** 3 * (phi_a1 - phi0 - derphi0 * alpha1) + \
            alpha1 ** 3 * (phi_a0 - phi0 - derphi0 * alpha0)
        b = b / factor

        alpha2 = (-b + TT.sqrt(abs(b ** 2 - 3 * a * derphi0))) / (3.0 * a)
        phi_a2 = phi(alpha2)

        end_condition = phi_a2 <= phi0 + c1 * alpha2 * derphi0
        end_condition = TT.bitwise_or(TT.isnan(alpha2), end_condition)
        end_condition = TT.bitwise_or(TT.isinf(alpha2), end_condition)
        alpha2 = TT.switch(
            TT.bitwise_or(alpha1 - alpha2 > alpha1 / constant(2.),
                          one - alpha2 / alpha1 < 0.96),
            alpha1 / constant(2.),
            alpha2)

        return [alpha1, alpha2, phi_a1, phi_a2], \
               theano.scan_module.until(end_condition)

    states = []
    states += [TT.unbroadcast(TT.shape_padleft(alpha0), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(alpha1), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a0), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a1), 0)]
    # print 'armijo'
    rvals, _ = scan(armijo,
                    states=states,
                    n_steps=n_iters,
                    name='armijo',
                    mode=theano.Mode(linker='cvm'),
                    profile=profile)

    sol_scan = rvals[1][0]
    a_opt = ifelse(csol1, one,
                   ifelse(csol2, alpha1, sol_scan))
    score = ifelse(csol1, phi_a0,
                   ifelse(csol2, phi_a1, rvals[2][0]))
    return a_opt, score
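# Illustrative only: a plain NumPy backtracking search enforcing the same
# Armijo sufficient-decrease condition, using simple step-halving instead of
# the quadratic/cubic interpolation above. All names here are hypothetical.
import numpy as np

def armijo_backtrack(f, grad_f, x, direction, c1=1e-4, max_iters=10):
    phi0 = f(x)
    derphi0 = np.dot(grad_f(x), direction)    # directional derivative at 0
    alpha = 1.0
    for _ in range(max_iters):
        if f(x + alpha * direction) <= phi0 + c1 * alpha * derphi0:
            return alpha                      # sufficient decrease reached
        alpha *= 0.5                          # otherwise shrink the step
    return alpha

# e.g. minimising f(x) = x^2 from x = 3 along the steepest-descent direction
f = lambda x: float(x ** 2)
g = lambda x: 2.0 * x
step = armijo_backtrack(f, g, 3.0, direction=-g(3.0))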
def test_005():
    sq = theano.tensor.fvector('sq')
    nst = theano.tensor.iscalar('nst')
    out, _ = scan.scan(lambda s: s + numpy.float32(1),
                       sequences=sq,
                       states=[None],
                       n_steps=nst)
    fn = theano.function([sq, nst], out)
    val_sq = numpy.float32([1, 2, 3, 4, 5])
    assert numpy.all(fn(val_sq, 5) == val_sq + 1)
def test_001():
    x0 = theano.tensor.fvector('x0')
    state = theano.tensor.unbroadcast(
        theano.tensor.shape_padleft(x0), 0)
    out, _ = scan.scan(lambda x: x + numpy.float32(1),
                       states=state,
                       n_steps=5)
    fn = theano.function([x0], out[0])
    val_x0 = numpy.float32([1, 2, 3])
    assert numpy.all(fn(val_x0) == val_x0 + 5)
def compute_Gv(*args):
    cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                         name='cgv%d' % idx)
           for idx, shp in enumerate(model.params_shape)]
    print_mem('allocated mem for cgv')
    idx0 = const([0])
    ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

    def Gv_step(*gv_args):
        idx = TT.cast(gv_args[0], 'int32')
        nw_inps = [x[idx * options['cbs']:(idx + 1) * options['cbs']]
                   for x in loc_inputs]
        replace = dict(zip(model.inputs, nw_inps))
        nw_cost, nw_preactiv_out = safe_clone(
            [model.train_cost, model.preactiv_out], replace)
        nw_gvs = TT.Lop(nw_preactiv_out, model.params,
                        TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                               model.params, cgv))
        Gvs = [ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
        return [gv_args[0] + const(1)] + Gvs

    states = [idx0] + ep
    n_steps = options['mbs'] // options['cbs']
    rvals, updates = scan(Gv_step,
                          states=states,
                          n_steps=n_steps,
                          mode=gpu_mode,
                          name='Gv_step',
                          profile=options['profile'])
    final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps)
                 for x in rvals[1:]]
    grad_inps = zip(loc_inputs, shared_data)
    loc_fn = theano.function([], final_Gvs,
                             updates=updates,
                             givens=dict(grad_inps),
                             on_unused_input='warn',
                             mode=gpu_mode,
                             name='loc_fn',
                             profile=options['profile'])
    fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))
    return fake_op(*args), {}
def linear_cg_fletcher_reeves(compute_Ax, bs, xinit=None,
                              rtol=1e-6, maxiter=1000, damp=0,
                              floatX=None, profile=0):
    """
    assume all are lists all the time

    Reference:
        http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(bs)

    def loop(rz_old, *args):
        ps = args[:n_params]
        rs = args[n_params:2 * n_params]
        xs = args[2 * n_params:]
        _Aps = compute_Ax(*ps)
        Aps = [x + damp * y for x, y in zip(_Aps, ps)]
        alpha = rz_old / sum((x * y).sum() for x, y in zip(Aps, ps))
        xs = [x + alpha * p for x, p in zip(xs, ps)]
        rs = [r - alpha * Ap for r, Ap, in zip(rs, Aps)]
        rz_new = sum((r * r).sum() for r in rs)
        ps = [r + rz_new / rz_old * p for r, p in zip(rs, ps)]
        return [rz_new] + ps + rs + xs, \
               theano.scan_module.until(abs(rz_new) < rtol)

    if xinit is None:
        r0s = bs
        _x0s = [tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(x)))
                for x in bs]
    else:
        init_Ax = compute_Ax(*xinit)
        r0s = [bs[i] - init_Ax[i] for i in xrange(len(bs))]
        _x0s = [tensor.unbroadcast(tensor.shape_padleft(xi)) for xi in xinit]

    _p0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    _r0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    rz_old = sum((r * r).sum() for r in r0s)
    _rz_old = tensor.unbroadcast(tensor.shape_padleft(rz_old), 0)

    outs, updates = scan(loop,
                         states=[_rz_old] + _p0s + _r0s + _x0s,
                         n_steps=maxiter,
                         mode=theano.Mode(linker='cvm'),
                         name='linear_conjugate_gradient',
                         profile=profile)
    fxs = outs[1 + 2 * n_params:]
    return [x[0] for x in fxs]
def test_002():
    x0 = theano.tensor.fvector('x0')
    state = theano.tensor.alloc(
        theano.tensor.constant(numpy.float32(0)),
        6,
        x0.shape[0])
    state = theano.tensor.set_subtensor(state[0], x0)
    out, _ = scan.scan(lambda x: x + numpy.float32(1),
                       states=state,
                       n_steps=5)
    fn = theano.function([x0], out)
    val_x0 = numpy.float32([1, 2, 3])
    assert numpy.all(fn(val_x0)[-1] == val_x0 + 5)
    assert numpy.all(fn(val_x0)[0] == val_x0)
def compute_Gv(*args):
    idx0 = const([0])
    ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

    def Gv_step(*gv_args):
        idx = TT.cast(gv_args[0], 'int32')
        nw_inps = [x[idx * options['cbs']:(idx + 1) * options['cbs']]
                   for x in loc_inputs]
        replace = dict(zip(model.inputs, nw_inps))
        nw_outs = safe_clone(model.outs, replace)
        final_results = dict(zip(model.params, [None] * len(model.params)))
        for nw_out, out_operator in zip(nw_outs, model.outs_operator):
            loc_params = [x for x in model.params
                          if x in theano.gof.graph.inputs([nw_out])]
            loc_args = [x for x, y in zip(args, model.params)
                        if y in theano.gof.graph.inputs([nw_out])]
            if out_operator == 'softmax':
                factor = const(options['cbs']) * nw_out
            elif out_operator == 'sigmoid':
                factor = const(options['cbs']) * nw_out * (1 - nw_out)
            else:
                factor = const(options['cbs'])
            loc_Gvs = TT.Lop(nw_out, loc_params,
                             TT.Rop(nw_out, loc_params, loc_args) / factor)
            for lp, lgv in zip(loc_params, loc_Gvs):
                if final_results[lp] is None:
                    final_results[lp] = lgv
                else:
                    final_results[lp] += lgv
        Gvs = [ogv + final_results[param]
               for (ogv, param) in zip(gv_args[1:], model.params)]
        return [gv_args[0] + const(1)] + Gvs

    states = [idx0] + ep
    n_steps = options['mbs'] // options['cbs']
    rvals, updates = scan(Gv_step,
                          states=states,
                          n_steps=n_steps,
                          mode=theano.Mode(linker='cvm'),
                          name='Gv_step',
                          profile=options['profile'])
    final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
    return final_Gvs, updates
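# A hedged reading of the Gv_step above (this interpretation is ours, not a
# comment from the original author): with N = options['mbs'] examples per
# minibatch, o_n the (softmax or sigmoid) output for example n and
# J_n = d o_n / d theta, the nested Rop/Lop calls accumulate
#
#     G v  =  (1 / N) * sum_n  J_n^T  D_n^{-1}  J_n  v,
#
# where D_n = diag(p_n) for softmax outputs and diag(p_n * (1 - p_n)) for
# sigmoid outputs -- exactly the `factor` term. This is the Gauss-Newton /
# Fisher-style metric-vector product used in natural-gradient and
# Hessian-free training, evaluated one `cbs`-sized chunk at a time by the
# surrounding scan.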
def linear_cg_precond(compute_Gv, bs, Msz, rtol=1e-16, maxit=100000, floatX=None): """ assume all are lists all the time Reference: http://en.wikipedia.org/wiki/Conjugate_gradient_method """ n_params = len(bs) def loop(rsold, *args): ps = args[:n_params] rs = args[n_params:2 * n_params] xs = args[2 * n_params:] Aps = compute_Gv(*ps) alpha = rsold / sum((x * y).sum() for x, y in zip(Aps, ps)) xs = [x + alpha * p for x, p in zip(xs, ps)] rs = [r - alpha * Ap for r, Ap, in zip(rs, Aps)] zs = [r / z for r, z in zip(rs, Msz)] rsnew = sum((r * z).sum() for r, z in zip(rs, zs)) ps = [z + rsnew / rsold * p for z, p in zip(zs, ps)] return [rsnew] + ps + rs + xs, theano.scan_module.until(abs(rsnew) < rtol) r0s = bs _p0s = [ tensor.unbroadcast(tensor.shape_padleft(x / z), 0) for x, z in zip(r0s, Msz) ] _r0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s] _x0s = [ tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(x)), 0) for x in bs ] rsold = sum((r * r / z).sum() for r, z in zip(r0s, Msz)) _rsold = tensor.unbroadcast(tensor.shape_padleft(rsold), 0) outs, updates = scan(loop, states=[_rsold] + _p0s + _r0s + _x0s, n_steps=maxit, mode=theano.Mode(linker='c|py'), name='linear_conjugate_gradient', profile=0) fxs = outs[1 + 2 * n_params:] return [x[0] for x in fxs]
def test_003():
    x0 = theano.tensor.fvector('x0')
    sq = theano.tensor.fvector('sq')
    state = theano.tensor.alloc(
        theano.tensor.constant(numpy.float32(0)),
        6,
        x0.shape[0])
    state = theano.tensor.set_subtensor(state[0], x0)
    out, _ = scan.scan(lambda s, x: x + s,
                       sequences=sq,
                       states=state,
                       n_steps=5)
    fn = theano.function([sq, x0], out)
    val_x0 = numpy.float32([1, 2, 3])
    val_sq = numpy.float32([1, 2, 3, 4, 5])
    assert numpy.all(fn(val_sq, val_x0)[-1] == val_x0 + 15)
    assert numpy.all(fn(val_sq, val_x0)[0] == val_x0)
def jobman(state, channel): # load dataset state['null_sym_source'] = 15000 state['null_sym_target'] = 15000 state['n_sym_source'] = state['null_sym_source'] + 1 state['n_sym_target'] = state['null_sym_target'] + 1 state['nouts'] = state['n_sym_target'] state['nins'] = state['n_sym_source'] rng = numpy.random.RandomState(state['seed']) if state['loopIters'] > 0: train_data, valid_data, test_data = get_data(state) else: train_data = None valid_data = None test_data = None ########### Training graph ##################### ## 1. Inputs if state['bs'] == 1: x = TT.lvector('x') x_mask = TT.vector('x_mask') y = TT.lvector('y') y0 = y y_mask = TT.vector('y_mask') else: x = TT.lmatrix('x') x_mask = TT.matrix('x_mask') y = TT.lmatrix('y') y0 = y y_mask = TT.matrix('y_mask') # 2. Layers and Operators bs = state['bs'] embdim = state['dim_mlp'] # Source Sentence emb = MultiLayer( rng, n_in=state['nins'], n_hids=[state['rank_n_approx']], activation=[state['rank_n_activ']], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb') emb_words = [] if state['rec_gating']: gater_words = [] if state['rec_reseting']: reseter_words = [] for si in xrange(state['encoder_stack']): emb_words.append(MultiLayer( rng, n_in=state['rank_n_approx'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_words_%d'%si)) if state['rec_gating']: gater_words.append(MultiLayer( rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias = False, name='gater_words_%d'%si)) if state['rec_reseting']: reseter_words.append(MultiLayer( rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias = False, name='reseter_words_%d'%si)) add_rec_step = [] rec_proj = [] if state['rec_gating']: rec_proj_gater = [] if state['rec_reseting']: rec_proj_reseter = [] for si in xrange(state['encoder_stack']): if si > 0: rec_proj.append(MultiLayer( rng, n_in=state['dim'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise'], scale=state['rec_weight_scale'], name='rec_proj_%d'%si)) if state['rec_gating']: rec_proj_gater.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias = False, name='rec_proj_gater_%d'%si)) if state['rec_reseting']: rec_proj_reseter.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias = False, name='rec_proj_reseter_%d'%si)) add_rec_step.append(eval(state['rec_layer'])( rng, n_hids=state['dim'], activation = state['activ'], bias_scale = state['bias'], scale=state['rec_weight_scale'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise_rec'], dropout=state['dropout_rec'], gating=state['rec_gating'], gater_activation=state['rec_gater'], reseting=state['rec_reseting'], reseter_activation=state['rec_reseter'], name='add_h_%d'%si)) def _add_op(words_embeddings, words_mask=None, prev_val=None, si = 0, state_below = None, gater_below = None, reseter_below = 
None, one_step=False, bs=1, init_state=None, use_noise=True): seqlen = words_embeddings.out.shape[0]//bs rval = words_embeddings gater = None reseter = None if state['rec_gating']: gater = gater_below if state['rec_reseting']: reseter = reseter_below if si > 0: rval += rec_proj[si-1](state_below, one_step=one_step, use_noise=use_noise) if state['rec_gating']: projg = rec_proj_gater[si-1](state_below, one_step=one_step, use_noise = use_noise) if gater: gater += projg else: gater = projg if state['rec_reseting']: projg = rec_proj_reseter[si-1](state_below, one_step=one_step, use_noise = use_noise) if reseter: reseter += projg else: reseter = projg if not one_step: rval= add_rec_step[si]( rval, nsteps=seqlen, batch_size=bs, mask=words_mask, gater_below = gater, reseter_below = reseter, one_step=one_step, init_state=init_state, use_noise = use_noise) else: rval= add_rec_step[si]( rval, mask=words_mask, state_before=prev_val, gater_below = gater, reseter_below = reseter, one_step=one_step, init_state=init_state, use_noise = use_noise) return rval add_op = Operator(_add_op) # Target Sentence emb_t = MultiLayer( rng, n_in=state['nouts'], n_hids=[state['rank_n_approx']], activation=[state['rank_n_activ']], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_t') emb_words_t = [] if state['rec_gating']: gater_words_t = [] if state['rec_reseting']: reseter_words_t = [] for si in xrange(state['decoder_stack']): emb_words_t.append(MultiLayer( rng, n_in=state['rank_n_approx'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_words_t_%d'%si)) if state['rec_gating']: gater_words_t.append(MultiLayer( rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='gater_words_t_%d'%si)) if state['rec_reseting']: reseter_words_t.append(MultiLayer( rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='reseter_words_t_%d'%si)) proj_everything_t = [] if state['rec_gating']: gater_everything_t = [] if state['rec_reseting']: reseter_everything_t = [] for si in xrange(state['decoder_stack']): proj_everything_t.append(MultiLayer( rng, n_in=state['dim'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='proj_everything_t_%d'%si, learn_bias = False)) if state['rec_gating']: gater_everything_t.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='gater_everything_t_%d'%si, learn_bias = False)) if state['rec_reseting']: reseter_everything_t.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='reseter_everything_t_%d'%si, learn_bias = False)) add_rec_step_t = [] rec_proj_t = [] if state['rec_gating']: rec_proj_t_gater = [] if state['rec_reseting']: rec_proj_t_reseter = [] for si in xrange(state['decoder_stack']): if si > 0: rec_proj_t.append(MultiLayer( rng, n_in=state['dim'], n_hids=[embdim], 
activation=['lambda x:x'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise'], scale=state['rec_weight_scale'], name='rec_proj_%d'%si)) if state['rec_gating']: rec_proj_t_gater.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_t_gater_%d'%si)) if state['rec_reseting']: rec_proj_t_reseter.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_t_reseter_%d'%si)) add_rec_step_t.append(eval(state['rec_layer'])( rng, n_hids=state['dim'], activation = state['activ'], bias_scale = state['bias'], scale=state['rec_weight_scale'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise_rec'], dropout=state['dropout_rec'], gating=state['rec_gating'], gater_activation=state['rec_gater'], reseting=state['rec_reseting'], reseter_activation=state['rec_reseter'], name='add_h_t_%d'%si)) if state['encoder_stack'] > 1: encoder_proj = [] for si in xrange(state['encoder_stack']): encoder_proj.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim'] * state['maxout_part']], activation=['lambda x: x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='encoder_proj_%d'%si, learn_bias = (si == 0))) encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']), indim = indim, pieces = pieces, rng=rng) def _add_t_op(words_embeddings, everything = None, words_mask=None, prev_val=None,one_step=False, bs=1, init_state=None, use_noise=True, gater_below = None, reseter_below = None, si = 0, state_below = None): seqlen = words_embeddings.out.shape[0]//bs rval = words_embeddings gater = None if state['rec_gating']: gater = gater_below reseter = None if state['rec_reseting']: reseter = reseter_below if si > 0: if isinstance(state_below, list): state_below = state_below[-1] rval += rec_proj_t[si-1](state_below, one_step=one_step, use_noise=use_noise) if state['rec_gating']: projg = rec_proj_t_gater[si-1](state_below, one_step=one_step, use_noise = use_noise) if gater: gater += projg else: gater = projg if state['rec_reseting']: projg = rec_proj_t_reseter[si-1](state_below, one_step=one_step, use_noise = use_noise) if reseter: reseter += projg else: reseter = projg if everything: rval = rval + proj_everything_t[si](everything) if state['rec_gating']: everyg = gater_everything_t[si](everything, one_step=one_step, use_noise=use_noise) if gater: gater += everyg else: gater = everyg if state['rec_reseting']: everyg = reseter_everything_t[si](everything, one_step=one_step, use_noise=use_noise) if reseter: reseter += everyg else: reseter = everyg if not one_step: rval = add_rec_step_t[si]( rval, nsteps=seqlen, batch_size=bs, mask=words_mask, one_step=one_step, init_state=init_state, gater_below = gater, reseter_below = reseter, use_noise = use_noise) else: rval = add_rec_step_t[si]( rval, mask=words_mask, state_before=prev_val, one_step=one_step, gater_below = gater, reseter_below = reseter, use_noise = use_noise) return rval add_t_op = Operator(_add_t_op) outdim = state['dim_mlp'] if not state['deep_out']: outdim = state['rank_n_approx'] if state['bias_code']: bias_code = [] for si in xrange(state['decoder_stack']): bias_code.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim']], 
activation = [state['activ']], bias_scale = [state['bias']], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], name='bias_code_%d'%si)) if state['avg_word']: word_code_nin = state['rank_n_approx'] word_code = MultiLayer( rng, n_in=word_code_nin, n_hids=[outdim], activation = 'lambda x:x', bias_scale = [state['bias_mlp']/3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], learn_bias = False, name='word_code') proj_code = MultiLayer( rng, n_in=state['dim'], n_hids=[outdim], activation = 'lambda x: x', bias_scale = [state['bias_mlp']/3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], learn_bias = False, name='proj_code') proj_h = [] for si in xrange(state['decoder_stack']): proj_h.append(MultiLayer( rng, n_in=state['dim'], n_hids=[outdim], activation = 'lambda x: x', bias_scale = [state['bias_mlp']/3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], name='proj_h_%d'%si)) if state['bigram']: proj_word = MultiLayer( rng, n_in=state['rank_n_approx'], n_hids=[outdim], activation=['lambda x:x'], bias_scale = [state['bias_mlp']/3], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias = False, name='emb_words_lm') if state['deep_out']: indim = 0 pieces = 0 act_layer = UnaryOp(activation=eval(state['unary_activ'])) drop_layer = DropOp(rng=rng, dropout=state['dropout']) if state['deep_out']: indim = state['dim_mlp'] / state['maxout_part'] rank_n_approx = state['rank_n_approx'] rank_n_activ = state['rank_n_activ'] else: indim = state['rank_n_approx'] rank_n_approx = 0 rank_n_activ = None output_layer = SoftmaxLayer( rng, indim, state['nouts'], state['weight_scale'], -1, rank_n_approx = rank_n_approx, rank_n_activ = rank_n_activ, weight_noise=state['weight_noise'], init_fn=state['weight_init_fn'], name='out') def _pop_op(everything, accum, everything_max = None, everything_min = None, word = None, aword = None, one_step=False, use_noise=True): rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise) for si in xrange(1,state['decoder_stack']): rval += proj_h[si](accum[si], one_step=one_step, use_noise=use_noise) if state['mult_out']: rval = rval * everything else: rval = rval + everything if aword and state['avg_word']: wcode = aword if one_step: if state['mult_out']: rval = rval * wcode else: rval = rval + wcode else: if not isinstance(wcode, TT.TensorVariable): wcode = wcode.out shape = wcode.shape rshape = rval.shape rval = rval.reshape([rshape[0]/shape[0], shape[0], rshape[1]]) wcode = wcode.dimshuffle('x', 0, 1) if state['mult_out']: rval = rval * wcode else: rval = rval + wcode rval = rval.reshape(rshape) if word and state['bigram']: if one_step: if state['mult_out']: rval *= proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) else: rval += proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) else: if isinstance(word, TT.TensorVariable): shape = word.shape ndim = word.ndim else: shape = word.shape ndim = word.out.ndim pword = proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) shape_pword = pword.shape if ndim == 1: pword = Shift()(pword.reshape([shape[0], 1, outdim])) else: pword = Shift()(pword.reshape([shape[0], shape[1], outdim])) if state['mult_out']: rval *= pword.reshape(shape_pword) else: rval += pword.reshape(shape_pword) if 
state['deep_out']: rval = drop_layer(act_layer(rval), use_noise=use_noise) return rval pop_op = Operator(_pop_op) # 3. Constructing the model gater_below = None if state['rec_gating']: gater_below = gater_words[0](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[0](emb(x)) encoder_acts = [add_op(emb_words[0](emb(x)), x_mask, bs=x_mask.shape[1], si=0, gater_below=gater_below, reseter_below=reseter_below)] if state['encoder_stack'] > 1: everything = encoder_proj[0](last(encoder_acts[-1])) for si in xrange(1,state['encoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words[si](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[si](emb(x)) encoder_acts.append(add_op(emb_words[si](emb(x)), x_mask, bs=x_mask.shape[1], si=si, state_below=encoder_acts[-1], gater_below=gater_below, reseter_below=reseter_below)) if state['encoder_stack'] > 1: everything += encoder_proj[si](last(encoder_acts[-1])) if state['encoder_stack'] <= 1: encoder = encoder_acts[-1] everything = LastState(ntimes=True,n=y.shape[0])(encoder) else: everything = encoder_act_layer(everything) everything = everything.reshape([1, everything.shape[0], everything.shape[1]]) everything = LastState(ntimes=True,n=y.shape[0])(everything) if state['bias_code']: init_state = [bc(everything[-1]) for bc in bias_code] else: init_state = [None for bc in bias_code] if state['avg_word']: shape = x.shape pword = emb(x).out.reshape([shape[0], shape[1], state['rank_n_approx']]) pword = pword * x_mask.dimshuffle(0, 1, 'x') aword = pword.sum(0) / TT.maximum(1., x_mask.sum(0).dimshuffle(0, 'x')) aword = word_code(aword, use_noise=False) else: aword = None gater_below = None if state['rec_gating']: gater_below = gater_words_t[0](emb_t(y0)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[0](emb_t(y0)) has_said = [add_t_op(emb_words_t[0](emb_t(y0)), everything, y_mask, bs=y_mask.shape[1], gater_below = gater_below, reseter_below = reseter_below, init_state=init_state[0], si=0)] for si in xrange(1,state['decoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words_t[si](emb_t(y0)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[si](emb_t(y0)) has_said.append(add_t_op(emb_words_t[si](emb_t(y0)), everything, y_mask, bs=y_mask.shape[1], state_below = has_said[-1], gater_below = gater_below, reseter_below = reseter_below, init_state=init_state[si], si=si)) if has_said[0].out.ndim < 3: for si in xrange(state['decoder_stack']): shape_hs = has_said[si].shape if y0.ndim == 1: shape = y0.shape has_said[si] = Shift()(has_said[si].reshape([shape[0], 1, state['dim_mlp']])) else: shape = y0.shape has_said[si] = Shift()(has_said[si].reshape([shape[0], shape[1], state['dim_mlp']])) has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :], init_state[si]) has_said[si] = has_said[si].reshape(shape_hs) else: for si in xrange(state['decoder_stack']): has_said[si] = Shift()(has_said[si]) has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :], init_state[si]) model = pop_op(proj_code(everything), has_said, word=y0, aword = aword) nll = output_layer.train(state_below=model, target=y0, mask=y_mask, reg=None) / TT.cast(y.shape[0]*y.shape[1], 'float32') valid_fn = None noise_fn = None x = TT.lvector(name='x') n_steps = TT.iscalar('nsteps') temp = TT.scalar('temp') gater_below = None if state['rec_gating']: gater_below = gater_words[0](emb(x)) reseter_below = None if state['rec_reseting']: 
reseter_below = reseter_words[0](emb(x)) encoder_acts = [add_op(emb_words[0](emb(x),use_noise=False), si=0, use_noise=False, gater_below=gater_below, reseter_below=reseter_below)] if state['encoder_stack'] > 1: everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False) for si in xrange(1,state['encoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words[si](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[si](emb(x)) encoder_acts.append(add_op(emb_words[si](emb(x),use_noise=False), si=si, state_below=encoder_acts[-1], use_noise=False, gater_below = gater_below, reseter_below = reseter_below)) if state['encoder_stack'] > 1: everything += encoder_proj[si](last(encoder_acts[-1]), use_noise=False) if state['encoder_stack'] <= 1: encoder = encoder_acts[-1] everything = last(encoder) else: everything = encoder_act_layer(everything) init_state = [] for si in xrange(state['decoder_stack']): if state['bias_code']: init_state.append(TT.reshape(bias_code[si](everything, use_noise=False), [1, state['dim']])) else: init_state.append(TT.alloc(numpy.float32(0), 1, state['dim'])) if state['avg_word']: aword = emb(x,use_noise=False).out.mean(0) aword = word_code(aword, use_noise=False) else: aword = None def sample_fn(*args): aidx = 0; word_tm1 = args[aidx] aidx += 1; prob_tm1 = args[aidx] has_said_tm1 = [] for si in xrange(state['decoder_stack']): aidx += 1; has_said_tm1.append(args[aidx]) aidx += 1; ctx = args[aidx] if state['avg_word']: aidx += 1; awrd = args[aidx] val = pop_op(proj_code(ctx), has_said_tm1, word=word_tm1, aword=awrd, one_step=True, use_noise=False) sample = output_layer.get_sample(state_below=val, temp=temp) logp = output_layer.get_cost( state_below=val.out.reshape([1, TT.cast(output_layer.n_in, 'int64')]), temp=temp, target=sample.reshape([1,1]), use_noise=False) gater_below = None if state['rec_gating']: gater_below = gater_words_t[0](emb_t(sample)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[0](emb_t(sample)) has_said_t = [add_t_op(emb_words_t[0](emb_t(sample)), ctx, prev_val=has_said_tm1[0], gater_below=gater_below, reseter_below=reseter_below, one_step=True, use_noise=True, si=0)] for si in xrange(1, state['decoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words_t[si](emb_t(sample)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[si](emb_t(sample)) has_said_t.append(add_t_op(emb_words_t[si](emb_t(sample)), ctx, prev_val=has_said_tm1[si], gater_below=gater_below, reseter_below=reseter_below, one_step=True, use_noise=True, si=si, state_below=has_said_t[-1])) for si in xrange(state['decoder_stack']): if isinstance(has_said_t[si], list): has_said_t[si] = has_said_t[si][-1] rval = [sample, TT.cast(logp, 'float32')] + has_said_t return rval sampler_params = [everything] if state['avg_word']: sampler_params.append(aword) states = [TT.alloc(numpy.int64(0), n_steps)] states.append(TT.alloc(numpy.float32(0), n_steps)) states += init_state outputs, updates = scan(sample_fn, states = states, params = sampler_params, n_steps= n_steps, name='sampler_scan' ) samples = outputs[0] probs = outputs[1] sample_fn = theano.function( [n_steps, temp, x], [samples, probs.sum()], updates=updates, profile=False, name='sample_fn') model = LM_Model( cost_layer = nll, weight_noise_amount=state['weight_noise_amount'], valid_fn = valid_fn, sample_fn = sample_fn, clean_before_noise_fn = False, noise_fn = noise_fn, 
indx_word=state['indx_word_target'], indx_word_src=state['indx_word'], character_level = False, rng = rng) if state['loopIters'] > 0: algo = SGD(model, state, train_data) else: algo = None def hook_fn(): if not hasattr(model, 'word_indxs'): model.load_dict() if not hasattr(model, 'word_indxs_src'): model.word_indxs_src = model.word_indxs old_offset = train_data.offset if state['sample_reset']: train_data.reset() ns = 0 for sidx in xrange(state['sample_n']): while True: batch = train_data.next() if batch: break x = batch['x'] y = batch['y'] #xbow = batch['x_bow'] masks = batch['x_mask'] if x.ndim > 1: for idx in xrange(x.shape[1]): ns += 1 if ns > state['sample_max']: break print 'Input: ', for k in xrange(x[:,idx].shape[0]): print model.word_indxs_src[x[:,idx][k]], if model.word_indxs_src[x[:,idx][k]] == '<eol>': break print '' print 'Target: ', for k in xrange(y[:,idx].shape[0]): print model.word_indxs[y[:,idx][k]], if model.word_indxs[y[:,idx][k]] == '<eol>': break print '' senlen = len(x[:,idx]) if len(numpy.where(masks[:,idx]==0)[0]) > 0: senlen = numpy.where(masks[:,idx]==0)[0][0] if senlen < 1: continue xx = x[:senlen, idx] #xx = xx.reshape([xx.shape[0], 1]) model.get_samples(state['seqlen']+1, 1, xx) else: ns += 1 model.get_samples(state['seqlen']+1, 1, x) if ns > state['sample_max']: break train_data.offset = old_offset return main = MainLoop(train_data, valid_data, None, model, algo, state, channel, reset = state['reset'], hooks = hook_fn) if state['reload']: main.load() if state['loopIters'] > 0: main.main() if state['sampler_test']: # This is a test script: we only sample if not hasattr(model, 'word_indxs'): model.load_dict() if not hasattr(model, 'word_indxs_src'): model.word_indxs_src = model.word_indxs indx_word=pkl.load(open(state['word_indx'],'rb')) try: while True: try: seqin = raw_input('Input Sequence: ') n_samples = int(raw_input('How many samples? ')) alpha = float(raw_input('Inverse Temperature? ')) seqin = seqin.lower() seqin = seqin.split() seqlen = len(seqin) seq = numpy.zeros(seqlen+1, dtype='int64') for idx,sx in enumerate(seqin): try: seq[idx] = indx_word[sx] except: seq[idx] = indx_word[state['oov']] seq[-1] = state['null_sym_source'] except Exception: print 'Something wrong with your input! Try again!' continue sentences = [] all_probs = [] for sidx in xrange(n_samples): #import ipdb; ipdb.set_trace() [values, probs] = model.sample_fn(seqlen * 3, alpha, seq) sen = [] for k in xrange(values.shape[0]): if model.word_indxs[values[k]] == '<eol>': break sen.append(model.word_indxs[values[k]]) sentences.append(" ".join(sen)) all_probs.append(-probs) sprobs = numpy.argsort(all_probs) for pidx in sprobs: print pidx,"(%f):"%(-all_probs[pidx]),sentences[pidx] print except KeyboardInterrupt: print 'Interrupted' pass
def init_cpu(self, options, channel, data, model): n_params = len(self.model.params) # Step 1. Compile function for computing eucledian gradients self.reset_gradients = theano.function( [], [], updates = zip(self.gs, [TT.zeros_like(g) for g in self.gs]), on_unused_input='warn', mode=cpu_mode, name='reset_gradients', profile=options['profile']) gbdx = TT.iscalar('grad_batch_idx') comp_grad = TT.iscalar('comp_grad') print 'Constructing grad function' loc_inputs = [x.type() for x in model.inputs] srng = RandomStreams(numpy.random.randint(1e5)) cst = time.time() def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[2: 2 + n_params], gs)] _gs = [x for x in gs] _nw_gs = [gpu_from_host(g) for g in nw_gs] nw_gs = ifelse(comp_grad, _nw_gs, _gs, gpu=True) nw_gs = [x.type.filter_variable(y) for x,y in zip(args[2:],nw_gs)] return [args[0] + const(1), args[1] + nw_cost] + nw_gs ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0) for shp in model.params_shape] idx0 = TT.unbroadcast(const([0]),0) cost0 = TT.unbroadcast(const([0]),0) n_steps = TT.iscalar('nsteps') rvals, updates = scan(grad_step, states=[idx0, cost0] + ig, n_steps=n_steps, name='grad_loop', mode=gpu_mode, profile=options['profile']) nw_gs = [x[0] / TT.cast(n_steps, 'float32') for x in rvals[2: 2 + n_params]] nw_gs = [og + nwg for og, nwg in zip(self.gs, nw_gs)] fcost = rvals[1][0] / TT.cast(n_steps, 'float32') updates.update(dict(zip(self.gs, nw_gs))) grad_inps = zip(loc_inputs, self.shared_data) print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gbdx, comp_grad, n_steps], fcost, updates=updates, on_unused_input='warn', givens=dict(grad_inps), name='compute_eucledian_gradients', mode=gpu_mode, profile=options['profile']) print 'Time to compile grad', print_time(time.time() - cst) cst = time.time() def jacob_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']:(idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) replace.update(dict(zip(model.params, model.cpu_params))) mode=cpu_mode params = model.cpu_params # Compute jacobi nw_outs = safe_clone(model.outs, replace=replace) final_results = dict(zip(params, [None]*n_params)) for nw_out, out_operator in zip(nw_outs, model.outs_operator): if out_operator == 'sigmoid': denom = numpy.float32(options['cbs']) denom *= nw_out denom *= (numpy.float32(1) - nw_out) elif out_operator == 'softmax': denom = numpy.float32(options['cbs']) denom *= nw_out else: denom = numpy.float32(options['cbs']) factor = TT.sqrt(numpy.float32(1) / denom) r = TT.sgn(srng.normal(nw_out.shape)) r = r * factor loc_params = [x for x in params if x in theano.gof.graph.inputs([nw_out])] jvs = TT.Lop(nw_out, loc_params, r) for lp, lj in zip(loc_params, jvs): if final_results[lp] is None: final_results[lp] = TT.sqr(lj) else: final_results[lp] = final_results[lp] + TT.sqr(lj) nw_js = [oj + final_results[p] for oj, p in zip(args[1:1+n_params], params)] return [args[0] + const(1)] + nw_js ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp),0) for shp in model.params_shape] idx0 = TT.unbroadcast(const([0]),0) n_steps = options['mbs'] // options['cbs'] mode = cpu_mode rvals, updates = scan(jacob_step, states=[idx0] + ij, n_steps=n_steps, name='jacob_loop', mode=mode, 
profile=options['profile']) nw_js = [x[0] for x in rvals[1:1+n_params]] updates.update(dict(zip(self.js, nw_js))) grad_inps = [(x, y[gbdx*options['mbs']:(gbdx+1)*options['mbs']]) for x,y in zip(loc_inputs[:1], self.cpu_shared_data[:1])] print 'Compiling grad function' self.compute_jacobi_preconditioner = theano.function( [gbdx], [], updates=updates, on_unused_input='warn', givens=dict(grad_inps), name='jacobi_preconditioner_gradients', mode=mode, profile=options['profile']) print 'Time compile jacobi ', print_time(time.time() - cst) cst = time.time() # Step 2. Compile function for Computing Riemannian gradients rbdx = TT.iscalar('riemmanian_batch_idx') mode = cpu_mode def compute_Gv(*args): cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX), name ='cgv%d'%idx) for idx, shp in enumerate(model.params_shape)] print_mem('allocated mem for cgv') idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict(zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [x for x in model.params if x in theano.gof.graph.inputs([nw_out])] loc_args = [x for x, y in zip(cgv, model.params) if y in theano.gof.graph.inputs([nw_out])] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const(options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params)] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=gpu_mode, name='Gv_step', profile=options['profile']) final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]] grad_inps = zip(loc_inputs, self.shared_data) loc_fn = theano.function([], final_Gvs, updates = updates, givens = dict(grad_inps), on_unused_input='warn', mode=gpu_mode, name='loc_fn', profile = options['profile']) fake_op = FakeGPUShell(cgv, loc_fn, len(cgv)) return fake_op(*args), {} print 'Constructing riemannian gradient function' norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs)) mreg = TT.scalar('mreg') rvals = minres.minres( compute_Gv, [x / norm_grads for x in self.gs], Ms = self.js, rtol=options['mrtol'], shift = - mreg, maxit=options['miters'], mode=mode, profile=options['profile']) nw_rs = [x * norm_grads for x in rvals[0]] flag = rvals[1] niters = rvals[2] rel_residual = rvals[3] rel_Aresidual = rvals[4] Anorm = rvals[5] Acond = rvals[6] xnorm = rvals[7] Axnorm = rvals[8] updates = rvals[9] norm_ord0 = TT.max(abs(nw_rs[0])) for r in nw_rs[1:]: norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r))) updates.update(dict(zip(self.rs, nw_rs))) print 'Compiling riemannian gradient function' self.compute_riemannian_gradients = theano.function( [mreg], [flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm, Axnorm, norm_grads, norm_ord0], updates=updates, name='compute_riemannian_gradients', mode=cpu_mode, on_unused_input='warn', 
profile=options['profile']) print 'Time to compile Riemannian', print_time(time.time() - cst) cst = time.time() # Step 3. Compile function for evaluating cost and updating # parameters print 'constructing evaluation function' lr = TT.scalar('lr') nw_ps = [p - lr * r for p, r in zip(model.cpu_params, self.rs)] nw_ds = [ -r for r in self.rs] self.update_cparams = theano.function( [lr], updates = dict(zip(model.cpu_params, nw_ps)), name='update_cparam', allow_input_downcast=True, mode=cpu_mode, on_unused_input='warn', profile=options['profile']) newparams = [y.type.filter_variable(x) for x,y in zip(nw_ps, model.params)] self.update_params = theano.function([lr], updates = dict(zip(model.params, newparams)), name='update_params', on_unused_input='warn', allow_input_downcast=True, mode=cpu_mode, profile=options['profile']) self.scalar_grad = theano.function( [], sum(TT.sum(x*y) for x,y in zip(self.gs, self.ds)), name='scalar_grad', mode=cpu_mode, on_unused_input='warn', profile=options['profile']) nsteps = self.options['ebs'] // self.options['cbs'] self.current_alpha = numpy.inf def ls_cost(alpha, pos): if alpha != self.current_alpha: self.current_alpha = alpha self.update_params(alpha) return self.compute_eucledian_gradients(pos, 0, nsteps) self.ls_cost_fn = ls_cost def ls_grad(alpha, pos): if alpha != self.current_alpha: self.current_alpha = alpha self.update_params(alpha) self.reset_gradients() self.compute_eucledian_gradients(pos, 1, nsteps) return self.scalar_grad() self.ls_grad_fn = ls_grad self.old_score = 50000 n_steps = options['ebs']// options['cbs'] def ls_error(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) replace.update(dict(zip(model.params, model.cpu_params))) nw_cost = \ TT.cast(safe_clone(model.err, replace=replace), 'float32') return [_idx + const(1), acc + nw_cost] states = [TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0]))] rvals, _ = scan(ls_error, states = states, n_steps = n_steps, name='ls_err_step', mode=cpu_mode, profile = options['profile']) ferr = rvals[1][0] / const(n_steps) self.compute_error = theano.function([], ferr, givens=dict(zip(loc_inputs, self.cpu_shared_data)), name='compute_err', mode=cpu_mode, on_unused_input='warn', profile=options['profile']) print 'Compile eval time', print_time(time.time() - cst) self.old_cost = 1e6 self.options = options self.perm = self.rng.permutation(4) self.pos = 0
def minres(compute_Av, bs, rtol=npy_floatX(1e-6), maxiter=20, Ms=None, damp=npy_floatX(0.), maxxnorm=npy_floatX(1e15), Acondlim=npy_floatX(1e16), mode=None, xinit=None, profile=0):
    """
    DESCRIPTION:
        minres attempts to find the minimum-length and minimum-residual-norm
        solution x to the system of linear equations A*x = b or the least
        squares problem min ||Ax - b||. The n-by-n coefficient matrix A must
        be symmetric (but need not be positive definite or invertible). The
        right-hand-side column vector b must have length n.

    INPUTS:
    :param compute_Av: callable returning the symbolic expression for `Av`.
        `v` can be a set of parameters
    :param bs: list of Theano expressions. We are looking to compute
        A^{-1} \cdot bs
    :param rtol: Optional, real, specifies the tolerance of the method.
        Default is 1e-6
    :param maxiter: Optional, positive integer, specifies the maximum number
        of iterations. Default is 20
    :param Ms: List of Theano expressions of the same shape as `bs`. The
        method uses these to precondition with diag(Ms)
    :param damp: Optional, scalar, real or complex. Default is 0. Effectively
        solves the system (A + damp I) * x = b.
    :param maxxnorm: real positive, maximum bound on NORM(x). Default is 1e15.
    :param Acondlim: real positive, maximum bound on COND(A). Default is 1e16.
    :param xinit: None, or list of ndarrays (of the same length as bs)
        containing the initial guess for x[i].

    OUTPUTS:
        x       n-vector, estimated solution
        flag    integer, convergence flag
               -1  beta2 = 0. If M = I, b and x are eigenvectors.
                0  beta1 = 0. The exact solution is x = 0.
                1  A solution to (poss. singular) Ax = b found, given rtol.
                2  Pseudoinverse solution for singular LS problem, given rtol.
                3  A solution to (poss. singular) Ax = b found, given eps.
                4  Pseudoinverse solution for singular LS problem, given eps.
                5  x has converged to an eigenvector.
                6  xnorm has exceeded maxxnorm.
                7  Acond has exceeded Acondlim.
                8  The iteration limit was reached.
                9  It is a least squares problem but no converged solution yet.
        iter    integer, iteration number at which x was computed:
                0 <= iter <= maxiter.
        relres  real positive, the relative residual, defined as
                NORM(b-A*x)/(NORM(A) * NORM(x) + NORM(b)), computed
                recurrently here. If flag is 1 or 3, relres <= TOL.
        relAres real positive, the relative NORM(Ar) := NORM(Ar) / NORM(A),
                computed recurrently here. If flag is 2 or 4, relAres <= TOL.
        Anorm   real positive, estimate of the matrix 2-norm of A.
        Acond   real positive, estimate of the condition number of A with
                respect to the 2-norm.
        xnorm   non-negative real, recurrently computed NORM(x).
        Axnorm  non-negative real, recurrently computed NORM(A * x).

    EXAMPLE 1:
        n = 100; on = ones(n,1); A = spdiags([-2*on 4*on -2*on],-1:1,n,n);
        b = sum(A,2); rtol = 1e-10; maxiter = 50; M = spdiags(4*on,0,n,n);
        x = minresSOL69(A, b, rtol, maxiter, M);
        Use this matrix-vector product function
            function y = afun(x,n)
            y = 4 * x;
            y(2:n) = y(2:n) - 2 * x(1:n-1);
            y(1:n-1) = y(1:n-1) - 2 * x(2:n);
        as input to minresSOL69
        x1 = minresSOL69(@afun, b, rtol, maxiter, M);

    EXAMPLE 2: A is the Laplacian on a 50 by 50 grid, singular and indefinite.
        n = 50; N = n^2; on = ones(n,1);
        B = spdiags([on on on], -1:1, n, n);
        A = sparse([],[],[],N,N,(3*n-2)^2);
        for i=1:n
            A((i-1)*n+1:i*n,(i-1)*n+1:i*n) = B;
            if i*n+1 < n*n, A(i*n+1:(i+1)*n,(i-1)*n+1:i*n)=B; end;
            if (i-2)*n+1 > 0 A((i-2)*n+1:(i-1)*n,(i-1)*n+1:i*n)=B; end;
        end
        b = sum(A,2); rtol = 1e-5; maxxnorm = 1e2;
        damp = 0; Acondlim = []; show = 1; M = [];
        x = minresSOL69( A, b, rtol, N, M, damp, maxxnorm, Acondlim, show);

    EXAMPLE 3: A is diagonal, singular and indefinite.
h = 1; a = -10; b = -a; n = 2*b/h + 1; A = spdiags((a:h:b)', 0, n, n); b = ones(n,1); rtol = 1e-6; maxxnorm = 1e2; damp = 0; Acondlim = []; show = 1; M = []; x = minresSOL69( A, b, rtol, N, M, damp, maxxnorm, Acondlim, show); REFERENCES: Sou-Cheng Choi's PhD Dissertation, Stanford University, 2006. http://www.stanford.edu/group/SOL/software.html """ if not isinstance(bs, (tuple, list)): bs = [bs] return_as_list = False else: bs = list(bs) return_as_list = True eps = npy_floatX(1e-23) # Initialise flag = theano.shared(npy_floatX(0.)) #------------------------------------------------------------------ # Set up p and v for the first Lanczos vector v1. # p = beta1 P' v1, where P = C**(-1). # v is really P' v1. #------------------------------------------------------------------ if xinit is None: xinit = [TT.zeros_like(b) for b in bs] r3s = [b for b in bs] r2s = [b for b in bs] r1s = [b for b in bs] beta1 = norm(bs) if Ms is not None: r3s = [b/m for b,m in zip(bs,Ms)] beta1 = norm(r3s, bs) else: init_Ax = compute_Av(*xinit) res = [bs[i] - init_Ax[i] for i in xrange(len(bs))] r3s = copy.copy(res) r2s = copy.copy(res) r1s = copy.copy(res) beta1 = norm(res) if Ms is not None: r3s = [r/m for r,m in zip(r3s, Ms)] beta1 = norm(r3s, res) #------------------------------------------------------------------ ## Initialize other quantities. # Note that Anorm has been initialized by IsOpSym6. # ------------------------------------------------------------------ bnorm = beta1 n_params = len(bs) def loop(niter, beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn, Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm, relrnorm, relArnorml, Anorm, flag, *args): #----------------------------------------------------------------- ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,... # The general iteration is similar to the case k = 1 with v0 = 0: # # p1 = Operator * v1 - beta1 * v0, # alpha1 = v1'p1, # q2 = p2 - alpha1 * v1, # beta2^2 = q2'q2, # v2 = (1/beta2) q2. # # Again, p = betak P vk, where P = C**(-1). # .... more description needed. #----------------------------------------------------------------- xs = args[0 * n_params: 1 * n_params] r1s = args[1 * n_params: 2 * n_params] r2s = args[2 * n_params: 3 * n_params] r3s = args[3 * n_params: 4 * n_params] dls = args[4 * n_params: 5 * n_params] ds = args[5 * n_params: 6 * n_params] betal = beta beta = betan vs = [r3/beta for r3 in r3s] r3s = compute_Av(*vs) r3s = [r3 + damp*v for r3,v in zip(r3s, vs)] r3s = [TT.switch(TT.ge(niter, numpy.float64(1.)), r3 - (beta/betal)*r1, r3) for r3, r1 in zip(r3s, r1s)] alpha = sqnorm(r3s, vs) r3s = [r3 - (alpha/beta)*r2 for r3,r2 in zip(r3s,r2s)] r1s = [r2 for r2 in r2s] r2s = [r3 for r3 in r3s] if Ms is not None: r3s = [r3/M for r3, M in zip(r3s, Ms)] betan = norm(r2s, r3s) else: betan = norm(r3s) pnorml = pnorm pnorm = TT.switch(TT.eq(niter, npy_floatX(0.)), TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)), TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) + TT.sqr(beta))) #----------------------------------------------------------------- ## Apply previous rotation Qk-1 to get # [dlta_k epln_{k+1}] = [cs sn][dbar_k 0 ] # [gbar_k dbar_{k+1} ] [sn -cs][alpha_k beta_{k+1}]. 
#----------------------------------------------------------------- dbar = dbarn epln = eplnn dlta = cs*dbar + sn*alpha gbar = sn*dbar - cs*alpha eplnn = sn*betan dbarn = - cs*betan; ## Compute the current plane rotation Qk gammal2 = gammal gammal = gamma cs, sn, gamma = symGivens2(gbar, betan) tau = cs*phi phi = sn*phi Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau)) # Update d dl2s = [dl for dl in dls] dls = [d for d in ds] ds = [TT.switch(TT.neq(gamma, npy_floatX(0.)), (v - epln*dl2 - dlta*dl)/gamma, v) for v,dl2,dl in zip(vs,dl2s, dls)] d_norm = TT.switch(TT.neq(gamma,npy_floatX(0.)), norm(ds), TT.constant((npy_floatX(numpy.inf)))) # Update x except if it will become too big xnorml = xnorm dl2s = [x for x in xs] xs = [x + tau*d for x,d in zip(xs,ds)] xnorm = norm(xs) xs = [TT.switch(TT.ge(xnorm, maxxnorm), dl2, x) for dl2,x in zip(dl2s,xs)] flag = TT.switch(TT.ge(xnorm, maxxnorm), npy_floatX(6.), flag) # Estimate various norms rnorml = rnorm # ||r_{k-1}|| Anorml = Anorm Acondl = Acond relrnorml = relrnorm flag_no_6 = TT.neq(flag, npy_floatX(6.)) Dnorm = TT.switch(flag_no_6, TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)), Dnorm) xnorm = TT.switch(flag_no_6, norm(xs), xnorm) rnorm = TT.switch(flag_no_6, phi, rnorm) relrnorm = TT.switch(flag_no_6, rnorm / (Anorm*xnorm + bnorm), relrnorm) Tnorm = TT.switch(flag_no_6, TT.switch(TT.eq(niter, npy_floatX(0.)), TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)), TT.sqrt(TT.sqr(Tnorm) + TT.sqr(beta) + TT.sqr(alpha) + TT.sqr(betan))), Tnorm) Anorm = TT.maximum(Anorm, pnorm) Acond = Anorm * Dnorm rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn)) Anorml = rnorml*rootl relArnorml = rootl / Anorm #--------------------------------------------------------------- # See if any of the stopping criteria are satisfied. # In rare cases, flag is already -1 from above (Abar = const*I). #--------------------------------------------------------------- epsx = Anorm * xnorm * eps epsr = Anorm * xnorm * rtol #Test for singular Hk (hence singular A) # or x is already an LS solution (so again A must be singular). 
t1 = npy_floatX(1) + relrnorm t2 = npy_floatX(1) + relArnorml flag = TT.switch( TT.bitwise_or(TT.eq(flag, npy_floatX(0.)), TT.eq(flag, npy_floatX(6.))), TT.switch(TT.le(t1, npy_floatX(1.)), npy_floatX(3.), TT.switch(TT.le(t2, npy_floatX(1.)), npy_floatX(4.), TT.switch(TT.le(relrnorm, rtol), npy_floatX(1.), TT.switch(TT.le(Anorm, npy_floatX(1e-20)), npy_floatX(12), TT.switch(TT.le(relArnorml, rtol), npy_floatX(10.), TT.switch(TT.ge(epsx, beta1), npy_floatX(5.), TT.switch(TT.ge(xnorm, maxxnorm), npy_floatX(6.), TT.switch(TT.ge(niter, TT.cast(maxiter,floatX)), npy_floatX(8.), flag)))))))), flag) flag = TT.switch(TT.lt(Axnorm, rtol*Anorm*xnorm), npy_floatX(11.), flag) return [ niter + npy_floatX(1.), beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn, Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm, relrnorm, relArnorml, Anorm, flag] + xs + r1s + r2s + r3s + dls + ds, \ theano.scan_module.scan_utils.until(TT.neq(flag,0)) states = [] # 0 niter states.append(TT.constant(npy_floatX([0]))) # 1 beta states.append(TT.constant(npy_floatX([0]))) # 2 betan states.append(TT.unbroadcast(TT.shape_padleft(beta1),0)) # 3 phi states.append(TT.unbroadcast(TT.shape_padleft(beta1),0)) # 4 Acond states.append(TT.constant(npy_floatX([1]))) # 5 cs states.append(TT.constant(npy_floatX([-1]))) # 6 dbarn states.append(TT.constant(npy_floatX([0]))) # 7 eplnn states.append(TT.constant(npy_floatX([0]))) # 8 rnorm states.append(TT.unbroadcast(TT.shape_padleft(beta1),0)) # 9 sn states.append(TT.constant(npy_floatX([0]))) # 10 Tnorm states.append(TT.constant(npy_floatX([0]))) # 11 rnorml states.append(TT.unbroadcast(TT.shape_padleft(beta1),0)) # 12 xnorm states.append(TT.constant(npy_floatX([0]))) # 13 Dnorm states.append(TT.constant(npy_floatX([0]))) # 14 gamma states.append(TT.constant(npy_floatX([0]))) # 15 pnorm states.append(TT.constant(npy_floatX([0]))) # 16 gammal states.append(TT.constant(npy_floatX([0]))) # 17 Axnorm states.append(TT.constant(npy_floatX([0]))) # 18 relrnorm states.append(TT.constant(npy_floatX([1]))) # 19 relArnorml states.append(TT.constant(npy_floatX([1]))) # 20 Anorm states.append(TT.constant(npy_floatX([0]))) # 21 flag states.append(TT.constant(npy_floatX([0]))) xs = [TT.unbroadcast(TT.shape_padleft(xi),0) for xi in xinit] ds = [TT.unbroadcast(TT.shape_padleft(xi),0) for xi in xinit] dls = [TT.unbroadcast(TT.shape_padleft(xi),0) for xi in xinit] r1s = [TT.unbroadcast(TT.shape_padleft(r1),0) for r1 in r1s] r2s = [TT.unbroadcast(TT.shape_padleft(r2),0) for r2 in r2s] r3s = [TT.unbroadcast(TT.shape_padleft(r3),0) for r3 in r3s] rvals, lupds = scan(loop, states = states + xs + r1s + r2s + r3s + dls + ds, n_steps = maxiter + numpy.int32(1), name='minres', profile=profile, mode=mode) niters = TT.cast(rvals[0][0], 'int32') flag = TT.cast(rvals[21][0], 'int32') relres = rvals[18][0] relAres = rvals[19][0] Anorm = rvals[20][0] Acond = rvals[4][0] xnorm = rvals[12][0] Axnorm = rvals[17][0] sol = [x[0] for x in rvals[22:22+n_params]] return sol, flag, niters, relres, relAres, Anorm, Acond, xnorm, Axnorm
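# Convenience table (not in the original source) mapping the `flag` values
# documented above to short messages, e.g. for logging the value returned by
# `minres`.  Codes 10-12 are not listed in the docstring; their descriptions
# are inferred from the loop body above.
MINRES_MESSAGES = {
    -1: 'beta2 = 0. If M = I, b and x are eigenvectors.',
    0: 'beta1 = 0. The exact solution is x = 0.',
    1: 'A solution to (poss. singular) Ax = b found, given rtol.',
    2: 'Pseudoinverse solution for singular LS problem, given rtol.',
    3: 'A solution to (poss. singular) Ax = b found, given eps.',
    4: 'Pseudoinverse solution for singular LS problem, given eps.',
    5: 'x has converged to an eigenvector.',
    6: 'xnorm has exceeded maxxnorm.',
    7: 'Acond has exceeded Acondlim.',
    8: 'The iteration limit was reached.',
    9: 'It is a least squares problem but no converged solution yet.',
    10: 'relAres (NORM(Ar)/NORM(A)) fell below rtol.',
    11: 'NORM(Ax) fell below rtol * NORM(A) * NORM(x).',
    12: 'NORM(A) is effectively zero (<= 1e-20).',
}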
def compute_Gv(*args): cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX), name ='cgv%d'%idx) for idx, shp in enumerate(model.params_shape)] print_mem('allocated mem for cgv') idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict(zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [x for x in model.params if x in theano.gof.graph.inputs([nw_out])] loc_args = [x for x, y in zip(cgv, model.params) if y in theano.gof.graph.inputs([nw_out])] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const(options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params)] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=gpu_mode, name='Gv_step', profile=options['profile']) final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]] grad_inps = zip(loc_inputs, self.shared_data) loc_fn = theano.function([], final_Gvs, updates = updates, givens = dict(grad_inps), on_unused_input='warn', mode=gpu_mode, name='loc_fn', profile = options['profile']) fake_op = FakeGPUShell(cgv, loc_fn, len(cgv)) return fake_op(*args), {}
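# `FakeGPUShell` is defined elsewhere in the code base; based only on how it
# is constructed and called above (FakeGPUShell(cgv, loc_fn, len(cgv)), then
# fake_op(*args)), it plausibly behaves like the sketch below: a Theano Op
# whose perform() copies the incoming CPU vectors into the GPU-resident
# shared buffers `cgv` and then runs the precompiled GPU function.  This is
# an illustrative reconstruction, not the original implementation.
import theano
import theano.tensor as TT


class GPUShellSketch(theano.gof.Op):
    def __init__(self, state, fn, n_outs):
        self.state = state      # list of shared variables (the `cgv` buffers)
        self.fn = fn            # precompiled GPU function reading `state`
        self.n_outs = n_outs

    def make_node(self, *args):
        args = [TT.as_tensor_variable(a) for a in args]
        # One output per metric-vector product, same type as the input slot.
        outputs = [a.type() for a in args[:self.n_outs]]
        return theano.gof.Apply(self, args, outputs)

    def perform(self, node, inputs, output_storage):
        # Move the CPU values of v into the GPU shared buffers ...
        for buf, value in zip(self.state, inputs):
            buf.set_value(value, borrow=True)
        # ... run the scan-based Gv computation on the GPU ...
        results = self.fn()
        # ... and hand the results back to the CPU graph (e.g. to MinRes).
        for storage, value in zip(output_storage, results):
            storage[0] = value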
def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo, phi, derphi, phi0, derphi0, c1, c2, n_iters=10, profile = False, mode=theano.Mode(linker='cvm')): """ TODO: re-write me Part of the optimization algorithm in `scalar_search_wolfe2`. a_lo : scalar (step size) a_hi : scalar (step size) phi_lo : scalar (value of f at a_lo) phi_hi : scalar ( value of f at a_hi) derphi_lo : scalar ( value of derivative at a_lo) phi : callable -> generates computational graph derphi: callable -> generates computational graph phi0 : scalar ( value of f at 0) derphi0 : scalar (value of the derivative at 0) c1 : scalar (wolfe parameter) c2 : scalar (wolfe parameter) profile: if you want printouts of profiling information """ # Function reprensenting the computations of one step of the while loop def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi, phi_lo, derphi_lo, a_star, val_star, valprime): # interpolate to find a trial step length between a_lo and # a_hi Need to choose interpolation here. Use cubic # interpolation and then if the result is within delta * # dalpha or outside of the interval bounded by a_lo or a_hi # then use quadratic interpolation, if the result is still too # close, then use bisection dalpha = a_hi-a_lo a = TT.switch( dalpha < zero, a_hi, a_lo) b = TT.switch( dalpha < zero, a_lo, a_hi) # minimizer of cubic interpolant # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi) # # if the result is too close to the end points (or out of the # interval) then use quadratic interpolation with phi_lo, # derphi_lo and phi_hi if the result is stil too close to the # end points (or out of the interval) then use bisection # cubic interpolation cchk = delta1*dalpha a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi, a_rec, phi_rec) # quadric interpolation qchk = delta2*dalpha a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi) cond_q = lazy_or('condq',TT.isnan(a_j_quad), a_j_quad > b-qchk, a_j_quad < a + qchk) a_j_quad = TT.switch(cond_q, a_lo + numpy.asarray(0.5, dtype=theano.config.floatX)*dalpha, a_j_quad) # pick between the two .. 
cond_c = lazy_or('condc',TT.isnan(a_j_cubic), TT.bitwise_or(a_j_cubic > b - cchk, a_j_cubic < a + cchk)) # this lazy if actually decides if we need to run the quadric # interpolation a_j = TT.switch(cond_c, a_j_quad, a_j_cubic) #a_j = ifelse(cond_c, a_j_quad, a_j_cubic) # Check new value of a_j phi_aj = phi(a_j) derphi_aj = derphi(a_j) stop = lazy_and('stop', TT.bitwise_and(phi_aj <= phi0 + c1*a_j*derphi0, phi_aj < phi_lo), abs(derphi_aj) <= -c2*derphi0) cond1 = TT.bitwise_or(phi_aj > phi0 + c1*a_j*derphi0, phi_aj >= phi_lo) cond2 = derphi_aj*(a_hi - a_lo) >= zero # Switches just make more sense here because they have a C # implementation and they get composed phi_rec = ifelse( cond1, phi_hi, TT.switch( cond2, phi_hi, phi_lo), name = 'phi_rec') a_rec = ifelse( cond1, a_hi, TT.switch( cond2, a_hi, a_lo), name='a_rec') a_hi = ifelse( cond1, a_j, TT.switch( cond2, a_lo, a_hi), name='a_hi') phi_hi = ifelse( cond1, phi_aj, TT.switch( cond2, phi_lo, phi_hi), name='phi_hi') a_lo = TT.switch(cond1, a_lo, a_j) phi_lo = TT.switch(cond1, phi_lo, phi_aj) derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo') a_star = a_j val_star = phi_aj valprime = ifelse(cond1, nan, TT.switch(cond2, derphi_aj, nan), name='valprime') return ( [ phi_rec, a_rec, a_lo, a_hi, phi_hi, phi_lo, derphi_lo, a_star, val_star, valprime], theano.scan_module.scan_utils.until(stop) ) maxiter = n_iters delta1 = TT.constant(numpy.asarray(0.2, dtype=theano.config.floatX)) # cubic interpolant check delta2 = TT.constant(numpy.asarray(0.1, dtype=theano.config.floatX)) # quadratic interpolant check phi_rec = phi0 a_rec = zero # Initial iteration dalpha = a_hi-a_lo a = TT.switch( dalpha < zero, a_hi, a_lo) b = TT.switch( dalpha < zero, a_lo, a_hi) #a = ifelse(dalpha < 0, a_hi, a_lo) #b = ifelse(dalpha < 0, a_lo, a_hi) # minimizer of cubic interpolant # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi) # # if the result is too close to the end points (or out of the # interval) then use quadratic interpolation with phi_lo, # derphi_lo and phi_hi if the result is stil too close to the # end points (or out of the interval) then use bisection # quadric interpolation qchk = delta2*dalpha a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi) cond_q = lazy_or('mcond_q',TT.isnan(a_j), TT.bitwise_or( a_j > b-qchk, a_j < a + qchk)) a_j = TT.switch(cond_q, a_lo + numpy.asarray(0.5, dtype=theano.config.floatX)*dalpha, a_j) # Check new value of a_j phi_aj = phi(a_j) derphi_aj = derphi(a_j) cond1 = TT.bitwise_or(phi_aj > phi0 + c1*a_j*derphi0, phi_aj >= phi_lo) cond2 = derphi_aj*(a_hi - a_lo) >= zero # Switches just make more sense here because they have a C # implementation and they get composed phi_rec = ifelse( cond1, phi_hi, TT.switch( cond2, phi_hi, phi_lo), name='mphirec') a_rec = ifelse( cond1, a_hi, TT.switch( cond2, a_hi, a_lo), name='marec') a_hi = ifelse( cond1, a_j, TT.switch( cond2, a_lo, a_hi), name='mahi') phi_hi = ifelse( cond1, phi_aj, TT.switch( cond2, phi_lo, phi_hi), name='mphihi') onlyif = lazy_and( 'only_if', TT.bitwise_and(phi_aj <= phi0 + c1*a_j*derphi0, phi_aj < phi_lo), abs(derphi_aj) <= -c2*derphi0) a_lo = TT.switch(cond1, a_lo, a_j) phi_lo = TT.switch(cond1, phi_lo, phi_aj) derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name = 'derphi_lo_main') phi_rec.name = 'phi_rec' a_rec.name = 'a_rec' a_lo.name = 'a_lo' a_hi.name = 'a_hi' phi_hi.name = 'phi_hi' phi_lo.name = 'phi_lo' derphi_lo.name = 'derphi_lo' vderphi_aj = ifelse(cond1, nan, TT.switch(cond2, derphi_aj, nan), name='vderphi_aj') states = [] 
states += [TT.unbroadcast(TT.shape_padleft(phi_rec),0)] states += [TT.unbroadcast(TT.shape_padleft(a_rec),0)] states += [TT.unbroadcast(TT.shape_padleft(a_lo),0)] states += [TT.unbroadcast(TT.shape_padleft(a_hi),0)] states += [TT.unbroadcast(TT.shape_padleft(phi_hi),0)] states += [TT.unbroadcast(TT.shape_padleft(phi_lo),0)] states += [TT.unbroadcast(TT.shape_padleft(derphi_lo),0)] states += [TT.unbroadcast(TT.shape_padleft(zero),0)] states += [TT.unbroadcast(TT.shape_padleft(zero),0)] states += [TT.unbroadcast(TT.shape_padleft(zero),0)] print'while_zoom' outs, updates = scan(while_zoom, states = states, n_steps = maxiter, name = 'while_zoom', mode = mode, profile = profile) print 'done_while' a_star = ifelse(onlyif, a_j , outs[7][0], name='astar') val_star = ifelse(onlyif, phi_aj, outs[8][0], name='valstar') valprime = ifelse(onlyif, vderphi_aj, outs[9][0], name='valprime') ## WARNING !! I ignore updates given by scan which I should not do !!! return a_star, val_star, valprime
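# `_quadmin` (and `_cubicmin`) are imported from elsewhere in this module.
# For reference, a quadratic-interpolation minimizer consistent with how
# `_quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)` is called above would
# look roughly like the sketch below (the same formula as SciPy's line
# search); the real helper may differ in details such as NaN handling.
# Assumes the module-level `numpy` and `theano` imports.
def _quadmin_sketch(a, fa, fpa, b, fb):
    # Minimizer of the quadratic q with q(a) = fa, q'(a) = fpa, q(b) = fb.
    db = b - a
    B = (fb - fa - fpa * db) / (db ** 2)
    return a - fpa / (numpy.asarray(2, dtype=theano.config.floatX) * B)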
def krylov_subspace(compute_Av, bs, old_dir, iters=20, param_shapes=None, profile=0, device='gpu'):
    eps = numpy.float32(1e-20)
    bs = [b / tensor.sqrt((b ** 2).sum() + eps) for b in bs]
    mem_bufs = [tensor.alloc(zero, iters, *param_sh) for param_sh in param_shapes]
    mem_bufs = [tensor.set_subtensor(mem[0], b) for mem, b in zip(mem_bufs, bs)]

    def construct_space(*args):
        vs, updates = compute_Av(*args)
        # Rescale at every step; otherwise, if A is contracting, these vs go
        # quickly to 0 and we lose the direction they represent
        norm = TT.sqrt(sum((v ** 2).sum() for v in vs)) + numpy.float32(1e-20)
        vs = [v / norm for v in vs]
        return vs, updates

    if device == 'gpu':
        mode = gpu_mode
    else:
        mode = cpu_mode
    outs, updates = scan(construct_space,
                         states=mem_bufs,
                         n_steps=iters - 2,
                         name='krylov_space',
                         mode=mode,
                         profile=profile)
    if not isinstance(outs, (list, tuple)):
        outs = [outs]
    outs = [tensor.set_subtensor(out[iters - 1], o)
            for out, o in zip(outs, old_dir)]
    outs = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in outs]
    param_lengths = [numpy.prod(shp) for shp in param_shapes]

    def ortho(idx, *ortho_mats):
        new_ortho_mats = []
        for A, param_length in zip(ortho_mats, param_lengths):
            weight = tensor.dot(
                A[idx + 1:].reshape((iters - idx - 1, param_length)),
                A[idx].reshape((param_length,)))
            A_reshuffle = ['x'] + list(range(A[idx].ndim))
            W_reshuffle = [0] + ['x'] * A[idx].ndim
            to_remove = weight.dimshuffle(*W_reshuffle) * \
                A[idx].dimshuffle(*A_reshuffle)
            new_A = tensor.set_subtensor(A[idx + 1:], A[idx + 1:] - to_remove)
            x_col = new_A[idx + 1]
            x_col = x_col / tensor.sqrt((x_col ** 2).sum() + eps)
            new_A = tensor.set_subtensor(new_A[idx + 1], x_col)
            new_ortho_mats.append(new_A)
        return new_ortho_mats

    rvals, _ = scan(ortho,
                    sequences=tensor.constant(numpy.arange(iters - 1)),
                    states=outs,
                    n_steps=iters - 1,
                    name='ortho',
                    profile=profile,
                    mode=mode)
    if not isinstance(rvals, (list, tuple)):
        rvals = [rvals]
    rvals = [rval[0] * .1 for rval in rvals]
    return rvals, updates
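# NumPy illustration (not used by the Theano code above) of what the `ortho`
# scan does to each per-parameter buffer: a sequential Gram-Schmidt pass that
# removes the component of every later row along the current row and then
# renormalizes the next row.  Row 0 is assumed to be unit-norm already (as
# `bs` is normalized above), leaving an approximately orthonormal basis.
import numpy

def gram_schmidt_rows(A, eps=1e-20):
    A = numpy.array(A, dtype='float32', copy=True)
    iters = A.shape[0]
    flat = A.reshape(iters, -1)
    for idx in range(iters - 1):
        # Project rows idx+1: onto row idx and subtract the projection.
        weight = flat[idx + 1:].dot(flat[idx])
        flat[idx + 1:] -= weight[:, None] * flat[idx][None, :]
        # Renormalize the next row; it becomes the next basis vector.
        flat[idx + 1] /= numpy.sqrt((flat[idx + 1] ** 2).sum() + eps)
    return flat.reshape(A.shape)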
def scalar_search_wolfe2(phi, derphi, phi0=None, old_phi0=None, derphi0=None, n_iters=20, c1=1e-4, c2=0.9, profile=False): """ Find alpha that satisfies strong Wolfe conditions. alpha > 0 is assumed to be a descent direction. Parameters ---------- phi : callable f(x) Objective scalar function. derphi : callable f'(x) Objective function derivative (can be None) phi0 : float, optional Value of phi at s=0 old_phi0 : float, optional Value of phi at previous point derphi0 : float, optional Value of derphi at s=0 c1 : float Parameter for Armijo condition rule. c2 : float Parameter for curvature condition rule. profile : flag (boolean) True if you want printouts of profiling information Returns ------- alpha_star : float Best alpha phi_star: WRITEME phi at alpha_star phi0: WRITEME phi at 0 derphi_star: WRITEME derphi at alpha_star Notes ----- Uses the line search algorithm to enforce strong Wolfe conditions. See Wright and Nocedal, 'Numerical Optimization', 1999, pg. 59-60. For the zoom phase it uses an algorithm by [...]. """ if phi0 is None: phi0 = phi(zero) else: phi0 = phi0 if derphi0 is None and derphi is not None: derphi0 = derphi(zero) else: derphi0 = derphi0 alpha0 = zero alpha0.name = 'alpha0' if old_phi0 is not None: alpha1 = TT.minimum(one, numpy.asarray(1.01, dtype=theano.config.floatX) * numpy.asarray(2, dtype=theano.config.floatX) * \ (phi0 - old_phi0) / derphi0) else: old_phi0 = nan alpha1 = one alpha1 = TT.switch(alpha1 < zero, one, alpha1) alpha1.name = 'alpha1' # This shouldn't happen. Perhaps the increment has slipped below # machine precision? For now, set the return variables skip the # useless while loop, and raise warnflag=2 due to possible imprecision. phi0 = TT.switch(TT.eq(alpha1, zero), old_phi0, phi0) # I need a lazyif for alpha1 == 0 !!! 
phi_a1 = ifelse(TT.eq(alpha1, zero), phi0, phi(alpha1), name='phi_a1') phi_a1.name = 'phi_a1' phi_a0 = phi0 phi_a0.name = 'phi_a0' derphi_a0 = derphi0 derphi_a0.name = 'derphi_a0' # Make sure variables are tensors otherwise strange things happen c1 = TT.as_tensor_variable(c1) c2 = TT.as_tensor_variable(c2) maxiter = n_iters def while_search(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, i_t, alpha_star, phi_star, derphi_star): derphi_a1 = derphi(alpha1) cond1 = TT.bitwise_or(phi_a1 > phi0 + c1 * alpha1 * derphi0, TT.bitwise_and(phi_a1 >= phi_a0, i_t > zero)) cond2 = abs(derphi_a1) <= -c2 * derphi0 cond3 = derphi_a1 >= zero alpha_star_c1, phi_star_c1, derphi_star_c1 = \ _zoom(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, phi, derphi, phi0, derphi0, c1, c2, profile=profile) alpha_star_c3, phi_star_c3, derphi_star_c3 = \ _zoom(alpha1, alpha0, phi_a1, phi_a0, derphi_a1, phi, derphi, phi0, derphi0, c1, c2, profile=profile) nw_alpha1 = alpha1 * numpy.asarray(2, dtype=theano.config.floatX) nw_phi = phi(nw_alpha1) alpha_star, phi_star, derphi_star = \ ifelse(cond1, (alpha_star_c1, phi_star_c1, derphi_star_c1), ifelse(cond2, (alpha1, phi_a1, derphi_a1), ifelse(cond3, (alpha_star_c3, phi_star_c3, derphi_star_c3), (nw_alpha1, nw_phi, nan), name='alphastar_c3'), name='alphastar_c2'), name='alphastar_c1') return ([alpha1, nw_alpha1, phi_a1, ifelse(lazy_or('allconds', cond1, cond2, cond3), phi_a1, nw_phi, name='nwphi1'), ifelse(cond1, derphi_a0, derphi_a1, name='derphi'), i_t + one, alpha_star, phi_star, derphi_star], theano.scan_module.scan_utils.until( lazy_or('until_cond_', TT.eq(nw_alpha1, zero), cond1, cond2, cond3))) states = [] states += [TT.unbroadcast(TT.shape_padleft(alpha0), 0)] states += [TT.unbroadcast(TT.shape_padleft(alpha1), 0)] states += [TT.unbroadcast(TT.shape_padleft(phi_a0), 0)] states += [TT.unbroadcast(TT.shape_padleft(phi_a1), 0)] states += [TT.unbroadcast(TT.shape_padleft(derphi_a0), 0)] # i_t states += [TT.unbroadcast(TT.shape_padleft(zero), 0)] # alpha_star states += [TT.unbroadcast(TT.shape_padleft(zero), 0)] # phi_star states += [TT.unbroadcast(TT.shape_padleft(zero), 0)] # derphi_star states += [TT.unbroadcast(TT.shape_padleft(zero), 0)] # print 'while_search' outs, updates = scan(while_search, states=states, n_steps=maxiter, name='while_search', mode=theano.Mode(linker='cvm_nogc'), profile=profile) # print 'done_while_search' out3 = outs[-3][0] out2 = outs[-2][0] out1 = outs[-1][0] alpha_star, phi_star, derphi_star = \ ifelse(TT.eq(alpha1, zero), (nan, phi0, nan), (out3, out2, out1), name='main_alphastar') return alpha_star, phi_star, phi0, derphi_star
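# Quick numeric illustration (plain Python, independent of the Theano graph
# built above) of the strong Wolfe conditions this search enforces, assuming
# a descent direction (derphi0 < 0):
#   phi(a) <= phi(0) + c1 * a * phi'(0)      (sufficient decrease)
#   |phi'(a)| <= c2 * |phi'(0)|              (curvature)
def satisfies_strong_wolfe(phi, derphi, a, c1=1e-4, c2=0.9):
    phi0, derphi0 = phi(0.0), derphi(0.0)
    return (phi(a) <= phi0 + c1 * a * derphi0 and
            abs(derphi(a)) <= c2 * abs(derphi0))

# Example: phi(a) = (a - 1)^2 along a descent direction; a = 1 is the exact
# minimizer and satisfies both conditions, while a = 3 violates them.
phi = lambda a: (a - 1.0) ** 2
derphi = lambda a: 2.0 * (a - 1.0)
assert satisfies_strong_wolfe(phi, derphi, 1.0)
assert not satisfies_strong_wolfe(phi, derphi, 3.0)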
def compute_Gv(*args): cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX), name ='cgv%d'%idx) for idx, shp in enumerate(model.params_shape)] print_mem('allocated mem for cgv') idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict(zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [x for x in model.params if x in theano.gof.graph.inputs([nw_out])] loc_args = [x for x, y in zip(cgv, model.params) if y in theano.gof.graph.inputs([nw_out])] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const(options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params)] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=gpu_mode, name='Gv_step', profile=options['profile']) final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]] grad_inps = zip(loc_inputs, shared_data) loc_fn = theano.function([], final_Gvs, updates = updates, givens = dict(grad_inps), on_unused_input='warn', mode=gpu_mode, name='loc_fn', profile = options['profile']) fake_op = FakeGPUShell(cgv, loc_fn, len(cgv)) return fake_op(*args), {}
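# Note (reading inferred from the code; the original comments do not spell it
# out): the Gv products built in the compute_Gv variants above implement a
# Gauss-Newton / Fisher metric-vector product accumulated over chunks of
# size `cbs`:
#
#     G v = (1/N) * sum_n  J_n^T  M_n  J_n  v,
#
# where J_n is the Jacobian of the network output for sample n with respect
# to the parameters (the Rop/Lop pair) and M_n is the output-space metric.
# For the cases handled above:
#   * softmax output p_n with cross-entropy: M_n = diag(1 / p_n), hence the
#     division by const(cbs) * nw_out;
#   * sigmoid output p_n with binary cross-entropy:
#     M_n = diag(1 / (p_n * (1 - p_n))), hence the division by
#     const(cbs) * nw_out * (1 - nw_out);
#   * any other output is treated as a plain scaled Gauss-Newton product,
#     dividing only by const(cbs).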
def __init__(self, options, channel, data, model): """ Parameters: options: Dictionary `options` is expected to contain the following keys: `cbs` -> int Number of samples to consider at a time when computing some property of the model `gbs` -> int Number of samples over which to compute the gradients `mbs` -> int Number of samples over which to compute the metric `ebs` -> int Number of samples over which to evaluate the training error `mreg` -> float Regularization added to the metric `mrtol` -> float Relative tolerance for inverting the metric `miters` -> int Number of iterations `seed` -> int Random number generator seed `profile` -> bool Flag, if profiling should be on or not `verbose` -> int Verbosity level `lr` -> float Learning rate channel: jobman channel or None data: dictionary-like object return by numpy.load containing the data model : model """ n_params = len(model.params) self.data = data if options['device'] != 'gpu': xdata = theano.shared(data['train_x'][:options['gbs']], name='xdata') ydata = TT._shared(data['train_y'][:options['gbs']], name='ydata') self.xdata = xdata self.ydata = ydata shared_data = [xdata, ydata] else: self.cpu_shared_data = [] xdata = theano.shared(data['train_x'], name='xdata') ydata = TT._shared(data['train_y'], name='ydata') self.xdata = xdata self.ydata = ydata shared_data = [xdata, ydata] self.rng = numpy.random.RandomState(options['seed']) n_samples = data['train_x'].shape[0] self.grad_batches = n_samples // options['gbs'] self.metric_batches = n_samples // options['mbs'] self.eval_batches = n_samples // options['ebs'] self.verbose = options['verbose'] if options['device'] != 'gpu': # Store eucledian gradients self.gs = [TT._shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape] # Store riemannian gradients self.rs = [TT._shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape] else: # Store eucledian gradients self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape] # Store riemannian gradients self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape] self.permg = self.rng.permutation(self.grad_batches) self.permr = self.rng.permutation(self.metric_batches) self.perme = self.rng.permutation(self.eval_batches) self.k = 0 self.posg = 0 self.posr = 0 self.pose = 0 # Step 1. 
Compile function for computing eucledian gradients # inputs gbdx = TT.iscalar('grad_batch_idx') print 'Constructing grad function' srng = RandomStreams(numpy.random.randint(1e5)) loc_inputs = [x.type() for x in model.inputs] def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)] return [args[0] + const(1)] + \ nw_gs ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0) for shp in model.params_shape] idx0 = TT.unbroadcast(const([0]),0) n_steps = options['gbs'] // options['cbs'] rvals, updates = scan(grad_step, states=[idx0] + ig, n_steps=n_steps, name='grad_loop', profile=options['profile']) nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]] # updates updates.update(dict(zip(self.gs, nw_gs))) # givens if options['device'] == 'gpu': grad_inps = [(x, y[gbdx*options['gbs']:(gbdx+1)*options['gbs']]) for x,y in zip(loc_inputs, shared_data)] else: grad_inps = zip(loc_inputs, shared_data) print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gbdx], [], updates=updates, givens=dict(grad_inps), name='compute_eucledian_gradients', mode=gpu_mode, on_unused_input='warn', profile=options['profile']) # Step 2. Compile function for Computing Riemannian gradients rbdx = TT.iscalar('riemmanian_batch_idx') rbpos = rbdx * options['mbs'] if options['device'] == 'gpu': mode=gpu_mode def compute_Gv(*args): idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict(zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [x for x in model.params if x in theano.gof.graph.inputs([nw_out])] loc_args = [x for x, y in zip(args, model.params) if y in theano.gof.graph.inputs([nw_out])] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const(options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params)] return [gv_args[0] + const(1)] + Gvs nw_cost, nw_preactiv_out = safe_clone([model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop(nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, args)) Gvs = [ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=theano.Mode(linker='cvm'), name='Gv_step', profile=options['profile']) final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]] return final_Gvs, updates else: mode = cpu_mode def compute_Gv(*args): cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX), name ='cgv%d'%idx) for idx, shp in enumerate(model.params_shape)] print_mem('allocated 
mem for cgv') idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict(zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [x for x in model.params if x in theano.gof.graph.inputs([nw_out])] loc_args = [x for x, y in zip(cgv, model.params) if y in theano.gof.graph.inputs([nw_out])] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const(options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params)] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=gpu_mode, name='Gv_step', profile=options['profile']) final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]] grad_inps = zip(loc_inputs, shared_data) loc_fn = theano.function([], final_Gvs, updates = updates, givens = dict(grad_inps), on_unused_input='warn', mode=gpu_mode, name='loc_fn', profile = options['profile']) fake_op = FakeGPUShell(cgv, loc_fn, len(cgv)) return fake_op(*args), {} print 'Constructing riemannian gradient function' norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs)) rvals = minres.minres( compute_Gv, [x / norm_grads for x in self.gs], rtol=options['mrtol'], shift= -options['mreg'], maxit=options['miters'], mode=mode, profile=options['profile']) nw_rs = [x * norm_grads for x in rvals[0]] flag = rvals[1] niters = rvals[2] rel_residual = rvals[3] rel_Aresidual = rvals[4] Anorm = rvals[5] Acond = rvals[6] xnorm = rvals[7] Axnorm = rvals[8] updates = rvals[9] norm_ord0 = TT.max(abs(nw_rs[0])) for r in nw_rs[1:]: norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r))) updates.update(dict(zip(self.rs, nw_rs))) grad_inps = [(x, y[rbdx * options['mbs']: (rbdx + 1) * options['mbs']]) for x,y in zip(loc_inputs[:1], shared_data[:1])] print 'Compiling riemannian gradient function' self.compute_riemannian_gradients = theano.function( [rbdx], [flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm, Axnorm, norm_grads, norm_ord0], updates=updates, givens=dict(grad_inps), name='compute_riemannian_gradients', on_unused_input='warn', mode=mode, profile=options['profile']) # Step 3. 
Compile function for evaluating cost and updating # parameters print 'constructing evaluation function' lr = TT.scalar('lr') self.lr = numpy.float32(options['lr']) ebdx = TT.iscalar('eval_batch_idx') nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)] def cost_step(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps)) nw_cost = safe_clone(model.train_cost, replace=replace) return [_idx + const(1), acc + nw_cost] acc0 = const([0]) idx0 = const([0]) n_steps = options['ebs'] // options['cbs'] rvals, updates = scan(cost_step, states=[idx0, acc0], n_steps=n_steps, name='cost_loop', mode=gpu_mode, profile=options['profile']) final_cost = rvals[1] / const(n_steps) if options['device'] == 'gpu': grad_inps = [(x, y[ebdx * options['ebs']: (ebdx + 1) * options['ebs']]) for x,y in zip(loc_inputs, shared_data)] else: grad_inps = zip(loc_inputs, shared_data) print 'compling evaluation function' self.eval_fn = theano.function( [ebdx, lr], final_cost, givens=dict(grad_inps), on_unused_input='warn', updates = updates, name='eval_fn', mode=gpu_mode, profile=options['profile']) update_dict = dict(zip(model.params, nw_ps)) if options['device'] != 'gpu': update_dict.update(dict(zip(model.cparams, nw_ps))) self.update_params = theano.function( [lr], [], updates=update_dict, name='update_params', on_unused_input='warn', mode=mode, profile=options['profile']) self.options = options self.old_cost = 1e6 self.device = options['device'] n_steps = options['ebs'] // options['cbs'] def ls_error(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = TT.cast(safe_clone( model.err, replace=replace), 'float32') return [_idx + const(1), acc + nw_cost] states = [TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0]))] rvals, _ = scan(ls_error, states = states, n_steps = n_steps, name='ls_err_step', mode=cpu_mode, profile = options['profile']) ferr = rvals[1][0] / const(n_steps) self.compute_error = theano.function([ebdx], ferr, givens=dict(grad_inps), name='compute_err', mode=gpu_mode, on_unused_input='warn', profile=options['profile'])
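# Hypothetical outer loop for the class compiled above (not part of the
# original source): it strings together the compiled functions in the order
# Step 1 -> Step 2 -> Step 3 and accepts a step whenever the evaluated cost
# improves.  The real trainer's batch bookkeeping and accept/reject logic
# are omitted.
def sketch_epoch(trainer, lr=1e-2):
    stats = None
    for _ in range(trainer.grad_batches):
        gbdx = trainer.permg[trainer.posg]
        rbdx = trainer.permr[trainer.posr]
        ebdx = trainer.perme[trainer.pose]
        trainer.compute_eucledian_gradients(gbdx)
        stats = trainer.compute_riemannian_gradients(rbdx)
        new_cost = trainer.eval_fn(ebdx, lr)
        if new_cost < trainer.old_cost:
            trainer.update_params(lr)
            trainer.old_cost = new_cost
        trainer.posg = (trainer.posg + 1) % trainer.grad_batches
        trainer.posr = (trainer.posr + 1) % trainer.metric_batches
        trainer.pose = (trainer.pose + 1) % trainer.eval_batches
    return stats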
def linear_cg(compute_Ax, b, M=None, xinit=None, rtol=1e-16, maxiter=100000, damp=0.0, floatX=None): """ Solves the system A x[i] = b[i], for all i. When used as part of a Newton-CG method, b is a list of gradients, where each element of this list represents a gradient for a given parameter type (i.e. weight or bias of a given layer). This method will return a list whose elements approximates A^{-1} b[i], with the precision determined by maxiter or the specified tolerance level. This particular version implements the Polyak-Ribiere flavor of CG. Parameters: :param compute_Ax: python function which symbolically computes the matrix-vector product. :param b: list of T.vector, corresponding to A x[i] = b[i] :param M: list of T.vector (same length as b). Each element is used to precondition its corresponding element of the A-diagonal. If [Mi for Mi in M] contains the diagonal elements of A, this will implement Jacobi preconditioning. :param xinit: list of T.vector (same length as b). x[i] is initial guess for A^{-1} b[i]. :param rtol: float. CG will stop when the norm of the residual error < rtol. :param maxiter: int. Maximum allowable iterations for CG. :param damp: float. Damping factor, equivalent to adding a term along the diagonal of A. :param floatX: 'float32' or 'float64'. Return values: rval[0]: niter, number of iterations run by CG rval[1]: residual error norm. rval[2+i]: approximate value for G^-1 b[i]. Reference: http://en.wikipedia.org/wiki/Conjugate_gradient_method """ n_params = len(b) def loop(niter, rkp_norm, *args): pk = args[:n_params] rk = args[n_params : 2 * n_params] zk = args[2 * n_params : 3 * n_params] xk = args[-n_params:] A_pk_temp = compute_Ax(*pk) A_pk = [A_pk_temp_ + damp * pk_ for A_pk_temp_, pk_ in zip(A_pk_temp, pk)] alphak_num = sum((rk_ * zk_).sum() for rk_, zk_ in zip(rk, zk)) alphak_denum = sum((A_pk_ * pk_).sum() for A_pk_, pk_ in zip(A_pk, pk)) alphak = alphak_num / alphak_denum xkp1 = [xk_ + alphak * pk_ for xk_, pk_ in zip(xk, pk)] rkp1 = [rk_ - alphak * A_pk_ for rk_, A_pk_, in zip(rk, A_pk)] if M: zkp1 = [rkp1_ / m_ for rkp1_, m_ in zip(rkp1, M)] else: zkp1 = rkp1 # compute beta_k using Polak-Ribiere betak_num = sum((zkp1_ * (rkp1_ - rk_)).sum() for rkp1_, rk_, zkp1_ in zip(rkp1, rk, zkp1)) betak_denum = alphak_num betak = betak_num / betak_denum pkp1 = [zkp1_ + betak * pk_ for zkp1_, pk_ in zip(zkp1, pk)] # compute termination critera rkp1_norm = sum((rkp1_ ** 2).sum() for rkp1_ in rkp1) return [niter + 1, rkp1_norm] + pkp1 + rkp1 + zkp1 + xkp1, theano.scan_module.until(abs(rkp1_norm) < rtol) # Initialize residual based on xinit if xinit is None: r0_temp = b x0 = [tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(b_))) for b_ in b] else: init_Ax = compute_Ax(*xinit) r0_temp = [b[i] - init_Ax[i] for i in xrange(len(b))] x0 = [tensor.unbroadcast(tensor.shape_padleft(xinit_)) for xinit_ in xinit] # Leftpad r0, z0 and p0 for scan. r0 = [tensor.unbroadcast(tensor.shape_padleft(r0_temp_)) for r0_temp_ in r0_temp] if M: z0 = [tensor.unbroadcast(tensor.shape_padleft(r0_temp_ / m_)) for r0_temp_, m_ in zip(r0_temp, M)] else: z0 = r0 p0 = z0 states = [] # 0 niter states.append(tensor.constant(npy_floatX([0]))) # 1 residual error norm states.append(tensor.constant(npy_floatX([0]))) outs, updates = scan( loop, states=states + p0 + r0 + z0 + x0, n_steps=maxiter, mode=theano.Mode(linker="c|py"), name="linear_conjugate_gradient", profile=0, ) sol = [x[0] for x in outs[-n_params:]] niter = outs[0][0] rerr = outs[1][0] return [sol, niter, rerr]
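# Plain NumPy version of the recurrences used in `loop` above (single
# right-hand side, no Theano), mainly useful as a reference when checking
# the symbolic implementation on a small SPD system.
import numpy

def linear_cg_numpy(A, b, M=None, rtol=1e-16, maxiter=1000, damp=0.0):
    x = numpy.zeros_like(b)
    r = b.copy()                           # residual b - A x  (x = 0)
    z = r / M if M is not None else r      # preconditioned residual
    p = z.copy()
    niter = 0
    for niter in range(maxiter):
        Ap = A.dot(p) + damp * p
        alpha = r.dot(z) / p.dot(Ap)
        x = x + alpha * p
        r_new = r - alpha * Ap
        z_new = r_new / M if M is not None else r_new
        # Polak-Ribiere style update, as in the symbolic code above.
        beta = z_new.dot(r_new - r) / r.dot(z)
        p = z_new + beta * p
        r, z = r_new, z_new
        if (r ** 2).sum() < rtol:
            break
    return x, niter + 1, (r ** 2).sum()

# Example: A = diag(1..5), b = ones; CG recovers A^{-1} b in a few steps.
A = numpy.diag(numpy.arange(1.0, 6.0))
x, niter, err = linear_cg_numpy(A, numpy.ones(5))
assert numpy.allclose(A.dot(x), numpy.ones(5))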
def __init__(self, options, channel, data, model): """ Parameters: options: Dictionary `options` is expected to contain the following keys: `cbs` -> int Number of samples to consider at a time when computing some property of the model `gbs` -> int Number of samples over which to compute the gradients `mbs` -> int Number of samples over which to compute the krylov subspace `ebs` -> int Number of samples over which to evaluate the training error `seed` -> int Random number generator seed `profile` -> bool Flag, if profiling should be on or not `verbose` -> int Verbosity level `lbfgsIters' -> int `krylovDim` -> int channel: jobman channel or None data: dictionary-like object return by numpy.load containing the data model : model """ n_params = len(model.params) self.data = data xdata = theano.shared(data['train_x'], name='xdata') ydata = theano.shared(data['train_y'], name='ydata') self.xdata = xdata self.ydata = ydata shared_data = [xdata, ydata] self.rng = numpy.random.RandomState(options['seed']) n_samples = data['train_x'].shape[0] self.grad_batches = n_samples // options['gbs'] self.metric_batches = n_samples // options['mbs'] self.eval_batches = n_samples // options['ebs'] self.verbose = options['verbose'] rng = numpy.random.RandomState(options['seed']) self.rng = rng self.options = options self.channel = channel self.model = model n_dimensions = options['krylovDim'] self.n_dimensions = n_dimensions if options['device']=='gpu': cfn_subspaces = \ [theano.shared(numpy.zeros( (n_dimensions,) + shp, dtype='float32'), name='cfn{%s|%d}' % (str(param.name), i)) for i, (shp, param) in enumerate(zip(model.params_shape, model.params))] old_deltas = \ [theano.shared(numpy.zeros(shp, dtype='float32'), name='delta{%s|%d}' % (str(param.name), i)) for i, (shp, param) in enumerate(zip(model.params_shape, model.params))] self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape] else: cfn_subspaces = \ [TT._shared(numpy.zeros( (n_dimensions,) + shp, dtype='float32'), name='cfn{%s|%d}' % (str(param.name), i)) for i, (shp, param) in enumerate(zip(model.params_shape, model.params))] old_deltas = \ [TT._shared(numpy.zeros(shp, dtype='float32'), name='delta{%s|%d}' % (str(param.name), i)) for i, (shp, param) in enumerate(zip(model.params_shape, model.params))] self.gs = [TT._shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape] self.cfn_subspaces = cfn_subspaces self.old_deltas = old_deltas self.permg = self.rng.permutation(self.grad_batches) self.permr = self.rng.permutation(self.metric_batches) self.perme = self.rng.permutation(self.eval_batches) self.k = 0 self.posg = 0 self.posr = 0 self.pose = 0 # Step 1. 
Compile function for computing eucledian gradients print 'Constructing grad function' loc_inputs = [x.type(name='locx') for x in model.inputs] def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)] return [args[0] + const(1)] + \ nw_gs ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0) for shp in model.params_shape] idx0 = TT.unbroadcast(const([0]),0) n_steps = options['gbs'] // options['cbs'] rvals, updates = scan(grad_step, states=[idx0] + ig, n_steps=n_steps, name='grad_loop', mode=gpu_mode, profile=options['profile']) nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]] updates.update(dict(zip(self.gs, nw_gs))) gdx = TT.iscalar('gdx') grad_inps = zip(loc_inputs, [x[gdx*options['gbs']:(gdx+1)*options['gbs']] for x in shared_data]) print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gdx], [], updates=updates, givens=dict(grad_inps), name='compute_eucledian_gradients', mode=gpu_mode, profile=options['profile']) # Step 2. Compile function for Computing Riemannian gradients if options['device'] == 'gpu': mode=gpu_mode def compute_Gv(*args): idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost, nw_preactiv_out = safe_clone([model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop(nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, args)) Gvs = [ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=theano.Mode(linker='cvm'), name='Gv_step', profile=options['profile']) final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]] return final_Gvs, updates else: mode = cpu_mode def compute_Gv(*args): cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX), name ='cgv%d'%idx) for idx, shp in enumerate(model.params_shape)] print_mem('allocated mem for cgv') idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost, nw_preactiv_out = safe_clone([model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop(nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, cgv)) Gvs = [ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=gpu_mode, name='Gv_step', profile=options['profile']) final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]] grad_inps = zip(loc_inputs, shared_data) loc_fn = theano.function([], final_Gvs, updates = updates, givens = dict(grad_inps), on_unused_input='warn', mode=gpu_mode, name='loc_fn', profile = options['profile']) fake_op = FakeGPUShell(cgv, loc_fn, len(cgv)) return fake_op(*args), 
{} rvals, updates = krylov_subspace( compute_Gv, self.gs, old_deltas, n_dimensions, model.params_shape, profile=options['profile'], device=options['device']) gdx = TT.iscalar('gdx') grad_inps = zip(loc_inputs, [x[gdx*options['mbs']:(gdx+1)*options['mbs']] for x in shared_data]) updates.update(dict(zip(cfn_subspaces, rvals))) self.update_krylov_subspace = theano.function( [gdx], [], updates=updates, givens=dict(grad_inps), profile=options['profile'], on_unused_input='warn', name='update_krylov_subspace', mode=mode) alphas = tensor.vector('alphas') deltas = [] nw_params = [] if options['device'] == 'gpu': params = model.params else: params = model.cpu_params for param, subspace in zip(params, cfn_subspaces): alpha_reshuffle = [0] + ['x'] * param.ndim delta = (alphas.dimshuffle(*alpha_reshuffle) * \ subspace).sum(axis=0) nw_param = param + delta nw_params.append(nw_param) deltas.append(delta) print 'constructing evaluation function' ebdx = TT.iscalar('ebdx') updates_dict = dict(zip(model.params + old_deltas, nw_params + deltas)) if options['device'] != 'gpu': updates_dict.update(dict(zip(model.cpu_params, nw_params))) self.update_params = theano.function([alphas], updates = updates_dict, name='update_params', allow_input_downcast=True, mode=mode, profile=options['profile']) n_steps = options['ebs'] // options['cbs'] def ls_cost_step(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs + model.params, nw_inps + nw_params)) nw_cost = safe_clone(model.train_cost, replace=replace) return [_idx + const(1), acc + nw_cost] states = [TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0]))] rvals, _ = scan(ls_cost_step, states = states, n_steps = n_steps, name='ls_cost_step', mode=gpu_mode, profile = options['profile']) fcost = rvals[1][0] / const(n_steps) def ls_grad_step(_idx, gws): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs + model.params, nw_inps + nw_params)) nw_cost = safe_clone(model.train_cost, replace=replace) nw_gs = TT.grad(nw_cost, alphas) return _idx + numpy.float32(1), gws + nw_gs states = [TT.constant(numpy.float32([0])), TT.constant(numpy.zeros((1, n_dimensions),dtype='float32'))] rvals, _ = scan(ls_grad_step, states = states, n_steps = n_steps, name = 'ls_grad_step', mode = gpu_mode, profile=options['profile']) fgrad = rvals[1][0] / const(n_steps) grad_inps = zip(loc_inputs, [x[ebdx*options['ebs']:(ebdx+1)*options['ebs']] for x in shared_data]) self.lbfgs_fn = theano.function([alphas, ebdx], #theano.printing.Print('fcost')(fcost), fcost, givens=grad_inps, allow_input_downcast=True, on_unused_input='warn', name='lbfgs_fn', profile=options['profile'], mode=gpu_mode) self.lbfgs_grad = theano.function([alphas, ebdx], fgrad, givens=grad_inps, on_unused_input='warn', allow_input_downcast=True, name='lbfgs_grad', profile=options['profile'], mode=gpu_mode) n_steps = options['ebs'] // options['cbs'] def ls_error(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = TT.cast(safe_clone( model.err, replace=replace), 'float32') return [_idx + const(1), acc + nw_cost] states = [TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0]))] rvals, _ = scan(ls_error, states = states, n_steps = n_steps, name='ls_err_step', mode=cpu_mode, profile = 
options['profile']) ferr = rvals[1][0] / const(n_steps) self.compute_error = theano.function([], ferr, givens=dict(zip(loc_inputs, shared_data)), name='compute_err', mode=gpu_mode, on_unused_input='warn', profile=options['profile'])
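# Hypothetical use of the functions compiled above (not in the original
# source): one outer step of Krylov-subspace descent.  The subspace is
# refreshed, the `krylovDim` mixing coefficients `alphas` are fitted with
# L-BFGS on an evaluation batch, and the parameters (together with
# `old_deltas`) are then committed through `update_params`.  SciPy is an
# extra dependency here; the exact batching of the real trainer is omitted.
import numpy
from scipy.optimize import fmin_l_bfgs_b

def sketch_krylov_step(trainer, gdx, ebdx):
    trainer.compute_eucledian_gradients(gdx)
    trainer.update_krylov_subspace(gdx)
    alpha0 = numpy.zeros(trainer.n_dimensions, dtype='float64')
    alphas, fval, info = fmin_l_bfgs_b(
        func=lambda a: float(trainer.lbfgs_fn(a, ebdx)),
        x0=alpha0,
        fprime=lambda a: numpy.asarray(trainer.lbfgs_grad(a, ebdx),
                                       dtype='float64'),
        maxfun=trainer.options['lbfgsIters'])
    trainer.update_params(alphas)
    return fval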
def init_gpu(self, options, channel, data, model): # Step 1. Compile function for computing eucledian gradients eps = numpy.float32(1e-24) gbdx = TT.iscalar('grad_batch_idx') n_params = len(self.model.params) print 'Constructing grad function' loc_inputs = [x.type() for x in model.inputs] srng = RandomStreams(numpy.random.randint(1e5)) loc_inputs = [x.type() for x in model.inputs] def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)] # Compute jacobi nw_outs = safe_clone(model.outs, replace=replace) final_results = dict(zip(model.params, [None] * n_params)) for nw_out, out_operator in zip(nw_outs, model.outs_operator): if out_operator == 'sigmoid': denom = numpy.float32(options['cbs']) #denom *= nw_out #denom *= (numpy.float32(1) - nw_out) elif out_operator == 'softmax': denom = numpy.float32(options['cbs']) denom *= (nw_out + eps) else: denom = numpy.float32(options['cbs']) factor = TT.sqrt(numpy.float32(1) / denom) if out_operator == 'sigmoid': tnwout = TT.nnet.sigmoid(nw_out) factor = TT.sqrt(tnwout * (numpy.float32(1) - tnwout)) * factor r = TT.sgn(srng.normal(nw_out.shape)) r = r * factor loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] jvs = TT.Lop(nw_out, loc_params, r) for lp, lj in zip(loc_params, jvs): if final_results[lp] is None: final_results[lp] = TT.sqr(lj) else: final_results[lp] = final_results[lp] + TT.sqr(lj) nw_js = [ oj + final_results[p] for oj, p in zip(args[1 + n_params:1 + 2 * n_params], model.params) ] return [args[0] + const(1)] + nw_gs + nw_js ig = [ TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0) for shp in model.params_shape ] ij = [ TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0) for shp in model.params_shape ] idx0 = TT.unbroadcast(const([0]), 0) n_steps = options['gbs'] // options['cbs'] rvals, updates = scan(grad_step, states=[idx0] + ig + ij, n_steps=n_steps, name='grad_loop', mode=gpu_mode, profile=options['profile']) nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]] nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]] updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js))) grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']]) for x, y in zip(loc_inputs, self.shared_data)] print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gbdx], [], updates=updates, givens=dict(grad_inps), allow_input_downcast=True, name='compute_eucledian_gradients', mode=gpu_mode, profile=options['profile']) # Step 2. 
Compile function for Computing Riemannian gradients rbdx = TT.iscalar('riemmanian_batch_idx') def compute_Gv(*args): idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(args, model.params) if y in theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * (nw_out + eps) elif out_operator == 'sigmoid': factor = const( options['cbs']) # * nw_out * (1 - nw_out) else: factor = const(options['cbs']) if out_operator != 'sigmoid': loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) else: tnwout = TT.nnet.sigmoid(nw_out) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) *\ tnwout * (1 - tnwout)/ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=theano.Mode(linker='cvm'), name='Gv_step', profile=options['profile']) final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]] return final_Gvs, updates print 'Constructing riemannian gradient function' norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs)) self.damping = theano.shared(numpy.float32(options['mreg'])) rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs], Ms=self.js, rtol=options['mrtol'], shift=self.damping, maxit=options['miters'], profile=options['profile']) nw_rs = [x * norm_grads for x in rvals[0]] flag = rvals[1] niters = rvals[2] rel_residual = rvals[3] rel_Aresidual = rvals[4] Anorm = rvals[5] Acond = rvals[6] xnorm = rvals[7] Axnorm = rvals[8] updates = rvals[9] norm_ord0 = TT.max(abs(nw_rs[0])) for r in nw_rs[1:]: norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r))) reset = TT.scalar(dtype='int8', name='reset') norm_kkm1 = sum([(r * g).sum() for r, g in zip(self.rs, self.gs)]) norm_kk = sum([(r * g).sum() for r, g in zip(nw_rs, self.gs)]) norm_dk = sum([(d * g).sum() for d, g in zip(self.ds, self.gs)]) norm_y = norm_kk - 2 * norm_kkm1 + self.norm_km1km1 beta_k = (norm_kk - norm_kkm1)/(norm_dk - self.norm_dkm1) - \ 2 * norm_y * (norm_dk/((norm_dk - self.norm_dkm1) **2)) beta_k = TT.switch(reset, TT.constant(numpy.float32(0.)), beta_k) beta_k = TT.switch(TT.bitwise_or(TT.isnan(beta_k), TT.isinf(beta_k)), TT.constant(numpy.float32(0.)), beta_k) nwds = [-r + beta_k * d for r, d in zip(nw_rs, self.ds)] self.nwds = nwds nw_normd = TT.sqrt(sum([(d*d).sum() for d in nwds])) + \ numpy.float32(1e-25) updates.update(dict(zip(self.rs, nw_rs))) updates.update(dict(zip(self.ds, nwds))) updates[self.norm_km1km1] = norm_kk updates[self.norm_dkm1] = norm_dk updates[self.norm_d] = nw_normd print 'Compiling riemannian gradient function' cst = time.time() grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']]) for x, y in zip(loc_inputs, self.shared_data)] self.compute_riemannian_gradients = 
theano.function( [reset, rbdx], [ flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm, Axnorm, norm_grads, norm_ord0, beta_k ], updates=updates, allow_input_downcast=True, givens=dict(grad_inps), name='compute_riemannian_gradients', mode=cpu_mode, on_unused_input='warn', profile=options['profile']) print 'Time to compile Riemannian', print_time(time.time() - cst) cst = time.time() # Step 3. Compile function for evaluating cost and updating # parameters print 'constructing evaluation function' lr = TT.scalar('lr') newparams = [p + lr * d for p, d in zip(model.params, self.ds)] nw_ds = [-r for r in self.rs] nw_normd = TT.sqrt(sum([(r * r).sum() for r in self.rs])) self.update_params = theano.function([lr], updates=dict( zip(model.params, newparams)), name='update_params', on_unused_input='warn', allow_input_downcast=True, mode=gpu_mode, profile=options['profile']) self.reset_directions = theano.function( [], updates=dict(zip(self.ds + [self.norm_d], nw_ds + [nw_normd])), name='reset_dirs', on_unused_input='warn', mode=cpu_mode, allow_input_downcast=True, profile=options['profile']) n_steps = options['ebs'] // options['cbs'] def ls_cost_step(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict( zip(model.inputs + model.params, nw_inps + newparams)) nw_cost = safe_clone(model.train_cost, replace=replace) return [_idx + const(1), acc + nw_cost] states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])) ] rvals, _ = scan(ls_cost_step, states=states, n_steps=n_steps, name='ls_cost_step', profile=options['profile']) fcost = rvals[1][0] / const(n_steps) def ls_grad_step(_idx, gws): idx = TT.cast(_idx, 'int32') nw_inps = [ x[idx * options['cbs']:(idx + 1) * options['cbs']] for x in loc_inputs ] replace = dict( zip(model.inputs + model.params, nw_inps + newparams)) nw_cost = safe_clone(model.train_cost, replace=replace) nw_gs = TT.grad(nw_cost, lr) return _idx + numpy.float32(1), gws + nw_gs states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])) ] rvals, _ = scan(ls_grad_step, states=states, n_steps=n_steps, name='ls_grad_step', profile=options['profile']) fgrad = rvals[1][0] / const(n_steps) ebdx = TT.iscalar('ebdx') grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']]) for x, y in zip(loc_inputs, self.shared_data)] self.ls_cost_fn = theano.function([lr, ebdx], fcost, givens=grad_inps, allow_input_downcast=True, name='ls_cost_fn', mode=gpu_mode, profile=options['profile']) self.approx_change = theano.function( [lr], -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.ds)]), allow_input_downcast=True, name='approx_change', mode=gpu_mode, profile=options['profile']) self.ls_grad_fn = theano.function([lr, ebdx], fgrad, allow_input_downcast=True, givens=grad_inps, name='ls_grad_fn', mode=gpu_mode, profile=options['profile']) self.old_score = 50000 n_steps = options['ebs'] // options['cbs'] def ls_error(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = TT.cast(safe_clone(model.err, replace=replace), 'float32') return [_idx + const(1), acc + nw_cost] states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])) ] rvals, _ = scan(ls_error, states=states, n_steps=n_steps, name='ls_err_step', mode=gpu_mode, profile=options['profile']) ferr = rvals[1][0] / const(n_steps) self.compute_error = 
theano.function([ebdx], ferr, givens=dict(grad_inps), name='compute_err', mode=cpu_mode, allow_input_downcast=True, on_unused_input='warn', profile=options['profile']) print 'Compile eval time', print_time(time.time() - cst) self.old_cost = 1e6 self.options = options self.perm = self.rng.permutation(4) self.pos = 0
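For reference, the `compute_Gv` routine above forms the metric-vector product without ever materializing the metric: `TT.Rop` gives the Jacobian-vector product Jv, it is rescaled by the per-output factor, and `TT.Lop` applies the transposed Jacobian. A minimal NumPy sketch of the same composition for a single softmax layer is given below; the dense Jacobian `J`, the toy shapes and the helper name `metric_times_vector` are illustrative assumptions, not part of the original code.

import numpy as np

def metric_times_vector(J, p, v, cbs):
    # J   : (n_out, n_params) Jacobian of the softmax outputs w.r.t. the parameters
    # p   : (n_out,) softmax outputs (the `nw_out` of the Theano graph)
    # v   : (n_params,) vector to multiply with the metric
    # cbs : chunk size used for averaging, as in options['cbs']
    Jv = J.dot(v)                # R-op: Jacobian times vector
    scaled = Jv / (cbs * p)      # divide by the softmax `factor`
    return J.T.dot(scaled)       # L-op: transposed Jacobian times vector

rng = np.random.RandomState(0)
J = rng.randn(5, 7)
p = rng.rand(5) + 1e-3
p /= p.sum()
v = rng.randn(7)
Gv = metric_times_vector(J, p, v, cbs=4)   # shape (7,)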
def krylov_subspace(compute_Av, bs, old_dir, iters=20, param_shapes=None, profile=0, device='gpu'): eps = numpy.float32(1e-20) bs = [b / tensor.sqrt((b ** 2).sum()+eps) for b in bs] mem_bufs = [tensor.alloc(zero, iters, *param_sh) for param_sh in param_shapes] mem_bufs = [tensor.set_subtensor(mem[0], b) for mem, b in zip(mem_bufs, bs)] def construct_space(*args): vs, updates = compute_Av(*args) # We need to rescale at every step, otherwise if A is contracting these # vs quickly go to 0 and we lose the direction they represent norm = TT.sqrt(sum((v**2).sum() for v in vs)) + numpy.float32(1e-20) vs = [v / norm for v in vs] return vs, updates if device == 'gpu': mode = gpu_mode else: mode = cpu_mode outs, updates = scan(construct_space, states=mem_bufs, n_steps=iters - 2, name='krylov_space', mode=mode, profile=profile) if not isinstance(outs, (list, tuple)): outs = [outs] outs = [tensor.set_subtensor(out[iters - 1], o) for out, o in zip(outs, old_dir)] outs = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in outs] param_lengths = [numpy.prod(shp) for shp in param_shapes] def ortho(idx, *ortho_mats): new_ortho_mats = [] for A, param_length in zip(ortho_mats, param_lengths): weight = tensor.dot(A[idx + 1:].reshape( (iters - idx - 1, param_length)), A[idx].reshape((param_length,))) A_reshuffle = ['x'] + list(range(A[idx].ndim)) W_reshuffle = [0] + ['x'] * A[idx].ndim to_remove = weight.dimshuffle(*W_reshuffle) *\ A[idx].dimshuffle(*A_reshuffle) new_A = tensor.set_subtensor(A[idx + 1:], A[idx + 1:] - to_remove) x_col = new_A[idx + 1] x_col = x_col / tensor.sqrt((x_col ** 2).sum()+eps) new_A = tensor.set_subtensor(new_A[idx + 1], x_col) new_ortho_mats.append(new_A) return new_ortho_mats rvals, _ = scan(ortho, sequences=tensor.constant(numpy.arange(iters - 1)), states=outs, n_steps=iters - 1, name='ortho', profile=profile, mode=mode) if not isinstance(rvals, (list, tuple)): rvals = [rvals] rvals = [rval[0]*.1 for rval in rvals] return rvals, updates
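The scan-based `krylov_subspace` above builds `iters` directions by repeatedly applying `compute_Av`, renormalizing each product so the directions do not vanish, appending the previous search direction as the last row, and finally orthonormalizing with a Gram-Schmidt pass. A dense, single-parameter NumPy sketch of that recipe (the assumed shapes and the name `krylov_subspace_dense` are illustrative) is:

import numpy as np

def krylov_subspace_dense(A, b, old_dir, iters=20, eps=1e-20):
    # Rows of K span {b, Ab, A^2 b, ..., old_dir}, orthonormalized.
    n = b.shape[0]
    K = np.zeros((iters, n))
    K[0] = b / np.sqrt((b ** 2).sum() + eps)
    for i in range(1, iters - 1):
        v = A.dot(K[i - 1])
        K[i] = v / (np.sqrt((v ** 2).sum()) + eps)   # rescale at every step
    K[iters - 1] = old_dir                           # keep the previous direction
    # Gram-Schmidt: remove the component along K[idx] from all later rows
    for idx in range(iters - 1):
        w = K[idx + 1:].dot(K[idx])
        K[idx + 1:] -= w[:, None] * K[idx][None, :]
        K[idx + 1] /= np.sqrt((K[idx + 1] ** 2).sum() + eps)
    return K

A = np.diag(np.linspace(1., 5., 8))
K = krylov_subspace_dense(A, np.ones(8), old_dir=np.arange(8.), iters=5)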
def jobman(state, channel): # load dataset state['null_sym_source'] = 15000 state['null_sym_target'] = 15000 state['n_sym_source'] = state['null_sym_source'] + 1 state['n_sym_target'] = state['null_sym_target'] + 1 state['nouts'] = state['n_sym_target'] state['nins'] = state['n_sym_source'] rng = numpy.random.RandomState(state['seed']) if state['loopIters'] > 0: train_data, valid_data, test_data = get_data(state) else: train_data = None valid_data = None test_data = None ########### Training graph ##################### ## 1. Inputs if state['bs'] == 1: x = TT.lvector('x') x_mask = TT.vector('x_mask') y = TT.lvector('y') y0 = y y_mask = TT.vector('y_mask') else: x = TT.lmatrix('x') x_mask = TT.matrix('x_mask') y = TT.lmatrix('y') y0 = y y_mask = TT.matrix('y_mask') # 2. Layers and Operators bs = state['bs'] embdim = state['dim_mlp'] # Source Sentence emb = MultiLayer(rng, n_in=state['nins'], n_hids=[state['rank_n_approx']], activation=[state['rank_n_activ']], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb') emb_words = [] if state['rec_gating']: gater_words = [] if state['rec_reseting']: reseter_words = [] for si in xrange(state['encoder_stack']): emb_words.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_words_%d' % si)) if state['rec_gating']: gater_words.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='gater_words_%d' % si)) if state['rec_reseting']: reseter_words.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='reseter_words_%d' % si)) add_rec_step = [] rec_proj = [] if state['rec_gating']: rec_proj_gater = [] if state['rec_reseting']: rec_proj_reseter = [] for si in xrange(state['encoder_stack']): if si > 0: rec_proj.append( MultiLayer(rng, n_in=state['dim'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise'], scale=state['rec_weight_scale'], name='rec_proj_%d' % si)) if state['rec_gating']: rec_proj_gater.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_gater_%d' % si)) if state['rec_reseting']: rec_proj_reseter.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_reseter_%d' % si)) add_rec_step.append( eval(state['rec_layer'])(rng, n_hids=state['dim'], activation=state['activ'], bias_scale=state['bias'], scale=state['rec_weight_scale'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise_rec'], dropout=state['dropout_rec'], gating=state['rec_gating'], gater_activation=state['rec_gater'], reseting=state['rec_reseting'], reseter_activation=state['rec_reseter'], name='add_h_%d' % si)) def _add_op(words_embeddings, words_mask=None, prev_val=None, si=0, state_below=None, gater_below=None, reseter_below=None, 
one_step=False, bs=1, init_state=None, use_noise=True): seqlen = words_embeddings.out.shape[0] // bs rval = words_embeddings gater = None reseter = None if state['rec_gating']: gater = gater_below if state['rec_reseting']: reseter = reseter_below if si > 0: rval += rec_proj[si - 1](state_below, one_step=one_step, use_noise=use_noise) if state['rec_gating']: projg = rec_proj_gater[si - 1](state_below, one_step=one_step, use_noise=use_noise) if gater: gater += projg else: gater = projg if state['rec_reseting']: projg = rec_proj_reseter[si - 1](state_below, one_step=one_step, use_noise=use_noise) if reseter: reseter += projg else: reseter = projg if not one_step: rval = add_rec_step[si](rval, nsteps=seqlen, batch_size=bs, mask=words_mask, gater_below=gater, reseter_below=reseter, one_step=one_step, init_state=init_state, use_noise=use_noise) else: rval = add_rec_step[si](rval, mask=words_mask, state_before=prev_val, gater_below=gater, reseter_below=reseter, one_step=one_step, init_state=init_state, use_noise=use_noise) return rval add_op = Operator(_add_op) # Target Sentence emb_t = MultiLayer(rng, n_in=state['nouts'], n_hids=[state['rank_n_approx']], activation=[state['rank_n_activ']], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_t') emb_words_t = [] if state['rec_gating']: gater_words_t = [] if state['rec_reseting']: reseter_words_t = [] for si in xrange(state['decoder_stack']): emb_words_t.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_words_t_%d' % si)) if state['rec_gating']: gater_words_t.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='gater_words_t_%d' % si)) if state['rec_reseting']: reseter_words_t.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='reseter_words_t_%d' % si)) proj_everything_t = [] if state['rec_gating']: gater_everything_t = [] if state['rec_reseting']: reseter_everything_t = [] for si in xrange(state['decoder_stack']): proj_everything_t.append( MultiLayer(rng, n_in=state['dim'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='proj_everything_t_%d' % si, learn_bias=False)) if state['rec_gating']: gater_everything_t.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='gater_everything_t_%d' % si, learn_bias=False)) if state['rec_reseting']: reseter_everything_t.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='reseter_everything_t_%d' % si, learn_bias=False)) add_rec_step_t = [] rec_proj_t = [] if state['rec_gating']: rec_proj_t_gater = [] if state['rec_reseting']: rec_proj_t_reseter = [] for si in xrange(state['decoder_stack']): if si > 0: rec_proj_t.append( MultiLayer(rng, n_in=state['dim'], n_hids=[embdim], 
activation=['lambda x:x'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise'], scale=state['rec_weight_scale'], name='rec_proj_%d' % si)) if state['rec_gating']: rec_proj_t_gater.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_t_gater_%d' % si)) if state['rec_reseting']: rec_proj_t_reseter.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_t_reseter_%d' % si)) add_rec_step_t.append( eval(state['rec_layer'])(rng, n_hids=state['dim'], activation=state['activ'], bias_scale=state['bias'], scale=state['rec_weight_scale'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise_rec'], dropout=state['dropout_rec'], gating=state['rec_gating'], gater_activation=state['rec_gater'], reseting=state['rec_reseting'], reseter_activation=state['rec_reseter'], name='add_h_t_%d' % si)) if state['encoder_stack'] > 1: encoder_proj = [] for si in xrange(state['encoder_stack']): encoder_proj.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim'] * state['maxout_part']], activation=['lambda x: x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='encoder_proj_%d' % si, learn_bias=(si == 0))) encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']), indim=indim, pieces=pieces, rng=rng) def _add_t_op(words_embeddings, everything=None, words_mask=None, prev_val=None, one_step=False, bs=1, init_state=None, use_noise=True, gater_below=None, reseter_below=None, si=0, state_below=None): seqlen = words_embeddings.out.shape[0] // bs rval = words_embeddings gater = None if state['rec_gating']: gater = gater_below reseter = None if state['rec_reseting']: reseter = reseter_below if si > 0: if isinstance(state_below, list): state_below = state_below[-1] rval += rec_proj_t[si - 1](state_below, one_step=one_step, use_noise=use_noise) if state['rec_gating']: projg = rec_proj_t_gater[si - 1](state_below, one_step=one_step, use_noise=use_noise) if gater: gater += projg else: gater = projg if state['rec_reseting']: projg = rec_proj_t_reseter[si - 1](state_below, one_step=one_step, use_noise=use_noise) if reseter: reseter += projg else: reseter = projg if everything: rval = rval + proj_everything_t[si](everything) if state['rec_gating']: everyg = gater_everything_t[si](everything, one_step=one_step, use_noise=use_noise) if gater: gater += everyg else: gater = everyg if state['rec_reseting']: everyg = reseter_everything_t[si](everything, one_step=one_step, use_noise=use_noise) if reseter: reseter += everyg else: reseter = everyg if not one_step: rval = add_rec_step_t[si](rval, nsteps=seqlen, batch_size=bs, mask=words_mask, one_step=one_step, init_state=init_state, gater_below=gater, reseter_below=reseter, use_noise=use_noise) else: rval = add_rec_step_t[si](rval, mask=words_mask, state_before=prev_val, one_step=one_step, gater_below=gater, reseter_below=reseter, use_noise=use_noise) return rval add_t_op = Operator(_add_t_op) outdim = state['dim_mlp'] if not state['deep_out']: outdim = state['rank_n_approx'] if state['bias_code']: bias_code = [] for si in xrange(state['decoder_stack']): bias_code.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], 
activation=[state['activ']], bias_scale=[state['bias']], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], name='bias_code_%d' % si)) if state['avg_word']: word_code_nin = state['rank_n_approx'] word_code = MultiLayer(rng, n_in=word_code_nin, n_hids=[outdim], activation='lambda x:x', bias_scale=[state['bias_mlp'] / 3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], learn_bias=False, name='word_code') proj_code = MultiLayer(rng, n_in=state['dim'], n_hids=[outdim], activation='lambda x: x', bias_scale=[state['bias_mlp'] / 3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], learn_bias=False, name='proj_code') proj_h = [] for si in xrange(state['decoder_stack']): proj_h.append( MultiLayer(rng, n_in=state['dim'], n_hids=[outdim], activation='lambda x: x', bias_scale=[state['bias_mlp'] / 3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], name='proj_h_%d' % si)) if state['bigram']: proj_word = MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[outdim], activation=['lambda x:x'], bias_scale=[state['bias_mlp'] / 3], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='emb_words_lm') if state['deep_out']: indim = 0 pieces = 0 act_layer = UnaryOp(activation=eval(state['unary_activ'])) drop_layer = DropOp(rng=rng, dropout=state['dropout']) if state['deep_out']: indim = state['dim_mlp'] / state['maxout_part'] rank_n_approx = state['rank_n_approx'] rank_n_activ = state['rank_n_activ'] else: indim = state['rank_n_approx'] rank_n_approx = 0 rank_n_activ = None output_layer = SoftmaxLayer(rng, indim, state['nouts'], state['weight_scale'], -1, rank_n_approx=rank_n_approx, rank_n_activ=rank_n_activ, weight_noise=state['weight_noise'], init_fn=state['weight_init_fn'], name='out') def _pop_op(everything, accum, everything_max=None, everything_min=None, word=None, aword=None, one_step=False, use_noise=True): rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise) for si in xrange(1, state['decoder_stack']): rval += proj_h[si](accum[si], one_step=one_step, use_noise=use_noise) if state['mult_out']: rval = rval * everything else: rval = rval + everything if aword and state['avg_word']: wcode = aword if one_step: if state['mult_out']: rval = rval * wcode else: rval = rval + wcode else: if not isinstance(wcode, TT.TensorVariable): wcode = wcode.out shape = wcode.shape rshape = rval.shape rval = rval.reshape( [rshape[0] / shape[0], shape[0], rshape[1]]) wcode = wcode.dimshuffle('x', 0, 1) if state['mult_out']: rval = rval * wcode else: rval = rval + wcode rval = rval.reshape(rshape) if word and state['bigram']: if one_step: if state['mult_out']: rval *= proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) else: rval += proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) else: if isinstance(word, TT.TensorVariable): shape = word.shape ndim = word.ndim else: shape = word.shape ndim = word.out.ndim pword = proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) shape_pword = pword.shape if ndim == 1: pword = Shift()(pword.reshape([shape[0], 1, outdim])) else: pword = Shift()(pword.reshape([shape[0], shape[1], outdim])) if state['mult_out']: rval *= pword.reshape(shape_pword) else: rval += pword.reshape(shape_pword) if state['deep_out']: rval = 
drop_layer(act_layer(rval), use_noise=use_noise) return rval pop_op = Operator(_pop_op) # 3. Constructing the model gater_below = None if state['rec_gating']: gater_below = gater_words[0](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[0](emb(x)) encoder_acts = [ add_op(emb_words[0](emb(x)), x_mask, bs=x_mask.shape[1], si=0, gater_below=gater_below, reseter_below=reseter_below) ] if state['encoder_stack'] > 1: everything = encoder_proj[0](last(encoder_acts[-1])) for si in xrange(1, state['encoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words[si](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[si](emb(x)) encoder_acts.append( add_op(emb_words[si](emb(x)), x_mask, bs=x_mask.shape[1], si=si, state_below=encoder_acts[-1], gater_below=gater_below, reseter_below=reseter_below)) if state['encoder_stack'] > 1: everything += encoder_proj[si](last(encoder_acts[-1])) if state['encoder_stack'] <= 1: encoder = encoder_acts[-1] everything = LastState(ntimes=True, n=y.shape[0])(encoder) else: everything = encoder_act_layer(everything) everything = everything.reshape( [1, everything.shape[0], everything.shape[1]]) everything = LastState(ntimes=True, n=y.shape[0])(everything) if state['bias_code']: init_state = [bc(everything[-1]) for bc in bias_code] else: init_state = [None for bc in bias_code] if state['avg_word']: shape = x.shape pword = emb(x).out.reshape( [shape[0], shape[1], state['rank_n_approx']]) pword = pword * x_mask.dimshuffle(0, 1, 'x') aword = pword.sum(0) / TT.maximum(1., x_mask.sum(0).dimshuffle(0, 'x')) aword = word_code(aword, use_noise=False) else: aword = None gater_below = None if state['rec_gating']: gater_below = gater_words_t[0](emb_t(y0)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[0](emb_t(y0)) has_said = [ add_t_op(emb_words_t[0](emb_t(y0)), everything, y_mask, bs=y_mask.shape[1], gater_below=gater_below, reseter_below=reseter_below, init_state=init_state[0], si=0) ] for si in xrange(1, state['decoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words_t[si](emb_t(y0)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[si](emb_t(y0)) has_said.append( add_t_op(emb_words_t[si](emb_t(y0)), everything, y_mask, bs=y_mask.shape[1], state_below=has_said[-1], gater_below=gater_below, reseter_below=reseter_below, init_state=init_state[si], si=si)) if has_said[0].out.ndim < 3: for si in xrange(state['decoder_stack']): shape_hs = has_said[si].shape if y0.ndim == 1: shape = y0.shape has_said[si] = Shift()(has_said[si].reshape( [shape[0], 1, state['dim_mlp']])) else: shape = y0.shape has_said[si] = Shift()(has_said[si].reshape( [shape[0], shape[1], state['dim_mlp']])) has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :], init_state[si]) has_said[si] = has_said[si].reshape(shape_hs) else: for si in xrange(state['decoder_stack']): has_said[si] = Shift()(has_said[si]) has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :], init_state[si]) model = pop_op(proj_code(everything), has_said, word=y0, aword=aword) nll = output_layer.train( state_below=model, target=y0, mask=y_mask, reg=None) / TT.cast( y.shape[0] * y.shape[1], 'float32') valid_fn = None noise_fn = None x = TT.lvector(name='x') n_steps = TT.iscalar('nsteps') temp = TT.scalar('temp') gater_below = None if state['rec_gating']: gater_below = gater_words[0](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = 
reseter_words[0](emb(x)) encoder_acts = [ add_op(emb_words[0](emb(x), use_noise=False), si=0, use_noise=False, gater_below=gater_below, reseter_below=reseter_below) ] if state['encoder_stack'] > 1: everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False) for si in xrange(1, state['encoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words[si](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[si](emb(x)) encoder_acts.append( add_op(emb_words[si](emb(x), use_noise=False), si=si, state_below=encoder_acts[-1], use_noise=False, gater_below=gater_below, reseter_below=reseter_below)) if state['encoder_stack'] > 1: everything += encoder_proj[si](last(encoder_acts[-1]), use_noise=False) if state['encoder_stack'] <= 1: encoder = encoder_acts[-1] everything = last(encoder) else: everything = encoder_act_layer(everything) init_state = [] for si in xrange(state['decoder_stack']): if state['bias_code']: init_state.append( TT.reshape(bias_code[si](everything, use_noise=False), [1, state['dim']])) else: init_state.append(TT.alloc(numpy.float32(0), 1, state['dim'])) if state['avg_word']: aword = emb(x, use_noise=False).out.mean(0) aword = word_code(aword, use_noise=False) else: aword = None def sample_fn(*args): aidx = 0 word_tm1 = args[aidx] aidx += 1 prob_tm1 = args[aidx] has_said_tm1 = [] for si in xrange(state['decoder_stack']): aidx += 1 has_said_tm1.append(args[aidx]) aidx += 1 ctx = args[aidx] if state['avg_word']: aidx += 1 awrd = args[aidx] val = pop_op(proj_code(ctx), has_said_tm1, word=word_tm1, aword=awrd, one_step=True, use_noise=False) sample = output_layer.get_sample(state_below=val, temp=temp) logp = output_layer.get_cost(state_below=val.out.reshape( [1, TT.cast(output_layer.n_in, 'int64')]), temp=temp, target=sample.reshape([1, 1]), use_noise=False) gater_below = None if state['rec_gating']: gater_below = gater_words_t[0](emb_t(sample)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[0](emb_t(sample)) has_said_t = [ add_t_op(emb_words_t[0](emb_t(sample)), ctx, prev_val=has_said_tm1[0], gater_below=gater_below, reseter_below=reseter_below, one_step=True, use_noise=True, si=0) ] for si in xrange(1, state['decoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words_t[si](emb_t(sample)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[si](emb_t(sample)) has_said_t.append( add_t_op(emb_words_t[si](emb_t(sample)), ctx, prev_val=has_said_tm1[si], gater_below=gater_below, reseter_below=reseter_below, one_step=True, use_noise=True, si=si, state_below=has_said_t[-1])) for si in xrange(state['decoder_stack']): if isinstance(has_said_t[si], list): has_said_t[si] = has_said_t[si][-1] rval = [sample, TT.cast(logp, 'float32')] + has_said_t return rval sampler_params = [everything] if state['avg_word']: sampler_params.append(aword) states = [TT.alloc(numpy.int64(0), n_steps)] states.append(TT.alloc(numpy.float32(0), n_steps)) states += init_state outputs, updates = scan(sample_fn, states=states, params=sampler_params, n_steps=n_steps, name='sampler_scan') samples = outputs[0] probs = outputs[1] sample_fn = theano.function([n_steps, temp, x], [samples, probs.sum()], updates=updates, profile=False, name='sample_fn') model = LM_Model(cost_layer=nll, weight_noise_amount=state['weight_noise_amount'], valid_fn=valid_fn, sample_fn=sample_fn, clean_before_noise_fn=False, noise_fn=noise_fn, indx_word=state['indx_word_target'], 
indx_word_src=state['indx_word'], character_level=False, rng=rng) if state['loopIters'] > 0: algo = SGD(model, state, train_data) else: algo = None def hook_fn(): if not hasattr(model, 'word_indxs'): model.load_dict() if not hasattr(model, 'word_indxs_src'): model.word_indxs_src = model.word_indxs old_offset = train_data.offset if state['sample_reset']: train_data.reset() ns = 0 for sidx in xrange(state['sample_n']): while True: batch = train_data.next() if batch: break x = batch['x'] y = batch['y'] #xbow = batch['x_bow'] masks = batch['x_mask'] if x.ndim > 1: for idx in xrange(x.shape[1]): ns += 1 if ns > state['sample_max']: break print 'Input: ', for k in xrange(x[:, idx].shape[0]): print model.word_indxs_src[x[:, idx][k]], if model.word_indxs_src[x[:, idx][k]] == '<eol>': break print '' print 'Target: ', for k in xrange(y[:, idx].shape[0]): print model.word_indxs[y[:, idx][k]], if model.word_indxs[y[:, idx][k]] == '<eol>': break print '' senlen = len(x[:, idx]) if len(numpy.where(masks[:, idx] == 0)[0]) > 0: senlen = numpy.where(masks[:, idx] == 0)[0][0] if senlen < 1: continue xx = x[:senlen, idx] #xx = xx.reshape([xx.shape[0], 1]) model.get_samples(state['seqlen'] + 1, 1, xx) else: ns += 1 model.get_samples(state['seqlen'] + 1, 1, x) if ns > state['sample_max']: break train_data.offset = old_offset return main = MainLoop(train_data, valid_data, None, model, algo, state, channel, reset=state['reset'], hooks=hook_fn) if state['reload']: main.load() if state['loopIters'] > 0: main.main() if state['sampler_test']: # This is a test script: we only sample if not hasattr(model, 'word_indxs'): model.load_dict() if not hasattr(model, 'word_indxs_src'): model.word_indxs_src = model.word_indxs indx_word = pkl.load(open(state['word_indx'], 'rb')) try: while True: try: seqin = raw_input('Input Sequence: ') n_samples = int(raw_input('How many samples? ')) alpha = float(raw_input('Inverse Temperature? ')) seqin = seqin.lower() seqin = seqin.split() seqlen = len(seqin) seq = numpy.zeros(seqlen + 1, dtype='int64') for idx, sx in enumerate(seqin): try: seq[idx] = indx_word[sx] except: seq[idx] = indx_word[state['oov']] seq[-1] = state['null_sym_source'] except Exception: print 'Something wrong with your input! Try again!' continue sentences = [] all_probs = [] for sidx in xrange(n_samples): #import ipdb; ipdb.set_trace() [values, probs] = model.sample_fn(seqlen * 3, alpha, seq) sen = [] for k in xrange(values.shape[0]): if model.word_indxs[values[k]] == '<eol>': break sen.append(model.word_indxs[values[k]]) sentences.append(" ".join(sen)) all_probs.append(-probs) sprobs = numpy.argsort(all_probs) for pidx in sprobs: print pidx, "(%f):" % (-all_probs[pidx]), sentences[pidx] print except KeyboardInterrupt: print 'Interrupted' pass
def linear_cg(compute_Ax, b, M=None, xinit=None, rtol=1e-16, maxiter=100000, damp=0., floatX=None): """ Solves the system A x[i] = b[i], for all i. When used as part of a Newton-CG method, b is a list of gradients, where each element of this list represents a gradient for a given parameter type (i.e. weight or bias of a given layer). This method will return a list whose elements approximate A^{-1} b[i], with the precision determined by maxiter or the specified tolerance level. This particular version implements the Polak-Ribiere flavor of CG. Parameters: :param compute_Ax: python function which symbolically computes the matrix-vector product. :param b: list of T.vector, corresponding to A x[i] = b[i] :param M: list of T.vector (same length as b). Each element is used to precondition its corresponding element of the A-diagonal. If [Mi for Mi in M] contains the diagonal elements of A, this will implement Jacobi preconditioning. :param xinit: list of T.vector (same length as b). x[i] is initial guess for A^{-1} b[i]. :param rtol: float. CG will stop when the squared norm of the residual error falls below rtol. :param maxiter: int. Maximum allowable iterations for CG. :param damp: float. Damping factor, equivalent to adding a term along the diagonal of A. :param floatX: 'float32' or 'float64'. Return values: rval[0]: list whose i-th element approximates A^{-1} b[i]. rval[1]: niter, number of iterations run by CG. rval[2]: squared norm of the final residual. Reference: http://en.wikipedia.org/wiki/Conjugate_gradient_method """ n_params = len(b) def loop(niter, rkp_norm, *args): pk = args[:n_params] rk = args[n_params:2 * n_params] zk = args[2 * n_params:3 * n_params] xk = args[-n_params:] A_pk_temp = compute_Ax(*pk) A_pk = [ A_pk_temp_ + damp * pk_ for A_pk_temp_, pk_ in zip(A_pk_temp, pk) ] alphak_num = sum((rk_ * zk_).sum() for rk_, zk_ in zip(rk, zk)) alphak_denum = sum((A_pk_ * pk_).sum() for A_pk_, pk_ in zip(A_pk, pk)) alphak = alphak_num / alphak_denum xkp1 = [xk_ + alphak * pk_ for xk_, pk_ in zip(xk, pk)] rkp1 = [rk_ - alphak * A_pk_ for rk_, A_pk_, in zip(rk, A_pk)] if M: zkp1 = [rkp1_ / m_ for rkp1_, m_ in zip(rkp1, M)] else: zkp1 = rkp1 # compute beta_k using Polak-Ribiere betak_num = sum((zkp1_ * (rkp1_ - rk_)).sum() for rkp1_, rk_, zkp1_ in zip(rkp1, rk, zkp1)) betak_denum = alphak_num betak = betak_num / betak_denum pkp1 = [zkp1_ + betak * pk_ for zkp1_, pk_ in zip(zkp1, pk)] # compute termination criteria rkp1_norm = sum((rkp1_**2).sum() for rkp1_ in rkp1) return [niter + 1, rkp1_norm] + pkp1 + rkp1 + zkp1 + xkp1,\ theano.scan_module.until(abs(rkp1_norm) < rtol) # Initialize residual based on xinit if xinit is None: r0_temp = b x0 = [ tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(b_))) for b_ in b ] else: init_Ax = compute_Ax(*xinit) r0_temp = [b[i] - init_Ax[i] for i in xrange(len(b))] x0 = [ tensor.unbroadcast(tensor.shape_padleft(xinit_)) for xinit_ in xinit ] # Leftpad r0, z0 and p0 for scan. r0 = [ tensor.unbroadcast(tensor.shape_padleft(r0_temp_)) for r0_temp_ in r0_temp ] if M: z0 = [ tensor.unbroadcast(tensor.shape_padleft(r0_temp_ / m_)) for r0_temp_, m_ in zip(r0_temp, M) ] else: z0 = r0 p0 = z0 states = [] # 0 niter states.append(tensor.constant(npy_floatX([0]))) # 1 residual error norm states.append(tensor.constant(npy_floatX([0]))) outs, updates = scan(loop, states=states + p0 + r0 + z0 + x0, n_steps=maxiter, mode=theano.Mode(linker='c|py'), name='linear_conjugate_gradient', profile=0) sol = [x[0] for x in outs[-n_params:]] niter = outs[0][0] rerr = outs[1][0] return [sol, niter, rerr]
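As a cross-check on the scan formulation above, the same Polak-Ribiere recursion can be written for a single dense right-hand side in plain NumPy. The function below is a hedged sketch (dense `A`, a single vector `b`, Jacobi preconditioning through an optional diagonal `M`), not the Theano implementation itself:

import numpy as np

def linear_cg_dense(A, b, M=None, rtol=1e-16, maxiter=1000, damp=0.0):
    # Dense analogue of the scan loop: x0 = 0, diagonal preconditioner M.
    x = np.zeros_like(b)
    r = b.copy()                          # residual for x0 = 0
    z = r / M if M is not None else r     # preconditioned residual
    p = z.copy()
    niter = 0
    for niter in range(maxiter):
        Ap = A.dot(p) + damp * p
        alpha = r.dot(z) / p.dot(Ap)
        x = x + alpha * p
        r_new = r - alpha * Ap
        z_new = r_new / M if M is not None else r_new
        beta = z_new.dot(r_new - r) / r.dot(z)    # Polak-Ribiere
        p = z_new + beta * p
        r, z = r_new, z_new
        if (r ** 2).sum() < rtol:                 # same squared-norm test as the scan
            break
    return x, niter, (r ** 2).sum()

x, niters, err = linear_cg_dense(np.diag([1., 2., 3.]), np.ones(3))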
def __init__( self, nhids=50, nouts=8, nins=2, activ=TT.nnet.sigmoid, seed=234, bs=16, # batchsize seqlen=3 # sequence length - fixed during training ): # 0. Keep track of arguments self.bs = bs self.nhids = nhids self.nouts = nouts self.nins = nins self.activ = activ self.seed = seed self.bs = bs self.seqlen = seqlen floatX = theano.config.floatX self.rng = numpy.random.RandomState(seed) # 1. Generating Theano variables # DenseSequence space # We store data as 3D tensor with (time, batch-size, nfeatures) self.x = TT.tensor3('x') # IndexSequence space # We store data as 1D tensor where each the dimension goes over the # batch size (i.e. target of each sequence in the batch) self.t = TT.ivector('t') # target index for each element of batchsize self.inputs = [self.x, self.t] # Naming convention for letters after the `_`: # u - input # h - hidden # y - output # f - forward # b - backwards self.W_uhf = numpy.asarray(self.rng.normal(size=(self.nins, self.nhids), loc=0, scale=.01), dtype=floatX) self.W_uhb = numpy.asarray(self.rng.normal(size=(self.nins, self.nhids), loc=0, scale=.01), dtype=floatX) self.W_hhf = numpy.asarray(self.rng.normal(size=(self.nhids, self.nhids), loc=0, scale=1), dtype=floatX) self.W_hhb = numpy.asarray(self.rng.normal(size=(self.nhids, self.nhids), loc=0, scale=1), dtype=floatX) self.W_hyf = numpy.asarray(self.rng.normal(size=(self.nhids, self.nouts), loc=0, scale=.1), dtype=floatX) self.W_hyb = numpy.asarray(self.rng.normal(size=(self.nhids, self.nouts), loc=0, scale=.1), dtype=floatX) # sparsifying hidden weights (Ilya&Martens formula == ESN style # init) for dx in xrange(self.nhids): psng = self.rng.permutation(nhids) self.W_hhf[dx][psng[15:]] = 0. psng = self.rng.permutation(nhids) self.W_hhb[dx][psng[15:]] = 0. # Any spectral radius larger than .9 smaller than 1.1 should be fine sr = numpy.max(abs(numpy.linalg.eigvals(self.W_hhf))) self.W_hhf = numpy.float32(.97 * self.W_hhf / sr) sr = numpy.max(abs(numpy.linalg.eigvals(self.W_hhb))) self.W_hhb = numpy.float32(.97 * self.W_hhb / sr) self.b_hhf = numpy.zeros((nhids, ), dtype=floatX) self.b_hhb = numpy.zeros((nhids, ), dtype=floatX) self.b_hy = numpy.zeros((nouts, ), dtype=floatX) self.W_uhf = theano.shared(self.W_uhf, name='W_uhf') self.W_uhb = theano.shared(self.W_uhb, name='W_uhb') self.W_hhf = theano.shared(self.W_hhf, name='W_hhf') self.W_hhb = theano.shared(self.W_hhb, name='W_hhb') self.W_hyf = theano.shared(self.W_hyf, name='W_hyf') self.W_hyb = theano.shared(self.W_hyb, name='W_hyb') self.b_hhf = theano.shared(self.b_hhf, name='b_hhf') self.b_hhb = theano.shared(self.b_hhb, name='b_hhb') self.b_hy = theano.shared(self.b_hy, name='b_hy') self.params = [ self.W_uhf, self.W_uhb, self.W_hhf, self.W_hhb, self.W_hyf, self.W_hyb, self.b_hhf, self.b_hhb, self.b_hy ] self.best_params = [(x.name, x.get_value()) for x in self.params] self.params_shape = [ x.get_value(borrow=True).shape for x in self.params ] # 2. Constructing Theano graph # Note: new interface of scan asks the user to provide a memory # buffer that contains the initial state but which is also used # internally by scan to store the intermediate values of its # computations - hence the initial state is a 3D tensor h0_f = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen + 1, self.bs, self.nhids) h0_b = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen + 1, self.bs, self.nhids) # Do we use to much memory!? 
p_hf = TT.dot(self.x.reshape( (self.seqlen * self.bs, self.nins)), self.W_uhf) + self.b_hhf p_hb = TT.dot(self.x[::-1].reshape( (self.seqlen * self.bs, self.nins)), self.W_uhb) + self.b_hhb def recurrent_fn(pf_t, pb_t, hf_tm1, hb_tm1): hf_t = activ(TT.dot(hf_tm1, self.W_hhf) + pf_t) hb_t = activ(TT.dot(hb_tm1, self.W_hhb) + pb_t) return hf_t, hb_t # provide sequence length !? is better on GPU [h_f, h_b], _ = scan(recurrent_fn, sequences=[ p_hf.reshape((self.seqlen, self.bs, self.nhids)), p_hb.reshape((self.seqlen, self.bs, self.nhids)) ], states=[h0_f, h0_b], n_steps=self.seqlen, name='bi-RNN', profile=0) h_b = h_b[::-1] # Optionally do the max over hidden layer !? # I'm afraid the semantics for RNN are somewhat different than MLP y = TT.nnet.softmax( TT.dot(h_f.reshape((self.seqlen * self.bs + self.bs, self.nhids )), self.W_hyf) + # Check doc flatten TT.dot(h_b.reshape((self.seqlen * self.bs + self.bs, self.nhids)), self.W_hyb) + self.b_hy) my = y.reshape((self.seqlen + 1, self.bs, self.nouts)).max(axis=0) nll = -TT.log(my[TT.constant(numpy.arange(self.bs)), self.t]) self.train_cost = nll.mean() self.error = TT.mean(TT.neq(my.argmax(axis=1), self.t) * 100.) ## |----------------------------- # - Computing metric times a vector efficiently for p(y|x) # Assume softmax .. we might want sigmoids though self.Gyvs = lambda *args:\ TT.Lop(y, self.params, TT.Rop(y, self.params, args) /\ (y*numpy.array(self.bs, dtype=floatX))) # Computing metric times a vector efficiently for p(h|x) if activ == TT.nnet.sigmoid: fn = lambda x: (1 - x) * x * numpy.array(self.bs, dtype=floatX) elif activ == TT.tanh: # Please check formula !!!! It is probably wrong fn = lambda x: (.5 - x / 2) * (x / 2 + .5) * numpy.array( self.bs, dtype=floatX) else: # Assume linear or piece-wise linear activation fn = lambda x: numpy.array(self.bs, dtype=floatX) self.Ghfvs = lambda *args:\ TT.Lop(h_f, self.params, TT.Rop(h_f, self.params, args) / fn(h_f)) self.Ghbvs = lambda *args:\ TT.Lop(h_b, self.params, TT.Rop(h_b, self.params, args) / fn(h_b)) ## ------------------ | vx = TT.matrix('vx') vt = TT.iscalar('vt') vh0_f = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen + 1, self.nhids) vh0_b = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen + 1, self.nhids) # Do we use too much memory!? vp_hf = TT.dot(vx, self.W_uhf) + self.b_hhf vp_hb = TT.dot(vx[::-1], self.W_uhb) + self.b_hhb def recurrent_fn(pf_t, pb_t, hf_tm1, hb_tm1): hf_t = activ(TT.dot(hf_tm1, self.W_hhf) + pf_t) hb_t = activ(TT.dot(hb_tm1, self.W_hhb) + pb_t) return hf_t, hb_t # provide sequence length !? is better on GPU [vh_f, vh_b], _ = scan(recurrent_fn, sequences=[vp_hf, vp_hb], states=[vh0_f, vh0_b], name='valid bi-RNN', n_steps=vp_hf.shape[0], profile=0) vh_b = vh_b[::-1] # Optionally do the max over hidden layer !? # I'm afraid the semantics for RNN are somewhat different than MLP vy = TT.nnet.softmax( TT.dot(vh_f, self.W_hyf) + TT.dot(vh_b, self.W_hyb) + self.b_hy) my = TT.neq(vy.max(axis=0).argmax(), vt) self.validate = theano.function([vx, vt], my, name='validation', profile=0)
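The recurrent-weight initialization used in this constructor (the "Ilya&Martens formula == ESN style init" comment) keeps only a handful of non-zero incoming weights per hidden unit and then rescales the matrix to a spectral radius just below one. A standalone NumPy version of that step, with the assumed helper name `esn_style_init` and the same constants as above, would be:

import numpy as np

def esn_style_init(nhids, fan_in=15, radius=0.97, seed=234):
    # Sparsify each row so only `fan_in` incoming weights are non-zero,
    # then rescale so the spectral radius is `radius` (close to, but below, 1).
    rng = np.random.RandomState(seed)
    W = rng.normal(size=(nhids, nhids), loc=0, scale=1).astype('float32')
    for dx in range(nhids):
        perm = rng.permutation(nhids)
        W[dx][perm[fan_in:]] = 0.
    sr = np.max(np.abs(np.linalg.eigvals(W)))
    return np.float32(radius * W / sr)

W_hh = esn_style_init(50)
# np.max(np.abs(np.linalg.eigvals(W_hh))) is now ~0.97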
def __init__(self, options, channel, data, model): """ Parameters: options: Dictionary `options` is expected to contain the following keys: `cbs` -> int Number of samples to consider at a time when computing some property of the model `gbs` -> int Number of samples over which to compute the gradients `ebs` -> int Number of samples over which to evaluate the training error `seed` -> int Random number generator seed `profile` -> bool Flag, if profiling should be on or not `verbose` -> int Verbosity level `lr` -> float Learning rate channel: jobman channel or None data: dictionary-like object return by numpy.load containing the data model : model """ self.model = model # push dataset into shared var n_params = len(model.params) xdata = theano.shared(data['train_x'].astype('float32'), name='xdata') # ! This works for 1 of k classification ydata = TT.cast( theano.shared(data['train_y'].astype('float32'), name='ydata'), 'int32') shared_data = [xdata, ydata] self.xdata = xdata self.ydata = ydata # all sorts of indices self.rng = numpy.random.RandomState(options['seed']) n_samples = data['train_x'].shape[0] self.grad_batches = n_samples // options['gbs'] self.metric_batches = n_samples // options['mbs'] self.eval_batches = n_samples // options['ebs'] self.verbose = options['verbose'] # vars for gradients # Store Euclidean gradients self.gs = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] # Store riemannian gradients (H^-1*g) self.rs = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] self.permg = self.rng.permutation(self.grad_batches) self.permr = self.rng.permutation(self.metric_batches) self.perme = self.rng.permutation(self.eval_batches) self.k = 0 self.posg = 0 self.posr = 0 self.pose = 0 # Step 1. Compile function for computing eucledian gradients gbdx = TT.iscalar('grad_batch_idx') print 'Constructing grad function' loc_inputs = [x.type() for x in model.inputs] def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)] return [args[0] + const(1)] + \ nw_gs ig = [ TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0) for shp in model.params_shape ] idx0 = TT.unbroadcast(const([0]), 0) n_steps = options['gbs'] // options['cbs'] rvals, updates = scan(grad_step, states=[idx0] + ig, n_steps=n_steps, name='grad_loop', profile=options['profile']) nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]] updates.update(dict(zip(self.gs, nw_gs))) grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']]) for x, y in zip(loc_inputs, shared_data)] print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gbdx], [], updates=updates, givens=dict(grad_inps), on_unused_input='warn', name='compute_eucledian_gradients', mode=theano.Mode(linker='cvm'), profile=options['profile']) # Step 3. 
Compile function for evaluating cost and updating # parameters print 'constructing evaluation function' lr = TT.scalar('lr') self.lr = numpy.float32(options['lr']) ebdx = TT.iscalar('eval_batch_idx') nw_ps = [p - lr * g for p, g in zip(model.params, self.gs)] def cost_step(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps)) nw_cost = safe_clone(model.train_cost, replace=replace) return [_idx + const(1), acc + nw_cost] acc0 = const([0]) idx0 = const([0]) n_steps = options['ebs'] // options['cbs'] rvals, updates = scan(cost_step, states=[idx0, acc0], n_steps=n_steps, name='cost_loop', profile=options['profile']) final_cost = rvals[1] / const(n_steps) update_vals = dict(zip(model.params, nw_ps)) #updates.update(dict(zip(model.params, nw_ps))) grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']]) for x, y in zip(loc_inputs, shared_data)] print 'compling evaluation function' self.eval_fn = theano.function([ebdx, lr], final_cost, givens=dict(grad_inps), updates=updates, on_unused_input='warn', name='eval_fn', mode=theano.Mode(linker='cvm'), profile=options['profile']) self.update_params = theano.function( [lr], [], updates=update_vals, on_unused_input='warn', #givens=dict(grad_inps), name='update_params', mode=theano.Mode(linker='cvm'), profile=options['profile']) self.options = options self.old_cost = 1e6 n_steps = options['ebs'] // options['cbs'] def ls_error(_idx, acc, acc_train_cost): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = TT.cast(safe_clone(model.err, replace=replace), 'float32') train_cost = TT.cast(safe_clone(model.train_cost, replace=replace), 'float32') return [ _idx + const(1), acc + nw_cost, acc_train_cost + train_cost ] states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])) ] rvals, _ = scan(ls_error, states=states, n_steps=n_steps, name='ls_err_step', mode=theano.Mode(linker='cvm'), profile=options['profile']) ferr = rvals[1][0] / const(n_steps) ftrain_cost = rvals[2][0] / const(n_steps) self.compute_error = theano.function([ebdx], [ferr, ftrain_cost], givens=dict(grad_inps), name='compute_err', on_unused_input='warn', mode=theano.Mode(linker='cvm'), profile=options['profile'])
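The constructor above only compiles the Theano functions; the training loop that drives them is not shown in this file. A plausible, assumed driver (the step-size grid `lrs` and the function `train_epoch` are illustrative, not part of the original code) would look like:

def train_epoch(trainer, lrs=(1.0, 0.5, 0.25, 0.125)):
    # trainer is an instance of the class above
    for k in range(trainer.grad_batches):
        gb = trainer.permg[k % len(trainer.permg)]
        eb = trainer.perme[k % len(trainer.perme)]
        trainer.compute_eucledian_gradients(gb)        # fills trainer.gs
        # eval_fn only looks ahead (the parameter update inside it is commented
        # out), so we can probe several step sizes and then commit the best one
        costs = [float(trainer.eval_fn(eb, lr)) for lr in lrs]
        best = min(range(len(lrs)), key=lambda i: costs[i])
        trainer.update_params(lrs[best])
        err, train_cost = trainer.compute_error(eb)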
def __init__(self, nhids =50, nouts = 8, nins = 2, activ = TT.nnet.sigmoid, seed = 234, bs = 16, # batchsize seqlen = 3 # sequence length - fixed during training ): # 0. Keep track of arguments self.bs = bs self.nhids = nhids self.nouts = nouts self.nins = nins self.activ = activ self.seed = seed self.bs = bs self.seqlen = seqlen floatX = theano.config.floatX self.rng = numpy.random.RandomState(seed) # 1. Generating Theano variables # DenseSequence space # We store data as 3D tensor with (time, batch-size, nfeatures) self.x = TT.tensor3('x') # IndexSequence space # We store data as 1D tensor where each the dimension goes over the # batch size (i.e. target of each sequence in the batch) self.t = TT.ivector('t') # target index for each element of batchsize self.inputs = [self.x, self.t] # Naming convention for letters after the `_`: # u - input # h - hidden # y - output # f - forward # b - backwards self.W_uhf = numpy.asarray( self.rng.normal(size=(self.nins, self.nhids), loc=0, scale=.01), dtype=floatX) self.W_uhb = numpy.asarray( self.rng.normal(size=(self.nins, self.nhids), loc=0, scale=.01), dtype=floatX) self.W_hhf = numpy.asarray( self.rng.normal(size=(self.nhids, self.nhids), loc=0, scale=1), dtype=floatX) self.W_hhb = numpy.asarray( self.rng.normal(size=(self.nhids, self.nhids), loc=0, scale=1), dtype=floatX) self.W_hyf = numpy.asarray( self.rng.normal(size=(self.nhids, self.nouts), loc=0, scale=.1), dtype=floatX) self.W_hyb = numpy.asarray( self.rng.normal(size=(self.nhids, self.nouts), loc=0, scale=.1), dtype=floatX) # sparsifying hidden weights (Ilya&Martens formula == ESN style # init) for dx in xrange(self.nhids): psng = self.rng.permutation(nhids) self.W_hhf[dx][psng[15:]] = 0. psng = self.rng.permutation(nhids) self.W_hhb[dx][psng[15:]] = 0. # Any spectral radius larger than .9 smaller than 1.1 should be fine sr = numpy.max(abs(numpy.linalg.eigvals(self.W_hhf))) self.W_hhf = numpy.float32(.97*self.W_hhf/sr) sr = numpy.max(abs(numpy.linalg.eigvals(self.W_hhb))) self.W_hhb = numpy.float32(.97*self.W_hhb/sr) self.b_hhf = numpy.zeros((nhids,), dtype=floatX) self.b_hhb = numpy.zeros((nhids,), dtype=floatX) self.b_hy = numpy.zeros((nouts,), dtype=floatX) self.W_uhf = theano.shared(self.W_uhf, name='W_uhf') self.W_uhb = theano.shared(self.W_uhb, name='W_uhb') self.W_hhf = theano.shared(self.W_hhf, name='W_hhf') self.W_hhb = theano.shared(self.W_hhb, name='W_hhb') self.W_hyf = theano.shared(self.W_hyf, name='W_hyf') self.W_hyb = theano.shared(self.W_hyb, name='W_hyb') self.b_hhf = theano.shared(self.b_hhf, name='b_hhf') self.b_hhb = theano.shared(self.b_hhb, name='b_hhb') self.b_hy = theano.shared(self.b_hy, name='b_hy') self.params = [self.W_uhf, self.W_uhb, self.W_hhf, self.W_hhb, self.W_hyf, self.W_hyb, self.b_hhf, self.b_hhb, self.b_hy] self.best_params = [(x.name, x.get_value()) for x in self.params] self.params_shape = [x.get_value(borrow=True).shape for x in self.params] # 2. Constructing Theano graph # Note: new interface of scan asks the user to provide a memory # buffer that contains the initial state but which is also used # internally by scan to store the intermediate values of its # computations - hence the initial state is a 3D tensor h0_f = TT.alloc(numpy.array(0,dtype=floatX), self.seqlen+1, self.bs, self.nhids) h0_b = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen+1, self.bs, self.nhids) # Do we use to much memory!? 
p_hf = TT.dot(self.x.reshape((self.seqlen*self.bs, self.nins)), self.W_uhf) + self.b_hhf p_hb = TT.dot(self.x[::-1].reshape((self.seqlen*self.bs, self.nins)), self.W_uhb) + self.b_hhb def recurrent_fn(pf_t, pb_t, hf_tm1, hb_tm1): hf_t = activ(TT.dot(hf_tm1, self.W_hhf) + pf_t) hb_t = activ(TT.dot(hb_tm1, self.W_hhb) + pb_t) return hf_t, hb_t # provide sequence length !? is better on GPU [h_f, h_b], _ = scan( recurrent_fn, sequences = [ p_hf.reshape((self.seqlen, self.bs, self.nhids)), p_hb.reshape((self.seqlen, self.bs, self.nhids))], states = [h0_f, h0_b], n_steps = self.seqlen, name = 'bi-RNN', profile = 0) h_b = h_b[::-1] # Optionally do the max over hidden layer !? # I'm afraid the semantics for RNN are somewhat different than MLP y = TT.nnet.softmax( TT.dot(h_f.reshape((self.seqlen * self.bs+self.bs, self.nhids)), self.W_hyf) + # Check doc flatten TT.dot(h_b.reshape((self.seqlen * self.bs+self.bs, self.nhids)), self.W_hyb) + self.b_hy) my = y.reshape((self.seqlen+1, self.bs, self.nouts)).max(axis=0) nll = -TT.log( my[TT.constant(numpy.arange(self.bs)), self.t]) self.train_cost = nll.mean() self.error = TT.mean(TT.neq(my.argmax(axis=1), self.t) * 100.) ## |----------------------------- # - Computing metric times a vector efficiently for p(y|x) # Assume softmax .. we might want sigmoids though self.Gyvs = lambda *args:\ TT.Lop(y, self.params, TT.Rop(y, self.params, args) /\ (y*numpy.array(self.bs, dtype=floatX))) # Computing metric times a vector efficiently for p(h|x) if activ == TT.nnet.sigmoid: fn = lambda x : (1-x)*x*numpy.array(self.bs, dtype=floatX) elif activ == TT.tanh: # Please check formula !!!! It is probably wrong fn = lambda x:(.5-x/2)*(x/2+.5)*numpy.array(self.bs, dtype=floatX) else: # Assume linear or piece-wise linear activation fn = lambda x: numpy.array(self.bs, dtype=floatX) self.Ghfvs = lambda *args:\ TT.Lop(h_f, self.params, TT.Rop(h_f, self.params, args) / fn(h_f)) self.Ghbvs = lambda *args:\ TT.Lop(h_b, self.params, TT.Rop(h_b, self.params, args) / fn(h_b)) ## ------------------ | vx = TT.matrix('vx') vt = TT.iscalar('vt') vh0_f = TT.alloc(numpy.array(0,dtype=floatX), self.seqlen+1, self.nhids) vh0_b = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen+1, self.nhids) # Do we use too much memory!? vp_hf = TT.dot(vx, self.W_uhf) + self.b_hhf vp_hb = TT.dot(vx[::-1], self.W_uhb) + self.b_hhb def recurrent_fn(pf_t, pb_t, hf_tm1, hb_tm1): hf_t = activ(TT.dot(hf_tm1, self.W_hhf) + pf_t) hb_t = activ(TT.dot(hb_tm1, self.W_hhb) + pb_t) return hf_t, hb_t # provide sequence length !? is better on GPU [vh_f, vh_b], _ = scan( recurrent_fn, sequences = [vp_hf, vp_hb], states = [vh0_f, vh0_b], name = 'valid bi-RNN', n_steps = vp_hf.shape[0], profile = 0) vh_b = vh_b[::-1] # Optionally do the max over hidden layer !? # I'm afraid the semantics for RNN are somewhat different than MLP vy = TT.nnet.softmax( TT.dot(vh_f, self.W_hyf) + TT.dot(vh_b, self.W_hyb) + self.b_hy) my = TT.neq(vy.max(axis=0).argmax(), vt) self.validate = theano.function([vx, vt], my, name='validation', profile=0)
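`self.Gyvs` computes the metric-times-vector product G v = J^T diag(1/(bs * y)) J v for the softmax output, written with the R-operator (forward, J v) and the L-operator (backward, J^T u) so the Jacobian is never materialised. The dense numpy sketch below spells out the same product for a toy softmax regression with batch size 1; the name `metric_times_vector` and the finite-difference Jacobian are purely illustrative.

import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def metric_times_vector(W, x, v, eps=1e-6):
    # J^T diag(1/y) J v -- the dense counterpart of
    # TT.Lop(y, params, TT.Rop(y, params, v) / y) above (bs = 1, so the
    # 1/bs factor drops out).  The Jacobian is built by finite differences
    # only to keep the sketch short and obviously correct.
    y = softmax(W @ x)
    J = np.zeros((y.size, W.size))
    for i in range(W.size):
        dW = np.zeros(W.size)
        dW[i] = eps
        J[:, i] = (softmax((W + dW.reshape(W.shape)) @ x) - y) / eps
    Jv = J @ v              # the R-operator: directional derivative of y
    return J.T @ (Jv / y)   # the L-operator applied to the rescaled Jv

rng = np.random.RandomState(0)
W = rng.normal(size=(3, 5))
x = rng.normal(size=5)
v = rng.normal(size=W.size)
print(metric_times_vector(W, x, v))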
def minres(compute_Av, bs, rtol=numpy.float32(1e-6), maxit=20, Ms=None, shift=numpy.float32(0.), maxxnorm=numpy.float32(1e15), Acondlim=numpy.float32(1e16), mode=None, profile=0): """ DESCRIPTION: minres attempts to find the minimum-length and minimum-residual-norm solution x to the system of linear equations A*x = b or least squares problem min||Ax-b||. The n-by-n coefficient matrix A must be symmetric (but need not be positive definite or invertible). The right-hand-side column vector b must have length n. INPUTS: :param compute_Av: callable returing the symbolic expression for `Av`. `v` can be a set of parameteres :param bs: list of Theano expressions. We are looking to compute A^-1\dot bs :param rtol: Optional, real, specifies the tolerance of the method. Default is 1e-6 :param maxit: Optional, positive integer, specifies the maximum number of iterations. Default is 20 :param Ms: List of theano expression of same shape as `bs`. The method uses these to precondition with diag(Ms) :param shift: Optional, scalar, real or complex. Default is 0. Effectively solve the system (A - shift I) * x = b. maxxnorm real positive, maximum bound on NORM(x). Default is 1e14. Acondlim real positive, maximum bound on COND(A). Default is 1e15. show boolean, 0 to suppress outputs, 1 to show iterations. Default is 0. p1, p2,... Optional, inputs to A and M if they are functions OUTPUTS: x n-vector, estimated solution flag integer, convergence flag -1 beta2 = 0. If M = I, b and x are eigenvectors. 0 beta1 = 0. The exact solution is x = 0. 1 A solution to (poss. singular) Ax = b found, given rtol. 2 Pseudoinverse solution for singular LS problem, given rtol. 3 A solution to (poss. singular) Ax = b found, given eps. 4 Pseudoinverse solution for singular LS problem, given eps. 5 x has converged to an eigenvector. 6 xnorm has exceeded maxxnorm. 7 Acond has exceeded Acondlim. 8 The iteration limit was reached. 9 It is a least squares problem but no converged solution yet. iter integer, iteration number at which x was computed: 0 <= iter <= maxit. relres real positive, the relative residual is defined as NORM(b-A*x)/(NORM(A) * NORM(x) + NORM(b)), computed recurrently here. If flag is 1 or 3, relres <= TOL. relAres real positive, the relative-NORM(Ar) := NORM(Ar) / NORM(A) --- computed recurrently here. If flag is 2 or 4, relAres <= TOL. Anorm real positive, estimate of matrix 2-norm of A. Acond real positive, estimate of condition number of A with respect to 2-norm. xnorm non-negative positive, recurrently computed NORM(x) Axnorm non-negative positive, recurrently computed NORM(A * x). EXAMPLE 1: n = 100; on = ones(n,1); A = spdiags([-2*on 4*on -2*on],-1:1,n,n); b = sum(A,2); rtol = 1e-10; maxit = 50; M = spdiags(4*on,0,n,n); x = minresSOL69(A, b, rtol, maxit, M); Use this matrix-vector product function function y = afun(x,n) y = 4 * x; y(2:n) = y(2:n) - 2 * x(1:n-1); y(1:n-1) = y(1:n-1) - 2 * x(2:n); as input to minresSOL69 x1 = minresSOL69(@afun, b, rtol, maxit, M); EXAMPLE 2: A is Laplacian on a 50 by 05 grid, singular and indefinite. n = 50; N = n^2; on=ones(n,1); B = spdiags([on on on], -1:1, n, n); A = sparse([],[],[],N,N,(3*n-2)^2); for i=1:n A((i-1)*n+1:i*n,(i-1)*n+1:i*n) = B; if i*n+1 < n*n, A(i*n+1:(i+1)*n,(i-1)*n+1:i*n)=B; end; if (i-2)*n+1 > 0 A((i-2)*n+1:(i-1)*n,(i-1)*n+1:i*n)=B; end; end b = sum(A,2); rtol = 1e-5; maxxnorm = 1e2; shift = 0; Acondlim = []; show = 1; M = []; x = minresSOL69( A, b, rtol, N, M, shift, maxxnorm, Acondlim, show); EXAMPLE 3: A is diagonal, singular and indefinite. 
h = 1; a = -10; b = -a; n = 2*b/h + 1; A = spdiags((a:h:b)', 0, n, n); b = ones(n,1); rtol = 1e-6; maxxnorm = 1e2; shift = 0; Acondlim = []; show = 1; M = []; x = minresSOL69( A, b, rtol, N, M, shift, maxxnorm, Acondlim, show); REFERENCES: Sou-Cheng Choi's PhD Dissertation, Stanford University, 2006. http://www.stanford.edu/group/SOL/software.html """ if not isinstance(bs, (tuple, list)): bs = [bs] return_as_list = False else: bs = list(bs) return_as_list = True eps = numpy.float32(1e-23) # Initialise flag = theano.shared(numpy.float32(0.)) beta1 = norm(bs) #------------------------------------------------------------------ # Set up p and v for the first Lanczos vector v1. # p = beta1 P' v1, where P = C**(-1). # v is really P' v1. #------------------------------------------------------------------ r3s = [b for b in bs] r2s = [b for b in bs] r1s = [b for b in bs] if Ms is not None: r3s = [b / m for b, m in zip(bs, Ms)] beta1 = norm(r3s, bs) #------------------------------------------------------------------ ## Initialize other quantities. # Note that Anorm has been initialized by IsOpSym6. # ------------------------------------------------------------------ bnorm = beta1 n_params = len(bs) def loop(niter, beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn, Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm, relrnorm, relArnorml, Anorm, flag, *args): #----------------------------------------------------------------- ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,... # The general iteration is similar to the case k = 1 with v0 = 0: # # p1 = Operator * v1 - beta1 * v0, # alpha1 = v1'p1, # q2 = p2 - alpha1 * v1, # beta2^2 = q2'q2, # v2 = (1/beta2) q2. # # Again, p = betak P vk, where P = C**(-1). # .... more description needed. #----------------------------------------------------------------- xs = args[0 * n_params:1 * n_params] r1s = args[1 * n_params:2 * n_params] r2s = args[2 * n_params:3 * n_params] r3s = args[3 * n_params:4 * n_params] dls = args[4 * n_params:5 * n_params] ds = args[5 * n_params:6 * n_params] betal = beta beta = betan vs = [r3 / beta for r3 in r3s] r3s, upds = compute_Av(*vs) r3s = [r3 + shift * v for r3, v in zip(r3s, vs)] r3s = [ TT.switch(TT.ge(niter, numpy.float64(1.)), r3 - (beta / betal) * r1, r3) for r3, r1 in zip(r3s, r1s) ] alpha = sqnorm(r3s, vs) r3s = [r3 - (alpha / beta) * r2 for r3, r2 in zip(r3s, r2s)] r1s = [r2 for r2 in r2s] r2s = [r3 for r3 in r3s] if Ms is not None: r3s = [r3 / M for r3, M in zip(r3s, Ms)] betan = norm(r2s, r3s) else: betan = norm(r3s) pnorml = pnorm pnorm = TT.switch( TT.eq(niter, numpy.float32(0.)), TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)), TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) + TT.sqr(beta))) #----------------------------------------------------------------- ## Apply previous rotation Qk-1 to get # [dlta_k epln_{k+1}] = [cs sn][dbar_k 0 ] # [gbar_k dbar_{k+1} ] [sn -cs][alpha_k beta_{k+1}]. 
#----------------------------------------------------------------- dbar = dbarn epln = eplnn dlta = cs * dbar + sn * alpha gbar = sn * dbar - cs * alpha eplnn = sn * betan dbarn = -cs * betan ## Compute the current plane rotation Qk gammal2 = gammal gammal = gamma cs, sn, gamma = symGivens2(gbar, betan) tau = cs * phi phi = sn * phi Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau)) # Update d dl2s = [dl for dl in dls] dls = [d for d in ds] ds = [ TT.switch(TT.neq(gamma, numpy.float32(0.)), (v - epln * dl2 - dlta * dl) / gamma, v) for v, dl2, dl in zip(vs, dl2s, dls) ] d_norm = TT.switch(TT.neq(gamma, numpy.float32(0.)), norm(ds), TT.constant((numpy.float32(numpy.inf)))) # Update x except if it will become too big xnorml = xnorm dl2s = [x for x in xs] xs = [x + tau * d for x, d in zip(xs, ds)] xnorm = norm(xs) xs = [ TT.switch(TT.ge(xnorm, maxxnorm), dl2, x) for dl2, x in zip(dl2s, xs) ] flag = TT.switch(TT.ge(xnorm, maxxnorm), numpy.float32(6.), flag) # Estimate various norms rnorml = rnorm # ||r_{k-1}|| Anorml = Anorm Acondl = Acond relrnorml = relrnorm flag_no_6 = TT.neq(flag, numpy.float32(6.)) Dnorm = TT.switch(flag_no_6, TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)), Dnorm) xnorm = TT.switch(flag_no_6, norm(xs), xnorm) rnorm = TT.switch(flag_no_6, phi, rnorm) relrnorm = TT.switch(flag_no_6, rnorm / (Anorm * xnorm + bnorm), relrnorm) Tnorm = TT.switch( flag_no_6, TT.switch( TT.eq(niter, numpy.float32(0.)), TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)), TT.sqrt( TT.sqr(Tnorm) + TT.sqr(beta) + TT.sqr(alpha) + TT.sqr(betan))), Tnorm) Anorm = TT.maximum(Anorm, pnorm) Acond = Anorm * Dnorm rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn)) Anorml = rnorml * rootl relArnorml = rootl / Anorm #--------------------------------------------------------------- # See if any of the stopping criteria are satisfied. # In rare cases, flag is already -1 from above (Abar = const*I). #--------------------------------------------------------------- epsx = Anorm * xnorm * eps epsr = Anorm * xnorm * rtol #Test for singular Hk (hence singular A) # or x is already an LS solution (so again A must be singular). 
t1 = numpy.float32(1) + relrnorm t2 = numpy.float32(1) + relArnorml flag = TT.switch( TT.bitwise_or(TT.eq(flag, numpy.float32(0.)), TT.eq(flag, numpy.float32(6.))), TT.switch( TT.le(t1, numpy.float32(1.)), numpy.float32(3.), TT.switch( TT.le(t2, numpy.float32(1.)), numpy.float32(4.), TT.switch( TT.le(relrnorm, rtol), numpy.float32(1.), TT.switch( TT.le(Anorm, numpy.float32(1e-20)), numpy.float32(12), TT.switch( TT.le(relArnorml, rtol), numpy.float32(10.), TT.switch( TT.ge(epsx, beta1), numpy.float32(5.), TT.switch( TT.ge(xnorm, maxxnorm), numpy.float32(6.), TT.switch( TT.ge(niter, TT.cast(maxit, 'float32')), numpy.float32(8.), flag)))))))), flag) flag = TT.switch(TT.lt(Axnorm, rtol * Anorm * xnorm), numpy.float32(11.), flag) return [ niter + numpy.float32(1.), beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn, Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm, relrnorm, relArnorml, Anorm, flag] + xs + r1s + r2s + r3s + dls + ds, upds, \ theano.scan_module.scan_utils.until(TT.neq(flag,0)) states = [] # 0 niter states.append(TT.constant(numpy.float32([0]))) # 1 beta states.append(TT.constant(numpy.float32([0]))) # 2 betan states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0)) # 3 phi states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0)) # 4 Acond states.append(TT.constant(numpy.float32([1]))) # 5 cs states.append(TT.constant(numpy.float32([-1]))) # 6 dbarn states.append(TT.constant(numpy.float32([0]))) # 7 eplnn states.append(TT.constant(numpy.float32([0]))) # 8 rnorm states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0)) # 9 sn states.append(TT.constant(numpy.float32([0]))) # 10 Tnorm states.append(TT.constant(numpy.float32([0]))) # 11 rnorml states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0)) # 12 xnorm states.append(TT.constant(numpy.float32([0]))) # 13 Dnorm states.append(TT.constant(numpy.float32([0]))) # 14 gamma states.append(TT.constant(numpy.float32([0]))) # 15 pnorm states.append(TT.constant(numpy.float32([0]))) # 16 gammal states.append(TT.constant(numpy.float32([0]))) # 17 Axnorm states.append(TT.constant(numpy.float32([0]))) # 18 relrnorm states.append(TT.constant(numpy.float32([1]))) # 19 relArnorml states.append(TT.constant(numpy.float32([1]))) # 20 Anorm states.append(TT.constant(numpy.float32([0]))) # 21 flag states.append(TT.constant(numpy.float32([0]))) xs = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs] ds = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs] dls = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs] r1s = [TT.unbroadcast(TT.shape_padleft(r1), 0) for r1 in r1s] r2s = [TT.unbroadcast(TT.shape_padleft(r2), 0) for r2 in r2s] r3s = [TT.unbroadcast(TT.shape_padleft(r3), 0) for r3 in r3s] rvals, lupds = scan(loop, states=states + xs + r1s + r2s + r3s + dls + ds, n_steps=maxit + numpy.int32(1), name='minres', profile=profile, mode=mode) niters = TT.cast(rvals[0][0], 'int32') flag = TT.cast(rvals[21][0], 'int32') relres = rvals[18][0] relAres = rvals[19][0] Anorm = rvals[20][0] Acond = rvals[4][0] xnorm = rvals[12][0] Axnorm = rvals[17][0] sol = [x[0] for x in rvals[22:22 + n_params]] return sol, flag, niters, relres, relAres, Anorm, Acond, xnorm, Axnorm, lupds
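The scan above reproduces the standard MINRES recurrences symbolically; a quick way to sanity-check any such implementation is to compare it against scipy's reference solver on a small symmetric, indefinite system (something CG could not handle). A usage sketch, assuming scipy is available:

import numpy as np
from scipy.sparse.linalg import minres as scipy_minres

# Symmetric and indefinite: the eigenvalues straddle zero, as in the
# docstring's diagonal example, so MINRES applies where CG would not.
n = 11
A = np.diag(np.linspace(-5.0, 5.0, n) + 0.5)
b = np.ones(n)

x, info = scipy_minres(A, b, maxiter=50)
print(info)                        # 0 means the iteration converged
print(np.linalg.norm(A @ x - b))   # residual norm of the returned solution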
def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo, phi, derphi, phi0, derphi0, c1, c2, n_iters=10, profile=False): """ WRITEME Part of the optimization algorithm in `scalar_search_wolfe2`. Parameters ---------- a_lo : float Step size a_hi : float Step size phi_lo : float Value of f at a_lo phi_hi : float Value of f at a_hi derphi_lo : float Value of derivative at a_lo phi : callable Generates computational graph derphi : callable Generates computational graph phi0 : float Value of f at 0 derphi0 : float Value of the derivative at 0 c1 : float Wolfe parameter c2 : float Wolfe parameter profile : bool True if you want printouts of profiling information """ # Function reprensenting the computations of one step of the while loop def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi, phi_lo, derphi_lo, a_star, val_star, valprime): # interpolate to find a trial step length between a_lo and # a_hi Need to choose interpolation here. Use cubic # interpolation and then if the result is within delta * # dalpha or outside of the interval bounded by a_lo or a_hi # then use quadratic interpolation, if the result is still too # close, then use bisection dalpha = a_hi - a_lo a = TT.switch(dalpha < zero, a_hi, a_lo) b = TT.switch(dalpha < zero, a_lo, a_hi) # minimizer of cubic interpolant # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi) # # if the result is too close to the end points (or out of the # interval) then use quadratic interpolation with phi_lo, # derphi_lo and phi_hi if the result is stil too close to the # end points (or out of the interval) then use bisection # cubic interpolation cchk = delta1 * dalpha a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi, a_rec, phi_rec) # quadric interpolation qchk = delta2 * dalpha a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi) cond_q = lazy_or('condq', TT.isnan(a_j_quad), a_j_quad > b - qchk, a_j_quad < a + qchk) a_j_quad = TT.switch(cond_q, a_lo + numpy.asarray(0.5, dtype=theano.config.floatX) * \ dalpha, a_j_quad) # pick between the two .. 
cond_c = lazy_or('condc', TT.isnan(a_j_cubic), TT.bitwise_or(a_j_cubic > b - cchk, a_j_cubic < a + cchk)) # this lazy if actually decides if we need to run the quadric # interpolation a_j = TT.switch(cond_c, a_j_quad, a_j_cubic) #a_j = ifelse(cond_c, a_j_quad, a_j_cubic) # Check new value of a_j phi_aj = phi(a_j) derphi_aj = derphi(a_j) stop = lazy_and('stop', TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0, phi_aj < phi_lo), abs(derphi_aj) <= -c2 * derphi0) cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0, phi_aj >= phi_lo) cond2 = derphi_aj * (a_hi - a_lo) >= zero # Switches just make more sense here because they have a C # implementation and they get composed phi_rec = ifelse(cond1, phi_hi, TT.switch(cond2, phi_hi, phi_lo), name='phi_rec') a_rec = ifelse(cond1, a_hi, TT.switch(cond2, a_hi, a_lo), name='a_rec') a_hi = ifelse(cond1, a_j, TT.switch(cond2, a_lo, a_hi), name='a_hi') phi_hi = ifelse(cond1, phi_aj, TT.switch(cond2, phi_lo, phi_hi), name='phi_hi') a_lo = TT.switch(cond1, a_lo, a_j) phi_lo = TT.switch(cond1, phi_lo, phi_aj) derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo') a_star = a_j val_star = phi_aj valprime = ifelse(cond1, nan, TT.switch(cond2, derphi_aj, nan), name='valprime') return ([phi_rec, a_rec, a_lo, a_hi, phi_hi, phi_lo, derphi_lo, a_star, val_star, valprime], theano.scan_module.scan_utils.until(stop)) maxiter = n_iters # cubic interpolant check delta1 = TT.constant(numpy.asarray(0.2, dtype=theano.config.floatX)) # quadratic interpolant check delta2 = TT.constant(numpy.asarray(0.1, dtype=theano.config.floatX)) phi_rec = phi0 a_rec = zero # Initial iteration dalpha = a_hi - a_lo a = TT.switch(dalpha < zero, a_hi, a_lo) b = TT.switch(dalpha < zero, a_lo, a_hi) #a = ifelse(dalpha < 0, a_hi, a_lo) #b = ifelse(dalpha < 0, a_lo, a_hi) # minimizer of cubic interpolant # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi) # # if the result is too close to the end points (or out of the # interval) then use quadratic interpolation with phi_lo, # derphi_lo and phi_hi if the result is stil too close to the # end points (or out of the interval) then use bisection # quadric interpolation qchk = delta2 * dalpha a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi) cond_q = lazy_or('mcond_q', TT.isnan(a_j), TT.bitwise_or(a_j > b - qchk, a_j < a + qchk)) a_j = TT.switch(cond_q, a_lo + numpy.asarray(0.5, dtype=theano.config.floatX) * \ dalpha, a_j) # Check new value of a_j phi_aj = phi(a_j) derphi_aj = derphi(a_j) cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0, phi_aj >= phi_lo) cond2 = derphi_aj * (a_hi - a_lo) >= zero # Switches just make more sense here because they have a C # implementation and they get composed phi_rec = ifelse(cond1, phi_hi, TT.switch(cond2, phi_hi, phi_lo), name='mphirec') a_rec = ifelse(cond1, a_hi, TT.switch(cond2, a_hi, a_lo), name='marec') a_hi = ifelse(cond1, a_j, TT.switch(cond2, a_lo, a_hi), name='mahi') phi_hi = ifelse(cond1, phi_aj, TT.switch(cond2, phi_lo, phi_hi), name='mphihi') onlyif = lazy_and('only_if', TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0, phi_aj < phi_lo), abs(derphi_aj) <= -c2 * derphi0) a_lo = TT.switch(cond1, a_lo, a_j) phi_lo = TT.switch(cond1, phi_lo, phi_aj) derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo_main') phi_rec.name = 'phi_rec' a_rec.name = 'a_rec' a_lo.name = 'a_lo' a_hi.name = 'a_hi' phi_hi.name = 'phi_hi' phi_lo.name = 'phi_lo' derphi_lo.name = 'derphi_lo' vderphi_aj = ifelse(cond1, nan, TT.switch(cond2, derphi_aj, nan), name='vderphi_aj') 
states = [] states += [TT.unbroadcast(TT.shape_padleft(phi_rec), 0)] states += [TT.unbroadcast(TT.shape_padleft(a_rec), 0)] states += [TT.unbroadcast(TT.shape_padleft(a_lo), 0)] states += [TT.unbroadcast(TT.shape_padleft(a_hi), 0)] states += [TT.unbroadcast(TT.shape_padleft(phi_hi), 0)] states += [TT.unbroadcast(TT.shape_padleft(phi_lo), 0)] states += [TT.unbroadcast(TT.shape_padleft(derphi_lo), 0)] states += [TT.unbroadcast(TT.shape_padleft(zero), 0)] states += [TT.unbroadcast(TT.shape_padleft(zero), 0)] states += [TT.unbroadcast(TT.shape_padleft(zero), 0)] # print'while_zoom' outs, updates = scan(while_zoom, states=states, n_steps=maxiter, name='while_zoom', mode=theano.Mode(linker='cvm_nogc'), profile=profile) # print 'done_while' a_star = ifelse(onlyif, a_j, outs[7][0], name='astar') val_star = ifelse(onlyif, phi_aj, outs[8][0], name='valstar') valprime = ifelse(onlyif, vderphi_aj, outs[9][0], name='valprime') ## WARNING !! I ignore updates given by scan which I should not do !!! return a_star, val_star, valprime
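The zoom step tries a cubic interpolant first, falls back to a quadratic one, and finally to bisection. The quadratic fallback (`_quadmin`) has a closed form: fit q(x) through f(a), f'(a) and f(b) and return its minimizer. A plain-Python sketch of that formula; the name `quadmin` is illustrative.

def quadmin(a, fa, fpa, b, fb):
    # Minimizer of the quadratic with q(a) = fa, q'(a) = fpa, q(b) = fb:
    #   x* = a - fpa * (b - a)**2 / (2 * (fb - fa - fpa * (b - a)))
    d = b - a
    denom = 2.0 * (fb - fa - fpa * d)
    if denom == 0.0:
        return None
    return a - fpa * d * d / denom

# f(x) = (x - 2)**2: the minimizer is recovered exactly from f(0), f'(0), f(1).
print(quadmin(0.0, 4.0, -4.0, 1.0, 1.0))   # -> 2.0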
def __init__(self, options, channel, data, model): """ Parameters: options: Dictionary `options` is expected to contain the following keys: `cbs` -> int Number of samples to consider at a time when computing some property of the model `gbs` -> int Number of samples over which to compute the gradients `ebs` -> int Number of samples over which to evaluate the training error `seed` -> int Random number generator seed `profile` -> bool Flag, if profiling should be on or not `verbose` -> int Verbosity level `lr` -> float Learning rate channel: jobman channel or None data: dictionary-like object return by numpy.load containing the data model : model """ self.model = model # push dataset into shared var n_params = len(model.params) xdata = theano.shared(data['train_x'].astype('float32'), name='xdata') # ! This works for 1 of k classification ydata = TT.cast( theano.shared(data['train_y'].astype('float32'), name='ydata'), 'int32') shared_data = [xdata, ydata] self.xdata = xdata self.ydata = ydata # all sorts of indices self.rng = numpy.random.RandomState(options['seed']) n_samples = data['train_x'].shape[0] self.grad_batches = n_samples // options['gbs'] self.metric_batches = n_samples // options['mbs'] self.eval_batches = n_samples // options['ebs'] self.verbose = options['verbose'] # vars for gradients # Store Euclidean gradients self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape] # Store riemannian gradients (H^-1*g) self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape] self.permg = self.rng.permutation(self.grad_batches) self.permr = self.rng.permutation(self.metric_batches) self.perme = self.rng.permutation(self.eval_batches) self.k = 0 self.posg = 0 self.posr = 0 self.pose = 0 # Step 1. Compile function for computing eucledian gradients gbdx = TT.iscalar('grad_batch_idx') print 'Constructing grad function' loc_inputs = [x.type() for x in model.inputs] def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)] return [args[0] + const(1)] + \ nw_gs ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0) for shp in model.params_shape] idx0 = TT.unbroadcast(const([0]),0) n_steps = options['gbs'] // options['cbs'] rvals, updates = scan(grad_step, states=[idx0] + ig, n_steps=n_steps, name='grad_loop', profile=options['profile']) nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]] updates.update(dict(zip(self.gs, nw_gs))) grad_inps = [(x, y[gbdx*options['gbs']:(gbdx+1)*options['gbs']]) for x,y in zip(loc_inputs, shared_data)] print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gbdx], [], updates=updates, givens=dict(grad_inps), on_unused_input='warn', name='compute_eucledian_gradients', mode=theano.Mode(linker='cvm'), profile=options['profile']) # Step 3. 
Compile function for evaluating cost and updating parameters print 'constructing evaluation function' lr = TT.scalar('lr') self.lr = numpy.float32(options['lr']) ebdx = TT.iscalar('eval_batch_idx') nw_ps = [p - lr * g for p, g in zip(model.params, self.gs)] def cost_step(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps)) nw_cost = safe_clone(model.train_cost, replace=replace) return [_idx + const(1), acc + nw_cost] acc0 = const([0]) idx0 = const([0]) n_steps = options['ebs'] // options['cbs'] rvals, updates = scan(cost_step, states=[idx0, acc0], n_steps=n_steps, name='cost_loop', profile=options['profile']) final_cost = rvals[1] / const(n_steps) update_vals = dict(zip(model.params, nw_ps)) #updates.update(dict(zip(model.params, nw_ps))) grad_inps = [(x, y[ebdx * options['ebs']: (ebdx + 1) * options['ebs']]) for x,y in zip(loc_inputs, shared_data)] print 'compiling evaluation function' self.eval_fn = theano.function( [ebdx, lr], final_cost, givens=dict(grad_inps), updates=updates, on_unused_input='warn', name='eval_fn', mode=theano.Mode(linker='cvm'), profile=options['profile']) self.update_params = theano.function( [lr], [], updates=update_vals, on_unused_input='warn', #givens=dict(grad_inps), name='update_params', mode=theano.Mode(linker='cvm'), profile=options['profile']) self.options = options self.old_cost = 1e6 n_steps = options['ebs'] // options['cbs'] def ls_error(_idx, acc, acc_train_cost): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = TT.cast(safe_clone(model.err, replace=replace),'float32') train_cost = TT.cast(safe_clone(model.train_cost, replace=replace), 'float32') return [_idx + const(1), acc + nw_cost, acc_train_cost + train_cost] states = [TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0]))] rvals, _ = scan(ls_error, states = states, n_steps = n_steps, name='ls_err_step', mode=theano.Mode(linker='cvm'), profile = options['profile']) ferr = rvals[1][0] / const(n_steps) ftrain_cost = rvals[2][0] / const(n_steps) self.compute_error = theano.function([ebdx], [ferr, ftrain_cost], givens=dict(grad_inps), name='compute_err', on_unused_input='warn', mode=theano.Mode(linker='cvm'), profile=options['profile'])
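Step 1 above accumulates gradients the same way Step 3 accumulates the cost: a scan walks over `gbs // cbs` chunks, adds each chunk's gradient into a zero-initialised buffer per parameter, and divides by the number of steps before storing the result in `self.gs`. A numpy sketch of that accumulation; the `grad_fn` callback and the helper name are illustrative.

import numpy as np

def chunked_gradient(grad_fn, params_shapes, X, y, cbs):
    # grad_fn(Xc, yc) is assumed to return one gradient array per parameter.
    buffers = [np.zeros(shp, dtype='float32') for shp in params_shapes]
    n_steps = len(X) // cbs
    for i in range(n_steps):
        sl = slice(i * cbs, (i + 1) * cbs)
        for buf, g in zip(buffers, grad_fn(X[sl], y[sl])):
            buf += g
    return [buf / n_steps for buf in buffers]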
def scalar_search_wolfe2(phi, derphi, phi0=None, old_phi0=None, derphi0=None, n_iters = 20, c1=1e-4, c2=0.9, mode=theano.Mode(linker='cvm'), profile = False): """Find alpha that satisfies strong Wolfe conditions. alpha > 0 is assumed to be a descent direction. Parameters ---------- phi : callable f(x) Objective scalar function. derphi : callable f'(x) Objective function derivative (can be None) phi0 : float, optional Value of phi at s=0 old_phi0 : float, optional Value of phi at previous point derphi0 : float, optional Value of derphi at s=0 c1 : float Parameter for Armijo condition rule. c2 : float Parameter for curvature condition rule. profile : flag (boolean) True if you want printouts of profiling information Returns ------- alpha_star : float Best alpha phi_star phi at alpha_star phi0 phi at 0 derphi_star derphi at alpha_star Notes ----- Uses the line search algorithm to enforce strong Wolfe conditions. See Wright and Nocedal, 'Numerical Optimization', 1999, pg. 59-60. For the zoom phase it uses an algorithm by [...]. """ if phi0 is None: phi0 = phi(zero) else: phi0 = phi0 if derphi0 is None and derphi is not None: derphi0 = derphi(zero) else: derphi0 = derphi0 alpha0 = zero alpha0.name ='alpha0' if old_phi0 is not None: alpha1 = TT.minimum(one, numpy.asarray(1.01, dtype=theano.config.floatX)* \ numpy.asarray(2, dtype=theano.config.floatX)*(phi0 - old_phi0)/derphi0) else: old_phi0 = nan alpha1 = one alpha1 = TT.switch(alpha1 < zero, one, alpha1) alpha1.name = 'alpha1' # This shouldn't happen. Perhaps the increment has slipped below # machine precision? For now, set the return variables skip the # useless while loop, and raise warnflag=2 due to possible imprecision. phi0 = TT.switch(TT.eq(alpha1, zero), old_phi0, phi0) # I need a lazyif for alpha1 == 0 !!! 
phi_a1 = ifelse(TT.eq(alpha1,zero), phi0, phi(alpha1), name='phi_a1') phi_a1.name = 'phi_a1' phi_a0 = phi0 phi_a0.name = 'phi_a0' derphi_a0 = derphi0 derphi_a0.name = 'derphi_a0' # Make sure variables are tensors otherwise strange things happen c1 = TT.as_tensor_variable(c1) c2 = TT.as_tensor_variable(c2) maxiter = n_iters def while_search(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, i_t, alpha_star, phi_star, derphi_star): derphi_a1 = derphi(alpha1) cond1 = TT.bitwise_or(phi_a1 > phi0 + c1*alpha1*derphi0, TT.bitwise_and(phi_a1 >= phi_a0, i_t > zero)) cond2 = abs(derphi_a1) <= -c2*derphi0 cond3 = derphi_a1 >= zero alpha_star_c1, phi_star_c1, derphi_star_c1 = \ _zoom(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, phi, derphi, phi0, derphi0, c1,c2, profile = profile, mode=mode) alpha_star_c3, phi_star_c3, derphi_star_c3 = \ _zoom(alpha1, alpha0, phi_a1, phi_a0, derphi_a1, phi, derphi, phi0, derphi0, c1,c2, profile = profile, mode=mode) nw_alpha1 = alpha1 * numpy.asarray(2, dtype=theano.config.floatX) nw_phi = phi(nw_alpha1) alpha_star, phi_star, derphi_star = \ ifelse(cond1, (alpha_star_c1, phi_star_c1, derphi_star_c1), ifelse(cond2, (alpha1, phi_a1, derphi_a1), ifelse(cond3, (alpha_star_c3, phi_star_c3, derphi_star_c3), (nw_alpha1, nw_phi, nan), name = 'alphastar_c3'), name = 'alphastar_c2'), name ='alphastar_c1') return ( [alpha1, nw_alpha1, phi_a1, ifelse(lazy_or('allconds',cond1, cond2, cond3), phi_a1, nw_phi, name='nwphi1'), ifelse(cond1, derphi_a0, derphi_a1, name='derphi'), i_t + one, alpha_star, phi_star, derphi_star], theano.scan_module.scan_utils.until( lazy_or('until_cond_',TT.eq(nw_alpha1,zero), cond1, cond2, cond3))) states = [] states += [TT.unbroadcast(TT.shape_padleft(alpha0),0)] states += [TT.unbroadcast(TT.shape_padleft(alpha1),0)] states += [TT.unbroadcast(TT.shape_padleft(phi_a0),0)] states += [TT.unbroadcast(TT.shape_padleft(phi_a1),0)] states += [TT.unbroadcast(TT.shape_padleft(derphi_a0), 0)] # i_t states += [TT.unbroadcast(TT.shape_padleft(zero),0)] # alpha_star states += [TT.unbroadcast(TT.shape_padleft(zero),0)] # phi_star states += [TT.unbroadcast(TT.shape_padleft(zero),0)] # derphi_star states += [TT.unbroadcast(TT.shape_padleft(zero),0)] print 'while_search' outs, updates = scan(while_search, states = states, n_steps = maxiter, name = 'while_search', mode = mode, profile = profile) print 'done_while_search' out3 = outs[-3][0] out2 = outs[-2][0] out1 = outs[-1][0] alpha_star, phi_star, derphi_star = \ ifelse(TT.eq(alpha1, zero), ( nan,phi0, nan), ( out3, out2, out1), name = 'main_alphastar') return alpha_star, phi_star, phi0, derphi_star
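`scalar_search_wolfe2` is a symbolic port of the Wright & Nocedal line search; the conditions it enforces are the usual strong Wolfe pair, phi(a) <= phi(0) + c1*a*phi'(0) and |phi'(a)| <= -c2*phi'(0). scipy ships the reference (non-symbolic) implementation, which is handy for checking behaviour on a toy problem. A usage sketch, assuming scipy is available:

import numpy as np
from scipy.optimize import line_search

f = lambda x: float(np.dot(x, x))     # simple convex test function
fprime = lambda x: 2.0 * x

xk = np.array([2.0, -1.5])
pk = -fprime(xk)                      # a descent direction
alpha, fc, gc, new_f, old_f, new_slope = line_search(f, fprime, xk, pk,
                                                     c1=1e-4, c2=0.9)
print(alpha, new_f)   # a step size satisfying both strong Wolfe conditions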
def minres(compute_Av, bs, rtol=constantX(1e-6), maxit=20, Ms=None, shift=constantX(0.), maxxnorm=constantX(1e15), Acondlim=constantX(1e16), profile=0): """ minres attempts to find the minimum-length and minimum-residual-norm solution x to the system of linear equations A*x = b or least squares problem min||Ax-b||. The n-by-n coefficient matrix A must be symmetric (but need not be positive definite or invertible). The right-hand-side column vector b must have length n. Parameters: compute_Av: callable returing the symbolic expression for `Av` (the product of matrix A with some vector v). `v` should be a list of tensors, whre the vector v means the vector obtain by concatenating and flattening all tensors in v bs: list of Theano expressions. We are looking to compute `A^-1\dot bs`. rtol: Optional, real, specifies the tolerance of the method. Default is 1e-6 maxit: Optional, positive integer, specifies the maximum number of iterations. Default is 20 Ms: List of theano expression of same shape as `bs`. The method uses these to precondition with diag(Ms) shift: Optional, scalar, real or complex. Default is 0. Effectively solve the system (A - shift I) * x = b. maxxnorm real positive, maximum bound on NORM(x). Default is 1e14. Acondlim real positive, maximum bound on COND(A). Default is 1e15. show boolean, 0 to suppress outputs, 1 to show iterations. Default is 0. OUTPUTS: x list of Theano tensor representing the solution flag theano int scalar - convergence flag 0 beta1 = 0. The exact solution is x = 0. 1 A solution to (poss. singular) Ax = b found, given rtol. 2 Pseudoinverse solution for singular LS problem, given rtol. 3 A solution to (poss. singular) Ax = b found, given eps. 4 Pseudoinverse solution for singular LS problem, given eps. 5 x has converged to an eigenvector. 6 xnorm has exceeded maxxnorm. 7 Acond has exceeded Acondlim. 8 The iteration limit was reached. 9/10 It is a least squares problem but no converged solution yet. iter integer, iteration number at which x was computed: 0 <= iter <= maxit. relres real positive, the relative residual is defined as NORM(b-A*x)/(NORM(A) * NORM(x) + NORM(b)), computed recurrently here. If flag is 1 or 3, relres <= TOL. relAres real positive, the relative-NORM(Ar) := NORM(Ar) / NORM(A) --- computed recurrently here. If flag is 2 or 4, relAres <= TOL. Anorm real positive, estimate of matrix 2-norm of A. Acond real positive, estimate of condition number of A with respect to 2-norm. xnorm non-negative positive, recurrently computed NORM(x) Axnorm non-negative positive, recurrently computed NORM(A * x). REFERENCES: Sou-Cheng Choi's PhD Dissertation, Stanford University, 2006. http://www.stanford.edu/group/SOL/software.html """ if not isinstance(bs, (tuple, list)): bs = [bs] return_as_list = False else: bs = list(bs) return_as_list = True eps = constantX(1e-23) # Initialise flag = theano.shared(constantX(0.)) beta1 = sqrt_inner_product(bs) #------------------------------------------------------------------ # Set up p and v for the first Lanczos vector v1. # p = beta1 P' v1, where P = C**(-1). # v is really P' v1. #------------------------------------------------------------------ r3s = [b for b in bs] r2s = [b for b in bs] r1s = [b for b in bs] if Ms is not None: r3s = [b / m for b, m in zip(bs, Ms)] beta1 = sqrt_inner_product(r3s, bs) #------------------------------------------------------------------ ## Initialize other quantities. # Note that Anorm has been initialized by IsOpSym6. 
# ------------------------------------------------------------------ bnorm = beta1 n_params = len(bs) def loop(niter, beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn, Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm, relrnorm, relArnorml, Anorm, flag, *args): #----------------------------------------------------------------- ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,... # The general iteration is similar to the case k = 1 with v0 = 0: # # p1 = Operator * v1 - beta1 * v0, # alpha1 = v1'p1, # q2 = p2 - alpha1 * v1, # beta2^2 = q2'q2, # v2 = (1/beta2) q2. # # Again, p = betak P vk, where P = C**(-1). # .... more description needed. #----------------------------------------------------------------- xs = args[0 * n_params: 1 * n_params] r1s = args[1 * n_params: 2 * n_params] r2s = args[2 * n_params: 3 * n_params] r3s = args[3 * n_params: 4 * n_params] dls = args[4 * n_params: 5 * n_params] ds = args[5 * n_params: 6 * n_params] betal = beta beta = betan vs = [r3 / beta for r3 in r3s] r3s, upds = compute_Av(*vs) r3s = [r3 - shift * v for r3, v in zip(r3s, vs)] r3s = [TT.switch(TT.ge(niter, constantX(1.)), r3 - (beta / betal) * r1, r3) for r3, r1 in zip(r3s, r1s)] alpha = inner_product(r3s, vs) r3s = [r3 - (alpha / beta) * r2 for r3, r2 in zip(r3s, r2s)] r1s = [r2 for r2 in r2s] r2s = [r3 for r3 in r3s] if Ms is not None: r3s = [r3 / M for r3, M in zip(r3s, Ms)] betan = sqrt_inner_product(r2s, r3s) else: betan = sqrt_inner_product(r3s) pnorml = pnorm pnorm = TT.switch(TT.eq(niter, constantX(0.)), TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)), TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) + TT.sqr(beta))) #----------------------------------------------------------------- ## Apply previous rotation Qk-1 to get # [dlta_k epln_{k+1}] = [cs sn][dbar_k 0 ] # [gbar_k dbar_{k+1} ] [sn -cs][alpha_k beta_{k+1}]. 
#----------------------------------------------------------------- dbar = dbarn epln = eplnn dlta = cs * dbar + sn * alpha gbar = sn * dbar - cs * alpha eplnn = sn * betan dbarn = -cs * betan ## Compute the current plane rotation Qk gammal2 = gammal gammal = gamma cs, sn, gamma = symGivens2(gbar, betan) tau = cs * phi phi = sn * phi Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau)) # Update d dl2s = [dl for dl in dls] dls = [d for d in ds] ds = [TT.switch(TT.neq(gamma, constantX(0.)), (v - epln * dl2 - dlta * dl) / gamma, v) for v, dl2, dl in zip(vs, dl2s, dls)] d_norm = TT.switch(TT.neq(gamma, constantX(0.)), sqrt_inner_product(ds), constantX(numpy.inf)) # Update x except if it will become too big xnorml = xnorm dl2s = [x for x in xs] xs = [x + tau * d for x, d in zip(xs, ds)] xnorm = sqrt_inner_product(xs) xs = [TT.switch(TT.ge(xnorm, maxxnorm), dl2, x) for dl2, x in zip(dl2s, xs)] flag = TT.switch(TT.ge(xnorm, maxxnorm), constantX(6.), flag) # Estimate various norms rnorml = rnorm # ||r_{k-1}|| Anorml = Anorm Acondl = Acond relrnorml = relrnorm flag_no_6 = TT.neq(flag, constantX(6.)) Dnorm = TT.switch(flag_no_6, TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)), Dnorm) xnorm = TT.switch(flag_no_6, sqrt_inner_product(xs), xnorm) rnorm = TT.switch(flag_no_6, phi, rnorm) relrnorm = TT.switch(flag_no_6, rnorm / (Anorm * xnorm + bnorm), relrnorm) Tnorm = TT.switch(flag_no_6, TT.switch(TT.eq(niter, constantX(0.)), TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)), TT.sqrt(TT.sqr(Tnorm) + TT.sqr(beta) + TT.sqr(alpha) + TT.sqr(betan))), Tnorm) Anorm = TT.maximum(Anorm, pnorm) Acond = Anorm * Dnorm rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn)) Anorml = rnorml * rootl relArnorml = rootl / Anorm #--------------------------------------------------------------- # See if any of the stopping criteria are satisfied. # In rare cases, flag is already -1 from above (Abar = const*I). #--------------------------------------------------------------- epsx = Anorm * xnorm * eps epsr = Anorm * xnorm * rtol #Test for singular Hk (hence singular A) # or x is already an LS solution (so again A must be singular). 
t1 = constantX(1) + relrnorm t2 = constantX(1) + relArnorml flag = TT.switch( TT.bitwise_or(TT.eq(flag, constantX(0)), TT.eq(flag, constantX(6))), multiple_switch(TT.le(t1, constantX(1)), constantX(3), TT.le(t2, constantX(1)), constantX(4), TT.le(relrnorm, rtol), constantX(1), TT.le(Anorm, constantX(1e-20)), constantX(12), TT.le(relArnorml, rtol), constantX(10), TT.ge(epsx, beta1), constantX(5), TT.ge(xnorm, maxxnorm), constantX(6), TT.ge(niter, TT.cast(maxit, theano.config.floatX)), constantX(8), flag), flag) flag = TT.switch(TT.lt(Axnorm, rtol * Anorm * xnorm), constantX(11.), flag) return [niter + constantX(1.), beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn, Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm, relrnorm, relArnorml, Anorm, flag] + xs + r1s + r2s + r3s + dls + ds, upds, \ theano.scan_module.scan_utils.until(TT.neq(flag, 0)) states = [] # 0 niter states.append(constantX([0])) # 1 beta states.append(constantX([0])) # 2 betan states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0)) # 3 phi states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0)) # 4 Acond states.append(constantX([1])) # 5 cs states.append(constantX([-1])) # 6 dbarn states.append(constantX([0])) # 7 eplnn states.append(constantX([0])) # 8 rnorm states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0)) # 9 sn states.append(constantX([0])) # 10 Tnorm states.append(constantX([0])) # 11 rnorml states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0)) # 12 xnorm states.append(constantX([0])) # 13 Dnorm states.append(constantX([0])) # 14 gamma states.append(constantX([0])) # 15 pnorm states.append(constantX([0])) # 16 gammal states.append(constantX([0])) # 17 Axnorm states.append(constantX([0])) # 18 relrnorm states.append(constantX([1])) # 19 relArnorml states.append(constantX([1])) # 20 Anorm states.append(constantX([0])) # 21 flag states.append(constantX([0])) xs = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs] ds = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs] dls = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs] r1s = [TT.unbroadcast(TT.shape_padleft(r1), 0) for r1 in r1s] r2s = [TT.unbroadcast(TT.shape_padleft(r2), 0) for r2 in r2s] r3s = [TT.unbroadcast(TT.shape_padleft(r3), 0) for r3 in r3s] rvals, loc_updates = scan( loop, states=states + xs + r1s + r2s + r3s + dls + ds, n_steps=maxit + numpy.int32(1), name='minres', profile=profile, mode=theano.Mode(linker='cvm')) assert isinstance(loc_updates, dict) and 'Ordered' in str(type(loc_updates)) niters = TT.cast(rvals[0][0], 'int32') flag = TT.cast(rvals[21][0], 'int32') relres = rvals[18][0] relAres = rvals[19][0] Anorm = rvals[20][0] Acond = rvals[4][0] xnorm = rvals[12][0] Axnorm = rvals[17][0] sol = [x[0] for x in rvals[22: 22 + n_params]] return (sol, flag, niters, relres, relAres, Anorm, Acond, xnorm, Axnorm, loc_updates)
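`symGivens2`, called inside the loop, builds the symmetric plane rotation that annihilates `betan`: given (a, b) it returns (cs, sn, gamma) with [[cs, sn], [sn, -cs]] @ [a, b] = [gamma, 0] and gamma = hypot(a, b). The numpy sketch below is one standard, overflow-safe way to compute it; the actual helper may differ in its edge-case conventions.

import numpy as np

def sym_givens2(a, b):
    # Returns (c, s, r) with [[c, s], [s, -c]] @ [a, b] = [r, 0], r >= 0.
    if b == 0.0:
        return (1.0 if a == 0.0 else np.sign(a)), 0.0, abs(a)
    if a == 0.0:
        return 0.0, np.sign(b), abs(b)
    if abs(b) > abs(a):
        t = a / b
        s = np.sign(b) / np.sqrt(1.0 + t * t)
        c = s * t
        return c, s, b / s
    t = b / a
    c = np.sign(a) / np.sqrt(1.0 + t * t)
    s = c * t
    return c, s, a / c

c, s, r = sym_givens2(3.0, -4.0)
print(np.allclose([c * 3.0 + s * -4.0, s * 3.0 - c * -4.0], [r, 0.0]), r)  # True 5.0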
def jobman(state, channel): # load dataset rng = numpy.random.RandomState(state['seed']) # declare the dimensionalies of the input and output if state['chunks'] == 'words': state['n_in'] = 10000 state['n_out'] = 10000 else: state['n_in'] = 50 state['n_out'] = 50 train_data, valid_data, test_data = get_text_data(state) ## BEGIN Tutorial ### Define Theano Input Variables x = TT.lvector('x') y = TT.lvector('y') h0 = theano.shared(numpy.zeros((eval(state['nhids'])[-1],), dtype='float32')) ### Neural Implementation of the Operators: \oplus #### Word Embedding emb_words = MultiLayer( rng, n_in=state['n_in'], n_hids=eval(state['inp_nhids']), activation=eval(state['inp_activ']), init_fn='sample_weights_classic', weight_noise=state['weight_noise'], rank_n_approx = state['rank_n_approx'], scale=state['inp_scale'], sparsity=state['inp_sparse'], learn_bias = True, bias_scale=eval(state['inp_bias']), name='emb_words') #### Deep Transition Recurrent Layer rec = eval(state['rec_layer'])( rng, eval(state['nhids']), activation = eval(state['rec_activ']), #activation = 'TT.nnet.sigmoid', bias_scale = eval(state['rec_bias']), scale=eval(state['rec_scale']), sparsity=eval(state['rec_sparse']), init_fn=eval(state['rec_init']), weight_noise=state['weight_noise'], name='rec') #### Stiching them together ##### (1) Get the embedding of a word x_emb = emb_words(x, no_noise_bias=state['no_noise_bias']) ##### (2) Embedding + Hidden State via DT Recurrent Layer reset = TT.scalar('reset') rec_layer = rec(x_emb, n_steps=x.shape[0], init_state=h0*reset, no_noise_bias=state['no_noise_bias'], truncate_gradient=state['truncate_gradient'], batch_size=1) ## BEGIN Exercise: DOT-RNN ### Neural Implementation of the Operators: \lhd #### Exercise (1) #### TODO: Define a layer from the hidden state to the intermediate layer #### Exercise (1) #### TODO: Define a layer from the input to the intermediate Layer #### Hidden State: Combine emb_state and emb_words_out #### Exercise (1) #### TODO: Define an activation layer #### Exercise (2) #### TODO: Define a dropout layer #### Softmax Layer output_layer = SoftmaxLayer( rng, eval(state['dout_nhid']), state['n_out'], scale=state['out_scale'], bias_scale=state['out_bias_scale'], init_fn="sample_weights_classic", weight_noise=state['weight_noise'], sparsity=state['out_sparse'], sum_over_time=True, name='out') ### Few Optional Things #### Direct shortcut from x to y if state['shortcut_inpout']: shortcut = MultiLayer( rng, n_in=state['n_in'], n_hids=eval(state['inpout_nhids']), activations=eval(state['inpout_activ']), init_fn='sample_weights_classic', weight_noise = state['weight_noise'], scale=eval(state['inpout_scale']), sparsity=eval(state['inpout_sparse']), learn_bias=eval(state['inpout_learn_bias']), bias_scale=eval(state['inpout_bias']), name='shortcut') #### Learning rate scheduling (1/(1+n/beta)) state['clr'] = state['lr'] def update_lr(obj, cost): stp = obj.step if isinstance(obj.state['lr_start'], int) and stp > obj.state['lr_start']: time = float(stp - obj.state['lr_start']) new_lr = obj.state['clr']/(1+time/obj.state['lr_beta']) obj.lr = new_lr if state['lr_adapt']: rec.add_schedule(update_lr) ### Neural Implementations of the Language Model #### Training if state['shortcut_inpout']: additional_inputs = [rec_layer, shortcut(x)] else: additional_inputs = [rec_layer] ##### Exercise (1): Compute the output intermediate layer ##### TODO: Compute the output intermediate layer ##### Exercise (2): Apply Dropout ##### TODO: Apply the dropout layer train_model = output_layer(outhid, 
no_noise_bias=state['no_noise_bias'], additional_inputs=additional_inputs).train(target=y, scale=numpy.float32(1./state['seqlen'])) nw_h0 = rec_layer.out[rec_layer.out.shape[0]-1] if state['carry_h0']: train_model.updates += [(h0, nw_h0)] #### Validation h0val = theano.shared(numpy.zeros((eval(state['nhids'])[-1],), dtype='float32')) rec_layer = rec(emb_words(x, use_noise=False), n_steps = x.shape[0], batch_size=1, init_state=h0val*reset, use_noise=False) nw_h0 = rec_layer.out[rec_layer.out.shape[0]-1] ##### Exercise (1): ##### TODO: Compute the output intermediate layer ##### Exercise (2): Apply Dropout ##### TODO: Apply the dropout layer without noise if state['shortcut_inpout']: additional_inputs=[rec_layer, shortcut(x, use_noise=False)] else: additional_inputs=[rec_layer] valid_model = output_layer(outhid, additional_inputs=additional_inputs, use_noise=False).validate(target=y, sum_over_time=True) valid_updates = [] if state['carry_h0']: valid_updates = [(h0val, nw_h0)] valid_fn = theano.function([x,y, reset], valid_model.out, name='valid_fn', updates=valid_updates) #### Sampling ##### single-step sampling def sample_fn(word_tm1, h_tm1): x_emb = emb_words(word_tm1, use_noise = False, one_step=True) h0 = rec(x_emb, state_before=h_tm1, one_step=True, use_noise=False)[-1] outhid = outhid_dropout(outhid_activ(emb_state(h0, use_noise=False, one_step=True) + emb_words_out(word_tm1, use_noise=False, one_step=True), one_step=True), use_noise=False, one_step=True) word = output_layer.get_sample(state_below=outhid, additional_inputs=[h0], temp=1.) return word, h0 ##### scan for iterating the single-step sampling multiple times [samples, summaries], updates = scan(sample_fn, states = [ TT.alloc(numpy.int64(0), state['sample_steps']), TT.alloc(numpy.float32(0), 1, eval(state['nhids'])[-1])], n_steps= state['sample_steps'], name='sampler_scan') ##### build a Theano function for sampling sample_fn = theano.function([], [samples], updates=updates, profile=False, name='sample_fn') ##### Load a dictionary dictionary = numpy.load(state['dictionary']) if state['chunks'] == 'chars': dictionary = dictionary['unique_chars'] else: dictionary = dictionary['unique_words'] def hook_fn(): sample = sample_fn()[0] print 'Sample:', if state['chunks'] == 'chars': print "".join(dictionary[sample]) else: for si in sample: print dictionary[si], print ### Build and Train a Model #### Define a model model = LM_Model( cost_layer = train_model, weight_noise_amount=state['weight_noise_amount'], valid_fn = valid_fn, clean_before_noise_fn = False, noise_fn = None, rng = rng) if state['reload']: model.load(state['prefix']+'model.npz') #### Define a trainer ##### Training algorithm (SGD) if state['moment'] < 0: algo = SGD(model, state, train_data) else: algo = SGD_m(model, state, train_data) ##### Main loop of the trainer main = MainLoop(train_data, valid_data, test_data, model, algo, state, channel, train_cost = False, hooks = hook_fn, validate_postprocess = eval(state['validate_postprocess'])) ## Run! main.main()
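The sampler above scans `sample_fn` for `sample_steps` iterations, feeding each sampled word and hidden state back in as the next step's input. Stripped of the Theano machinery, the feedback loop looks like the sketch below; the `step_fn` callback and the toy transition table are invented purely for illustration.

import numpy as np

def sample_sequence(step_fn, h0, n_steps, rng):
    # step_fn(word, h) is assumed to return (probabilities, new_hidden).
    words, h, word = [], h0, 0
    for _ in range(n_steps):
        probs, h = step_fn(word, h)
        word = rng.choice(len(probs), p=probs)
        words.append(word)
    return words

# Toy step function over a fixed 3-symbol vocabulary (no hidden state used).
rng = np.random.RandomState(0)
T = rng.dirichlet(np.ones(3), size=3)          # row-stochastic "transitions"
toy_step = lambda w, h: (T[w], h)
print(sample_sequence(toy_step, None, 10, rng))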
def minres(compute_Av, bs, rtol=constantX(1e-6), maxit=20, Ms=None, shift=constantX(0.), maxxnorm=constantX(1e15), Acondlim=constantX(1e16), profile=0): """ minres attempts to find the minimum-length and minimum-residual-norm solution x to the system of linear equations A*x = b or least squares problem min||Ax-b||. The n-by-n coefficient matrix A must be symmetric (but need not be positive definite or invertible). The right-hand-side column vector b must have length n. Parameters: compute_Av: callable returing the symbolic expression for `Av` (the product of matrix A with some vector v). `v` should be a list of tensors, whre the vector v means the vector obtain by concatenating and flattening all tensors in v bs: list of Theano expressions. We are looking to compute `A^-1\dot bs`. rtol: Optional, real, specifies the tolerance of the method. Default is 1e-6 maxit: Optional, positive integer, specifies the maximum number of iterations. Default is 20 Ms: List of theano expression of same shape as `bs`. The method uses these to precondition with diag(Ms) shift: Optional, scalar, real or complex. Default is 0. Effectively solve the system (A - shift I) * x = b. maxxnorm real positive, maximum bound on NORM(x). Default is 1e14. Acondlim real positive, maximum bound on COND(A). Default is 1e15. show boolean, 0 to suppress outputs, 1 to show iterations. Default is 0. OUTPUTS: x list of Theano tensor representing the solution flag theano int scalar - convergence flag 0 beta1 = 0. The exact solution is x = 0. 1 A solution to (poss. singular) Ax = b found, given rtol. 2 Pseudoinverse solution for singular LS problem, given rtol. 3 A solution to (poss. singular) Ax = b found, given eps. 4 Pseudoinverse solution for singular LS problem, given eps. 5 x has converged to an eigenvector. 6 xnorm has exceeded maxxnorm. 7 Acond has exceeded Acondlim. 8 The iteration limit was reached. 9/10 It is a least squares problem but no converged solution yet. iter integer, iteration number at which x was computed: 0 <= iter <= maxit. relres real positive, the relative residual is defined as NORM(b-A*x)/(NORM(A) * NORM(x) + NORM(b)), computed recurrently here. If flag is 1 or 3, relres <= TOL. relAres real positive, the relative-NORM(Ar) := NORM(Ar) / NORM(A) --- computed recurrently here. If flag is 2 or 4, relAres <= TOL. Anorm real positive, estimate of matrix 2-norm of A. Acond real positive, estimate of condition number of A with respect to 2-norm. xnorm non-negative positive, recurrently computed NORM(x) Axnorm non-negative positive, recurrently computed NORM(A * x). REFERENCES: Sou-Cheng Choi's PhD Dissertation, Stanford University, 2006. http://www.stanford.edu/group/SOL/software.html """ if not isinstance(bs, (tuple, list)): bs = [bs] return_as_list = False else: bs = list(bs) return_as_list = True eps = constantX(1e-23) # Initialise beta1 = sqrt_inner_product(bs) #------------------------------------------------------------------ # Set up p and v for the first Lanczos vector v1. # p = beta1 P' v1, where P = C**(-1). # v is really P' v1. #------------------------------------------------------------------ r3s = [b for b in bs] r2s = [b for b in bs] r1s = [b for b in bs] if Ms is not None: r3s = [b / m for b, m in zip(bs, Ms)] beta1 = sqrt_inner_product(r3s, bs) #------------------------------------------------------------------ ## Initialize other quantities. # Note that Anorm has been initialized by IsOpSym6. 
# ------------------------------------------------------------------ bnorm = beta1 n_params = len(bs) def loop(niter, beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn, Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm, relrnorm, relArnorml, Anorm, flag, *args): #----------------------------------------------------------------- ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,... # The general iteration is similar to the case k = 1 with v0 = 0: # # p1 = Operator * v1 - beta1 * v0, # alpha1 = v1'p1, # q2 = p2 - alpha1 * v1, # beta2^2 = q2'q2, # v2 = (1/beta2) q2. # # Again, p = betak P vk, where P = C**(-1). # .... more description needed. #----------------------------------------------------------------- xs = args[0 * n_params:1 * n_params] r1s = args[1 * n_params:2 * n_params] r2s = args[2 * n_params:3 * n_params] r3s = args[3 * n_params:4 * n_params] dls = args[4 * n_params:5 * n_params] ds = args[5 * n_params:6 * n_params] betal = beta beta = betan vs = [r3 / beta for r3 in r3s] r3s, upds = compute_Av(*vs) r3s = [r3 - shift * v for r3, v in zip(r3s, vs)] r3s = [ TT.switch(TT.ge(niter, constantX(1.)), r3 - (beta / betal) * r1, r3) for r3, r1 in zip(r3s, r1s) ] alpha = inner_product(r3s, vs) r3s = [r3 - (alpha / beta) * r2 for r3, r2 in zip(r3s, r2s)] r1s = [r2 for r2 in r2s] r2s = [r3 for r3 in r3s] if Ms is not None: r3s = [r3 / M for r3, M in zip(r3s, Ms)] betan = sqrt_inner_product(r2s, r3s) else: betan = sqrt_inner_product(r3s) pnorml = pnorm pnorm = TT.switch( TT.eq(niter, constantX(0.)), TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)), TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) + TT.sqr(beta))) #----------------------------------------------------------------- ## Apply previous rotation Qk-1 to get # [dlta_k epln_{k+1}] = [cs sn][dbar_k 0 ] # [gbar_k dbar_{k+1} ] [sn -cs][alpha_k beta_{k+1}]. 
#----------------------------------------------------------------- dbar = dbarn epln = eplnn dlta = cs * dbar + sn * alpha gbar = sn * dbar - cs * alpha eplnn = sn * betan dbarn = -cs * betan ## Compute the current plane rotation Qk gammal2 = gammal gammal = gamma cs, sn, gamma = symGivens2(gbar, betan) tau = cs * phi phi = sn * phi Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau)) # Update d dl2s = [dl for dl in dls] dls = [d for d in ds] ds = [ TT.switch(TT.neq(gamma, constantX(0.)), (v - epln * dl2 - dlta * dl) / gamma, v) for v, dl2, dl in zip(vs, dl2s, dls) ] d_norm = TT.switch(TT.neq(gamma, constantX(0.)), sqrt_inner_product(ds), constantX(numpy.inf)) # Update x except if it will become too big xnorml = xnorm dl2s = [x for x in xs] xs = [x + tau * d for x, d in zip(xs, ds)] xnorm = sqrt_inner_product(xs) xs = [ TT.switch(TT.ge(xnorm, maxxnorm), dl2, x) for dl2, x in zip(dl2s, xs) ] flag = TT.switch(TT.ge(xnorm, maxxnorm), constantX(6.), flag) # Estimate various norms rnorml = rnorm # ||r_{k-1}|| Anorml = Anorm Acondl = Acond relrnorml = relrnorm flag_no_6 = TT.neq(flag, constantX(6.)) Dnorm = TT.switch(flag_no_6, TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)), Dnorm) xnorm = TT.switch(flag_no_6, sqrt_inner_product(xs), xnorm) rnorm = TT.switch(flag_no_6, phi, rnorm) relrnorm = TT.switch(flag_no_6, rnorm / (Anorm * xnorm + bnorm), relrnorm) Tnorm = TT.switch( flag_no_6, TT.switch( TT.eq(niter, constantX(0.)), TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)), TT.sqrt( TT.sqr(Tnorm) + TT.sqr(beta) + TT.sqr(alpha) + TT.sqr(betan))), Tnorm) Anorm = TT.maximum(Anorm, pnorm) Acond = Anorm * Dnorm rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn)) Anorml = rnorml * rootl relArnorml = rootl / Anorm #--------------------------------------------------------------- # See if any of the stopping criteria are satisfied. # In rare cases, flag is already -1 from above (Abar = const*I). #--------------------------------------------------------------- epsx = Anorm * xnorm * eps epsr = Anorm * xnorm * rtol #Test for singular Hk (hence singular A) # or x is already an LS solution (so again A must be singular). 
t1 = constantX(1) + relrnorm t2 = constantX(1) + relArnorml flag = TT.switch( TT.bitwise_or(TT.eq(flag, constantX(0)), TT.eq(flag, constantX(6))), multiple_switch(TT.le(t1, constantX(1)), constantX(3), TT.le(t2, constantX(1)), constantX(4), TT.le(relrnorm, rtol), constantX(1), TT.le(Anorm, constantX(1e-20)), constantX(12), TT.le(relArnorml, rtol), constantX(10), TT.ge(epsx, beta1), constantX(5), TT.ge(xnorm, maxxnorm), constantX(6), TT.ge(niter, TT.cast(maxit, theano.config.floatX)), constantX(8), flag), flag) flag = TT.switch(TT.lt(Axnorm, rtol * Anorm * xnorm), constantX(11.), flag) return [niter + constantX(1.), beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn, Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm, relrnorm, relArnorml, Anorm, flag] + xs + r1s + r2s + r3s + dls + ds, upds, \ theano.scan_module.scan_utils.until(TT.neq(flag, 0)) states = [] # 0 niter states.append(constantX([0])) # 1 beta states.append(constantX([0])) # 2 betan states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0)) # 3 phi states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0)) # 4 Acond states.append(constantX([1])) # 5 cs states.append(constantX([-1])) # 6 dbarn states.append(constantX([0])) # 7 eplnn states.append(constantX([0])) # 8 rnorm states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0)) # 9 sn states.append(constantX([0])) # 10 Tnorm states.append(constantX([0])) # 11 rnorml states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0)) # 12 xnorm states.append(constantX([0])) # 13 Dnorm states.append(constantX([0])) # 14 gamma states.append(constantX([0])) # 15 pnorm states.append(constantX([0])) # 16 gammal states.append(constantX([0])) # 17 Axnorm states.append(constantX([0])) # 18 relrnorm states.append(constantX([1])) # 19 relArnorml states.append(constantX([1])) # 20 Anorm states.append(constantX([0])) # 21 flag states.append(constantX([0])) xs = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs] ds = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs] dls = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs] r1s = [TT.unbroadcast(TT.shape_padleft(r1), 0) for r1 in r1s] r2s = [TT.unbroadcast(TT.shape_padleft(r2), 0) for r2 in r2s] r3s = [TT.unbroadcast(TT.shape_padleft(r3), 0) for r3 in r3s] rvals, loc_updates = scan(loop, states=states + xs + r1s + r2s + r3s + dls + ds, n_steps=maxit + numpy.int32(1), name='minres', profile=profile, mode=theano.Mode(linker='cvm')) assert isinstance(loc_updates, dict) and 'Ordered' in str( type(loc_updates)) niters = TT.cast(rvals[0][0], 'int32') flag = TT.cast(rvals[21][0], 'int32') relres = rvals[18][0] relAres = rvals[19][0] Anorm = rvals[20][0] Acond = rvals[4][0] xnorm = rvals[12][0] Axnorm = rvals[17][0] sol = [x[0] for x in rvals[22:22 + n_params]] return (sol, flag, niters, relres, relAres, Anorm, Acond, xnorm, Axnorm, loc_updates)
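Throughout the solver a "vector" is really a list of parameter-shaped tensors, and `inner_product` / `sqrt_inner_product` reduce over the whole list, i.e. they act on the concatenated-and-flattened vector. A numpy sketch of what the loop assumes those helpers compute (their exact implementation is not shown in this document):

import numpy as np

def inner_product(xs, ys=None):
    # <xs, ys> with both arguments given as lists of arrays: the dot product
    # of the concatenated-and-flattened vectors (ys defaults to xs).
    ys = xs if ys is None else ys
    return sum(float(np.sum(x * y)) for x, y in zip(xs, ys))

def sqrt_inner_product(xs, ys=None):
    # sqrt(<xs, ys>); with one argument this is the Euclidean norm of the
    # list of tensors, matching how beta1 = sqrt_inner_product(bs) is used.
    return np.sqrt(inner_product(xs, ys))

bs = [np.ones((2, 3)), np.arange(4, dtype=float)]
print(sqrt_inner_product(bs))   # sqrt(6 + 14) = sqrt(20)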
def __init__(self, options, channel, data, model): """ Parameters: options: Dictionary `options` is expected to contain the following keys: `cbs` -> int Number of samples to consider at a time when computing some property of the model `gbs` -> int Number of samples over which to compute the gradients `mbs` -> int Number of samples over which to compute the metric `ebs` -> int Number of samples over which to evaluate the training error `mreg` -> float Regularization added to the metric `mrtol` -> float Relative tolerance for inverting the metric `miters` -> int Number of iterations `seed` -> int Random number generator seed `profile` -> bool Flag, if profiling should be on or not `verbose` -> int Verbosity level `lr` -> float Learning rate channel: jobman channel or None data: dictionary-like object return by numpy.load containing the data model : model """ n_params = len(model.params) self.data = data eps = numpy.float32(1e-24) xdata = theano.shared(data['train_x'], name='xdata') ydata = theano.shared(data['train_y'], name='ydata') self.xdata = xdata self.ydata = ydata shared_data = [xdata, ydata] self.rng = numpy.random.RandomState(options['seed']) n_samples = data['train_x'].shape[0] self.grad_batches = n_samples // options['gbs'] self.metric_batches = n_samples // options['mbs'] self.eval_batches = n_samples // options['ebs'] self.verbose = options['verbose'] # Store eucledian gradients self.gs = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] # Store riemannian gradients self.rs1 = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] self.rs2 = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] # Store jacobi diagonal self.js = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] self.permg = self.rng.permutation(self.grad_batches) self.permr = self.rng.permutation(self.metric_batches) self.perme = self.rng.permutation(self.eval_batches) self.k = 0 self.posg = 0 self.posr = 0 self.pose = 0 # Step 1. 
Compile function for computing eucledian gradients gbdx = TT.iscalar('grad_batch_idx') print 'Constructing grad function' srng = RandomStreams(numpy.random.randint(1e5)) loc_inputs = [x.type() for x in model.inputs] def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)] # Compute jacobi nw_outs = safe_clone(model.outs, replace=replace) final_results = dict(zip(model.params, [None] * n_params)) for nw_out, out_operator in zip(nw_outs, model.outs_operator): if out_operator == 'sigmoid': denom = numpy.float32(options['cbs']) #denom *= nw_out #denom *= (numpy.float32(1) - nw_out) elif out_operator == 'softmax': denom = numpy.float32(options['cbs']) denom *= (nw_out + eps) else: denom = numpy.float32(options['cbs']) factor = TT.sqrt(numpy.float32(1) / denom) if out_operator == 'sigmoid': tnwout = TT.nnet.sigmoid(nw_out) factor = TT.sqrt(tnwout * (numpy.float32(1) - tnwout)) * factor r = TT.sgn(srng.normal(nw_out.shape, nstreams=128)) r = r * factor loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] jvs = TT.Lop(nw_out, loc_params, r) for lp, lj in zip(loc_params, jvs): if final_results[lp] is None: final_results[lp] = TT.sqr(lj) else: final_results[lp] = final_results[lp] + TT.sqr(lj) nw_js = [ oj + final_results[p] for oj, p in zip(args[1 + n_params:1 + 2 * n_params], model.params) ] return [args[0] + const(1)] + nw_gs + nw_js ig = [ TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0) for shp in model.params_shape ] ij = [ TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0) for shp in model.params_shape ] idx0 = TT.unbroadcast(const([0]), 0) n_steps = options['gbs'] // options['cbs'] rvals, updates = scan(grad_step, states=[idx0] + ig + ij, n_steps=n_steps, mode=gpu_mode, name='grad_loop', profile=options['profile']) nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]] nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]] updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js))) grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']]) for x, y in zip(loc_inputs, shared_data)] print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gbdx], [], updates=updates, givens=dict(grad_inps), name='compute_eucledian_gradients', mode=gpu_mode, on_unused_input='warn', profile=options['profile']) #theano.printing.pydotprint(self.compute_eucledian_gradients, # 'eucledian_grad', scan_graphs=True) self.damping = theano.shared(numpy.float32(options['mreg'])) # Step 2.1 Compile function for Computing Riemannian gradients rbdx = TT.iscalar('riemmanian_batch_idx') rbpos = rbdx * options['mbs'] mode = gpu_mode def compute_Gv(*args): idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.gf_outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.gf_outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(args, model.params) if y in 
theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * (nw_out + eps) elif out_operator == 'sigmoid': factor = const( options['cbs']) # * nw_out * (1 - nw_out) else: factor = const(options['cbs']) if out_operator != 'sigmoid': loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) else: tnwout = TT.nnet.sigmoid(nw_out) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) *\ tnwout * (1 - tnwout)/ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs nw_cost, nw_preactiv_out = safe_clone( [model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop( nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, args)) Gvs = [ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=theano.Mode(linker='cvm'), name='Gv_step', profile=options['profile']) final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]] #_final_Gvs = [x + self.damping * y # for x,y in zip(final_Gvs, args)] return final_Gvs, updates print 'Constructing riemannian gradient function' norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs)) rvals = minres.minres( compute_Gv, [x / norm_grads for x in self.gs], #Ms = self.js, rtol=options['mrtol'], shift=self.damping, maxit=options['miters'], mode=mode, profile=options['profile']) nw_rs = [x * norm_grads for x in rvals[0]] flag = rvals[1] niters = rvals[2] rel_residual = rvals[3] rel_Aresidual = rvals[4] Anorm = rvals[5] Acond = rvals[6] xnorm = rvals[7] Axnorm = rvals[8] updates = rvals[9] norm_ord0 = TT.max(abs(nw_rs[0])) for r in nw_rs[1:]: norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r))) updates.update(dict(zip(self.rs1, nw_rs))) grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']]) for x, y in zip(loc_inputs, shared_data)] print 'Compiling riemannian gradient function' self.compute_riemannian_gradients1 = theano.function( [rbdx], [ flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm, Axnorm, norm_grads, norm_ord0 ], updates=updates, givens=dict(grad_inps), name='compute_riemannian_gradients', on_unused_input='warn', mode=mode, profile=options['profile']) # Step 2.2 Compile function for Computing Riemannian gradients rbpos = rbdx * options['mbs'] mode = gpu_mode def compute_Gv(*args): idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.gc_outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.gc_outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(args, model.params) if y in theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * (nw_out + eps) elif out_operator == 'sigmoid': factor = const( options['cbs']) # * nw_out * (1 - nw_out) else: factor = const(options['cbs']) if out_operator != 'sigmoid': loc_Gvs = TT.Lop(nw_out, 
loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) else: tnwout = TT.nnet.sigmoid(nw_out) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) *\ tnwout * (1 - tnwout)/ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs nw_cost, nw_preactiv_out = safe_clone( [model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop( nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, args)) Gvs = [ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=theano.Mode(linker='cvm'), name='Gv_step', profile=options['profile']) final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]] #_final_Gvs = [x + self.damping * y # for x,y in zip(final_Gvs, args)] return final_Gvs, updates print 'Constructing riemannian gradient function' norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs)) rvals = minres.minres( compute_Gv, [x / norm_grads for x in self.gs], #Ms = self.js, rtol=options['mrtol'], shift=self.damping, maxit=options['miters'], mode=mode, profile=options['profile']) nw_rs = [x * norm_grads for x in rvals[0]] flag = rvals[1] niters = rvals[2] rel_residual = rvals[3] rel_Aresidual = rvals[4] Anorm = rvals[5] Acond = rvals[6] xnorm = rvals[7] Axnorm = rvals[8] updates = rvals[9] norm_ord0 = TT.max(abs(nw_rs[0])) for r in nw_rs[1:]: norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r))) updates.update(dict(zip(self.rs2, nw_rs))) grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']]) for x, y in zip(loc_inputs, shared_data)] print 'Compiling riemannian gradient function' self.compute_riemannian_gradients2 = theano.function( [rbdx], [ flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm, Axnorm, norm_grads, norm_ord0 ], updates=updates, givens=dict(grad_inps), name='compute_riemannian_gradients', on_unused_input='warn', mode=mode, profile=options['profile']) # Step 3. 
Compile function for evaluating cost and updating # parameters print 'constructing evaluation function' if options['rsch'] == 1: self.rs = self.rs1 else: self.rs = self.rs2 lr = TT.scalar('lr') self.lr = numpy.float32(options['lr']) ebdx = TT.iscalar('eval_batch_idx') nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)] def cost_step(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps)) nw_cost = safe_clone(model.train_cost, replace=replace) return [_idx + const(1), acc + nw_cost] acc0 = const([0]) idx0 = const([0]) n_steps = options['ebs'] // options['cbs'] rvals, updates = scan(cost_step, states=[idx0, acc0], n_steps=n_steps, name='cost_loop', mode=gpu_mode, profile=options['profile']) final_cost = rvals[1].sum() / const(n_steps) grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']]) for x, y in zip(loc_inputs, shared_data)] denom = -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)]) self.approx_change = theano.function([lr], denom, name='approx_change', mode=gpu_mode, allow_input_downcast=True, profile=options['profile']) print 'compling evaluation function' self.eval_fn = theano.function([ebdx, lr], final_cost, givens=dict(grad_inps), on_unused_input='warn', updates=updates, name='eval_fn', allow_input_downcast=True, mode=gpu_mode, profile=options['profile']) def ls_grad_step(_idx, gws): idx = TT.cast(_idx, 'int32') nw_inps = [ x[idx * options['cbs']:(idx + 1) * options['cbs']] for x in loc_inputs ] replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps)) nw_cost = safe_clone(model.train_cost, replace=replace) nw_gs = TT.grad(nw_cost, lr) return _idx + numpy.float32(1), gws + nw_gs states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])) ] rvals, _ = scan(ls_grad_step, states=states, n_steps=n_steps, name='ls_grad_step', profile=options['profile']) fgrad = rvals[1][0] / const(n_steps) self.grad_lr_fn = theano.function([ebdx, lr], fgrad, givens=grad_inps, name='ls_grad_fn', on_unused_input='warn', mode=gpu_mode, allow_input_downcast=True, profile=options['profile']) update_dict = dict(zip(model.params, nw_ps)) self.update_params = theano.function([lr], [], updates=update_dict, name='update_params', on_unused_input='warn', allow_input_downcast=True, mode=mode, profile=options['profile']) self.options = options self.old_cost = numpy.inf n_steps = options['ebs'] // options['cbs'] def ls_error(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = TT.cast(safe_clone(model.err, replace=replace), 'float32') return [_idx + const(1), acc + nw_cost] states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])) ] rvals, _ = scan(ls_error, states=states, n_steps=n_steps, name='ls_err_step', mode=gpu_mode, profile=options['profile']) ferr = rvals[1][0] / const(n_steps) self.compute_error = theano.function([ebdx], ferr, givens=dict(grad_inps), name='compute_err', mode=gpu_mode, on_unused_input='warn', profile=options['profile'])
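# ---------------------------------------------------------------------
# Hedged example (an assumption, not taken from the source): a plausible
# `options` dictionary for the constructor above, covering the keys the
# docstring lists plus `jreg` and `rsch`, which the body reads. All
# values are illustrative only.
example_options = {
    'cbs': 250,        # chunk size used inside the scan loops
    'gbs': 5000,       # samples per gradient batch
    'mbs': 5000,       # samples per metric batch
    'ebs': 5000,       # samples per error-evaluation batch
    'mreg': 1e-3,      # initial damping added to the metric
    'mrtol': 1e-4,     # relative tolerance for MINRES
    'miters': 50,      # maximum MINRES iterations
    'jreg': 1e-2,      # initial value of the Jacobi diagonal estimate
    'rsch': 1,         # which riemannian gradient (rs1 or rs2) to apply
    'seed': 123,
    'profile': False,
    'verbose': 1,
    'lr': 1.0,
}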
def __init__(self, options, channel, data, model): """ Parameters: options: Dictionary `options` is expected to contain the following keys: `cbs` -> int Number of samples to consider at a time when computing some property of the model `gbs` -> int Number of samples over which to compute the gradients `mbs` -> int Number of samples over which to compute the krylov subspace `ebs` -> int Number of samples over which to evaluate the training error `seed` -> int Random number generator seed `profile` -> bool Flag, if profiling should be on or not `verbose` -> int Verbosity level `lbfgsIters' -> int `krylovDim` -> int channel: jobman channel or None data: dictionary-like object return by numpy.load containing the data model : model """ n_params = len(model.params) self.data = data xdata = theano.shared(data['train_x'], name='xdata') ydata = theano.shared(data['train_y'], name='ydata') self.xdata = xdata self.ydata = ydata shared_data = [xdata, ydata] self.rng = numpy.random.RandomState(options['seed']) n_samples = data['train_x'].shape[0] self.grad_batches = n_samples // options['gbs'] self.metric_batches = n_samples // options['mbs'] self.eval_batches = n_samples // options['ebs'] self.verbose = options['verbose'] rng = numpy.random.RandomState(options['seed']) self.rng = rng self.options = options self.channel = channel self.model = model n_dimensions = options['krylovDim'] self.n_dimensions = n_dimensions if options['device'] == 'gpu': cfn_subspaces = \ [theano.shared(numpy.zeros( (n_dimensions,) + shp, dtype='float32'), name='cfn{%s|%d}' % (str(param.name), i)) for i, (shp, param) in enumerate(zip(model.params_shape, model.params))] old_deltas = \ [theano.shared(numpy.zeros(shp, dtype='float32'), name='delta{%s|%d}' % (str(param.name), i)) for i, (shp, param) in enumerate(zip(model.params_shape, model.params))] self.gs = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] else: cfn_subspaces = \ [TT._shared(numpy.zeros( (n_dimensions,) + shp, dtype='float32'), name='cfn{%s|%d}' % (str(param.name), i)) for i, (shp, param) in enumerate(zip(model.params_shape, model.params))] old_deltas = \ [TT._shared(numpy.zeros(shp, dtype='float32'), name='delta{%s|%d}' % (str(param.name), i)) for i, (shp, param) in enumerate(zip(model.params_shape, model.params))] self.gs = [ TT._shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] self.cfn_subspaces = cfn_subspaces self.old_deltas = old_deltas self.permg = self.rng.permutation(self.grad_batches) self.permr = self.rng.permutation(self.metric_batches) self.perme = self.rng.permutation(self.eval_batches) self.k = 0 self.posg = 0 self.posr = 0 self.pose = 0 # Step 1. 
Compile function for computing eucledian gradients print 'Constructing grad function' loc_inputs = [x.type(name='locx') for x in model.inputs] def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)] return [args[0] + const(1)] + \ nw_gs ig = [ TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0) for shp in model.params_shape ] idx0 = TT.unbroadcast(const([0]), 0) n_steps = options['gbs'] // options['cbs'] rvals, updates = scan(grad_step, states=[idx0] + ig, n_steps=n_steps, name='grad_loop', mode=gpu_mode, profile=options['profile']) nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]] updates.update(dict(zip(self.gs, nw_gs))) gdx = TT.iscalar('gdx') grad_inps = zip(loc_inputs, [ x[gdx * options['gbs']:(gdx + 1) * options['gbs']] for x in shared_data ]) print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gdx], [], updates=updates, givens=dict(grad_inps), name='compute_eucledian_gradients', mode=gpu_mode, profile=options['profile']) # Step 2. Compile function for Computing Riemannian gradients if options['device'] == 'gpu': mode = gpu_mode def compute_Gv(*args): idx0 = const([0]) ep = [ TT.alloc(const(0), 1, *shp) for shp in model.params_shape ] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost, nw_preactiv_out = safe_clone( [model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop( nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, args)) Gvs = [ ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs) ] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=theano.Mode(linker='cvm'), name='Gv_step', profile=options['profile']) final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]] return final_Gvs, updates else: mode = cpu_mode def compute_Gv(*args): cgv = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX), name='cgv%d' % idx) for idx, shp in enumerate(model.params_shape) ] print_mem('allocated mem for cgv') idx0 = const([0]) ep = [ TT.alloc(const(0), 1, *shp) for shp in model.params_shape ] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost, nw_preactiv_out = safe_clone( [model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop( nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, cgv)) Gvs = [ ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs) ] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=gpu_mode, name='Gv_step', profile=options['profile']) final_Gvs = [ TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:] ] grad_inps = zip(loc_inputs, shared_data) loc_fn = theano.function([], final_Gvs, updates=updates, givens=dict(grad_inps), on_unused_input='warn', mode=gpu_mode, name='loc_fn', profile=options['profile']) fake_op = FakeGPUShell(cgv, loc_fn, len(cgv)) 
return fake_op(*args), {} rvals, updates = krylov_subspace(compute_Gv, self.gs, old_deltas, n_dimensions, model.params_shape, profile=options['profile'], device=options['device']) gdx = TT.iscalar('gdx') grad_inps = zip(loc_inputs, [ x[gdx * options['mbs']:(gdx + 1) * options['mbs']] for x in shared_data ]) updates.update(dict(zip(cfn_subspaces, rvals))) self.update_krylov_subspace = theano.function( [gdx], [], updates=updates, givens=dict(grad_inps), profile=options['profile'], on_unused_input='warn', name='update_krylov_subspace', mode=mode) alphas = tensor.vector('alphas') deltas = [] nw_params = [] if options['device'] == 'gpu': params = model.params else: params = model.cpu_params for param, subspace in zip(params, cfn_subspaces): alpha_reshuffle = [0] + ['x'] * param.ndim delta = (alphas.dimshuffle(*alpha_reshuffle) * \ subspace).sum(axis=0) nw_param = param + delta nw_params.append(nw_param) deltas.append(delta) print 'constructing evaluation function' ebdx = TT.iscalar('ebdx') updates_dict = dict(zip(model.params + old_deltas, nw_params + deltas)) if options['device'] != 'gpu': updates_dict.update(dict(zip(model.cpu_params, nw_params))) self.update_params = theano.function([alphas], updates=updates_dict, name='update_params', allow_input_downcast=True, mode=mode, profile=options['profile']) n_steps = options['ebs'] // options['cbs'] def ls_cost_step(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict( zip(model.inputs + model.params, nw_inps + nw_params)) nw_cost = safe_clone(model.train_cost, replace=replace) return [_idx + const(1), acc + nw_cost] states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])) ] rvals, _ = scan(ls_cost_step, states=states, n_steps=n_steps, name='ls_cost_step', mode=gpu_mode, profile=options['profile']) fcost = rvals[1][0] / const(n_steps) def ls_grad_step(_idx, gws): idx = TT.cast(_idx, 'int32') nw_inps = [ x[idx * options['cbs']:(idx + 1) * options['cbs']] for x in loc_inputs ] replace = dict( zip(model.inputs + model.params, nw_inps + nw_params)) nw_cost = safe_clone(model.train_cost, replace=replace) nw_gs = TT.grad(nw_cost, alphas) return _idx + numpy.float32(1), gws + nw_gs states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.zeros((1, n_dimensions), dtype='float32')) ] rvals, _ = scan(ls_grad_step, states=states, n_steps=n_steps, name='ls_grad_step', mode=gpu_mode, profile=options['profile']) fgrad = rvals[1][0] / const(n_steps) grad_inps = zip(loc_inputs, [ x[ebdx * options['ebs']:(ebdx + 1) * options['ebs']] for x in shared_data ]) self.lbfgs_fn = theano.function( [alphas, ebdx], #theano.printing.Print('fcost')(fcost), fcost, givens=grad_inps, allow_input_downcast=True, on_unused_input='warn', name='lbfgs_fn', profile=options['profile'], mode=gpu_mode) self.lbfgs_grad = theano.function([alphas, ebdx], fgrad, givens=grad_inps, on_unused_input='warn', allow_input_downcast=True, name='lbfgs_grad', profile=options['profile'], mode=gpu_mode) n_steps = options['ebs'] // options['cbs'] def ls_error(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = TT.cast(safe_clone(model.err, replace=replace), 'float32') return [_idx + const(1), acc + nw_cost] states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])) ] rvals, _ = scan(ls_error, states=states, n_steps=n_steps, name='ls_err_step', 
                       mode=cpu_mode,
                       profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function(
            [], ferr,
            givens=dict(zip(loc_inputs, shared_data)),
            name='compute_err',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])
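# ---------------------------------------------------------------------
# Hedged sketch of how this trainer might be driven (the outer training
# loop is not part of this listing, so the step order below is an
# assumption): accumulate euclidean gradients, refresh the Krylov
# subspace, minimize the cost over the subspace coefficients with
# L-BFGS, then apply the resulting deltas. `krylov_descent_step` is a
# hypothetical helper; `trainer` is an already-constructed instance.
import numpy
from scipy.optimize import fmin_l_bfgs_b

def krylov_descent_step(trainer, grad_idx, metric_idx, eval_idx):
    trainer.compute_eucledian_gradients(grad_idx)
    trainer.update_krylov_subspace(metric_idx)
    alphas0 = numpy.zeros((trainer.n_dimensions,), dtype='float64')
    alphas, fval, _ = fmin_l_bfgs_b(
        func=lambda a: numpy.asarray(trainer.lbfgs_fn(a, eval_idx)).item(),
        x0=alphas0,
        fprime=lambda a: numpy.asarray(trainer.lbfgs_grad(a, eval_idx),
                                       dtype='float64'),
        maxiter=trainer.options['lbfgsIters'])
    trainer.update_params(alphas)
    return fval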
def __init__(self, options, channel, data, model): """ Parameters: options: Dictionary `options` is expected to contain the following keys: `cbs` -> int Number of samples to consider at a time when computing some property of the model `gbs` -> int Number of samples over which to compute the gradients `mbs` -> int Number of samples over which to compute the metric `ebs` -> int Number of samples over which to evaluate the training error `mreg` -> float Regularization added to the metric `mrtol` -> float Relative tolerance for inverting the metric `miters` -> int Number of iterations `seed` -> int Random number generator seed `profile` -> bool Flag, if profiling should be on or not `verbose` -> int Verbosity level `lr` -> float Learning rate channel: jobman channel or None data: dictionary-like object return by numpy.load containing the data model : model """ n_params = len(model.params) self.data = data if options['device'] != 'gpu': xdata = theano.shared(data['train_x'][:options['gbs']], name='xdata') ydata = TT._shared(data['train_y'][:options['gbs']], name='ydata') self.xdata = xdata self.ydata = ydata shared_data = [xdata, ydata] else: self.cpu_shared_data = [] xdata = theano.shared(data['train_x'], name='xdata') ydata = TT._shared(data['train_y'], name='ydata') self.xdata = xdata self.ydata = ydata shared_data = [xdata, ydata] self.rng = numpy.random.RandomState(options['seed']) n_samples = data['train_x'].shape[0] self.grad_batches = n_samples // options['gbs'] self.metric_batches = n_samples // options['mbs'] self.eval_batches = n_samples // options['ebs'] self.verbose = options['verbose'] if options['device'] != 'gpu': # Store eucledian gradients self.gs = [ TT._shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] # Store riemannian gradients self.rs = [ TT._shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] else: # Store eucledian gradients self.gs = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] # Store riemannian gradients self.rs = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] self.permg = self.rng.permutation(self.grad_batches) self.permr = self.rng.permutation(self.metric_batches) self.perme = self.rng.permutation(self.eval_batches) self.k = 0 self.posg = 0 self.posr = 0 self.pose = 0 # Step 1. 
Compile function for computing eucledian gradients # inputs gbdx = TT.iscalar('grad_batch_idx') print 'Constructing grad function' srng = RandomStreams(numpy.random.randint(1e5)) loc_inputs = [x.type() for x in model.inputs] def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)] return [args[0] + const(1)] + \ nw_gs ig = [ TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0) for shp in model.params_shape ] idx0 = TT.unbroadcast(const([0]), 0) n_steps = options['gbs'] // options['cbs'] rvals, updates = scan(grad_step, states=[idx0] + ig, n_steps=n_steps, name='grad_loop', profile=options['profile']) nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]] # updates updates.update(dict(zip(self.gs, nw_gs))) # givens if options['device'] == 'gpu': grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']]) for x, y in zip(loc_inputs, shared_data)] else: grad_inps = zip(loc_inputs, shared_data) print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gbdx], [], updates=updates, givens=dict(grad_inps), name='compute_eucledian_gradients', mode=gpu_mode, on_unused_input='warn', profile=options['profile']) # Step 2. Compile function for Computing Riemannian gradients rbdx = TT.iscalar('riemmanian_batch_idx') rbpos = rbdx * options['mbs'] if options['device'] == 'gpu': mode = gpu_mode def compute_Gv(*args): idx0 = const([0]) ep = [ TT.alloc(const(0), 1, *shp) for shp in model.params_shape ] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(args, model.params) if y in theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const( options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs nw_cost, nw_preactiv_out = safe_clone( [model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop( nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, args)) Gvs = [ ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs) ] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=theano.Mode(linker='cvm'), name='Gv_step', profile=options['profile']) final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]] return final_Gvs, updates else: mode = cpu_mode def compute_Gv(*args): cgv = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX), name='cgv%d' % idx) for idx, shp in 
enumerate(model.params_shape) ] print_mem('allocated mem for cgv') idx0 = const([0]) ep = [ TT.alloc(const(0), 1, *shp) for shp in model.params_shape ] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(cgv, model.params) if y in theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const( options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=gpu_mode, name='Gv_step', profile=options['profile']) final_Gvs = [ TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:] ] grad_inps = zip(loc_inputs, shared_data) loc_fn = theano.function([], final_Gvs, updates=updates, givens=dict(grad_inps), on_unused_input='warn', mode=gpu_mode, name='loc_fn', profile=options['profile']) fake_op = FakeGPUShell(cgv, loc_fn, len(cgv)) return fake_op(*args), {} print 'Constructing riemannian gradient function' norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs)) rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs], rtol=options['mrtol'], shift=-options['mreg'], maxit=options['miters'], mode=mode, profile=options['profile']) nw_rs = [x * norm_grads for x in rvals[0]] flag = rvals[1] niters = rvals[2] rel_residual = rvals[3] rel_Aresidual = rvals[4] Anorm = rvals[5] Acond = rvals[6] xnorm = rvals[7] Axnorm = rvals[8] updates = rvals[9] norm_ord0 = TT.max(abs(nw_rs[0])) for r in nw_rs[1:]: norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r))) updates.update(dict(zip(self.rs, nw_rs))) grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']]) for x, y in zip(loc_inputs[:1], shared_data[:1])] print 'Compiling riemannian gradient function' self.compute_riemannian_gradients = theano.function( [rbdx], [ flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm, Axnorm, norm_grads, norm_ord0 ], updates=updates, givens=dict(grad_inps), name='compute_riemannian_gradients', on_unused_input='warn', mode=mode, profile=options['profile']) # Step 3. 
Compile function for evaluating cost and updating # parameters print 'constructing evaluation function' lr = TT.scalar('lr') self.lr = numpy.float32(options['lr']) ebdx = TT.iscalar('eval_batch_idx') nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)] def cost_step(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps)) nw_cost = safe_clone(model.train_cost, replace=replace) return [_idx + const(1), acc + nw_cost] acc0 = const([0]) idx0 = const([0]) n_steps = options['ebs'] // options['cbs'] rvals, updates = scan(cost_step, states=[idx0, acc0], n_steps=n_steps, name='cost_loop', mode=gpu_mode, profile=options['profile']) final_cost = rvals[1] / const(n_steps) if options['device'] == 'gpu': grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']]) for x, y in zip(loc_inputs, shared_data)] else: grad_inps = zip(loc_inputs, shared_data) print 'compling evaluation function' self.eval_fn = theano.function([ebdx, lr], final_cost, givens=dict(grad_inps), on_unused_input='warn', updates=updates, name='eval_fn', mode=gpu_mode, profile=options['profile']) update_dict = dict(zip(model.params, nw_ps)) if options['device'] != 'gpu': update_dict.update(dict(zip(model.cparams, nw_ps))) self.update_params = theano.function([lr], [], updates=update_dict, name='update_params', on_unused_input='warn', mode=mode, profile=options['profile']) self.options = options self.old_cost = 1e6 self.device = options['device'] n_steps = options['ebs'] // options['cbs'] def ls_error(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = TT.cast(safe_clone(model.err, replace=replace), 'float32') return [_idx + const(1), acc + nw_cost] states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])) ] rvals, _ = scan(ls_error, states=states, n_steps=n_steps, name='ls_err_step', mode=cpu_mode, profile=options['profile']) ferr = rvals[1][0] / const(n_steps) self.compute_error = theano.function([ebdx], ferr, givens=dict(grad_inps), name='compute_err', mode=gpu_mode, on_unused_input='warn', profile=options['profile'])
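# ---------------------------------------------------------------------
# Hedged sketch of one natural-gradient step for the trainer above (the
# outer loop is not shown in this listing, so the control flow is an
# assumption): accumulate euclidean gradients, solve the metric system
# with MINRES, then backtrack on the learning rate until the evaluated
# cost improves. `natural_gradient_step` is a hypothetical helper and
# the halving factor is illustrative only.
import numpy

def natural_gradient_step(trainer, grad_idx, metric_idx, eval_idx,
                          max_backtracks=10):
    trainer.compute_eucledian_gradients(grad_idx)
    minres_stats = trainer.compute_riemannian_gradients(metric_idx)
    lr = numpy.float32(trainer.lr)
    cost = trainer.old_cost
    for _ in range(max_backtracks):
        cost = numpy.asarray(trainer.eval_fn(eval_idx, lr)).item()
        if cost < trainer.old_cost:
            break
        lr = numpy.float32(lr * 0.5)
    trainer.update_params(lr)
    trainer.old_cost = cost
    return cost, minres_stats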
def __init__(self, options, channel, data, model): """ Parameters: options: Dictionary `options` is expected to contain the following keys: `cbs` -> int Number of samples to consider at a time when computing some property of the model `gbs` -> int Number of samples over which to compute the gradients `mbs` -> int Number of samples over which to compute the metric `ebs` -> int Number of samples over which to evaluate the training error `mreg` -> float Regularization added to the metric `mrtol` -> float Relative tolerance for inverting the metric `miters` -> int Number of iterations `seed` -> int Random number generator seed `profile` -> bool Flag, if profiling should be on or not `verbose` -> int Verbosity level `lr` -> float Learning rate channel: jobman channel or None data: dictionary-like object return by numpy.load containing the data model : model """ n_params = len(model.params) self.data = data eps = numpy.float32(1e-24) xdata = theano.shared(data['train_x'], name='xdata') ydata = theano.shared(data['train_y'], name='ydata') self.xdata = xdata self.ydata = ydata shared_data = [xdata, ydata] self.rng = numpy.random.RandomState(options['seed']) n_samples = data['train_x'].shape[0] self.grad_batches = n_samples // options['gbs'] self.metric_batches = n_samples // options['mbs'] self.eval_batches = n_samples // options['ebs'] self.verbose = options['verbose'] # Store eucledian gradients self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape] # Store riemannian gradients self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape] # Store jacobi diagonal self.js = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape] self.permg = self.rng.permutation(self.grad_batches) self.permr = self.rng.permutation(self.metric_batches) self.perme = self.rng.permutation(self.eval_batches) self.k = 0 self.posg = 0 self.posr = 0 self.pose = 0 # Step 1. 
Compile function for computing eucledian gradients gbdx = TT.iscalar('grad_batch_idx') print 'Constructing grad function' srng = RandomStreams(numpy.random.randint(1e5)) loc_inputs = [x.type() for x in model.inputs] def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)] # Compute jacobi nw_outs = safe_clone(model.outs, replace=replace) final_results = dict(zip(model.params, [None]*n_params)) for nw_out, out_operator in zip(nw_outs, model.outs_operator): if out_operator == 'sigmoid': denom = numpy.float32(options['cbs']) #denom *= nw_out #denom *= (numpy.float32(1) - nw_out) elif out_operator == 'softmax': denom = numpy.float32(options['cbs']) denom *= (nw_out + eps) else: denom = numpy.float32(options['cbs']) factor = TT.sqrt(numpy.float32(1) / denom) if out_operator == 'sigmoid': tnwout = TT.nnet.sigmoid(nw_out) factor = TT.sqrt(tnwout * (numpy.float32(1) - tnwout))*factor r = TT.sgn(srng.normal(nw_out.shape)) r = r * factor loc_params = [x for x in model.params if x in theano.gof.graph.inputs([nw_out])] jvs = TT.Lop(nw_out, loc_params, r) for lp, lj in zip(loc_params, jvs): if final_results[lp] is None: final_results[lp] = TT.sqr(lj) else: final_results[lp] = final_results[lp] + TT.sqr(lj) nw_js = [oj + final_results[p] for oj, p in zip(args[1+n_params:1+2*n_params], model.params)] return [args[0] + const(1)] + nw_gs + nw_js ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0) for shp in model.params_shape] ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp),0) for shp in model.params_shape] idx0 = TT.unbroadcast(const([0]),0) n_steps = options['gbs'] // options['cbs'] rvals, updates = scan(grad_step, states=[idx0] + ig + ij, n_steps=n_steps, mode=gpu_mode, name='grad_loop', profile=options['profile']) nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]] nw_js = [x[0] for x in rvals[1+n_params:1+2*n_params]] updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js))) grad_inps = [(x, y[gbdx*options['gbs']:(gbdx+1)*options['gbs']]) for x,y in zip(loc_inputs, shared_data)] print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gbdx], [], updates=updates, givens=dict(grad_inps), name='compute_eucledian_gradients', mode=gpu_mode, on_unused_input='warn', profile=options['profile']) #theano.printing.pydotprint(self.compute_eucledian_gradients, # 'eucledian_grad', scan_graphs=True) # Step 2. 
Compile function for Computing Riemannian gradients rbdx = TT.iscalar('riemmanian_batch_idx') rbpos = rbdx * options['mbs'] self.damping = theano.shared(numpy.float32(options['mreg'])) mode=gpu_mode def compute_Gv(*args): idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict(zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [x for x in model.params if x in theano.gof.graph.inputs([nw_out])] loc_args = [x for x, y in zip(args, model.params) if y in theano.gof.graph.inputs([nw_out])] if out_operator == 'softmax': factor = const(options['cbs']) * (nw_out + eps) elif out_operator == 'sigmoid': factor = const(options['cbs'])# * nw_out * (1 - nw_out) else: factor = const(options['cbs']) if out_operator != 'sigmoid': loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) else: tnwout = TT.nnet.sigmoid(nw_out) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) *\ tnwout * (1 - tnwout)/ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params)] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=theano.Mode(linker='cvm'), name='Gv_step', profile=options['profile']) final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]] return final_Gvs, updates print 'Constructing riemannian gradient function' norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs)) rvals = minres.minres( compute_Gv, [x / norm_grads for x in self.gs], Ms = self.js, rtol=options['mrtol'], shift= self.damping, maxit=options['miters'], mode=mode, profile=options['profile']) nw_rs = [x * norm_grads for x in rvals[0]] flag = rvals[1] niters = rvals[2] rel_residual = rvals[3] rel_Aresidual = rvals[4] Anorm = rvals[5] Acond = rvals[6] xnorm = rvals[7] Axnorm = rvals[8] updates = rvals[9] norm_ord0 = TT.max(abs(nw_rs[0])) for r in nw_rs[1:]: norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r))) updates.update(dict(zip(self.rs, nw_rs))) grad_inps = [(x, y[rbdx * options['mbs']: (rbdx + 1) * options['mbs']]) for x,y in zip(loc_inputs, shared_data)] print 'Compiling riemannian gradient function' self.compute_riemannian_gradients = theano.function( [rbdx], [flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm, Axnorm, norm_grads, norm_ord0], updates=updates, givens=dict(grad_inps), name='compute_riemannian_gradients', on_unused_input='warn', mode=mode, profile=options['profile']) # Step 3. 
Compile function for evaluating cost and updating # parameters print 'constructing evaluation function' lr = TT.scalar('lr') self.lr = numpy.float32(options['lr']) ebdx = TT.iscalar('eval_batch_idx') nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)] def cost_step(_idx, acc0,acc1): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps)) nw_cost = safe_clone(model.train_cost, replace=replace) nw_cost2 = safe_clone(model.train_cost, replace = dict(zip(model.inputs, nw_inps))) return [_idx + const(1), acc0 + nw_cost, acc1 + nw_cost2] acc0 = const([0]) acc1 = const([0]) idx0 = const([0]) n_steps = options['ebs'] // options['cbs'] rvals, updates = scan(cost_step, states=[idx0, acc0, acc1], n_steps=n_steps, name='cost_loop', mode=gpu_mode, profile=options['profile']) final_cost = rvals[1].sum() / const(n_steps) cost0 = rvals[2].sum() / const(n_steps) grad_inps = [(x, y[ebdx * options['ebs']: (ebdx + 1) * options['ebs']]) for x,y in zip(loc_inputs, shared_data)] denom = -lr*sum([TT.sum(g*r) for g,r in zip(self.gs, self.rs)]) rho = (final_cost - cost0) / denom print 'compling evaluation function' self.eval_fn = theano.function( [ebdx, lr], [final_cost, rho], givens=dict(grad_inps), on_unused_input='warn', updates = updates, name='eval_fn', mode=gpu_mode, profile=options['profile']) update_dict = dict(zip(model.params, nw_ps)) self.update_params = theano.function( [lr], [], updates=update_dict, name='update_params', on_unused_input='warn', mode=mode, profile=options['profile']) self.options = options self.old_cost = numpy.inf n_steps = options['ebs'] // options['cbs'] def ls_error(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = TT.cast(safe_clone( model.err, replace=replace), 'float32') return [_idx + const(1), acc + nw_cost] states = [TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0]))] rvals, _ = scan(ls_error, states = states, n_steps = n_steps, name='ls_err_step', mode=gpu_mode, profile = options['profile']) ferr = rvals[1][0] / const(n_steps) self.compute_error = theano.function([ebdx], ferr, givens=dict(grad_inps), name='compute_err', mode=gpu_mode, on_unused_input='warn', profile=options['profile'])
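# ---------------------------------------------------------------------
# Hedged sketch (an assumption about intended use, not code from the
# source): `eval_fn` above returns the reduction ratio `rho` next to the
# new cost, which suggests a Levenberg-Marquardt style update of the
# shared `damping` passed to MINRES above. The 0.25 / 0.75 thresholds
# and the scaling factors below are the conventional heuristic, used
# here purely for illustration; `adjust_damping` is a hypothetical
# helper.
import numpy

def adjust_damping(trainer, eval_idx, lr):
    lr = numpy.float32(lr)
    cost, rho = trainer.eval_fn(eval_idx, lr)
    rho = float(numpy.asarray(rho).ravel()[0])
    damping = trainer.damping.get_value()
    if rho < 0.25:
        # step reduced the cost much less than the quadratic model
        # predicted: trust the metric less and damp more
        damping = numpy.float32(damping * 1.5)
    elif rho > 0.75:
        # model and observed reduction agree well: damp less
        damping = numpy.float32(damping * 2.0 / 3.0)
    trainer.damping.set_value(damping)
    return float(numpy.asarray(cost).ravel()[0]), rho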