def test_2():
    # Indefinite diagonal test problem: A = diag(-10, ..., 10), b = ones.
    h = 1
    a = -10
    b = -a
    n = 2 * b // h + 1
    A = numpy.zeros((n, n), dtype=theano.config.floatX)
    v = a
    for k in xrange(n):
        A[k, k] = v
        v += h
    b = numpy.ones((n,), dtype=theano.config.floatX)
    rtol = numpy.asarray(1e-6, theano.config.floatX)
    maxxnorm = 1e8
    maxit = 50
    tA = theano.shared(A.astype(theano.config.floatX))
    tb = theano.shared(b.astype(theano.config.floatX))
    compute_Av = lambda x: ([TT.dot(tA, x)], {})
    xs, flag, iters, relres, relAres, Anorm, Acond, xnorm, Axnorm, updates = \
        minres.minres(compute_Av,
                      [tb],
                      rtol=rtol,
                      maxit=maxit,
                      maxxnorm=maxxnorm,
                      profile=0)
    func = theano.function(
        [],
        xs + [flag, iters, relres, relAres, Anorm, Acond, xnorm, Axnorm],
        name='func',
        profile=0,
        updates=updates,
        mode=theano.Mode(linker='cvm'))
    rvals = func()
    print 'flag', rvals[1]
    print minres.messages[int(rvals[1])]
    print 'iters', rvals[2]
    print 'relres', rvals[3]
    print 'relAres', rvals[4]
    print 'Anorm', rvals[5]
    print 'Acond', rvals[6]
    print 'xnorm', rvals[7]
    print 'Axnorm', rvals[8]
    print rvals[0]
def test_1():
    # SPD tridiagonal test problem with a simple diagonal preconditioner.
    n = 100
    on = numpy.ones((n, 1), dtype=theano.config.floatX)
    A = numpy.zeros((n, n), dtype=theano.config.floatX)
    for k in xrange(n):
        A[k, k] = 4.
        if k > 0:
            A[k - 1, k] = -2.
            A[k, k - 1] = -2.
    b = A.sum(axis=1)
    rtol = numpy.asarray(1e-10, dtype=theano.config.floatX)
    maxit = 50
    M = numpy.ones((n,), dtype=theano.config.floatX) * 4.
    tA = theano.shared(A.astype(theano.config.floatX))
    tb = theano.shared(b.astype(theano.config.floatX))
    tM = theano.shared(M.astype(theano.config.floatX))
    compute_Av = lambda x: ([TT.dot(tA, x)], {})
    xs, flag, iters, relres, relAres, Anorm, Acond, xnorm, Axnorm, updates = \
        minres.minres(compute_Av,
                      [tb],
                      rtol=rtol,
                      maxit=maxit,
                      Ms=[tM],
                      profile=0)
    func = theano.function(
        [],
        xs + [flag, iters, relres, relAres, Anorm, Acond, xnorm, Axnorm],
        name='func',
        profile=0,
        updates=updates,
        mode=theano.Mode(linker='cvm'))
    rvals = func()
    print 'flag', rvals[1]
    print minres.messages[int(rvals[1])]
    print 'iters', rvals[2]
    print 'relres', rvals[3]
    print 'relAres', rvals[4]
    print 'Anorm', rvals[5]
    print 'Acond', rvals[6]
    print 'xnorm', rvals[7]
    print 'Axnorm', rvals[8]
    print 'error', numpy.sqrt(numpy.sum((numpy.dot(rvals[0], A) - b) ** 2))
    print
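# Illustrative sketch (not part of the original test suite): a direct NumPy
# cross-check of the linear system solved in test_1 above.  Because b is the
# vector of row sums of A, the exact solution is the all-ones vector, so the
# MINRES result can be compared against it.  The helper name
# `check_test_1_system` is made up for this example.
def check_test_1_system():
    n = 100
    A = numpy.zeros((n, n))
    for k in xrange(n):
        A[k, k] = 4.
        if k > 0:
            A[k - 1, k] = -2.
            A[k, k - 1] = -2.
    b = A.sum(axis=1)
    x = numpy.linalg.solve(A, b)
    # x should be numerically equal to a vector of ones.
    print 'direct solve error', numpy.sqrt(numpy.sum((x - 1.) ** 2))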
def __init__(self, model, state, data):
    """
    Parameters:
        :param model: Class describing the model used. It should provide the
            computational graph to evaluate the model.
        :param state: Dictionary containing the current state of your job.
            This includes configuration of the job, specifically the seed,
            the starting damping factor, batch size, etc. See main.py for
            details.
        :param data: Class describing the dataset used by the model.
    """
    #####################################
    # Step 0. Constructs shared variables
    #####################################
    n_params = len(model.params)
    cbs = state['cbs']
    bs = state['bs']
    ebs = state['ebs']
    mbs = state['mbs']
    profile = state['profile']
    self.model = model
    self.rng = numpy.random.RandomState(state['seed'])
    self.damping = theano.shared(numpy.float32(state['damp']))
    self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
               for shp in model.params_shape]
    self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
               for shp in model.params_shape]
    self.loop_inps = [theano.shared(numpy.zeros(shp,
                                                dtype=theano.config.floatX))
                      for shp in model.params_shape]
    self.loop_outs = [theano.shared(numpy.zeros(shp,
                                                dtype=theano.config.floatX))
                      for shp in model.params_shape]
    self.step = 0
    self.cbs = cbs
    self.bs = bs
    self.ebs = ebs
    self.mbs = mbs
    self.state = state
    self.profile = profile
    self.data = data
    self.step_timer = time.time()

    ############################################################
    # Step 1. Compile function for computing euclidean gradients
    ############################################################
    print 'Constructing grad function'
    bdx = TT.iscalar('batch_idx')
    loc_data = [x[bdx * cbs: (bdx + 1) * cbs] for x in self.data.variables]
    cost = safe_clone(model.train_cost, model.inputs, loc_data)
    gs = TT.grad(cost, model.params)
    ratio = numpy.float32(float(bs) / cbs)
    update = [(g, g + lg / ratio) for g, lg in zip(self.gs, gs)]
    print 'Compiling grad function'
    st = time.time()
    self.loc_grad_fn = theano.function(
        [bdx], [], updates=update, name='loc_fn_grad', profile=profile)
    print 'took', time.time() - st

    #############################################################
    # Step 2. Compile function for computing Riemannian gradients
    #############################################################
    loc_x = self.data._natgrad[bdx * cbs: (bdx + 1) * cbs]
    loc_y = self.data._natgrady[bdx * cbs: (bdx + 1) * cbs]
    loc_Gvs = safe_clone(model.Gvs(*self.loop_inps),
                         [model.X, model.Y],
                         [loc_x, loc_y])
    updates = [(l, l + lg) for l, lg in zip(self.loop_outs, loc_Gvs)]
    st = time.time()
    loc_Gv_fn = theano.function(
        [bdx], [], updates=updates, name='loc_fn_rop', profile=profile)
    print 'took', time.time() - st

    def compute_Gv(*args):
        rval = forloop(loc_Gv_fn,
                       mbs // cbs,
                       self.loop_inps,
                       self.loop_outs)(*args)
        return rval, {}

    print 'Constructing riemannian gradient function'
    st = time.time()
    norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
    if not state['minresQLP']:
        self.msgs = minres_messages
        rvals = minres(compute_Gv,
                       [x / norm_grads for x in self.gs],
                       rtol=state['mrtol'],
                       damp=self.damping,
                       maxit=state['miters'],
                       profile=state['profile'])
    else:
        self.msgs = minresQLP_messages[1:]
        rvals = minresQLP(compute_Gv,
                          [x / norm_grads for x in self.gs],
                          model.params_shape,
                          rtol=state['mrtol'],
                          damp=self.damping,
                          maxit=state['miters'],
                          TranCond=state['trancond'],
                          profile=state['profile'])
    nw_rs = [x * norm_grads for x in rvals[0]]
    flag = TT.cast(rvals[1], 'int32')
    niters = rvals[2]
    rel_residual = rvals[3]
    Anorm = rvals[4]
    Acond = rvals[5]
    norm_rs_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in nw_rs))
    norm_ord0 = TT.max(abs(nw_rs[0]))
    for r in nw_rs[1:]:
        norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))
    updates = zip(self.rs, nw_rs)
    print 'took', time.time() - st
    print 'Compiling riemannian gradient function'
    st = time.time()
    self.compute_natural_gradients = theano.function(
        [],
        [flag, niters, rel_residual, Anorm, Acond,
         norm_grads, norm_rs_grads, norm_ord0],
        updates=updates,
        allow_input_downcast=True,
        name='compute_riemannian_gradients',
        on_unused_input='warn',
        profile=profile)
    print 'took', time.time() - st

    ###########################################################
    # Step 3. Compile function for evaluating cost and updating
    #         parameters
    ###########################################################
    print 'constructing evaluation function'
    lr = TT.scalar('lr')
    self.lr = numpy.float32(state['lr'])
    loc_data = [x[bdx * cbs: (bdx + 1) * cbs] for x in self.data.variables]
    old_cost = safe_clone(model.train_cost, model.inputs, loc_data)
    self.loc_old_cost = theano.function(
        [bdx], old_cost, name='loc_old_cost', profile=profile)
    new_params = [p - lr * r for p, r in zip(model.params, self.rs)]
    new_cost = safe_clone(model.train_cost,
                          model.inputs + model.params,
                          loc_data + new_params)
    new_err = safe_clone(model.error,
                         model.inputs + model.params,
                         loc_data + new_params)
    self.loc_new_cost = theano.function(
        [bdx, lr], [new_cost, new_err], name='loc_new_cost', profile=profile)
    self.lr = numpy.float32(state['lr'])
    updates = dict(zip(model.params, new_params))
    model.dbm_class.censor_updates(updates)
    self.update_params = theano.function(
        [lr], [], updates=updates, name='update_params')
    old_cost = TT.scalar('old_cost')
    new_cost = TT.scalar('new_cost')
    p_norm = TT.scalar('p_norm')
    prod = sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)])
    #pnorm = TT.sqrt(sum(TT.sum(g*g) for g in self.gs)) * \
    #    TT.sqrt(sum(TT.sum(r*r) for r in self.rs))
    dist = -lr * prod
    angle = prod / p_norm
    rho = (new_cost - old_cost) / dist
    self.compute_rho = theano.function(
        [old_cost, new_cost, lr, p_norm],
        [rho, dist, angle],
        name='compute_rho',
        profile=profile)
    self.old_cost = 1e20
    self.__new_cost = 0
    self.__error = 0
    self.return_names = ['cost',
                         'old_cost',
                         'error',
                         'time_grads',
                         'time_metric',
                         'time_eval',
                         'minres_flag',
                         'minres_iters',
                         'minres_relres',
                         'minres_Anorm',
                         'minres_Acond',
                         'norm_ord0',
                         'norm_grad',
                         'norm_nat',
                         'lr',
                         'grad_angle',
                         #'r_g',
                         #'icost',
                         'damping',
                         'rho']
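# Illustrative sketch (values are assumptions, not the defaults of main.py,
# which the docstring above points to): a `state` dictionary holding every
# key the constructor above reads.
example_state = {
    'cbs': 250,          # chunk size used when looping over a batch
    'bs': 1000,          # gradient batch size
    'mbs': 1000,         # metric batch size
    'ebs': 1000,         # evaluation batch size
    'seed': 123,
    'damp': 5.,          # starting damping factor
    'lr': 1.,            # learning rate
    'mrtol': 1e-4,       # relative tolerance of the (QLP-)MINRES solve
    'miters': 20,        # maximum (QLP-)MINRES iterations
    'minresQLP': False,  # if True, use minresQLP instead of minres
    'trancond': 1e7,     # only used by the MINRES-QLP branch
    'profile': 0,
}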
def __init__(self, options, channel, data, model):
    """
    Parameters:
        options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Number of iterations
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
        channel: jobman channel or None
        data: dictionary-like object returned by numpy.load containing the
            data
        model: model
    """
    n_params = len(model.params)
    self.data = data
    if options['device'] != 'gpu':
        xdata = theano.shared(data['train_x'][:options['gbs']],
                              name='xdata')
        ydata = TT._shared(data['train_y'][:options['gbs']],
                           name='ydata')
        self.xdata = xdata
        self.ydata = ydata
        shared_data = [xdata, ydata]
    else:
        self.cpu_shared_data = []
        xdata = theano.shared(data['train_x'], name='xdata')
        ydata = TT._shared(data['train_y'], name='ydata')
        self.xdata = xdata
        self.ydata = ydata
        shared_data = [xdata, ydata]
    self.rng = numpy.random.RandomState(options['seed'])
    n_samples = data['train_x'].shape[0]
    self.grad_batches = n_samples // options['gbs']
    self.metric_batches = n_samples // options['mbs']
    self.eval_batches = n_samples // options['ebs']
    self.verbose = options['verbose']
    if options['device'] != 'gpu':
        # Store euclidean gradients
        self.gs = [TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        # Store Riemannian gradients
        self.rs = [TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
    else:
        # Store euclidean gradients
        self.gs = [theano.shared(numpy.zeros(shp,
                                             dtype=theano.config.floatX))
                   for shp in model.params_shape]
        # Store Riemannian gradients
        self.rs = [theano.shared(numpy.zeros(shp,
                                             dtype=theano.config.floatX))
                   for shp in model.params_shape]
    self.permg = self.rng.permutation(self.grad_batches)
    self.permr = self.rng.permutation(self.metric_batches)
    self.perme = self.rng.permutation(self.eval_batches)
    self.k = 0
    self.posg = 0
    self.posr = 0
    self.pose = 0

    # Step 1. Compile function for computing euclidean gradients
    # inputs
    gbdx = TT.iscalar('grad_batch_idx')
    print 'Constructing grad function'
    srng = RandomStreams(numpy.random.randint(1e5))
    loc_inputs = [x.type() for x in model.inputs]

    def grad_step(*args):
        idx = TT.cast(args[0], 'int32')
        nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                   for x in loc_inputs]
        replace = dict(zip(model.inputs, nw_inps))
        nw_cost = safe_clone(model.train_cost, replace=replace)
        gs = TT.grad(nw_cost, model.params)
        nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
        return [args[0] + const(1)] + nw_gs

    ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
          for shp in model.params_shape]
    idx0 = TT.unbroadcast(const([0]), 0)
    n_steps = options['gbs'] // options['cbs']
    rvals, updates = scan(grad_step,
                          states=[idx0] + ig,
                          n_steps=n_steps,
                          name='grad_loop',
                          profile=options['profile'])
    nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]
    # updates
    updates.update(dict(zip(self.gs, nw_gs)))
    # givens
    if options['device'] == 'gpu':
        grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                     for x, y in zip(loc_inputs, shared_data)]
    else:
        grad_inps = zip(loc_inputs, shared_data)
    print 'Compiling grad function'
    self.compute_eucledian_gradients = theano.function(
        [gbdx], [],
        updates=updates,
        givens=dict(grad_inps),
        name='compute_eucledian_gradients',
        mode=gpu_mode,
        on_unused_input='warn',
        profile=options['profile'])

    # Step 2. Compile function for computing Riemannian gradients
    rbdx = TT.iscalar('riemmanian_batch_idx')
    rbpos = rbdx * options['mbs']
    if options['device'] == 'gpu':
        mode = gpu_mode

        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']:
                             (idx + 1) * options['cbs']]
                           for x in loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params,
                                         [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs,
                                                model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(args, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * nw_out
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs']) * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])
                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params,
                                            loc_args) / factor)
                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv
                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs
                # Note: the lines below are unreachable; they are an
                # alternative Gauss-Newton formulation left in the original.
                nw_cost, nw_preactiv_out = safe_clone(
                    [model.train_cost, model.preactiv_out], replace)
                nw_gvs = TT.Lop(nw_preactiv_out, model.params,
                                TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                                       model.params, args))
                Gvs = [ogv + ngv
                       for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])
            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates
    else:
        mode = cpu_mode

        def compute_Gv(*args):
            cgv = [theano.shared(numpy.zeros(shp,
                                             dtype=theano.config.floatX),
                                 name='cgv%d' % idx)
                   for idx, shp in enumerate(model.params_shape)]
            print_mem('allocated mem for cgv')
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']:
                             (idx + 1) * options['cbs']]
                           for x in loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params,
                                         [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs,
                                                model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(cgv, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * nw_out
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs']) * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])
                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params,
                                            loc_args) / factor)
                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv
                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=gpu_mode,
                                  name='Gv_step',
                                  profile=options['profile'])
            final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps)
                         for x in rvals[1:]]
            grad_inps = zip(loc_inputs, shared_data)
            loc_fn = theano.function([], final_Gvs,
                                     updates=updates,
                                     givens=dict(grad_inps),
                                     on_unused_input='warn',
                                     mode=gpu_mode,
                                     name='loc_fn',
                                     profile=options['profile'])
            fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))
            return fake_op(*args), {}

    print 'Constructing riemannian gradient function'
    norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
    rvals = minres.minres(compute_Gv,
                          [x / norm_grads for x in self.gs],
                          rtol=options['mrtol'],
                          shift=-options['mreg'],
                          maxit=options['miters'],
                          mode=mode,
                          profile=options['profile'])
    nw_rs = [x * norm_grads for x in rvals[0]]
    flag = rvals[1]
    niters = rvals[2]
    rel_residual = rvals[3]
    rel_Aresidual = rvals[4]
    Anorm = rvals[5]
    Acond = rvals[6]
    xnorm = rvals[7]
    Axnorm = rvals[8]
    updates = rvals[9]
    norm_ord0 = TT.max(abs(nw_rs[0]))
    for r in nw_rs[1:]:
        norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))
    updates.update(dict(zip(self.rs, nw_rs)))
    grad_inps = [(x, y[rbdx * options['mbs']: (rbdx + 1) * options['mbs']])
                 for x, y in zip(loc_inputs[:1], shared_data[:1])]
    print 'Compiling riemannian gradient function'
    self.compute_riemannian_gradients = theano.function(
        [rbdx],
        [flag, niters, rel_residual, rel_Aresidual, Anorm, Acond,
         xnorm, Axnorm, norm_grads, norm_ord0],
        updates=updates,
        givens=dict(grad_inps),
        name='compute_riemannian_gradients',
        on_unused_input='warn',
        mode=mode,
        profile=options['profile'])

    # Step 3. Compile function for evaluating cost and updating
    # parameters
    print 'constructing evaluation function'
    lr = TT.scalar('lr')
    self.lr = numpy.float32(options['lr'])
    ebdx = TT.iscalar('eval_batch_idx')
    nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

    def cost_step(_idx, acc):
        idx = TT.cast(_idx, 'int32')
        nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                   for x in loc_inputs]
        replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
        nw_cost = safe_clone(model.train_cost, replace=replace)
        return [_idx + const(1), acc + nw_cost]

    acc0 = const([0])
    idx0 = const([0])
    n_steps = options['ebs'] // options['cbs']
    rvals, updates = scan(cost_step,
                          states=[idx0, acc0],
                          n_steps=n_steps,
                          name='cost_loop',
                          mode=gpu_mode,
                          profile=options['profile'])
    final_cost = rvals[1] / const(n_steps)
    if options['device'] == 'gpu':
        grad_inps = [(x, y[ebdx * options['ebs']:
                           (ebdx + 1) * options['ebs']])
                     for x, y in zip(loc_inputs, shared_data)]
    else:
        grad_inps = zip(loc_inputs, shared_data)
    print 'compiling evaluation function'
    self.eval_fn = theano.function(
        [ebdx, lr],
        final_cost,
        givens=dict(grad_inps),
        on_unused_input='warn',
        updates=updates,
        name='eval_fn',
        mode=gpu_mode,
        profile=options['profile'])
    update_dict = dict(zip(model.params, nw_ps))
    if options['device'] != 'gpu':
        update_dict.update(dict(zip(model.cparams, nw_ps)))
    self.update_params = theano.function(
        [lr], [],
        updates=update_dict,
        name='update_params',
        on_unused_input='warn',
        mode=mode,
        profile=options['profile'])
    self.options = options
    self.old_cost = 1e6
    self.device = options['device']
    n_steps = options['ebs'] // options['cbs']

    def ls_error(_idx, acc):
        idx = TT.cast(_idx, 'int32')
        nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                   for x in loc_inputs]
        replace = dict(zip(model.inputs, nw_inps))
        nw_cost = TT.cast(safe_clone(model.err, replace=replace), 'float32')
        return [_idx + const(1), acc + nw_cost]

    states = [TT.constant(numpy.float32([0])),
              TT.constant(numpy.float32([0]))]
    rvals, _ = scan(ls_error,
                    states=states,
                    n_steps=n_steps,
                    name='ls_err_step',
                    mode=cpu_mode,
                    profile=options['profile'])
    ferr = rvals[1][0] / const(n_steps)
    self.compute_error = theano.function(
        [ebdx], ferr,
        givens=dict(grad_inps),
        name='compute_err',
        mode=gpu_mode,
        on_unused_input='warn',
        profile=options['profile'])
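# Illustrative sketch (values are assumptions): an `options` dictionary with
# the keys listed in the docstring of the constructor above, plus the
# `device` key it also reads.
example_options = {
    'device': 'gpu',   # or 'cpu'
    'cbs': 250,        # chunk size used when looping over a batch
    'gbs': 1000,       # gradient batch size
    'mbs': 1000,       # metric batch size
    'ebs': 1000,       # evaluation batch size
    'mreg': 1e-5,      # regularization (shift) added to the metric
    'mrtol': 1e-4,     # relative tolerance of the MINRES solve
    'miters': 20,      # maximum MINRES iterations
    'seed': 123,
    'profile': False,
    'verbose': 1,
    'lr': 1.,
}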
def init_gpu(self, options, channel, data, model):
    # Step 1. Compile function for computing eucledian gradients
    eps = numpy.float32(1e-24)
    gbdx = TT.iscalar('grad_batch_idx')
    n_params = len(self.model.params)
    print 'Constructing grad function'
    loc_inputs = [x.type() for x in model.inputs]
    srng = RandomStreams(numpy.random.randint(1e5))
    loc_inputs = [x.type() for x in model.inputs]

    def grad_step(*args):
        idx = TT.cast(args[0], 'int32')
        nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                   for x in loc_inputs]
        replace = dict(zip(model.inputs, nw_inps))
        nw_cost = safe_clone(model.train_cost, replace=replace)
        gs = TT.grad(nw_cost, model.params)
        nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
        # Compute jacobi
        nw_outs = safe_clone(model.outs, replace=replace)
        final_results = dict(zip(model.params, [None] * n_params))
        for nw_out, out_operator in zip(nw_outs, model.outs_operator):
            if out_operator == 'sigmoid':
                denom = numpy.float32(options['cbs'])
                #denom *= nw_out
                #denom *= (numpy.float32(1) - nw_out)
            elif out_operator == 'softmax':
                denom = numpy.float32(options['cbs'])
                denom *= (nw_out + eps)
            else:
                denom = numpy.float32(options['cbs'])
            factor = TT.sqrt(numpy.float32(1) / denom)
            if out_operator == 'sigmoid':
                tnwout = TT.nnet.sigmoid(nw_out)
                factor = TT.sqrt(tnwout * (numpy.float32(1) - tnwout)) * \
                    factor
            r = TT.sgn(srng.normal(nw_out.shape))
            r = r * factor
            loc_params = [x for x in model.params
                          if x in theano.gof.graph.inputs([nw_out])]
            jvs = TT.Lop(nw_out, loc_params, r)
            for lp, lj in zip(loc_params, jvs):
                if final_results[lp] is None:
                    final_results[lp] = TT.sqr(lj)
                else:
                    final_results[lp] = final_results[lp] + TT.sqr(lj)
        nw_js = [oj + final_results[p] for oj, p in
                 zip(args[1 + n_params:1 + 2 * n_params], model.params)]
        return [args[0] + const(1)] + nw_gs + nw_js

    ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
          for shp in model.params_shape]
    ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0)
          for shp in model.params_shape]
    idx0 = TT.unbroadcast(const([0]), 0)
    n_steps = options['gbs'] // options['cbs']
    rvals, updates = scan(grad_step,
                          states=[idx0] + ig + ij,
                          n_steps=n_steps,
                          name='grad_loop',
                          mode=gpu_mode,
                          profile=options['profile'])
    nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
    nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]]
    updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
    grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                 for x, y in zip(loc_inputs, self.shared_data)]
    print 'Compiling grad function'
    self.compute_eucledian_gradients = theano.function(
        [gbdx], [],
        updates=updates,
        givens=dict(grad_inps),
        allow_input_downcast=True,
        name='compute_eucledian_gradients',
        mode=gpu_mode,
        profile=options['profile'])

    # Step 2. Compile function for computing Riemannian gradients
    rbdx = TT.iscalar('riemmanian_batch_idx')

    def compute_Gv(*args):
        idx0 = const([0])
        ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

        def Gv_step(*gv_args):
            idx = TT.cast(gv_args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_outs = safe_clone(model.outs, replace)
            final_results = dict(zip(model.params,
                                     [None] * len(model.params)))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                loc_params = [x for x in model.params
                              if x in theano.gof.graph.inputs([nw_out])]
                loc_args = [x for x, y in zip(args, model.params)
                            if y in theano.gof.graph.inputs([nw_out])]
                if out_operator == 'softmax':
                    factor = const(options['cbs']) * (nw_out + eps)
                elif out_operator == 'sigmoid':
                    factor = const(options['cbs'])  # * nw_out * (1 - nw_out)
                else:
                    factor = const(options['cbs'])
                if out_operator != 'sigmoid':
                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params,
                                            loc_args) / factor)
                else:
                    tnwout = TT.nnet.sigmoid(nw_out)
                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) *
                                     tnwout * (1 - tnwout) / factor)
                for lp, lgv in zip(loc_params, loc_Gvs):
                    if final_results[lp] is None:
                        final_results[lp] = lgv
                    else:
                        final_results[lp] += lgv
            Gvs = [ogv + final_results[param]
                   for (ogv, param) in zip(gv_args[1:], model.params)]
            return [gv_args[0] + const(1)] + Gvs

        states = [idx0] + ep
        n_steps = options['mbs'] // options['cbs']
        rvals, updates = scan(Gv_step,
                              states=states,
                              n_steps=n_steps,
                              mode=theano.Mode(linker='cvm'),
                              name='Gv_step',
                              profile=options['profile'])
        final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
        return final_Gvs, updates

    print 'Constructing riemannian gradient function'
    norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
    self.damping = theano.shared(numpy.float32(options['mreg']))
    rvals = minres.minres(compute_Gv,
                          [x / norm_grads for x in self.gs],
                          Ms=self.js,
                          rtol=options['mrtol'],
                          shift=self.damping,
                          maxit=options['miters'],
                          profile=options['profile'])
    nw_rs = [x * norm_grads for x in rvals[0]]
    flag = rvals[1]
    niters = rvals[2]
    rel_residual = rvals[3]
    rel_Aresidual = rvals[4]
    Anorm = rvals[5]
    Acond = rvals[6]
    xnorm = rvals[7]
    Axnorm = rvals[8]
    updates = rvals[9]
    norm_ord0 = TT.max(abs(nw_rs[0]))
    for r in nw_rs[1:]:
        norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))
    reset = TT.scalar(dtype='int8', name='reset')
    norm_kkm1 = sum([(r * g).sum() for r, g in zip(self.rs, self.gs)])
    norm_kk = sum([(r * g).sum() for r, g in zip(nw_rs, self.gs)])
    norm_dk = sum([(d * g).sum() for d, g in zip(self.ds, self.gs)])
    norm_y = norm_kk - 2 * norm_kkm1 + self.norm_km1km1
    beta_k = (norm_kk - norm_kkm1) / (norm_dk - self.norm_dkm1) - \
        2 * norm_y * (norm_dk / ((norm_dk - self.norm_dkm1) ** 2))
    beta_k = TT.switch(reset, TT.constant(numpy.float32(0.)), beta_k)
    beta_k = TT.switch(TT.bitwise_or(TT.isnan(beta_k), TT.isinf(beta_k)),
                       TT.constant(numpy.float32(0.)), beta_k)
    nwds = [-r + beta_k * d for r, d in zip(nw_rs, self.ds)]
    self.nwds = nwds
    nw_normd = TT.sqrt(sum([(d * d).sum() for d in nwds])) + \
        numpy.float32(1e-25)
    updates.update(dict(zip(self.rs, nw_rs)))
    updates.update(dict(zip(self.ds, nwds)))
    updates[self.norm_km1km1] = norm_kk
    updates[self.norm_dkm1] = norm_dk
    updates[self.norm_d] = nw_normd
    print 'Compiling riemannian gradient function'
    cst = time.time()
    grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                 for x, y in zip(loc_inputs, self.shared_data)]
    self.compute_riemannian_gradients = theano.function(
        [reset, rbdx],
        [flag, niters, rel_residual, rel_Aresidual, Anorm, Acond,
         xnorm, Axnorm, norm_grads, norm_ord0, beta_k],
        updates=updates,
        allow_input_downcast=True,
        givens=dict(grad_inps),
        name='compute_riemannian_gradients',
        mode=cpu_mode,
        on_unused_input='warn',
        profile=options['profile'])
    print 'Time to compile Riemannian', print_time(time.time() - cst)
    cst = time.time()

    # Step 3. Compile function for evaluating cost and updating
    # parameters
    print 'constructing evaluation function'
    lr = TT.scalar('lr')
    newparams = [p + lr * d for p, d in zip(model.params, self.ds)]
    nw_ds = [-r for r in self.rs]
    nw_normd = TT.sqrt(sum([(r * r).sum() for r in self.rs]))
    self.update_params = theano.function(
        [lr],
        updates=dict(zip(model.params, newparams)),
        name='update_params',
        on_unused_input='warn',
        allow_input_downcast=True,
        mode=gpu_mode,
        profile=options['profile'])
    self.reset_directions = theano.function(
        [],
        updates=dict(zip(self.ds + [self.norm_d], nw_ds + [nw_normd])),
        name='reset_dirs',
        on_unused_input='warn',
        mode=cpu_mode,
        allow_input_downcast=True,
        profile=options['profile'])
    n_steps = options['ebs'] // options['cbs']

    def ls_cost_step(_idx, acc):
        idx = TT.cast(_idx, 'int32')
        nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                   for x in loc_inputs]
        replace = dict(zip(model.inputs + model.params,
                           nw_inps + newparams))
        nw_cost = safe_clone(model.train_cost, replace=replace)
        return [_idx + const(1), acc + nw_cost]

    states = [TT.constant(numpy.float32([0])),
              TT.constant(numpy.float32([0]))]
    rvals, _ = scan(ls_cost_step,
                    states=states,
                    n_steps=n_steps,
                    name='ls_cost_step',
                    profile=options['profile'])
    fcost = rvals[1][0] / const(n_steps)

    def ls_grad_step(_idx, gws):
        idx = TT.cast(_idx, 'int32')
        nw_inps = [x[idx * options['cbs']:(idx + 1) * options['cbs']]
                   for x in loc_inputs]
        replace = dict(zip(model.inputs + model.params,
                           nw_inps + newparams))
        nw_cost = safe_clone(model.train_cost, replace=replace)
        nw_gs = TT.grad(nw_cost, lr)
        return _idx + numpy.float32(1), gws + nw_gs

    states = [TT.constant(numpy.float32([0])),
              TT.constant(numpy.float32([0]))]
    rvals, _ = scan(ls_grad_step,
                    states=states,
                    n_steps=n_steps,
                    name='ls_grad_step',
                    profile=options['profile'])
    fgrad = rvals[1][0] / const(n_steps)

    ebdx = TT.iscalar('ebdx')
    grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                 for x, y in zip(loc_inputs, self.shared_data)]
    self.ls_cost_fn = theano.function(
        [lr, ebdx], fcost,
        givens=grad_inps,
        allow_input_downcast=True,
        name='ls_cost_fn',
        mode=gpu_mode,
        profile=options['profile'])
    self.approx_change = theano.function(
        [lr],
        -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.ds)]),
        allow_input_downcast=True,
        name='approx_change',
        mode=gpu_mode,
        profile=options['profile'])
    self.ls_grad_fn = theano.function(
        [lr, ebdx], fgrad,
        allow_input_downcast=True,
        givens=grad_inps,
        name='ls_grad_fn',
        mode=gpu_mode,
        profile=options['profile'])
    self.old_score = 50000
    n_steps = options['ebs'] // options['cbs']

    def ls_error(_idx, acc):
        idx = TT.cast(_idx, 'int32')
        nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                   for x in loc_inputs]
        replace = dict(zip(model.inputs, nw_inps))
        nw_cost = TT.cast(safe_clone(model.err, replace=replace), 'float32')
        return [_idx + const(1), acc + nw_cost]

    states = [TT.constant(numpy.float32([0])),
              TT.constant(numpy.float32([0]))]
    rvals, _ = scan(ls_error,
                    states=states,
                    n_steps=n_steps,
                    name='ls_err_step',
                    mode=gpu_mode,
                    profile=options['profile'])
    ferr = rvals[1][0] / const(n_steps)
    self.compute_error = theano.function(
        [ebdx], ferr,
        givens=dict(grad_inps),
        name='compute_err',
        mode=cpu_mode,
        allow_input_downcast=True,
        on_unused_input='warn',
        profile=options['profile'])
    print 'Compile eval time', print_time(time.time() - cst)
    self.old_cost = 1e6
    self.options = options
    self.perm = self.rng.permutation(4)
    self.pos = 0
def __init__(self, options, channel, data, model):
    """
    Parameters:
        options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Number of iterations
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
        channel: jobman channel or None
        data: dictionary-like object returned by numpy.load containing the
            data
        model: model
    """
    n_params = len(model.params)
    self.data = data
    eps = numpy.float32(1e-24)
    xdata = theano.shared(data['train_x'], name='xdata')
    ydata = theano.shared(data['train_y'], name='ydata')
    self.xdata = xdata
    self.ydata = ydata
    shared_data = [xdata, ydata]
    self.rng = numpy.random.RandomState(options['seed'])
    n_samples = data['train_x'].shape[0]
    self.grad_batches = n_samples // options['gbs']
    self.metric_batches = n_samples // options['mbs']
    self.eval_batches = n_samples // options['ebs']
    self.verbose = options['verbose']
    # Store euclidean gradients
    self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
               for shp in model.params_shape]
    # Store Riemannian gradients
    self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
               for shp in model.params_shape]
    # Store Jacobi diagonal
    self.js = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
               for shp in model.params_shape]
    self.permg = self.rng.permutation(self.grad_batches)
    self.permr = self.rng.permutation(self.metric_batches)
    self.perme = self.rng.permutation(self.eval_batches)
    self.k = 0
    self.posg = 0
    self.posr = 0
    self.pose = 0

    # Step 1. Compile function for computing euclidean gradients
    gbdx = TT.iscalar('grad_batch_idx')
    print 'Constructing grad function'
    srng = RandomStreams(numpy.random.randint(1e5))
    loc_inputs = [x.type() for x in model.inputs]

    def grad_step(*args):
        idx = TT.cast(args[0], 'int32')
        nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                   for x in loc_inputs]
        replace = dict(zip(model.inputs, nw_inps))
        nw_cost = safe_clone(model.train_cost, replace=replace)
        gs = TT.grad(nw_cost, model.params)
        nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
        # Compute jacobi
        nw_outs = safe_clone(model.outs, replace=replace)
        final_results = dict(zip(model.params, [None] * n_params))
        for nw_out, out_operator in zip(nw_outs, model.outs_operator):
            if out_operator == 'sigmoid':
                denom = numpy.float32(options['cbs'])
                #denom *= nw_out
                #denom *= (numpy.float32(1) - nw_out)
            elif out_operator == 'softmax':
                denom = numpy.float32(options['cbs'])
                denom *= (nw_out + eps)
            else:
                denom = numpy.float32(options['cbs'])
            factor = TT.sqrt(numpy.float32(1) / denom)
            if out_operator == 'sigmoid':
                tnwout = TT.nnet.sigmoid(nw_out)
                factor = TT.sqrt(tnwout * (numpy.float32(1) - tnwout)) * \
                    factor
            r = TT.sgn(srng.normal(nw_out.shape))
            r = r * factor
            loc_params = [x for x in model.params
                          if x in theano.gof.graph.inputs([nw_out])]
            jvs = TT.Lop(nw_out, loc_params, r)
            for lp, lj in zip(loc_params, jvs):
                if final_results[lp] is None:
                    final_results[lp] = TT.sqr(lj)
                else:
                    final_results[lp] = final_results[lp] + TT.sqr(lj)
        nw_js = [oj + final_results[p] for oj, p in
                 zip(args[1 + n_params:1 + 2 * n_params], model.params)]
        return [args[0] + const(1)] + nw_gs + nw_js

    ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
          for shp in model.params_shape]
    ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0)
          for shp in model.params_shape]
    idx0 = TT.unbroadcast(const([0]), 0)
    n_steps = options['gbs'] // options['cbs']
    rvals, updates = scan(grad_step,
                          states=[idx0] + ig + ij,
                          n_steps=n_steps,
                          mode=gpu_mode,
                          name='grad_loop',
                          profile=options['profile'])
    nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]
    nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]]
    updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
    grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                 for x, y in zip(loc_inputs, shared_data)]
    print 'Compiling grad function'
    self.compute_eucledian_gradients = theano.function(
        [gbdx], [],
        updates=updates,
        givens=dict(grad_inps),
        name='compute_eucledian_gradients',
        mode=gpu_mode,
        on_unused_input='warn',
        profile=options['profile'])
    #theano.printing.pydotprint(self.compute_eucledian_gradients,
    #                           'eucledian_grad', scan_graphs=True)

    # Step 2. Compile function for computing Riemannian gradients
    rbdx = TT.iscalar('riemmanian_batch_idx')
    rbpos = rbdx * options['mbs']
    self.damping = theano.shared(numpy.float32(options['mreg']))
    mode = gpu_mode

    def compute_Gv(*args):
        idx0 = const([0])
        ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

        def Gv_step(*gv_args):
            idx = TT.cast(gv_args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_outs = safe_clone(model.outs, replace)
            final_results = dict(zip(model.params,
                                     [None] * len(model.params)))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                loc_params = [x for x in model.params
                              if x in theano.gof.graph.inputs([nw_out])]
                loc_args = [x for x, y in zip(args, model.params)
                            if y in theano.gof.graph.inputs([nw_out])]
                if out_operator == 'softmax':
                    factor = const(options['cbs']) * (nw_out + eps)
                elif out_operator == 'sigmoid':
                    factor = const(options['cbs'])  # * nw_out * (1 - nw_out)
                else:
                    factor = const(options['cbs'])
                if out_operator != 'sigmoid':
                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params,
                                            loc_args) / factor)
                else:
                    tnwout = TT.nnet.sigmoid(nw_out)
                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) *
                                     tnwout * (1 - tnwout) / factor)
                for lp, lgv in zip(loc_params, loc_Gvs):
                    if final_results[lp] is None:
                        final_results[lp] = lgv
                    else:
                        final_results[lp] += lgv
            Gvs = [ogv + final_results[param]
                   for (ogv, param) in zip(gv_args[1:], model.params)]
            return [gv_args[0] + const(1)] + Gvs

        states = [idx0] + ep
        n_steps = options['mbs'] // options['cbs']
        rvals, updates = scan(Gv_step,
                              states=states,
                              n_steps=n_steps,
                              mode=theano.Mode(linker='cvm'),
                              name='Gv_step',
                              profile=options['profile'])
        final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
        return final_Gvs, updates

    print 'Constructing riemannian gradient function'
    norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
    rvals = minres.minres(compute_Gv,
                          [x / norm_grads for x in self.gs],
                          Ms=self.js,
                          rtol=options['mrtol'],
                          shift=self.damping,
                          maxit=options['miters'],
                          mode=mode,
                          profile=options['profile'])
    nw_rs = [x * norm_grads for x in rvals[0]]
    flag = rvals[1]
    niters = rvals[2]
    rel_residual = rvals[3]
    rel_Aresidual = rvals[4]
    Anorm = rvals[5]
    Acond = rvals[6]
    xnorm = rvals[7]
    Axnorm = rvals[8]
    updates = rvals[9]
    norm_ord0 = TT.max(abs(nw_rs[0]))
    for r in nw_rs[1:]:
        norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))
    updates.update(dict(zip(self.rs, nw_rs)))
    grad_inps = [(x, y[rbdx * options['mbs']: (rbdx + 1) * options['mbs']])
                 for x, y in zip(loc_inputs, shared_data)]
    print 'Compiling riemannian gradient function'
    self.compute_riemannian_gradients = theano.function(
        [rbdx],
        [flag, niters, rel_residual, rel_Aresidual, Anorm, Acond,
         xnorm, Axnorm, norm_grads, norm_ord0],
        updates=updates,
        givens=dict(grad_inps),
        name='compute_riemannian_gradients',
        on_unused_input='warn',
        mode=mode,
        profile=options['profile'])

    # Step 3. Compile function for evaluating cost and updating
    # parameters
    print 'constructing evaluation function'
    lr = TT.scalar('lr')
    self.lr = numpy.float32(options['lr'])
    ebdx = TT.iscalar('eval_batch_idx')
    nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

    def cost_step(_idx, acc0, acc1):
        idx = TT.cast(_idx, 'int32')
        nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                   for x in loc_inputs]
        replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
        nw_cost = safe_clone(model.train_cost, replace=replace)
        nw_cost2 = safe_clone(model.train_cost,
                              replace=dict(zip(model.inputs, nw_inps)))
        return [_idx + const(1), acc0 + nw_cost, acc1 + nw_cost2]

    acc0 = const([0])
    acc1 = const([0])
    idx0 = const([0])
    n_steps = options['ebs'] // options['cbs']
    rvals, updates = scan(cost_step,
                          states=[idx0, acc0, acc1],
                          n_steps=n_steps,
                          name='cost_loop',
                          mode=gpu_mode,
                          profile=options['profile'])
    final_cost = rvals[1].sum() / const(n_steps)
    cost0 = rvals[2].sum() / const(n_steps)
    grad_inps = [(x, y[ebdx * options['ebs']: (ebdx + 1) * options['ebs']])
                 for x, y in zip(loc_inputs, shared_data)]
    denom = -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)])
    rho = (final_cost - cost0) / denom
    print 'compiling evaluation function'
    self.eval_fn = theano.function(
        [ebdx, lr],
        [final_cost, rho],
        givens=dict(grad_inps),
        on_unused_input='warn',
        updates=updates,
        name='eval_fn',
        mode=gpu_mode,
        profile=options['profile'])
    update_dict = dict(zip(model.params, nw_ps))
    self.update_params = theano.function(
        [lr], [],
        updates=update_dict,
        name='update_params',
        on_unused_input='warn',
        mode=mode,
        profile=options['profile'])
    self.options = options
    self.old_cost = numpy.inf
    n_steps = options['ebs'] // options['cbs']

    def ls_error(_idx, acc):
        idx = TT.cast(_idx, 'int32')
        nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                   for x in loc_inputs]
        replace = dict(zip(model.inputs, nw_inps))
        nw_cost = TT.cast(safe_clone(model.err, replace=replace), 'float32')
        return [_idx + const(1), acc + nw_cost]

    states = [TT.constant(numpy.float32([0])),
              TT.constant(numpy.float32([0]))]
    rvals, _ = scan(ls_error,
                    states=states,
                    n_steps=n_steps,
                    name='ls_err_step',
                    mode=gpu_mode,
                    profile=options['profile'])
    ferr = rvals[1][0] / const(n_steps)
    self.compute_error = theano.function(
        [ebdx], ferr,
        givens=dict(grad_inps),
        name='compute_err',
        mode=gpu_mode,
        on_unused_input='warn',
        profile=options['profile'])
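# Illustrative sketch (an assumption, not taken from the author's training
# loop): the reduction ratio `rho` returned by `eval_fn` above compares the
# actual decrease of the cost with the decrease -lr * <g, r> predicted by the
# linear model.  A standard Levenberg-Marquardt style heuristic would adjust
# the metric damping from it, for example:
def adjust_damping(damping, rho, increase=3. / 2, decrease=2. / 3):
    # Grow the damping when the step overshoots (rho small), shrink it when
    # the local model is trustworthy (rho large).  The 0.25 / 0.75 thresholds
    # are the conventional Levenberg-Marquardt choices.
    if rho < 0.25:
        return damping * increase
    elif rho > 0.75:
        return damping * decrease
    return damping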
def init_cpu(self, options, channel, data, model): n_params = len(self.model.params) # Step 1. Compile function for computing eucledian gradients self.reset_gradients = theano.function( [], [], updates = zip(self.gs, [TT.zeros_like(g) for g in self.gs]), on_unused_input='warn', mode=cpu_mode, name='reset_gradients', profile=options['profile']) gbdx = TT.iscalar('grad_batch_idx') comp_grad = TT.iscalar('comp_grad') print 'Constructing grad function' loc_inputs = [x.type() for x in model.inputs] srng = RandomStreams(numpy.random.randint(1e5)) cst = time.time() def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[2: 2 + n_params], gs)] _gs = [x for x in gs] _nw_gs = [gpu_from_host(g) for g in nw_gs] nw_gs = ifelse(comp_grad, _nw_gs, _gs, gpu=True) nw_gs = [x.type.filter_variable(y) for x,y in zip(args[2:],nw_gs)] return [args[0] + const(1), args[1] + nw_cost] + nw_gs ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0) for shp in model.params_shape] idx0 = TT.unbroadcast(const([0]),0) cost0 = TT.unbroadcast(const([0]),0) n_steps = TT.iscalar('nsteps') rvals, updates = scan(grad_step, states=[idx0, cost0] + ig, n_steps=n_steps, name='grad_loop', mode=gpu_mode, profile=options['profile']) nw_gs = [x[0] / TT.cast(n_steps, 'float32') for x in rvals[2: 2 + n_params]] nw_gs = [og + nwg for og, nwg in zip(self.gs, nw_gs)] fcost = rvals[1][0] / TT.cast(n_steps, 'float32') updates.update(dict(zip(self.gs, nw_gs))) grad_inps = zip(loc_inputs, self.shared_data) print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gbdx, comp_grad, n_steps], fcost, updates=updates, on_unused_input='warn', givens=dict(grad_inps), name='compute_eucledian_gradients', mode=gpu_mode, profile=options['profile']) print 'Time to compile grad', print_time(time.time() - cst) cst = time.time() def jacob_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']:(idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) replace.update(dict(zip(model.params, model.cpu_params))) mode=cpu_mode params = model.cpu_params # Compute jacobi nw_outs = safe_clone(model.outs, replace=replace) final_results = dict(zip(params, [None]*n_params)) for nw_out, out_operator in zip(nw_outs, model.outs_operator): if out_operator == 'sigmoid': denom = numpy.float32(options['cbs']) denom *= nw_out denom *= (numpy.float32(1) - nw_out) elif out_operator == 'softmax': denom = numpy.float32(options['cbs']) denom *= nw_out else: denom = numpy.float32(options['cbs']) factor = TT.sqrt(numpy.float32(1) / denom) r = TT.sgn(srng.normal(nw_out.shape)) r = r * factor loc_params = [x for x in params if x in theano.gof.graph.inputs([nw_out])] jvs = TT.Lop(nw_out, loc_params, r) for lp, lj in zip(loc_params, jvs): if final_results[lp] is None: final_results[lp] = TT.sqr(lj) else: final_results[lp] = final_results[lp] + TT.sqr(lj) nw_js = [oj + final_results[p] for oj, p in zip(args[1:1+n_params], params)] return [args[0] + const(1)] + nw_js ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp),0) for shp in model.params_shape] idx0 = TT.unbroadcast(const([0]),0) n_steps = options['mbs'] // options['cbs'] mode = cpu_mode rvals, updates = scan(jacob_step, states=[idx0] + ij, n_steps=n_steps, name='jacob_loop', mode=mode, 
profile=options['profile']) nw_js = [x[0] for x in rvals[1:1+n_params]] updates.update(dict(zip(self.js, nw_js))) grad_inps = [(x, y[gbdx*options['mbs']:(gbdx+1)*options['mbs']]) for x,y in zip(loc_inputs[:1], self.cpu_shared_data[:1])] print 'Compiling grad function' self.compute_jacobi_preconditioner = theano.function( [gbdx], [], updates=updates, on_unused_input='warn', givens=dict(grad_inps), name='jacobi_preconditioner_gradients', mode=mode, profile=options['profile']) print 'Time compile jacobi ', print_time(time.time() - cst) cst = time.time() # Step 2. Compile function for Computing Riemannian gradients rbdx = TT.iscalar('riemmanian_batch_idx') mode = cpu_mode def compute_Gv(*args): cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX), name ='cgv%d'%idx) for idx, shp in enumerate(model.params_shape)] print_mem('allocated mem for cgv') idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict(zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [x for x in model.params if x in theano.gof.graph.inputs([nw_out])] loc_args = [x for x, y in zip(cgv, model.params) if y in theano.gof.graph.inputs([nw_out])] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const(options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params)] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=gpu_mode, name='Gv_step', profile=options['profile']) final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]] grad_inps = zip(loc_inputs, self.shared_data) loc_fn = theano.function([], final_Gvs, updates = updates, givens = dict(grad_inps), on_unused_input='warn', mode=gpu_mode, name='loc_fn', profile = options['profile']) fake_op = FakeGPUShell(cgv, loc_fn, len(cgv)) return fake_op(*args), {} print 'Constructing riemannian gradient function' norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs)) mreg = TT.scalar('mreg') rvals = minres.minres( compute_Gv, [x / norm_grads for x in self.gs], Ms = self.js, rtol=options['mrtol'], shift = - mreg, maxit=options['miters'], mode=mode, profile=options['profile']) nw_rs = [x * norm_grads for x in rvals[0]] flag = rvals[1] niters = rvals[2] rel_residual = rvals[3] rel_Aresidual = rvals[4] Anorm = rvals[5] Acond = rvals[6] xnorm = rvals[7] Axnorm = rvals[8] updates = rvals[9] norm_ord0 = TT.max(abs(nw_rs[0])) for r in nw_rs[1:]: norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r))) updates.update(dict(zip(self.rs, nw_rs))) print 'Compiling riemannian gradient function' self.compute_riemannian_gradients = theano.function( [mreg], [flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm, Axnorm, norm_grads, norm_ord0], updates=updates, name='compute_riemannian_gradients', mode=cpu_mode, on_unused_input='warn', 
profile=options['profile']) print 'Time to compile Riemannian', print_time(time.time() - cst) cst = time.time() # Step 3. Compile function for evaluating cost and updating # parameters print 'constructing evaluation function' lr = TT.scalar('lr') nw_ps = [p - lr * r for p, r in zip(model.cpu_params, self.rs)] nw_ds = [ -r for r in self.rs] self.update_cparams = theano.function( [lr], updates = dict(zip(model.cpu_params, nw_ps)), name='update_cparam', allow_input_downcast=True, mode=cpu_mode, on_unused_input='warn', profile=options['profile']) newparams = [y.type.filter_variable(x) for x,y in zip(nw_ps, model.params)] self.update_params = theano.function([lr], updates = dict(zip(model.params, newparams)), name='update_params', on_unused_input='warn', allow_input_downcast=True, mode=cpu_mode, profile=options['profile']) self.scalar_grad = theano.function( [], sum(TT.sum(x*y) for x,y in zip(self.gs, self.ds)), name='scalar_grad', mode=cpu_mode, on_unused_input='warn', profile=options['profile']) nsteps = self.options['ebs'] // self.options['cbs'] self.current_alpha = numpy.inf def ls_cost(alpha, pos): if alpha != self.current_alpha: self.current_alpha = alpha self.update_params(alpha) return self.compute_eucledian_gradients(pos, 0, nsteps) self.ls_cost_fn = ls_cost def ls_grad(alpha, pos): if alpha != self.current_alpha: self.current_alpha = alpha self.update_params(alpha) self.reset_gradients() self.compute_eucledian_gradients(pos, 1, nsteps) return self.scalar_grad() self.ls_grad_fn = ls_grad self.old_score = 50000 n_steps = options['ebs']// options['cbs'] def ls_error(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) replace.update(dict(zip(model.params, model.cpu_params))) nw_cost = \ TT.cast(safe_clone(model.err, replace=replace), 'float32') return [_idx + const(1), acc + nw_cost] states = [TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0]))] rvals, _ = scan(ls_error, states = states, n_steps = n_steps, name='ls_err_step', mode=cpu_mode, profile = options['profile']) ferr = rvals[1][0] / const(n_steps) self.compute_error = theano.function([], ferr, givens=dict(zip(loc_inputs, self.cpu_shared_data)), name='compute_err', mode=cpu_mode, on_unused_input='warn', profile=options['profile']) print 'Compile eval time', print_time(time.time() - cst) self.old_cost = 1e6 self.options = options self.perm = self.rng.permutation(4) self.pos = 0
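# ---------------------------------------------------------------------------
# Hedged numpy sketch of the estimator behind `jacob_step` above: for a random
# probe r with E[r r^T] = I, E[(J^T r)_i ** 2] = (J^T J)_{ii}, so squaring the
# backpropagated probe TT.Lop(nw_out, params, r) and averaging estimates the
# diagonal of the Gauss-Newton matrix; `self.js` stores this diagonal (seeded
# with `jreg`) and later serves as the MINRES preconditioner `Ms`.  The
# explicit Jacobian below is for illustration only, not part of this file.
import numpy

rng = numpy.random.RandomState(0)
J = rng.randn(8, 5)                            # d outputs / d params
true_diag = numpy.diag(J.T.dot(J))

est = numpy.zeros(5)
n_draws = 4000
for _ in range(n_draws):
    r = numpy.sign(rng.randn(8))               # +/-1 probe, E[r r^T] = I
    est += J.T.dot(r) ** 2
est /= n_draws                                 # est ~ true_diag up to MC noise

# ---------------------------------------------------------------------------
# Hedged numpy sketch of the two line-search oracles compiled above: `ls_cost`
# evaluates phi(alpha) = f(theta0 - alpha * d) and `ls_grad` evaluates
# phi'(alpha) = -d . grad f(theta0 - alpha * d), the scalar that
# `scalar_grad` assembles from the re-accumulated gradients and the descent
# direction.  `f` and `grad_f` are hypothetical closures, not functions
# defined in this file.
def line_search_oracles(f, grad_f, theta0, d):
    phi = lambda alpha: f(theta0 - alpha * d)
    dphi = lambda alpha: -numpy.dot(d, grad_f(theta0 - alpha * d))
    return phi, dphi

f = lambda x: 0.5 * numpy.dot(x, x)            # toy quadratic cost
grad_f = lambda x: x
phi, dphi = line_search_oracles(f, grad_f, numpy.ones(3), numpy.ones(3))
eps_fd = 1e-6                                  # finite-difference check
assert abs((phi(0.5 + eps_fd) - phi(0.5 - eps_fd)) / (2 * eps_fd)
           - dphi(0.5)) < 1e-6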
    def __init__(self, options, channel, data, model):
        """
        Parameters:

            options: Dictionary
                `options` is expected to contain the following keys:

                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Number of iterations allowed when inverting the metric
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag indicating whether profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load containing
                the data
            model: model
        """
        n_params = len(model.params)
        self.data = data
        eps = numpy.float32(1e-24)
        xdata = theano.shared(data['train_x'], name='xdata')
        ydata = theano.shared(data['train_y'], name='ydata')
        self.xdata = xdata
        self.ydata = ydata
        shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']
        self.verbose = options['verbose']

        # Store eucledian gradients
        self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        # Store riemannian gradients
        self.rs1 = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                    for shp in model.params_shape]
        self.rs2 = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                    for shp in model.params_shape]
        # Store jacobi diagonal
        self.js = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0
        # Step 1.
Compile function for computing eucledian gradients gbdx = TT.iscalar('grad_batch_idx') print 'Constructing grad function' srng = RandomStreams(numpy.random.randint(1e5)) loc_inputs = [x.type() for x in model.inputs] def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)] # Compute jacobi nw_outs = safe_clone(model.outs, replace=replace) final_results = dict(zip(model.params, [None] * n_params)) for nw_out, out_operator in zip(nw_outs, model.outs_operator): if out_operator == 'sigmoid': denom = numpy.float32(options['cbs']) #denom *= nw_out #denom *= (numpy.float32(1) - nw_out) elif out_operator == 'softmax': denom = numpy.float32(options['cbs']) denom *= (nw_out + eps) else: denom = numpy.float32(options['cbs']) factor = TT.sqrt(numpy.float32(1) / denom) if out_operator == 'sigmoid': tnwout = TT.nnet.sigmoid(nw_out) factor = TT.sqrt(tnwout * (numpy.float32(1) - tnwout)) * factor r = TT.sgn(srng.normal(nw_out.shape, nstreams=128)) r = r * factor loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] jvs = TT.Lop(nw_out, loc_params, r) for lp, lj in zip(loc_params, jvs): if final_results[lp] is None: final_results[lp] = TT.sqr(lj) else: final_results[lp] = final_results[lp] + TT.sqr(lj) nw_js = [ oj + final_results[p] for oj, p in zip(args[1 + n_params:1 + 2 * n_params], model.params) ] return [args[0] + const(1)] + nw_gs + nw_js ig = [ TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0) for shp in model.params_shape ] ij = [ TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0) for shp in model.params_shape ] idx0 = TT.unbroadcast(const([0]), 0) n_steps = options['gbs'] // options['cbs'] rvals, updates = scan(grad_step, states=[idx0] + ig + ij, n_steps=n_steps, mode=gpu_mode, name='grad_loop', profile=options['profile']) nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]] nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]] updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js))) grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']]) for x, y in zip(loc_inputs, shared_data)] print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gbdx], [], updates=updates, givens=dict(grad_inps), name='compute_eucledian_gradients', mode=gpu_mode, on_unused_input='warn', profile=options['profile']) #theano.printing.pydotprint(self.compute_eucledian_gradients, # 'eucledian_grad', scan_graphs=True) self.damping = theano.shared(numpy.float32(options['mreg'])) # Step 2.1 Compile function for Computing Riemannian gradients rbdx = TT.iscalar('riemmanian_batch_idx') rbpos = rbdx * options['mbs'] mode = gpu_mode def compute_Gv(*args): idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.gf_outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.gf_outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(args, model.params) if y in 
theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * (nw_out + eps) elif out_operator == 'sigmoid': factor = const( options['cbs']) # * nw_out * (1 - nw_out) else: factor = const(options['cbs']) if out_operator != 'sigmoid': loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) else: tnwout = TT.nnet.sigmoid(nw_out) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) *\ tnwout * (1 - tnwout)/ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs nw_cost, nw_preactiv_out = safe_clone( [model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop( nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, args)) Gvs = [ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=theano.Mode(linker='cvm'), name='Gv_step', profile=options['profile']) final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]] #_final_Gvs = [x + self.damping * y # for x,y in zip(final_Gvs, args)] return final_Gvs, updates print 'Constructing riemannian gradient function' norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs)) rvals = minres.minres( compute_Gv, [x / norm_grads for x in self.gs], #Ms = self.js, rtol=options['mrtol'], shift=self.damping, maxit=options['miters'], mode=mode, profile=options['profile']) nw_rs = [x * norm_grads for x in rvals[0]] flag = rvals[1] niters = rvals[2] rel_residual = rvals[3] rel_Aresidual = rvals[4] Anorm = rvals[5] Acond = rvals[6] xnorm = rvals[7] Axnorm = rvals[8] updates = rvals[9] norm_ord0 = TT.max(abs(nw_rs[0])) for r in nw_rs[1:]: norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r))) updates.update(dict(zip(self.rs1, nw_rs))) grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']]) for x, y in zip(loc_inputs, shared_data)] print 'Compiling riemannian gradient function' self.compute_riemannian_gradients1 = theano.function( [rbdx], [ flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm, Axnorm, norm_grads, norm_ord0 ], updates=updates, givens=dict(grad_inps), name='compute_riemannian_gradients', on_unused_input='warn', mode=mode, profile=options['profile']) # Step 2.2 Compile function for Computing Riemannian gradients rbpos = rbdx * options['mbs'] mode = gpu_mode def compute_Gv(*args): idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.gc_outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.gc_outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(args, model.params) if y in theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * (nw_out + eps) elif out_operator == 'sigmoid': factor = const( options['cbs']) # * nw_out * (1 - nw_out) else: factor = const(options['cbs']) if out_operator != 'sigmoid': loc_Gvs = TT.Lop(nw_out, 
loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) else: tnwout = TT.nnet.sigmoid(nw_out) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) *\ tnwout * (1 - tnwout)/ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs nw_cost, nw_preactiv_out = safe_clone( [model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop( nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, args)) Gvs = [ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=theano.Mode(linker='cvm'), name='Gv_step', profile=options['profile']) final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]] #_final_Gvs = [x + self.damping * y # for x,y in zip(final_Gvs, args)] return final_Gvs, updates print 'Constructing riemannian gradient function' norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs)) rvals = minres.minres( compute_Gv, [x / norm_grads for x in self.gs], #Ms = self.js, rtol=options['mrtol'], shift=self.damping, maxit=options['miters'], mode=mode, profile=options['profile']) nw_rs = [x * norm_grads for x in rvals[0]] flag = rvals[1] niters = rvals[2] rel_residual = rvals[3] rel_Aresidual = rvals[4] Anorm = rvals[5] Acond = rvals[6] xnorm = rvals[7] Axnorm = rvals[8] updates = rvals[9] norm_ord0 = TT.max(abs(nw_rs[0])) for r in nw_rs[1:]: norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r))) updates.update(dict(zip(self.rs2, nw_rs))) grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']]) for x, y in zip(loc_inputs, shared_data)] print 'Compiling riemannian gradient function' self.compute_riemannian_gradients2 = theano.function( [rbdx], [ flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm, Axnorm, norm_grads, norm_ord0 ], updates=updates, givens=dict(grad_inps), name='compute_riemannian_gradients', on_unused_input='warn', mode=mode, profile=options['profile']) # Step 3. 
Compile function for evaluating cost and updating # parameters print 'constructing evaluation function' if options['rsch'] == 1: self.rs = self.rs1 else: self.rs = self.rs2 lr = TT.scalar('lr') self.lr = numpy.float32(options['lr']) ebdx = TT.iscalar('eval_batch_idx') nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)] def cost_step(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps)) nw_cost = safe_clone(model.train_cost, replace=replace) return [_idx + const(1), acc + nw_cost] acc0 = const([0]) idx0 = const([0]) n_steps = options['ebs'] // options['cbs'] rvals, updates = scan(cost_step, states=[idx0, acc0], n_steps=n_steps, name='cost_loop', mode=gpu_mode, profile=options['profile']) final_cost = rvals[1].sum() / const(n_steps) grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']]) for x, y in zip(loc_inputs, shared_data)] denom = -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)]) self.approx_change = theano.function([lr], denom, name='approx_change', mode=gpu_mode, allow_input_downcast=True, profile=options['profile']) print 'compling evaluation function' self.eval_fn = theano.function([ebdx, lr], final_cost, givens=dict(grad_inps), on_unused_input='warn', updates=updates, name='eval_fn', allow_input_downcast=True, mode=gpu_mode, profile=options['profile']) def ls_grad_step(_idx, gws): idx = TT.cast(_idx, 'int32') nw_inps = [ x[idx * options['cbs']:(idx + 1) * options['cbs']] for x in loc_inputs ] replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps)) nw_cost = safe_clone(model.train_cost, replace=replace) nw_gs = TT.grad(nw_cost, lr) return _idx + numpy.float32(1), gws + nw_gs states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])) ] rvals, _ = scan(ls_grad_step, states=states, n_steps=n_steps, name='ls_grad_step', profile=options['profile']) fgrad = rvals[1][0] / const(n_steps) self.grad_lr_fn = theano.function([ebdx, lr], fgrad, givens=grad_inps, name='ls_grad_fn', on_unused_input='warn', mode=gpu_mode, allow_input_downcast=True, profile=options['profile']) update_dict = dict(zip(model.params, nw_ps)) self.update_params = theano.function([lr], [], updates=update_dict, name='update_params', on_unused_input='warn', allow_input_downcast=True, mode=mode, profile=options['profile']) self.options = options self.old_cost = numpy.inf n_steps = options['ebs'] // options['cbs'] def ls_error(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = TT.cast(safe_clone(model.err, replace=replace), 'float32') return [_idx + const(1), acc + nw_cost] states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])) ] rvals, _ = scan(ls_error, states=states, n_steps=n_steps, name='ls_err_step', mode=gpu_mode, profile=options['profile']) ferr = rvals[1][0] / const(n_steps) self.compute_error = theano.function([ebdx], ferr, givens=dict(grad_inps), name='compute_err', mode=gpu_mode, on_unused_input='warn', profile=options['profile'])
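# ---------------------------------------------------------------------------
# Hypothetical example of the `options` dictionary documented above; the
# values are illustrative only and are not taken from any experiment in this
# repository.  Keys such as `jreg` and `rsch` are read by this constructor
# but are not listed in the docstring.
example_options = {
    'cbs': 250,      # chunk size used whenever a quantity is accumulated
    'gbs': 5000,     # samples per gradient batch
    'mbs': 1000,     # samples per metric (Gauss-Newton) batch
    'ebs': 5000,     # samples used to evaluate cost / training error
    'mreg': 1e-4,    # damping added to the metric
    'mrtol': 1e-4,   # relative tolerance of the MINRES solve
    'miters': 20,    # maximum MINRES iterations
    'jreg': 1e-5,    # regularizer seeding the Jacobi diagonal
    'rsch': 1,       # which Riemannian direction (rs1 / rs2) to use
    'seed': 123,
    'profile': 0,
    'verbose': 1,
    'lr': 1.0,
}

# ---------------------------------------------------------------------------
# Hedged numpy/scipy sketch of Step 2 above with an explicit Jacobian:
# TT.Rop(out, params, v) corresponds to J v and TT.Lop(out, params, u) to
# J^T u, so `Gv_step` accumulates G v = J^T ((J v) / factor) over cbs-sized
# chunks, MINRES is run on the normalised gradient, and the solution is
# rescaled afterwards.  scipy's MINRES is only a stand-in for the `minres`
# module used in this file, and its `shift` convention ((A - shift*I) x = b)
# may differ from that module's.  Explicit matrices are illustration only.
import numpy
from scipy.sparse.linalg import LinearOperator, minres as sp_minres

rng = numpy.random.RandomState(0)
n_out, n_par = 30, 12
J = rng.randn(n_out, n_par)                    # d outputs / d params
p = rng.dirichlet(numpy.ones(n_out))           # stand-in softmax output
factor = n_out * (p + 1e-24)                   # cbs * (nw_out + eps)
g = rng.randn(n_par)                           # Euclidean gradient
damping = 1e-2

def Gv(v):                                     # what one Gv_step computes
    return J.T.dot(J.dot(v) / factor)

op = LinearOperator((n_par, n_par), matvec=Gv, dtype=numpy.float64)
norm_g = numpy.linalg.norm(g)
x, _ = sp_minres(op, g / norm_g, shift=-damping)
r = x * norm_g                                 # the Riemannian direction nw_rs
G_damped = (J.T.dot(numpy.diag(1.0 / factor)).dot(J)
            + damping * numpy.eye(n_par))
assert numpy.linalg.norm(G_damped.dot(r) - g) <= 1e-3 * norm_g

# ---------------------------------------------------------------------------
# Hedged numpy sketch of `approx_change` above: for the update
# theta <- theta - lr * r the first-order predicted change of the cost is
# -lr * sum(g . r); comparing it with the actual change (which `eval_fn`
# provides) is the usual way to adapt damping or step size, though how the
# caller of this class uses it lies outside this file.  Toy quadratic cost.
A_q = rng.randn(n_par, n_par)
A_q = A_q.dot(A_q.T) / n_par + numpy.eye(n_par)
cost = lambda th: 0.5 * th.dot(A_q).dot(th)
theta = rng.randn(n_par)
g_q = A_q.dot(theta)                           # gradient at theta
r_q = numpy.linalg.solve(A_q + damping * numpy.eye(n_par), g_q)
lr = 1e-3
predicted = -lr * g_q.dot(r_q)
actual = cost(theta - lr * r_q) - cost(theta)
assert abs(actual - predicted) <= 1e-3 * abs(predicted) + 1e-12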
def __init__(self, options, channel, data, model): """ Parameters: options: Dictionary `options` is expected to contain the following keys: `cbs` -> int Number of samples to consider at a time when computing some property of the model `gbs` -> int Number of samples over which to compute the gradients `mbs` -> int Number of samples over which to compute the metric `ebs` -> int Number of samples over which to evaluate the training error `mreg` -> float Regularization added to the metric `mrtol` -> float Relative tolerance for inverting the metric `miters` -> int Number of iterations `seed` -> int Random number generator seed `profile` -> bool Flag, if profiling should be on or not `verbose` -> int Verbosity level `lr` -> float Learning rate channel: jobman channel or None data: dictionary-like object return by numpy.load containing the data model : model """ n_params = len(model.params) self.data = data if options['device'] != 'gpu': xdata = theano.shared(data['train_x'][:options['gbs']], name='xdata') ydata = TT._shared(data['train_y'][:options['gbs']], name='ydata') self.xdata = xdata self.ydata = ydata shared_data = [xdata, ydata] else: self.cpu_shared_data = [] xdata = theano.shared(data['train_x'], name='xdata') ydata = TT._shared(data['train_y'], name='ydata') self.xdata = xdata self.ydata = ydata shared_data = [xdata, ydata] self.rng = numpy.random.RandomState(options['seed']) n_samples = data['train_x'].shape[0] self.grad_batches = n_samples // options['gbs'] self.metric_batches = n_samples // options['mbs'] self.eval_batches = n_samples // options['ebs'] self.verbose = options['verbose'] if options['device'] != 'gpu': # Store eucledian gradients self.gs = [ TT._shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] # Store riemannian gradients self.rs = [ TT._shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] else: # Store eucledian gradients self.gs = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] # Store riemannian gradients self.rs = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] self.permg = self.rng.permutation(self.grad_batches) self.permr = self.rng.permutation(self.metric_batches) self.perme = self.rng.permutation(self.eval_batches) self.k = 0 self.posg = 0 self.posr = 0 self.pose = 0 # Step 1. 
Compile function for computing eucledian gradients # inputs gbdx = TT.iscalar('grad_batch_idx') print 'Constructing grad function' srng = RandomStreams(numpy.random.randint(1e5)) loc_inputs = [x.type() for x in model.inputs] def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)] return [args[0] + const(1)] + \ nw_gs ig = [ TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0) for shp in model.params_shape ] idx0 = TT.unbroadcast(const([0]), 0) n_steps = options['gbs'] // options['cbs'] rvals, updates = scan(grad_step, states=[idx0] + ig, n_steps=n_steps, name='grad_loop', profile=options['profile']) nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]] # updates updates.update(dict(zip(self.gs, nw_gs))) # givens if options['device'] == 'gpu': grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']]) for x, y in zip(loc_inputs, shared_data)] else: grad_inps = zip(loc_inputs, shared_data) print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gbdx], [], updates=updates, givens=dict(grad_inps), name='compute_eucledian_gradients', mode=gpu_mode, on_unused_input='warn', profile=options['profile']) # Step 2. Compile function for Computing Riemannian gradients rbdx = TT.iscalar('riemmanian_batch_idx') rbpos = rbdx * options['mbs'] if options['device'] == 'gpu': mode = gpu_mode def compute_Gv(*args): idx0 = const([0]) ep = [ TT.alloc(const(0), 1, *shp) for shp in model.params_shape ] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(args, model.params) if y in theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const( options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs nw_cost, nw_preactiv_out = safe_clone( [model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop( nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, args)) Gvs = [ ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs) ] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=theano.Mode(linker='cvm'), name='Gv_step', profile=options['profile']) final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]] return final_Gvs, updates else: mode = cpu_mode def compute_Gv(*args): cgv = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX), name='cgv%d' % idx) for idx, shp in 
enumerate(model.params_shape) ] print_mem('allocated mem for cgv') idx0 = const([0]) ep = [ TT.alloc(const(0), 1, *shp) for shp in model.params_shape ] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(cgv, model.params) if y in theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const( options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=gpu_mode, name='Gv_step', profile=options['profile']) final_Gvs = [ TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:] ] grad_inps = zip(loc_inputs, shared_data) loc_fn = theano.function([], final_Gvs, updates=updates, givens=dict(grad_inps), on_unused_input='warn', mode=gpu_mode, name='loc_fn', profile=options['profile']) fake_op = FakeGPUShell(cgv, loc_fn, len(cgv)) return fake_op(*args), {} print 'Constructing riemannian gradient function' norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs)) rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs], rtol=options['mrtol'], shift=-options['mreg'], maxit=options['miters'], mode=mode, profile=options['profile']) nw_rs = [x * norm_grads for x in rvals[0]] flag = rvals[1] niters = rvals[2] rel_residual = rvals[3] rel_Aresidual = rvals[4] Anorm = rvals[5] Acond = rvals[6] xnorm = rvals[7] Axnorm = rvals[8] updates = rvals[9] norm_ord0 = TT.max(abs(nw_rs[0])) for r in nw_rs[1:]: norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r))) updates.update(dict(zip(self.rs, nw_rs))) grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']]) for x, y in zip(loc_inputs[:1], shared_data[:1])] print 'Compiling riemannian gradient function' self.compute_riemannian_gradients = theano.function( [rbdx], [ flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm, Axnorm, norm_grads, norm_ord0 ], updates=updates, givens=dict(grad_inps), name='compute_riemannian_gradients', on_unused_input='warn', mode=mode, profile=options['profile']) # Step 3. 
Compile function for evaluating cost and updating # parameters print 'constructing evaluation function' lr = TT.scalar('lr') self.lr = numpy.float32(options['lr']) ebdx = TT.iscalar('eval_batch_idx') nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)] def cost_step(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps)) nw_cost = safe_clone(model.train_cost, replace=replace) return [_idx + const(1), acc + nw_cost] acc0 = const([0]) idx0 = const([0]) n_steps = options['ebs'] // options['cbs'] rvals, updates = scan(cost_step, states=[idx0, acc0], n_steps=n_steps, name='cost_loop', mode=gpu_mode, profile=options['profile']) final_cost = rvals[1] / const(n_steps) if options['device'] == 'gpu': grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']]) for x, y in zip(loc_inputs, shared_data)] else: grad_inps = zip(loc_inputs, shared_data) print 'compling evaluation function' self.eval_fn = theano.function([ebdx, lr], final_cost, givens=dict(grad_inps), on_unused_input='warn', updates=updates, name='eval_fn', mode=gpu_mode, profile=options['profile']) update_dict = dict(zip(model.params, nw_ps)) if options['device'] != 'gpu': update_dict.update(dict(zip(model.cparams, nw_ps))) self.update_params = theano.function([lr], [], updates=update_dict, name='update_params', on_unused_input='warn', mode=mode, profile=options['profile']) self.options = options self.old_cost = 1e6 self.device = options['device'] n_steps = options['ebs'] // options['cbs'] def ls_error(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = TT.cast(safe_clone(model.err, replace=replace), 'float32') return [_idx + const(1), acc + nw_cost] states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])) ] rvals, _ = scan(ls_error, states=states, n_steps=n_steps, name='ls_err_step', mode=cpu_mode, profile=options['profile']) ferr = rvals[1][0] / const(n_steps) self.compute_error = theano.function([ebdx], ferr, givens=dict(grad_inps), name='compute_err', mode=gpu_mode, on_unused_input='warn', profile=options['profile'])
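# ---------------------------------------------------------------------------
# Hedged numpy sketch of the two-level batching used throughout this class:
# a batch index (gbdx / rbdx / ebdx) selects a contiguous slice of `gbs` /
# `mbs` / `ebs` samples through `givens`, and the scan then walks that slice
# in `cbs`-sized chunks (n_steps = batch_size // cbs).  Pure indexing, for
# illustration only.
import numpy

data_stream = numpy.arange(100)
gbs, cbs, gbdx = 20, 5, 2
batch = data_stream[gbdx * gbs:(gbdx + 1) * gbs]   # what `givens` exposes
chunks = [batch[k * cbs:(k + 1) * cbs] for k in range(gbs // cbs)]
assert numpy.concatenate(chunks).tolist() == batch.tolist()

# ---------------------------------------------------------------------------
# Hedged sketch of the pattern behind the CPU branch of `compute_Gv` above:
# the solver only needs a black-box callable v -> G v, so the product can be
# staged through preallocated buffers (the `cgv` shared variables) and
# evaluated by a separately compiled function.  `FakeGPUShell` is a custom op
# defined elsewhere in this repository; the class below only mimics the idea
# with plain numpy and is not its implementation.
class StagedMatVec(object):
    def __init__(self, G):
        self._G = G
        self._buf = numpy.zeros(G.shape[1])        # plays the role of cgv

    def __call__(self, v):
        self._buf[...] = v                         # stage the solver's vector
        return self._G.dot(self._buf)              # run the "compiled" product

mv = StagedMatVec(2.0 * numpy.eye(3))
assert numpy.allclose(mv(numpy.ones(3)), 2.0 * numpy.ones(3))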