import numpy as np
import theano
import theano.tensor as T

# adam, update_fun, hypergrad, scale_norm, remove_nans, clip_grad and the
# module-level `step` scale are provided by other modules of this repository;
# they are used below but not defined here.


def fd2(mlp, fdm, params, globalLR1, globalLR2, momentParam1, momentParam2):
    cost2 = mlp.classError2
    gradC2 = T.grad(cost2, mlp.paramsT1)
    tempUps = []
    history = {'grad': dict(), 'up': dict()}

    # optionally smooth the C2 gradient with adam/momentum via update_fun
    if params.avC2grad in ['adam', 'momentum']:
        if params.avC2grad == 'adam':
            opt3 = adam()
        else:
            opt3 = None
        tempUps = [] if opt3 is None else opt3.initial_updates()

        newC2 = []
        for param, grad in zip(mlp.paramsT1, gradC2):
            tempUp, _, newGrad = update_fun(param, T.reshape(grad, param.shape), None,
                                            'T1', history, opt3, params,
                                            globalLR1, globalLR2, momentParam1, momentParam2)
            newC2 += newGrad
            tempUps += tempUp[:-1]
        gradC2 = newC2

    updateT1 = []
    updateT2 = []
    # save grad W of C2 as shared (3), update W - (1) + (3)
    for param, grad, uC1, uC2 in zip(mlp.paramsT1, gradC2,
                                     fdm.updateC1T1, fdm.updateC2T1):
        updateT1 += [(uC2, -step * globalLR1 * grad)]
        updateT1 += [(param, param - uC1 - step * globalLR1 * grad)]

    return updateT1 + updateT2 + tempUps
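
# The update expressions above suggest the following reading (eps denotes the
# module-level `step` scale): fd2 caches the perturbation
#     uC2 <- -eps * globalLR1 * dC2/dW
# and moves the weights to
#     W <- W - uC1 - eps * globalLR1 * dC2/dW,
# i.e. it backs out the C1 step that fd1 (defined below) stored in uC1 and
# replaces it with a small step along -dC2/dW.
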
def fd3(mlp, fdm, params, globalLR1, globalLR2, momentParam1, momentParam2):
    cost1 = mlp.classError1 + mlp.penalty
    gradT1reg = T.grad(cost1, mlp.paramsT2)
    updateT1 = []
    updateT2 = []
    onlyT2param = []

    # take opt from Adam?
    if params.opt2 in ['adam']:
        opt2 = adam()
    else:
        opt2 = None

    # update W - (1) + (3)
    for param, uC1, uC2 in zip(mlp.paramsT1, fdm.updateC1T1, fdm.updateC2T1):
        updateT1 += [(param, param + uC1 - uC2)]

    # compute grad T2 of C1, update T2 - [(4) - (2)] / lr1
    for param, grad, gT2 in zip(mlp.paramsT2, gradT1reg, fdm.gradC1T2):
        if params.T2onlySGN:
            grad_proxi = T.sgn((grad - gT2) / step * globalLR1)
        else:
            grad_proxi = (grad - gT2) / step * globalLR1
        tempUp, tempPair, _ = update_fun(param, T.reshape(grad_proxi, param.shape), None,
                                         'T2', {}, opt2, params,
                                         globalLR1, globalLR2, momentParam1, momentParam2)
        updateT2 += tempUp
        onlyT2param += tempPair

    debugs = [check for (_, check) in onlyT2param]
    return updateT1 + updateT2, debugs
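
# fd3 first restores the weights, W <- W + uC1 - uC2 (undoing fd2's
# perturbation and re-applying the cached C1 step), and then forms what looks
# like a finite-difference hypergradient for the T2 parameters:
#     grad_proxi = (dC1/dT2 at the perturbed W  -  dC1/dT2 cached by fd1
#                   in fdm.gradC1T2) / step * globalLR1.
# The division/multiplication grouping above follows the code as written.
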
def fd1(mlp, fdm, params, globalLR1, globalLR2, momentParam1, momentParam2):
    # gradient of T1 ----------------------------------- GRADS
    cost1 = mlp.classError1 + mlp.penalty
    gradT1 = T.grad(cost1, mlp.paramsT1)
    gradT1reg = T.grad(cost1, mlp.paramsT2)

    # take opt from Adam?
    if params.opt1 in ['adam']:
        opt1 = adam()
    else:
        opt1 = None
    if params.opt2 in ['adam']:
        opt2 = adam()
    else:
        opt2 = None

    updateT1 = [] if opt1 is None else opt1.initial_updates()
    updateT2 = [] if opt2 is None else opt2.initial_updates()
    onlyT1param = []
    history = {'grad': dict(), 'up': dict()}

    assert len(mlp.paramsT1) == len(gradT1)
    assert len(mlp.paramsT1) == len(fdm.updateC1T1)
    assert len(mlp.paramsT2) == len(gradT1reg)
    assert len(mlp.paramsT2) == len(fdm.gradC1T2)

    for param, grad, uC1 in zip(mlp.paramsT1, gradT1, fdm.updateC1T1):
        tempUp, tempPair, _ = update_fun(param, grad, mlp.penaltyMaxParams.get(param, None),
                                         'T1', history, opt1, params,
                                         globalLR1, globalLR2, momentParam1, momentParam2)
        updateT1 += tempUp
        onlyT1param += tempPair
        newparam = tempUp[-1][-1]
        just_up = newparam - param
        updateT1 += [(uC1, just_up)]

    # save grad T2 of C1 as shared (2) in gradT1reg
    for param, grad, gT2 in zip(mlp.paramsT2, gradT1reg, fdm.gradC1T2):
        updateT2 += [(gT2, grad)]

    debugs = [check for (_, check) in onlyT1param]
    return updateT1 + updateT2, debugs  # , T2_grads
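
# A minimal sketch (not part of the original code) of how the three update
# lists could be compiled and run in sequence, fd1 -> fd2 -> fd3.  The names
# `compile_fd_steps`, `inputs1` and `inputs2` are placeholders: the lists of
# symbolic minibatch variables feeding mlp.classError1 / mlp.classError2 live
# elsewhere in the repository.  Debug outputs are ignored here.
def compile_fd_steps(mlp, fdm, params, inputs1, inputs2, lr1, lr2, mom1, mom2):
    up1, _ = fd1(mlp, fdm, params, lr1, lr2, mom1, mom2)
    up2 = fd2(mlp, fdm, params, lr1, lr2, mom1, mom2)
    up3, _ = fd3(mlp, fdm, params, lr1, lr2, mom1, mom2)
    # step 1: T1 step on cost1, caching the applied update in fdm.updateC1T1
    # and the T2 gradient of C1 in fdm.gradC1T2
    step1 = theano.function(inputs1, mlp.classError1, updates=up1,
                            on_unused_input='ignore')
    # step 2: perturb W along -dC2/dW, caching the perturbation in fdm.updateC2T1
    step2 = theano.function(inputs2, mlp.classError2, updates=up2,
                            on_unused_input='ignore')
    # step 3: restore W and apply the finite-difference update to the T2 params
    step3 = theano.function(inputs1, mlp.classError1, updates=up3,
                            on_unused_input='ignore')
    return step1, step2, step3
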
def updates(mlp, params, globalLR1, globalLR2, momentParam1, momentParam2):
    '''
    Computing updates of T1 and T2 parameters.

    Inputs:
        mlp :: model
        params :: specification of the model and training
        globalLR1, globalLR2 :: global learning rates for T1 and T2
        momentParam1, momentParam2 :: momentum parameters for T1 and T2
        phase :: external parameter in case of ifelse (currently not in use)
    Outputs:
        updateT1 :: updates of T1 parameters and related shared variables
        updateT2 :: updates of T2 parameters and related shared variables
        updateC2grad :: updates caching dC2/dT1 in shared variables
        debugs :: variables tracked for debugging
    '''
    # gradients
    cost1 = mlp.trainCost + mlp.penalty
    cost2 = mlp.trainCost
    # dC1/dT1
    gradC1T1 = T.grad(cost1, mlp.paramsT1)
    gradC2T1temp = T.grad(cost2, mlp.paramsT1)

    # initializations
    opt1 = adam() if params.opt1 in ['adam'] else None
    opt2 = adam() if params.opt2 in ['adam'] else None
    updateT1 = [] if opt1 is None else opt1.initial_updates()
    updateT2 = [] if opt2 is None else opt2.initial_updates()
    updateC2grad = []
    gradC2T1 = []
    gradC2T2 = []
    tempUps = []
    trackT1grads = []
    trackT2grads = []
    history = {'grad': dict(), 'up': dict()}
    historyC2 = {'grad': dict(), 'up': dict()}
    learnParams = [globalLR1, globalLR2, momentParam1, momentParam2]

    ''' Updating T1 params '''
    for param, grad in zip(mlp.paramsT1, gradC1T1):
        grad = scale_norm(remove_nans(grad), threshold=3.)
        ups, track, _ = update_fun(param, grad, 'T1',
                                   history, opt1, learnParams, params)
        updateT1 += ups
        trackT1grads += [track]

    ''' Updating T2 params '''
    if params.useT2:

        ''' Save grads C2T1 for the T2 update: '''
        for param, grad in zip(mlp.paramsT1, gradC2T1temp):
            grad = scale_norm(remove_nans(grad), threshold=3.)
            grad = clip_grad(grad, threshold=10.)
            saveGrad = theano.shared(np.asarray(param.get_value() * 0., dtype='float32'),
                                     broadcastable=param.broadcastable,
                                     name='gradC2T1_%s' % param.name)
            updateC2grad += [(saveGrad, grad)]
            gradC2T1 += [saveGrad]

        ''' If gradient dC2/dT1 is also estimated with adam '''
        if params.avC2grad in ['adam', 'momentum']:
            # gradC2T1 = T.grad(cost2, mlp.paramsT1)
            if params.avC2grad == 'adam':
                opt3 = adam()
            else:
                opt3 = None
            tempUps = [] if opt3 is None else opt3.initial_updates()

            newC2 = []
            for param, grad in zip(mlp.paramsT1, gradC2T1):
                grad = scale_norm(remove_nans(grad), threshold=3.)
                grad = clip_grad(grad, threshold=10.)
                tempUp, _, newGrad = update_fun(param, T.reshape(grad, param.shape), 'T1',
                                                historyC2, opt3, learnParams, params)
                tempUps += tempUp[:-1]
                newC2 += newGrad
            gradC2T1 = newC2

        paramsT2, gradC2T2 = hypergrad(mlp.paramsT1, mlp.paramsT2, gradC2T1,
                                       mlp.trainCost, mlp.trainCost, mlp.penalty)

        for param, grad in zip(mlp.paramsT2, gradC2T2):
            paramName, _ = param.name.split('_')
            if params.decayT2 > 0. and paramName not in ['L2', 'L1']:
                grad += params.decayT2 * param
            grad = scale_norm(remove_nans(grad), threshold=3.)
            grad = clip_grad(grad, threshold=10.)
            tempUp, track, _ = update_fun(param, T.reshape(grad, param.shape), 'T2',
                                          {}, opt2, learnParams, params)
            updateT2 += tempUp
            trackT2grads += [track]

    # monitored variables for output
    if (not params.useT2) and params.trackGrads:
        debugs = trackT1grads
    elif params.trackGrads:
        debugs = trackT1grads + trackT2grads
    else:
        debugs = []

    print "Parameters ",
    print ", ".join([p.name for p in mlp.paramsT2]),
    print "are trained on T2"

    return updateT1, updateT2 + tempUps, updateC2grad, debugs
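
# A minimal usage sketch for `updates` (not from the original repository).
# `t1_inputs`/`t2_inputs` stand for the lists of symbolic variables that feed
# mlp.trainCost on the elementary and hyperparameter batches, and the wrapper
# name `compile_T1T2_steps` is hypothetical; the real training loop may wire
# the four returned update lists differently.  Debug outputs are returned
# uncompiled.
def compile_T1T2_steps(mlp, params, t1_inputs, t2_inputs, lr1, lr2, mom1, mom2):
    updateT1, updateT2, updateC2grad, debugs = updates(mlp, params,
                                                       lr1, lr2, mom1, mom2)
    # T1 step: descend cost1 and refresh the cached dC2/dT1 shared variables
    train_T1 = theano.function(t1_inputs, mlp.trainCost,
                               updates=updateT1 + updateC2grad,
                               on_unused_input='ignore')
    # T2 step: apply the hypergradient-based updates to the T2 parameters
    train_T2 = theano.function(t2_inputs, mlp.trainCost,
                               updates=updateT2,
                               on_unused_input='ignore')
    return train_T1, train_T2, debugs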