def synth_grad(tparams, prefix, activation, labels_one_hot):
    '''
    Synthetic gradient estimation using a linear model
    '''
    return T.dot(activation, tparams[_concat(prefix, 'W')]) \
        + T.dot(labels_one_hot, tparams[_concat(prefix, 'C')]) \
        + tparams[_concat(prefix, 'b')]
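
# A minimal numeric sketch of the linear estimator above (illustration only, not part
# of the original scripts): for a batch, the estimate is
# activation.dot(W) + labels_one_hot.dot(C) + b. All sizes below are made up.
import numpy as np

units, n_classes, batch = 150, 10, 4
W = np.zeros((units, units), dtype=np.float32)      # zero-init, as in param_init_sgmod
C = np.zeros((n_classes, units), dtype=np.float32)
b = np.zeros((units,), dtype=np.float32)

activation = np.random.rand(batch, units).astype(np.float32)
labels_one_hot = np.eye(n_classes, dtype=np.float32)[np.random.randint(0, n_classes, batch)]

grad_estimate = activation.dot(W) + labels_one_hot.dot(C) + b    # shape (batch, units)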
def param_init_fflayer(params, prefix, nin, nout):
    '''
    Initializes weights for a feedforward layer
    '''
    params[_concat(prefix, 'W')] = init_weights(nin, nout, type_init='ortho')
    params[_concat(prefix, 'b')] = np.zeros((nout,)).astype('float32')

    return params
def param_init_sgmod(params, prefix, units, zero_init=True):
    '''
    Initialization for synthetic gradient subnetwork
    '''
    global args

    # conditioned on the whole image, on the activation produced by the encoder input
    # and the backpropagated gradients for the latent samples
    inp_list = [14 * 28, 14 * 28, units, units, units]
    inp_size = 0
    for i in range(5):
        inp_size += inp_list[i]

    if not zero_init:
        if args.sg_type == 'lin':
            params[_concat(prefix, 'W')] = init_weights(inp_size, units, type_init='ortho')
            params[_concat(prefix, 'b')] = np.zeros((units,)).astype('float32')
    else:
        if args.sg_type == 'lin' or args.sg_type == 'lin_deep':
            params[_concat(prefix, 'W')] = np.zeros((inp_size, units)).astype('float32')
            params[_concat(prefix, 'b')] = np.zeros((units,)).astype('float32')

    if args.sg_type == 'deep' or args.sg_type == 'lin_deep':
        params = param_init_fflayer(params, _concat(prefix, 'I'), inp_size, 1024, batchnorm=True)
        params = param_init_fflayer(params, _concat(prefix, 'H'), 1024, 1024, batchnorm=True)
        if args.bn_type == 0:
            params = param_init_fflayer(params, _concat(prefix, 'o'), 1024, units, zero_init=True, batchnorm=True)
        else:
            params = param_init_fflayer(params, _concat(prefix, 'o'), 1024, units, zero_init=True, batchnorm=False)

    return params
def param_init_fflayer(params, prefix, nin, nout, zero_init=False, batchnorm=False, skip_running_vars=False):
    '''
    Initializes weights for a feedforward layer
    '''
    global args

    if zero_init:
        params[_concat(prefix, 'W')] = np.zeros((nin, nout)).astype('float32')
    else:
        params[_concat(prefix, 'W')] = init_weights(nin, nout, type_init='ortho')
    params[_concat(prefix, 'b')] = np.zeros((nout,)).astype('float32')

    if batchnorm:
        if args.bn_type == 0:
            dim = nin
        else:
            dim = nout
        params[_concat(prefix, 'g')] = np.ones((dim,), dtype=np.float32)
        params[_concat(prefix, 'be')] = np.zeros((dim,)).astype('float32')

        # deep synthetic subnetworks do not need to track running averages,
        # as they are not used at test time
        if not skip_running_vars:
            params[_concat(prefix, 'rm')] = np.zeros((1, dim)).astype('float32')
            params[_concat(prefix, 'rv')] = np.ones((1, dim), dtype=np.float32)

    return params
def _compute_unrolled_model(self, input, target, eta, network_optimizer):
    # loss on train data
    loss = self.model._loss(input, target)
    # w
    theta = _concat(self.model.parameters()).data
    try:
        moment = _concat(network_optimizer.state[v]['momentum_buffer']
                         for v in self.model.parameters()).mul_(self.network_momentum)
    except:
        moment = torch.zeros_like(theta)
    # w_grad + weight_decay * w
    dtheta = _concat(torch.autograd.grad(loss, self.model.parameters())).data \
        + self.network_weight_decay * theta
    # eta: learning rate
    unrolled_model = self._construct_model_from_theta(theta.sub(eta, moment + dtheta))
    return unrolled_model
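
# The "unrolled" weights above are one virtual step of SGD with momentum and weight
# decay on the training loss:
#     w' = w - eta * (momentum * v + dL/dw + weight_decay * w),
# which is what theta.sub(eta, moment + dtheta) computes with the older two-argument
# Tensor.sub API. A toy sketch of the same arithmetic (all tensors below are made up):
import torch

eta, momentum, weight_decay = 0.025, 0.9, 3e-4
w = torch.randn(10)        # flattened network weights
v = torch.randn(10)        # momentum buffer from the optimizer state
g = torch.randn(10)        # dL/dw on the training batch

w_unrolled = w - eta * (momentum * v + g + weight_decay * w)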
def param_init_sgmod(params, prefix, units, zero_init=True):
    '''
    Initializes a linear regression based model for estimating gradients,
    conditioned on the class labels
    '''
    if not zero_init:
        params[_concat(prefix, 'W')] = init_weights(units, units, type_init='ortho')
        params[_concat(prefix, 'C')] = init_weights(10, units, type_init='ortho')
    else:
        params[_concat(prefix, 'W')] = np.zeros((units, units)).astype('float32')
        params[_concat(prefix, 'C')] = np.zeros((10, units)).astype('float32')

    params[_concat(prefix, 'b')] = np.zeros((units,)).astype('float32')

    return params
def fflayer(tparams, state_below, prefix, nonlin='tanh'):
    '''
    A feedforward layer
    '''
    preact = T.dot(state_below, tparams[_concat(prefix, 'W')]) + tparams[_concat(prefix, 'b')]

    if nonlin == None:
        return preact
    elif nonlin == 'tanh':
        return T.tanh(preact)
    elif nonlin == 'sigmoid':
        return T.nnet.nnet.sigmoid(preact)
    elif nonlin == 'softplus':
        return T.nnet.nnet.softplus(preact)
    elif nonlin == 'relu':
        return T.nnet.nnet.relu(preact)
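
# A usage sketch for the layer above (illustration only): it assumes the repo helpers
# _concat and init_weights are in scope and that layer parameters are stored as Theano
# shared variables; the sizes and the 'demo' prefix are made up.
import numpy as np
import theano
import theano.tensor as T
from collections import OrderedDict

tparams = OrderedDict()
tparams[_concat('demo', 'W')] = theano.shared(init_weights(784, 300, type_init='ortho'))
tparams[_concat('demo', 'b')] = theano.shared(np.zeros((300,), dtype='float32'))

x = T.matrix('x', dtype='float32')
h = fflayer(tparams, x, 'demo', nonlin='relu')
forward = theano.function([x], h)
out = forward(np.random.rand(16, 784).astype('float32'))    # shape (16, 300)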
def _hessian_vector_product(self, vector, input, target, r=1e-2):
    R = r / _concat(vector).norm()

    # gradients w.r.t. the architecture parameters at w + R*v
    for p, v in zip(self.model.parameters(), vector):
        p.data.add_(R, v)
    loss = self.model._loss(input, target)
    grads_p = torch.autograd.grad(loss, self.model.arch_parameters())

    # gradients w.r.t. the architecture parameters at w - R*v
    for p, v in zip(self.model.parameters(), vector):
        p.data.sub_(2 * R, v)
    loss = self.model._loss(input, target)
    grads_n = torch.autograd.grad(loss, self.model.arch_parameters())

    # restore the original weights
    for p, v in zip(self.model.parameters(), vector):
        p.data.add_(R, v)

    return [(x - y).div_(2 * R) for x, y in zip(grads_p, grads_n)]
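
# The method above approximates the (mixed) Hessian-vector product with a central
# finite difference: grads at (w + R*v) minus grads at (w - R*v), divided by 2*R, with
# the gradients taken w.r.t. the architecture parameters. A toy check of the same
# identity on an ordinary quadratic (everything below is made up for illustration):
import torch

A = torch.tensor([[2.0, 0.5], [0.5, 1.0]], dtype=torch.float64)   # symmetric, so the Hessian is A
w = torch.randn(2, dtype=torch.float64)
v = torch.randn(2, dtype=torch.float64)
R = 1e-2 / v.norm()

def grad_at(point):
    p = point.detach().requires_grad_(True)
    loss = 0.5 * p @ A @ p
    return torch.autograd.grad(loss, p)[0]

hv_finite_diff = (grad_at(w + R * v) - grad_at(w - R * v)) / (2 * R)
hv_exact = A @ v
assert torch.allclose(hv_finite_diff, hv_exact)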
def synth_grad(tparams, prefix, inp, mode='Train'):
    '''
    Synthetic gradients
    '''
    global args

    if args.sg_type == 'lin':
        return T.dot(inp, tparams[_concat(prefix, 'W')]) + tparams[_concat(prefix, 'b')]

    elif args.sg_type == 'deep' or args.sg_type == 'lin_deep':
        outi = fflayer(tparams, inp, _concat(prefix, 'I'), nonlin='relu', batchnorm='train', dropout=None)
        outh = fflayer(tparams, outi, _concat(prefix, 'H'), nonlin='relu', batchnorm='train', dropout=None)

        # depending on the bn type being used, bn is used/not used in the layer
        if args.bn_type == 0:
            bn_last = 'train'
        else:
            bn_last = None

        if args.sg_type == 'deep':
            return fflayer(tparams, outh + outi, _concat(prefix, 'o'), batchnorm=bn_last, nonlin=None)
        elif args.sg_type == 'lin_deep':
            return (T.dot(inp, tparams[_concat(prefix, 'W')]) + tparams[_concat(prefix, 'b')]
                    + fflayer(tparams, outh + outi, _concat(prefix, 'o'), batchnorm=bn_last, nonlin=None))
def param_init_fflayer(params, prefix, nin, nout, zero_init=False, batchnorm=False):
    '''
    Initializes weights for a feedforward layer
    '''
    global args

    if zero_init:
        params[_concat(prefix, 'W')] = np.zeros((nin, nout)).astype('float32')
    else:
        params[_concat(prefix, 'W')] = init_weights(nin, nout, type_init='ortho')
    params[_concat(prefix, 'b')] = np.zeros((nout,)).astype('float32')

    if batchnorm:
        if args.bn_type == 0:
            dim = nin
        else:
            dim = nout
        params[_concat(prefix, 'g')] = np.ones((dim,), dtype=np.float32)
        params[_concat(prefix, 'be')] = np.zeros((dim,)).astype('float32')
        params[_concat(prefix, 'rm')] = np.zeros((1, dim)).astype('float32')
        params[_concat(prefix, 'rv')] = np.ones((1, dim), dtype=np.float32)

    return params
def fflayer(tparams, state_below, prefix, nonlin='tanh', batchnorm=None, dropout=None):
    '''
    A feedforward layer
    Note: None means dropout/batch normalization is not used. Use 'train' or 'test' options.
    '''
    global srng, args

    # apply batch normalization either to the input (bn_type 0) or to the pre-activation (otherwise)
    if args.bn_type == 0:
        inp = state_below
    else:
        inp = T.dot(state_below, tparams[_concat(prefix, 'W')]) + tparams[_concat(prefix, 'b')]

    if batchnorm == 'train':
        axes = (0,)
        mean = inp.mean(axes, keepdims=True)
        var = inp.var(axes, keepdims=True)
        invstd = T.inv(T.sqrt(var + 1e-4))
        inp = (inp - mean) * tparams[_concat(prefix, 'g')] * invstd + tparams[_concat(prefix, 'be')]

        # update the running mean and variance used at test time
        running_average_factor = 0.1
        m = T.cast(T.prod(inp.shape) / T.prod(mean.shape), 'float32')
        tparams[_concat(prefix, 'rm')] = (tparams[_concat(prefix, 'rm')] * (1 - running_average_factor)
                                          + mean * running_average_factor)
        tparams[_concat(prefix, 'rv')] = (tparams[_concat(prefix, 'rv')] * (1 - running_average_factor)
                                          + (m / (m - 1)) * var * running_average_factor)

    elif batchnorm == 'test':
        inp = ((inp - tparams[_concat(prefix, 'rm')].flatten()) * tparams[_concat(prefix, 'g')]
               / T.sqrt(tparams[_concat(prefix, 'rv')].flatten() + 1e-4) + tparams[_concat(prefix, 'be')])

    if args.bn_type == 0:
        preact = T.dot(inp, tparams[_concat(prefix, 'W')]) + tparams[_concat(prefix, 'b')]
    else:
        preact = inp

    # dropout is carried out with a fixed probability
    if dropout == 'train':
        dropmask = srng.binomial(n=1, p=1. - args.dropout_prob, size=preact.shape,
                                 dtype=theano.config.floatX)
        preact *= dropmask
    elif dropout == 'test':
        preact *= 1. - args.dropout_prob

    if nonlin == None:
        return preact
    elif nonlin == 'tanh':
        return T.tanh(preact)
    elif nonlin == 'sigmoid':
        return T.nnet.nnet.sigmoid(preact)
    elif nonlin == 'softplus':
        return T.nnet.nnet.softplus(preact)
    elif nonlin == 'relu':
        return T.nnet.nnet.relu(preact)
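
# A usage sketch for the batch-normalized layer above (illustration only). The scripts
# define the module-level globals `args` and `srng` after argument parsing; the values
# below are made-up stand-ins, as are the layer size and the 'demo' prefix. The repo
# helpers param_init_fflayer, init_weights and _concat are assumed to be in scope.
import numpy as np
import theano
import theano.tensor as T
from argparse import Namespace
from collections import OrderedDict
from theano.tensor.shared_randomstreams import RandomStreams

args = Namespace(bn_type=1, dropout_prob=0.25)    # stand-in for the parsed CLI arguments
srng = RandomStreams(seed=42)

params = param_init_fflayer(OrderedDict(), 'demo', 392, 200, batchnorm=True)
tparams = OrderedDict((k, theano.shared(v, name=k)) for k, v in params.items())

x = T.matrix('x', dtype='float32')
h_train = fflayer(tparams, x, 'demo', nonlin='relu', batchnorm='train', dropout='train')
f_train = theano.function([x], h_train)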
# split images
trp = [split_img(img) for img in trc]
tep = [split_img(img) for img in tec]

print "Initializing parameters"

# parameter initializations
ff_e = 'ff_enc'
ff_d = 'ff_dec'
sg = 'sg'
latent_dim = 50
params = OrderedDict()

# encoder
params = param_init_fflayer(params, _concat(ff_e, 'i'), 14 * 28, 200, batchnorm=True)
params = param_init_fflayer(params, _concat(ff_e, 'h'), 200, 100, batchnorm=True)

# latent
if args.bn_type == 0:
    params = param_init_fflayer(params, _concat(ff_e, 'bern'), 100, latent_dim, batchnorm=True)
else:
    params = param_init_fflayer(params, _concat(ff_e, 'bern'), 100, latent_dim, batchnorm=False)

# synthetic gradient module for the last encoder layer
params = param_init_sgmod(params, _concat(sg, 'r'), latent_dim)

# loss prediction neural network, conditioned on input and output (in this case the whole image);
# acts as the baseline
params = param_init_fflayer(params, 'loss_pred', 28 * 28, 1)

# decoder parameters
# split images
trp = [split_img(img) for img in trc]
tep = [split_img(img) for img in tec]

print "Initializing parameters"

# parameter initializations
ff_e = 'ff_enc'
ff_d = 'ff_dec'
latent_dim = 1000
params = OrderedDict()

# encoder
params = param_init_fflayer(params, _concat(ff_e, 'i'), 14 * 28, 200, batchnorm=True)
params = param_init_fflayer(params, _concat(ff_e, 'h'), 200, 100, batchnorm=True)

# latent distribution parameters
if args.latent_type == 'cont':
    if args.bn_type == 0:
        params = param_init_fflayer(params, _concat(ff_e, 'mu'), 100,
tei = np.asarray([img.flatten() for lbl, img in read(dataset='testing', path='MNIST/')],
                 dtype=np.float32)
tel = np.asarray([lbl for lbl, img in read(dataset='testing', path='MNIST/')],
                 dtype=np.int64)

print "Initializing parameters"

ff = 'ff'
sg = 'sg'

# no path to saved weights was provided on the command line
if len(sys.argv) < 3:
    params = OrderedDict()
    params = param_init_fflayer(params, _concat(ff, '1'), 28 * 28, 300)
    params = param_init_fflayer(params, _concat(ff, '2'), 300, 150)

    if train_rou == 'synthetic_gradients':
        params = param_init_sgmod(params, _concat(sg, '1'), 300)
        params = param_init_sgmod(params, _concat(sg, '2'), 150)

    params = param_init_fflayer(params, _concat(ff, 'o'), 150, 10)
else:
    params = np.load(sys.argv[2])

tparams = OrderedDict()
for key, val in params.iteritems():
    tparams[key] = theano.shared(val, name=key)
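
# In the synthetic-gradients routine, each module is queried with the corresponding
# layer's activation and the one-hot labels to get an estimate of dL/dh for that layer.
# A hedged sketch of how the pieces initialized above fit together (not the original
# training loop; it assumes theano.tensor is imported as T, as elsewhere in these scripts,
# and the symbolic variable names are made up):
x = T.matrix('x', dtype='float32')
y = T.ivector('y')
y_one_hot = T.extra_ops.to_one_hot(y, 10)

h1 = fflayer(tparams, x, _concat(ff, '1'))
h2 = fflayer(tparams, h1, _concat(ff, '2'))

if train_rou == 'synthetic_gradients':
    # estimated gradients of the loss w.r.t. the hidden activations
    sg1 = synth_grad(tparams, _concat(sg, '1'), h1, y_one_hot)
    sg2 = synth_grad(tparams, _concat(sg, '2'), h2, y_one_hot)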