def test_disconnected_paths(self):
    # Test that taking the gradient through a disconnected
    # path raises an exception.
    T = theano.tensor
    a = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)

    x = T.matrix('x')

    # This MUST raise a DisconnectedInputError error.
    # This also raises an additional warning from gradients.py.
    self.assertRaises(gradient.DisconnectedInputError, gradient.grad,
                      gradient.disconnected_grad(x).sum(), x)

    # This MUST NOT raise a DisconnectedInputError error.
    y = gradient.grad((x + gradient.disconnected_grad(x)).sum(), x)

    a = T.matrix('a')
    b = T.matrix('b')
    y = a + gradient.disconnected_grad(b)
    # This MUST raise a DisconnectedInputError error.
    # This also raises an additional warning from gradients.py.
    self.assertRaises(gradient.DisconnectedInputError,
                      gradient.grad, y.sum(), b)
    # This MUST NOT raise a DisconnectedInputError error.
    gradient.grad(y.sum(), a)
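# A minimal standalone sketch of the behaviour exercised above: disconnected_grad
# blocks gradient flow through its argument while leaving the forward value
# unchanged. (Illustrative example, assuming a working Theano install.)
import numpy as np
import theano
import theano.tensor as T
from theano import gradient

x = T.matrix('x')
# The gradient flows through the first term only; the second is a constant.
y = (x + gradient.disconnected_grad(x)).sum()
g = gradient.grad(y, x)
f = theano.function([x], g)
print(f(np.ones((2, 2), dtype=theano.config.floatX)))  # all ones, not twos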
def functions(self, sequence_length):
    key = (sequence_length)
    if key not in self.cache:
        logging.info("Need to construct graph for sequence_length=%d..."
                     % (sequence_length))

        # creating network input variable nodes
        correct_inputs = t.ftensor3("correct input")
        noise_inputs = t.ftensor3("noise input")
        learning_rate = t.fscalar("learning rate")

        # creating op nodes for firing the network
        correct_score, correct_prehidden = self.score(correct_inputs)
        noise_score, noise_prehidden = self.score(noise_inputs)

        # creating op nodes for the pairwise ranking cost function
        loss = t.clip(1 - correct_score + noise_score, 0, 1e999)
        total_loss = t.sum(loss)

        # the necessary cost function gradients
        parameters_gradient = grad(total_loss, list(self.parameters))
        correct_inputs_gradient = grad(total_loss, correct_inputs)
        noise_inputs_gradient = grad(total_loss, noise_inputs)

        # setting network inputs
        predict_inputs = [correct_inputs]
        train_inputs = [correct_inputs, noise_inputs, learning_rate]
        verbose_predict_inputs = predict_inputs

        # setting network outputs
        predict_outputs = [correct_score]
        train_outputs = [correct_inputs_gradient, noise_inputs_gradient,
                         loss, correct_score, noise_score]
        verbose_predict_outputs = [correct_score, correct_prehidden]

        nnodes = len(theano.gof.graph.ops(predict_inputs, predict_outputs))
        logging.info("About to compile prediction function over %d ops "
                     "[nodes]..." % nnodes)
        predict = theano.function(predict_inputs, predict_outputs,
                                  mode=COMPILE_MODE)
        logging.info("...done constructing graph for sequence_length=%d"
                     % (sequence_length))

        nnodes = len(theano.gof.graph.ops(verbose_predict_inputs,
                                          verbose_predict_outputs))
        logging.info("About to compile verbose prediction function over "
                     "%d ops [nodes]..." % nnodes)
        verbose_predict = theano.function(verbose_predict_inputs,
                                          verbose_predict_outputs,
                                          mode=COMPILE_MODE)
        logging.info("...done constructing graph for sequence_length=%d"
                     % (sequence_length))

        nnodes = len(theano.gof.graph.ops(train_inputs, train_outputs))
        logging.info("About to compile training function over %d ops "
                     "[nodes]..." % nnodes)
        train = theano.function(
            train_inputs, train_outputs, mode=COMPILE_MODE,
            updates=[(p, p - learning_rate * gp)
                     for p, gp in zip(list(self.parameters),
                                      parameters_gradient)])
        logging.info("...done constructing graph for sequence_length=%d"
                     % (sequence_length))

        self.cache[key] = (predict, train, verbose_predict)
    return self.cache[key]
def grad(self, inputs, output_grads):
    # OpFromGraph doesn't implement a connection_pattern, so for
    # now we regard all inputs and outputs as connected. This will
    # compute the right numerical value for the gradients but
    # could fail to raise the disconnected inputs error in some
    # cases.
    if hasattr(self, "grad_ops"):
        grad_ops = self.grad_ops
    else:
        gs = G.grad(cost=None,
                    known_grads=dict(zip(self.new_outputs, output_grads)),
                    wrt=self.new_inputs,
                    disconnected_inputs='ignore')

        grad_ops = []
        for g in gs:
            if g is None:
                grad_ops.append(lambda *args: None)
            else:
                # It is normal if some inputs are not needed in order
                # to compute the gradient, so we ignore them.
                grad_ops.append(OpFromGraph(self.new_inputs + output_grads,
                                            [g],
                                            on_unused_input='ignore'))
        self.grad_ops = grad_ops
    return [go(*(inputs + output_grads)) for go in grad_ops]
def compute_nll_upper_bound(self, seq_length, validation=False):
    #############
    # Inference
    (enc_mu, enc_sig, prior_mu, prior_sig, dec_bin), updates = \
        self.inference(self.orch, self.piano, seq_length)

    #############
    # Cost
    dec_bin_non_zero = T.switch(dec_bin > 0, dec_bin, 1e-30)  # avoid log(0)
    recon = T.sum(T.nnet.binary_crossentropy(dec_bin_non_zero, self.orch),
                  axis=1)
    # binary_crossentropy is the NLL for binary inputs. Sum along the input
    # dimension, mean along time (i.e. batch).
    # For real-valued units, use a Gaussian NLL instead.
    kl = KLGaussianGaussian(enc_mu, enc_sig, prior_mu, prior_sig)

    # Mean over batches
    recon_term = T.mean(recon)
    kl_term = T.mean(kl)

    # Note that instead of maximizing the variational lower bound on the
    # log-likelihood, we equivalently minimize the upper bound on the
    # negative log-likelihood.
    cost = recon_term + kl_term

    if not validation:
        #############
        # Gradient
        gparams = G.grad(cost, list(self.params_dico.values()))

        #############
        # Updates
        updates_train = self.optimizer.get_updates(
            list(self.params_dico.values()), gparams, updates)

        #############
        # Cost
        return cost, updates_train
    else:
        return cost, recon_term, kl_term, dec_bin, updates
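# KLGaussianGaussian is assumed to compute the KL divergence between two
# diagonal Gaussians q = N(mu_1, sig_1^2) and p = N(mu_2, sig_2^2), summed
# over the feature dimension:
#   KL(q || p) = 0.5 * sum(2*log(sig_2) - 2*log(sig_1)
#                          + (sig_1^2 + (mu_1 - mu_2)^2) / sig_2^2 - 1)
# A minimal sketch of such a helper (an assumption, not the project's code;
# it also assumes sig_* are standard deviations, not log-variances):
import theano.tensor as T

def kl_gaussian_gaussian(mu_1, sig_1, mu_2, sig_2):
    # Elementwise KL terms, summed over the last (feature) axis.
    kl = 0.5 * (2 * T.log(sig_2) - 2 * T.log(sig_1)
                + (sig_1 ** 2 + (mu_1 - mu_2) ** 2) / sig_2 ** 2 - 1)
    return T.sum(kl, axis=-1)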
def inner_function(*args):
    idx = args[0]
    expr = args[1]
    rvals = []
    for inp in args[2:]:
        rval = grad(expr[idx],
                    inp,
                    consider_constant=consider_constant,
                    disconnected_inputs=disconnected_inputs)
        rvals.append(rval)
    return rvals
def jacobian(expression, wrt, consider_constant=None,
             disconnected_inputs='raise'):
    '''
    Similar to the implementation in theano.gradient, but ignores non-empty
    updates (when used with Lasagne there may legitimately be updates, and
    that is OK).
    '''
    from theano.tensor import arange
    # Check inputs have the right format
    assert isinstance(expression, Variable), \
        "tensor.jacobian expects a Variable as `expression`"
    assert expression.ndim < 2, \
        ("tensor.jacobian expects a 1 dimensional variable as "
         "`expression`. If not use flatten to make it a vector")

    using_list = isinstance(wrt, list)
    using_tuple = isinstance(wrt, tuple)

    if isinstance(wrt, (list, tuple)):
        wrt = list(wrt)
    else:
        wrt = [wrt]

    if expression.ndim == 0:
        # expression is just a scalar, use grad
        return format_as(using_list, using_tuple,
                         grad(expression,
                              wrt,
                              consider_constant=consider_constant,
                              disconnected_inputs=disconnected_inputs))

    def inner_function(*args):
        idx = args[0]
        expr = args[1]
        rvals = []
        for inp in args[2:]:
            rval = grad(expr[idx],
                        inp,
                        consider_constant=consider_constant,
                        disconnected_inputs=disconnected_inputs)
            rvals.append(rval)
        return rvals

    # Computing the gradients does not affect the random seeds on any random
    # generator used in expression (because during computing gradients we are
    # just backtracking over old values. (rp Jan 2012 - if anyone has a
    # counter example please show me)
    jacobs, updates = theano.scan(inner_function,
                                  sequences=arange(expression.shape[0]),
                                  non_sequences=[expression] + wrt)
    # The only difference from the theano implementation: no assertion on
    # updates.
    # assert not updates, \
    #     ("Scan has returned a list of updates. This should not "
    #      "happen! Report this to theano-users (also include the "
    #      "script that generated the error)")
    return format_as(using_list, using_tuple, jacobs)
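# A short usage sketch of the helper above (not part of the original module;
# it assumes the module-level names the helper relies on -- theano, grad,
# format_as, Variable -- are imported from theano / theano.gradient):
import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
y = x ** 2                 # elementwise square, so the Jacobian is diagonal
J = jacobian(y, x)         # symbolic (n, n) Jacobian built with scan
f = theano.function([x], J)
print(f(np.array([1.0, 2.0, 3.0], dtype=theano.config.floatX)))
# -> a diagonal matrix with entries 2, 4, 6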
def __init__(self, intpic_parameters=None, case_costs=None, pics=None,
             case_labels=None, batch_size=None, pic_size=None,
             label_count=None, **kwargs):
    super(IntpicGradientDescent, self).__init__(**kwargs)
    center_val = 0.5
    self.input_pics = pics
    self.case_costs = case_costs
    self.batch_size = batch_size
    self.label_count = label_count
    self.intpic_parameters = intpic_parameters
    self.jacobians = self._compute_jacobians()
    self.gradpics = OrderedDict(
        [(param, _create_intpic_histogram_for(param, pic_size, label_count))
         for param in self.intpic_parameters])
    self.intpics = OrderedDict(
        [(param, _create_intpic_histogram_for(param, pic_size, label_count))
         for param in self.intpic_parameters])
    # attributes pics: (cases, picy, picx) to (cases, labels, picy, picx)
    # attributed_pics = tensor.batched_tensordot(
    #     tensor.extra_ops.to_one_hot(case_labels.flatten(), label_count),
    #     pics[:, 0, :, :], axes=0)
    zeroed_pics = pics - 0.5
    attributed_pics = tensor.batched_tensordot(
        tensor.extra_ops.to_one_hot(case_labels.flatten(), label_count),
        zeroed_pics[:, 0, :, :],
        axes=0)
    self.gradpic_updates = OrderedDict(
        [_create_gradpic_updates(self.gradpics[param],
                                 self.jacobians[param],
                                 attributed_pics)
         for param in self.intpic_parameters])
    self.add_updates(self.gradpic_updates)
    intensity_pics = (zeroed_pics * gradient.grad(case_costs.mean(), pics))
    attributed_i_pics = tensor.batched_tensordot(
        tensor.extra_ops.to_one_hot(case_labels.flatten(), label_count),
        intensity_pics[:, 0, :, :],
        axes=0)
    self.intpic_updates = OrderedDict(
        [_create_intensity_updates(self.intpics[param],
                                   self.jacobians[param],
                                   attributed_i_pics)
         for param in self.intpic_parameters])
    self.add_updates(self.intpic_updates)
def generate_adv_example(embedded, loss, perturb_scale):
    # embedded: [n_examples, input_length, feature_dim]
    grad = gradient.grad(loss, embedded)
    grad = gradient.disconnected_grad(grad)

    shifted = embedded + T.max(T.abs_(embedded)) + 1.0
    # grad dim for each example
    grad_dim = (shifted / shifted).sum(axis=(1, 2)).mean(axis=0)
    sqrt_grad_dim = T.sqrt(grad_dim)  # sqrt(input_length * emb_dim)
    perturb = perturb_scale * sqrt_grad_dim * _scale_unit_l2(grad)

    return embedded + perturb
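# `_scale_unit_l2` is not shown above; it presumably rescales each example's
# gradient to unit L2 norm before the perturbation is applied. A minimal
# sketch of such a helper (an assumption, not the original implementation):
import theano.tensor as T

def _scale_unit_l2(x, eps=1e-12):
    # x: [n_examples, input_length, feature_dim]; normalize per example.
    norms = T.sqrt(T.sum(T.sqr(x), axis=(1, 2), keepdims=True))
    return x / (norms + eps)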
def test_disconnected_nan(self):
    # test that connection_pattern can prevent getting NaN

    # Op1 has two outputs, f and g
    # x is connected to f but not to g
    class Op1(theano.gof.Op):
        __props__ = ()

        def make_node(self, x):
            return theano.Apply(self, inputs=[x],
                                outputs=[x.type(), theano.tensor.scalar()])

        def connection_pattern(self, node):
            return [[True, False]]

        def grad(self, inputs, output_grads):
            return [inputs[0].zeros_like()]

    # Op2 has two inputs, f and g
    # Its gradient with respect to g is not defined
    class Op2(theano.gof.Op):
        __props__ = ()

        def make_node(self, f, g):
            return theano.Apply(self, inputs=[f, g],
                                outputs=[theano.tensor.scalar()])

        def grad(self, inputs, output_grads):
            return [inputs[0].zeros_like(), NullType()()]

    x = theano.tensor.vector()
    f, g = Op1()(x)
    cost = Op2()(f, g)

    # cost is differentiable wrt x, but we can't tell that without using
    # Op1's connection pattern: looking at the theano graph alone, g is an
    # ancestor of cost and has x as an ancestor, so we must compute its
    # gradient.
    g = gradient.grad(cost, x)
def test_grad(self):
    T = theano.tensor
    a = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)

    x = T.matrix("x")

    expressions_gradients = [
        (x * gradient.disconnected_grad(x), x),
        (x * gradient.disconnected_grad(T.exp(x)), T.exp(x)),
        (x ** 2 * gradient.disconnected_grad(x), 2 * x ** 2),
    ]

    for expr, expr_grad in expressions_gradients:
        g = gradient.grad(expr.sum(), x)
        # gradient according to theano
        f = theano.function([x], g, on_unused_input="ignore")
        # desired gradient
        f2 = theano.function([x], expr_grad, on_unused_input="ignore")

        assert np.allclose(f(a), f2(a))
def test_grad_disconnected(self):
    # tests corner cases of gradient for shape and alloc
    x = theano.tensor.vector(name="x")
    total = x.sum()
    total.name = "total"
    num_elements = x.shape[0]
    num_elements.name = "num_elements"
    silly_vector = theano.tensor.alloc(total / num_elements, num_elements)
    silly_vector.name = "silly_vector"
    cost = silly_vector.sum()
    cost.name = "cost"
    # note that cost simplifies to be the same as "total"
    g = gradient.grad(cost, x, add_names=False)
    # we still need to pass in x because it determines the shape of
    # the output
    f = theano.function([x], g)
    rng = np.random.RandomState([2012, 9, 5])
    x = np.cast[x.dtype](rng.randn(3))
    g = f(x)
    assert np.allclose(g, np.ones(x.shape, dtype=x.dtype))
def __init__(self, inputs, outputs, grad_depth=1, **kwargs):
    if not isinstance(outputs, list):
        raise TypeError('outputs must be list', outputs)
    for i in inputs + outputs:
        if not isinstance(i, gof.Variable):
            raise TypeError(
                'inputs and outputs must be Variable instances', i)
    if 'updates' in kwargs:
        raise TypeError('updates are not allowed in kwargs')

    # TODO: the graph may have implicit inputs like
    #       SharedVariable instances.
    #       What impact do they have on the validity of this Op?
    self.fn = orig_function(inputs, outputs, **kwargs)
    self.inputs = inputs
    self.outputs = outputs
    self.input_types = [input.type for input in inputs]
    self.output_types = [output.type for output in outputs]

    if grad_depth > 0:
        output_grads = [t() for t in self.output_types]
        # OpFromGraph doesn't implement a connection_pattern, so for now we
        # regard all inputs and outputs as connected. This will compute the
        # right numerical value for the gradients but could fail to raise
        # the disconnected inputs error in some cases.
        gs = G.grad(cost=None,
                    known_grads=dict(zip(self.outputs, output_grads)),
                    wrt=self.inputs,
                    disconnected_inputs='ignore')
        self.grad_ops = []
        for g in gs:
            if g is None:
                self.grad_ops.append(lambda *args: None)
            else:
                # It is normal if some inputs are not needed in order
                # to compute the gradient, so we ignore them.
                self.grad_ops.append(OpFromGraph(inputs + output_grads,
                                                 [g],
                                                 grad_depth=grad_depth - 1,
                                                 on_unused_input='ignore'))
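# A brief usage sketch of OpFromGraph and differentiating through it
# (a minimal, illustrative example assuming a standard Theano install):
import numpy as np
import theano
import theano.tensor as T

x, y = T.vectors('x', 'y')
e = x * T.exp(y)
composite = theano.OpFromGraph([x, y], [e])  # wrap the small graph as one Op

a, b = T.vectors('a', 'b')
out = composite(a, b)
ga, gb = theano.grad(out.sum(), [a, b])      # grad flows through the wrapped graph
f = theano.function([a, b], [ga, gb])
ones = np.ones(3, dtype=theano.config.floatX)
print(f(ones, ones))  # each gradient element equals exp(1) ~= 2.718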
def __init__(self, synpic_parameters=None, case_costs=None, pics=None,
             case_labels=None, batch_size=None, pic_size=None,
             label_count=None, **kwargs):
    kwargs.setdefault("before_training", True)
    center_val = 0.5
    self.input_pics = pics
    self.case_costs = case_costs
    self.batch_size = batch_size
    self.label_count = label_count
    self.synpic_parameters = synpic_parameters
    self.jacobians = self._compute_jacobians()
    self.synpics = OrderedDict(
        [(param, _create_synpic_histogram_for(param, pic_size, label_count))
         for param in self.synpic_parameters])
    # attributes pics: (cases, picy, picx) to (cases, labels, picy, picx)
    # attributed_pics = tensor.batched_tensordot(
    #     tensor.extra_ops.to_one_hot(case_labels.flatten(), label_count),
    #     pics[:, 0, :, :], axes=0)
    zeroed_pics = pics - 0.5
    focused_pics = zeroed_pics * abs(gradient.grad(case_costs.mean(), pics))
    attributed_pics = tensor.batched_tensordot(
        tensor.extra_ops.to_one_hot(case_labels.flatten(), label_count),
        focused_pics[:, 0, :, :],
        axes=0)
    self.synpic_updates = OrderedDict(
        [_create_synpic_updates(self.synpics[param],
                                self.jacobians[param],
                                attributed_pics)
         for param in self.synpic_parameters])
    super(SynpicExtension, self).__init__(**kwargs)
def make_grad_func(X):
    Z = theano.tensor.dot(X, W) + b
    H = theano.tensor.nnet.sigmoid(Z)
    cost = H.sum()
    g = gradient.grad(cost, X)
    return theano.function([X, W, b], g, on_unused_input="ignore")
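# Usage sketch for the factory above, intended to be pasted below its
# definition in the same module. It assumes `W` and `b` are free symbolic
# variables in that module's scope (they are also inputs of the compiled
# function); the shapes below are purely illustrative:
import numpy as np
import theano
import theano.tensor as T

W = T.matrix('W')
b = T.vector('b')
X = T.matrix('X')

grad_func = make_grad_func(X)
x_val = np.random.randn(4, 3).astype(theano.config.floatX)
w_val = np.random.randn(3, 2).astype(theano.config.floatX)
b_val = np.zeros(2, dtype=theano.config.floatX)
print(grad_func(x_val, w_val, b_val).shape)  # (4, 3), same shape as X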
def main(save_to):
    batch_size = 500
    image_size = (28, 28)
    output_size = 10

    # The above are from LeCun's paper. The blocks example had:
    #   feature_maps = [20, 50]
    #   mlp_hiddens = [500]

    # Use ReLUs everywhere and softmax for the final prediction
    convnet = create_lenet_5()

    mnist_test = MNIST(("test",), sources=['features', 'targets'])

    basis_init = create_fair_basis(mnist_test, 10, 2)

    # b = shared_floatx(basis)
    # random_init = numpy.rand.random(100, 1000)
    # r = shared_floatx(random_init)
    # rn = r / r.norm(axis=1)
    # x = tensor.dot(rn, tensor.shape_padright(b))

    x = shared_floatx(basis_init)

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cg = ComputationGraph([probs])
    outs = VariableFilter(
        roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables)

    # Create an interior activation model
    model = Model([probs] + outs)

    # Load it with trained parameters
    params = load_parameters(open(save_to, 'rb'))
    model.set_parameter_values(params)

    learning_rate = shared_floatx(0.01, 'learning_rate')
    unit = shared_floatx(0, 'unit', dtype='int64')
    negate = False
    suffix = '_negsynth.jpg' if negate else '_synth.jpg'

    for output in outs:
        layer = get_brick(output)
        # For now, skip masks - for some reason they are always NaN
        iterations = 10000
        layername = layer.parents[0].name + '-' + layer.name
        # if layername != 'noisylinear_2-linear':
        #     continue
        dims = layer.get_dims(['output'])[0]
        if negate:
            measure = -output
        else:
            measure = output
        measure = measure[(slice(0, basis_init.shape[0]),) +
                          (slice(None),) * (measure.ndim - 1)]
        if isinstance(dims, numbers.Integral):
            dims = (dims,)
            costvec = -tensor.log(
                tensor.nnet.softmax(measure)[:, unit].flatten())
        else:
            flatout = measure.flatten(ndim=3)
            maxout = flatout.max(axis=2)
            costvec = -tensor.log(
                tensor.nnet.softmax(maxout)[:, unit].flatten())

        # Add a regularization to favor gray images.
        # cost = costvec.sum() + (x - 0.5).norm(2) * (
        #     10.0 / basis_init.shape[0])
        cost = costvec.sum()
        grad = gradient.grad(cost, x)
        stepx = x - learning_rate * grad
        normx = stepx / tensor.shape_padright(
            stepx.flatten(ndim=2).max(axis=1), n_ones=3)
        newx = tensor.clip(normx, 0, 1)
        newx = newx[(slice(0, basis_init.shape[0]),) +
                    (slice(None),) * (newx.ndim - 1)]
        fn = theano.function([], [cost], updates=[(x, newx)])

        filmstrip = Filmstrip(basis_init.shape[-2:],
                              (dims[0], basis_init.shape[0]),
                              background='red')

        for u in range(dims[0]):
            unit.set_value(u)
            x.set_value(basis_init)
            print('layer', layername, 'unit', u)
            for index in range(iterations):
                c = fn()[0]
                if index % 1000 == 0:
                    print('cost', c)
                    result = x.get_value()
                    for i2 in range(basis_init.shape[0]):
                        filmstrip.set_image((u, i2), result[i2, :, :, :])
                    filmstrip.save(layername + suffix)
            result = x.get_value()
            for index in range(basis_init.shape[0]):
                filmstrip.set_image((u, index), result[index, :, :, :])
            filmstrip.save(layername + suffix)
def evaluate_lenet5(datasets_=datasets,
                    learning_rate=[17. / (3 ** i) for i in range(6)],
                    n_epochs=42,
                    nkerns=[12, 12, 0, 0],
                    batch_size=1,
                    patience=200000,
                    filter_shape=[3, 3, 0],
                    poolsize=[2, 2, 0]):
    rng = numpy.random.RandomState(23455)

    train_set_x, train_set_y = datasets_[0]
    test_set_x, test_set_y = datasets_[1]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_test_batches //= batch_size

    index = T.lscalar()  # index to a [mini]batch

    x = T.matrix('x')   # the data is presented as images
    y = T.ivector('y')  # the labels are presented as a 1D vector of
                        # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    image_shape = (batch_size, dim_vals[0], dim_vals[1], dim_vals[2])
    layer0_input = x.reshape(image_shape)

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (264-5+1, 264-5+1) = (260, 260)
    # maxpooling reduces this further to (260/2, 260/2) = (130, 130)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 130, 130)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=image_shape,
        filter_shape=(nkerns[0], dim_vals[0], filter_shape[0],
                      filter_shape[0]),
        poolsize=(poolsize[0], poolsize[0])
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (130-3+1, 130-3+1) = (128, 128)
    # maxpooling reduces this further to (128/2, 128/2) = (64, 64)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 64, 64)
    # (integer division keeps the shapes integral under Python 3)
    layer1_input_shape = (dim_val + 1 - filter_shape[0]) // poolsize[0]
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], layer1_input_shape,
                     layer1_input_shape),
        filter_shape=(nkerns[1], nkerns[0], filter_shape[1],
                      filter_shape[1]),
        poolsize=(poolsize[1], poolsize[1])
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[3] * 31 * 31)
    layer2_input = layer1.output.flatten(2)
    layer2_input_shape = (layer1_input_shape + 1 -
                          filter_shape[1]) // poolsize[1]

    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * layer2_input_shape * layer2_input_shape,
        n_out=500,
        activation=T.tanh
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[3] * 31 * 31)
    # layer3_input = layer2.output
    #
    # layer3 = HiddenLayer(
    #     rng,
    #     input=layer3_input,
    #     n_in=1000,
    #     n_out=500,
    #     activation=T.tanh
    # )

    layer4 = LogisticRegression(input=layer2.output, n_in=500, n_out=13)

    cost = layer4.negative_log_likelihood(y)

    test_model = theano.function(
        [index],
        layer4.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    test_model_on_train = theano.function(
        [index],
        layer4.errors(y),
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    params = layer4.params + layer2.params + layer1.params + layer0.params

    grads = grad(cost, params)

    # One SGD update rule and one compiled training function per learning
    # rate in the schedule (written out by hand as updates_0..updates_5 and
    # train_model_0..train_model_5 before being folded into these loops).
    updates_list = [
        [(param_i, param_i - lr * grad_i)
         for param_i, grad_i in zip(params, grads)]
        for lr in learning_rate
    ]
    train_models = [
        theano.function(
            [index],
            cost,
            updates=updates_i,
            givens={
                x: train_set_x[index * batch_size: (index + 1) * batch_size],
                y: train_set_y[index * batch_size: (index + 1) * batch_size]
            }
        )
        for updates_i in updates_list
    ]

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    # epoch boundaries at which the learning rate is switched
    cblr = [n_epochs * i // len(learning_rate)
            for i in range(len(learning_rate) + 1)]

    for i in range(len(learning_rate)):
        while (epoch in range(cblr[i], cblr[i + 1])) and (not done_looping):
            epoch = epoch + 1
            for minibatch_index in range(n_train_batches):
                iter = (epoch - 1) * n_train_batches + minibatch_index
                if iter % 1000 == 0:
                    print('training @ iter = ', iter)
                cost_ij = train_models[i](minibatch_index)

                if patience <= iter:
                    done_looping = True
                    break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)),
          file=sys.stderr)

    # test it on the training set
    test_losses = [
        test_model_on_train(i)
        for i in range(n_train_batches)
    ]
    test_score = numpy.mean(test_losses)
    print(('     epoch %i, minibatch %i/%i, test error on training set '
           '%f %%') %
          (epoch, minibatch_index + 1, n_train_batches, test_score * 100.))
    error_on_train.append(test_score * 100.)

    # test it on the test set
    test_losses = [
        test_model(i)
        for i in range(n_test_batches)
    ]
    test_score = numpy.mean(test_losses)
    print(('     epoch %i, minibatch %i/%i, test error '
           '%f %%') %
          (epoch, minibatch_index + 1, n_train_batches, test_score * 100.))
    error_on_test.append(test_score * 100.)
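# With the default arguments above, the schedule drops the learning rate by a
# factor of 3 every n_epochs / 6 = 7 epochs. A quick check of the boundaries
# and rates (plain Python, no Theano needed):
learning_rate = [17. / (3 ** i) for i in range(6)]
n_epochs = 42
cblr = [n_epochs * i // len(learning_rate)
        for i in range(len(learning_rate) + 1)]
print(cblr)           # [0, 7, 14, 21, 28, 35, 42]
print(learning_rate)  # approximately [17.0, 5.67, 1.89, 0.63, 0.21, 0.07]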
def main(save_to):
    batch_size = 365
    feature_maps = [6, 16]
    mlp_hiddens = [120, 84]
    conv_sizes = [5, 5]
    pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # The above are from LeCun's paper. The blocks example had:
    #   feature_maps = [20, 50]
    #   mlp_hiddens = [500]

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 1, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='valid',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    random_init = (numpy.random.rand(100, 1, 28, 28) * 128).astype('float32')
    layers = [l for l in convnet.layers if isinstance(l, Convolutional)]
    mnist_test = MNIST(("test",), sources=['features', 'targets'])
    basis_init = create_fair_basis(mnist_test, 10, 50)
    basis_set = make_shifted_basis(basis_init, convnet, layers)

    for layer, basis in zip(layers, basis_set):
        # basis is 5d:
        # (probed_units, base_cases, 1-c, 28-y, 28-x)
        b = shared_floatx(basis)
        # coefficients is 2d:
        # (probed_units, base_cases)
        coefficients = shared_floatx(
            numpy.ones(basis.shape[0:2], dtype=theano.config.floatX))

        # prod is 5d: (probed_units, base_cases, 1-c, 28-y, 28-x)
        prod = tensor.shape_padright(coefficients, 3) * b

        # x is 4d: (probed_units, 1-c, 28-y, 28-x)
        ux = prod.sum(axis=1)
        x = tensor.clip(
            ux / tensor.shape_padright(ux.flatten(ndim=2).max(axis=1), 3),
            0, 1)

        # Normalize input and apply the convnet
        probs = convnet.apply(x)
        cg = ComputationGraph([probs])
        outs = VariableFilter(roles=[OUTPUT], bricks=[layer])(cg.variables)

        # Create an interior activation model
        model = Model([probs] + outs)

        # Load it with trained parameters
        params = load_parameters(open(save_to, 'rb'))
        model.set_parameter_values(params)

        learning_rate = shared_floatx(0.03, 'learning_rate')

        # We will try to do all units at once.
        # unit = shared_floatx(0, 'unit', dtype='int64')
        # But we are only doing one layer at once.
        output = outs[0]
        dims = layer.get_dims(['output'])[0]
        if isinstance(dims, numbers.Integral):
            # FC case: output is 2d: (probed_units, units)
            dims = (dims,)
            unitrange = tensor.arange(dims[0])
            costvec = -tensor.log(
                tensor.nnet.softmax(output)[unitrange, unitrange].flatten())
        else:
            # Conv case: output is 4d: (probed_units, units, y, x)
            unitrange = tensor.arange(dims[0])
            print('dims is', dims)
            costvec = -tensor.log(
                tensor.nnet.softmax(output[unitrange, unitrange,
                                           dims[1] // 2,
                                           dims[2] // 2]).flatten())
        cost = costvec.sum()
        # grad has dims (probed_units, basis_size)
        grad = gradient.grad(cost, coefficients)
        stepc = coefficients  # - learning_rate * grad
        newc = stepc / tensor.shape_padright(stepc.mean(axis=1))
        fn = theano.function([], [cost, x], updates=[(coefficients, newc)])

        filmstrip = Filmstrip(random_init.shape[-2:], (dims[0], 1),
                              background='red')
        layer = get_brick(output)

        learning_rate.set_value(0.1)
        for index in range(20000):
            c, result = fn()
            if index % 1000 == 0:
                learning_rate.set_value(numpy.cast[theano.config.floatX](
                    learning_rate.get_value() * 0.8))
                print('cost', c)
                for u in range(dims[0]):
                    filmstrip.set_image((u, 0), result[u, :, :, :])
                filmstrip.save(layer.name + '_stroke.jpg')
        for u in range(dims[0]):
            filmstrip.set_image((u, 0), result[u, :, :, :])
        filmstrip.save(layer.name + '_stroke.jpg')
def assign_step_methods(model, step=None, methods=STEP_METHODS,
                        step_kwargs=None):
    """Assign model variables to appropriate step methods.

    Passing a specified model will auto-assign its constituent stochastic
    variables to step methods based on the characteristics of the variables.
    This function is intended to be called automatically from `sample()`, but
    may be called manually. Each step method passed should have a
    `competence()` method that returns an ordinal competence value
    corresponding to the variable passed to it. This value quantifies the
    appropriateness of the step method for sampling the variable.

    Parameters
    ----------
    model : Model object
        A fully-specified model object
    step : step function or vector of step functions
        One or more step functions that have been assigned to some subset of
        the model's parameters. Defaults to None (no assigned variables).
    methods : vector of step method classes
        The set of step methods from which the function may choose. Defaults
        to the main step methods provided by PyMC3.
    step_kwargs : dict
        Parameters for the samplers. Keys are the lower case names of
        the step method, values a dict of arguments.

    Returns
    -------
    methods : list
        List of step methods associated with the model's variables.
    """
    steps = []
    assigned_vars = set()
    if step is not None:
        try:
            steps += list(step)
        except TypeError:
            steps.append(step)
        for step in steps:
            try:
                assigned_vars = assigned_vars.union(set(step.vars))
            except AttributeError:
                for method in step.methods:
                    assigned_vars = assigned_vars.union(set(method.vars))

    # Use competence classmethods to select step methods for remaining
    # variables
    selected_steps = defaultdict(list)
    for var in model.free_RVs:
        if var not in assigned_vars:
            # determine if a gradient can be computed
            has_gradient = var.dtype not in discrete_types
            if has_gradient:
                try:
                    tg.grad(model.logpt, var)
                except (AttributeError,
                        NotImplementedError,
                        tg.NullTypeGradError):
                    has_gradient = False
            # select the best method
            selected = max(methods,
                           key=lambda method, var=var,
                           has_gradient=has_gradient: method._competence(
                               var, has_gradient))
            pm._log.info('Assigned {0} to {1}'.format(selected.__name__, var))
            selected_steps[selected].append(var)

    return instantiate_steppers(model, steps, selected_steps, step_kwargs)
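# The gradient probe above is the key trick: attempting theano.gradient.grad
# on the model's log-probability tells us whether a variable can use
# gradient-based samplers. A standalone sketch of the same idea (the variable
# and log-probability here are illustrative, not a PyMC3 model):
import theano.tensor as T
import theano.gradient as tg

x = T.dscalar('x')
logp = -0.5 * x ** 2  # stand-in for model.logpt

def supports_gradient(logp, var):
    try:
        tg.grad(logp, var)
        return True
    except (AttributeError, NotImplementedError, tg.NullTypeGradError):
        return False

print(supports_gradient(logp, x))  # True: gradient-based steppers are viable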