class Classifier(Initializable):
    """Brick that chains a convolutional sequence, a flattener and a
    fully-connected brick.

    Parameters
    ----------
    conv_seq : brick
        Applied first to the input. Must provide ``apply`` and ``get_dim``.
    fully_connected : brick
        Applied to the flattened output of ``conv_seq``. Must provide
        ``apply`` and ``get_dim``.
    **kwargs
        Forwarded unchanged to ``Initializable.__init__``.
    """

    def __init__(self, conv_seq, fully_connected, **kwargs):
        super(Classifier, self).__init__(**kwargs)
        self.conv_seq = conv_seq
        self.flatten = Flattener()
        self.fully_connected = fully_connected
        # Children are listed in the order in which `apply` uses them.
        self.children = [self.conv_seq, self.flatten, self.fully_connected]

    def get_dim(self, name):
        """Return the dimension registered under ``name``.

        ``"input"`` is delegated to ``conv_seq`` and ``"output"`` to
        ``fully_connected``; any other name is resolved by the parent class.
        """
        # NOTE(review): the name is forwarded unchanged; confirm that
        # conv_seq.get_dim actually recognises "input".
        if name == "input":
            return self.conv_seq.get_dim(name)
        elif name == 'output':
            return self.fully_connected.get_dim(name)
        else:
            # The original dropped this result (missing `return`), so the
            # caller silently received None for every other name.
            return super(Classifier, self).get_dim(name)

    @application(inputs=['input_'], outputs=['output_'])
    def apply(self, input_):
        """Run ``input_`` through conv_seq, the flattener and fully_connected.

        The resulting variable is renamed to ``"output_"`` before it is
        returned.
        """
        output_conv = self.conv_seq.apply(input_)
        input_fully = self.flatten.apply(output_conv)
        output = self.fully_connected.apply(input_fully)
        output.name = "output_"
        return output
def run_experiment():
    """Compare batched and per-example gradient square norms on one conv layer.

    A single ``ConvolutionalLayer`` followed by a ``Flattener`` is built and
    the sum of squared outputs is used as a stand-in cost. The compiled
    function is evaluated once on the whole mini-batch and once per example;
    the costs, the gradient square norms, their maximum absolute difference
    and their ratio are printed.
    """
    np.random.seed(42)

    features = tensor.tensor4('features')
    nbr_channels = 3
    image_shape = (5, 5)

    first_conv = ConvolutionalLayer(
        filter_size=(2, 2),
        num_filters=10,
        activation=Rectifier().apply,
        border_mode='valid',
        pooling_size=(1, 1),
        weights_init=Uniform(width=0.1),
        biases_init=Constant(0.0),
        name='conv0')
    conv_sequence = ConvolutionalSequence(
        [first_conv],
        num_channels=nbr_channels,
        image_size=image_shape)
    conv_sequence.initialize()

    flattener = Flattener()
    conv_output = conv_sequence.apply(features)
    y_hat = flattener.apply(conv_output)

    # Placeholder cost: nothing is trained here, only gradients are compared.
    cost = tensor.sqr(y_hat).sum()

    # Gradients of the scalar cost w.r.t. the BIAS variables; these are
    # aggregated over the whole mini-batch.
    bias_variables = VariableFilter(roles=[BIAS])(
        ComputationGraph([y_hat]).variables)
    bias_gradients = [tensor.grad(cost, v) for v in bias_variables]
    batch_square_norm = sum(tensor.sqr(g).sum() for g in bias_gradients)

    # Presumably one value per example of the mini-batch -- confirm against
    # get_sum_square_norm_gradients_conv_transformations.
    roles_by_layer = get_conv_layers_transformation_roles(
        ComputationGraph(conv_output))
    per_example_square_norm = get_sum_square_norm_gradients_conv_transformations(
        roles_by_layer, cost)

    # Open question from the author: why do the results depend on N, given
    # that the cost is not divided by N?
    N = 2
    Xtrain = np.random.randn(
        N, nbr_channels, image_shape[0], image_shape[1]).astype(np.float32)
    # Debug setting: all-ones input.
    Xtrain[...] = 1.0

    # Debug setting: all-ones filters on the first FILTER variable found.
    filter_variable = VariableFilter(roles=[FILTER])(
        ComputationGraph([y_hat]).variables)[0]
    filter_value = filter_variable.get_value()
    filter_value[...] = 1.0
    filter_variable.set_value(filter_value)

    f = theano.function(
        [features],
        [cost, per_example_square_norm, batch_square_norm])

    # One call on the whole mini-batch ...
    c, v0, gs2 = f(Xtrain)

    # ... then one call per example, each fed as a mini-batch of size one.
    per_example = [f(Xtrain[n:n + 1]) for n in range(N)]
    L_c = [outputs[0] for outputs in per_example]
    L_gs2 = [outputs[2] for outputs in per_example]
    iterative = np.array(L_gs2)

    print("Cost for whole mini-batch in single shot : %f." % c)
    print("Cost for whole mini-batch accumulated : %f." % sum(L_c))
    print("")
    print("Square-norm of all gradients for each data point in single shot :")
    print(v0.reshape((1, -1)))
    print("Square-norm of all gradients for each data point iteratively :")
    print(iterative.reshape((1, -1)))
    print("")
    print("Difference max abs : %f." % np.max(np.abs(v0 - iterative)))
    print("")
    print("Ratios : ")
    print(iterative.reshape((1, -1)) / v0.reshape((1, -1)))
def run_experiment():
    """Single-conv-layer check of per-example gradient square norms.

    Builds one ``ConvolutionalLayer`` plus a ``Flattener`` with a
    sum-of-squares cost, compiles a Theano function returning the cost and two
    gradient-norm expressions, runs it on the full mini-batch and on every
    example separately, and prints the values side by side.
    """
    np.random.seed(42)

    X = tensor.tensor4('features')
    nbr_channels = 3
    image_shape = (5, 5)

    layers = [
        ConvolutionalLayer(filter_size=(2, 2),
                           num_filters=10,
                           activation=Rectifier().apply,
                           border_mode='valid',
                           pooling_size=(1, 1),
                           weights_init=Uniform(width=0.1),
                           biases_init=Constant(0.0),
                           name='conv0'),
    ]
    conv_sequence = ConvolutionalSequence(layers,
                                          num_channels=nbr_channels,
                                          image_size=image_shape)
    conv_sequence.initialize()

    flattener = Flattener()
    conv_output = conv_sequence.apply(X)
    y_hat = flattener.apply(conv_output)

    # Arbitrary cost; no training takes place in this experiment.
    cost = tensor.sqr(y_hat).sum()

    # Square norm of the gradients w.r.t. every BIAS variable, taken on the
    # scalar cost and hence accumulated over the mini-batch.
    graph_variables = ComputationGraph([y_hat]).variables
    grads_wrt_biases = [
        tensor.grad(cost, bias)
        for bias in VariableFilter(roles=[BIAS])(graph_variables)
    ]
    summed_square_norm = sum(tensor.sqr(g).sum() for g in grads_wrt_biases)

    conv_roles = get_conv_layers_transformation_roles(
        ComputationGraph(conv_output))
    individual_square_norms = get_sum_square_norm_gradients_conv_transformations(
        conv_roles, cost)

    # Unresolved author note: the outcome seems to depend on N although the
    # cost is never divided by N.
    N = 2
    Xtrain = np.random.randn(N, nbr_channels, image_shape[0],
                             image_shape[1]).astype(np.float32)
    Xtrain[:] = 1.0  # debug: constant input

    # debug: overwrite the first FILTER variable with ones
    conv_filters = VariableFilter(roles=[FILTER])(
        ComputationGraph([y_hat]).variables)[0]
    ones_like_filters = conv_filters.get_value()
    ones_like_filters[:] = 1.0
    conv_filters.set_value(ones_like_filters)

    f = theano.function(
        [X], [cost, individual_square_norms, summed_square_norm])

    c, v0, gs2 = f(Xtrain)

    L_c = []
    L_gs2 = []
    for n in range(N):
        # Feed example n on its own as a (1, channels, rows, cols) batch.
        single = Xtrain[n][np.newaxis]
        nc, nv0, ngs2 = f(single)
        L_c.append(nc)
        L_gs2.append(ngs2)

    single_shot_row = v0.reshape((1, -1))
    iterative_row = np.array(L_gs2).reshape((1, -1))

    print("Cost for whole mini-batch in single shot : %f." % c)
    print("Cost for whole mini-batch accumulated : %f." % sum(L_c))
    print("")
    print("Square-norm of all gradients for each data point in single shot :")
    print(single_shot_row)
    print("Square-norm of all gradients for each data point iteratively :")
    print(iterative_row)
    print("")
    print("Difference max abs : %f." % np.max(np.abs(v0 - np.array(L_gs2))))
    print("")
    print("Ratios : ")
    print(iterative_row / single_shot_row)
def run_experiment():
    """Compare gradient square norms for a two-conv-layer network with an MLP.

    The network is two ``ConvolutionalLayer`` bricks, a ``Flattener`` and a
    three-layer ``MLP`` ending in ``Softmax``, with a
    ``CategoricalCrossEntropy`` cost on one-hot targets. The compiled function
    is run on the full mini-batch and then on each example alone; the costs,
    gradient square norms, maximum absolute differences and ratios are
    printed.
    """
    np.random.seed(42)

    features = tensor.tensor4('features')
    targets = tensor.matrix('targets')
    nbr_channels = 3
    image_shape = (30, 30)

    conv0 = ConvolutionalLayer(
        filter_size=(4, 4),
        num_filters=10,
        activation=Rectifier().apply,
        border_mode='full',
        pooling_size=(1, 1),
        weights_init=Uniform(width=0.1),
        biases_init=Constant(0.0),
        name='conv0')
    conv1 = ConvolutionalLayer(
        filter_size=(3, 3),
        num_filters=14,
        activation=Rectifier().apply,
        border_mode='full',
        pooling_size=(1, 1),
        weights_init=Uniform(width=0.1),
        biases_init=Constant(0.0),
        name='conv1')
    conv_sequence = ConvolutionalSequence(
        [conv0, conv1],
        num_channels=nbr_channels,
        image_size=image_shape)
    conv_sequence.initialize()

    # The first MLP dimension is the product of the conv output dimensions.
    conv_output_dim = np.prod(conv_sequence.get_dim('output'))

    flattener = Flattener()
    mlp = MLP(
        activations=[Rectifier(), Rectifier(), Softmax()],
        dims=[conv_output_dim, 50, 50, 10],
        weights_init=IsotropicGaussian(std=0.1),
        biases_init=IsotropicGaussian(std=0.01))
    mlp.initialize()

    conv_output = conv_sequence.apply(features)
    y_hat = mlp.apply(flattener.apply(conv_output))
    cost = CategoricalCrossEntropy().apply(targets, y_hat)

    cg = ComputationGraph([y_hat])

    # Disabled debugging aid: for each role, print the name of the first
    # annotation of every matching variable of the MLP's linear bricks.
    # for role in (INPUT, OUTPUT, WEIGHT, BIAS):
    #     for v in VariableFilter(bricks=mlp.linear_transformations,
    #                             roles=[role])(cg.variables):
    #         print(v.tag.annotations[0].name)

    print("----------------------------")

    # Contribution of the MLP's linear transformations (per the author: a
    # vector with one entry per example of the mini-batch) ...
    linear_roles = get_linear_transformation_roles(mlp, cg)
    per_example_square_norm = get_sum_square_norm_gradients_linear_transformations(
        linear_roles, cost)

    # ... plus the contribution of the convolutional layers.
    conv_roles = get_conv_layers_transformation_roles(
        ComputationGraph(conv_output))
    per_example_square_norm += get_sum_square_norm_gradients_conv_transformations(
        conv_roles, cost)

    print("There are %d entries in cg.parameters." % len(cg.parameters))

    # Two ways of listing the parameters; both gradients are taken on the
    # scalar cost of the whole mini-batch.
    grads_from_parameters = [tensor.grad(cost, p) for p in cg.parameters]
    grads_from_roles = [
        tensor.grad(cost, v)
        for v in VariableFilter(roles=[WEIGHT, BIAS])(cg.variables)]
    square_norm_from_parameters = sum(
        tensor.sqr(g).sum() for g in grads_from_parameters)
    square_norm_from_roles = sum(
        tensor.sqr(g).sum() for g in grads_from_roles)

    N = 8
    Xtrain = np.random.randn(
        N, nbr_channels, image_shape[0], image_shape[1]).astype(np.float32)

    # One-hot targets with a random class for each example. The labels are
    # drawn one at a time so the random stream is consumed as before.
    ytrain = np.zeros((N, 10), dtype=np.float32)
    for n in range(N):
        ytrain[n, np.random.randint(low=0, high=10)] = 1.0
    # Alternative for NaN debugging: random rows normalised to sum to one.

    f = theano.function(
        [features, targets],
        [cost,
         per_example_square_norm,
         square_norm_from_parameters,
         square_norm_from_roles])

    c, v0, gs1, gs2 = f(Xtrain, ytrain)

    # Evaluate each example separately as a mini-batch of size one.
    per_example = [f(Xtrain[n:n + 1], ytrain[n:n + 1]) for n in range(N)]
    L_c = [outputs[0] for outputs in per_example]
    iterative_1 = np.array([outputs[2] for outputs in per_example])
    iterative_2 = np.array([outputs[3] for outputs in per_example])

    print("Cost for whole mini-batch in single shot : %f." % c)
    print("Cost for whole mini-batch accumulated : %f." % sum(L_c))
    print("")
    print("Square-norm of all gradients for each data point in single shot :")
    print(v0.reshape((1, -1)))
    print("Square-norm of all gradients for each data point iteratively :")
    print(iterative_1.reshape((1, -1)))
    print("Square-norm of all gradients for each data point iteratively :")
    print(iterative_2.reshape((1, -1)))
    print("")
    print("Difference max abs : %f." % np.max(np.abs(v0 - iterative_1)))
    print("Difference max abs : %f." % np.max(np.abs(v0 - iterative_2)))
    print("")
    print("Ratios : ")
    print(iterative_1.reshape((1, -1)) / v0.reshape((1, -1)))
def run_experiment():
    """Conv + MLP check of per-example gradient square norms.

    Assembles two ``ConvolutionalLayer`` bricks, a ``Flattener`` and an
    ``MLP`` with a ``Softmax`` output under a ``CategoricalCrossEntropy``
    cost. The cost and three gradient-norm expressions are evaluated on the
    whole mini-batch and on each example individually, and the comparison is
    printed.
    """
    np.random.seed(42)

    X = tensor.tensor4('features')
    y = tensor.matrix('targets')
    nbr_channels = 3
    image_shape = (30, 30)

    layers = [
        ConvolutionalLayer(filter_size=(4, 4),
                           num_filters=10,
                           activation=Rectifier().apply,
                           border_mode='full',
                           pooling_size=(1, 1),
                           weights_init=Uniform(width=0.1),
                           biases_init=Constant(0.0),
                           name='conv0'),
        ConvolutionalLayer(filter_size=(3, 3),
                           num_filters=14,
                           activation=Rectifier().apply,
                           border_mode='full',
                           pooling_size=(1, 1),
                           weights_init=Uniform(width=0.1),
                           biases_init=Constant(0.0),
                           name='conv1'),
    ]
    conv_sequence = ConvolutionalSequence(layers,
                                          num_channels=nbr_channels,
                                          image_size=image_shape)
    conv_sequence.initialize()

    # Flattened size of the convolutional output feeds the MLP.
    conv_output_dim = np.prod(conv_sequence.get_dim('output'))

    flattener = Flattener()
    mlp = MLP(activations=[Rectifier(), Rectifier(), Softmax()],
              dims=[conv_output_dim, 50, 50, 10],
              weights_init=IsotropicGaussian(std=0.1),
              biases_init=IsotropicGaussian(std=0.01))
    mlp.initialize()

    conv_output = conv_sequence.apply(X)
    flat_features = flattener.apply(conv_output)
    y_hat = mlp.apply(flat_features)
    cost = CategoricalCrossEntropy().apply(y, y_hat)

    cg = ComputationGraph([y_hat])

    # Disabled inspection code, kept for reference:
    #   for each of the roles INPUT, OUTPUT, WEIGHT and BIAS, filter
    #   cg.variables with
    #   VariableFilter(bricks=mlp.linear_transformations, roles=[role])
    #   and print v.tag.annotations[0].name for every match.

    print("----------------------------")

    # Author's note: the helper returns a vector with one entry per example
    # of the mini-batch.
    mlp_roles = get_linear_transformation_roles(mlp, cg)
    individual_square_norms = get_sum_square_norm_gradients_linear_transformations(
        mlp_roles, cost)

    conv_roles = get_conv_layers_transformation_roles(
        ComputationGraph(conv_output))
    individual_square_norms += get_sum_square_norm_gradients_conv_transformations(
        conv_roles, cost)

    print("There are %d entries in cg.parameters." % len(cg.parameters))

    # Gradients of the mini-batch cost, with the parameters collected either
    # from cg.parameters or by WEIGHT/BIAS role.
    parameter_grads = [tensor.grad(cost, p) for p in cg.parameters]
    role_variables = VariableFilter(roles=[WEIGHT, BIAS])(cg.variables)
    role_grads = [tensor.grad(cost, v) for v in role_variables]

    summed_norm_parameters = sum(tensor.sqr(g).sum() for g in parameter_grads)
    summed_norm_roles = sum(tensor.sqr(g).sum() for g in role_grads)

    N = 8
    Xtrain = np.random.randn(N, nbr_channels, image_shape[0],
                             image_shape[1]).astype(np.float32)

    # One-hot targets: one random label in [0, 10) per example.
    ytrain = np.zeros((N, 10), dtype=np.float32)
    for n in range(N):
        label = np.random.randint(low=0, high=10)
        ytrain[n, label] = 1.0
    # (For NaN debugging the author alternatively used random rows
    # normalised to sum to one.)

    outputs = [
        cost,
        individual_square_norms,
        summed_norm_parameters,
        summed_norm_roles,
    ]
    f = theano.function([X, y], outputs)

    c, v0, gs1, gs2 = f(Xtrain, ytrain)

    L_c, L_gs1, L_gs2 = [], [], []
    for n in range(N):
        # Add a leading axis so each example is fed as a batch of size one.
        x_single = Xtrain[n][np.newaxis]
        y_single = ytrain[n][np.newaxis]
        nc, nv0, ngs1, ngs2 = f(x_single, y_single)
        L_c.append(nc)
        L_gs1.append(ngs1)
        L_gs2.append(ngs2)

    single_shot_row = v0.reshape((1, -1))
    iterative_row_1 = np.array(L_gs1).reshape((1, -1))
    iterative_row_2 = np.array(L_gs2).reshape((1, -1))

    print("Cost for whole mini-batch in single shot : %f." % c)
    print("Cost for whole mini-batch accumulated : %f." % sum(L_c))
    print("")
    print("Square-norm of all gradients for each data point in single shot :")
    print(single_shot_row)
    print("Square-norm of all gradients for each data point iteratively :")
    print(iterative_row_1)
    print("Square-norm of all gradients for each data point iteratively :")
    print(iterative_row_2)
    print("")
    print("Difference max abs : %f." % np.max(np.abs(v0 - np.array(L_gs1))))
    print("Difference max abs : %f." % np.max(np.abs(v0 - np.array(L_gs2))))
    print("")
    print("Ratios : ")
    print(iterative_row_1 / single_shot_row)