# Assumed imports for the examples below (Theano plus the Blocks ~0.1-era
# API, in which ConvolutionalLayer still exists).  The
# get_*_transformation_roles and get_sum_square_norm_gradients_* helpers
# are project-specific and assumed to be defined elsewhere in this repo.
import numpy as np
import theano
from theano import tensor

from blocks.bricks import MLP, Initializable, Rectifier, Softmax
from blocks.bricks.base import application
from blocks.bricks.conv import (ConvolutionalLayer, ConvolutionalSequence,
                                Flattener)
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph
from blocks.initialization import Constant, IsotropicGaussian, Uniform
from blocks.roles import BIAS, FILTER, INPUT, OUTPUT, WEIGHT


class Classifier(Initializable):

    def __init__(self, conv_seq, fully_connected, **kwargs):
        super(Classifier, self).__init__(**kwargs)

        self.conv_seq = conv_seq
        self.flatten = Flattener()
        self.fully_connected = fully_connected
        
        self.children = [self.conv_seq, self.flatten, self.fully_connected]

    def get_dim(self, name):
        if name == 'input':
            return self.conv_seq.get_dim(name)
        elif name == 'output':
            return self.fully_connected.get_dim(name)
        else:
            return super(Classifier, self).get_dim(name)

    @application(inputs=['input_'], outputs=['output_'])
    def apply(self, input_):
        output_conv = self.conv_seq.apply(input_)
        input_fully = self.flatten.apply(output_conv)
        output = self.fully_connected.apply(input_fully)
        output.name = "output_"
        return output
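

def demo_classifier():
    # Hedged usage sketch for the Classifier brick above; the experiments
    # below never exercise it, so the input shape and layer sizes here are
    # illustrative assumptions, not taken from the original code.
    conv_seq = ConvolutionalSequence(
        [ConvolutionalLayer(filter_size=(3, 3),
                            num_filters=8,
                            activation=Rectifier().apply,
                            pooling_size=(2, 2),
                            name='conv0')],
        num_channels=1, image_size=(28, 28))
    # 28x28 input, 3x3 'valid' convolution -> 26x26, 2x2 pooling -> 13x13;
    # with 8 filters the flattened dimension is 8 * 13 * 13.
    fully_connected = MLP(activations=[Softmax()], dims=[8 * 13 * 13, 10])
    clf = Classifier(conv_seq, fully_connected,
                     weights_init=Uniform(width=0.1),
                     biases_init=Constant(0.0))
    clf.initialize()
    return clf.apply(tensor.tensor4('features'))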


def run_experiment_conv_only():

    np.random.seed(42)

    X = tensor.tensor4('features')
    nbr_channels = 3
    image_shape = (5, 5)

    conv_layers = [
        ConvolutionalLayer(
            filter_size=(2, 2),
            num_filters=10,
            activation=Rectifier().apply,
            border_mode='valid',
            pooling_size=(1, 1),
            weights_init=Uniform(width=0.1),
            #biases_init=Uniform(width=0.01),
            biases_init=Constant(0.0),
            name='conv0')
    ]
    conv_sequence = ConvolutionalSequence(conv_layers,
                                          num_channels=nbr_channels,
                                          image_size=image_shape)
    #conv_sequence.push_allocation_config()
    conv_sequence.initialize()

    flattener = Flattener()
    conv_output = conv_sequence.apply(X)
    y_hat = flattener.apply(conv_output)
    # The exact cost is unimportant: we only need its gradients, not to
    # actually train anything.
    cost = tensor.sqr(y_hat).sum()

    #L_grads_method_02 = [tensor.grad(cost, v) for v in VariableFilter(roles=[FILTER, BIAS])(ComputationGraph([y_hat]).variables)]
    L_grads_method_02 = [
        tensor.grad(cost, v) for v in VariableFilter(
            roles=[BIAS])(ComputationGraph([y_hat]).variables)
    ]
    # Note: this is the squared norm of the gradient summed over the
    # mini-batch, i.e. ||sum_n g_n||^2, not the per-example quantity
    # sum_n ||g_n||^2 that method_00 computes below.
    sum_square_norm_gradients_method_02 = sum(
        [tensor.sqr(g).sum() for g in L_grads_method_02])

    D_by_layer = get_conv_layers_transformation_roles(
        ComputationGraph(conv_output))
    individual_sum_square_norm_gradients_method_00 = get_sum_square_norm_gradients_conv_transformations(
        D_by_layer, cost)
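    # method_00 is the per-example quantity: a vector with one entry per
    # example, each the squared norm of that example's own gradient
    # (presumably computed from the layer inputs and output gradients, so
    # that N separate backprop passes are not needed).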

    # Note that the cost is a plain sum over the mini-batch (no 1/N
    # factor), so the batch gradient is the sum of the per-example
    # gradients, and ||sum_n g_n||^2 scales with N even though nothing
    # divides by N.

    N = 2
    Xtrain = np.random.randn(N, nbr_channels, image_shape[0],
                             image_shape[1]).astype(np.float32)
    #Xtrain[1:,:,:,:] = 0.0
    Xtrain[:, :, :, :] = 1.0

    convolution_filter_variable = VariableFilter(roles=[FILTER])(
        ComputationGraph([y_hat]).variables)[0]
    convolution_filter_variable_value = convolution_filter_variable.get_value()
    convolution_filter_variable_value[:, :, :, :] = 1.0
    #convolution_filter_variable_value[0,0,:,:] = 1.0
    convolution_filter_variable.set_value(convolution_filter_variable_value)
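
    # With all-ones filters and all-ones inputs, every example produces an
    # identical gradient, so the expected relationship between the
    # batch-summed norm and the per-example norms can be checked by hand.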

    f = theano.function([X], [
        cost, individual_sum_square_norm_gradients_method_00,
        sum_square_norm_gradients_method_02
    ])

    [c, v0, gs2] = f(Xtrain)

    #print "[c, v0, gs2]"
    L_c, L_v0, L_gs2 = ([], [], [])
    for n in range(N):
        [nc, nv0, ngs2] = f(Xtrain[n, :, :, :].reshape(
            (1, Xtrain.shape[1], Xtrain.shape[2], Xtrain.shape[3])))
        L_c.append(nc)
        L_v0.append(nv0)
        L_gs2.append(ngs2)

    print "Cost for whole mini-batch in single shot : %f." % c
    print "Cost for whole mini-batch accumulated    : %f." % sum(L_c)
    print ""
    print "Square-norm of all gradients for each data point in single shot :"
    print v0.reshape((1, -1))
    print "Square-norm of all gradients for each data point iteratively :"
    print np.array(L_gs2).reshape((1, -1))
    print ""
    print "Difference max abs : %f." % np.max(np.abs(v0 - np.array(L_gs2)))
    print ""
    print "Ratios : "
    print np.array(L_gs2).reshape((1, -1)) / v0.reshape((1, -1))
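

def check_per_example_grad_norm_trick():
    # Hedged numpy illustration (not from the original code) of the trick
    # that the get_sum_square_norm_gradients_* helpers presumably
    # implement, following Goodfellow's per-example gradient-norm idea:
    # for a linear layer h = x.dot(W) inside a cost summed over the
    # mini-batch, the gradient contributed by example n is
    # outer(x_n, delta_n) with delta_n = d cost / d h_n, so its squared
    # Frobenius norm factorizes as ||x_n||^2 * ||delta_n||^2.
    rng = np.random.RandomState(0)
    N, d_in, d_out = 4, 6, 3
    x = rng.randn(N, d_in)        # layer inputs, one row per example
    delta = rng.randn(N, d_out)   # stand-in backpropagated signal

    # Explicit per-example gradients: one outer product per example.
    explicit = np.array(
        [np.sum(np.outer(x[n], delta[n]) ** 2) for n in range(N)])
    # The trick: a product of row-wise squared norms, no outer products.
    factorized = np.sum(x ** 2, axis=1) * np.sum(delta ** 2, axis=1)

    assert np.allclose(explicit, factorized)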


def run_experiment_conv_mlp():

    np.random.seed(42)

    #X = tensor.matrix('features')
    X = tensor.tensor4('features')
    y = tensor.matrix('targets')
    nbr_channels = 3
    image_shape = (30, 30)

    conv_layers = [
        ConvolutionalLayer(filter_size=(4, 4),
                           num_filters=10,
                           activation=Rectifier().apply,
                           border_mode='full',
                           pooling_size=(1, 1),
                           weights_init=Uniform(width=0.1),
                           biases_init=Constant(0.0),
                           name='conv0'),
        ConvolutionalLayer(filter_size=(3, 3),
                           num_filters=14,
                           activation=Rectifier().apply,
                           border_mode='full',
                           pooling_size=(1, 1),
                           weights_init=Uniform(width=0.1),
                           biases_init=Constant(0.0),
                           name='conv1')
    ]
    conv_sequence = ConvolutionalSequence(conv_layers,
                                          num_channels=nbr_channels,
                                          image_size=image_shape)
    #conv_sequence.push_allocation_config()
    conv_sequence.initialize()
    conv_output_dim = np.prod(conv_sequence.get_dim('output'))
    #conv_output_dim = 25*25

    flattener = Flattener()

    mlp = MLP(activations=[Rectifier(), Rectifier(),
                           Softmax()],
              dims=[conv_output_dim, 50, 50, 10],
              weights_init=IsotropicGaussian(std=0.1),
              biases_init=IsotropicGaussian(std=0.01))
    mlp.initialize()

    conv_output = conv_sequence.apply(X)
    y_hat = mlp.apply(flattener.apply(conv_output))

    cost = CategoricalCrossEntropy().apply(y, y_hat)
    #cost = CategoricalCrossEntropy().apply(y_hat, y)
    #cost = BinaryCrossEntropy().apply(y.flatten(), y_hat.flatten())

    cg = ComputationGraph([y_hat])
    """
    print "--- INPUT ---"
    for v in VariableFilter(bricks=mlp.linear_transformations, roles=[INPUT])(cg.variables):
        print v.tag.annotations[0].name

    print "--- OUTPUT ---"
    #print(VariableFilter(bricks=mlp.linear_transformations, roles=[OUTPUT])(cg.variables))
    for v in VariableFilter(bricks=mlp.linear_transformations, roles=[OUTPUT])(cg.variables):
        print v.tag.annotations[0].name

    print "--- WEIGHT ---"
    #print(VariableFilter(bricks=mlp.linear_transformations, roles=[WEIGHT])(cg.variables))
    for v in VariableFilter(bricks=mlp.linear_transformations, roles=[WEIGHT])(cg.variables):
        print v.tag.annotations[0].name
    print "--- BIAS ---"
    #print(VariableFilter(bricks=mlp.linear_transformations, roles=[BIAS])(cg.variables))
    for v in VariableFilter(bricks=mlp.linear_transformations, roles=[BIAS])(cg.variables):
        print v.tag.annotations[0].name
    """

    # check out .tag on the variables to see which layer they belong to

    print "----------------------------"

    D_by_layer = get_linear_transformation_roles(mlp, cg)

    # returns a vector with one entry for each example in the mini-batch
    individual_sum_square_norm_gradients_method_00 = get_sum_square_norm_gradients_linear_transformations(
        D_by_layer, cost)

    #import pprint
    #pp = pprint.PrettyPrinter(indent=4)
    #pp.pprint(get_conv_layers_transformation_roles(ComputationGraph(conv_output)).items())

    D_by_layer = get_conv_layers_transformation_roles(
        ComputationGraph(conv_output))
    individual_sum_square_norm_gradients_method_00 += get_sum_square_norm_gradients_conv_transformations(
        D_by_layer, cost)
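    # Per-example squared norms add across layers: the full per-example
    # gradient is the concatenation of the per-layer gradients, so
    # ||g_n||^2 = sum over layers of ||g_n^(layer)||^2, which is why the
    # linear and convolutional contributions can simply be summed.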

    print "There are %d entries in cg.parameters." % len(cg.parameters)
    L_grads_method_01 = [tensor.grad(cost, p) for p in cg.parameters]
    L_grads_method_02 = [
        tensor.grad(cost, v)
        for v in VariableFilter(roles=[WEIGHT, BIAS])(cg.variables)
    ]

    # Note: these are squared norms of the gradient summed over the
    # mini-batch, ||sum_n g_n||^2, not the per-example sum_n ||g_n||^2
    # that method_00 computes.
    sum_square_norm_gradients_method_01 = sum(
        [tensor.sqr(g).sum() for g in L_grads_method_01])
    sum_square_norm_gradients_method_02 = sum(
        [tensor.sqr(g).sum() for g in L_grads_method_02])

    N = 8
    Xtrain = np.random.randn(N, nbr_channels, image_shape[0],
                             image_shape[1]).astype(np.float32)

    # Option 1: one-hot target vectors.
    ytrain = np.zeros((N, 10), dtype=np.float32)
    for n in range(N):
        label = np.random.randint(low=0, high=10)
        ytrain[n, label] = 1.0

    # Option 2, just to debug situations with NaN.
    #ytrain = np.random.rand(N, 10).astype(np.float32)
    #for n in range(N):
    #    ytrain[n,:] = ytrain[n,:] / ytrain[n,:].sum()

    f = theano.function([X, y], [
        cost, individual_sum_square_norm_gradients_method_00,
        sum_square_norm_gradients_method_01,
        sum_square_norm_gradients_method_02
    ])

    [c, v0, gs1, gs2] = f(Xtrain, ytrain)

    #print "[c, v0, gs1, gs2]"
    L_c, L_v0, L_gs1, L_gs2 = ([], [], [], [])
    for n in range(N):
        [nc, nv0, ngs1, ngs2] = f(
            Xtrain[n, :].reshape(
                (1, Xtrain.shape[1], Xtrain.shape[2], Xtrain.shape[3])),
            ytrain[n, :].reshape((1, 10)))
        L_c.append(nc)
        L_v0.append(nv0)
        L_gs1.append(ngs1)
        L_gs2.append(ngs2)

    print "Cost for whole mini-batch in single shot : %f." % c
    print "Cost for whole mini-batch accumulated    : %f." % sum(L_c)
    print ""
    print "Square-norm of all gradients for each data point in single shot :"
    print v0.reshape((1, -1))
    print "Square-norm of all gradients for each data point iteratively :"
    print np.array(L_gs1).reshape((1, -1))
    print "Square-norm of all gradients for each data point iteratively :"
    print np.array(L_gs2).reshape((1, -1))
    print ""
    print "Difference max abs : %f." % np.max(np.abs(v0 - np.array(L_gs1)))
    print "Difference max abs : %f." % np.max(np.abs(v0 - np.array(L_gs2)))
    print ""
    print "Ratios : "
    print np.array(L_gs1).reshape((1, -1)) / v0.reshape((1, -1))
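

if __name__ == '__main__':
    # Assumed entry point so the module runs both checks when executed
    # directly; not part of the original snippets.
    run_experiment_conv_only()
    run_experiment_conv_mlp()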