Example #1
def build_conv_classifier(input_node,
                          filter_shapes,
                          filter_counts,
                          filter_init_uniform_ranges,
                          pool_shapes,
                          pool_strides,
                          affine_output_sizes,
                          affine_init_stddevs,
                          dropout_include_rates,
                          conv_pads,
                          rng,
                          theano_rng):
    '''
    Builds a classification convnet on top of input_node.

    Returns
    -------
    rval: tuple
      (conv_layers, affine_layers, output_node), where:
         conv_layers is a list of the Conv2dLayer nodes.
         affine_layers is a list of the AffineLayer/SoftmaxLayer nodes.
         output_node is the final node, a Softmax.
    '''

    assert_is_instance(input_node, Lcn)

    conv_shape_args = (filter_shapes,
                       pool_shapes,
                       pool_strides)

    for conv_shapes in conv_shape_args:
        for conv_shape in conv_shapes:
            assert_all_integer(conv_shape)
            assert_all_greater(conv_shape, 0)

    conv_args = conv_shape_args + (filter_counts, filter_init_uniform_ranges)
    assert_all_equal([len(c) for c in conv_args])

    assert_equal(len(affine_output_sizes), len(affine_init_stddevs))

    assert_equal(len(dropout_include_rates),
                 len(filter_shapes) + len(affine_output_sizes))

    assert_equal(affine_output_sizes[-1], 10)  # for MNIST

    #assert_equal(input_node.output_format.axes, ('b', '0', '1'))

    #
    # Done sanity-checking args.
    #

    input_shape = input_node.output_format.shape

    # Converts from MNIST's ('b', '0', '1') to ('b', 'c', '0', '1')
    last_node = input_node

    conv_dropout_include_rates = \
        dropout_include_rates[:len(filter_shapes)]

    # Adds a dropout-conv-bias-relu-maxpool stack for each element in
    # filter_XXXX

    conv_layers = []

    def uniform_init(rng, params, init_range):
        '''
        Fills params with values uniformly sampled from
        [-init_range, init_range]
        '''

        assert_floating(init_range)
        assert_greater_equal(init_range, 0)

        values = params.get_value()
        values[...] = rng.uniform(low=-init_range,
                                  high=init_range,
                                  size=values.shape)
        params.set_value(values)

    for (filter_shape,
         filter_count,
         filter_init_range,
         pool_shape,
         pool_stride,
         conv_dropout_include_rate,
         conv_pad)                  in safe_izip(filter_shapes,
                                                 filter_counts,
                                                 filter_init_uniform_ranges,
                                                 pool_shapes,
                                                 pool_strides,
                                                 conv_dropout_include_rates,
                                                 conv_pads):
        if conv_dropout_include_rate != 1.0:
            last_node = Dropout(last_node,
                                conv_dropout_include_rate,
                                theano_rng)

        last_node = Conv2dLayer(last_node,
                                filter_shape,
                                filter_count,
                                conv_pads=conv_pad,
                                pool_window_shape=pool_shape,
                                pool_strides=pool_stride,
                                pool_pads='pylearn2')
        conv_layers.append(last_node)

        uniform_init(rng, last_node.conv2d_node.filters, filter_init_range)

    affine_dropout_include_rates = dropout_include_rates[len(filter_shapes):]

    affine_layers = []

    def normal_distribution_init(rng, params, stddev):
        '''
        Fills params with values sampled from a zero-mean normal
        distribution with standard deviation `stddev`.
        '''

        assert_floating(stddev)
        assert_greater_equal(stddev, 0)

        values = params.get_value()
        values[...] = rng.standard_normal(values.shape) * stddev
        params.set_value(values)

    #
    # Adds a dropout-affine-relu stack for each element in affine_XXXX,
    # except for the last one, where it omits the dropout.
    #

    # Add a fully connected layer here:
    output_format = DenseFormat(axes=('b', 'f'),
                                shape=(-1, 500),
                                dtype=None)

    # Use the first affine dropout rate for this hard-coded 500-unit layer.
    if affine_dropout_include_rates[0] < 1.0:
        last_node = Dropout(last_node,
                            affine_dropout_include_rates[0],
                            theano_rng)

    last_node = AffineLayer(last_node,
                            output_format,
                            input_to_bf_map={('0', '1', 'c'): 'f'})
    affine_layers.append(last_node)
    normal_distribution_init(rng,
                             last_node.affine_node.linear_node.params,
                             0.05)
 

    for (affine_size,
         affine_init_stddev,
         affine_dropout_include_rate) in \
        safe_izip(affine_output_sizes,
                  affine_init_stddevs,
                  affine_dropout_include_rates):

        '''
        if affine_dropout_include_rate < 1.0:
            last_node = Dropout(last_node,
                                affine_dropout_include_rate,
                                theano_rng)
        '''
        # No need to supply an axis map for the first affine transform.
        # By default, it collapses all non-'b' axes into a feature vector,
        # which is what we want.

        # remap from bc01 to b01c before flattening to bf, as pylearn2 does,
        # just so that they do identical things.
        last_node = SoftmaxLayer(last_node,
                                 DenseFormat(axes=('b', 'f'),
                                             shape=(-1, affine_size),
                                             dtype=None))
                                 #input_to_bf_map={('0', '1', 'c'): 'f'})
        normal_distribution_init(rng,
                                 last_node.affine_node.linear_node.params,
                                 affine_init_stddev)
        # stddev_init(rng, last_node.bias_node.params, affine_init_stddev)
        affine_layers.append(last_node)

    return conv_layers, affine_layers, last_node
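# ---------------------------------------------------------------------------
# Hypothetical usage sketch (added for illustration; not part of the original
# example). The exact input node and RNG construction depend on the
# surrounding framework, so `lcn_node` and `RandomStreams` below are
# placeholders:
#
#   rng = numpy.random.RandomState(1234)
#   theano_rng = RandomStreams(rng.randint(2 ** 15))
#   conv_layers, affine_layers, softmax = build_conv_classifier(
#       input_node=lcn_node,                     # an Lcn node over the images
#       filter_shapes=[(5, 5), (5, 5)],
#       filter_counts=[64, 64],
#       filter_init_uniform_ranges=[.05, .05],
#       pool_shapes=[(4, 4), (4, 4)],
#       pool_strides=[(2, 2), (2, 2)],
#       affine_output_sizes=[10],                # last size must be 10 (MNIST)
#       affine_init_stddevs=[.05],
#       dropout_include_rates=[1.0, 1.0, 1.0],   # one per conv + affine layer
#       conv_pads=['valid', 'valid'],
#       rng=rng,
#       theano_rng=theano_rng)
# ---------------------------------------------------------------------------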
Example #2
def build_fc_classifier(input_node, sizes, sparse_init_counts, dropout_include_probabilities, rng, theano_rng):
    """
    Builds a stack of fully-connected layers followed by a Softmax.

    Each hidden layer will be preceded by a ReLU.

    Initialization:

    Weights are initialized in the same way as in Pylearn2's MLP tutorial:
    pylearn2/scripts/tutorials/multilayer_perceptron/mlp_tutorial_part_3.yaml

    This means the following:

    Of the N affine layers, the weights of the first N-1 are all set to 0.0,
    except
    for k randomly-chosen elements, which are set to some random number drawn
    from the normal distribution with stddev=1.0.

    The biases are all initialized to 0.0.
    The last layer's weights and biases are both set to 0.0.

    Parameters
    ----------
    input_node: Node
      The node to build the stack on.

    sizes: Sequence
      A sequence of ints, indicating the output sizes of each layer.
      The last int is the number of classes.

    sparse_init_counts:
      A sequence of N-1 ints, where N = len(sizes).
      Used to initialize the weights of the first N-1 layers.
      If the n'th element is x, this means that the n'th layer
      will have x nonzeros, with the rest initialized to zeros.

    dropout_include_probabilities: Sequence
      A Sequence of N floats, where N := len(sizes)
      The dropout include probabilities for the inputs of each of the layers.
      If any of these probabilities is 1.0, the corresponding Dropout node
      will be omitted.

    rng: numpy.random.RandomState
      The RandomState to draw initial weights from.

    theano_rng: theano.tensor.shared_randomstreams.RandomStreams
      The RandomStreams to draw dropout masks from.

    Returns
    -------
    rval: tuple
      (affine_nodes, output_node), where affine_nodes is a list of the
      AffineNodes, in order, and output_node is the final node, a Softmax.
    """
    assert_is_instance(input_node, Node)

    # pylint: disable=no-member
    assert_equal(input_node.output_format.dtype, numpy.dtype(theano.config.floatX))

    assert_greater(len(sizes), 0)
    assert_all_greater(sizes, 0)

    assert_equal(len(sparse_init_counts), len(sizes) - 1)
    assert_all_integer(sparse_init_counts)
    assert_all_greater(sparse_init_counts, 0)
    assert_all_less_equal(sparse_init_counts, sizes[:-1])

    assert_equal(len(dropout_include_probabilities), len(sizes))

    affine_nodes = []

    last_node = input_node

    for layer_index, layer_output_size in enumerate(sizes):
        # Add dropout, if asked for
        include_probability = dropout_include_probabilities[layer_index]
        if include_probability != 1.0:
            last_node = Dropout(last_node, include_probability, theano_rng)

        output_format = DenseFormat(axes=("b", "f"), shape=(-1, layer_output_size), dtype=None)

        if layer_index < (len(sizes) - 1):
            last_node = AffineLayer(last_node, output_format)
        else:
            last_node = SoftmaxLayer(last_node, output_format)

        affine_nodes.append(last_node.affine_node)

    # Not used in this demo, but keeping it in, in case we want to start
    # using it again.
    def init_sparse_bias(shared_variable, num_nonzeros, rng):
        """
        Mimics the sparse initialization in
        pylearn2.models.mlp.Linear.set_input_space()
        """

        params = shared_variable.get_value()
        assert_equal(params.shape[0], 1)

        assert_greater_equal(num_nonzeros, 0)
        assert_less_equal(num_nonzeros, params.shape[1])

        params[...] = 0.0

        indices = rng.choice(params.size, size=num_nonzeros, replace=False)

        # normal dist with stddev=1.0
        params[0, indices] = rng.randn(num_nonzeros)

        # Found that for biases, this didn't help (it increased the
        # final misclassification rate by .001)
        # if num_nonzeros > 0:
        #     params /= float(num_nonzeros)

        shared_variable.set_value(params)

    def init_sparse_linear(shared_variable, num_nonzeros, rng):
        params = shared_variable.get_value()
        params[...] = 0.0

        assert_greater_equal(num_nonzeros, 0)
        assert_less_equal(num_nonzeros, params.shape[0])

        for c in xrange(params.shape[1]):
            indices = rng.choice(params.shape[0], size=num_nonzeros, replace=False)

            # normal dist with stddev=1.0, divided by 255.0
            #
            # We need to divide by 255 for convergence. This is because
            # we're using unnormalized (i.e. 0 to 255) pixel values, unlike the
            # 0.0-to-1.0 pixels in
            # pylearn2.scripts.tutorials.multilayer_perceptron/
            #
            # We could just do as the above tutorial does and normalize the
            # pixels to [0.0, 1.0], and not rescale the weights. However,
            # experiments show that this converges to a higher error, and also
            # makes mnist_visualizer.py's results look very "staticky", without
            # any recognizable digit hallucinations.
            params[indices, c] = rng.randn(num_nonzeros) / 255.0

        shared_variable.set_value(params)

    # Initialize the affine layer weights (not the biases, and not the softmax
    # weights)
    for sparse_init_count, affine_node in safe_izip(sparse_init_counts, affine_nodes[:-1]):
        # pylearn2 doesn't sparse_init the biases. I also found that
        # doing so slightly increases the final misclassification rate.
        init_sparse_linear(affine_node.linear_node.params, sparse_init_count, rng)

    return affine_nodes, last_node
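
# Minimal, self-contained sketch (added for illustration; not from the
# original source) of the sparse initialization scheme described in the
# docstring above and implemented by init_sparse_linear: each output column
# gets `num_nonzeros` randomly chosen entries drawn from a unit normal, and
# every other entry stays 0.0. The post-scaling (dividing by 255.0 here, or
# by num_nonzeros in the other variant in this collection) is omitted.
def _sparse_init_sketch(shape, num_nonzeros, rng):
    weights = numpy.zeros(shape)
    for column in range(shape[1]):
        rows = rng.choice(shape[0], size=num_nonzeros, replace=False)
        weights[rows, column] = rng.randn(num_nonzeros)
    return weights

# Example: a 784 x 500 weight matrix with 15 nonzeros per column.
# _sparse_init_sketch((784, 500), 15, numpy.random.RandomState(0))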
def build_conv_classifier(input_node,
                          filter_shapes,
                          filter_counts,
                          filter_init_uniform_ranges,
                          pool_shapes,
                          pool_strides,
                          affine_output_sizes,
                          affine_init_stddevs,
                          dropout_include_rates,
                          conv_pads,
                          rng,
                          theano_rng):
    '''
    Builds a classification convnet on top of input_node.

    Returns
    -------
    rval: tuple
      (conv_layers, affine_layers, output_node, params_flat, params_old_flat,
      shapes), where:
         conv_layers is a list of the Conv2dLayer nodes.
         affine_layers is a list of the AffineLayer/SoftmaxLayer nodes.
         output_node is the final node, a Softmax.
         params_flat and params_old_flat are theano shared variables holding
         all parameters of the re-built network as one flat vector.
         shapes is the list of parameter shapes used to slice params_flat.
    '''

    assert_is_instance(input_node, Lcn)

    conv_shape_args = (filter_shapes,
                       pool_shapes,
                       pool_strides)

    for conv_shapes in conv_shape_args:
        for conv_shape in conv_shapes:
            assert_all_integer(conv_shape)
            assert_all_greater(conv_shape, 0)

    conv_args = conv_shape_args + (filter_counts, filter_init_uniform_ranges)
    assert_all_equal([len(c) for c in conv_args])

    assert_equal(len(affine_output_sizes), len(affine_init_stddevs))

    assert_equal(len(dropout_include_rates),
                 len(filter_shapes) + len(affine_output_sizes))

    assert_equal(affine_output_sizes[-1], 10)  # for MNIST

    #assert_equal(input_node.output_format.axes, ('b', '0', '1'))

    #
    # Done sanity-checking args.
    #

    input_shape = input_node.output_format.shape

    # Converts from MNIST's ('b', '0', '1') to ('b', 'c', '0', '1')
    last_node = input_node

    conv_dropout_include_rates = \
        dropout_include_rates[:len(filter_shapes)]

    # Adds a dropout-conv-bias-relu-maxpool stack for each element in
    # filter_XXXX

    conv_layers = []

    def uniform_init(rng, params, init_range):
        '''
        Fills params with values uniformly sampled from
        [-init_range, init_range]
        '''

        assert_floating(init_range)
        assert_greater_equal(init_range, 0)

        values = params.get_value()
        values[...] = rng.uniform(low=-init_range,
                                  high=init_range,
                                  size=values.shape)
        params.set_value(values)

    for (filter_shape,
         filter_count,
         filter_init_range,
         pool_shape,
         pool_stride,
         conv_dropout_include_rate,
         conv_pad)                 in safe_izip(filter_shapes,
                                                 filter_counts,
                                                 filter_init_uniform_ranges,
                                                 pool_shapes,
                                                 pool_strides,
                                                 conv_dropout_include_rates,
                                                 conv_pads):
        if conv_dropout_include_rate != 1.0:
            last_node = Dropout(last_node,
                                conv_dropout_include_rate,
                                theano_rng)

        last_node = Conv2dLayer(last_node,
                                filter_shape,
                                filter_count,
                                conv_pads=conv_pad,
                                pool_window_shape=pool_shape,
                                pool_strides=pool_stride,
                                pool_pads='pylearn2')
        conv_layers.append(last_node)

        uniform_init(rng, last_node.conv2d_node.filters, filter_init_range)

    affine_dropout_include_rates = dropout_include_rates[len(filter_shapes):]

    affine_layers = []

    def normal_distribution_init(rng, params, stddev):
        '''
        Fills params with values sampled from a zero-mean normal
        distribution with standard deviation `stddev`.
        '''

        assert_floating(stddev)
        assert_greater_equal(stddev, 0)

        values = params.get_value()
        values[...] = rng.standard_normal(values.shape) * stddev
        params.set_value(values)

    #
    # Adds a dropout-affine-relu stack for each element in affine_XXXX,
    # except for the last one, where it omits the dropout.
    #

    # Add a fully connected layer here:
    output_format = DenseFormat(axes=('b', 'f'),
                                shape=(-1, 500),
                                dtype=None)

    # Use the first affine dropout rate for this hard-coded 500-unit layer.
    if affine_dropout_include_rates[0] < 1.0:
        last_node = Dropout(last_node,
                            affine_dropout_include_rates[0],
                            theano_rng)

    last_node = AffineLayer(last_node,
                            output_format,
                            input_to_bf_map={('0', '1', 'c'): 'f'})
    affine_layers.append(last_node)
    normal_distribution_init(rng,
                             last_node.affine_node.linear_node.params,
                             0.05)


    for (affine_size,
         affine_init_stddev,
         affine_dropout_include_rate) in \
        safe_izip(affine_output_sizes,
                  affine_init_stddevs,
                  affine_dropout_include_rates):

        '''
        if affine_dropout_include_rate < 1.0:
            last_node = Dropout(last_node,
                                affine_dropout_include_rate,
                                theano_rng)
        '''
        # No need to supply an axis map for the first affine transform.
        # By default, it collapses all non-'b' axes into a feature vector,
        # which is what we want.

        # remap from bc01 to b01c before flattening to bf, as pylearn2 does,
        # just so that they do identical things.
        last_node = SoftmaxLayer(last_node,
                                 DenseFormat(axes=('b', 'f'),
                                             shape=(-1, affine_size),
                                             dtype=None))
                                 #input_to_bf_map={('0', '1', 'c'): 'f'})
        normal_distribution_init(rng,
                                 last_node.affine_node.linear_node.params,
                                 affine_init_stddev)
        # stddev_init(rng, last_node.bias_node.params, affine_init_stddev)
        affine_layers.append(last_node)

    #################################################################################################
    ### BUILD THE SECOND NETWORK WITH FLAT PARAMETERS (given the dimensions of the first) ###########
    #################################################################################################

    rng = numpy.random.RandomState(281934)
    std_deviation = .05

    # Fetch all parameters and shapes
    parameters = []
    for conv_layer in conv_layers:
        filters = conv_layer.conv2d_node.filters
        parameters.append(filters)

        bias = conv_layer.bias_node.params
        parameters.append(bias)

    for affine_layer in affine_layers:
        weights = affine_layer.affine_node.linear_node.params
        parameters.append(weights)

        biases = affine_layer.affine_node.bias_node.params
        parameters.append(biases)

    '''
    print(len(parameters))
    for parameter in parameters:
        print(parameter.get_value().shape)

    shapes = []
    params_flat_values = numpy.asarray([], dtype=theano.config.floatX)
    counter = 0
    for parameter in parameters:
        shape = parameter.get_value().shape
        if counter%2 == 0 and len(shape)==4:
            vector_param = numpy.asarray(numpy.ndarray.flatten(parameter.get_value()), dtype=theano.config.floatX)
            vector_param[...] = rng.standard_normal(vector_param.shape) * std_deviation
            col_length = shape[2]
            index_from = 0

            ###
            #for _ in range(shape[0]*shape[1]*shape[3]):
            #    index_to = index_from + col_length
            #    vector_param[index_from:index_to] = vector_param[index_from:index_to]/numpy.linalg.norm(vector_param[index_from:index_to])
            #    index_from = index_to
            ####

        elif counter%2==0:
            vector_param = numpy.asarray(numpy.ndarray.flatten(parameter.get_value()), dtype=theano.config.floatX)
            vector_param[...] = rng.standard_normal(vector_param.shape) * std_deviation
        else:
            vector_param = numpy.asarray(numpy.ndarray.flatten(parameter.get_value()), dtype=theano.config.floatX)

        params_flat_values = numpy.append(params_flat_values, vector_param)
        shapes.append(shape)
    '''
    params_flat_values = numpy.asarray([], dtype=theano.config.floatX)
    shapes = []

    for parameter in parameters:
        parameter_value = parameter.get_value()
        shapes.append(parameter_value.shape)
        vector_param = numpy.asarray(numpy.ndarray.flatten(parameter_value))
        params_flat_values = numpy.append(params_flat_values, vector_param)
        print(parameter.get_value().shape)

    print(params_flat_values)
    print(params_flat_values.shape)

    params_flat = theano.shared(params_flat_values)
    params_old_flat = theano.shared(params_flat_values)

    assert_is_instance(input_node, Lcn)

    conv_shape_args = (filter_shapes,
                       pool_shapes,
                       pool_strides)

    for conv_shapes in conv_shape_args:
        for conv_shape in conv_shapes:
            assert_all_integer(conv_shape)
            assert_all_greater(conv_shape, 0)

    conv_args = conv_shape_args + (filter_counts, filter_init_uniform_ranges)
    assert_all_equal([len(c) for c in conv_args])

    assert_equal(len(affine_output_sizes), len(affine_init_stddevs))

    assert_equal(len(dropout_include_rates),
                 len(filter_shapes) + len(affine_output_sizes))

    assert_equal(affine_output_sizes[-1], 10)  # for MNIST

    #assert_equal(input_node.output_format.axes, ('b', '0', '1'))

    #
    # Done sanity-checking args.
    #

    input_shape = input_node.output_format.shape

    # Converts from MNIST's ('b', '0', '1') to ('b', 'c', '0', '1')
    last_node = input_node

    conv_dropout_include_rates = \
        dropout_include_rates[:len(filter_shapes)]

    # Adds a dropout-conv-bias-relu-maxpool stack for each element in
    # filter_XXXX

    conv_layers = []
    counter = 0
    index_from = 0

    for (filter_shape,
         filter_count,
         filter_init_range,
         pool_shape,
         pool_stride,
         conv_dropout_include_rate,
         conv_pad)                 in safe_izip(filter_shapes,
                                                 filter_counts,
                                                 filter_init_uniform_ranges,
                                                 pool_shapes,
                                                 pool_strides,
                                                 conv_dropout_include_rates,
                                                 conv_pads):
        if conv_dropout_include_rate != 1.0:
            last_node = Dropout(last_node,
                                conv_dropout_include_rate,
                                theano_rng)

        print(shapes)

        # Slice this layer's filters and bias out of the flat parameter
        # vector, using the shapes recorded from the first network.
        shape1 = shapes[counter]
        shape2 = shapes[counter + 1]
        size1 = numpy.prod(numpy.asarray(shape1))
        size2 = numpy.prod(numpy.asarray(shape2))
        index_to = index_from + size1
        #filters_ = theano.tensor.transpose(params_flat[index_from:index_to].reshape(shape1), axes=[0,1,3,2])
        filters_ = params_flat[index_from:index_to].reshape(shape1)
        index_from = index_to
        index_to = index_from + size2
        bias_ = params_flat[index_from:index_to].reshape(shape2)
        index_from = index_to

        last_node = Conv2dLayer(last_node,
                                filter_shape,
                                filter_count,
                                conv_pads=conv_pad,
                                pool_window_shape=pool_shape,
                                pool_strides=pool_stride,
                                pool_pads='pylearn2',
                                filters=filters_,
                                bias=bias_)
        conv_layers.append(last_node)

        counter = counter + 2


    affine_dropout_include_rates = dropout_include_rates[len(filter_shapes):]

    affine_layers = []

    #
    # Adds a dropout-affine-relu stack for each element in affine_XXXX,
    # except for the last one, where it omits the dropout.
    #

    # Add a fully connected layer here:

    shape1 = shapes[counter]
    #shape1 = (shape1[1], shape1[0])
    shape2 = shapes[counter+1]
    size1 = numpy.prod(numpy.asarray(shape1))
    size2 = numpy.prod(numpy.asarray(shape2))
    index_to = index_from + size1
    weights_ = params_flat[index_from:index_to].reshape(shape1)
    index_from = index_to
    index_to = index_from + size2
    bias_ = params_flat[index_from:index_to].reshape(shape2)
    index_from = index_to

    output_format = DenseFormat(axes=('b', 'f'),
                                shape=(-1, 500),
                                dtype=None)

    # Use the first affine dropout rate for this hard-coded 500-unit layer.
    if affine_dropout_include_rates[0] < 1.0:
        last_node = Dropout(last_node,
                            affine_dropout_include_rates[0],
                            theano_rng)

    last_node = AffineLayer(last_node,
                            output_format,
                            weights=weights_,
                            bias=bias_,
                            input_to_bf_map={('0', '1', 'c'): 'f'})
    affine_layers.append(last_node)

    counter += 2

    for (affine_size,
         affine_init_stddev,
         affine_dropout_include_rate) in \
        safe_izip(affine_output_sizes,
                  affine_init_stddevs,
                  affine_dropout_include_rates):

        if affine_dropout_include_rate < 1.0:
            last_node = Dropout(last_node,
                                affine_dropout_include_rate,
                                theano_rng)

        # No need to supply an axis map for the first affine transform.
        # By default, it collapses all non-'b' axes into a feature vector,
        # which is what we want.

        shape1 = shapes[counter]
        #shape1 = (shape1[1], shape1[0])
        shape2 = shapes[counter+1]
        size1 = numpy.prod(numpy.asarray(shape1))
        size2 = numpy.prod(numpy.asarray(shape2))
        index_to = index_from + size1
        weights_ = params_flat[index_from:index_to].reshape(shape1)
        index_from = index_to
        index_to = index_from + size2
        bias_ = params_flat[index_from:index_to].reshape(shape2)
        index_from = index_to
        # remap from bc01 to b01c before flattening to bf, as pylearn2 does,
        # just so that they do identical things.
        last_node = SoftmaxLayer(last_node,
                                 DenseFormat(axes=('b', 'f'),
                                             shape=(-1, affine_size),
                                             dtype=None),
                                 weights=weights_,
                                 bias=bias_)
                                 #input_to_bf_map={('0', '1', 'c'): 'f'})

        # stddev_init(rng, last_node.bias_node.params, affine_init_stddev)
        affine_layers.append(last_node)

        counter += 2


    return conv_layers, affine_layers, last_node, params_flat, params_old_flat, shapes
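
# Illustrative, self-contained sketch (added; not from the original source)
# of the flat-parameter bookkeeping used above: every parameter tensor is
# flattened into one long vector, and each layer's weights/biases are
# recovered by slicing consecutive ranges back out of it and reshaping.
# In the code above the same slicing is applied to a theano.shared vector
# (params_flat), so all layers share one flat parameter vector.
def _flatten_params_sketch(param_arrays):
    shapes = [p.shape for p in param_arrays]
    flat = numpy.concatenate([p.ravel() for p in param_arrays])
    return flat, shapes

def _unflatten_params_sketch(flat, shapes):
    params, index_from = [], 0
    for shape in shapes:
        index_to = index_from + int(numpy.prod(shape))
        params.append(flat[index_from:index_to].reshape(shape))
        index_from = index_to
    return params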
Example #4
def build_fc_classifier(input_node,
                        sizes,
                        sparse_init_counts,
                        dropout_include_probabilities,
                        rng,
                        theano_rng):
    '''
    Builds a stack of fully-connected layers followed by a Softmax.

    Each hidden layer will be preceded by a ReLU.

    Initialization:

    Weights are initialized in the same way as in Pylearn2's MLP tutorial:
    pylearn2/scripts/tutorials/multilayer_perceptron/mlp_tutorial_part_3.yaml

    This means the following:

    Of the N affine layers, the weights of the first N-1 are all set to 0.0,
    except
    for k randomly-chosen elements, which are set to some random number drawn
    from the normal distribution with stddev=1.0.

    The biases are all initialized to 0.0.
    The last layer's weights and biases are both set to 0.0.

    Parameters
    ----------
    input_node: Node
      The node to build the stack on.

    sizes: Sequence
      A sequence of ints, indicating the output sizes of each layer.
      The last int is the number of classes.

    sparse_init_counts:
      A sequence of N-1 ints, where N = len(sizes).
      Used to initialize the weights of the first N-1 layers.
      If the n'th element is x, this means that the n'th layer
      will have x nonzeros, with the rest initialized to zeros.

    dropout_include_probabilities: Sequence
      A Sequence of N floats, where N := len(sizes)
      The dropout include probabilities for the inputs of each of the layers.
      If any of these probabilities is 1.0, the corresponding Dropout node
      will be omitted.

    rng: numpy.random.RandomState
      The RandomState to draw initial weights from.

    theano_rng: theano.tensor.shared_randomstreams.RandomStreams
      The RandomStreams to draw dropout masks from.

    Returns
    -------
    rval: tuple
      (affine_nodes, output_node, params_flat, params_old_flat, shapes), where
      affine_nodes is a list of the AffineNodes, in order, output_node is the
      final node (a Softmax), params_flat and params_old_flat are theano
      shared variables holding all parameters as one flat vector, and shapes
      is the list of per-parameter shapes.
    '''
    assert_is_instance(input_node, Node)

    # pylint: disable=no-member
    assert_equal(input_node.output_format.dtype,
                 numpy.dtype(theano.config.floatX))

    assert_greater(len(sizes), 0)
    assert_all_greater(sizes, 0)

    assert_equal(len(sparse_init_counts), len(sizes) - 1)
    assert_all_integer(sparse_init_counts)
    assert_all_greater(sparse_init_counts, 0)
    assert_all_less_equal(sparse_init_counts, sizes[:-1])

    assert_equal(len(dropout_include_probabilities), len(sizes))

    '''
    affine_nodes = []

    last_node = input_node

    for layer_index, layer_output_size in enumerate(sizes):
        # Add dropout, if asked for
        include_probability = dropout_include_probabilities[layer_index]
        if include_probability != 1.0:
            last_node = Dropout(last_node, include_probability, theano_rng)

        output_format = DenseFormat(axes=('b', 'f'),
                                    shape=(-1, layer_output_size),
                                    dtype=None)

        if layer_index < (len(sizes) - 1):
            last_node = AffineLayer(last_node, output_format)
        else:
            last_node = SoftmaxLayer(last_node, output_format)

        affine_nodes.append(last_node.affine_node)


    # Not used in this demo, but keeping it in in case we want to start using
    # it again.
    def init_sparse_bias(shared_variable, num_nonzeros, rng):

        #Mimics the sparse initialization in
        #pylearn2.models.mlp.Linear.set_input_space()


        params = shared_variable.get_value()
        assert_equal(params.shape[0], 1)

        assert_greater_equal(num_nonzeros, 0)
        assert_less_equal(num_nonzeros, params.shape[1])

        params[...] = 0.0

        indices = rng.choice(params.size,
                             size=num_nonzeros,
                             replace=False)

        # normal dist with stddev=1.0
        params[0, indices] = rng.randn(num_nonzeros)

        # Found that for biases, this didn't help (it increased the
        # final misclassification rate by .001)
        # if num_nonzeros > 0:
        #     params /= float(num_nonzeros)

        shared_variable.set_value(params)

    def init_sparse_linear(shared_variable, num_nonzeros, rng):
        params = shared_variable.get_value()
        params[...] = 0.0

        assert_greater_equal(num_nonzeros, 0)
        assert_less_equal(num_nonzeros, params.shape[0])

        for c in xrange(params.shape[1]):
            indices = rng.choice(params.shape[0],
                                 size=num_nonzeros,
                                 replace=False)

            # normal dist with stddev=1.0, divided by 255.0
            #
            # We need to divide by 255 for convergence. This is because
            # we're using unnormalized (i.e. 0 to 255) pixel values, unlike the
            # 0.0-to-1.0 pixels in
            # pylearn2.scripts.tutorials.multilayer_perceptron/
            #
            # We could just do as the above tutorial does and normalize the
            # pixels to [0.0, 1.0], and not rescale the weights. However,
            # experiments show that this converges to a higher error, and also
            # makes mnist_visualizer.py's results look very "staticky", without
            # any recognizable digit hallucinations.
            params[indices, c] = rng.randn(num_nonzeros) / 255.0

        shared_variable.set_value(params)

    # Initialize the affine layer weights (not the biases, and not the softmax
    # weights)
    for sparse_init_count, affine_node in safe_izip(sparse_init_counts,
                                                    affine_nodes[:-1]):
        # pylearn2 doesn't sparse_init the biases. I also found that
        # doing so slightly increases the final misclassification rate.
        init_sparse_linear(affine_node.linear_node.params,
                           sparse_init_count,
                           rng)

    #################################################################################################
    ### BUILD THE SECOND NETWORK WITH FLAT PARAMETERS (given the dimensions of the first) ###########
    #################################################################################################

    parameters = []
    shapes = []
    for affine_node in affine_nodes:
        weights = affine_node.linear_node.params
        bias = affine_node.bias_node.params
        parameters.append(weights)
        parameters.append(bias)
        shapes.append(weights.get_value().shape)
        shapes.append(bias.get_value().shape)

    params_flat_values = numpy.asarray([], dtype=theano.config.floatX)
    for parameter in parameters:
        vector_param = numpy.asarray(numpy.ndarray.flatten(parameter.get_value()), dtype=theano.config.floatX)
        params_flat_values = numpy.append(params_flat_values, vector_param)

    params_flat = theano.shared(params_flat_values)
    params_old_flat = theano.shared(params_flat_values)

    affine_nodes = []
    last_node = input_node
    counter = 0
    index_from = 0
    for layer_index, layer_output_size in enumerate(sizes):

        shape1 = shapes[counter]
        shape2 = shapes[counter+1]
        size1= numpy.prod(numpy.asarray(shape1))
        size2= numpy.prod(numpy.asarray(shape2))
        index_to = index_from + size1
        weights_ = params_flat[index_from:index_to].reshape(shape1)
        index_from = index_to
        index_to = index_from + size2
        bias_ = params_flat[index_from:index_to].reshape(shape2)
        index_from = index_to
        counter = counter + 2

        # Add dropout, if asked for
        include_probability = dropout_include_probabilities[layer_index]
        if include_probability != 1.0:
            last_node = Dropout(last_node, include_probability, theano_rng)

        output_format = DenseFormat(axes=('b', 'f'),
                                    shape=(-1, layer_output_size),
                                    dtype=None)

        if layer_index < (len(sizes) - 1):
            last_node = AffineLayer(last_node, output_format, weights=weights_, bias=bias_)
        else:
            last_node = SoftmaxLayer(last_node, output_format, weights=weights_, bias=bias_)

        affine_nodes.append(last_node.affine_node)

    return affine_nodes, last_node, params_flat, params_old_flat
    '''

    std_deviation = .05

    input_size = 784
    params_temp1 = [rng.standard_normal( (sizes[0]* input_size) ).astype(theano.config.floatX)*std_deviation,
                    numpy.zeros(sizes[0], dtype=theano.config.floatX) ]

    params_temp2 = sum([ [rng.standard_normal( sizes[i] * sizes[i+1] ).astype(theano.config.floatX)*std_deviation,
                          numpy.zeros(sizes[i+1], dtype=theano.config.floatX)] for i in range(len(sizes)-1) ],[] )

    params_flat_values = numpy.concatenate( params_temp1 + params_temp2 )

    params_flat = theano.shared(params_flat_values)
    params_old_flat = theano.shared(params_flat_values)

    shapes = []
    param_arrays = []
    index_to = input_size * sizes[0]
    param_arrays.append(params_flat[:index_to].reshape((sizes[0], input_size))) # Add weights
    shapes.append((input_size, sizes[0]))
    index_from = index_to
    index_to += sizes[0]
    param_arrays.append(params_flat[index_from:index_to]) # Add bias
    shapes.append((index_to-index_from, ))

    for i in range(len(sizes)-1):

        index_from = index_to
        index_to += sizes[i]*sizes[i+1]
        param_arrays.append(params_flat[index_from:index_to].reshape((sizes[i+1],sizes[i]))) # Add weight
        shapes.append((sizes[i], sizes[i+1]))
        #print(index_from, index_to)
        #print 'reshaped to'
        #print(sizes[i], sizes[i+1])
        index_from = index_to
        index_to += sizes[i+1]
        param_arrays.append(params_flat[index_from:index_to]) # Add bias
        shapes.append((index_to-index_from, ))

    layers = [input_node]

    for i in range(len(sizes) - 1):  # one affine layer per hidden layer
        layers.append(AffineLayer(input_node=layers[-1],  # last element of <layers>
                                  output_format=DenseFormat(axes=('b', 'f'),  # axis order: (batch, feature)
                                                            shape=(-1, sizes[i]),  # output shape: (variable batch size, sizes[i] features)
                                                            dtype=None),  # don't change the input data type
                                  weights=theano.tensor.transpose(param_arrays[i * 2]),
                                  bias=param_arrays[i * 2 + 1]))

    layers.append(SoftmaxLayer(input_node=layers[-1],
                               output_format=DenseFormat(axes=('b', 'f'),  # axis order: (batch, feature)
                                                         shape=(-1, sizes[i + 1]),  # output shape: (variable batch size, number of classes)
                                                         dtype=None),  # don't change the input data type
                               weights=theano.tensor.transpose(param_arrays[(i + 1) * 2]),
                               bias=param_arrays[(i + 1) * 2 + 1]))

    softmax_layer = layers[-1]

    last_node = softmax_layer
    affine_nodes = []
    for i in range(1,len(layers)):
        affine_nodes.append(layers[i].affine_node)

    print(shapes)

    return affine_nodes, last_node, params_flat, params_old_flat, shapes
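
# Note (added): in the flat-parameter variant above, each weight matrix is
# stored in the flat vector in (output_size, input_size) order and reshaped
# accordingly, then transposed to (input_size, output_size) before being
# passed to AffineLayer/SoftmaxLayer; the `shapes` list records the
# transposed (input_size, output_size) shapes.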
def build_fc_classifier(input_node,
                        sizes,
                        sparse_init_counts,
                        dropout_include_probabilities,
                        rng,
                        theano_rng):
    '''
    Builds a stack of fully-connected layers followed by a Softmax.

    Each hidden layer will be preceded by a ReLU.

    Initialization:

    Weights are initialized in the same way as in Pylearn2's MLP tutorial:
    pylearn2/scripts/tutorials/multilayer_perceptron/mlp_tutorial_part_3.yaml

    This means the following:

    Of the N affine layers, the weights of the first N-1 are all set to 0.0,
    except
    for k randomly-chosen elements, which are set to some random number drawn
    from the normal distribution with stddev=1.0.

    The biases are all initialized to 0.0.
    The last layer's weights and biases are both set to 0.0.

    Parameters
    ----------
    input_node: Node
      The node to build the stack on.

    sizes: Sequence
      A sequence of ints, indicating the output sizes of each layer.
      The last int is the number of classes.

    sparse_init_counts:
      A sequence of N-1 ints, where N = len(sizes).
      Used to initialize the weights of the first N-1 layers.
      If the n'th element is x, this means that the n'th layer
      will have x nonzeros, with the rest initialized to zeros.

    dropout_include_probabilities: Sequence
      A Sequence of N floats, where N := len(sizes)
      The dropout include probabilities for the inputs of each of the layers.
      If any of these probabilities is 1.0, the corresponding Dropout node
      will be omitted.

    rng: numpy.random.RandomState
      The RandomState to draw initial weights from.

    theano_rng: theano.tensor.shared_randomstreams.RandomStreams
      The RandomStreams to draw dropout masks from.

    Returns
    -------
    rval: tuple
      (affine_nodes, output_node), where affine_nodes is a list of the
      AffineNodes, in order, and output_node is the final node, a Softmax.
    '''
    assert_is_instance(input_node, Node)
    assert_equal(input_node.output_format.dtype,
                 numpy.dtype(theano.config.floatX))

    assert_greater(len(sizes), 0)
    assert_all_greater(sizes, 0)

    assert_equal(len(sparse_init_counts), len(sizes) - 1)
    assert_all_integer(sparse_init_counts)
    assert_all_greater(sparse_init_counts, 0)
    assert_all_less_equal(sparse_init_counts, sizes[:-1])

    assert_equal(len(dropout_include_probabilities), len(sizes))

    affine_nodes = []

    last_node = input_node

    for layer_index, layer_output_size in enumerate(sizes):
        # Add dropout, if asked for
        include_probability = dropout_include_probabilities[layer_index]
        if include_probability != 1.0:
            last_node = Dropout(last_node, include_probability, theano_rng)

        output_format = DenseFormat(axes=('b', 'f'),
                                    shape=(-1, layer_output_size),
                                    dtype=None)

        if layer_index < (len(sizes) - 1):
            last_node = AffineLayer(last_node, output_format)
        else:
            last_node = SoftmaxLayer(last_node, output_format)

        affine_nodes.append(last_node.affine_node)

    def init_sparse_bias(shared_variable, num_nonzeros, rng):
        '''
        Mimics the sparse initialization in
        pylearn2.models.mlp.Linear.set_input_space()
        '''

        params = shared_variable.get_value()
        assert_equal(params.shape[0], 1)

        assert_greater_equal(num_nonzeros, 0)
        assert_less_equal(num_nonzeros, params.shape[1])

        params[...] = 0.0

        indices = rng.choice(params.size,
                             size=num_nonzeros,
                             replace=False)

        # normal dist with stddev=1.0
        params[0, indices] = rng.randn(num_nonzeros)

        # Found that for biases, this didn't help (it increased the
        # final misclassification rate by .001)
        # if num_nonzeros > 0:
        #     params /= float(num_nonzeros)

        shared_variable.set_value(params)

    def init_sparse_linear(shared_variable, num_nonzeros, rng):
        params = shared_variable.get_value()
        params[...] = 0.0

        assert_greater_equal(num_nonzeros, 0)
        assert_less_equal(num_nonzeros, params.shape[0])

        for c in xrange(params.shape[1]):
            indices = rng.choice(params.shape[0],
                                 size=num_nonzeros,
                                 replace=False)

            # normal dist with stddev=1.0
            params[indices, c] = rng.randn(num_nonzeros)

        # TODO: it's somewhat worrisome that the tutorial in
        # pylearn2.scripts.tutorials.multilayer_perceptron/
        #   multilayer_perceptron.ipynb
        # seems to do fine without scaling the weights like this
        if num_nonzeros > 0:
            params /= float(num_nonzeros)
            # Interestingly, while this seems more correct (normalize
            # columns to norm=1), it prevents the NN from converging.
            # params /= numpy.sqrt(float(num_nonzeros))

        shared_variable.set_value(params)

    # Initialize the affine layer weights (not the biases, and not the softmax
    # weights)
    for sparse_init_count, affine_node in safe_izip(sparse_init_counts,
                                                    affine_nodes[:-1]):
        # pylearn2 doesn't sparse_init the biases. I also found that
        # doing so slightly increases the final misclassification rate.
        init_sparse_linear(affine_node.linear_node.params,
                           sparse_init_count,
                           rng)

    return affine_nodes, last_node
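
# Note (added): unlike the copy of init_sparse_linear in Example #2, this
# variant draws the nonzero weights with stddev 1.0 and then divides the
# whole weight matrix by num_nonzeros, instead of dividing the samples by
# 255.0; the inline comments in each version explain the pixel-scaling
# rationale.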
Example #6
    def apply_subwindow_func(subwindow_func,
                             padded_images,
                             pads,
                             window_shape,
                             strides):
        '''
        Applies a sliding-window function to all subwindows of a feature map.

        Parameters
        ----------
        subwindow_func: function
          A function that takes a subwindow and returns a scalar.
          Input: tensor with shape [BATCH_SIZE, NUM_CHANNELS, ROWS, COLS]
          Output: tensor with shape [BATCH_SIZE, NUM_CHANNELS]

        padded_images: numpy.ndarray
          A feature map with shape [BATCH_SIZE, NUM_CHANNELS, ROWS, COLS].
          This has pads[0] rows and pads[1] columns of padding on each side.

        pads: Sequence
          [pad_rows, pad_columns], the # of padded rows and columns on each
          side of the image.

        window_shape: Sequence
          [NUM_ROWS, NUM_COLUMNS] of the sliding window.

        strides: Sequence
          [ROW_STRIDE, COLUMN_STRIDE] between applications of the window.

        '''
        assert_equal(padded_images.ndim, 4)
        assert_all_greater(padded_images.shape[2:], pads)
        _assert_is_shape2d(window_shape)
        _assert_is_shape2d(strides)

        pads, window_shape, strides = (numpy.asarray(a) for a in (pads,
                                                                  window_shape,
                                                                  strides))

        assert_all_greater(numpy.asarray(padded_images.shape[2:]), 2 * pads)

        # Check that pad region is full of the same value
        if pads[0] > 0:
            pad_value = padded_images[0, 0, 0, 0]
            assert_true(numpy.all(padded_images[:, :, :pads[0], :] ==
                                  pad_value))
            assert_true(numpy.all(padded_images[:, :, -pads[0]:, :] ==
                                  pad_value))

        if pads[1] > 0:
            pad_value = padded_images[0, 0, 0, 0]
            assert_true(numpy.all(padded_images[:, :, :, :pads[1]] ==
                                  pad_value))
            assert_true(numpy.all(padded_images[:, :, :, -pads[1]:] ==
                                  pad_value))

        rows, cols = (range(0,
                            padded_images.shape[i + 2] - window_shape[i] + 1,
                            strides[i])
                      for i in (0, 1))
        output_image = None

        for out_r, in_r in enumerate(rows):
            for out_c, in_c in enumerate(cols):
                subwindow = padded_images[:,
                                          :,
                                          in_r:(in_r + window_shape[0]),
                                          in_c:(in_c + window_shape[1])]
                output = subwindow_func(subwindow)
                assert_equal(output.ndim, 2)

                # check that subwindow_func preserved the batch size
                assert_equal(output.shape[0], padded_images.shape[0])
                assert_greater(output.shape[1], 0)

                if output_image is None:
                    output_image = numpy.zeros((output.shape[0],
                                                output.shape[1],
                                                len(rows),
                                                len(cols)),
                                               dtype=output.dtype)

                output_image[:, :, out_r, out_c] = output

        return output_image
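
    # Illustrative sketch (added; not from the original source): a
    # subwindow_func compatible with apply_subwindow_func just reduces the
    # two spatial axes of a [BATCH, CHANNELS, WINDOW_ROWS, WINDOW_COLS]
    # block down to [BATCH, CHANNELS]. For example, with numpy arrays:
    #
    #   def max_pool_subwindow(subwindow):
    #       return subwindow.max(axis=(2, 3))
    #
    #   def avg_pool_subwindow(subwindow):
    #       return subwindow.mean(axis=(2, 3))
    #
    # apply_subwindow_func(max_pool_subwindow, padded_images, pads,
    #                      window_shape, strides) then yields the reference
    # pooled feature map that a Pool2D node can be checked against.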
Example #7
def _sliding_window_2d_testimpl(expected_subwindow_funcs,
                                pad_values,
                                make_node_funcs,
                                make_pad_args_funcs,
                                rtol=None):
    '''
    Implementation of tests for 2D sliding-window nodes like Pool2D and Conv2d.

    Parameters
    ----------
    expected_subwindow_funcs: Sequence
      A Sequence of subwindow functions.
      These take a subwindow and return a scalar.
      Input: tensor with shape [BATCH_SIZE, NUM_CHANNELS, ROWS, COLS]
      Output: tensor with shape [BATCH_SIZE, NUM_CHANNELS]

    pad_values: Sequence
      A sequence of pad filler values to use for each of the
      expected_subwindow_funcs. For example, if expected_subwindow_funcs
      is [average_pool, max_pool], use [0.0, -numpy.inf].

    make_node_funcs: Sequence
      A Sequence of functions that create sliding-window Nodes to be tested
      against the ground-truth provided by the corresponding
      expected_subwindow_funcs. Its parameters are as follows:

      Parameters
      ----------
      input_node: Node
      window_shape: Sequence
        [NUM_ROWS, NUM_COLUMNS] of the sliding window.
      strides: Sequence
        [ROW_STRIDE, COLUMN_STRIDE], or how many rows/columns to skip between
        applications of the sliding window.

      pad: Sequence
        [ROW_PAD, COLUMN_PAD], or # of zero-padding rows/columns to add to each
        side of the image.

      axis_map: dict
        Maps strings to strings. Optional.
        If the node uses different axis names than 'b', 'c', '0', '1', this
        specifies the mapping from the node's axis names to 'b', 'c', '0', '1'.

    make_pad_args_funcs: Sequence
      A Sequence of functions that take a window_shape arg (2d array) and
      return an Iterable of 'pad' arguments, which can be strings or 2d arrays
      of ints.

    rtol: float, optional
      Relative tolerance to use in the final assert_allclose comparison.
    '''

    assert_is_instance(expected_subwindow_funcs, Sequence)
    assert_is_instance(pad_values, Sequence)
    assert_is_instance(make_node_funcs, Sequence)

    # TODO: change this to construct a Toeplitz matrix out of padded_images,
    # so we get a giant stack of C X WR X WC matrices, which can then be fed
    # to subwindow_func as a single batch.
    # See scipy.linalg.toeplitz
    def apply_subwindow_func(subwindow_func,
                             padded_images,
                             pads,
                             window_shape,
                             strides):
        '''
        Applies a sliding-window function to all subwindows of a feature map.

        Parameters
        ----------
        subwindow_func: function
          A function that takes a subwindow and returns a scalar.
          Input: tensor with shape [BATCH_SIZE, NUM_CHANNELS, ROWS, COLS]
          Output: tensor with shape [BATCH_SIZE, NUM_CHANNELS]

        padded_images: numpy.ndarray
          A feature map with shape [BATCH_SIZE, NUM_CHANNELS, ROWS, COLS].
          This has pads[0] rows and pads[1] columns of padding on each side.

        pads: Sequence
          [pad_rows, pad_columns], the # of padded rows and columns on each
          side of the image.

        window_shape: Sequence
          [NUM_ROWS, NUM_COLUMNS] of the sliding window.

        strides: Sequence
          [ROW_STRIDE, COLUMN_STRIDE] between applications of the window.

        '''
        assert_equal(padded_images.ndim, 4)
        assert_all_greater(padded_images.shape[2:], pads)
        _assert_is_shape2d(window_shape)
        _assert_is_shape2d(strides)

        pads, window_shape, strides = (numpy.asarray(a) for a in (pads,
                                                                  window_shape,
                                                                  strides))

        assert_all_greater(numpy.asarray(padded_images.shape[2:]), 2 * pads)

        # Check that pad region is full of the same value
        if pads[0] > 0:
            pad_value = padded_images[0, 0, 0, 0]
            assert_true(numpy.all(padded_images[:, :, :pads[0], :] ==
                                  pad_value))
            assert_true(numpy.all(padded_images[:, :, -pads[0]:, :] ==
                                  pad_value))

        if pads[1] > 0:
            pad_value = padded_images[0, 0, 0, 0]
            assert_true(numpy.all(padded_images[:, :, :, :pads[1]] ==
                                  pad_value))
            assert_true(numpy.all(padded_images[:, :, :, -pads[1]:] ==
                                  pad_value))

        rows, cols = (range(0,
                            padded_images.shape[i + 2] - window_shape[i] + 1,
                            strides[i])
                      for i in (0, 1))
        output_image = None

        for out_r, in_r in enumerate(rows):
            for out_c, in_c in enumerate(cols):
                subwindow = padded_images[:,
                                          :,
                                          in_r:(in_r + window_shape[0]),
                                          in_c:(in_c + window_shape[1])]
                output = subwindow_func(subwindow)
                assert_equal(output.ndim, 2)

                # check that subwindow_func preserved the batch size
                assert_equal(output.shape[0], padded_images.shape[0])
                assert_greater(output.shape[1], 0)

                if output_image is None:
                    output_image = numpy.zeros((output.shape[0],
                                                output.shape[1],
                                                len(rows),
                                                len(cols)),
                                               dtype=output.dtype)

                output_image[:, :, out_r, out_c] = output

        return output_image

    max_stride = 3
    max_window_size = 3
    batch_size = 2
    num_channels = 2
    input_dtype = numpy.dtype('int')

    max_pad = max_window_size + 1

    assert_greater_equal(max_pad, 0)

    rng = numpy.random.RandomState(352)

    def get_padded_image(max_padded_images, pads):
        def margin_to_slice(margin):
            assert_greater_equal(margin, 0)
            if margin == 0:
                return slice(None, None)
            else:
                return slice(margin, -margin)

        return max_padded_images[:,
                                 :,
                                 margin_to_slice(max_pad - pads[0]),
                                 margin_to_slice(max_pad - pads[1])]


    def get_pads_from_pad_arg(pad_arg, window_shape):
        '''
        Converts a valid pad argument (str or 2-int Sequence) to
        an equivalent 2-int numpy.ndarray.
        '''
        window_shape = numpy.asarray(window_shape)
        _assert_is_shape2d(window_shape)

        if isinstance(pad_arg, basestring):
            if pad_arg == 'full':
                return window_shape - 1
            elif pad_arg == 'valid':
                return numpy.asarray([0, 0])
            elif pad_arg == 'same_shape':
                assert_true((window_shape % 2 != 0).all())
                return window_shape // 2
            else:
                raise ValueError("Unrecognized pad name: '%s'" % pad_arg)
        else:
            _assert_is_shape2d(pad_arg)
            return numpy.asarray(pad_arg)


    prod = itertools.product

    for (expected_func,
         pad_value,
         make_node_func,
         make_pad_args_func) in safe_izip(expected_subwindow_funcs,
                                          pad_values,
                                          make_node_funcs,
                                          make_pad_args_funcs):

        # An image with the maximum amount of padding.  We will vary the amount
        # of padding in practice by taking centered subwindows of this image.
        max_padded_images = numpy.empty((batch_size,
                                         num_channels,
                                         max_pad * 2 + max_window_size + 1,
                                         max_pad * 2 + max_window_size + 4),
                                        dtype=input_dtype)
        max_padded_images[...] = pad_value

        images = get_padded_image(max_padded_images, (0, 0))
        images[...] = rng.random_integers(low=-10, high=10, size=images.shape)
        #images[...] = numpy.arange(images.size).reshape(images.shape)
        assert_all_greater(images.shape, 0)

        if max_pad == 0:
            assert_array_equal(images, max_padded_images)
        else:
            assert_array_equal(images, max_padded_images[:,
                                                         :,
                                                         max_pad:-max_pad,
                                                         max_pad:-max_pad])

        # Make input_nodes with weird axis names and axis order
        axis_map = {'b': 'b', 'see': 'c', 'zero': '0', 'one': '1'}
        input_node_axes = ('b', 'see', 'zero', 'one')
        # input_node_axes = ('b', 'zero', 'see', 'one')
        transpose_indices = [('b', 'see', 'zero', 'one').index(a)
                             for a in input_node_axes]
        input_node_shape = [images.shape[t] for t in transpose_indices]
        input_node_shape[input_node_axes.index('b')] = -1
        input_node = InputNode(DenseFormat(axes=input_node_axes,
                                           shape=input_node_shape,
                                           dtype=input_dtype))

        # Loops through all possible window_shapes, pads (including padding
        # bigger than the window shape), strides.
        for window_shape in prod(range(1, max_window_size + 1), repeat=2):
            window_shape = numpy.asarray(window_shape)

            # for pad_arg in get_pad_args(window_shape, supports_padding):
            for pad_arg in make_pad_args_func(window_shape):
                # can't use same_shape padding with even window dims
                if pad_arg == 'same_shape' and (window_shape % 2 == 0).any():
                    continue

                pads = get_pads_from_pad_arg(pad_arg, window_shape)
                padded_images = get_padded_image(max_padded_images, pads)
                assert_array_equal(numpy.asarray(padded_images.shape[2:]),
                                   (2 * pads) +
                                   numpy.asarray(images.shape[2:]))

                for strides in prod(range(1, max_stride + 1), repeat=2):
                    expected_images = apply_subwindow_func(expected_func,
                                                           padded_images,
                                                           pads,
                                                           window_shape,
                                                           strides)

                    # If pads are bigger than window_size, expect an exception
                    # when creating the node.
                    if not isinstance(pads, basestring) and \
                       numpy.any(pads >= window_shape):
                        assert_raises_regexp(AssertionError,
                                             "Not all pads",
                                             make_node_func,
                                             input_node,
                                             window_shape=window_shape,
                                             strides=strides,
                                             pads=pad_arg,
                                             axis_map=axis_map)
                    else:
                        node = make_node_func(input_node,
                                              window_shape=window_shape,
                                              strides=strides,
                                              pads=pad_arg,
                                              axis_map=axis_map)

                        node_func = theano.function([input_node.output_symbol],
                                                    node.output_symbol)
                        transposed_images = images.transpose(transpose_indices)
                        actual_images = node_func(transposed_images)

                        node.output_format.check(actual_images)
                        # try:
                        #     node.output_format.check(actual_images)
                        # except AssertionError:
                        #     pdb.set_trace()

                        kwargs = {}
                        if rtol is not None:
                            kwargs['rtol'] = rtol

                        # pylint: disable=star-args
                        assert_allclose(actual_images,
                                        expected_images,
                                        **kwargs)