def build_conv_classifier(input_node,
                          filter_shapes,
                          filter_counts,
                          filter_init_uniform_ranges,
                          pool_shapes,
                          pool_strides,
                          affine_output_sizes,
                          affine_init_stddevs,
                          dropout_include_rates,
                          conv_pads,
                          rng,
                          theano_rng):
    '''
    Builds a classification convnet on top of input_node.

    Returns
    -------
    rval: tuple
      (conv_nodes, affine_nodes, output_node), where:
        conv_nodes is a list of the Conv2d nodes.
        affine_nodes is a list of the AffineNodes.
        output_node is the final node, a Softmax.
    '''

    assert_is_instance(input_node, Lcn)

    conv_shape_args = (filter_shapes, pool_shapes, pool_strides)

    for conv_shapes in conv_shape_args:
        for conv_shape in conv_shapes:
            assert_all_integer(conv_shape)
            assert_all_greater(conv_shape, 0)

    conv_args = conv_shape_args + (filter_counts, filter_init_uniform_ranges)
    assert_all_equal([len(c) for c in conv_args])

    assert_equal(len(affine_output_sizes), len(affine_init_stddevs))

    assert_equal(len(dropout_include_rates),
                 len(filter_shapes) + len(affine_output_sizes))

    assert_equal(affine_output_sizes[-1], 10)  # for MNIST

    # assert_equal(input_node.output_format.axes, ('b', '0', '1'))

    #
    # Done sanity-checking args.
    #

    input_shape = input_node.output_format.shape

    # Converts from MNIST's ('b', '0', '1') to ('b', 'c', '0', '1')
    last_node = input_node

    conv_dropout_include_rates = \
        dropout_include_rates[:len(filter_shapes)]

    # Adds a dropout-conv-bias-relu-maxpool stack for each element in
    # filter_XXXX

    conv_layers = []

    def uniform_init(rng, params, init_range):
        '''
        Fills params with values uniformly sampled from
        [-init_range, init_range].
        '''

        assert_floating(init_range)
        assert_greater_equal(init_range, 0)

        values = params.get_value()
        values[...] = rng.uniform(low=-init_range,
                                  high=init_range,
                                  size=values.shape)
        params.set_value(values)

    for (filter_shape,
         filter_count,
         filter_init_range,
         pool_shape,
         pool_stride,
         conv_dropout_include_rate,
         conv_pad) in safe_izip(filter_shapes,
                                filter_counts,
                                filter_init_uniform_ranges,
                                pool_shapes,
                                pool_strides,
                                conv_dropout_include_rates,
                                conv_pads):
        if conv_dropout_include_rate != 1.0:
            last_node = Dropout(last_node,
                                conv_dropout_include_rate,
                                theano_rng)

        last_node = Conv2dLayer(last_node,
                                filter_shape,
                                filter_count,
                                conv_pads=conv_pad,
                                pool_window_shape=pool_shape,
                                pool_strides=pool_stride,
                                pool_pads='pylearn2')
        conv_layers.append(last_node)

        uniform_init(rng, last_node.conv2d_node.filters, filter_init_range)

    affine_dropout_include_rates = dropout_include_rates[len(filter_shapes):]

    affine_layers = []

    def normal_distribution_init(rng, params, stddev):
        '''
        Fills params with values sampled from a zero-mean normal
        distribution with standard deviation stddev.
        '''

        assert_floating(stddev)
        assert_greater_equal(stddev, 0)

        values = params.get_value()
        values[...] = rng.standard_normal(values.shape) * stddev
        params.set_value(values)

    #
    # Adds a dropout-affine-relu stack for each element in affine_XXXX,
    # except for the last one, where it omits the dropout.
    #

    # Add a fully connected layer here:
    output_format = DenseFormat(axes=('b', 'f'),
                                shape=(-1, 500),
                                dtype=None)

    # This layer precedes the affine loop, so it uses the first affine
    # dropout rate. (The original compared the whole list to 1.0, which
    # never added the Dropout node.)
    if affine_dropout_include_rates[0] != 1.0:
        last_node = Dropout(last_node,
                            affine_dropout_include_rates[0],
                            theano_rng)

    last_node = AffineLayer(last_node,
                            output_format,
                            input_to_bf_map={('0', '1', 'c'): 'f'})
    affine_layers.append(last_node)
    normal_distribution_init(rng,
                             last_node.affine_node.linear_node.params,
                             0.05)

    for (affine_size,
         affine_init_stddev,
         affine_dropout_include_rate) in \
        safe_izip(affine_output_sizes,
                  affine_init_stddevs,
                  affine_dropout_include_rates):

        '''
        if affine_dropout_include_rate < 1.0:
            last_node = Dropout(last_node,
                                affine_dropout_include_rate,
                                theano_rng)
        '''

        # No need to supply an axis map for the first affine transform.
        # By default, it collapses all non-'b' axes into a feature vector,
        # which is what we want.

        # remap from bc01 to b01c before flattening to bf, as pylearn2 does,
        # just so that they do identical things.
        last_node = SoftmaxLayer(last_node,
                                 DenseFormat(axes=('b', 'f'),
                                             shape=(-1, affine_size),
                                             dtype=None))
                                 # input_to_bf_map={('0', '1', 'c'): 'f'})
        normal_distribution_init(rng,
                                 last_node.affine_node.linear_node.params,
                                 affine_init_stddev)
        # stddev_init(rng, last_node.bias_node.params, affine_init_stddev)
        affine_layers.append(last_node)

    return conv_layers, affine_layers, last_node
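
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): a numpy-only
# demonstration of what uniform_init and normal_distribution_init above do,
# using a hypothetical stand-in with the get_value()/set_value() interface of
# a Theano shared variable. Shapes are made up; call it manually to run it.
# ---------------------------------------------------------------------------
def _sketch_weight_inits():
    import numpy

    class FakeShared(object):
        '''Hypothetical stand-in for a Theano shared variable.'''
        def __init__(self, array):
            self._array = numpy.array(array)

        def get_value(self):
            return self._array.copy()

        def set_value(self, new_value):
            self._array = numpy.array(new_value)

    rng = numpy.random.RandomState(0)
    filters = FakeShared(numpy.zeros((16, 1, 5, 5)))  # conv filters
    weights = FakeShared(numpy.zeros((500, 10)))      # affine weights

    # uniform_init: values drawn uniformly from [-init_range, init_range]
    values = filters.get_value()
    values[...] = rng.uniform(low=-0.05, high=0.05, size=values.shape)
    filters.set_value(values)

    # normal_distribution_init: values drawn from N(0, 1), scaled by stddev
    values = weights.get_value()
    values[...] = rng.standard_normal(values.shape) * 0.05
    weights.set_value(values)

    print(filters.get_value().min())  # >= -0.05
    print(filters.get_value().max())  # <= 0.05
    print(weights.get_value().std())  # roughly 0.05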
def build_fc_classifier(input_node,
                        sizes,
                        sparse_init_counts,
                        dropout_include_probabilities,
                        rng,
                        theano_rng):
    """
    Builds a stack of fully-connected layers followed by a Softmax.

    Each hidden layer will be preceded by a ReLU.

    Initialization:

    Weights are initialized in the same way as in Pylearn2's MLP tutorial:
    pylearn2/scripts/tutorials/multilayer_perceptron/mlp_tutorial_part_3.yaml

    This means the following:

    Of the N affine layers, the weights of the first N-1 are all set to 0.0,
    except for k randomly-chosen elements, which are set to some random
    number drawn from the normal distribution with stddev=1.0.

    The biases are all initialized to 0.0.

    The last layer's weights and biases are both set to 0.0.

    Parameters
    ----------
    input_node: Node
      The node to build the stack on.

    sizes: Sequence
      A sequence of ints, indicating the output sizes of each layer.
      The last int is the number of classes.

    sparse_init_counts: Sequence
      A sequence of N-1 ints, where N = len(sizes).
      Used to initialize the weights of the first N-1 layers.
      If the n'th element is x, this means that the n'th layer
      will have x nonzeros, with the rest initialized to zeros.

    dropout_include_probabilities: Sequence
      A Sequence of N floats, where N := len(sizes).
      The i'th value is the dropout include probability applied to the
      input of the i'th layer. If any of these probabilities is 1.0, the
      corresponding Dropout node will be omitted.

    rng: numpy.random.RandomState
      The RandomState to draw initial weights from.

    theano_rng: theano.tensor.shared_randomstreams.RandomStreams
      The RandomStreams to draw dropout masks from.

    Returns
    -------
    rval: tuple
      (affine_nodes, output_node), where affine_nodes is a list of the
      AffineNodes, in order, and output_node is the final node, a Softmax.
    """
    assert_is_instance(input_node, Node)

    # pylint: disable=no-member
    assert_equal(input_node.output_format.dtype,
                 numpy.dtype(theano.config.floatX))

    assert_greater(len(sizes), 0)
    assert_all_greater(sizes, 0)

    assert_equal(len(sparse_init_counts), len(sizes) - 1)
    assert_all_integer(sparse_init_counts)
    assert_all_greater(sparse_init_counts, 0)
    assert_all_less_equal(sparse_init_counts, sizes[:-1])

    assert_equal(len(dropout_include_probabilities), len(sizes))

    affine_nodes = []

    last_node = input_node

    for layer_index, layer_output_size in enumerate(sizes):
        # Add dropout, if asked for
        include_probability = dropout_include_probabilities[layer_index]
        if include_probability != 1.0:
            last_node = Dropout(last_node, include_probability, theano_rng)

        output_format = DenseFormat(axes=("b", "f"),
                                    shape=(-1, layer_output_size),
                                    dtype=None)

        if layer_index < (len(sizes) - 1):
            last_node = AffineLayer(last_node, output_format)
        else:
            last_node = SoftmaxLayer(last_node, output_format)

        affine_nodes.append(last_node.affine_node)

    # Not used in this demo, but keeping it in, in case we want to start
    # using it again.
    def init_sparse_bias(shared_variable, num_nonzeros, rng):
        """
        Mimics the sparse initialization in
        pylearn2.models.mlp.Linear.set_input_space()
        """

        params = shared_variable.get_value()
        assert_equal(params.shape[0], 1)

        assert_greater_equal(num_nonzeros, 0)
        assert_less_equal(num_nonzeros, params.shape[1])

        params[...] = 0.0

        indices = rng.choice(params.size,
                             size=num_nonzeros,
                             replace=False)

        # normal dist with stddev=1.0
        params[0, indices] = rng.randn(num_nonzeros)

        # Found that for biases, this didn't help (it increased the
        # final misclassification rate by .001)
        # if num_nonzeros > 0:
        #     params /= float(num_nonzeros)

        shared_variable.set_value(params)

    def init_sparse_linear(shared_variable, num_nonzeros, rng):
        params = shared_variable.get_value()
        params[...] = 0.0

        assert_greater_equal(num_nonzeros, 0)
        assert_less_equal(num_nonzeros, params.shape[0])

        for c in xrange(params.shape[1]):
            indices = rng.choice(params.shape[0],
                                 size=num_nonzeros,
                                 replace=False)

            # normal dist with stddev=1.0, divided by 255.0
            #
            # We need to divide by 255 for convergence. This is because
            # we're using unnormalized (i.e. 0 to 255) pixel values, unlike
            # the 0.0-to-1.0 pixels in
            # pylearn2.scripts.tutorials.multilayer_perceptron/
            #
            # We could just do as the above tutorial does and normalize the
            # pixels to [0.0, 1.0], and not rescale the weights. However,
            # experiments show that this converges to a higher error, and
            # also makes mnist_visualizer.py's results look very "staticky",
            # without any recognizable digit hallucinations.
            params[indices, c] = rng.randn(num_nonzeros) / 255.0

        shared_variable.set_value(params)

    # Initialize the affine layer weights (not the biases, and not the
    # softmax weights)
    for sparse_init_count, affine_node in safe_izip(sparse_init_counts,
                                                    affine_nodes[:-1]):
        # pylearn2 doesn't sparse_init the biases. I also found that
        # doing so slightly increases the final misclassification rate.
        init_sparse_linear(affine_node.linear_node.params,
                           sparse_init_count,
                           rng)

    return affine_nodes, last_node
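
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): a numpy-only version
# of the init_sparse_linear logic above. The shape and nonzero count below
# are hypothetical.
# ---------------------------------------------------------------------------
def _sketch_sparse_init(shape=(784, 500), num_nonzeros=15, scale=255.0):
    '''
    Builds a zero matrix with num_nonzeros normally-distributed entries per
    output column, divided by `scale` (255.0 for raw 0-255 pixel inputs).
    '''
    import numpy

    rng = numpy.random.RandomState(1234)
    params = numpy.zeros(shape)

    for c in range(shape[1]):
        indices = rng.choice(shape[0], size=num_nonzeros, replace=False)
        params[indices, c] = rng.randn(num_nonzeros) / scale

    # Every column ends up with exactly num_nonzeros nonzero weights.
    assert ((params != 0).sum(axis=0) == num_nonzeros).all()
    return params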
def build_conv_classifier(input_node, filter_shapes, filter_counts, filter_init_uniform_ranges, pool_shapes, pool_strides, affine_output_sizes, affine_init_stddevs, dropout_include_rates, conv_pads, rng, theano_rng): ''' Builds a classification convnet on top of input_node. Returns ------- rval: tuple (conv_nodes, affine_nodes, output_node), where: conv_nodes is a list of the Conv2d nodes. affine_nodes is a list of the AffineNodes. output_node is the final node, a Softmax. ''' assert_is_instance(input_node, Lcn) conv_shape_args = (filter_shapes, pool_shapes, pool_strides) for conv_shapes in conv_shape_args: for conv_shape in conv_shapes: assert_all_integer(conv_shape) assert_all_greater(conv_shape, 0) conv_args = conv_shape_args + (filter_counts, filter_init_uniform_ranges) assert_all_equal([len(c) for c in conv_args]) assert_equal(len(affine_output_sizes), len(affine_init_stddevs)) assert_equal(len(dropout_include_rates), len(filter_shapes) + len(affine_output_sizes)) assert_equal(affine_output_sizes[-1], 10) # for MNIST #assert_equal(input_node.output_format.axes, ('b', '0', '1')) # # Done sanity-checking args. # input_shape = input_node.output_format.shape # Converts from MNIST's ('b', '0', '1') to ('b', 'c', '0', '1') last_node = input_node conv_dropout_include_rates = \ dropout_include_rates[:len(filter_shapes)] # Adds a dropout-conv-bias-relu-maxpool stack for each element in # filter_XXXX conv_layers = [] def uniform_init(rng, params, init_range): ''' Fills params with values uniformly sampled from [-init_range, init_range] ''' assert_floating(init_range) assert_greater_equal(init_range, 0) values = params.get_value() values[...] = rng.uniform(low=-init_range, high=init_range, size=values.shape) params.set_value(values) for (filter_shape, filter_count, filter_init_range, pool_shape, pool_stride, conv_dropout_include_rate, conv_pad) in safe_izip(filter_shapes, filter_counts, filter_init_uniform_ranges, pool_shapes, pool_strides, conv_dropout_include_rates, conv_pads): if conv_dropout_include_rate != 1.0: last_node = Dropout(last_node, conv_dropout_include_rate, theano_rng) last_node = Conv2dLayer(last_node, filter_shape, filter_count, conv_pads=conv_pad, pool_window_shape=pool_shape, pool_strides=pool_stride, pool_pads='pylearn2') conv_layers.append(last_node) uniform_init(rng, last_node.conv2d_node.filters, filter_init_range) affine_dropout_include_rates = dropout_include_rates[len(filter_shapes):] affine_layers = [] def normal_distribution_init(rng, params, stddev): ''' Fills params with values uniformly sampled from [-init_range, init_range] ''' assert_floating(stddev) assert_greater_equal(stddev, 0) values = params.get_value() values[...] = rng.standard_normal(values.shape) * stddev params.set_value(values) # # Adds a dropout-affine-relu stack for each element in affine_XXXX, # except for the last one, where it omits the dropout. 
# # Add a fully connected layer here: output_format = DenseFormat(axes=('b', 'f'), shape=(-1, 500), dtype=None) if affine_dropout_include_rates < 1.0: last_node = Dropout(last_node, affine_dropout_include_rates, theano_rng) last_node = AffineLayer(last_node, output_format, input_to_bf_map={('0', '1', 'c'): 'f'}) affine_layers.append(last_node) normal_distribution_init(rng, last_node.affine_node.linear_node.params, 0.05) for (affine_size, affine_init_stddev, affine_dropout_include_rate) in \ safe_izip(affine_output_sizes, affine_init_stddevs, affine_dropout_include_rates): ''' if affine_dropout_include_rate < 1.0: last_node = Dropout(last_node, affine_dropout_include_rate, theano_rng) ''' # No need to supply an axis map for the first affine transform. # By default, it collapses all non-'b' axes into a feature vector, # which is what we want. # remap from bc01 to b01c before flattening to bf, as pylearn2 does, # just so that they do identical things. last_node = SoftmaxLayer(last_node, DenseFormat(axes=('b', 'f'), shape=(-1, affine_size), dtype=None)) #input_to_bf_map={('0', '1', 'c'): 'f'}) normal_distribution_init(rng, last_node.affine_node.linear_node.params, affine_init_stddev) # stddev_init(rng, last_node.bias_node.params, affine_init_stddev) affine_layers.append(last_node) ################################################################################################# ### BUILD THE SECOND NETWORK WITH FLAT PARAMETERS (given the dimensions of the first) ########### ################################################################################################# rng = numpy.random.RandomState(281934) std_deviation = .05 # Fetch all parameters and shapes parameters = [] for conv_layer in conv_layers: filters = conv_layer.conv2d_node.filters parameters.append(filters) bias = conv_layer.bias_node.params parameters.append(bias) for affine_layer in affine_layers: weights = affine_layer.affine_node.linear_node.params parameters.append(weights) biases = affine_layer.affine_node.bias_node.params parameters.append(biases) ''' print(len(parameters)) for parameter in parameters: print(parameter.get_value().shape) shapes = [] params_flat_values = numpy.asarray([], dtype=theano.config.floatX) counter = 0 for parameter in parameters: shape = parameter.get_value().shape if counter%2 == 0 and len(shape)==4: vector_param = numpy.asarray(numpy.ndarray.flatten(parameter.get_value()), dtype=theano.config.floatX) vector_param[...] = rng.standard_normal(vector_param.shape) * std_deviation col_length = shape[2] index_from = 0 ### #for _ in range(shape[0]*shape[1]*shape[3]): # index_to = index_from + col_length # vector_param[index_from:index_to] = vector_param[index_from:index_to]/numpy.linalg.norm(vector_param[index_from:index_to]) # index_from = index_to #### elif counter%2==0: vector_param = numpy.asarray(numpy.ndarray.flatten(parameter.get_value()), dtype=theano.config.floatX) vector_param[...] 
= rng.standard_normal(vector_param.shape) * std_deviation else: vector_param = numpy.asarray(numpy.ndarray.flatten(parameter.get_value()), dtype=theano.config.floatX) params_flat_values = numpy.append(params_flat_values, vector_param) shapes.append(shape) ''' params_flat_values = numpy.asarray([], dtype=theano.config.floatX) shapes = [] for parameter in parameters: parameter_value = parameter.get_value() shapes.append(parameter_value.shape) vector_param = numpy.asarray(numpy.ndarray.flatten(parameter_value)) params_flat_values = numpy.append(params_flat_values, vector_param) print(parameter.get_value().shape) print(params_flat_values) print(params_flat_values.shape) params_flat = theano.shared(params_flat_values) params_old_flat = theano.shared(params_flat_values) ''' Builds a classification convnet on top of input_node. Returns ------- rval: tuple (conv_nodes, affine_nodes, output_node), where: conv_nodes is a list of the Conv2d nodes. affine_nodes is a list of the AffineNodes. output_node is the final node, a Softmax. ''' assert_is_instance(input_node, Lcn) conv_shape_args = (filter_shapes, pool_shapes, pool_strides) for conv_shapes in conv_shape_args: for conv_shape in conv_shapes: assert_all_integer(conv_shape) assert_all_greater(conv_shape, 0) conv_args = conv_shape_args + (filter_counts, filter_init_uniform_ranges) assert_all_equal([len(c) for c in conv_args]) assert_equal(len(affine_output_sizes), len(affine_init_stddevs)) assert_equal(len(dropout_include_rates), len(filter_shapes) + len(affine_output_sizes)) assert_equal(affine_output_sizes[-1], 10) # for MNIST #assert_equal(input_node.output_format.axes, ('b', '0', '1')) # # Done sanity-checking args. # input_shape = input_node.output_format.shape # Converts from MNIST's ('b', '0', '1') to ('b', 'c', '0', '1') last_node = input_node conv_dropout_include_rates = \ dropout_include_rates[:len(filter_shapes)] # Adds a dropout-conv-bias-relu-maxpool stack for each element in # filter_XXXX conv_layers = [] counter = 0 index_from = 0 for (filter_shape, filter_count, filter_init_range, pool_shape, pool_stride, conv_dropout_include_rate, conv_pad) in safe_izip(filter_shapes, filter_counts, filter_init_uniform_ranges, pool_shapes, pool_strides, conv_dropout_include_rates, conv_pads): if conv_dropout_include_rate != 1.0: last_node = Dropout(last_node, conv_dropout_include_rate, theano_rng) print(shapes) shape1 = shapes[counter] shape2 = shapes[counter+1] size1= numpy.prod(numpy.asarray(shape1)) size2= numpy.prod(numpy.asarray(shape2)) index_to = index_from + size1 #filters_ = theano.tensor.transpose(params_flat[index_from:index_to].reshape(shape1), axes=[0,1,3,2]) filters_ = params_flat[index_from:index_to].reshape(shape1) index_from = index_to index_to = index_from + size2 bias_ = params_flat[index_from:index_to].reshape(shape2) index_from = index_to last_node = Conv2dLayer(last_node, filter_shape, filter_count, conv_pads=conv_pad, pool_window_shape=pool_shape, pool_strides=pool_stride, pool_pads='pylearn2', filters=filters_, bias=bias_) conv_layers.append(last_node) counter = counter + 2 affine_dropout_include_rates = dropout_include_rates[len(filter_shapes):] affine_layers = [] # # Adds a dropout-affine-relu stack for each element in affine_XXXX, # except for the last one, where it omits the dropout. 
# # Add a fully connected layer here: shape1 = shapes[counter] #shape1 = (shape1[1], shape1[0]) shape2 = shapes[counter+1] size1= numpy.prod(numpy.asarray(shape1)) size2= numpy.prod(numpy.asarray(shape2)) index_to = index_from + size1 weights_ = params_flat[index_from:index_to].reshape(shape1) index_from = index_to index_to = index_from + size2 bias_ = params_flat[index_from:index_to].reshape(shape2) index_from = index_to output_format = DenseFormat(axes=('b', 'f'), shape=(-1, 500), dtype=None) if affine_dropout_include_rates < 1.0: last_node = Dropout(last_node, affine_dropout_include_rates, theano_rng) last_node = AffineLayer(last_node, output_format, weights=weights_, bias=bias_, input_to_bf_map={('0', '1', 'c'): 'f'}) affine_layers.append(last_node) counter += 2 for (affine_size, affine_init_stddev, affine_dropout_include_rate) in \ safe_izip(affine_output_sizes, affine_init_stddevs, affine_dropout_include_rates): if affine_dropout_include_rate < 1.0: last_node = Dropout(last_node, affine_dropout_include_rate, theano_rng) # No need to supply an axis map for the first affine transform. # By default, it collapses all non-'b' axes into a feature vector, # which is what we want. shape1 = shapes[counter] #shape1 = (shape1[1], shape1[0]) shape2 = shapes[counter+1] size1= numpy.prod(numpy.asarray(shape1)) size2= numpy.prod(numpy.asarray(shape2)) index_to = index_from + size1 weights_ = params_flat[index_from:index_to].reshape(shape1) index_from = index_to index_to = index_from + size2 bias_ = params_flat[index_from:index_to].reshape(shape2) index_from = index_to # remap from bc01 to b01c before flattening to bf, as pylearn2 does, # just so that they do identical things. last_node = SoftmaxLayer(last_node, DenseFormat(axes=('b', 'f'), shape=(-1, affine_size), dtype=None), weights=weights_, bias=bias_) #input_to_bf_map={('0', '1', 'c'): 'f'}) # stddev_init(rng, last_node.bias_node.params, affine_init_stddev) affine_layers.append(last_node) counter += 2 return conv_layers, affine_layers, last_node, params_flat, params_old_flat, shapes
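
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the flat-parameter
# bookkeeping used above, in plain numpy. A single 1-D vector is cut into
# consecutive blocks, one per parameter shape, in the same order the shapes
# were recorded. The shapes below are hypothetical; with Theano, the same
# slicing applies to params_flat[index_from:index_to].reshape(shape).
# ---------------------------------------------------------------------------
def _sketch_flat_param_slicing():
    import numpy

    def slice_flat(flat_values, shapes):
        views = []
        index_from = 0
        for shape in shapes:
            size = int(numpy.prod(shape))
            views.append(flat_values[index_from:index_from + size]
                         .reshape(shape))
            index_from += size
        assert index_from == flat_values.size  # every entry used exactly once
        return views

    # One conv layer (filters, bias) followed by one affine layer
    # (weights, bias).
    shapes = [(16, 1, 5, 5), (16,), (500, 10), (10,)]
    total = sum(int(numpy.prod(s)) for s in shapes)
    flat = numpy.arange(total, dtype='float32')

    filters, conv_bias, weights, affine_bias = slice_flat(flat, shapes)
    return filters.shape, conv_bias.shape, weights.shape, affine_bias.shape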
def build_fc_classifier(input_node, sizes, sparse_init_counts, dropout_include_probabilities, rng, theano_rng): ''' Builds a stack of fully-connected layers followed by a Softmax. Each hidden layer will be preceded by a ReLU. Initialization: Weights are initialized in the same way as in Pylearn2's MLP tutorial: pylearn2/scripts/tutorials/multilayer_perceptron/mlp_tutorial_part_3.yaml This means the following: Of the N affine layers, the weights of the first N-1 are to all 0.0, except for k randomly-chosen elements, which are set to some random number drawn from the normal distribution with stddev=1.0. The biases are all initialized to 0.0. The last layer's weights and biases are both set to 0.0. Parameters ---------- input_node: Node The node to build the stack on. sizes: Sequence A sequence of ints, indicating the output sizes of each layer. The last int is the number of classes. sparse_init_counts: A sequence of N-1 ints, where N = len(sizes). Used to initialize the weights of the first N-1 layers. If the n'th element is x, this means that the n'th layer will have x nonzeros, with the rest initialized to zeros. dropout_include_probabilities: Sequence A Sequence of N-1 floats, where N := len(sizes) The dropout include probabilities for the outputs of each of the layers, except for the final one. If any of these probabilities is 1.0, the corresponding Dropout node will be omitted. rng: numpy.random.RandomState The RandomState to draw initial weights from. theano_rng: theano.tensor.shared_randomstreams.RandomStreams The RandomStreams to draw dropout masks from. Returns ------- rval: tuple (affine_nodes, output_node), where affine_nodes is a list of the AffineNodes, in order, and output_node is the final node, a Softmax. ''' assert_is_instance(input_node, Node) # pylint: disable=no-member assert_equal(input_node.output_format.dtype, numpy.dtype(theano.config.floatX)) assert_greater(len(sizes), 0) assert_all_greater(sizes, 0) assert_equal(len(sparse_init_counts), len(sizes) - 1) assert_all_integer(sparse_init_counts) assert_all_greater(sparse_init_counts, 0) assert_all_less_equal(sparse_init_counts, sizes[:-1]) assert_equal(len(dropout_include_probabilities), len(sizes)) ''' affine_nodes = [] last_node = input_node for layer_index, layer_output_size in enumerate(sizes): # Add dropout, if asked for include_probability = dropout_include_probabilities[layer_index] if include_probability != 1.0: last_node = Dropout(last_node, include_probability, theano_rng) output_format = DenseFormat(axes=('b', 'f'), shape=(-1, layer_output_size), dtype=None) if layer_index < (len(sizes) - 1): last_node = AffineLayer(last_node, output_format) else: last_node = SoftmaxLayer(last_node, output_format) affine_nodes.append(last_node.affine_node) # Not used in this demo, but keeping it in in case we want to start using # it again. def init_sparse_bias(shared_variable, num_nonzeros, rng): #Mimics the sparse initialization in #pylearn2.models.mlp.Linear.set_input_space() params = shared_variable.get_value() assert_equal(params.shape[0], 1) assert_greater_equal(num_nonzeros, 0) assert_less_equal(num_nonzeros, params.shape[1]) params[...] 
= 0.0 indices = rng.choice(params.size, size=num_nonzeros, replace=False) # normal dist with stddev=1.0 params[0, indices] = rng.randn(num_nonzeros) # Found that for biases, this didn't help (it increased the # final misclassification rate by .001) # if num_nonzeros > 0: # params /= float(num_nonzeros) shared_variable.set_value(params) def init_sparse_linear(shared_variable, num_nonzeros, rng): params = shared_variable.get_value() params[...] = 0.0 assert_greater_equal(num_nonzeros, 0) assert_less_equal(num_nonzeros, params.shape[0]) for c in xrange(params.shape[1]): indices = rng.choice(params.shape[0], size=num_nonzeros, replace=False) # normal dist with stddev=1.0, divided by 255.0 # # We need to divide by 255 for convergence. This is because # we're using unnormalized (i.e. 0 to 255) pixel values, unlike the # 0.0-to-1.0 pixels in # pylearn2.scripts.tutorials.multilayer_perceptron/ # # We could just do as the above tutorial does and normalize the # pixels to [0.0, 1.0], and not rescale the weights. However, # experiments show that this converges to a higher error, and also # makes mnist_visualizer.py's results look very "staticky", without # any recognizable digit hallucinations. params[indices, c] = rng.randn(num_nonzeros) / 255.0 shared_variable.set_value(params) # Initialize the affine layer weights (not the biases, and not the softmax # weights) for sparse_init_count, affine_node in safe_izip(sparse_init_counts, affine_nodes[:-1]): # pylearn2 doesn't sparse_init the biases. I also found that # doing so slightly increases the final misclassification rate. init_sparse_linear(affine_node.linear_node.params, sparse_init_count, rng) ################################################################################################# ### BUILD THE SECOND NETWORK WITH FLAT PARAMETERS (given the dimensions of the first) ########### ################################################################################################# parameters = [] shapes = [] for affine_node in affine_nodes: weights = affine_node.linear_node.params bias = affine_node.bias_node.params parameters.append(weights) parameters.append(bias) shapes.append(weights.get_value().shape) shapes.append(bias.get_value().shape) params_flat_values = numpy.asarray([], dtype=theano.config.floatX) for parameter in parameters: vector_param = numpy.asarray(numpy.ndarray.flatten(parameter.get_value()), dtype=theano.config.floatX) params_flat_values = numpy.append(params_flat_values, vector_param) params_flat = theano.shared(params_flat_values) params_old_flat = theano.shared(params_flat_values) affine_nodes = [] last_node = input_node counter = 0 index_from = 0 for layer_index, layer_output_size in enumerate(sizes): shape1 = shapes[counter] shape2 = shapes[counter+1] size1= numpy.prod(numpy.asarray(shape1)) size2= numpy.prod(numpy.asarray(shape2)) index_to = index_from + size1 weights_ = params_flat[index_from:index_to].reshape(shape1) index_from = index_to index_to = index_from + size2 bias_ = params_flat[index_from:index_to].reshape(shape2) index_from = index_to counter = counter + 2 # Add dropout, if asked for include_probability = dropout_include_probabilities[layer_index] if include_probability != 1.0: last_node = Dropout(last_node, include_probability, theano_rng) output_format = DenseFormat(axes=('b', 'f'), shape=(-1, layer_output_size), dtype=None) if layer_index < (len(sizes) - 1): last_node = AffineLayer(last_node, output_format, weights=weights_, bias=bias_) else: last_node = SoftmaxLayer(last_node, output_format, 
weights=weights_, bias=bias_) affine_nodes.append(last_node.affine_node) return affine_nodes, last_node, params_flat, params_old_flat ''' std_deviation = .05 input_size = 784 params_temp1 = [rng.standard_normal( (sizes[0]* input_size) ).astype(theano.config.floatX)*std_deviation, numpy.zeros(sizes[0], dtype=theano.config.floatX) ] params_temp2 = sum([ [rng.standard_normal( sizes[i] * sizes[i+1] ).astype(theano.config.floatX)*std_deviation, numpy.zeros(sizes[i+1], dtype=theano.config.floatX)] for i in range(len(sizes)-1) ],[] ) params_flat_values = numpy.concatenate( params_temp1 + params_temp2 ) params_flat = theano.shared(params_flat_values) params_old_flat = theano.shared(params_flat_values) shapes = [] param_arrays = [] index_to = input_size * sizes[0] param_arrays.append(params_flat[:index_to].reshape((sizes[0], input_size))) # Add weights shapes.append((input_size, sizes[0])) index_from = index_to index_to += sizes[0] param_arrays.append(params_flat[index_from:index_to]) # Add bias shapes.append((index_to-index_from, )) for i in range(len(sizes)-1): index_from = index_to index_to += sizes[i]*sizes[i+1] param_arrays.append(params_flat[index_from:index_to].reshape((sizes[i+1],sizes[i]))) # Add weight shapes.append((sizes[i], sizes[i+1])) #print(index_from, index_to) #print 'reshaped to' #print(sizes[i], sizes[i+1]) index_from = index_to index_to += sizes[i+1] param_arrays.append(params_flat[index_from:index_to]) # Add bias shapes.append((index_to-index_from, )) layers = [input_node] for i in range(len(sizes)-1): # repeat twice layers.append(AffineLayer(input_node=layers[-1], # last element of <layers> output_format=DenseFormat(axes=('b', 'f'), # axis order: (batch, feature) shape=(-1, sizes[i]), # output shape: (variable batch size, 10 classes) dtype=None) , # don't change the input data type weights = theano.tensor.transpose(param_arrays[i*2]), bias = param_arrays[i*2+1] )) layers.append(SoftmaxLayer(input_node=layers[-1], output_format=DenseFormat(axes=('b', 'f'), # axis order: (batch, feature) shape=(-1, sizes[i+1]), # output shape: (variable batch size, 10 classes) dtype=None), # don't change the input data type weights = theano.tensor.transpose(param_arrays[(i+1)*2]), bias = param_arrays[(i+1)*2+1] )) # collapse the channel, row, and column axes to a single feature axis softmax_layer = layers[-1] last_node = softmax_layer affine_nodes = [] for i in range(1,len(layers)): affine_nodes.append(layers[i].affine_node) print shapes return affine_nodes, last_node, params_flat, params_old_flat, shapes
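
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the layout of the
# initial flat value vector built above for the fully-connected net. For
# each layer, std-normal weights scaled by std_deviation are followed by
# zero biases, all concatenated into one 1-D array. The layer sizes below
# are hypothetical.
# ---------------------------------------------------------------------------
def _sketch_fc_flat_init(input_size=784, sizes=(500, 500, 10), stddev=0.05):
    import numpy

    rng = numpy.random.RandomState(281934)
    blocks = []
    fan_in = input_size
    for fan_out in sizes:
        blocks.append(rng.standard_normal(fan_in * fan_out) * stddev)
        blocks.append(numpy.zeros(fan_out))
        fan_in = fan_out

    flat = numpy.concatenate(blocks)

    # Total length equals the sum of weight-matrix and bias sizes per layer.
    expected = sum(a * b + b
                   for a, b in zip((input_size,) + tuple(sizes[:-1]), sizes))
    assert flat.size == expected
    return flat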
def build_fc_classifier(input_node, sizes, sparse_init_counts, dropout_include_probabilities, rng, theano_rng): ''' Builds a stack of fully-connected layers followed by a Softmax. Each hidden layer will be preceded by a ReLU. Initialization: Weights are initialized in the same way as in Pylearn2's MLP tutorial: pylearn2/scripts/tutorials/multilayer_perceptron/mlp_tutorial_part_3.yaml This means the following: Of the N affine layers, the weights of the first N-1 are to all 0.0, except for k randomly-chosen elements, which are set to some random number drawn from the normal distribution with stddev=1.0. The biases are all initialized to 0.0. The last layer's weights and biases are both set to 0.0. Parameters ---------- input_node: Node The node to build the stack on. sizes: Sequence A sequence of ints, indicating the output sizes of each layer. The last int is the number of classes. sparse_init_counts: A sequence of N-1 ints, where N = len(sizes). Used to initialize the weights of the first N-1 layers. If the n'th element is x, this means that the n'th layer will have x nonzeros, with the rest initialized to zeros. dropout_include_probabilities: Sequence A Sequence of N-1 floats, where N := len(sizes) The dropout include probabilities for the outputs of each of the layers, except for the final one. If any of these probabilities is 1.0, the corresponding Dropout node will be omitted. rng: numpy.random.RandomState The RandomState to draw initial weights from. theano_rng: theano.tensor.shared_randomstreams.RandomStreams The RandomStreams to draw dropout masks from. Returns ------- rval: tuple (affine_nodes, output_node), where affine_nodes is a list of the AffineNodes, in order, and output_node is the final node, a Softmax. ''' assert_is_instance(input_node, Node) assert_equal(input_node.output_format.dtype, numpy.dtype(theano.config.floatX)) assert_greater(len(sizes), 0) assert_all_greater(sizes, 0) assert_equal(len(sparse_init_counts), len(sizes) - 1) assert_all_integer(sparse_init_counts) assert_all_greater(sparse_init_counts, 0) assert_all_less_equal(sparse_init_counts, sizes[:-1]) assert_equal(len(dropout_include_probabilities), len(sizes)) affine_nodes = [] last_node = input_node for layer_index, layer_output_size in enumerate(sizes): # Add dropout, if asked for include_probability = dropout_include_probabilities[layer_index] if include_probability != 1.0: last_node = Dropout(last_node, include_probability, theano_rng) output_format = DenseFormat(axes=('b', 'f'), shape=(-1, layer_output_size), dtype=None) if layer_index < (len(sizes) - 1): last_node = AffineLayer(last_node, output_format) else: last_node = SoftmaxLayer(last_node, output_format) affine_nodes.append(last_node.affine_node) def init_sparse_bias(shared_variable, num_nonzeros, rng): ''' Mimics the sparse initialization in pylearn2.models.mlp.Linear.set_input_space() ''' params = shared_variable.get_value() assert_equal(params.shape[0], 1) assert_greater_equal(num_nonzeros, 0) assert_less_equal(num_nonzeros, params.shape[1]) params[...] = 0.0 indices = rng.choice(params.size, size=num_nonzeros, replace=False) # normal dist with stddev=1.0 params[0, indices] = rng.randn(num_nonzeros) # Found that for biases, this didn't help (it increased the # final misclassification rate by .001) # if num_nonzeros > 0: # params /= float(num_nonzeros) shared_variable.set_value(params) def init_sparse_linear(shared_variable, num_nonzeros, rng): params = shared_variable.get_value() params[...] 
= 0.0 assert_greater_equal(num_nonzeros, 0) assert_less_equal(num_nonzeros, params.shape[0]) for c in xrange(params.shape[1]): indices = rng.choice(params.shape[0], size=num_nonzeros, replace=False) # normal dist with stddev=1.0 params[indices, c] = rng.randn(num_nonzeros) # TODO: it's somewhat worrisome that the tutorial in # pylearn2.scripts.tutorials.multilayer_perceptron/ # multilayer_perceptron.ipynb # seems to do fine without scaling the weights like this if num_nonzeros > 0: params /= float(num_nonzeros) # Interestingly, while this seems more correct (normalize # columns to norm=1), it prevents the NN from converging. # params /= numpy.sqrt(float(num_nonzeros)) shared_variable.set_value(params) # Initialize the affine layer weights (not the biases, and not the softmax # weights) for sparse_init_count, affine_node in safe_izip(sparse_init_counts, affine_nodes[:-1]): # pylearn2 doesn't sparse_init the biases. I also found that # doing so slightly increases the final misclassification rate. init_sparse_linear(affine_node.linear_node.params, sparse_init_count, rng) return affine_nodes, last_node
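
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): unlike the /255.0
# variant earlier in this file, the init_sparse_linear above scales the whole
# matrix by 1/num_nonzeros. With k unit-normal nonzeros per column, the
# expected column norm is then sqrt(k)/k = 1/sqrt(k). The shape and count
# below are hypothetical.
# ---------------------------------------------------------------------------
def _sketch_sparse_init_scaling(shape=(784, 500), num_nonzeros=15):
    import numpy

    rng = numpy.random.RandomState(0)
    params = numpy.zeros(shape)
    for c in range(shape[1]):
        indices = rng.choice(shape[0], size=num_nonzeros, replace=False)
        params[indices, c] = rng.randn(num_nonzeros)
    params /= float(num_nonzeros)

    column_norms = numpy.sqrt((params ** 2).sum(axis=0))
    return column_norms.mean()  # roughly 1 / sqrt(15), i.e. about 0.26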
def apply_subwindow_func(subwindow_func,
                         padded_images,
                         pads,
                         window_shape,
                         strides):
    '''
    Applies a sliding-window function to all subwindows of a feature map.

    Parameters
    ----------
    subwindow_func: function
      A function that takes a subwindow and returns a scalar.
      Input: tensor with shape [BATCH_SIZE, NUM_CHANNELS, ROWS, COLS]
      Output: tensor with shape [BATCH_SIZE, NUM_CHANNELS]

    padded_images: numpy.ndarray
      A feature map with shape [BATCH_SIZE, NUM_CHANNELS, ROWS, COLS].
      This has pads[0] rows and pads[1] columns of zero-padding.

    pads: Sequence
      [pad_rows, pad_columns], the # of padded rows and columns on each
      side of the image.

    window_shape: Sequence
      [window_rows, window_columns], the shape of the sliding window.

    strides: Sequence
      [row_stride, column_stride], how many rows/columns the window moves
      between applications.
    '''
    assert_equal(padded_images.ndim, 4)
    assert_all_greater(padded_images.shape[2:], pads)

    _assert_is_shape2d(window_shape)
    _assert_is_shape2d(strides)

    pads, window_shape, strides = (numpy.asarray(a)
                                   for a in (pads, window_shape, strides))

    assert_all_greater(numpy.asarray(padded_images.shape[2:]), 2 * pads)

    # Check that the pad region is full of the same value
    if pads[0] > 0:
        pad_value = padded_images[0, 0, 0, 0]
        assert_true(numpy.all(padded_images[:, :, :pads[0], :] == pad_value))
        assert_true(numpy.all(padded_images[:, :, -pads[0]:, :] == pad_value))

    if pads[1] > 0:
        pad_value = padded_images[0, 0, 0, 0]
        assert_true(numpy.all(padded_images[:, :, :, :pads[1]] == pad_value))
        assert_true(numpy.all(padded_images[:, :, :, -pads[1]:] == pad_value))

    rows, cols = (range(0,
                        padded_images.shape[i + 2] - window_shape[i] + 1,
                        strides[i])
                  for i in (0, 1))

    output_image = None

    for out_r, in_r in enumerate(rows):
        for out_c, in_c in enumerate(cols):
            subwindow = padded_images[:,
                                      :,
                                      in_r:(in_r + window_shape[0]),
                                      in_c:(in_c + window_shape[1])]

            output = subwindow_func(subwindow)
            assert_equal(output.ndim, 2)

            # check that subwindow_func preserved the batch size
            assert_equal(output.shape[0], padded_images.shape[0])
            assert_greater(output.shape[1], 0)

            if output_image is None:
                output_image = numpy.zeros((output.shape[0],
                                            output.shape[1],
                                            len(rows),
                                            len(cols)),
                                           dtype=output.dtype)

            output_image[:, :, out_r, out_c] = output

    return output_image
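
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the same sliding-window
# loop as apply_subwindow_func, specialized to max pooling and kept
# self-contained in numpy (no padding, no project-specific asserts).
# ---------------------------------------------------------------------------
def _sketch_max_pool_reference(images, window_shape, strides):
    '''
    Reference max pooling over a [batch, channels, rows, cols] array; the
    per-window max plays the role of subwindow_func above.
    '''
    import numpy

    window_rows, window_cols = window_shape
    row_stride, col_stride = strides

    rows = range(0, images.shape[2] - window_rows + 1, row_stride)
    cols = range(0, images.shape[3] - window_cols + 1, col_stride)

    output = numpy.empty(images.shape[:2] + (len(rows), len(cols)),
                         dtype=images.dtype)

    for out_r, in_r in enumerate(rows):
        for out_c, in_c in enumerate(cols):
            window = images[:, :,
                            in_r:in_r + window_rows,
                            in_c:in_c + window_cols]
            output[:, :, out_r, out_c] = window.max(axis=(2, 3))

    return output

# e.g. _sketch_max_pool_reference(images, (2, 2), (2, 2)) halves the rows
# and columns of a [batch, channels, rows, cols] array.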
def _sliding_window_2d_testimpl(expected_subwindow_funcs, pad_values, make_node_funcs, make_pad_args_funcs, rtol=None): ''' Implementation of tests for 2D sliding-window nodes like Pool2D and Conv2d. Parameters ---------- expected_subwindow_funcs: Sequence A Sequence of subwindow functions. These take a subwindow and return a scalar. Input: tensor with shape [BATCH_SIZE, NUM_CHANNELS, ROWS, COLS] Output: tensor with shape [BATCH_SIZE, NUM_CHANNELS] pad_values: Sequence A sequence of pad filler values to use for eah of the expected_subwindow_funcs. For example, if expected_subwindow_funcs is [average_pool, max_pool], use [0.0, -numpy.inf]. make_node_funcs: Sequence A Sequence of functions that create sliding-window Nodes to be tested against the ground-truth provided by the corresponding expected_subwindow_funcs. Its paramters are as follows: Parameters ---------- input_node: Node window_shape: Sequence [NUM_ROWS, NUM_COLUMNS] of the sliding window. strides: Sequence [ROW_STRIDE, COLUMN_STRIDE], or how many rows/columns to skip between applications of the sliding window. pad: Sequence [ROW_PAD, COLUMN_PAD], or # of zero-padding rows/columns to add to each side of the image. axis_map: dict Maps strings to strings. Optional. If the node uses different axis names than 'b', 'c', '0', '1', this specifies the mapping from the node's axis names to 'b', 'c', '0', '1'. make_pad_args_funcs: Sequence A Sequence of functions that take a window_shape arg (2d array) and returns an Iterable of 'pad' arguments, which can be strings or 2d arrays of ints. ''' assert_is_instance(expected_subwindow_funcs, Sequence) assert_is_instance(pad_values, Sequence) assert_is_instance(make_node_funcs, Sequence) # TODO: change this to construct a Toeplitz matrix out of padded_images, # so we get a giant stack of C X WR X WC matrices, which can then be fed # to subwindow_func as a single batch. # See scipy.linalg.toeplitz def apply_subwindow_func(subwindow_func, padded_images, pads, window_shape, strides): ''' Applies a sliding-window function to all subwindows of a feature map. Parameters ---------- subwindow_func: function A function that takes a subwindow and returns a scalar. Input: tensor with shape [BATCH_SIZE, NUM_CHANNELS, ROWS, COLS] Output: tensor with shape [BATCH_SIZE, NUM_CHANNELS] padded_images: numpy.ndarray A feature map with shape [BATCH_SIZE, NUM_CHANNELS, ROWS, COLS]. This has pad[0] rows and pad[1] columns of zero-padding. max_pad: Sequence [pad_rows, pad_columns], the # of padded rows and columns on each side of the image. 
''' assert_equal(padded_images.ndim, 4) assert_all_greater(padded_images.shape[2:], pads) _assert_is_shape2d(window_shape) _assert_is_shape2d(strides) pads, window_shape, strides = (numpy.asarray(a) for a in (pads, window_shape, strides)) assert_all_greater(numpy.asarray(padded_images.shape[2:]), 2 * pads) # Check that pad region is full of the same value if pads[0] > 0: pad_value = padded_images[0, 0, 0, 0] assert_true(numpy.all(padded_images[:, :, :pads[0], :] == pad_value)) assert_true(numpy.all(padded_images[:, :, -pads[0]:, :] == pad_value)) if pads[1] > 0: pad_value = padded_images[0, 0, 0, 0] assert_true(numpy.all(padded_images[:, :, :, :pads[1]] == pad_value)) assert_true(numpy.all(padded_images[:, :, :, -pads[1]:] == pad_value)) rows, cols = (range(0, padded_images.shape[i + 2] - window_shape[i] + 1, strides[i]) for i in (0, 1)) output_image = None for out_r, in_r in enumerate(rows): for out_c, in_c in enumerate(cols): subwindow = padded_images[:, :, in_r:(in_r + window_shape[0]), in_c:(in_c + window_shape[1])] output = subwindow_func(subwindow) assert_equal(output.ndim, 2) # check that subwindow_func preserved the batch size assert_equal(output.shape[0], padded_images.shape[0]) assert_greater(output.shape[1], 0) if output_image is None: output_image = numpy.zeros((output.shape[0], output.shape[1], len(rows), len(cols)), dtype=output.dtype) output_image[:, :, out_r, out_c] = output return output_image max_stride = 3 max_window_size = 3 batch_size = 2 num_channels = 2 input_dtype = numpy.dtype('int') max_pad = max_window_size + 1 assert_greater_equal(max_pad, 0) rng = numpy.random.RandomState(352) def get_padded_image(max_padded_images, pads): def margin_to_slice(margin): assert_greater_equal(margin, 0) if margin == 0: return slice(None, None) else: return slice(margin, -margin) return max_padded_images[:, :, margin_to_slice(max_pad - pads[0]), margin_to_slice(max_pad - pads[1])] def get_pads_from_pad_arg(pad_arg, window_shape): ''' Converts a valid pad argument (str or 2-int Sequence) to an equivalent 2-int numpy.ndarray. ''' window_shape = numpy.asarray(window_shape) _assert_is_shape2d(window_shape) if isinstance(pad_arg, basestring): if pad_arg == 'full': return window_shape - 1 elif pad_arg == 'valid': return numpy.asarray([0, 0]) elif pad_arg == 'same_shape': assert_true((window_shape % 2 != 0).all()) return window_shape // 2 else: raise ValueError("Unrecognized pad name: '%s'" % pad_arg) else: _assert_is_shape2d(pad_arg) return numpy.asarray(pad_arg) prod = itertools.product for (expected_func, pad_value, make_node_func, make_pad_args_func) in safe_izip(expected_subwindow_funcs, pad_values, make_node_funcs, make_pad_args_funcs): # An image with the maximum amount of padding. We will vary the amount # of padding in practice by taking centered subwindows of this image. max_padded_images = numpy.empty((batch_size, num_channels, max_pad * 2 + max_window_size + 1, max_pad * 2 + max_window_size + 4), dtype=input_dtype) max_padded_images[...] = pad_value images = get_padded_image(max_padded_images, (0, 0)) images[...] = rng.random_integers(low=-10, high=10, size=images.shape) #images[...] 
= numpy.arange(images.size).reshape(images.shape) assert_all_greater(images.shape, 0) if max_pad == 0: assert_array_equal(images, max_padded_images) else: assert_array_equal(images, max_padded_images[:, :, max_pad:-max_pad, max_pad:-max_pad]) # Make input_nodes with weird axis names and axis order axis_map = {'b': 'b', 'see': 'c', 'zero': '0', 'one': '1'} input_node_axes = ('b', 'see', 'zero', 'one') # input_node_axes = ('b', 'zero', 'see', 'one') transpose_indices = [('b', 'see', 'zero', 'one').index(a) for a in input_node_axes] input_node_shape = [images.shape[t] for t in transpose_indices] input_node_shape[input_node_axes.index('b')] = -1 input_node = InputNode(DenseFormat(axes=input_node_axes, shape=input_node_shape, dtype=input_dtype)) # Loops through all possible window_shapes, pads (including padding # bigger than the window shape), strides. for window_shape in prod(range(1, max_window_size + 1), repeat=2): window_shape = numpy.asarray(window_shape) # for pad_arg in get_pad_args(window_shape, supports_padding): for pad_arg in make_pad_args_func(window_shape): # can't use same_shape padding with even window dims if pad_arg == 'same_shape' and (window_shape % 2 == 0).any(): continue pads = get_pads_from_pad_arg(pad_arg, window_shape) padded_images = get_padded_image(max_padded_images, pads) assert_array_equal(numpy.asarray(padded_images.shape[2:]), (2 * pads) + numpy.asarray(images.shape[2:])) for strides in prod(range(1, max_stride + 1), repeat=2): expected_images = apply_subwindow_func(expected_func, padded_images, pads, window_shape, strides) # If pads are bigger than window_size, expect an exception # when creating the node. if not isinstance(pads, basestring) and \ numpy.any(pads >= window_shape): assert_raises_regexp(AssertionError, "Not all pads", make_node_func, input_node, window_shape=window_shape, strides=strides, pads=pad_arg, axis_map=axis_map) else: node = make_node_func(input_node, window_shape=window_shape, strides=strides, pads=pad_arg, axis_map=axis_map) node_func = theano.function([input_node.output_symbol], node.output_symbol) transposed_images = images.transpose(transpose_indices) actual_images = node_func(transposed_images) node.output_format.check(actual_images) # try: # node.output_format.check(actual_images) # except AssertionError: # pdb.set_trace() kwargs = {} if rtol is not None: kwargs['rtol'] = rtol # pylint: disable=star-args assert_allclose(actual_images, expected_images, **kwargs)
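
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the pad-name
# convention exercised by the test above ('valid', 'full', 'same_shape'), as
# a small self-contained helper equivalent to get_pads_from_pad_arg but
# without the project-specific asserts.
# ---------------------------------------------------------------------------
def _sketch_pads_for(pad_arg, window_shape):
    '''Resolves a pad argument (name or [rows, cols]) to a 2-int array.'''
    import numpy

    window_shape = numpy.asarray(window_shape)

    if isinstance(pad_arg, str):
        if pad_arg == 'full':
            return window_shape - 1        # maximal useful padding
        elif pad_arg == 'valid':
            return numpy.array([0, 0])     # no padding
        elif pad_arg == 'same_shape':
            # pads so that, with stride 1, output size == input size;
            # only defined for odd window dimensions
            assert (window_shape % 2 != 0).all()
            return window_shape // 2
        else:
            raise ValueError("Unrecognized pad name: '%s'" % pad_arg)
    else:
        return numpy.asarray(pad_arg)

# e.g. _sketch_pads_for('full', (3, 5)) -> [2, 4]
#      _sketch_pads_for('same_shape', (3, 5)) -> [1, 2]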