def build_fc_classifier(input_node,
                        sizes,
                        sparse_init_counts,
                        dropout_include_probabilities,
                        rng,
                        theano_rng):
    '''
    Builds a stack of fully-connected layers followed by a Softmax.

    Each hidden layer will be preceded by a ReLU.

    Initialization:

    Weights are drawn from a zero-mean normal distribution with stddev=0.05,
    and biases are initialized to 0.0. (The sparse initialization from
    Pylearn2's MLP tutorial,
    pylearn2/scripts/tutorials/multilayer_perceptron/mlp_tutorial_part_3.yaml,
    is preserved in the commented-out block below.)

    Parameters
    ----------
    input_node: Node
      The node to build the stack on.

    sizes: Sequence
      A sequence of ints, indicating the output sizes of each layer.
      The last int is the number of classes.

    sparse_init_counts:
      A sequence of N-1 ints, where N = len(sizes).
      Used to initialize the weights of the first N-1 layers.
      If the n'th element is x, this means that the n'th layer will have
      x nonzero weights per output unit, with the rest initialized to zeros.
      Only used by the commented-out sparse-initialization code path below.

    dropout_include_probabilities: Sequence
      A Sequence of N floats, where N = len(sizes).
      The dropout include probabilities for the inputs to each of the layers.
      If any of these probabilities is 1.0, the corresponding Dropout node
      will be omitted. Not applied by the current flat-parameter
      implementation; only the commented-out code path below inserts
      Dropout nodes.

    rng: numpy.random.RandomState
      The RandomState to draw initial weights from.

    theano_rng: theano.tensor.shared_randomstreams.RandomStreams
      The RandomStreams to draw dropout masks from.

    Returns
    -------
    rval: tuple
      (affine_nodes, output_node, params_flat, params_old_flat, shapes),
      where affine_nodes is a list of the AffineNodes, in order, output_node
      is the final node (a Softmax), params_flat and params_old_flat are
      shared variables holding all parameters as a single flat vector, and
      shapes is a list of the per-parameter shapes.
    '''
    assert_is_instance(input_node, Node)

    # pylint: disable=no-member
    assert_equal(input_node.output_format.dtype,
                 numpy.dtype(theano.config.floatX))

    assert_greater(len(sizes), 0)
    assert_all_greater(sizes, 0)

    assert_equal(len(sparse_init_counts), len(sizes) - 1)
    assert_all_integer(sparse_init_counts)
    assert_all_greater(sparse_init_counts, 0)
    assert_all_less_equal(sparse_init_counts, sizes[:-1])

    assert_equal(len(dropout_include_probabilities), len(sizes))

    '''
    affine_nodes = []

    last_node = input_node

    for layer_index, layer_output_size in enumerate(sizes):
        # Add dropout, if asked for
        include_probability = dropout_include_probabilities[layer_index]
        if include_probability != 1.0:
            last_node = Dropout(last_node, include_probability, theano_rng)

        output_format = DenseFormat(axes=('b', 'f'),
                                    shape=(-1, layer_output_size),
                                    dtype=None)

        if layer_index < (len(sizes) - 1):
            last_node = AffineLayer(last_node, output_format)
        else:
            last_node = SoftmaxLayer(last_node, output_format)

        affine_nodes.append(last_node.affine_node)

    # Not used in this demo, but keeping it in, in case we want to start
    # using it again.
    def init_sparse_bias(shared_variable, num_nonzeros, rng):
        # Mimics the sparse initialization in
        # pylearn2.models.mlp.Linear.set_input_space()

        params = shared_variable.get_value()
        assert_equal(params.shape[0], 1)

        assert_greater_equal(num_nonzeros, 0)
        assert_less_equal(num_nonzeros, params.shape[1])

        params[...] = 0.0

        indices = rng.choice(params.size, size=num_nonzeros, replace=False)

        # normal dist with stddev=1.0
        params[0, indices] = rng.randn(num_nonzeros)

        # Found that for biases, this didn't help (it increased the
        # final misclassification rate by .001)
        # if num_nonzeros > 0:
        #     params /= float(num_nonzeros)

        shared_variable.set_value(params)

    def init_sparse_linear(shared_variable, num_nonzeros, rng):
        params = shared_variable.get_value()
        params[...] = 0.0

        assert_greater_equal(num_nonzeros, 0)
        assert_less_equal(num_nonzeros, params.shape[0])

        for c in xrange(params.shape[1]):
            indices = rng.choice(params.shape[0],
                                 size=num_nonzeros,
                                 replace=False)

            # normal dist with stddev=1.0, divided by 255.0
            #
            # We need to divide by 255 for convergence. This is because
            # we're using unnormalized (i.e. 0 to 255) pixel values, unlike
            # the 0.0-to-1.0 pixels in
            # pylearn2.scripts.tutorials.multilayer_perceptron/
            #
            # We could just do as the above tutorial does and normalize the
            # pixels to [0.0, 1.0], and not rescale the weights. However,
            # experiments show that this converges to a higher error, and
            # also makes mnist_visualizer.py's results look very "staticky",
            # without any recognizable digit hallucinations.
            params[indices, c] = rng.randn(num_nonzeros) / 255.0

        shared_variable.set_value(params)

    # Initialize the affine layer weights (not the biases, and not the softmax
    # weights)
    for sparse_init_count, affine_node in safe_izip(sparse_init_counts,
                                                    affine_nodes[:-1]):
        # pylearn2 doesn't sparse_init the biases. I also found that
        # doing so slightly increases the final misclassification rate.
        init_sparse_linear(affine_node.linear_node.params,
                           sparse_init_count,
                           rng)

    ###########################################################################
    ### BUILD THE SECOND NETWORK WITH FLAT PARAMETERS                       ###
    ### (given the dimensions of the first)                                 ###
    ###########################################################################

    parameters = []
    shapes = []

    for affine_node in affine_nodes:
        weights = affine_node.linear_node.params
        bias = affine_node.bias_node.params
        parameters.append(weights)
        parameters.append(bias)
        shapes.append(weights.get_value().shape)
        shapes.append(bias.get_value().shape)

    params_flat_values = numpy.asarray([], dtype=theano.config.floatX)

    for parameter in parameters:
        vector_param = numpy.asarray(
            numpy.ndarray.flatten(parameter.get_value()),
            dtype=theano.config.floatX)
        params_flat_values = numpy.append(params_flat_values, vector_param)

    params_flat = theano.shared(params_flat_values)
    params_old_flat = theano.shared(params_flat_values)

    affine_nodes = []
    last_node = input_node
    counter = 0
    index_from = 0

    for layer_index, layer_output_size in enumerate(sizes):
        shape1 = shapes[counter]
        shape2 = shapes[counter + 1]
        size1 = numpy.prod(numpy.asarray(shape1))
        size2 = numpy.prod(numpy.asarray(shape2))

        index_to = index_from + size1
        weights_ = params_flat[index_from:index_to].reshape(shape1)
        index_from = index_to

        index_to = index_from + size2
        bias_ = params_flat[index_from:index_to].reshape(shape2)
        index_from = index_to

        counter = counter + 2

        # Add dropout, if asked for
        include_probability = dropout_include_probabilities[layer_index]
        if include_probability != 1.0:
            last_node = Dropout(last_node, include_probability, theano_rng)

        output_format = DenseFormat(axes=('b', 'f'),
                                    shape=(-1, layer_output_size),
                                    dtype=None)

        if layer_index < (len(sizes) - 1):
            last_node = AffineLayer(last_node,
                                    output_format,
                                    weights=weights_,
                                    bias=bias_)
        else:
            last_node = SoftmaxLayer(last_node,
                                     output_format,
                                     weights=weights_,
                                     bias=bias_)

        affine_nodes.append(last_node.affine_node)

    return affine_nodes, last_node, params_flat, params_old_flat
    '''

    std_deviation = .05
    input_size = 784

    params_temp1 = [rng.standard_normal(sizes[0] * input_size).astype(
                        theano.config.floatX) * std_deviation,
                    numpy.zeros(sizes[0], dtype=theano.config.floatX)]

    params_temp2 = sum([[rng.standard_normal(sizes[i] * sizes[i + 1]).astype(
                             theano.config.floatX) * std_deviation,
                         numpy.zeros(sizes[i + 1],
                                     dtype=theano.config.floatX)]
                        for i in range(len(sizes) - 1)],
                       [])

    params_flat_values = numpy.concatenate(params_temp1 + params_temp2)

    params_flat = theano.shared(params_flat_values)
    params_old_flat = theano.shared(params_flat_values)

    shapes = []
    param_arrays = []

    index_to = input_size * sizes[0]
    param_arrays.append(
        params_flat[:index_to].reshape((sizes[0], input_size)))  # Add weights
    shapes.append((input_size, sizes[0]))

    index_from = index_to
    index_to += sizes[0]
    param_arrays.append(params_flat[index_from:index_to])  # Add bias
    shapes.append((index_to - index_from, ))

    for i in range(len(sizes) - 1):
        index_from = index_to
        index_to += sizes[i] * sizes[i + 1]
        param_arrays.append(
            params_flat[index_from:index_to].reshape(
                (sizes[i + 1], sizes[i])))  # Add weights
        shapes.append((sizes[i], sizes[i + 1]))
        # print(index_from, index_to)
        # print 'reshaped to'
        # print(sizes[i], sizes[i+1])

        index_from = index_to
        index_to += sizes[i + 1]
        param_arrays.append(params_flat[index_from:index_to])  # Add bias
        shapes.append((index_to - index_from, ))

    layers = [input_node]

    for i in range(len(sizes) - 1):
        layers.append(AffineLayer(
            input_node=layers[-1],  # last element of <layers>
            output_format=DenseFormat(
                axes=('b', 'f'),       # axis order: (batch, feature)
                shape=(-1, sizes[i]),  # output shape: (variable batch size, sizes[i])
                dtype=None),           # don't change the input data type
            weights=theano.tensor.transpose(param_arrays[i * 2]),
            bias=param_arrays[i * 2 + 1]))

    layers.append(SoftmaxLayer(
        input_node=layers[-1],
        output_format=DenseFormat(
            axes=('b', 'f'),           # axis order: (batch, feature)
            shape=(-1, sizes[i + 1]),  # output shape: (variable batch size, number of classes)
            dtype=None),               # don't change the input data type
        weights=theano.tensor.transpose(param_arrays[(i + 1) * 2]),
        bias=param_arrays[(i + 1) * 2 + 1]))

    softmax_layer = layers[-1]
    last_node = softmax_layer

    affine_nodes = []
    for i in range(1, len(layers)):
        affine_nodes.append(layers[i].affine_node)

    print shapes

    return affine_nodes, last_node, params_flat, params_old_flat, shapes
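

# Illustrative sketch (not part of the original demo): the flat-parameter code
# above carves one big shared vector into per-layer weight and bias views. The
# pure-numpy helper below is a hypothetical stand-alone version of that index
# arithmetic, assuming the same (output_size, input_size) storage order and
# the hard-coded MNIST input size of 784 used above.
def _demo_flat_param_slicing(sizes, input_size=784):
    '''
    Returns (weights, biases): numpy views carved out of a single flat
    parameter vector, mirroring the layout of params_flat above.
    '''
    fan_ins = [input_size] + list(sizes[:-1])
    total_size = sum(fan_in * fan_out + fan_out
                     for fan_in, fan_out in zip(fan_ins, sizes))
    flat = numpy.zeros(total_size, dtype=theano.config.floatX)

    weights = []
    biases = []
    index = 0
    for fan_in, fan_out in zip(fan_ins, sizes):
        weights.append(
            flat[index:index + fan_out * fan_in].reshape(fan_out, fan_in))
        index += fan_out * fan_in
        biases.append(flat[index:index + fan_out])
        index += fan_out

    assert index == flat.size
    return weights, biases

# For example, _demo_flat_param_slicing([500, 500, 10]) yields weight views of
# shapes (500, 784), (500, 500), (10, 500) and bias views of lengths 500, 500,
# and 10, all backed by one flat vector.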
def build_fc_classifier(input_node,
                        sizes,
                        sparse_init_counts,
                        dropout_include_probabilities,
                        rng,
                        theano_rng):
    """
    Builds a stack of fully-connected layers followed by a Softmax.

    Each hidden layer will be preceded by a ReLU.

    Initialization:

    Weights are initialized in the same way as in Pylearn2's MLP tutorial:
    pylearn2/scripts/tutorials/multilayer_perceptron/mlp_tutorial_part_3.yaml

    This means the following:

    Of the N affine layers, the weights of the first N-1 are set to all 0.0,
    except for k randomly-chosen elements per output unit, which are set to
    random numbers drawn from the normal distribution with stddev=1.0
    (additionally divided by 255.0 in this demo; see the comments in
    init_sparse_linear below). The biases are all initialized to 0.0.
    The last layer's weights and biases are both set to 0.0.

    Parameters
    ----------
    input_node: Node
      The node to build the stack on.

    sizes: Sequence
      A sequence of ints, indicating the output sizes of each layer.
      The last int is the number of classes.

    sparse_init_counts:
      A sequence of N-1 ints, where N = len(sizes).
      Used to initialize the weights of the first N-1 layers.
      If the n'th element is x, this means that the n'th layer will have
      x nonzero weights per output unit, with the rest initialized to zeros.

    dropout_include_probabilities: Sequence
      A Sequence of N floats, where N = len(sizes).
      The dropout include probabilities for the inputs to each of the layers.
      If any of these probabilities is 1.0, the corresponding Dropout node
      will be omitted.

    rng: numpy.random.RandomState
      The RandomState to draw initial weights from.

    theano_rng: theano.tensor.shared_randomstreams.RandomStreams
      The RandomStreams to draw dropout masks from.

    Returns
    -------
    rval: tuple
      (affine_nodes, output_node), where affine_nodes is a list of the
      AffineNodes, in order, and output_node is the final node, a Softmax.
    """
    assert_is_instance(input_node, Node)

    # pylint: disable=no-member
    assert_equal(input_node.output_format.dtype,
                 numpy.dtype(theano.config.floatX))

    assert_greater(len(sizes), 0)
    assert_all_greater(sizes, 0)

    assert_equal(len(sparse_init_counts), len(sizes) - 1)
    assert_all_integer(sparse_init_counts)
    assert_all_greater(sparse_init_counts, 0)
    assert_all_less_equal(sparse_init_counts, sizes[:-1])

    assert_equal(len(dropout_include_probabilities), len(sizes))

    affine_nodes = []

    last_node = input_node

    for layer_index, layer_output_size in enumerate(sizes):
        # Add dropout, if asked for
        include_probability = dropout_include_probabilities[layer_index]
        if include_probability != 1.0:
            last_node = Dropout(last_node, include_probability, theano_rng)

        output_format = DenseFormat(axes=("b", "f"),
                                    shape=(-1, layer_output_size),
                                    dtype=None)

        if layer_index < (len(sizes) - 1):
            last_node = AffineLayer(last_node, output_format)
        else:
            last_node = SoftmaxLayer(last_node, output_format)

        affine_nodes.append(last_node.affine_node)

    # Not used in this demo, but keeping it in, in case we want to start
    # using it again.
    def init_sparse_bias(shared_variable, num_nonzeros, rng):
        """
        Mimics the sparse initialization in
        pylearn2.models.mlp.Linear.set_input_space()
        """
        params = shared_variable.get_value()
        assert_equal(params.shape[0], 1)

        assert_greater_equal(num_nonzeros, 0)
        assert_less_equal(num_nonzeros, params.shape[1])

        params[...] = 0.0

        indices = rng.choice(params.size, size=num_nonzeros, replace=False)

        # normal dist with stddev=1.0
        params[0, indices] = rng.randn(num_nonzeros)

        # Found that for biases, this didn't help (it increased the
        # final misclassification rate by .001)
        # if num_nonzeros > 0:
        #     params /= float(num_nonzeros)

        shared_variable.set_value(params)

    def init_sparse_linear(shared_variable, num_nonzeros, rng):
        params = shared_variable.get_value()
        params[...] = 0.0

        assert_greater_equal(num_nonzeros, 0)
        assert_less_equal(num_nonzeros, params.shape[0])

        for c in xrange(params.shape[1]):
            indices = rng.choice(params.shape[0],
                                 size=num_nonzeros,
                                 replace=False)

            # normal dist with stddev=1.0, divided by 255.0
            #
            # We need to divide by 255 for convergence. This is because
            # we're using unnormalized (i.e. 0 to 255) pixel values, unlike
            # the 0.0-to-1.0 pixels in
            # pylearn2.scripts.tutorials.multilayer_perceptron/
            #
            # We could just do as the above tutorial does and normalize the
            # pixels to [0.0, 1.0], and not rescale the weights. However,
            # experiments show that this converges to a higher error, and
            # also makes mnist_visualizer.py's results look very "staticky",
            # without any recognizable digit hallucinations.
            params[indices, c] = rng.randn(num_nonzeros) / 255.0

        shared_variable.set_value(params)

    # Initialize the affine layer weights (not the biases, and not the softmax
    # weights)
    for sparse_init_count, affine_node in safe_izip(sparse_init_counts,
                                                    affine_nodes[:-1]):
        # pylearn2 doesn't sparse_init the biases. I also found that
        # doing so slightly increases the final misclassification rate.
        init_sparse_linear(affine_node.linear_node.params,
                           sparse_init_count,
                           rng)

    return affine_nodes, last_node
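

# Illustrative sketch (not part of the original demo): a pure-numpy version of
# the column-wise sparse initialization performed by init_sparse_linear above,
# without the Theano shared-variable plumbing. The helper name and the `scale`
# argument are hypothetical; scale=1/255.0 matches the unnormalized-pixel
# variant used in this demo.
def _demo_sparse_init(shape, num_nonzeros, rng, scale=1.0 / 255.0):
    '''
    Returns a numpy array of the given (num_inputs, num_outputs) shape in
    which each column (output unit) has exactly num_nonzeros randomly-placed
    entries drawn from N(0, 1) and multiplied by scale; all other entries
    are 0.0.
    '''
    params = numpy.zeros(shape, dtype=theano.config.floatX)
    for column in xrange(shape[1]):
        indices = rng.choice(shape[0], size=num_nonzeros, replace=False)
        params[indices, column] = rng.randn(num_nonzeros) * scale

    return params

# For example, _demo_sparse_init((784, 500), 15, numpy.random.RandomState(0))
# returns a 784 x 500 matrix with 15 nonzero entries in each column.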
def build_fc_classifier(input_node,
                        sizes,
                        sparse_init_counts,
                        dropout_include_probabilities,
                        rng,
                        theano_rng):
    '''
    Builds a stack of fully-connected layers followed by a Softmax.

    Each hidden layer will be preceded by a ReLU.

    Initialization:

    Weights are initialized in the same way as in Pylearn2's MLP tutorial:
    pylearn2/scripts/tutorials/multilayer_perceptron/mlp_tutorial_part_3.yaml

    This means the following:

    Of the N affine layers, the weights of the first N-1 are set to all 0.0,
    except for k randomly-chosen elements per output unit, which are set to
    random numbers drawn from the normal distribution with stddev=1.0
    (additionally divided by the nonzero count; see the comments in
    init_sparse_linear below). The biases are all initialized to 0.0.
    The last layer's weights and biases are both set to 0.0.

    Parameters
    ----------
    input_node: Node
      The node to build the stack on.

    sizes: Sequence
      A sequence of ints, indicating the output sizes of each layer.
      The last int is the number of classes.

    sparse_init_counts:
      A sequence of N-1 ints, where N = len(sizes).
      Used to initialize the weights of the first N-1 layers.
      If the n'th element is x, this means that the n'th layer will have
      x nonzero weights per output unit, with the rest initialized to zeros.

    dropout_include_probabilities: Sequence
      A Sequence of N floats, where N = len(sizes).
      The dropout include probabilities for the inputs to each of the layers.
      If any of these probabilities is 1.0, the corresponding Dropout node
      will be omitted.

    rng: numpy.random.RandomState
      The RandomState to draw initial weights from.

    theano_rng: theano.tensor.shared_randomstreams.RandomStreams
      The RandomStreams to draw dropout masks from.

    Returns
    -------
    rval: tuple
      (affine_nodes, output_node), where affine_nodes is a list of the
      AffineNodes, in order, and output_node is the final node, a Softmax.
    '''
    assert_is_instance(input_node, Node)
    assert_equal(input_node.output_format.dtype,
                 numpy.dtype(theano.config.floatX))

    assert_greater(len(sizes), 0)
    assert_all_greater(sizes, 0)

    assert_equal(len(sparse_init_counts), len(sizes) - 1)
    assert_all_integer(sparse_init_counts)
    assert_all_greater(sparse_init_counts, 0)
    assert_all_less_equal(sparse_init_counts, sizes[:-1])

    assert_equal(len(dropout_include_probabilities), len(sizes))

    affine_nodes = []

    last_node = input_node

    for layer_index, layer_output_size in enumerate(sizes):
        # Add dropout, if asked for
        include_probability = dropout_include_probabilities[layer_index]
        if include_probability != 1.0:
            last_node = Dropout(last_node, include_probability, theano_rng)

        output_format = DenseFormat(axes=('b', 'f'),
                                    shape=(-1, layer_output_size),
                                    dtype=None)

        if layer_index < (len(sizes) - 1):
            last_node = AffineLayer(last_node, output_format)
        else:
            last_node = SoftmaxLayer(last_node, output_format)

        affine_nodes.append(last_node.affine_node)

    def init_sparse_bias(shared_variable, num_nonzeros, rng):
        '''
        Mimics the sparse initialization in
        pylearn2.models.mlp.Linear.set_input_space()
        '''
        params = shared_variable.get_value()
        assert_equal(params.shape[0], 1)

        assert_greater_equal(num_nonzeros, 0)
        assert_less_equal(num_nonzeros, params.shape[1])

        params[...] = 0.0

        indices = rng.choice(params.size, size=num_nonzeros, replace=False)

        # normal dist with stddev=1.0
        params[0, indices] = rng.randn(num_nonzeros)

        # Found that for biases, this didn't help (it increased the
        # final misclassification rate by .001)
        # if num_nonzeros > 0:
        #     params /= float(num_nonzeros)

        shared_variable.set_value(params)

    def init_sparse_linear(shared_variable, num_nonzeros, rng):
        params = shared_variable.get_value()
        params[...] = 0.0

        assert_greater_equal(num_nonzeros, 0)
        assert_less_equal(num_nonzeros, params.shape[0])

        for c in xrange(params.shape[1]):
            indices = rng.choice(params.shape[0],
                                 size=num_nonzeros,
                                 replace=False)

            # normal dist with stddev=1.0
            params[indices, c] = rng.randn(num_nonzeros)

        # TODO: it's somewhat worrisome that the tutorial in
        # pylearn2.scripts.tutorials.multilayer_perceptron/
        #   multilayer_perceptron.ipynb
        # seems to do fine without scaling the weights like this
        if num_nonzeros > 0:
            params /= float(num_nonzeros)

            # Interestingly, while this seems more correct (normalize
            # columns to norm=1), it prevents the NN from converging.
            # params /= numpy.sqrt(float(num_nonzeros))

        shared_variable.set_value(params)

    # Initialize the affine layer weights (not the biases, and not the softmax
    # weights)
    for sparse_init_count, affine_node in safe_izip(sparse_init_counts,
                                                    affine_nodes[:-1]):
        # pylearn2 doesn't sparse_init the biases. I also found that
        # doing so slightly increases the final misclassification rate.
        init_sparse_linear(affine_node.linear_node.params,
                           sparse_init_count,
                           rng)

    return affine_nodes, last_node
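

# Rough numpy sanity check (not part of the original demo) for the scaling
# discussion in init_sparse_linear above: a column with num_nonzeros entries
# drawn from N(0, 1) has a norm of roughly sqrt(num_nonzeros), so dividing by
# sqrt(num_nonzeros) gives columns of norm roughly 1, while dividing by
# num_nonzeros (as done above) shrinks them to roughly 1 / sqrt(num_nonzeros).
# The defaults below (num_nonzeros=15, seed=1234) are arbitrary.
def _demo_column_norms(num_nonzeros=15, seed=1234):
    rng = numpy.random.RandomState(seed)
    values = rng.randn(num_nonzeros)
    norm = numpy.sqrt((values ** 2).sum())  # roughly sqrt(num_nonzeros)

    print 'raw column norm:', norm
    print 'divided by num_nonzeros:', norm / num_nonzeros
    print 'divided by sqrt(num_nonzeros):', norm / numpy.sqrt(num_nonzeros)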