def net_dvc(image_size=(32, 32)):
    convos = [5, 5, 5]
    pools = [2, 2, 2]
    filters = [100, 200, 300]

    tuplify = lambda x: (x, x)
    convos = list(map(tuplify, convos))
    conv_layers = [Convolutional(filter_size=s, num_filters=o, num_channels=i,
                                 name="Conv" + str(n))
                   for s, o, i, n in zip(convos, filters, [3] + filters,
                                         range(1000))]

    pool_layers = [MaxPooling(p) for p in map(tuplify, pools)]

    activations = [Rectifier() for i in convos]

    layers = [i for l in zip(conv_layers, activations, pool_layers) for i in l]

    cnn = ConvolutionalSequence(layers, 3, image_size=image_size, name="cnn",
                                weights_init=Uniform(width=.1),
                                biases_init=Constant(0))
    cnn._push_allocation_config()
    cnn_output = np.prod(cnn.get_dim('output'))

    mlp_size = [cnn_output, 500, 2]
    mlp = MLP([Rectifier(), Softmax()], mlp_size, name="mlp",
              weights_init=Uniform(width=.1), biases_init=Constant(0))

    seq = FeedforwardSequence([net.apply for net in [cnn, Flattener(), mlp]])
    seq.push_initialization_config()
    seq.initialize()
    return seq
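# A minimal usage sketch (assumed, not part of the original snippet): apply
# the returned FeedforwardSequence to a symbolic image batch and build a cost.
# The variable names and the CategoricalCrossEntropy wiring are assumptions.
import theano.tensor as T
from blocks.bricks.cost import CategoricalCrossEntropy

x = T.tensor4('features')            # (batch, 3, 32, 32) image batch
y = T.lmatrix('targets')             # (batch, 1) integer labels in {0, 1}
seq = net_dvc(image_size=(32, 32))
probabilities = seq.apply(x)         # softmax output of the top MLP
cost = CategoricalCrossEntropy().apply(y.flatten(), probabilities)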
def _allocate(self):
    arghs = dict(shape=self.shape, broadcastable=self.broadcastable)
    sequence = []
    if self.batch_normalize:
        sequence.append(Standardization(**arghs))
        sequence.append(SharedScale(weights_init=Constant(1), **arghs))
        sequence.append(SharedShift(biases_init=Constant(0), **arghs))
    sequence.append(self.activation)
    self.sequence = FeedforwardSequence(
        [brick.apply for brick in sequence], name="ffs")
    self.children = [self.sequence]
def build_model(images, labels):

    # Construct a bottom convolutional sequence
    bottom_conv_sequence = convolutional_sequence((3, 3), 16, (160, 160))
    bottom_conv_sequence._push_allocation_config()

    # Flatten layer
    flattener = Flattener()

    # Construct a top MLP
    conv_out_dim = numpy.prod(bottom_conv_sequence.get_dim('output'))
    # top_mlp = MLP([Rectifier(name='non_linear_9'),
    #                Softmax(name='non_linear_11')],
    #               [conv_out_dim, 1024, 10],
    #               weights_init=IsotropicGaussian(), biases_init=Constant(0))
    top_mlp = BatchNormalizedMLP([Rectifier(name='non_linear_9'),
                                  Softmax(name='non_linear_11')],
                                 [conv_out_dim, 1024, 10],
                                 weights_init=IsotropicGaussian(),
                                 biases_init=Constant(0))

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([bottom_conv_sequence.apply,
                                  flattener.apply,
                                  top_mlp.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()

    prediction = ss_seq.apply(images)
    cost_noreg = CategoricalCrossEntropy().apply(labels.flatten(), prediction)

    # add regularization
    selector = Selector([top_mlp])
    Ws = selector.get_parameters('W')
    mlp_brick_name = 'batchnormalizedmlp'
    W0 = Ws['/%s/linear_0.W' % mlp_brick_name]
    W1 = Ws['/%s/linear_1.W' % mlp_brick_name]
    cost = cost_noreg + .01 * (W0 ** 2).mean() + .01 * (W1 ** 2).mean()

    return cost
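# A follow-up sketch (assumed, not from the original source): turn the
# regularised cost returned above into a Blocks training algorithm. The
# symbolic input names and the learning rate are assumptions.
from theano import tensor
from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Scale

images = tensor.tensor4('images')    # (batch, 3, 160, 160) floats
labels = tensor.lmatrix('labels')    # (batch, 1) integer class ids
cost = build_model(images, labels)
cg = ComputationGraph(cost)
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.01))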
def build_model(images, labels):

    vgg = VGG(layer='conv4_4')
    vgg.push_initialization_config()
    vgg.initialize()

    tdb = top_direction_block()
    tdb.push_initialization_config()
    tdb.initialize()

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([vgg.apply, tdb.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()

    prediction = ss_seq.apply(images)
    cost = StructuredCost().apply(labels,
                                  theano.tensor.clip(prediction,
                                                     1e-5, 1 - 1e-5))

    cg = ComputationGraph(cost)
    cg_dropout = apply_dropout(
        cg, [VariableFilter(roles=[OUTPUT])(cg.variables)[0]], .5)
    cost_dropout = cg_dropout.outputs[0]

    # define learned parameters
    selector = Selector([ss_seq])
    W = selector.get_parameters()
    parameters = []
    parameters += [v for k, v in W.items()]

    return cost_dropout, parameters
def build_model(images, labels):

    # Construct a bottom convolutional sequence
    bottom_conv_sequence = convolutional_sequence((3, 3), 64, (150, 150))
    bottom_conv_sequence._push_allocation_config()

    # Flatten layer
    flattener = Flattener()

    # Construct a top MLP
    conv_out_dim = numpy.prod(bottom_conv_sequence.get_dim('output'))
    top_mlp = MLP([LeakyRectifier(name='non_linear_9'),
                   LeakyRectifier(name='non_linear_10'),
                   Softmax(name='non_linear_11')],
                  [conv_out_dim, 2048, 612, 10],
                  weights_init=IsotropicGaussian(),
                  biases_init=Constant(1))

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([bottom_conv_sequence.apply,
                                  flattener.apply,
                                  top_mlp.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()

    prediction = ss_seq.apply(images)
    cost = CategoricalCrossEntropy().apply(labels.flatten(), prediction)

    return cost
def construct_model(task, patch_shape, initargs, n_channels, n_spatial_dims,
                    hidden_dim, batch_normalize, hyperparameters,
                    patch_cnn_spec=None, patch_mlp_spec=None,
                    prefork_area_mlp_spec=[], postmerge_area_mlp_spec=[],
                    response_mlp_spec=[], **kwargs):
    patch_transforms = []
    if patch_cnn_spec:
        patch_transforms.append(masonry.construct_cnn(
            name="patch_cnn",
            layer_specs=patch_cnn_spec,
            input_shape=patch_shape,
            n_channels=n_channels,
            batch_normalize=batch_normalize).apply)
        shape = patch_transforms[-1].brick.get_dim("output")
    else:
        shape = (n_channels,) + tuple(patch_shape)
    patch_transforms.append(
        masonry.FeedforwardFlattener(input_shape=shape).apply)
    if patch_mlp_spec:
        patch_transforms.append(masonry.construct_mlp(
            name="patch_mlp",
            hidden_dims=patch_mlp_spec,
            input_dim=patch_transforms[-1].brick.output_dim,
            batch_normalize=batch_normalize,
            initargs=initargs).apply)
    patch_transform = FeedforwardSequence(patch_transforms, name="ffs")

    prefork_area_transform = masonry.construct_mlp(
        name="prefork_area_mlp",
        input_dim=hidden_dim,
        hidden_dims=prefork_area_mlp_spec,
        batch_normalize=batch_normalize,
        initargs=initargs)

    postmerge_area_transform = masonry.construct_mlp(
        name="postmerge_area_mlp",
        input_dim=2 * n_spatial_dims,
        hidden_dims=postmerge_area_mlp_spec,
        batch_normalize=batch_normalize,
        initargs=initargs)

    # LSTM requires the input to have dim=4*hidden_dim
    response_mlp_spec.append(4 * hidden_dim)
    response_transform = masonry.construct_mlp(
        name="response_mlp",
        hidden_dims=response_mlp_spec[1:],
        input_dim=response_mlp_spec[0],
        batch_normalize=batch_normalize,
        initargs=initargs)

    emitter = task.get_emitter(**hyperparameters)

    return Ram(patch_transform=patch_transform.apply,
               prefork_area_transform=prefork_area_transform.apply,
               postmerge_area_transform=postmerge_area_transform.apply,
               response_transform=response_transform.apply,
               emitter=emitter,
               **hyperparameters)
def __init__(self, visual_dim, textual_dim, output_dim, hidden_size,
             init_ranges, **kwargs):
    (visual_range, textual_range, linear_range_1, linear_range_2,
     linear_range_3) = init_ranges
    visual_layer = FeedforwardSequence([
        BatchNormalization(input_dim=visual_dim).apply,
        LinearMaxout(
            input_dim=visual_dim,
            output_dim=hidden_size,
            weights_init=initialization.Uniform(width=visual_range),
            use_bias=False,
            biases_init=initialization.Constant(0),
            num_pieces=2).apply],
        name='visual_layer')
    textual_layer = FeedforwardSequence([
        BatchNormalization(input_dim=textual_dim).apply,
        LinearMaxout(
            input_dim=textual_dim,
            output_dim=hidden_size,
            weights_init=initialization.Uniform(width=textual_range),
            biases_init=initialization.Constant(0),
            use_bias=False,
            num_pieces=2).apply],
        name='textual_layer')
    logistic_mlp = MLPGenreClassifier(
        hidden_size, output_dim, hidden_size,
        [linear_range_1, linear_range_2, linear_range_3])
    # logistic_mlp = Sequence([
    #     BatchNormalization(input_dim=hidden_size, name='bn1').apply,
    #     Linear(hidden_size, output_dim, name='linear_output', use_bias=False,
    #            weights_init=initialization.Uniform(
    #                width=linear_range_1)).apply,
    #     Logistic().apply
    # ], name='logistic_mlp')

    children = [visual_layer, textual_layer, logistic_mlp]
    kwargs.setdefault('use_bias', False)
    kwargs.setdefault('children', children)
    super(LinearSumClassifier, self).__init__(**kwargs)
class NormalizedActivation(Initializable, Feedforward):
    @lazy(allocation="shape broadcastable".split())
    def __init__(self, shape, broadcastable, activation=None,
                 batch_normalize=False, **kwargs):
        super(NormalizedActivation, self).__init__(**kwargs)
        self.shape = shape
        self.broadcastable = broadcastable
        self.activation = activation or Rectifier()
        self.batch_normalize = batch_normalize

    @property
    def broadcastable(self):
        return self._broadcastable or [False] * len(self.shape)

    @broadcastable.setter
    def broadcastable(self, broadcastable):
        self._broadcastable = broadcastable

    def _allocate(self):
        arghs = dict(shape=self.shape, broadcastable=self.broadcastable)
        sequence = []
        if self.batch_normalize:
            sequence.append(Standardization(**arghs))
            sequence.append(SharedScale(weights_init=Constant(1), **arghs))
            sequence.append(SharedShift(biases_init=Constant(0), **arghs))
        sequence.append(self.activation)
        self.sequence = FeedforwardSequence(
            [brick.apply for brick in sequence], name="ffs")
        self.children = [self.sequence]

    @application(inputs=["input_"], outputs=["output"])
    def apply(self, input_):
        return self.sequence.apply(input_)

    def get_dim(self, name):
        try:
            # look up the requested dimension by name
            return dict(input_=self.shape, output=self.shape)[name]
        except KeyError:
            return super(NormalizedActivation, self).get_dim(name)
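# Hypothetical usage of the brick above (SharedScale, SharedShift and
# Standardization are project-specific bricks, so this sketch only exercises
# the plain, non-batch-normalized path; all names below are assumptions):
import theano.tensor as T

act = NormalizedActivation(shape=(256,), broadcastable=[False],
                           batch_normalize=False)
act.initialize()
h = act.apply(T.matrix('h'))   # output shape equals input shape, per get_dim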
def build_fork_lookup(vocab_size, args):
    x = tensor.lmatrix('features')
    virtual_dim = 6
    time_length = 5
    mini_batch_size = 2
    skip_connections = True
    layers = 3

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    print output_names
    print output_dims

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)
    fork.initialize()

    f = theano.function([x], pre_rnn)

    return f
def build_model_hard(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]
    for i in range(layers - 1):
        mlp = MLP(activations=[Logistic()], dims=[2 * state_dim, 1],
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(HardGatedRecurrent(dim=state_dim,
                                              mlp=mlp,
                                              activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have correctly:
    # h = [state_1, state_2, state_3 ...]

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
def get_prernn(args):

    # time x batch
    x_mask = tensor.fmatrix('mask')

    # Compute the state dim
    if args.rnn_type == 'lstm':
        state_dim = 4 * args.state_dim
    else:
        state_dim = args.state_dim

    # Prepare the arguments for the fork
    output_names = []
    output_dims = []
    for d in range(args.layers):
        if d > 0:
            suffix = RECURRENTSTACK_SEPARATOR + str(d)
        else:
            suffix = ''
        if d == 0 or args.skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    # Prepare the brick to be forked (LookupTable or Linear)
    # Check if the dataset provides indices (in the case of a
    # fixed vocabulary, x is 2D tensor) or if it gives raw values
    # (x is 3D tensor)
    if has_indices(args.dataset):
        features = args.mini_batch_size
        x = tensor.lmatrix('features')
        vocab_size = get_output_size(args.dataset)
        lookup = LookupTable(length=vocab_size, dim=state_dim)
        lookup.weights_init = initialization.IsotropicGaussian(0.1)
        lookup.biases_init = initialization.Constant(0)
        forked = FeedforwardSequence([lookup.apply])
        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x, dtype=floatX)

    else:
        x = tensor.tensor3('features', dtype=floatX)
        if args.used_inputs is not None:
            x = tensor.set_subtensor(
                x[args.used_inputs:, :, :],
                tensor.zeros_like(x[args.used_inputs:, :, :], dtype=floatX))
        features = get_output_size(args.dataset)
        forked = Linear(input_dim=features, output_dim=state_dim)
        forked.weights_init = initialization.IsotropicGaussian(0.1)
        forked.biases_init = initialization.Constant(0)

        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x[:, :, 0], dtype=floatX)

    # Define the fork
    fork = Fork(output_names=output_names, input_dim=features,
                output_dims=output_dims,
                prototype=forked)
    fork.initialize()

    # Apply the fork
    prernn = fork.apply(x)

    # Give a name to the input of each layer
    if args.skip_connections:
        for t in range(len(prernn)):
            prernn[t].name = "pre_rnn_" + str(t)
    else:
        prernn.name = "pre_rnn"

    return prernn, x_mask
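# Hypothetical example of the argument namespace get_prernn reads from.
# The field names come from the snippet above; the values and the dataset
# name are assumptions, and the dataset helper functions (has_indices,
# get_output_size, has_mask) are part of the surrounding project.
from argparse import Namespace

args = Namespace(rnn_type='lstm', state_dim=100, layers=2,
                 skip_connections=True, mini_batch_size=32,
                 used_inputs=None, dataset='wikipedia')
# prernn, x_mask = get_prernn(args)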
def build_fork_lookup(vocab_size, time_length, args):
    x = tensor.lmatrix('features')
    virtual_dim = 6
    state_dim = 6
    skip_connections = False
    layers = 1

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Note that this order of the periods makes faster modules flow into
    # slower ones, which is the opposite of the original paper
    transitions = [ClockworkBase(dim=state_dim, activation=Tanh(),
                                 period=2 ** i) for i in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give time as the first index for each element in the list:
    # (Time X Batch X embedding_dim)
    if layers > 1 and skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t] = pre_rnn[t].dimshuffle(1, 0, 2)
    else:
        pre_rnn = pre_rnn.dimshuffle(1, 0, 2)

    f_pre_rnn = theano.function([x], pre_rnn)

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            if skip_connections:
                kwargs['inputs' + suffix] = pre_rnn[d]
            else:
                kwargs['inputs' + suffix] = pre_rnn

    print kwargs

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    fork.initialize()
    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    f_h = theano.function([x], h)
    return f_pre_rnn, f_h
def build_model(images, labels):

    vgg = VGG(layer='conv3_4')
    vgg.push_initialization_config()
    vgg.initialize()

    sb = SubstractBatch()

    # Construct a bottom convolutional sequence
    layers = [
        Convolutional(filter_size=(3, 3), num_filters=100, use_bias=True,
                      tied_biases=True, name='final_conv0'),
        BatchNormalization(name='batchnorm_1'),
        Rectifier(name='final_conv0_act'),
        Convolutional(filter_size=(3, 3), num_filters=100, use_bias=True,
                      tied_biases=True, name='final_conv1'),
        BatchNormalization(name='batchnorm_2'),
        Rectifier(name='final_conv1_act'),
        MaxPooling(pooling_size=(2, 2), name='maxpool_final')
    ]
    bottom_conv_sequence = ConvolutionalSequence(
        layers, num_channels=256, image_size=(40, 40),
        biases_init=Constant(0.), weights_init=IsotropicGaussian(0.01))
    bottom_conv_sequence._push_allocation_config()

    # Flatten layer
    flattener = Flattener()

    # Construct a top MLP
    conv_out_dim = numpy.prod(bottom_conv_sequence.get_dim('output'))
    print 'dim output conv:', bottom_conv_sequence.get_dim('output')
    # conv_out_dim = 20 * 40 * 40
    top_mlp = BatchNormalizedMLP(
        [Rectifier(name='non_linear_9'), Softmax(name='non_linear_11')],
        [conv_out_dim, 1024, 10],
        weights_init=IsotropicGaussian(), biases_init=Constant(0))

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([vgg.apply, bottom_conv_sequence.apply,
                                  flattener.apply, top_mlp.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()

    prediction = ss_seq.apply(images)
    cost_noreg = CategoricalCrossEntropy().apply(labels.flatten(), prediction)

    # add regularization
    selector = Selector([top_mlp])
    Ws = selector.get_parameters('W')
    mlp_brick_name = 'batchnormalizedmlp'
    W0 = Ws['/%s/linear_0.W' % mlp_brick_name]
    W1 = Ws['/%s/linear_1.W' % mlp_brick_name]
    cost = cost_noreg + .0001 * (W0 ** 2).sum() + .001 * (W1 ** 2).sum()

    # define learned parameters
    selector = Selector([ss_seq])
    Ws = selector.get_parameters('W')
    bs = selector.get_parameters('b')
    BNSCs = selector.get_parameters('batch_norm_scale')
    BNSHs = selector.get_parameters('batch_norm_shift')

    parameters_top = []
    parameters_top += [v for k, v in Ws.items()]
    parameters_top += [v for k, v in bs.items()]
    parameters_top += [v for k, v in BNSCs.items()]
    parameters_top += [v for k, v in BNSHs.items()]

    selector = Selector([vgg])
    convs = selector.get_parameters()

    parameters_all = []
    parameters_all += parameters_top
    parameters_all += [v for k, v in convs.items()]

    return cost, [parameters_top, parameters_all]
def build_model_lstm(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    virtual_dim = 4 * state_dim

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    # Make sure time_length is what we need
    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [LSTM(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    init_cells = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        init_cells[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='cell0_%d' % d)
        kwargs['states' + suffix] = init_states[d]
        kwargs['cells' + suffix] = init_cells[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]
    last_states = {}
    last_cells = {}
    for d in range(layers):
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

    # The updates of the hidden states and cells
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))
        updates.append((init_cells[d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]
    # Extract the values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {"in_gates": in_gates,
                   "forget_gates": forget_gates,
                   "out_gates": out_gates}

    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = [state] if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if layers > 1:
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state"

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    # Dont initialize as Orthogonal if we are about to load new parameters
    if args.load_path is not None:
        rnn.weights_init = initialization.Constant(0)
    else:
        rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
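# Hypothetical training hook-up (not from the original source; vocab_size and
# args are assumed to come from the surrounding script): the per-layer
# state/cell carry-over returned in `updates` has to be registered with the
# training algorithm so the hidden state persists across mini-batches.
from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Adam

cost, cross_entropy, updates, gate_values = build_model_lstm(vocab_size, args)
cg = ComputationGraph(cost)
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Adam())
algorithm.add_updates(updates)   # keep init_states/init_cells in sync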