def visualize_states(hidden_states, updates, train_stream, valid_stream, args):

    # Get all the hidden_states
    filter_states = VariableFilter(theano_name_regex="hidden_state_.*")
    all_states = filter_states(hidden_states)
    all_states = sorted(all_states, key=lambda var: var.name[-1])

    # Get all the hidden_cells
    filter_cells = VariableFilter(theano_name_regex="hidden_cells_.*")
    all_cells = filter_cells(hidden_states)
    all_cells = sorted(all_cells, key=lambda var: var.name[-1])

    # Handle the theano shared variables that allow carrying the hidden state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           not (has_indices(args.dataset)))

    # Compile the function
    logger.info("The compilation of the function has started")
    if args.rnn_type == "lstm" and args.visualize_cells:
        compiled = theano.function(inputs=ComputationGraph(all_cells).inputs,
                                   outputs=all_cells,
                                   givens=givens,
                                   updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))
    else:
        compiled = theano.function(inputs=ComputationGraph(all_states).inputs,
                                   outputs=all_states,
                                   givens=givens,
                                   updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))

    # Plot the hidden states
    plot("hidden_state", train_stream, compiled, args)
Example #2
def training(repo, learning_rate, batch_size, filenames):

    print('LOAD DATA')
    (x_train,
     y_train), (x_valid,
                y_valid), (x_test,
                           y_test) = load_datasets_mnist(repo, filenames)

    print('BUILD MODEL')
    train_f, valid_f, test_f, model, fisher, params = build_training()
    x_train = x_train[:1000]
    y_train = y_train[:1000]

    x = T.tensor4()
    y = T.imatrix()
    output = model.apply(x)
    output = output.reshape(
        (x.shape[0],
         model.get_dim('output')))  # TODO: get_dim('name') for Architecture
    cost = Softmax().categorical_cross_entropy(y.flatten(), output).mean()
    cg = ComputationGraph(cost)

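    # Collect the symbolic inputs and outputs of every Convolutional and
    # Linear brick; the gradient below is taken w.r.t. the conv outputs.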
    inputs_conv = VariableFilter(roles=[INPUT], bricks=[Convolutional])(cg)
    outputs_conv = VariableFilter(roles=[OUTPUT], bricks=[Convolutional])(cg)
    inputs_fully = VariableFilter(roles=[INPUT], bricks=[Linear])(cg)
    outputs_fully = VariableFilter(roles=[OUTPUT], bricks=[Linear])(cg)
    dico = OrderedDict([('conv_output', outputs_conv[0])])
    [grad_s] = T.grad(cost, outputs_conv)
    dico['conv_output'] = grad_s

    f = theano.function([x, y],
                        grad_s,
                        allow_input_downcast=True,
                        on_unused_input='ignore')
    print(np.mean(f(x_train[:10], y_train[:10])))
Example #3
    def analyze(self, inputs, groundtruth, prediction):
        """Compute cost and alignment."""
        if not hasattr(self, "_analyze"):
            input_variables = list(self.single_inputs.values())
            input_variables.append(self.single_labels)
            input_variables.append(self.single_predicted_labels)

            cg = self.get_cost_graph(batch=False, use_prediction=True)
            costs = cg.outputs[0]

            weights, = VariableFilter(
                bricks=[self.generator], name="weights")(cg)
            energies = VariableFilter(
                bricks=[self.generator], name="energies")(cg)
            energies_output = [energies[0][:, 0, :] if energies
                               else tensor.zeros_like(weights)]

            self._analyze = theano.function(
                input_variables,
                [costs[0], weights[:, 0, :]] + energies_output,
                on_unused_input='warn')

        input_values_dict = dict(inputs)
        input_values_dict['labels'] = groundtruth
        input_values_dict['predicted_labels'] = prediction
        return self._analyze(**input_values_dict)
Example #4
def build_mlp(features_int, features_cat, labels, labels_mean):

    inputs = tensor.concatenate([features_int, features_cat], axis=1)

    mlp = MLP(activations=[Rectifier(),
                           Rectifier(),
                           Rectifier(), None],
              dims=[337, 800, 1200, 1],
              weights_init=IsotropicGaussian(),
              biases_init=Constant(1))
    mlp.initialize()

    prediction = mlp.apply(inputs)
    cost = MAPECost().apply(prediction, labels, labels_mean)

    cg = ComputationGraph(cost)
    #cg_dropout0   = apply_dropout(cg, [VariableFilter(roles=[INPUT])(cg.variables)[1]], .2)
    cg_dropout1 = apply_dropout(cg, [
        VariableFilter(roles=[OUTPUT])(cg.variables)[1],
        VariableFilter(roles=[OUTPUT])(cg.variables)[3],
        VariableFilter(roles=[OUTPUT])(cg.variables)[5]
    ], .2)
    cost_dropout1 = cg_dropout1.outputs[0]

    return cost_dropout1, cg_dropout1.parameters, cost
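
The hard-coded indices [1], [3] and [5] above appear to pick out the three
Rectifier outputs by position. A positionally-robust alternative, sketched
with the names already in scope in build_mlp (VariableFilter accepts brick
classes as well as instances):

rectifier_outputs = VariableFilter(roles=[OUTPUT],
                                   bricks=[Rectifier])(cg.variables)
cg_dropout_alt = apply_dropout(cg, rectifier_outputs, .2)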
Example #5
    def __init__(self, samples):
        # Extracting information from the sampling computation graph
        self.cg = ComputationGraph(samples)
        self.inputs = self.cg.inputs
        self.generator = get_brick(samples)
        if not isinstance(self.generator, BaseSequenceGenerator):
            raise ValueError
        self.generate_call = get_application_call(samples)
        if self.generate_call.application != self.generator.generate:
            raise ValueError
        self.inner_cg = ComputationGraph(self.generate_call.inner_outputs)

        # Fetching names from the sequence generator
        self.context_names = self.generator.generate.contexts
        self.state_names = self.generator.generate.states

        # Parsing the inner computation graph of sampling scan
        self.contexts = [
            VariableFilter(bricks=[self.generator], name=name,
                           roles=[INPUT])(self.inner_cg)[0]
            for name in self.context_names
        ]
        self.input_states = []
        # Includes only those state names that were actually used
        # in 'generate'
        self.input_state_names = []
        for name in self.generator.generate.states:
            var = VariableFilter(bricks=[self.generator],
                                 name=name,
                                 roles=[INPUT])(self.inner_cg)
            if var:
                self.input_state_names.append(name)
                self.input_states.append(var[0])

        self.compiled = False
Example #6
 def analyze(self, recording, transcription):
     """Compute cost and aligment for a recording/transcription pair."""
     if not hasattr(self, "_analyze"):
         cost = self.get_cost_graph(batch=False)
         cg = ComputationGraph(cost)
         energies = VariableFilter(bricks=[self.generator],
                                   name="energies")(cg)
         energies_output = [
             energies[0][:, 0, :] if energies else tensor.zeros(
                 (self.single_transcription.shape[0],
                  self.single_recording.shape[0]))
         ]
         states, = VariableFilter(applications=[self.encoder.apply],
                                  roles=[OUTPUT],
                                  name="encoded")(cg)
         ctc_matrix_output = []
         # Temporarily disabled for compatibility with LM code
         # if len(self.generator.readout.source_names) == 1:
         #    ctc_matrix_output = [
         #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]
         weights, = VariableFilter(bricks=[self.generator],
                                   name="weights")(cg)
         self._analyze = theano.function(
             [self.single_recording, self.single_transcription],
             [cost[:, 0], weights[:, 0, :]] + energies_output +
             ctc_matrix_output)
     return self._analyze(recording, transcription)
Example #7
 def analyze(self, recording, groundtruth, prediction=None):
     """Compute cost and aligment."""
     input_values = [recording, groundtruth]
     if prediction is not None:
         input_values.append(prediction)
     if not hasattr(self, "_analyze"):
         input_variables = [self.single_recording, self.single_transcription]
         prediction_variable = tensor.lvector('prediction')
         if prediction is not None:
             input_variables.append(prediction_variable)
             cg = self.get_cost_graph(
                 batch=False, prediction=prediction_variable[:, None])
         else:
             cg = self.get_cost_graph(batch=False)
         cost = cg.outputs[0]
         energies = VariableFilter(
             bricks=[self.generator], name="energies")(cg)
         energies_output = [energies[0][:, 0, :] if energies
                            else tensor.zeros((self.single_transcription.shape[0],
                                               self.single_recording.shape[0]))]
         states, = VariableFilter(
             applications=[self.encoder.apply], roles=[OUTPUT],
             name="encoded")(cg)
         ctc_matrix_output = []
         # Temporarily disabled for compatibility with LM code
         # if len(self.generator.readout.source_names) == 1:
         #    ctc_matrix_output = [
         #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]
         weights, = VariableFilter(
             bricks=[self.generator], name="weights")(cg)
         self._analyze = theano.function(
             input_variables,
             [cost[:, 0], weights[:, 0, :]] + energies_output + ctc_matrix_output,
             on_unused_input='warn')
     return self._analyze(*input_values)
Example #8
def getParams(model, tensor):
    x = T.tensor4()
    cost = model.apply(tensor).sum()
    cg = ComputationGraph(cost)
    W = VariableFilter(roles=[WEIGHT])(cg.variables)
    B = VariableFilter(roles=[BIAS])(cg.variables)
    return W + B
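
A hypothetical usage of getParams above (the brick and its dimensions are
illustrative): the returned list holds the weight matrices followed by the
bias vectors.

import theano.tensor as T
from blocks.bricks import MLP, Rectifier
from blocks.initialization import Constant, IsotropicGaussian

x = T.matrix('x')
mlp = MLP(activations=[Rectifier(), None], dims=[10, 20, 2],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.initialize()
params = getParams(mlp, x)  # weights first, then biases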
Example #9
def load_models(net, model_path=save_path, in_size=len(input_columns),
                out_size=len(output_columns) - 1 if cost_mode == 'RL-MDN' else len(output_columns),
                hidden_size=hidden_size, num_recurrent_layers=num_recurrent_layers, model=layer_models[0]):
    
    initials = []
    if not os.path.isfile(model_path):
        print('Could not find model file.')
        sys.exit(1)
    print('Loading model from {0}...'.format(model_path))
    x = tensor.tensor3('features', dtype=theano.config.floatX)
    y = tensor.tensor3('targets', dtype='floatX')
    train_flag = [theano.shared(0)]
    
    latent_size = net.get_size() # latent_size
    
    in_size = latent_size + len(input_columns)
    y_hat, cost, cells = nn_fprop(x, y, in_size, out_size, hidden_size, num_recurrent_layers, train_flag)
    main_loop = MainLoop(algorithm=None, data_stream=None, model=Model(cost),
                         extensions=[saveload.Load(model_path)])
    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')
    bin_model = main_loop.model
    print('Model loaded. Building prediction function...')
    hiddens = []
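    # Recover each recurrent layer's state (and LSTM cell) variables by
    # their auto-generated Theano names, plus the learned initial states.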
    for i in range(num_recurrent_layers):
        brick = [b for b in bin_model.get_top_bricks() if b.name == layer_models[i] + str(i)][0]
        hiddens.extend(VariableFilter(theano_name=brick.name + '_apply_states')(bin_model.variables))
        hiddens.extend(VariableFilter(theano_name=brick.name + '_apply_cells')(cells))
        initials.extend(VariableFilter(roles=[roles.INITIAL_STATE])(brick.parameters))
    predict_func = theano.function([x], hiddens + [y_hat])
    encoder, code_size = load_encoder(net)
    return predict_func, initials, encoder, code_size
Example #10
def build_dictionnary(cost):
    cg = ComputationGraph(cost)

    inputs_conv = VariableFilter(roles=[INPUT], bricks=[Convolutional])(cg)
    outputs_conv = VariableFilter(roles=[OUTPUT], bricks=[Convolutional])(cg)
    inputs_fully = VariableFilter(roles=[INPUT], bricks=[Linear])(cg)
    outputs_fully = VariableFilter(roles=[OUTPUT], bricks=[Linear])(cg)
    grad_conv = T.grad(cost, outputs_conv)
    grad_fully = T.grad(cost, outputs_fully)

    items = []
    for i, var_in, grad_out in zip(range(len(inputs_conv)), inputs_conv,
                                   grad_conv):
        items.append(('conv_input_' + str(i), var_in))
        items.append(('conv_output_' + str(i), grad_out))

    for i, var_in, grad_out in zip(range(len(inputs_fully)), inputs_fully,
                                   grad_fully):
        var_input = T.concatenate(
            [var_in, T.ones((var_in.shape[0], 1))], axis=1)
        items.append(('fully_input_' + str(i), var_input))
        items.append(('fully_output_' + str(i), grad_out))

    dico = OrderedDict(items)

    return dico
Example #11
def build_tab_equiv(model):
    x = T.tensor4('x')
    y = T.imatrix()
    y_prev = model.apply(x)
    cg = ComputationGraph(T.sum(y_prev))

    weight_fully = VariableFilter(roles=[WEIGHT], bricks=[Linear])(cg)
    weight_conv = VariableFilter(roles=[WEIGHT], bricks=[Convolutional])(cg)

    dico = {}
    index = 0
    for w_fully in weight_fully[::-1]:
        dico[w_fully.name] = [
            'fully_input_' + str(index), 'fully_output_' + str(index)
        ]
        index += 1

    index = 0
    for w_conv in weight_conv[::-1]:
        dico[w_conv.name] = [
            'conv_input_' + str(index), 'conv_output_' + str(index)
        ]
        index += 1

    return dico
Example #12
 def _create_model(with_dropout):
     cg = ComputationGraph(gan.compute_losses(x, z))
     if with_dropout:
         inputs = VariableFilter(bricks=gan.discriminator.children[1:],
                                 roles=[INPUT])(cg.variables)
         cg = apply_dropout(cg, inputs, 0.5)
         inputs = VariableFilter(bricks=[gan.discriminator],
                                 roles=[INPUT])(cg.variables)
         cg = apply_dropout(cg, inputs, 0.2)
     return Model(cg.outputs)
Example #13
 def _compile_next_state_computer(self):
     next_states = [VariableFilter(bricks=[self.generator],
                                   name=name,
                                   roles=[OUTPUT])(self.inner_cg)[-1]
                    for name in self.state_names]
     next_outputs = VariableFilter(
         applications=[self.generator.readout.emit], roles=[OUTPUT])(
             self.inner_cg.variables)
     self.next_state_computer = function(
         self.contexts + self.input_states + next_outputs, next_states)
Example #14
def test_fully_layer():
    batch_size = 2
    x = T.tensor4()
    y = T.ivector()
    V = 200
    layer_conv = Convolutional(filter_size=(5, 5), num_filters=V,
                               name="toto",
                               weights_init=IsotropicGaussian(0.01),
                               biases_init=Constant(0.0))
    # try with no bias
    activation = Rectifier()
    pool = MaxPooling(pooling_size=(2, 2))

    convnet = ConvolutionalSequence([layer_conv, activation, pool],
                                    num_channels=15,
                                    image_size=(10, 10),
                                    name="conv_section")
    convnet.push_allocation_config()
    convnet.initialize()
    output = convnet.apply(x)
    batch_size = output.shape[0]
    output_dim = np.prod(convnet.get_dim('output'))
    result_conv = output.reshape((batch_size, output_dim))
    mlp = MLP(activations=[Rectifier().apply], dims=[output_dim, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0.0))
    mlp.initialize()
    output = mlp.apply(result_conv)
    cost = T.mean(Softmax().categorical_cross_entropy(y.flatten(), output))
    cg = ComputationGraph(cost)
    W = VariableFilter(roles=[WEIGHT])(cg.variables)
    B = VariableFilter(roles=[BIAS])(cg.variables)
    W = W[0]
    b = B[0]

    inputs_fully = VariableFilter(roles=[INPUT], bricks=[Linear])(cg)
    outputs_fully = VariableFilter(roles=[OUTPUT], bricks=[Linear])(cg)
    var_input = inputs_fully[0]
    var_output = outputs_fully[0]

    [d_W, d_S, d_b] = T.grad(cost, [W, var_output, b])

    d_b = d_b.dimshuffle(('x', 0))
    d_p = T.concatenate([d_W, d_b], axis=0)
    x_value = 1e3 * np.random.ranf((2, 15, 10, 10))
    f = theano.function([x, y], [var_input, d_S, d_p],
                        allow_input_downcast=True, on_unused_input='ignore')
    A, B, C = f(x_value, [5, 0])
    A = np.concatenate([A, np.ones((2, 1))], axis=1)
    print('A', A.shape)
    print('B', B.shape)
    print('C', C.shape)

    print(lin.norm(C - np.dot(np.transpose(A), B), 'fro'))

    return
Example #15
 def do(self, which_callback, *args, **kwargs):
     if which_callback == 'before_training':
         cg = ComputationGraph(self.main_loop.algorithm.total_step_norm)
         self._learning_rate_var, = VariableFilter(
             theano_name='learning_rate')(cg)
         logger.debug("Annealing extension is initialized")
     elif which_callback == 'after_epoch':
         logger.debug("Annealing the learning rate to {}".format(
             self._annealing_learning_rate))
         self._learning_rate_var.set_value(self._annealing_learning_rate)
     else:
         raise ValueError("don't know what to do")
Example #16
 def _compile_initial_state_and_context_computer(self):
     initial_states = VariableFilter(
                         applications=[self.generator.initial_states],
                         roles=[OUTPUT])(self.cg)
     outputs = OrderedDict([(v.tag.name, v) for v in initial_states])
     beam_size = unpack(VariableFilter(
                         applications=[self.generator.initial_states],
                         name='batch_size')(self.cg))
     for name, context in equizip(self.context_names, self.contexts):
         outputs[name] = context
     outputs['beam_size'] = beam_size
     self.initial_state_and_context_computer = function(
         self.inputs, outputs, on_unused_input='ignore')
Example #17
    def create_act_table(self, save_to, act_table):
        batch_size = 500
        image_size = (28, 28)
        output_size = 10
        convnet = create_lenet_5()
        layers = convnet.layers

        x = tensor.tensor4('features')
        y = tensor.lmatrix('targets')

        # Normalize input and apply the convnet
        probs = convnet.apply(x)
        cg = ComputationGraph([probs])

        def full_brick_name(brick):
            return '/'.join([''] + [b.name for b in brick.get_unique_path()])

        # Find layer outputs to probe
        outmap = OrderedDict(
            (full_brick_name(get_brick(out)), out) for out in VariableFilter(
                roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables))
        # Generate pics for biases
        biases = VariableFilter(roles=[BIAS])(cg.parameters)

        # Generate parallel array, in the same order, for outputs
        outs = [outmap[full_brick_name(get_brick(b))] for b in biases]

        # Figure work count
        error_rate = (MisclassificationRate().apply(
            y.flatten(), probs).copy(name='error_rate'))
        max_activation_table = (MaxActivationTable().apply(outs).copy(
            name='max_activation_table'))
        max_activation_table.tag.aggregation_scheme = (
            Concatenate(max_activation_table))

        model = Model([error_rate, max_activation_table])

        # Load it with trained parameters
        params = load_parameters(open(save_to, 'rb'))
        model.set_parameter_values(params)

        mnist_test_stream = DataStream.default_stream(
            self.mnist_test,
            iteration_scheme=SequentialScheme(self.mnist_test.num_examples,
                                              batch_size))

        evaluator = DatasetEvaluator([error_rate, max_activation_table])
        results = evaluator.evaluate(mnist_test_stream)
        table = results['max_activation_table']
        pickle.dump(table, open(act_table, 'wb'))
        return table
Example #18
 def _compile_next_state_computer(self):
     next_states = [VariableFilter(bricks=[self.generator],
                                   name=name,
                                   roles=[OUTPUT])(self.inner_cg)[-1]
                    for name in self.state_names]
     next_outputs = VariableFilter(
         applications=[self.generator.readout.emit], roles=[OUTPUT])(
             self.inner_cg.variables)
     self.next_state_computer = function(
         self.contexts + self.input_states + next_outputs, next_states,
         # This is temporarily required because `lm_logprobs` is a weird
         # state which is not used to compute next state, but used to
         # compute the next output.
         on_unused_input='ignore')
Example #19
 def _compile_next_state_computer(self, givens):
     """Modified version of ``BeamSearch._compile_next_state_computer``
     with ``givens``.
     """
     next_states = [VariableFilter(bricks=[beam_search.generator],
                                   name=name,
                                   roles=[OUTPUT])(beam_search.inner_cg)[-1]
                    for name in beam_search.state_names]
     next_outputs = VariableFilter(
         applications=[beam_search.generator.readout.emit], roles=[OUTPUT])(
             beam_search.inner_cg.variables)
     self.next_state_computer = function(
         [self.src_indices] + beam_search.input_states + next_outputs, 
         next_states,
         givens=givens)
Example #20
    def __init__(self, inputs, cg, reward_emitter, data, **kwargs):
        self.input_accumulator = shared_floatx_zeros((2, 2), dtype='int64')
        self.gain_accumulator = shared_floatx_zeros((2, 2, 2))
        self.reward_accumulator = shared_floatx_zeros((2, 2, 2), dtype='int64')
        self.dataset = data.get_dataset('train')
        self.inputs = inputs

        self.gains, = VariableFilter(applications=[reward_emitter.cost],
                                     roles=[INPUT],
                                     name='readouts')(cg.variables)
        self.reward, = VariableFilter(theano_name=reward_emitter.GAIN_MATRIX)(
            cg.variables)
        kwargs.setdefault('before_training', True)
        kwargs.setdefault('after_batch', True)
        super(LogInputsGains, self).__init__(**kwargs)
Example #21
 def _create_model(with_dropout):
     cg = ComputationGraph(ali.compute_losses(x, z))
     if with_dropout:
         inputs = VariableFilter(
             bricks=([ali.discriminator.x_discriminator.layers[0],
                      ali.discriminator.z_discriminator.layers[0]]),
             roles=[INPUT])(cg.variables)
         cg = apply_dropout(cg, inputs, 0.2)
         inputs = VariableFilter(
             bricks=(ali.discriminator.x_discriminator.layers[2::3] +
                     ali.discriminator.z_discriminator.layers[2::2] +
                     ali.discriminator.joint_discriminator.layers[::2]),
             roles=[INPUT])(cg.variables)
         cg = apply_dropout(cg, inputs, 0.5)
     return Model(cg.outputs)
Example #22
def setup_model(p):
    ladder = LadderAE(p)
    # Setup inputs
    input_type = TensorType('float32',
                            [False] * (len(p.encoder_layers[0]) + 1))
    x_only = input_type('features_unlabeled')
    if debug:
        x_only.tag.test_value = numpy.random.normal(
            size=(p.batch_size, ) + p.encoder_layers[0]).astype(floatX)
    x = input_type('features_labeled')
    if debug:
        x.tag.test_value = numpy.random.normal(
            size=(p.batch_size, ) + p.encoder_layers[0]).astype(floatX)
    y = theano.tensor.lvector('targets_labeled')
    if debug:
        y.tag.test_value = numpy.random.randint(1,
                                                int(p.encoder_layers[-1]) + 1,
                                                (p.batch_size))
    ladder.apply(x, y, x_only)

    # Load parameters if requested
    if p.get('load_from'):
        with open(p.load_from + '/trained_params.npz') as f:
            loaded = numpy.load(f)
            cg = ComputationGraph([ladder.costs.total])
            current_params = VariableFilter(roles=[PARAMETER])(cg.variables)
            logger.info('Loading parameters: %s' % ', '.join(loaded.keys()))
            for param in current_params:
                assert param.get_value().shape == loaded[param.name].shape
                param.set_value(loaded[param.name])

    return ladder
Example #23
    def primal_step(self, x, y, learning_rate, alpha, beta, input_dim, p):

        mlp, cost = self.create_model(x, y, input_dim, p)
        cg = ComputationGraph([cost])
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        updates = Adam(cost, weights, y, alpha, beta)
        return mlp, updates, -1 * cost
Example #24
def build_mlp(features_car_cat, features_car_int, features_nocar_cat,
              features_nocar_int, features_cp, features_hascar, means, labels):

    prediction, _, _, _, = \
            build_mlp_onlyloc(features_car_cat, features_car_int,
                              features_nocar_cat, features_nocar_int, features_cp, features_hascar,
                              means, labels)

    mlp_crm = MLP(activations=[None],
                  dims=[1, 1],
                  weights_init=IsotropicGaussian(.1),
                  biases_init=Constant(0),
                  name='mlp_crm')
    mlp_crm.initialize()
    crm = features_nocar_int[:, 0][:, None]

    prediction = prediction * mlp_crm.apply(crm)

    cost = MAPECost().apply(labels, prediction)

    cg = ComputationGraph(cost)
    input_var = VariableFilter(roles=[INPUT])(cg.variables)
    print(input_var)

    cg_dropout = apply_dropout(cg, [input_var[7], input_var[5]], .4)
    cost_dropout = cg_dropout.outputs[0]

    return prediction, cost_dropout, cg_dropout.parameters, cost
Example #25
def build_model(images, labels):
    
    vgg = VGG(layer='conv4_4')
    vgg.push_initialization_config()
    vgg.initialize()

    tdb = top_direction_block()
    tdb.push_initialization_config()
    tdb.initialize()

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([vgg.apply, tdb.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()
    
    prediction = ss_seq.apply(images)
    cost       = StructuredCost().apply(labels, theano.tensor.clip(prediction, 1e-5, 1 - 1e-5))

    cg           = ComputationGraph(cost)
    cg_dropout   = apply_dropout(cg, [VariableFilter(roles=[OUTPUT])(cg.variables)[0]], .5)
    cost_dropout = cg_dropout.outputs[0]

    # define learned parameters
    selector = Selector([ss_seq])
    W         = selector.get_parameters()
    parameters = []
    parameters += [v for k, v in W.items()]

    return cost_dropout, parameters 
Example #26
def main(save_to, num_epochs):
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST(("train", ))
    mnist_test = MNIST(("test", ))

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate],
                             Flatten(DataStream.default_stream(
                                 mnist_test,
                                 iteration_scheme=SequentialScheme(
                                     mnist_test.num_examples, 500)),
                                     which_sources=('features', )),
                             prefix="test"),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        Printing()
    ]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(
            Plot('MNIST example',
                 channels=[[
                     'test_final_cost',
                     'test_misclassificationrate_apply_error_rate'
                 ], ['train_total_gradient_norm']]))

    main_loop = MainLoop(algorithm,
                         Flatten(DataStream.default_stream(
                             mnist_train,
                             iteration_scheme=SequentialScheme(
                                 mnist_train.num_examples, 50)),
                                 which_sources=('features', )),
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
Example #27
    def __init__(self, outputs):
        super(Model, self).__init__(outputs)
        if len(self.outputs) > 1:
            logger.warning("model with multiple output " + multiple_message)

        bricks = [
            get_brick(var) for var in self.variables + self.scan_variables
            if get_brick(var)
        ]
        children = set(chain(*(brick.children for brick in bricks)))
        # Quadratic complexity: we should not have thousands of
        # top-level bricks.
        self.top_bricks = []
        for brick in bricks:
            if brick not in children and brick not in self.top_bricks:
                self.top_bricks.append(brick)
        names = Counter([brick.name for brick in self.top_bricks])
        repeated_names = [name for name, count in names.items() if count > 1]
        if repeated_names:
            raise ValueError("top bricks with the same name:"
                             " {}".format(', '.join(repeated_names)))

        brick_param_names = {
            v: k
            for k, v in Selector(self.top_bricks).get_params().items()
        }
        self.params = []
        for param in VariableFilter(roles=[PARAMETER])(self.shared_variables):
            if param in brick_param_names:
                self.params.append((brick_param_names[param], param))
            else:
                self.params.append((param.name, param))
        self.params = OrderedDict(self.params)
Example #28
    def test_many_steps(self):
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')
        h = self.simple.apply(x, mask=mask, iterate=True)
        calc_h = theano.function(inputs=[x, mask], outputs=[h])

        x_val = 0.1 * numpy.asarray(list(itertools.permutations(range(4))),
                                    dtype=theano.config.floatX)
        x_val = numpy.ones((24, 4, 3),
                           dtype=theano.config.floatX) * x_val[..., None]
        mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        mask_val[12:24, 3] = 0
        h_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX)
        for i in range(1, 25):
            h_val[i] = numpy.tanh(h_val[i - 1].dot(
                2 * numpy.ones((3, 3))) + x_val[i - 1])
            h_val[i] = (mask_val[i - 1, :, None] * h_val[i] +
                        (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
        h_val = h_val[1:]
        assert_allclose(h_val, calc_h(x_val, mask_val)[0], rtol=1e-04)

        # Also test that initial state is a parameter
        initial_state, = VariableFilter(roles=[INITIAL_STATE])(
            ComputationGraph(h))
        assert is_shared_variable(initial_state)
        assert initial_state.name == 'initial_state'
Example #29
 def train_base_model(self, train_data, test_data, input_dim):
     x = T.matrix('features')
     y = T.matrix('targets')
     mlp, cost, mis_cost = self.create_base_model(x, y, input_dim)
     cg = ComputationGraph([cost])
     inputs = VariableFilter(roles=[INPUT])(cg.variables)
     cg = apply_dropout(cg, inputs, 0.2)
     algorithm = GradientDescent(cost=cost,
                                 parameters=cg.parameters,
                                 step_rule=Adam(learning_rate=0.001))
     data_stream = train_data
     data_stream_test = test_data
     monitor = DataStreamMonitoring(variables=[mis_cost],
                                    data_stream=data_stream_test,
                                    prefix="test")
     plot_ext = Plot('F1-measure',
                     channels=[['test_MisclassificationRate']],
                     after_batch=True)
     main_loop = MainLoop(data_stream=data_stream,
                          algorithm=algorithm,
                          extensions=[
                              monitor,
                              FinishAfter(after_n_epochs=50),
                              Printing(), plot_ext
                          ])
     main_loop.run()
     return mlp
Example #30
    def primal_step(self, x, y, learning_rate, input_dim, p):
        self.model = self.model(x, y, input_dim, p)
        score, probs = self.model.create_model()
        criterion = self.alpha * p - self.beta * np.float32(1 - p)

        r = theano.shared(np.float32(0.0), name='tp+fp')
        q = theano.shared(np.float32(0.0), name='tn+fn')

        pos_criterion = T.lt(probs, 0.5) * -criterion * score

        neg_criterion = T.gt(probs, 0.5) * criterion * score

        cost_weighed = T.mean(pos_criterion * T.gt(criterion, 0) +
                              neg_criterion * T.lt(criterion, 0))

        cg = ComputationGraph([cost_weighed])

        # Reward version
        r_temp = (self.t * r + T.mean(score * T.gt(probs, 0.5))) / (self.t + 1)
        q_temp = (self.t * q + T.mean(score * T.lt(probs, 0.5))) / (self.t + 1)

        # True Count version
        # r_temp = (self.t*r + T.mean(1.0 * T.gt(probs, 0.5)))/(self.t + 1)
        # q_temp = (self.t*q + T.mean(1.0 * T.lt(probs, 0.5)))/(self.t + 1)

        primal_updates = [(r, r_temp), (q, q_temp), (self.t, self.t + 1)]

        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        updates = Adam(cost_weighed, weights) + primal_updates

        # r = tp + fp
        # q = fp + fn

        primal_var = [r, q]
        return updates, cost_weighed, score, primal_var
Example #31
    def monitoring_vars(self, cg):

        mu, sigma, coeff = VariableFilter(
            applications=[self.gmm_emitter.gmmmlp.apply],
            name_regex="output")(cg.variables)

        min_sigma = sigma.min().copy(name="sigma_min")
        mean_sigma = sigma.mean().copy(name="sigma_mean")
        max_sigma = sigma.max().copy(name="sigma_max")

        min_mu = mu.min().copy(name="mu_min")
        mean_mu = mu.mean().copy(name="mu_mean")
        max_mu = mu.max().copy(name="mu_max")

        monitoring_vars = [mean_sigma, min_sigma,
                           min_mu, max_mu, mean_mu, max_sigma]

        return monitoring_vars
Example #32
def test_collect():
    x = tensor.matrix()
    mlp = MLP(activations=[Logistic(), Logistic()], dims=[784, 100, 784],
              use_bias=False)
    cost = SquaredError().apply(x, mlp.apply(x))
    cg = ComputationGraph(cost)
    var_filter = VariableFilter(roles=[PARAMETER])
    W1, W2 = var_filter(cg.variables)
    for i, W in enumerate([W1, W2]):
        W.set_value(numpy.ones_like(W.get_value()) * (i + 1))
    new_cg = collect_parameters(cg, cg.shared_variables)
    collected_parameters, = new_cg.shared_variables
    assert numpy.all(collected_parameters.get_value()[:784 * 100] == 1.)
    assert numpy.all(collected_parameters.get_value()[784 * 100:] == 2.)
    assert collected_parameters.ndim == 1
    W1, W2 = VariableFilter(roles=[COLLECTED])(new_cg.variables)
    assert W1.eval().shape == (784, 100)
    assert numpy.all(W1.eval() == 1.)
    assert W2.eval().shape == (100, 784)
    assert numpy.all(W2.eval() == 2.)
Example #33
def train(train_set, test_set):
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')

    l1 = Linear(
            name='input_to_hidden',
            input_dim=2,
            output_dim=3,
            weights_init=IsotropicGaussian(0.1),
            biases_init=Constant(0)
    )
    l1.initialize()
    h = Logistic().apply(l1.apply(x))

    l2 = Linear(
            name='hidden_to_output',
            input_dim=l1.output_dim,
            output_dim=2,
            weights_init=IsotropicGaussian(0.1),
            biases_init=Constant(0)
    )
    l2.initialize()
    y_hat = Softmax().apply(l2.apply(h))

    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)

    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'misclassification_rate'

    cg = ComputationGraph(cost)

    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 1e-8 * (W1 ** 2).sum() + 1e-8 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'

    print('W1', W1.get_value())
    print('W2', W2.get_value())

    algorithm = GradientDescent(
            cost=cost,
            parameters=cg.parameters,
            step_rule=RMSProp()
    )

    data_stream_train = Flatten(
            DataStream.default_stream(
                    train_set,
                    iteration_scheme=ShuffledScheme(train_set.num_examples, batch_size=4)
            )
    )

    data_stream_test = Flatten(
            DataStream.default_stream(
                    test_set,
                    iteration_scheme=SequentialScheme(test_set.num_examples, batch_size=1)
            )
    )

    monitor = DataStreamMonitoring(
            variables=[cost, error],
            data_stream=data_stream_test,
            prefix="test"
    )

    main_loop = MainLoop(
            data_stream=data_stream_train,
            algorithm=algorithm,
            extensions=[
                monitor,
                FinishAfter(after_n_epochs=100),
                Printing(),
                # ProgressBar()
            ]
    )

    main_loop.run()
Example #34
input_to_hidden = Linear(name='input_to_hidden',
                         input_dim=input_dim, output_dim=num_hidden_nodes)
h = Rectifier().apply(input_to_hidden.apply(x))
hidden_to_output = Linear(name='hidden_to_output',
                          input_dim=num_hidden_nodes, output_dim=2)
y_hat = Softmax().apply(hidden_to_output.apply(h))

y = tensor.lmatrix('targets')
from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate
cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)


from blocks.roles import WEIGHT
from blocks.graph import ComputationGraph
from blocks.filter import VariableFilter
cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
L1,L2 = 0.05, 0.05
cost = cost + L1 * (W1 ** 2).sum() + L2 * (W2 ** 2).sum()
cost.name = 'cost_with_regularization'

from blocks.bricks import MLP
mlp = MLP(activations=[Rectifier(), Softmax()], dims=[input_dim,num_hidden_nodes, 2]).apply(x)
W1.name = 'W1'


from blocks.initialization import IsotropicGaussian, Constant
hidden_to_output.weights_init = IsotropicGaussian(0.01)
input_to_hidden.weights_init = hidden_to_output.weights_init
hidden_to_output.biases_init = Constant(0)
input_to_hidden.biases_init = hidden_to_output.biases_init
input_to_hidden.initialize()
Example #35
def initialize_all(config, save_path, bokeh_name,
                   params, bokeh_server, bokeh, test_tag, use_load_ext,
                   load_log, fast_start):
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(
        batch=True, prediction=prediction, prediction_mask=prediction_mask)
    labels, = VariableFilter(
        applications=[recognizer.cost], name='labels')(cg)
    labels_mask, = VariableFilter(
        applications=[recognizer.cost], name='labels_mask')(cg)

    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(
            rename(gain_matrix.min(), 'min_gain'))
        primary_observables.append(
            rename(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(recognizer.labels.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(
        applications=[r.generator.readout.readout], name="output_0")(
            cost_cg)
    bottom_output = VariableFilter(
        # We need name_regex instead of name because LookupTable calls its output output_0
        applications=[r.bottom.apply], name_regex="output")(
            cost_cg)[-1]
    attended, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended")(
            cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended_mask")(
            cost_cg)
    weights, = VariableFilter(
        applications=[r.generator.evaluate], name="weights")(
            cost_cg)
    max_recording_length = rename(bottom_output.shape[0],
                                  "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = rename(attended_mask.shape[0],
                                      "max_attended_mask_length")
    max_attended_length = rename(attended.shape[0],
                                 "max_attended_length")
    max_num_phonemes = rename(labels.shape[0],
                              "max_num_phonemes")
    min_energy = rename(energies.min(), "min_energy")
    max_energy = rename(energies.max(), "max_energy")
    mean_attended = rename(abs(attended).mean(),
                           "mean_attended")
    mean_bottom_output = rename(abs(bottom_output).mean(),
                                "mean_bottom_output")
    weights_penalty = rename(monotonicity_penalty(weights, labels_mask),
                             "weights_penalty")
    weights_entropy = rename(entropy(weights, labels_mask),
                             "weights_entropy")
    mask_density = rename(labels_mask.mean(),
                          "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy,
        min_energy, max_energy,
        mean_attended, mean_bottom_output,
        batch_size, max_num_phonemes,
        mask_density])
    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config.get('regularization', dict())
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters if p not in attention_params]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost +
                      reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (train_cost + reg_config.get("decay", .0) *
                      l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2)

    train_cost = rename(train_cost, 'train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0) or
                (reg_config.get("decay", .0) > 0)):
            logger.error('using adaptive noise with alignment weight penalty '
                         'or weight decay is probably stupid')
        train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise(
            cg, cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=Model(regularized_cg.outputs[0]).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise')
        )
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0],
            'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] +
            regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]]  # model prior variance

    model = Model(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that does not have
        # shared variables added by adaptive weight noise
        with open(params, 'r') as src:
            param_values = load_parameters(src)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, parameters[key].get_value().shape) for key
                     in sorted(parameters.keys())],
                    width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'], train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', 0) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in parameters.items()
                                      if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in parameters.items()
                                      if p not in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(
            BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if 'train_cost' not in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    for name, param in parameters.items():
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost,
        algorithm.total_gradient_norm,
        algorithm.total_step_norm, clipping.threshold,
        max_recording_length,
        max_attended_length, max_attended_mask_length]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(rename(aggregation.mean(var, batch_size),
                                     'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(rename(aggregation.mean(var, labels_mask.sum()),
                                     'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(TrainingDataMonitoring(
        primary_observables, after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables),
        data.get_stream("valid", shuffle=False), prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)
    per = PhonemeErrorRate(recognizer, data,
                           **config['monitoring']['search'])
    per_monitoring = DataStreamMonitoring(
        [per], data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_cost, track_the_best_per]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf.get('num_batches'),
                    after_n_epochs=train_conf.get('num_epochs'))
            .add_condition(["after_batch"], _gradient_norm_is_none),
    ]
    channels = [
        # Plot 1: training and validation costs
        [average_monitoring.record_name(train_cost),
         validation.record_name(cost)],
        # Plot 2: gradient norm,
        [average_monitoring.record_name(algorithm.total_gradient_norm),
         average_monitoring.record_name(clipping.threshold)],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [average_monitoring._record_name('weights_entropy_per_label'),
         validation._record_name('weights_entropy_per_label')],
        # Plot 5: training and validation monotonicity penalty
        [average_monitoring._record_name('weights_penalty_per_recording'),
         validation._record_name('weights_penalty_per_recording')]]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name
                 else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),]
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_cost.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar()]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))
    if config['net']['criterion']['name'].startswith('mse'):
        extensions.append(
            LogInputsGains(
                labels, cg, recognizer.generator.readout.emitter, data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name]
        extensions.append(Patience(**patience_conf))

    extensions.append(Printing(every_n_batches=1,
                               attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
Beispiel #36
x = tensor.matrix('x')  # input features; the excerpt starts mid-file, so x is declared here as an assumption
y = tensor.lmatrix('y')

mlp = MLP(activations=[Logistic(), Softmax()],
          dims=[117, 55, 2],
          weights_init=IsotropicGaussian(),
          biases_init=Constant(0.01))

mlp.initialize()

y_hat = mlp.apply(x)

cost = BinaryCrossEntropy().apply(y, y_hat)

cg = ComputationGraph(cost)

W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
cost = cost + 0.001 * abs(W1).sum() + 0.001 * abs(W2).sum()
cost.name = 'cost'

error_rate = MisclassificationRate().apply(y.argmax(axis=1), y_hat)
error_rate.name = 'error_rate'

algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.1))

train_set = H5PYDataset('mushrooms.hdf5', which_sets=('train',))
train_stream = DataStream.default_stream(
    train_set, iteration_scheme=SequentialScheme(
        train_set.num_examples, batch_size=128))
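
# The excerpt above stops after building the training stream. A minimal
# sketch of how these pieces are typically wired into a Blocks main loop
# follows; the monitoring and stopping choices here are illustrative
# assumptions, not part of the original snippet.
from blocks.main_loop import MainLoop
from blocks.extensions import FinishAfter, Printing
from blocks.extensions.monitoring import TrainingDataMonitoring

main_loop = MainLoop(
    algorithm=algorithm,
    data_stream=train_stream,
    extensions=[TrainingDataMonitoring([cost, error_rate], after_batch=True),
                FinishAfter(after_n_epochs=10),  # illustrative stopping rule
                Printing()])
main_loop.run()
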
def run_experiment():

    np.random.seed(42)

    X = tensor.tensor4('features')
    nbr_channels = 3
    image_shape = (5, 5)

    conv_layers = [ ConvolutionalLayer( filter_size=(2,2),
                                        num_filters=10,
                                        activation=Rectifier().apply,
                                        border_mode='valid',
                                        pooling_size=(1,1),
                                        weights_init=Uniform(width=0.1),
                                        #biases_init=Uniform(width=0.01),
                                        biases_init=Constant(0.0),
                                        name='conv0')]
    conv_sequence = ConvolutionalSequence(  conv_layers,
                                            num_channels=nbr_channels,
                                            image_size=image_shape)
    #conv_sequence.push_allocation_config()
    conv_sequence.initialize()
    
    flattener = Flattener()
    conv_output = conv_sequence.apply(X)
    y_hat = flattener.apply(conv_output)
    # Whatever. Not important since we're not going to actually train anything.
    cost = tensor.sqr(y_hat).sum()


    #L_grads_method_02 = [tensor.grad(cost, v) for v in VariableFilter(roles=[FILTER, BIAS])(ComputationGraph([y_hat]).variables)]
    L_grads_method_02 = [tensor.grad(cost, v) for v in VariableFilter(roles=[BIAS])(ComputationGraph([y_hat]).variables)]
    # works on the sum of the gradients in a mini-batch
    sum_square_norm_gradients_method_02 = sum([tensor.sqr(g).sum() for g in L_grads_method_02])


    D_by_layer = get_conv_layers_transformation_roles(ComputationGraph(conv_output))
    individual_sum_square_norm_gradients_method_00 = get_sum_square_norm_gradients_conv_transformations(D_by_layer, cost)


    # why does this thing depend on N again?
    # I don't think I've used a cost that divides by N.
    # (see the numpy check after this function for one plausible source)

    N = 2
    Xtrain = np.random.randn(N, nbr_channels, image_shape[0], image_shape[1]).astype(np.float32)
    #Xtrain[1:,:,:,:] = 0.0
    Xtrain[:,:,:,:] = 1.0

    convolution_filter_variable = VariableFilter(roles=[FILTER])(ComputationGraph([y_hat]).variables)[0]
    convolution_filter_variable_value = convolution_filter_variable.get_value()
    convolution_filter_variable_value[:,:,:,:] = 1.0
    #convolution_filter_variable_value[0,0,:,:] = 1.0
    convolution_filter_variable.set_value(convolution_filter_variable_value)

    f = theano.function([X],
                        [cost,
                            individual_sum_square_norm_gradients_method_00,
                            sum_square_norm_gradients_method_02])


    [c, v0, gs2] = f(Xtrain)

    #print "[c, v0, gs2]"
    L_c, L_v0, L_gs2 = ([], [], [])
    for n in range(N):
        [nc, nv0, ngs2] = f(Xtrain[n,:, :, :].reshape((1, Xtrain.shape[1], Xtrain.shape[2], Xtrain.shape[3])))
        L_c.append(nc)
        L_v0.append(nv0)
        L_gs2.append(ngs2)

    print "Cost for whole mini-batch in single shot : %f." % c
    print "Cost for whole mini-batch accumulated    : %f." % sum(L_c)
    print ""
    print "Square-norm of all gradients for each data point in single shot :"
    print v0.reshape((1,-1))
    print "Square-norm of all gradients for each data point iteratively :"
    print np.array(L_gs2).reshape((1,-1))
    print ""
    print "Difference max abs : %f." % np.max(np.abs(v0 - np.array(L_gs2)))
    print ""
    print "Ratios : "
    print np.array(L_gs2).reshape((1,-1)) / v0.reshape((1,-1))
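
# Why the two quantities can disagree (and one plausible source of the
# N-dependence wondered about above): the squared norm of a summed
# mini-batch gradient is not the sum of per-example squared norms unless
# the cross terms vanish. A self-contained numpy check, independent of
# the Theano graph above:
import numpy as np

g1 = np.array([1.0, 2.0])   # gradient contributed by example 1
g2 = np.array([3.0, -1.0])  # gradient contributed by example 2

norm_of_sum = np.sum((g1 + g2) ** 2)               # ||g1 + g2||^2 = 17
sum_of_norms = np.sum(g1 ** 2) + np.sum(g2 ** 2)   # ||g1||^2 + ||g2||^2 = 15

# They differ by the cross term 2 * <g1, g2> = 2. With N identical
# examples (as with Xtrain above) the batch gradient is N*g, so its
# squared norm is N^2 * ||g||^2 versus N * ||g||^2 for the per-example
# sum: a factor of exactly N.
print(norm_of_sum, sum_of_norms, 2 * np.dot(g1, g2))
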
Beispiel #38
def train(config, save_path, bokeh_name,
          params, bokeh_server, test_tag, use_load_ext,
          load_log, fast_start, validation_epochs, validation_batches,
          per_epochs, per_batches):
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])

    # Build the main brick and initialize all parameters.
    recognizer = SpeechRecognizer(
        data.recordings_source, data.labels_source,
        data.eos_label,
        data.num_features, data.num_labels,
        name="recognizer",
        data_prepend_eos=data.prepend_eos,
        character_map=data.character_map,
        **config["net"])
    for brick_path, attribute_dict in sorted(
            config['initialization'].items(),
            key=lambda (k, v): -k.count('/')):
        for attribute, value in attribute_dict.items():
            brick, = Selector(recognizer).select(brick_path).bricks
            setattr(brick, attribute, value)
            brick.push_initialization_config()
    recognizer.initialize()

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all of them;\n"
        "see issue #463 in Blocks about doing that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    if params:
        logger.info("Load parameters from " + params)
        recognizer.load_params(params)

    if test_tag:
        tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__
        __stream = data.get_stream("train")
        __data = next(__stream.get_epoch_iterator(as_dict=True))
        recognizer.recordings.tag.test_value = __data[data.recordings_source]
        recognizer.recordings_mask.tag.test_value = __data[data.recordings_source + '_mask']
        recognizer.labels.tag.test_value = __data[data.labels_source]
        recognizer.labels_mask.tag.test_value = __data[data.labels_source + '_mask']
        theano.config.compute_test_value = 'warn'

    batch_cost = recognizer.get_cost_graph().sum()
    batch_size = named_copy(recognizer.recordings.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_log_likelihood"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it is currently impossible to spread the effect of
    # regularization to such variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(
        applications=[r.generator.readout.readout], name="output_0")(
                cost_cg)
    bottom_output, = VariableFilter(
        applications=[r.bottom.apply], name="output")(
                cost_cg)
    attended, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended")(
                cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended_mask")(
                cost_cg)
    weights, = VariableFilter(
        applications=[r.generator.evaluate], name="weights")(
                cost_cg)
    max_recording_length = named_copy(r.recordings.shape[0],
                                      "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = named_copy(attended_mask.shape[0],
                                          "max_attended_mask_length")
    max_attended_length = named_copy(attended.shape[0],
                                     "max_attended_length")
    max_num_phonemes = named_copy(r.labels.shape[0],
                                  "max_num_phonemes")
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_attended = named_copy(abs(attended).mean(),
                               "mean_attended")
    mean_bottom_output = named_copy(abs(bottom_output).mean(),
                                    "mean_bottom_output")
    weights_penalty = named_copy(monotonicity_penalty(weights, r.labels_mask),
                                 "weights_penalty")
    weights_entropy = named_copy(entropy(weights, r.labels_mask),
                                 "weights_entropy")
    mask_density = named_copy(r.labels_mask.mean(),
                              "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy,
        min_energy, max_energy,
        mean_attended, mean_bottom_output,
        batch_size, max_num_phonemes,
        mask_density])

    # Regularization. It is applied explicitly to all variables
    # of interest; it could not be applied to the cost alone, as that
    # would have no effect on auxiliary variables (see Blocks #514).
    reg_config = config['regularization']
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters if p not in attention_params]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])
    regularized_cost = regularized_cg.outputs[0]
    regularized_weights_penalty = regularized_cg.outputs[1]

    # Model is a weird class; we spent lots of time arguing with Bart
    # about what it should be. However, it can already do nice things:
    # e.g. extract all the parameters from the computation graph and
    # give them hierarchical names. This helps to notice when, because
    # of some bug, a parameter is missing from the computation graph.
    model = SpeechModel(regularized_cost)
    params = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, params[key].get_value().shape) for key
                        in sorted(params.keys())],
                    width=120))

    # Define the training algorithm.
    train_conf = config['training']
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'], train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False):
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in params.items()
                                        if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in params.items()
                                        if p not in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                        maxnorm_subjects)]
    algorithm = GradientDescent(
        cost=regularized_cost +
            reg_config.get("penalty_coof", .0) * regularized_weights_penalty / batch_size +
            reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2,
        parameters=params.values(),
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)]))
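    # In effect the objective above is: the mean log-likelihood cost,
    # plus penalty_coof times the monotonicity penalty per batch element,
    # plus decay times the squared L2 norm of all weight matrices. The
    # composite step rule then clips the gradient, applies the chosen
    # core rules and norm restrictions, and neutralizes steps containing
    # non-finite values so the parameters stay unchanged.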

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    observables = regularized_cg.outputs
    observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    for name, param in params.items():
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        observables.append(stats)
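    # Each stats vector holds RMS-normalized values: dividing a norm by
    # sqrt(num_elements) makes parameters of different sizes comparable,
    # and the last entry, step_norm / grad_norm, acts as an effective
    # per-parameter learning-rate indicator.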

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(named_copy(aggregation.mean(var, batch_size),
                                            'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(named_copy(aggregation.mean(
                    var, recognizer.labels_mask.sum()), 'weights_entropy_per_label'))
            else:
                result.append(var)
        return result
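
    # Note: aggregation.mean(x, d) accumulates x and d separately over the
    # monitored stream and reports sum(x) / sum(d), so the renamed variables
    # above are true per-recording and per-label averages.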

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
        ]
    extensions.append(TrainingDataMonitoring(
        [observables[0], algorithm.total_gradient_norm,
            algorithm.total_step_norm, clipping.threshold,
            max_recording_length,
            max_attended_length, max_attended_mask_length], after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes([cost, weights_entropy, weights_penalty]),
        data.get_stream("valid"), prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=validation_epochs,
            every_n_batches=validation_batches,
            after_training=False)
    extensions.append(validation)
    recognizer.init_beam_search(10)
    per = PhonemeErrorRate(recognizer, data.get_dataset("valid"))
    per_monitoring = DataStreamMonitoring(
        [per], data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=per_epochs,
            every_n_batches=per_batches,
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_likelihood = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_likelihood, track_the_best_per]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf['num_batches'],
                    after_n_epochs=train_conf['num_epochs'])
        .add_condition(["after_batch"], _gradient_norm_is_none),
        # Live plotting: requires launching `bokeh-server`
        # and lets you watch what happens online.
        Plot(bokeh_name
             if bokeh_name
             else os.path.basename(save_path),
             [# Plot 1: training and validation costs
             [average_monitoring.record_name(regularized_cost),
             validation.record_name(cost)],
             # Plot 2: gradient norm
             [average_monitoring.record_name(algorithm.total_gradient_norm),
             average_monitoring.record_name(clipping.threshold)],
             # Plot 3: phoneme error rate
             [per_monitoring.record_name(per)],
             # Plot 4: training and validation mean weight entropy
             [average_monitoring._record_name('weights_entropy_per_label'),
             validation._record_name('weights_entropy_per_label')],
             # Plot 5: training and validation monotonicity penalty
             [average_monitoring._record_name('weights_penalty_per_recording'),
             validation._record_name('weights_penalty_per_recording')]],
             every_n_batches=10,
             server_url=bokeh_server),
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_likelihood.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar(),
        Printing(every_n_batches=1,
                    attribute_filter=PrintingFilterList()
                    )]

    # Save the config into the status
    log = TrainingLog()
    log.status['_config'] = repr(config)
    main_loop = MainLoop(
        model=model, log=log, algorithm=algorithm,
        data_stream=data.get_stream("train"),
        extensions=extensions)
    main_loop.run()
Beispiel #39
x = tensor.matrix('features')
input_to_hidden = Linear(name='input_to_hidden', input_dim=784, output_dim=100)  # linear transformation from the input layer to the hidden layer
h = Rectifier().apply(input_to_hidden.apply(x))  # feed each hidden unit's linear pre-activation through the Rectifier() nonlinearity; the result becomes the input of the next layer
hidden_to_output = Linear(name='hidden_to_output', input_dim=100, output_dim=10)  # linear transformation feeding each unit of the final output layer
y_hat = Softmax().apply(hidden_to_output.apply(h))  # softmax transformation of the output layer's pre-activations

y = tensor.lmatrix('targets')  # target variable

cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)  # define the cost function
error_rate = MisclassificationRate().apply(y.flatten(), y_hat)
# build the computation graph
cg = ComputationGraph(cost)

# regularize the cost function:
# select the parameters to penalize; W1 is the weight matrix of the first
# linear transformation, W2 that of the second
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
# regularization term; L2 regularization is used here
cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
cost.name = 'cost_with_regularization'

# The same network expressed as a multi-layer perceptron: the activations
# list gives each layer's nonlinearity. Every MLP layer first computes a
# linear transformation and then applies its nonlinearity to the result;
# x is the input of the MLP.
mlp = MLP(activations=[Rectifier(), Softmax()], dims=[784, 100, 10]).apply(x)

# After defining the whole network, set the initialization schemes for the
# parameters of its linear transformations.

input_to_hidden.weights_init = IsotropicGaussian(0.01)
input_to_hidden.biases_init = Constant(0)
hidden_to_output.weights_init = IsotropicGaussian(0.01)
hidden_to_output.biases_init = Constant(0)
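
# After assigning the initialization schemes, the parameters still have to
# be materialized; in Blocks that is done by calling initialize() on each
# brick, a step this excerpt stops short of:
input_to_hidden.initialize()
hidden_to_output.initialize()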
lord = [map_chr_2_ind[char] for char in lord_original]
print lord
zaza = prob_function([lord], numpy.ones((1, len(lord)), dtype="int8"))[:, 0, :]
print zaza
print zaza.shape
for (ey, row) in enumerate(zaza):
    print "PREDICTION PROBABILITIES FOR POSITION", ey, "LETTER", repr(lord_original[ey])
    sorted_thing = [(prob, ind) for (ind, prob) in enumerate(row)]
    sorted_thing.sort(reverse=True)
    for (prob, ind) in sorted_thing:
        print repr(map_ind_2_chr[ind]), ":", prob
    print "\n"
"""

# define a function that gets the overall "sum of scores" at a given time step
readouts = VariableFilter(theano_name="readout_readout_output_0")(lstm_net.cost_model.variables)[0]
score_function = function([lstm_net.x, lstm_net.mask], readouts.sum(axis=2))
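# readouts is expected to have shape (time, batch, num_targets), so the sum
# over axis=2 yields one aggregate score per time step and batch element.
# Hypothetical usage (x_batch / mask_batch are placeholders, not defined here):
# scores = score_function(x_batch, mask_batch)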

# this section of the playground has some fun rides that revolve around various correlation stuff. uncomment to access
# =)
sc = StateComputer(lstm_net.cost_model, map_chr_2_ind)
# storage for the correlations at the very end
correlation_dict = dict()
for name in sc.state_var_names:
    correlation_dict[name] = numpy.zeros(lstm_net.hidden_dims[0], dtype=float)

# get validation data to run over
valid_data = H5PYDataset("bible.hdf5", which_sets=("valid",), load_in_memory=True)
data_stream = PadAndAddMasks(
    DataStream.default_stream(dataset=valid_data, iteration_scheme=SequentialScheme(valid_data.num_examples,
                                                                                    batch_size=128)),