def visualize_states(hidden_states, updates, train_stream, valid_stream, args):
    # Get all the hidden_states
    filter_states = VariableFilter(theano_name_regex="hidden_state_.*")
    all_states = filter_states(hidden_states)
    all_states = sorted(all_states, key=lambda var: var.name[-1])

    # Get all the hidden_cells
    filter_cells = VariableFilter(theano_name_regex="hidden_cells_.*")
    all_cells = filter_cells(hidden_states)
    all_cells = sorted(all_cells, key=lambda var: var.name[-1])

    # Handle the theano shared variables that allow carrying the hidden state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           not has_indices(args.dataset))

    # Compile the function
    logger.info("The compilation of the function has started")
    if args.rnn_type == "lstm" and args.visualize_cells:
        compiled = theano.function(inputs=ComputationGraph(all_cells).inputs,
                                   outputs=all_cells,
                                   givens=givens, updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))
    else:
        compiled = theano.function(inputs=ComputationGraph(all_states).inputs,
                                   outputs=all_states,
                                   givens=givens, updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))

    # Plot the function
    plot("hidden_state", train_stream, compiled, args)
def training(repo, learning_rate, batch_size, filenames):
    print 'LOAD DATA'
    (x_train, y_train), (x_valid, y_valid), (x_test, y_test) = \
        load_datasets_mnist(repo, filenames)
    print 'BUILD MODEL'
    train_f, valid_f, test_f, model, fisher, params = build_training()
    x_train = x_train[:1000]
    y_train = y_train[:1000]

    x = T.tensor4()
    y = T.imatrix()
    output = model.apply(x)
    output = output.reshape(
        (x.shape[0], model.get_dim('output')))  # TODO: get_dim('name') for Architecture
    cost = Softmax().categorical_cross_entropy(y.flatten(), output).mean()

    cg = ComputationGraph(cost)
    inputs_conv = VariableFilter(roles=[INPUT], bricks=[Convolutional])(cg)
    outputs_conv = VariableFilter(roles=[OUTPUT], bricks=[Convolutional])(cg)
    inputs_fully = VariableFilter(roles=[INPUT], bricks=[Linear])(cg)
    outputs_fully = VariableFilter(roles=[OUTPUT], bricks=[Linear])(cg)

    dico = OrderedDict([('conv_output', outputs_conv[0])])
    [grad_s] = T.grad(cost, outputs_conv)
    dico['conv_output'] = grad_s

    f = theano.function([x, y], grad_s, allow_input_downcast=True,
                        on_unused_input='ignore')
    print np.mean(f(x_train[:10], y_train[:10]))
def analyze(self, inputs, groundtruth, prediction):
    """Compute cost and alignment."""
    if not hasattr(self, "_analyze"):
        input_variables = list(self.single_inputs.values())
        input_variables.append(self.single_labels)
        input_variables.append(self.single_predicted_labels)

        cg = self.get_cost_graph(batch=False, use_prediction=True)
        costs = cg.outputs[0]
        weights, = VariableFilter(
            bricks=[self.generator], name="weights")(cg)
        energies = VariableFilter(
            bricks=[self.generator], name="energies")(cg)
        energies_output = [
            energies[0][:, 0, :] if energies
            else tensor.zeros_like(weights)]

        self._analyze = theano.function(
            input_variables,
            [costs[0], weights[:, 0, :]] + energies_output,
            on_unused_input='warn')

    input_values_dict = dict(inputs)
    input_values_dict['labels'] = groundtruth
    input_values_dict['predicted_labels'] = prediction
    return self._analyze(**input_values_dict)
def build_mlp(features_int, features_cat, labels, labels_mean):
    inputs = tensor.concatenate([features_int, features_cat], axis=1)

    mlp = MLP(activations=[Rectifier(), Rectifier(), Rectifier(), None],
              dims=[337, 800, 1200, 1],
              weights_init=IsotropicGaussian(),
              biases_init=Constant(1))
    mlp.initialize()

    prediction = mlp.apply(inputs)
    cost = MAPECost().apply(prediction, labels, labels_mean)

    cg = ComputationGraph(cost)
    # cg_dropout0 = apply_dropout(cg, [VariableFilter(roles=[INPUT])(cg.variables)[1]], .2)
    cg_dropout1 = apply_dropout(
        cg,
        [VariableFilter(roles=[OUTPUT])(cg.variables)[1],
         VariableFilter(roles=[OUTPUT])(cg.variables)[3],
         VariableFilter(roles=[OUTPUT])(cg.variables)[5]], .2)
    cost_dropout1 = cg_dropout1.outputs[0]

    return cost_dropout1, cg_dropout1.parameters, cost  # cost, cg.parameters, cost
def __init__(self, samples):
    # Extracting information from the sampling computation graph
    self.cg = ComputationGraph(samples)
    self.inputs = self.cg.inputs
    self.generator = get_brick(samples)
    if not isinstance(self.generator, BaseSequenceGenerator):
        raise ValueError
    self.generate_call = get_application_call(samples)
    if (not self.generate_call.application ==
            self.generator.generate):
        raise ValueError
    self.inner_cg = ComputationGraph(self.generate_call.inner_outputs)

    # Fetching names from the sequence generator
    self.context_names = self.generator.generate.contexts
    self.state_names = self.generator.generate.states

    # Parsing the inner computation graph of sampling scan
    self.contexts = [
        VariableFilter(bricks=[self.generator],
                       name=name,
                       roles=[INPUT])(self.inner_cg)[0]
        for name in self.context_names]
    self.input_states = []
    # Includes only those state names that were actually used
    # in 'generate'
    self.input_state_names = []
    for name in self.generator.generate.states:
        var = VariableFilter(bricks=[self.generator],
                             name=name,
                             roles=[INPUT])(self.inner_cg)
        if var:
            self.input_state_names.append(name)
            self.input_states.append(var[0])

    self.compiled = False
def analyze(self, recording, transcription):
    """Compute cost and alignment for a recording/transcription pair."""
    if not hasattr(self, "_analyze"):
        cost = self.get_cost_graph(batch=False)
        cg = ComputationGraph(cost)

        energies = VariableFilter(
            bricks=[self.generator], name="energies")(cg)
        energies_output = [
            energies[0][:, 0, :] if energies
            else tensor.zeros((self.single_transcription.shape[0],
                               self.single_recording.shape[0]))]

        states, = VariableFilter(
            applications=[self.encoder.apply], roles=[OUTPUT],
            name="encoded")(cg)

        ctc_matrix_output = []
        # Temporarily disabled for compatibility with LM code
        # if len(self.generator.readout.source_names) == 1:
        #     ctc_matrix_output = [
        #         self.generator.readout.readout(weighted_averages=states)[:, 0, :]]

        weights, = VariableFilter(
            bricks=[self.generator], name="weights")(cg)
        self._analyze = theano.function(
            [self.single_recording, self.single_transcription],
            [cost[:, 0], weights[:, 0, :]]
            + energies_output + ctc_matrix_output)
    return self._analyze(recording, transcription)
def analyze(self, recording, groundtruth, prediction=None): """Compute cost and aligment.""" input_values = [recording, groundtruth] if prediction is not None: input_values.append(prediction) if not hasattr(self, "_analyze"): input_variables = [self.single_recording, self.single_transcription] prediction_variable = tensor.lvector('prediction') if prediction is not None: input_variables.append(prediction_variable) cg = self.get_cost_graph( batch=False, prediction=prediction_variable[:, None]) else: cg = self.get_cost_graph(batch=False) cost = cg.outputs[0] energies = VariableFilter( bricks=[self.generator], name="energies")(cg) energies_output = [energies[0][:, 0, :] if energies else tensor.zeros((self.single_transcription.shape[0], self.single_recording.shape[0]))] states, = VariableFilter( applications=[self.encoder.apply], roles=[OUTPUT], name="encoded")(cg) ctc_matrix_output = [] # Temporarily disabled for compatibility with LM code # if len(self.generator.readout.source_names) == 1: # ctc_matrix_output = [ # self.generator.readout.readout(weighted_averages=states)[:, 0, :]] weights, = VariableFilter( bricks=[self.generator], name="weights")(cg) self._analyze = theano.function( input_variables, [cost[:, 0], weights[:, 0, :]] + energies_output + ctc_matrix_output, on_unused_input='warn') return self._analyze(*input_values)
def getParams(model, tensor):
    # Build a throwaway scalar cost only to obtain a computation graph
    # from which the weight and bias parameters can be filtered.
    cost = model.apply(tensor).sum()
    cg = ComputationGraph(cost)
    W = VariableFilter(roles=[WEIGHT])(cg.variables)
    B = VariableFilter(roles=[BIAS])(cg.variables)
    return W + B
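# A minimal usage sketch for getParams, not from the original source: the brick
# and the names `toy_conv` / `inp` below are illustrative assumptions. It builds
# a tiny initialized convolutional brick and collects its weight and bias
# parameters through the filter above.
import theano.tensor as T
from blocks.bricks.conv import Convolutional
from blocks.initialization import IsotropicGaussian, Constant

def demo_get_params():
    toy_conv = Convolutional(filter_size=(3, 3), num_filters=4, num_channels=1,
                             image_size=(8, 8),
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0.))
    toy_conv.initialize()
    inp = T.tensor4('inp')
    # Expect one filter tensor and one bias vector from the single layer.
    return getParams(toy_conv, inp)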
def load_models(net, model_path=save_path, in_size=len(input_columns), out_size=len(output_columns) - 1 if cost_mode == 'RL-MDN' else len(output_columns), hidden_size=hidden_size, num_recurrent_layers=num_recurrent_layers, model=layer_models[0]): initials = [] if not os.path.isfile(model_path): print 'Could not find model file.' sys.exit(0) print 'Loading model from {0}...'.format(model_path) x = tensor.tensor3('features', dtype=theano.config.floatX) y = tensor.tensor3('targets', dtype='floatX') train_flag = [theano.shared(0)] latent_size = net.get_size() # latent_size in_size = latent_size + len(input_columns) y_hat, cost, cells = nn_fprop(x, y, in_size, out_size, hidden_size, num_recurrent_layers, train_flag) main_loop = MainLoop(algorithm=None, data_stream=None, model=Model(cost), extensions=[saveload.Load(model_path)]) for extension in main_loop.extensions: extension.main_loop = main_loop main_loop._run_extensions('before_training') bin_model = main_loop.model print 'Model loaded. Building prediction function...' hiddens = [] for i in range(num_recurrent_layers): brick = [b for b in bin_model.get_top_bricks() if b.name == layer_models[i] + str(i)][0] hiddens.extend(VariableFilter(theano_name=brick.name + '_apply_states')(bin_model.variables)) hiddens.extend(VariableFilter(theano_name=brick.name + '_apply_cells')(cells)) initials.extend(VariableFilter(roles=[roles.INITIAL_STATE])(brick.parameters)) predict_func = theano.function([x], hiddens + [y_hat]) encoder, code_size = load_encoder(net) return predict_func, initials, encoder, code_size
def build_dictionnary(cost):
    cg = ComputationGraph(cost)
    inputs_conv = VariableFilter(roles=[INPUT], bricks=[Convolutional])(cg)
    outputs_conv = VariableFilter(roles=[OUTPUT], bricks=[Convolutional])(cg)
    inputs_fully = VariableFilter(roles=[INPUT], bricks=[Linear])(cg)
    outputs_fully = VariableFilter(roles=[OUTPUT], bricks=[Linear])(cg)

    grad_conv = T.grad(cost, outputs_conv)
    grad_fully = T.grad(cost, outputs_fully)

    items = []
    for i, var_in, grad_out in zip(range(len(inputs_conv)),
                                   inputs_conv, grad_conv):
        items.append(('conv_input_' + str(i), var_in))
        items.append(('conv_output_' + str(i), grad_out))

    for i, var_in, grad_out in zip(range(len(inputs_fully)),
                                   inputs_fully, grad_fully):
        var_input = T.concatenate(
            [var_in, T.ones((var_in.shape[0], 1))], axis=1)
        items.append(('fully_input_' + str(i), var_input))
        items.append(('fully_output_' + str(i), grad_out))

    dico = OrderedDict(items)
    return dico
def build_tab_equiv(model):
    x = T.tensor4('x')
    y = T.imatrix()
    y_prev = model.apply(x)
    cg = ComputationGraph(T.sum(y_prev))

    weight_fully = VariableFilter(roles=[WEIGHT], bricks=[Linear])(cg)
    weight_conv = VariableFilter(roles=[WEIGHT], bricks=[Convolutional])(cg)

    dico = {}
    index = 0
    for w_fully in weight_fully[::-1]:
        dico[w_fully.name] = ['fully_input_' + str(index),
                              'fully_output_' + str(index)]
        index += 1

    index = 0
    for w_conv in weight_conv[::-1]:
        dico[w_conv.name] = ['conv_input_' + str(index),
                             'conv_output_' + str(index)]
        index += 1

    return dico
def _create_model(with_dropout):
    cg = ComputationGraph(gan.compute_losses(x, z))
    if with_dropout:
        inputs = VariableFilter(bricks=gan.discriminator.children[1:],
                                roles=[INPUT])(cg.variables)
        cg = apply_dropout(cg, inputs, 0.5)
        inputs = VariableFilter(bricks=[gan.discriminator],
                                roles=[INPUT])(cg.variables)
        cg = apply_dropout(cg, inputs, 0.2)
    return Model(cg.outputs)
def _compile_next_state_computer(self):
    next_states = [VariableFilter(bricks=[self.generator],
                                  name=name,
                                  roles=[OUTPUT])(self.inner_cg)[-1]
                   for name in self.state_names]
    next_outputs = VariableFilter(
        applications=[self.generator.readout.emit],
        roles=[OUTPUT])(self.inner_cg.variables)
    self.next_state_computer = function(
        self.contexts + self.input_states + next_outputs, next_states)
def test_fully_layer():
    batch_size = 2
    x = T.tensor4()
    y = T.ivector()
    V = 200
    layer_conv = Convolutional(filter_size=(5, 5), num_filters=V,
                               name="toto",
                               weights_init=IsotropicGaussian(0.01),
                               biases_init=Constant(0.0))
    # try with no bias
    activation = Rectifier()
    pool = MaxPooling(pooling_size=(2, 2))

    convnet = ConvolutionalSequence([layer_conv, activation, pool],
                                    num_channels=15,
                                    image_size=(10, 10),
                                    name="conv_section")
    convnet.push_allocation_config()
    convnet.initialize()
    output = convnet.apply(x)
    batch_size = output.shape[0]
    output_dim = np.prod(convnet.get_dim('output'))
    result_conv = output.reshape((batch_size, output_dim))

    mlp = MLP(activations=[Rectifier().apply], dims=[output_dim, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0.0))
    mlp.initialize()
    output = mlp.apply(result_conv)
    cost = T.mean(Softmax().categorical_cross_entropy(y.flatten(), output))

    cg = ComputationGraph(cost)
    W = VariableFilter(roles=[WEIGHT])(cg.variables)
    B = VariableFilter(roles=[BIAS])(cg.variables)
    W = W[0]
    b = B[0]
    inputs_fully = VariableFilter(roles=[INPUT], bricks=[Linear])(cg)
    outputs_fully = VariableFilter(roles=[OUTPUT], bricks=[Linear])(cg)
    var_input = inputs_fully[0]
    var_output = outputs_fully[0]

    [d_W, d_S, d_b] = T.grad(cost, [W, var_output, b])
    d_b = d_b.dimshuffle(('x', 0))
    d_p = T.concatenate([d_W, d_b], axis=0)

    x_value = 1e3 * np.random.ranf((2, 15, 10, 10))
    f = theano.function([x, y], [var_input, d_S, d_p],
                        allow_input_downcast=True, on_unused_input='ignore')
    A, B, C = f(x_value, [5, 0])
    A = np.concatenate([A, np.ones((2, 1))], axis=1)
    print 'A', A.shape
    print 'B', B.shape
    print 'C', C.shape
    print lin.norm(C - np.dot(np.transpose(A), B), 'fro')
    return
def do(self, which_callback, *args, **kwargs):
    if which_callback == 'before_training':
        cg = ComputationGraph(self.main_loop.algorithm.total_step_norm)
        self._learning_rate_var, = VariableFilter(
            theano_name='learning_rate')(cg)
        logger.debug("Annealing extension is initialized")
    elif which_callback == 'after_epoch':
        logger.debug("Annealing the learning rate to {}".format(
            self._annealing_learning_rate))
        self._learning_rate_var.set_value(self._annealing_learning_rate)
    else:
        raise ValueError("don't know what to do")
def _compile_initial_state_and_context_computer(self):
    initial_states = VariableFilter(
        applications=[self.generator.initial_states],
        roles=[OUTPUT])(self.cg)
    outputs = OrderedDict([(v.tag.name, v) for v in initial_states])
    beam_size = unpack(VariableFilter(
        applications=[self.generator.initial_states],
        name='batch_size')(self.cg))
    for name, context in equizip(self.context_names, self.contexts):
        outputs[name] = context
    outputs['beam_size'] = beam_size
    self.initial_state_and_context_computer = function(
        self.inputs, outputs, on_unused_input='ignore')
def create_act_table(self, save_to, act_table): batch_size = 500 image_size = (28, 28) output_size = 10 convnet = create_lenet_5() layers = convnet.layers x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) cg = ComputationGraph([probs]) def full_brick_name(brick): return '/'.join([''] + [b.name for b in brick.get_unique_path()]) # Find layer outputs to probe outmap = OrderedDict( (full_brick_name(get_brick(out)), out) for out in VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables)) # Generate pics for biases biases = VariableFilter(roles=[BIAS])(cg.parameters) # Generate parallel array, in the same order, for outputs outs = [outmap[full_brick_name(get_brick(b))] for b in biases] # Figure work count error_rate = (MisclassificationRate().apply( y.flatten(), probs).copy(name='error_rate')) max_activation_table = (MaxActivationTable().apply(outs).copy( name='max_activation_table')) max_activation_table.tag.aggregation_scheme = ( Concatenate(max_activation_table)) model = Model([error_rate, max_activation_table]) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) mnist_test_stream = DataStream.default_stream( self.mnist_test, iteration_scheme=SequentialScheme(self.mnist_test.num_examples, batch_size)) evaluator = DatasetEvaluator([error_rate, max_activation_table]) results = evaluator.evaluate(mnist_test_stream) table = results['max_activation_table'] pickle.dump(table, open(act_table, 'wb')) return table
def _compile_next_state_computer(self):
    next_states = [VariableFilter(bricks=[self.generator],
                                  name=name,
                                  roles=[OUTPUT])(self.inner_cg)[-1]
                   for name in self.state_names]
    next_outputs = VariableFilter(
        applications=[self.generator.readout.emit],
        roles=[OUTPUT])(self.inner_cg.variables)
    self.next_state_computer = function(
        self.contexts + self.input_states + next_outputs, next_states,
        # This is temporarily required because `lm_logprobs` is a weird
        # state which is not used to compute next state, but used to
        # compute the next output.
        on_unused_input='ignore')
def _compile_next_state_computer(self, givens):
    """Modified version of ``BeamSearch._compile_next_state_computer``
    with ``givens``.
    """
    next_states = [VariableFilter(bricks=[beam_search.generator],
                                  name=name,
                                  roles=[OUTPUT])(beam_search.inner_cg)[-1]
                   for name in beam_search.state_names]
    next_outputs = VariableFilter(
        applications=[beam_search.generator.readout.emit],
        roles=[OUTPUT])(beam_search.inner_cg.variables)
    self.next_state_computer = function(
        [self.src_indices] + beam_search.input_states + next_outputs,
        next_states,
        givens=givens)
def __init__(self, inputs, cg, reward_emitter, data, **kwargs):
    self.input_accumulator = shared_floatx_zeros((2, 2), dtype='int64')
    self.gain_accumulator = shared_floatx_zeros((2, 2, 2))
    self.reward_accumulator = shared_floatx_zeros((2, 2, 2), dtype='int64')
    self.dataset = data.get_dataset('train')
    self.inputs = inputs
    self.gains, = VariableFilter(
        applications=[reward_emitter.cost],
        roles=[INPUT], name='readouts')(cg.variables)
    self.reward, = VariableFilter(
        theano_name=reward_emitter.GAIN_MATRIX)(cg.variables)
    kwargs.setdefault('before_training', True)
    kwargs.setdefault('after_batch', True)
    super(LogInputsGains, self).__init__(**kwargs)
def _create_model(with_dropout):
    cg = ComputationGraph(ali.compute_losses(x, z))
    if with_dropout:
        inputs = VariableFilter(
            bricks=[ali.discriminator.x_discriminator.layers[0],
                    ali.discriminator.z_discriminator.layers[0]],
            roles=[INPUT])(cg.variables)
        cg = apply_dropout(cg, inputs, 0.2)
        inputs = VariableFilter(
            bricks=(ali.discriminator.x_discriminator.layers[2::3] +
                    ali.discriminator.z_discriminator.layers[2::2] +
                    ali.discriminator.joint_discriminator.layers[::2]),
            roles=[INPUT])(cg.variables)
        cg = apply_dropout(cg, inputs, 0.5)
    return Model(cg.outputs)
def setup_model(p):
    ladder = LadderAE(p)

    # Setup inputs
    input_type = TensorType('float32',
                            [False] * (len(p.encoder_layers[0]) + 1))
    x_only = input_type('features_unlabeled')
    if debug:
        x_only.tag.test_value = numpy.random.normal(
            size=(p.batch_size,) + p.encoder_layers[0]).astype(floatX)
    x = input_type('features_labeled')
    if debug:
        x.tag.test_value = numpy.random.normal(
            size=(p.batch_size,) + p.encoder_layers[0]).astype(floatX)
    y = theano.tensor.lvector('targets_labeled')
    if debug:
        y.tag.test_value = numpy.random.randint(
            1, int(p.encoder_layers[-1]) + 1, (p.batch_size))
    ladder.apply(x, y, x_only)

    # Load parameters if requested
    if p.get('load_from'):
        with open(p.load_from + '/trained_params.npz') as f:
            loaded = numpy.load(f)
            cg = ComputationGraph([ladder.costs.total])
            current_params = VariableFilter(roles=[PARAMETER])(cg.variables)
            logger.info('Loading parameters: %s' % ', '.join(loaded.keys()))
            for param in current_params:
                assert param.get_value().shape == loaded[param.name].shape
                param.set_value(loaded[param.name])
    return ladder
def primal_step(self, x, y, learning_rate, alpha, beta, input_dim, p):
    mlp, cost = self.create_model(x, y, input_dim, p)
    cg = ComputationGraph([cost])
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    updates = Adam(cost, weights, y, alpha, beta)
    return mlp, updates, -1 * cost
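# Hedged usage note, not from the original source: the custom Adam helper used
# above is assumed to return a list of (shared_variable, new_value) update
# pairs (as suggested by its use alongside explicit update lists elsewhere in
# this collection), so the result can be passed straight to theano.function:
#
#     mlp, updates, neg_cost = self.primal_step(x, y, learning_rate,
#                                               alpha, beta, input_dim, p)
#     train_fn = theano.function([x, y], neg_cost, updates=updates,
#                                allow_input_downcast=True)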
def build_mlp(features_car_cat, features_car_int, features_nocar_cat,
              features_nocar_int, features_cp, features_hascar,
              means, labels):
    prediction, _, _, _ = \
        build_mlp_onlyloc(features_car_cat, features_car_int,
                          features_nocar_cat, features_nocar_int,
                          features_cp, features_hascar,
                          means, labels)

    mlp_crm = MLP(activations=[None],
                  dims=[1, 1],
                  weights_init=IsotropicGaussian(.1),
                  biases_init=Constant(0),
                  name='mlp_crm')
    mlp_crm.initialize()
    crm = features_nocar_int[:, 0][:, None]
    prediction = prediction * mlp_crm.apply(crm)

    cost = MAPECost().apply(labels, prediction)

    cg = ComputationGraph(cost)
    input_var = VariableFilter(roles=[INPUT])(cg.variables)
    print input_var
    cg_dropout = apply_dropout(cg, [input_var[7], input_var[5]], .4)
    cost_dropout = cg_dropout.outputs[0]

    return prediction, cost_dropout, cg_dropout.parameters, cost
def build_model(images, labels):
    vgg = VGG(layer='conv4_4')
    vgg.push_initialization_config()
    vgg.initialize()

    tdb = top_direction_block()
    tdb.push_initialization_config()
    tdb.initialize()

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([vgg.apply, tdb.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()

    prediction = ss_seq.apply(images)
    cost = StructuredCost().apply(
        labels, theano.tensor.clip(prediction, 1e-5, 1 - 1e-5))

    cg = ComputationGraph(cost)
    cg_dropout = apply_dropout(
        cg, [VariableFilter(roles=[OUTPUT])(cg.variables)[0]], .5)
    cost_dropout = cg_dropout.outputs[0]

    # define learned parameters
    selector = Selector([ss_seq])
    W = selector.get_parameters()
    parameters = []
    parameters += [v for k, v in W.items()]

    return cost_dropout, parameters
def main(save_to, num_epochs): mlp = MLP([Tanh(), Softmax()], [784, 100, 10], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) mlp.initialize() x = tensor.matrix('features') y = tensor.lmatrix('targets') probs = mlp.apply(tensor.flatten(x, outdim=2)) cost = CategoricalCrossEntropy().apply(y.flatten(), probs) error_rate = MisclassificationRate().apply(y.flatten(), probs) cg = ComputationGraph([cost]) W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum() cost.name = 'final_cost' mnist_train = MNIST(("train", )) mnist_test = MNIST(("test", )) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=0.1)) extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring([cost, error_rate], Flatten(DataStream.default_stream( mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, 500)), which_sources=('features', )), prefix="test"), TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", after_epoch=True), Checkpoint(save_to), Printing() ] if BLOCKS_EXTRAS_AVAILABLE: extensions.append( Plot('MNIST example', channels=[[ 'test_final_cost', 'test_misclassificationrate_apply_error_rate' ], ['train_total_gradient_norm']])) main_loop = MainLoop(algorithm, Flatten(DataStream.default_stream( mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, 50)), which_sources=('features', )), model=Model(cost), extensions=extensions) main_loop.run()
def __init__(self, outputs):
    super(Model, self).__init__(outputs)
    if len(self.outputs) > 1:
        logger.warning("model with multiple output " + multiple_message)

    bricks = [get_brick(var) for var
              in self.variables + self.scan_variables if get_brick(var)]
    children = set(chain(*(brick.children for brick in bricks)))
    # Quadratic complexity: we should not have thousands of
    # top-level bricks.
    self.top_bricks = []
    for brick in bricks:
        if brick not in children and brick not in self.top_bricks:
            self.top_bricks.append(brick)
    names = Counter([brick.name for brick in self.top_bricks])
    repeated_names = [name for name, count in names.items() if count > 1]
    if repeated_names:
        raise ValueError("top bricks with the same name:"
                         " {}".format(', '.join(repeated_names)))

    brick_param_names = {
        v: k for k, v in Selector(self.top_bricks).get_params().items()}
    self.params = []
    for param in VariableFilter(roles=[PARAMETER])(self.shared_variables):
        if param in brick_param_names:
            self.params.append((brick_param_names[param], param))
        else:
            self.params.append((param.name, param))
    self.params = OrderedDict(self.params)
def test_many_steps(self):
    x = tensor.tensor3('x')
    mask = tensor.matrix('mask')
    h = self.simple.apply(x, mask=mask, iterate=True)
    calc_h = theano.function(inputs=[x, mask], outputs=[h])

    x_val = 0.1 * numpy.asarray(list(itertools.permutations(range(4))),
                                dtype=theano.config.floatX)
    x_val = numpy.ones((24, 4, 3),
                       dtype=theano.config.floatX) * x_val[..., None]
    mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
    mask_val[12:24, 3] = 0
    h_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX)
    for i in range(1, 25):
        h_val[i] = numpy.tanh(h_val[i - 1].dot(
            2 * numpy.ones((3, 3))) + x_val[i - 1])
        h_val[i] = (mask_val[i - 1, :, None] * h_val[i] +
                    (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
    h_val = h_val[1:]
    assert_allclose(h_val, calc_h(x_val, mask_val)[0], rtol=1e-04)

    # Also test that initial state is a parameter
    initial_state, = VariableFilter(roles=[INITIAL_STATE])(
        ComputationGraph(h))
    assert is_shared_variable(initial_state)
    assert initial_state.name == 'initial_state'
def train_base_model(self, train_data, test_data, input_dim):
    x = T.matrix('features')
    y = T.matrix('targets')
    mlp, cost, mis_cost = self.create_base_model(x, y, input_dim)
    cg = ComputationGraph([cost])
    inputs = VariableFilter(roles=[INPUT])(cg.variables)
    cg = apply_dropout(cg, inputs, 0.2)

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Adam(learning_rate=0.001))
    data_stream = train_data
    data_stream_test = test_data

    monitor = DataStreamMonitoring(variables=[mis_cost],
                                   data_stream=data_stream_test,
                                   prefix="test")
    plot_ext = Plot('F1-measure',
                    channels=[['test_MisclassificationRate']],
                    after_batch=True)
    main_loop = MainLoop(data_stream=data_stream,
                         algorithm=algorithm,
                         extensions=[monitor,
                                     FinishAfter(after_n_epochs=50),
                                     Printing(),
                                     plot_ext])
    main_loop.run()
    return mlp
def primal_step(self, x, y, learning_rate, input_dim, p):
    self.model = self.model(x, y, input_dim, p)
    score, probs = self.model.create_model()

    criterion = self.alpha * p - self.beta * np.float32(1 - p)
    r = theano.shared(np.float32(0.0), name='tp+fp')
    q = theano.shared(np.float32(0.0), name='tn+fn')

    pos_criterion = T.lt(probs, 0.5) * -criterion * score
    neg_criterion = T.gt(probs, 0.5) * criterion * score
    cost_weighed = T.mean(pos_criterion * T.gt(criterion, 0) +
                          neg_criterion * T.lt(criterion, 0))
    cg = ComputationGraph([cost_weighed])

    # Reward version
    r_temp = (self.t * r + T.mean(score * T.gt(probs, 0.5))) / (self.t + 1)
    q_temp = (self.t * q + T.mean(score * T.lt(probs, 0.5))) / (self.t + 1)

    # True Count version
    # r_temp = (self.t*r + T.mean(1.0 * T.gt(probs, 0.5)))/(self.t + 1)
    # q_temp = (self.t*q + T.mean(1.0 * T.lt(probs, 0.5)))/(self.t + 1)

    primal_updates = [(r, r_temp), (q, q_temp), (self.t, self.t + 1)]
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    updates = Adam(cost_weighed, weights) + primal_updates

    # r = tp + fp
    # q = fp + fn
    primal_var = [r, q]
    return updates, cost_weighed, score, primal_var
def monitoring_vars(self, cg):
    mu, sigma, coeff = VariableFilter(
        applications=[self.gmm_emitter.gmmmlp.apply],
        name_regex="output")(cg.variables)

    min_sigma = sigma.min().copy(name="sigma_min")
    mean_sigma = sigma.mean().copy(name="sigma_mean")
    max_sigma = sigma.max().copy(name="sigma_max")

    min_mu = mu.min().copy(name="mu_min")
    mean_mu = mu.mean().copy(name="mu_mean")
    max_mu = mu.max().copy(name="mu_max")

    monitoring_vars = [mean_sigma, min_sigma,
                       min_mu, max_mu,
                       mean_mu, max_sigma]
    return monitoring_vars
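# Hedged sketch, not from the original source: the variables returned by
# monitoring_vars are already named copies, so one way to track them (an
# assumption about how the surrounding training script is organized) is to
# hand them to a Blocks monitoring extension:
#
#     from blocks.extensions.monitoring import TrainingDataMonitoring
#     monitor = TrainingDataMonitoring(self.monitoring_vars(cg),
#                                      prefix="train", after_batch=True)
#     # ... then add `monitor` to the MainLoop extensions list.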
def test_collect():
    x = tensor.matrix()
    mlp = MLP(activations=[Logistic(), Logistic()],
              dims=[784, 100, 784],
              use_bias=False)
    cost = SquaredError().apply(x, mlp.apply(x))
    cg = ComputationGraph(cost)
    var_filter = VariableFilter(roles=[PARAMETER])
    W1, W2 = var_filter(cg.variables)
    for i, W in enumerate([W1, W2]):
        W.set_value(numpy.ones_like(W.get_value()) * (i + 1))

    new_cg = collect_parameters(cg, cg.shared_variables)
    collected_parameters, = new_cg.shared_variables
    assert numpy.all(collected_parameters.get_value()[:784 * 100] == 1.)
    assert numpy.all(collected_parameters.get_value()[784 * 100:] == 2.)
    assert collected_parameters.ndim == 1

    W1, W2 = VariableFilter(roles=[COLLECTED])(new_cg.variables)
    assert W1.eval().shape == (784, 100)
    assert numpy.all(W1.eval() == 1.)
    assert W2.eval().shape == (100, 784)
    assert numpy.all(W2.eval() == 2.)
def train(train_set, test_set): x = tensor.matrix('features') y = tensor.lmatrix('targets') l1 = Linear( name='input_to_hidden', input_dim=2, output_dim=3, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0) ) l1.initialize() h = Logistic().apply(l1.apply(x)) l2 = Linear( name='hidden_to_output', input_dim=l1.output_dim, output_dim=2, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0) ) l2.initialize() y_hat = Softmax().apply(l2.apply(h)) cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) error = MisclassificationRate().apply(y.flatten(), y_hat) error.name = 'misclassification_rate' cg = ComputationGraph(cost) W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + 1e-8 * (W1 ** 2).sum() + 1e-8 * (W2 ** 2).sum() cost.name = 'cost_with_regularization' print('W1', W1.get_value()) print('W2', W2.get_value()) algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=RMSProp() ) data_stream_train = Flatten( DataStream.default_stream( train_set, iteration_scheme=ShuffledScheme(train_set.num_examples, batch_size=4) ) ) data_stream_test = Flatten( DataStream.default_stream( test_set, iteration_scheme=SequentialScheme(test_set.num_examples, batch_size=1) ) ) monitor = DataStreamMonitoring( variables=[cost, error], data_stream=data_stream_test, prefix="test" ) main_loop = MainLoop( data_stream=data_stream_train, algorithm=algorithm, extensions=[ monitor, FinishAfter(after_n_epochs=100), Printing(), # ProgressBar() ] ) main_loop.run()
input_dim=input_dim, output_dim=num_hidden_nodes) h = Rectifier().apply(input_to_hidden.apply(x)) hidden_to_output = Linear(name='hidden_to_output', input_dim=num_hidden_nodes, output_dim=2) y_hat = Softmax().apply(hidden_to_output.apply(h)) y = tensor.lmatrix('targets') from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) from blocks.roles import WEIGHT from blocks.graph import ComputationGraph from blocks.filter import VariableFilter cg = ComputationGraph(cost) W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) L1,L2 = 0.05, 0.05 cost = cost + L1 * (W1 ** 2).sum() + L2 * (W2 ** 2).sum() cost.name = 'cost_with_regularization' from blocks.bricks import MLP mlp = MLP(activations=[Rectifier(), Softmax()], dims=[input_dim,num_hidden_nodes, 2]).apply(x) W1.name = 'W1' from blocks.initialization import IsotropicGaussian, Constant hidden_to_output.weights_init = IsotropicGaussian(0.01) input_to_hidden.weights_init = hidden_to_output.weights_init hidden_to_output.biases_init = Constant(0) input_to_hidden.biases_init = hidden_to_output.biases_init input_to_hidden.initialize()
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh, test_tag, use_load_ext, load_log, fast_start): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) train_conf = config['training'] recognizer = create_model(config, data, test_tag) # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) prediction, prediction_mask = add_exploration(recognizer, data, train_conf) # # Observables: # primary_observables = [] # monitored each batch secondary_observables = [] # monitored every 10 batches validation_observables = [] # monitored on the validation set cg = recognizer.get_cost_graph( batch=True, prediction=prediction, prediction_mask=prediction_mask) labels, = VariableFilter( applications=[recognizer.cost], name='labels')(cg) labels_mask, = VariableFilter( applications=[recognizer.cost], name='labels_mask')(cg) gain_matrix = VariableFilter( theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg) if len(gain_matrix): gain_matrix, = gain_matrix primary_observables.append( rename(gain_matrix.min(), 'min_gain')) primary_observables.append( rename(gain_matrix.max(), 'max_gain')) batch_cost = cg.outputs[0].sum() batch_size = rename(recognizer.labels.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. 
cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter( applications=[r.generator.readout.readout], name="output_0")( cost_cg) bottom_output = VariableFilter( # We need name_regex instead of name because LookupTable calls itsoutput output_0 applications=[r.bottom.apply], name_regex="output")( cost_cg)[-1] attended, = VariableFilter( applications=[r.generator.transition.apply], name="attended")( cost_cg) attended_mask, = VariableFilter( applications=[r.generator.transition.apply], name="attended_mask")( cost_cg) weights, = VariableFilter( applications=[r.generator.evaluate], name="weights")( cost_cg) max_recording_length = rename(bottom_output.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = rename(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = rename(attended.shape[0], "max_attended_length") max_num_phonemes = rename(labels.shape[0], "max_num_phonemes") min_energy = rename(energies.min(), "min_energy") max_energy = rename(energies.max(), "max_energy") mean_attended = rename(abs(attended).mean(), "mean_attended") mean_bottom_output = rename(abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = rename(monotonicity_penalty(weights, labels_mask), "weights_penalty") weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy") mask_density = rename(labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config.get('regularization', dict()) regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [p for p in cg.parameters if p not in attention_params] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!! 
# here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = (train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2) train_cost = rename(train_cost, 'train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight panalty ' 'or weight decay is probably stupid') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=Model(regularized_cg.outputs[0]).get_parameter_dict().values(), **reg_config.get('adaptive_noise') ) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = rename( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) primary_observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1]] # model prior variance model = Model(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that dies not have # shapred variables added by adaptive weight noise with open(params, 'r') as src: param_values = load_parameters(src) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) # Define the training algorithm. 
clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable)] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat([name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([name for name, p in parameters.items() if not p in maxnorm_subjects])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects)] burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append( BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances'), weights_entropy, weights_penalty] def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append(rename(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append(rename(aggregation.mean(var, labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. 
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( primary_observables, after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables), data.get_stream("valid", shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search']) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost, track_the_best_per] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf.get('num_batches'), after_n_epochs=train_conf.get('num_epochs')) .add_condition(["after_batch"], _gradient_norm_is_none), ] channels = [ # Plot 1: training and validation costs [average_monitoring.record_name(train_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label')], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording')]] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server),] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar()] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if config['net']['criterion']['name'].startswith('mse'): extensions.append( LogInputsGains( labels, cg, recognizer.generator.readout.emitter, data)) if 
train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name] extensions.append(Patience(**patience_conf)) extensions.append(Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
y = tensor.lmatrix('y')
mlp = MLP(activations=[Logistic(), Softmax()],
          dims=[117, 55, 2],
          weights_init=IsotropicGaussian(),
          biases_init=Constant(0.01))
mlp.initialize()
y_hat = mlp.apply(x)

cost = BinaryCrossEntropy().apply(y, y_hat)
cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
cost = cost + 0.001 * abs(W1).sum() + 0.001 * abs(W2).sum()
cost.name = 'cost'
error_rate = MisclassificationRate().apply(y.argmax(axis=1), y_hat)
error_rate.name = 'error_rate'

algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.1))

train_set = H5PYDataset('mushrooms.hdf5', which_sets=('train',))
train_stream = DataStream.default_stream(
    train_set,
    iteration_scheme=SequentialScheme(
        train_set.num_examples, batch_size=128))
def run_experiment(): np.random.seed(42) X = tensor.tensor4('features') nbr_channels = 3 image_shape = (5, 5) conv_layers = [ ConvolutionalLayer( filter_size=(2,2), num_filters=10, activation=Rectifier().apply, border_mode='valid', pooling_size=(1,1), weights_init=Uniform(width=0.1), #biases_init=Uniform(width=0.01), biases_init=Constant(0.0), name='conv0')] conv_sequence = ConvolutionalSequence( conv_layers, num_channels=nbr_channels, image_size=image_shape) #conv_sequence.push_allocation_config() conv_sequence.initialize() flattener = Flattener() conv_output = conv_sequence.apply(X) y_hat = flattener.apply(conv_output) # Whatever. Not important since we're not going to actually train anything. cost = tensor.sqr(y_hat).sum() #L_grads_method_02 = [tensor.grad(cost, v) for v in VariableFilter(roles=[FILTER, BIAS])(ComputationGraph([y_hat]).variables)] L_grads_method_02 = [tensor.grad(cost, v) for v in VariableFilter(roles=[BIAS])(ComputationGraph([y_hat]).variables)] # works on the sum of the gradients in a mini-batch sum_square_norm_gradients_method_02 = sum([tensor.sqr(g).sum() for g in L_grads_method_02]) D_by_layer = get_conv_layers_transformation_roles(ComputationGraph(conv_output)) individual_sum_square_norm_gradients_method_00 = get_sum_square_norm_gradients_conv_transformations(D_by_layer, cost) # why does this thing depend on N again ? # I don't think I've used a cost that divides by N. N = 2 Xtrain = np.random.randn(N, nbr_channels, image_shape[0], image_shape[1]).astype(np.float32) #Xtrain[1:,:,:,:] = 0.0 Xtrain[:,:,:,:] = 1.0 convolution_filter_variable = VariableFilter(roles=[FILTER])(ComputationGraph([y_hat]).variables)[0] convolution_filter_variable_value = convolution_filter_variable.get_value() convolution_filter_variable_value[:,:,:,:] = 1.0 #convolution_filter_variable_value[0,0,:,:] = 1.0 convolution_filter_variable.set_value(convolution_filter_variable_value) f = theano.function([X], [cost, individual_sum_square_norm_gradients_method_00, sum_square_norm_gradients_method_02]) [c, v0, gs2] = f(Xtrain) #print "[c, v0, gs2]" L_c, L_v0, L_gs2 = ([], [], []) for n in range(N): [nc, nv0, ngs2] = f(Xtrain[n,:, :, :].reshape((1, Xtrain.shape[1], Xtrain.shape[2], Xtrain.shape[3]))) L_c.append(nc) L_v0.append(nv0) L_gs2.append(ngs2) print "Cost for whole mini-batch in single shot : %f." % c print "Cost for whole mini-batch accumulated : %f." % sum(L_c) print "" print "Square-norm of all gradients for each data point in single shot :" print v0.reshape((1,-1)) print "Square-norm of all gradients for each data point iteratively :" print np.array(L_gs2).reshape((1,-1)) print "" print "Difference max abs : %f." % np.max(np.abs(v0 - np.array(L_gs2))) print "" print "Ratios : " print np.array(L_gs2).reshape((1,-1)) / v0.reshape((1,-1))
def train(config, save_path, bokeh_name, params, bokeh_server, test_tag, use_load_ext, load_log, fast_start, validation_epochs, validation_batches, per_epochs, per_batches): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) # Build the main brick and initialize all parameters. recognizer = SpeechRecognizer( data.recordings_source, data.labels_source, data.eos_label, data.num_features, data.num_labels, name="recognizer", data_prepend_eos=data.prepend_eos, character_map=data.character_map, **config["net"]) for brick_path, attribute_dict in sorted( config['initialization'].items(), key=lambda (k, v): -k.count('/')): for attribute, value in attribute_dict.items(): brick, = Selector(recognizer).select(brick_path).bricks setattr(brick, attribute, value) brick.push_initialization_config() recognizer.initialize() # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) if params: logger.info("Load parameters from " + params) recognizer.load_params(params) if test_tag: tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__ __stream = data.get_stream("train") __data = next(__stream.get_epoch_iterator(as_dict=True)) recognizer.recordings.tag.test_value = __data[data.recordings_source] recognizer.recordings_mask.tag.test_value = __data[data.recordings_source + '_mask'] recognizer.labels.tag.test_value = __data[data.labels_source] recognizer.labels_mask.tag.test_value = __data[data.labels_source + '_mask'] theano.config.compute_test_value = 'warn' batch_cost = recognizer.get_cost_graph().sum() batch_size = named_copy(recognizer.recordings.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. 
cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter( applications=[r.generator.readout.readout], name="output_0")( cost_cg) bottom_output, = VariableFilter( applications=[r.bottom.apply], name="output")( cost_cg) attended, = VariableFilter( applications=[r.generator.transition.apply], name="attended")( cost_cg) attended_mask, = VariableFilter( applications=[r.generator.transition.apply], name="attended_mask")( cost_cg) weights, = VariableFilter( applications=[r.generator.evaluate], name="weights")( cost_cg) max_recording_length = named_copy(r.recordings.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = named_copy(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = named_copy(attended.shape[0], "max_attended_length") max_num_phonemes = named_copy(r.labels.shape[0], "max_num_phonemes") min_energy = named_copy(energies.min(), "min_energy") max_energy = named_copy(energies.max(), "max_energy") mean_attended = named_copy(abs(attended).mean(), "mean_attended") mean_bottom_output = named_copy(abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = named_copy(monotonicity_penalty(weights, r.labels_mask), "weights_penalty") weights_entropy = named_copy(entropy(weights, r.labels_mask), "weights_entropy") mask_density = named_copy(r.labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config['regularization'] regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [p for p in cg.parameters if p not in attention_params] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) regularized_cost = regularized_cg.outputs[0] regularized_weights_penalty = regularized_cg.outputs[1] # Model is weird class, we spend lots of time arguing with Bart # what it should be. However it can already nice things, e.g. # one extract all the parameters from the computation graphs # and give them hierahical names. This help to notice when a # because of some bug a parameter is not in the computation # graph. model = SpeechModel(regularized_cost) params = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, params[key].get_value().shape) for key in sorted(params.keys())], width=120)) # Define the training algorithm. 
train_conf = config['training'] clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False): logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable)] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat([name for name, p in params.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([name for name, p in params.items() if not p in maxnorm_subjects])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects)] algorithm = GradientDescent( cost=regularized_cost + reg_config.get("penalty_coof", .0) * regularized_weights_penalty / batch_size + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2, parameters=params.values(), step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)])) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. observables = regularized_cg.outputs observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in params.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' observables.append(stats) def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append(named_copy(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append(named_copy(aggregation.mean( var, recognizer.labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result # Build main loop. 
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( [observables[0], algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length], after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes([cost, weights_entropy, weights_penalty]), data.get_stream("valid"), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=validation_epochs, every_n_batches=validation_batches, after_training=False) extensions.append(validation) recognizer.init_beam_search(10) per = PhonemeErrorRate(recognizer, data.get_dataset("valid")) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=per_epochs, every_n_batches=per_batches, after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_likelihood = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_likelihood, track_the_best_per] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter(data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf['num_batches'], after_n_epochs=train_conf['num_epochs']) .add_condition(["after_batch"], _gradient_norm_is_none), # Live plotting: requires launching `bokeh-server` # and allows to see what happens online. 
Plot(bokeh_name if bokeh_name else os.path.basename(save_path), [# Plot 1: training and validation costs [average_monitoring.record_name(regularized_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label')], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording')]], every_n_batches=10, server_url=bokeh_server), Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_likelihood.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar(), Printing(every_n_batches=1, attribute_filter=PrintingFilterList() )] # Save the config into the status log = TrainingLog() log.status['_config'] = repr(config) main_loop = MainLoop( model=model, log=log, algorithm=algorithm, data_stream=data.get_stream("train"), extensions=extensions) main_loop.run()
x = tensor.matrix('features')

# Linear map from the input layer to the hidden layer.
input_to_hidden = Linear(name='input_to_hidden', input_dim=784, output_dim=100)
# Feed each hidden unit's linear pre-activation through the Rectifier()
# activation; the result is used as the input of the next layer.
h = Rectifier().apply(input_to_hidden.apply(x))
# Linear pre-activation of every unit in the final output layer.
hidden_to_output = Linear(name='hidden_to_output', input_dim=100, output_dim=10)
# Non-linear output transformation of every output unit.
y_hat = Softmax().apply(hidden_to_output.apply(h))

# Target variable.
y = tensor.lmatrix('targets')
# Cost function.
cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
error_rate = MisclassificationRate().apply(y.flatten(), y_hat)

# Build the computation graph.
cg = ComputationGraph(cost)
# Regularize the cost: pick the parameters to penalize.
# W1 is the weight matrix of the first linear layer, W2 that of the second.
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
# L2 regularization.
cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
cost.name = 'cost_with_regularization'

# Define the whole multi-layer perceptron in one brick; the layer-to-layer
# computations were already described above. The `activations` list gives the
# non-linearity of each layer: every MLP layer first applies a linear map and
# then the non-linear transformation. `x` is the input of the MLP.
mlp = MLP(activations=[Rectifier(), Softmax()], dims=[784, 100, 10]).apply(x)

# Once the network is defined, set the initial values of the parameters of the
# linear transformations.
input_to_hidden.weights_init = IsotropicGaussian(0.01)
input_to_hidden.biases_init = Constant(0)
hidden_to_output.weights_init = IsotropicGaussian(0.01)
hidden_to_output.biases_init = Constant(0)
lord = [map_chr_2_ind[char] for char in lord_original] print lord zaza = prob_function([lord], numpy.ones((1, len(lord)), dtype="int8"))[:, 0, :] print zaza print zaza.shape for (ey, row) in enumerate(zaza): print "PREDICTION PROBABILITIES FOR POSITION", ey, "LETTER", repr(lord_original[ey]) sorted_thing = [(prob, ind) for (ind, prob) in enumerate(row)] sorted_thing.sort(reverse=True) for (prob, ind) in sorted_thing: print repr(map_ind_2_chr[ind]), ":", prob print "\n" """ # define a function that gets the overall "sum of scores" at a given time step readouts = VariableFilter(theano_name="readout_readout_output_0")(lstm_net.cost_model.variables)[0] score_function = function([lstm_net.x, lstm_net.mask], readouts.sum(axis=2)) # this section of the playground has some fun rides that revolve around various correlation stuff. uncomment to access # =) sc = StateComputer(lstm_net.cost_model, map_chr_2_ind) # storage for the correlations at the very end correlation_dict = dict() for name in sc.state_var_names: correlation_dict[name] = numpy.zeros(lstm_net.hidden_dims[0], dtype=float) # get validation data to run over valid_data = H5PYDataset("bible.hdf5", which_sets=("valid",), load_in_memory=True) data_stream = PadAndAddMasks( DataStream.default_stream(dataset=valid_data, iteration_scheme=SequentialScheme(valid_data.num_examples, batch_size=128)),