def create_model(config, data, test_tag):
    """Build, configure and initialize the ``SpeechRecognizer`` brick.

    Parameters
    ----------
    config : dict
        ``config["net"]`` is forwarded as keyword arguments to
        ``SpeechRecognizer``; ``config["initialization"]`` maps a brick
        path to a dict of ``{attribute name: value}`` set on that brick.
    data : object
        Supplies the source names, ``eos_label``, ``num_features``,
        ``num_labels``, ``prepend_eos`` and ``character_map`` read below,
        and ``get_stream("train")`` when `test_tag` is true.
    test_tag : bool
        When true, one training batch is attached as Theano test values
        to the recognizer inputs. This also rebinds
        ``tensor.TensorVariable.__str__`` and sets
        ``theano.config.compute_test_value`` to ``'warn'`` globally.

    Returns
    -------
    The initialized recognizer brick.

    """
    # Build the main brick and initialize all parameters.
    recognizer = SpeechRecognizer(
        data.recordings_source, data.labels_source,
        data.eos_label, data.num_features, data.num_labels,
        name="recognizer",
        data_prepend_eos=data.prepend_eos,
        character_map=data.character_map,
        **config["net"])

    # Paths with fewer '/' are handled first. The key takes the whole
    # (path, attributes) pair because tuple-unpacking lambda parameters
    # are a syntax error on Python 3.
    # NOTE(review): presumably shallow-first so that deeper bricks can
    # override what a parent pushes down -- confirm.
    ordered_rules = sorted(config['initialization'].items(),
                           key=lambda item: item[0].count('/'))
    for brick_path, attribute_dict in ordered_rules:
        for attribute, value in attribute_dict.items():
            brick, = Selector(recognizer).select(brick_path).bricks
            setattr(brick, attribute, value)
            brick.push_initialization_config()
    recognizer.initialize()

    if test_tag:
        tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__
        stream = data.get_stream("train")
        batch = next(stream.get_epoch_iterator(as_dict=True))
        recognizer.recordings.tag.test_value = batch[data.recordings_source]
        recognizer.recordings_mask.tag.test_value = batch[
            data.recordings_source + '_mask']
        recognizer.labels.tag.test_value = batch[data.labels_source]
        recognizer.labels_mask.tag.test_value = batch[
            data.labels_source + '_mask']
        theano.config.compute_test_value = 'warn'
    return recognizer
def build_model(images, labels):
    """Chain VGG and the top-direction block and build the dropout cost.

    Parameters
    ----------
    images, labels
        Symbolic inputs; `images` is fed through the network and `labels`
        is passed to ``StructuredCost`` together with the prediction.

    Returns
    -------
    cost_dropout
        First output of the graph obtained by applying dropout (0.5).
    parameters : list
        Parameters selected from the feedforward sequence.

    """
    # Each sub-brick is configured and initialized before being chained.
    vgg = VGG(layer='conv4_4')
    vgg.push_initialization_config()
    vgg.initialize()

    tdb = top_direction_block()
    tdb.push_initialization_config()
    tdb.initialize()

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([vgg.apply, tdb.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()

    prediction = ss_seq.apply(images)
    # The prediction is clipped to [1e-5, 1 - 1e-5] before the cost.
    cost = StructuredCost().apply(
        labels,
        theano.tensor.clip(prediction, 1e-5, 1 - 1e-5))

    graph = ComputationGraph(cost)
    # Dropout goes on the first OUTPUT-role variable of the graph only.
    dropout_target = VariableFilter(roles=[OUTPUT])(graph.variables)[0]
    graph_with_dropout = apply_dropout(graph, [dropout_target], .5)
    cost_dropout = graph_with_dropout.outputs[0]

    # define learned parameters
    parameters = list(Selector([ss_seq]).get_parameters().values())
    return cost_dropout, parameters
def build_model(images, labels):
    """Build a conv-net + batch-normalized MLP classifier cost.

    The returned cost is the categorical cross-entropy of the prediction
    against ``labels.flatten()`` plus ``.01 * mean(W ** 2)`` for each of
    the two MLP weight matrices.

    """
    # Construct a bottom convolutional sequence
    bottom_conv_sequence = convolutional_sequence((3, 3), 16, (160, 160))
    bottom_conv_sequence._push_allocation_config()

    # Flatten layer
    flattener = Flattener()

    # Construct a top MLP (a plain MLP with the same layout was used
    # here before the batch-normalized one).
    conv_out_dim = numpy.prod(bottom_conv_sequence.get_dim('output'))
    activations = [Rectifier(name='non_linear_9'),
                   Softmax(name='non_linear_11')]
    top_mlp = BatchNormalizedMLP(activations,
                                 [conv_out_dim, 1024, 10],
                                 weights_init=IsotropicGaussian(),
                                 biases_init=Constant(0))

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([bottom_conv_sequence.apply,
                                  flattener.apply,
                                  top_mlp.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()

    prediction = ss_seq.apply(images)
    cost_noreg = CategoricalCrossEntropy().apply(labels.flatten(), prediction)

    # add regularization
    weights_by_path = Selector([top_mlp]).get_parameters('W')
    mlp_brick_name = 'batchnormalizedmlp'
    first_weights = weights_by_path['/%s/linear_0.W' % mlp_brick_name]
    second_weights = weights_by_path['/%s/linear_1.W' % mlp_brick_name]

    cost = cost_noreg
    for weights in (first_weights, second_weights):
        cost = cost + .01 * (weights ** 2).mean()
    return cost
def create_model(config, data, test_tag):
    """Create the recognizer, apply initialization rules and initialize it.

    Parameters
    ----------
    config : dict
        Uses ``config["net"]`` (extra keyword arguments for
        ``SpeechRecognizer``) and ``config["initialization"]`` (brick
        path -> ``{attribute: value}``).
    data : object
        Dataset description providing the attributes read below and, when
        `test_tag` is true, a ``"train"`` stream.
    test_tag : bool
        If true, attach a training batch as Theano test values. This has
        global side effects: it rebinds ``TensorVariable.__str__`` and
        sets ``theano.config.compute_test_value = 'warn'``.

    Returns
    -------
    The initialized recognizer brick.

    """
    # Build the main brick and initialize all parameters.
    recognizer = SpeechRecognizer(
        data.recordings_source, data.labels_source,
        data.eos_label, data.num_features, data.num_labels,
        name="recognizer",
        data_prepend_eos=data.prepend_eos,
        character_map=data.character_map,
        **config["net"])

    def path_depth(rule):
        # rule is a (brick_path, attribute_dict) pair; written as a plain
        # function because ``lambda (k, v): ...`` is invalid on Python 3.
        return rule[0].count('/')

    for brick_path, attribute_dict in sorted(
            config['initialization'].items(), key=path_depth):
        for attribute, value in attribute_dict.items():
            brick, = Selector(recognizer).select(brick_path).bricks
            setattr(brick, attribute, value)
            brick.push_initialization_config()
    recognizer.initialize()

    if test_tag:
        tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__
        train_stream = data.get_stream("train")
        first_batch = next(train_stream.get_epoch_iterator(as_dict=True))
        recordings_key = data.recordings_source
        labels_key = data.labels_source
        recognizer.recordings.tag.test_value = first_batch[recordings_key]
        recognizer.recordings_mask.tag.test_value = first_batch[
            recordings_key + '_mask']
        recognizer.labels.tag.test_value = first_batch[labels_key]
        recognizer.labels_mask.tag.test_value = first_batch[
            labels_key + '_mask']
        theano.config.compute_test_value = 'warn'
    return recognizer
def get_gradients(self, X, Y, weights=1.0):
    """Return scaled gradients for the MLP and the output-layer parameters.

    Parameters
    ----------
    X, Y
        Symbolic variables; both are treated as constants when
        differentiating.
    weights
        Either the default ``1.0`` or a symbolic per-example weight that
        is broadcast over the second axis of the cost.

    Returns
    -------
    OrderedDict
        Maps each parameter to its gradient. MLP parameters use the cost
        scaled by the per-row sum of ``sigma ** 2``; ``W_mean``,
        ``b_mean``, ``W_ls`` and ``b_ls`` use the cost scaled elementwise
        by ``sigma ** 2``.

    """
    W_mean, W_ls, b_mean, b_ls = self.parameters

    mean, log_sigma = self.sample_expected(Y)
    sigma = tensor.exp(log_sigma)

    cost = -log_sigma - 0.5 * (X - mean) ** 2 / tensor.exp(2 * log_sigma)
    # NOTE(review): the weighted branch negates the cost while the
    # unweighted one does not -- confirm this asymmetry is intended.
    if weights != 1.0:
        cost = -weights.dimshuffle(0, "x") * cost

    cost_scaled = sigma ** 2 * cost
    cost_gscale = (sigma ** 2).sum(axis=1).dimshuffle([0, "x"])
    cost_gscale = cost_gscale * cost

    gscale_total = cost_gscale.sum()
    scaled_total = cost_scaled.sum()

    gradients = OrderedDict()
    # .values() works on Python 2 and 3 (iteritems() is Python 2 only).
    for param in Selector(self.mlp).get_parameters().values():
        gradients[param] = tensor.grad(gscale_total, param,
                                       consider_constant=[X, Y])

    # Same insertion order as before: W_mean, b_mean, W_ls, b_ls.
    for param in (W_mean, b_mean, W_ls, b_ls):
        gradients[param] = tensor.grad(scaled_total, param,
                                       consider_constant=[X, Y])
    return gradients
def unify_parameters(self, source_id, dest_id):
    """Share matching parameters of one child with another child.

    Every parameter of ``self.children[source_id]`` whose path (relative
    to the source child) matches an include pattern and no exclude
    pattern is installed in the destination child via
    ``self.replace_parameter``.

    Parameters
    ----------
    source_id, dest_id
        Indices into ``self.children``.

    Returns
    -------
    list of str
        Destination parameter paths that were replaced. A path appears
        once per include pattern that matched it.

    Notes
    -----
    ``self.unified_parameters`` is overwritten on every call with
    ``self.convert_names_to_bricks`` applied to the union of the source
    and destination paths touched by this call.

    """
    source = self.children[source_id]
    source_prefix = '/' + source.name + '/'
    dest_name = self.children[dest_id].name
    dest_prefix = '/' + self.name + '/' + dest_name + '/'

    source_params = Selector(source).get_parameters()

    replaced = []
    self.unified_parameters = []

    # .items() works on Python 2 and 3 (iteritems() is Python 2 only).
    for path, var in source_params.items():
        if not path.startswith(source_prefix):
            continue
        source_param = '/' + self.name + path
        relative = path[len(source_prefix):]
        for include_rule in self.parameter_unifications_include:
            if not include_rule.match(relative):
                continue
            # Exclusion is only checked once an include rule has matched.
            if any(exclude_rule.match(relative)
                   for exclude_rule in self.parameter_unifications_exclude):
                continue
            self.replace_parameter(dest_prefix + relative, var)
            replaced += [dest_prefix + relative]
            self.unified_parameters += [source_param]

    self.unified_parameters = self.convert_names_to_bricks(
        set(self.unified_parameters) | set(replaced))
    return replaced
def make_sampling_computation_graph(model_path, num_samples):
    """Build a graph sampling from a pickled three-decoder model.

    Parameters
    ----------
    model_path : str
        Path of a pickled model exposing ``top_bricks`` containing
        ``/decoder_network1``, ``/decoder_network2``, ``/decoder_network3``.
    num_samples : int
        Number of rows drawn; the output is reshaped to
        ``(num_samples, 28, 28)``.

    Returns
    -------
    ComputationGraph
        Graph with the reshaped decoder output as its only output.

    """
    # ``open`` in a ``with`` block replaces the Python 2-only ``file``
    # builtin and closes the handle even if unpickling fails.
    # NOTE(review): unpickling runs arbitrary code -- only load trusted
    # model files.
    with open(model_path, 'rb') as f:
        model = cPickle.load(f)

    selector = Selector(model.top_bricks)
    decoder_mlp1, = selector.select('/decoder_network1').bricks
    decoder_mlp2, = selector.select('/decoder_network2').bricks
    decoder_mlp3, = selector.select('/decoder_network3').bricks
    theano_rng = Random().theano_rng

    z1 = theano_rng.normal(size=(num_samples, decoder_mlp1.input_dim),
                           dtype=theano.config.floatX)

    z2 = decoder_mlp1.apply(z1)
    # Only the first 40 columns are kept; no noise is added at this level
    # (the noise term was already disabled in the original code).
    z2 = z2[:, :40]

    z3 = decoder_mlp2.apply(z2)
    # First 100 columns plus exp(0.5 * remaining columns) * fresh noise.
    z3 = z3[:, :100] + theano.tensor.exp(0.5 * z3[:, 100:]) * \
        theano_rng.normal(size=(num_samples, 100),
                          dtype=theano.config.floatX)

    p = decoder_mlp3.apply(z3).reshape((num_samples, 28, 28))

    return ComputationGraph([p])
def make_sampling_computation_graph(model_path, num_samples):
    """Build a sampling graph for a pickled model with skip connections.

    The first decoder maps ``z2`` to ``h2``; the second maps ``[h2, z1]``
    to ``h1``; the third maps ``[h1, h2]`` to the output, which is
    reshaped to ``(num_samples, 28, 28)``.

    Parameters
    ----------
    model_path : str
        Path of a pickled model exposing ``top_bricks`` containing
        ``/decoder_network1`` .. ``/decoder_network3``.
    num_samples : int
        Number of rows drawn.

    Returns
    -------
    ComputationGraph
        Graph with the reshaped decoder output as its only output.

    """
    # ``open`` in a ``with`` block replaces the Python 2-only ``file``
    # builtin and closes the handle even if unpickling fails.
    # NOTE(review): unpickling runs arbitrary code -- only load trusted
    # model files.
    with open(model_path, 'rb') as f:
        model = cPickle.load(f)

    selector = Selector(model.top_bricks)
    decoder_mlp1, = selector.select('/decoder_network1').bricks
    decoder_mlp2, = selector.select('/decoder_network2').bricks
    decoder_mlp3, = selector.select('/decoder_network3').bricks
    theano_rng = Random().theano_rng

    z2 = theano_rng.normal(size=(num_samples, decoder_mlp1.input_dim),
                           dtype=theano.config.floatX)

    h2 = decoder_mlp1.apply(z2)
    # First 50 columns plus exp(0.5 * remaining columns) * fresh noise.
    h2 = h2[:, :50] + theano.tensor.exp(0.5 * h2[:, 50:]) * \
        theano_rng.normal(size=(num_samples, 50),
                          dtype=theano.config.floatX)

    z1 = theano_rng.normal(size=(num_samples, 10),
                           dtype=theano.config.floatX)

    h1 = decoder_mlp2.apply(theano.tensor.concatenate([h2, z1], axis=1))
    h1 = h1[:, :50] + theano.tensor.exp(0.5 * h1[:, 50:]) * \
        theano_rng.normal(size=(num_samples, 50),
                          dtype=theano.config.floatX)

    p = decoder_mlp3.apply(
        theano.tensor.concatenate([h1, h2], axis=1)
    ).reshape((num_samples, 28, 28))

    return ComputationGraph([p])
def run(discriminative_regularization=True):
    """Train the CelebA VAE for 75 epochs with monitoring and checkpoints.

    Parameters
    ----------
    discriminative_regularization : bool
        Forwarded to ``create_training_computation_graphs``; also selects
        the checkpoint name (``celeba_vae_regularization.zip`` or
        ``celeba_vae_no_regularization.zip``).

    """
    streams = create_celeba_streams(training_batch_size=100,
                                    monitoring_batch_size=500,
                                    include_targets=False)
    main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3]

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    cg, bn_cg, variance_parameters = create_training_computation_graphs(
        discriminative_regularization)
    pop_updates = list(
        set(get_batch_normalization_updates(bn_cg, allow_duplicates=True)))
    decay_rate = 0.05
    extra_updates = []
    for population, minibatch in pop_updates:
        extra_updates.append(
            (population,
             minibatch * decay_rate + population * (1 - decay_rate)))

    model = Model(bn_cg.outputs[0])
    trained_brick_names = ('encoder_convnet', 'encoder_mlp',
                           'decoder_convnet', 'decoder_mlp')
    selector = Selector(
        find_bricks(model.top_bricks,
                    lambda brick: brick.name in trained_brick_names))
    parameters = list(selector.get_parameters().values()) + variance_parameters

    # Prepare algorithm
    algorithm = GradientDescent(cost=bn_cg.outputs[0],
                                parameters=parameters,
                                step_rule=Adam())
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    def named_quantities(graph):
        # Names the three graph outputs in place and returns the list
        # of quantities to monitor.
        cost, kl_term, reconstruction_term = graph.outputs
        cost.name = 'nll_upper_bound'
        avg_kl_term = kl_term.mean(axis=0)
        avg_kl_term.name = 'avg_kl_term'
        avg_reconstruction_term = -reconstruction_term.mean(axis=0)
        avg_reconstruction_term.name = 'avg_reconstruction_term'
        return [cost, avg_kl_term, avg_reconstruction_term]

    train_quantities = named_quantities(bn_cg)
    valid_quantities = named_quantities(cg)

    schedule = dict(after_epoch=False, before_first_epoch=False,
                    every_n_epochs=5)
    train_monitoring = DataStreamMonitoring(
        train_quantities, train_monitor_stream, prefix="train",
        updates=extra_updates, **schedule)
    valid_monitoring = DataStreamMonitoring(
        valid_quantities, valid_monitor_stream, prefix="valid",
        **schedule)

    # Prepare checkpoint
    if discriminative_regularization:
        name_infix = ''
    else:
        name_infix = 'no_'
    save_path = 'celeba_vae_{}regularization.zip'.format(name_infix)
    checkpoint = Checkpoint(save_path, every_n_epochs=5, use_cpickle=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=75), train_monitoring,
                  valid_monitoring, checkpoint, Printing(), ProgressBar()]
    MainLoop(data_stream=main_loop_stream,
             algorithm=algorithm,
             extensions=extensions).run()
def get_gradients(self, features, n_samples):
    """Perform inference and calculate gradients.

    Parameters
    ----------
    features
        Symbolic batch; each row is replicated `n_samples` times.
    n_samples
        Number of samples drawn per datapoint.

    Returns
    -------
    log_px : T.fvector
    log_psx : T.fvector
    gradients : OrderedDict
        Merged gradients of all p- and q-layers, plus L1/L2 terms for
        WEIGHT-role parameters when ``self.l1reg`` or ``self.l2reg`` is
        positive.

    """
    p_layers = self.p_layers
    q_layers = self.q_layers
    n_layers = len(p_layers)

    batch_size = features.shape[0]

    x = replicate_batch(features, n_samples)

    # Get Q-samples
    samples, log_p, log_q = self.sample_q(x)

    # Reshape and sum
    samples = unflatten_values(samples, batch_size, n_samples)
    log_p = unflatten_values(log_p, batch_size, n_samples)
    log_q = unflatten_values(log_q, batch_size, n_samples)

    log_p_all = sum(log_p)
    log_q_all = sum(log_q)

    # Approximate log(p(x))
    log_px = logsumexp(log_p_all - log_q_all, axis=-1) - tensor.log(n_samples)
    log_psx = (logsumexp((log_p_all - log_q_all) / 2, axis=-1) -
               tensor.log(n_samples)) * 2.

    # Approximate log p(x) and calculate IS weights
    w = self.importance_weights(log_p, log_q)

    # p-layers use the raw weights; q-layers use them shifted by
    # 1 / n_samples.
    wp = w.reshape((batch_size * n_samples, ))
    wq = w.reshape((batch_size * n_samples, ))
    wq = wq - (1. / n_samples)

    samples = flatten_values(samples, batch_size * n_samples)

    gradients = OrderedDict()
    # ``range`` instead of the Python 2-only ``xrange``.
    for l in range(n_layers - 1):
        gradients = merge_gradients(
            gradients,
            p_layers[l].get_gradients(samples[l], samples[l + 1],
                                      weights=wp))
        gradients = merge_gradients(
            gradients,
            q_layers[l].get_gradients(samples[l + 1], samples[l],
                                      weights=wq))
    gradients = merge_gradients(
        gradients,
        p_layers[-1].get_gradients(samples[-1], weights=wp))

    if (self.l1reg > 0.) or (self.l2reg > 0.):
        reg_gradients = OrderedDict()
        # .values() works on Python 2 and 3 (iteritems() is Python 2 only).
        for param in Selector(self).get_parameters().values():
            if has_roles(param, (WEIGHT,)):
                reg_cost = (self.l1reg * tensor.sum(abs(param)) +
                            self.l2reg * tensor.sum(param ** 2))
                reg_gradients[param] = tensor.grad(reg_cost, param)
        gradients = merge_gradients(gradients, reg_gradients)

    return log_px, log_psx, gradients
def get_gradients(self, features, n_samples):
    """Return the likelihood bound and per-parameter gradients.

    For each parameter the differentiated cost is the negated mean of
    ``self.log_likelihood_bound(features, n_samples)`` plus
    ``self.l2reg * sum(param ** 2)``.

    Returns
    -------
    log_p_bound
        The value returned by ``self.log_likelihood_bound``.
    gradients : OrderedDict
        Maps every parameter selected from ``self`` to its gradient.

    """
    log_p_bound = self.log_likelihood_bound(features, n_samples)

    # The bound term is the same for every parameter; build it once.
    negative_bound = -log_p_bound.mean()

    gradients = OrderedDict()
    # .values() works on Python 2 and 3 (iteritems() is Python 2 only).
    for param in Selector(self).get_parameters().values():
        cost = negative_bound + self.l2reg * tensor.sum(param ** 2)
        gradients[param] = tensor.grad(cost, param)

    return log_p_bound, gradients
def test_selector():
    """Check brick selection, parameter selection and parameter listing."""
    bottom1 = MockBrickBottom(name="b1")
    bottom2 = MockBrickBottom(name="b2")
    bottom3 = MockBrickBottom(name="b3")
    top1 = MockBrickTop([bottom1, bottom2], name="t1")
    top2 = MockBrickTop([bottom2, bottom3], name="t2")

    # Selecting below a single top brick.
    single = Selector([top1])
    for path, expected_brick in (("/t1/b1", bottom1), ("/t1", top1)):
        chosen = single.select(path)
        assert chosen.bricks[0] == expected_brick
        assert len(chosen.bricks) == 1

    # A brick shared by two parents is reachable through the second one.
    both = Selector([top1, top2])
    shared = both.select("/t2/b2")
    assert shared.bricks[0] == bottom2
    assert len(shared.bricks) == 1

    # A path ending in ".<name>" selects a parameter.
    assert both.select("/t2/b2.V")[0] == bottom2.parameters[0]

    parameters = list(single.get_parameters().items())
    expected = [("/t1/b1.V", bottom1.parameters[0]),
                ("/t1/b1.W", bottom1.parameters[1]),
                ("/t1/b2.V", bottom2.parameters[0]),
                ("/t1/b2.W", bottom2.parameters[1])]
    for position, (path, variable) in enumerate(expected):
        assert parameters[position][0] == path
        assert parameters[position][1] == variable
def create_recognizer(config, net_config, langs, info_dataset,
                      postfix_manager, load_path=None, mask_path=None):
    """Create a ``MultilangDependencyRecognizer`` and unify its parameters.

    Parameters
    ----------
    config : dict
        ``config['initialization']`` (brick path -> ``{attribute:
        value}``) is used only when `load_path` is not given.
    net_config : dict
        Keyword arguments for the recognizer. It is mutated: the keys
        ``'dependency'`` and ``'unification_rules'`` are popped if present.
    langs, info_dataset, postfix_manager
        Forwarded to ``MultilangDependencyRecognizer``.
    load_path : str, optional
        If given, parameters are loaded from it instead of initialized.
    mask_path : str, optional
        If given, a pickled mask dict is loaded and activated.

    Returns
    -------
    The recognizer.

    Notes
    -----
    When the first child has ``soft_pointer`` set, the module-level
    ``data_params_valid`` and ``data_params_train`` are overwritten.

    """
    global data_params_valid
    global data_params_train

    if 'dependency' in net_config:
        net_config.pop('dependency')

    unification_include = []
    unification_exclude = []
    if 'unification_rules' in net_config:
        rules = net_config.pop('unification_rules')
        unification_include = rules.get('include', [])
        unification_exclude = rules.get('exclude', [])

    recognizer = MultilangDependencyRecognizer(
        langs, info_dataset, postfix_manager,
        unification_include, unification_exclude, **net_config)

    first_child = recognizer.children[0]
    if first_child.soft_pointer:
        data_params_valid = {'soften_distributions': {'pointers': (0.0, None)}}
        data_params_train = {'soften_distributions': {
            'pointers': (first_child.soft_pointer_val, None)}}

    if load_path:
        recognizer.load_params(load_path)
    else:
        # Shallow brick paths first. The key takes the whole pair because
        # ``lambda (k, v): ...`` is a syntax error on Python 3.
        for brick_path, attribute_dict in sorted(
                config['initialization'].items(),
                key=lambda item: item[0].count('/')):
            for attribute, value in attribute_dict.items():
                brick, = Selector(recognizer).select(brick_path).bricks
                setattr(brick, attribute, value)
                brick.push_initialization_config()
        recognizer.initialize()

    # Both branches unify every other child with child 0 in the same way.
    unifications = []
    for dest_id in range(1, len(recognizer.children)):
        unifications += recognizer.unify_parameters(0, dest_id)
    logger.info("Unified parameters: \n" + pprint.pformat(unifications))

    if mask_path:
        # Pickles must be read in binary mode (text mode fails on Python 3).
        # NOTE(review): unpickling runs arbitrary code -- only load
        # trusted mask files.
        with open(mask_path, 'rb') as f:
            mask_dict = pickle.load(f)
        recognizer.activate_masks(mask_dict)

    return recognizer
def get_decoder_function(model):
    """Compile a function that decodes a latent matrix with `model`.

    The bricks ``/decoder_mlp`` and ``/decoder_convnet`` are looked up in
    ``model.top_bricks``. The MLP output is reshaped to
    ``(-1,) + decoder_convnet.get_dim("input_")`` and fed to the convnet.

    Returns
    -------
    A Theano function built from the inputs and outputs of the graph
    whose outputs are ``[z, mu_theta]``.

    """
    selector = Selector(model.top_bricks)
    # List-pattern unpacking raises unless exactly one brick matches.
    [decoder_mlp] = selector.select("/decoder_mlp").bricks
    [decoder_convnet] = selector.select("/decoder_convnet").bricks

    print("Building computation graph...")
    z = tensor.matrix()
    hidden = decoder_mlp.apply(z)
    convnet_input_shape = (-1,) + decoder_convnet.get_dim("input_")
    mu_theta = decoder_convnet.apply(hidden.reshape(convnet_input_shape))
    computation_graph = ComputationGraph([z, mu_theta])

    print("Compiling sampling function...")
    return theano.function(computation_graph.inputs,
                           computation_graph.outputs)
def get_gradients(self, X, Y, weights=1.0):
    """Return gradients of the negated weighted log-probability.

    Parameters
    ----------
    X, Y
        Symbolic variables passed to ``self.log_prob``; treated as
        constants when differentiating.
    weights
        A Python float, or a symbolic weight variable. A non-float
        weight is also treated as a constant.

    Returns
    -------
    OrderedDict
        Maps every parameter selected from ``self`` to its gradient.

    """
    cost = -(weights * self.log_prob(X, Y)).sum()

    params = Selector(self).get_parameters()

    # A float weight is not part of the graph and cannot be listed as a
    # constant; anything else is held fixed.
    if isinstance(weights, float):
        constants = [X, Y]
    else:
        constants = [X, Y, weights]

    gradients = OrderedDict()
    # .values() works on Python 2 and 3 (iteritems() is Python 2 only).
    for param in params.values():
        gradients[param] = tensor.grad(cost, param,
                                       consider_constant=list(constants))
    return gradients
def make_sampling_computation_graph(model_path, num_samples):
    """Build a graph sampling from a pickled single-decoder model.

    Parameters
    ----------
    model_path : str
        Path of a pickled model exposing ``top_bricks`` containing
        ``/decoder_network``.
    num_samples : int
        Number of rows drawn; the output is reshaped to
        ``(num_samples, 28, 28)``.

    Returns
    -------
    ComputationGraph
        Graph with the reshaped decoder output as its only output.

    """
    # ``open`` in a ``with`` block replaces the Python 2-only ``file``
    # builtin and closes the handle even if unpickling fails.
    # NOTE(review): unpickling runs arbitrary code -- only load trusted
    # model files.
    with open(model_path, 'rb') as f:
        model = cPickle.load(f)

    selector = Selector(model.top_bricks)
    decoder_mlp, = selector.select('/decoder_network').bricks

    theano_rng = Random().theano_rng
    z = theano_rng.normal(size=(num_samples, decoder_mlp.input_dim),
                          dtype=theano.config.floatX)

    p = decoder_mlp.apply(z).reshape((num_samples, 28, 28))

    return ComputationGraph([p])
def __init__(self, outputs):
    """Collect the top bricks and named parameters of the graph.

    Sets ``self.top_bricks`` (bricks of the graph that are not a child
    of another found brick, without duplicates) and
    ``self._parameter_dict`` (name -> parameter; the brick-hierarchy path
    is used when the parameter belongs to a top brick's hierarchy,
    otherwise the variable's own ``name``).

    Raises
    ------
    ValueError
        If two top bricks share a name.

    """
    super(Model, self).__init__(outputs)
    if len(self.outputs) > 1:
        logger.warning("model with multiple output " + multiple_message)

    bricks = [get_brick(var)
              for var in self.variables + self.scan_variables
              if get_brick(var)]
    children = set(chain.from_iterable(brick.children for brick in bricks))

    # Quadratic complexity: we should not have thousands of
    # top-level bricks.
    self.top_bricks = top_bricks = []
    for candidate in bricks:
        if candidate in children or candidate in top_bricks:
            continue
        top_bricks.append(candidate)

    name_counts = Counter([brick.name for brick in self.top_bricks])
    repeated_names = [name for name, count in name_counts.items()
                      if count > 1]
    if repeated_names:
        raise ValueError("top bricks with the same name:"
                         " {}".format(', '.join(repeated_names)))

    # Invert {path: parameter} so paths can be looked up by parameter.
    path_by_parameter = {}
    for path, parameter in Selector(
            self.top_bricks).get_parameters().items():
        path_by_parameter[parameter] = path

    parameter_list = []
    for parameter in self.parameters:
        if parameter in path_by_parameter:
            key = path_by_parameter[parameter]
        else:
            key = parameter.name
        parameter_list.append((key, parameter))
    self._parameter_dict = OrderedDict(parameter_list)
def __init__(self, outputs):
    """Collect the top bricks and named parameters of the graph.

    Sets ``self.top_bricks`` (bricks of the graph that are not a child
    of another found brick, without duplicates) and ``self.params``, an
    ``OrderedDict`` of name -> PARAMETER-role shared variable. The
    brick-hierarchy path is used as the name when available, otherwise
    the variable's own ``name``.

    Raises
    ------
    ValueError
        If two top bricks share a name.

    """
    super(Model, self).__init__(outputs)
    if len(self.outputs) > 1:
        logger.warning("model with multiple output " + multiple_message)

    bricks = [get_brick(var)
              for var in self.variables + self.scan_variables
              if get_brick(var)]
    children = set(chain.from_iterable(brick.children for brick in bricks))

    # Quadratic complexity: we should not have thousands of
    # top-level bricks.
    self.top_bricks = top_bricks = []
    for candidate in bricks:
        if candidate in children or candidate in top_bricks:
            continue
        top_bricks.append(candidate)

    distinct_names = set(brick.name for brick in self.top_bricks)
    if len(distinct_names) < len(self.top_bricks):
        raise ValueError("top bricks with the same name")

    # Invert {path: parameter} so paths can be looked up by parameter.
    path_by_param = {}
    for path, param in Selector(self.top_bricks).get_params().items():
        path_by_param[param] = path

    named_params = []
    for param in VariableFilter(roles=[PARAMETER])(self.shared_variables):
        if param in path_by_param:
            key = path_by_param[param]
        else:
            key = param.name
        named_params.append((key, param))
    self.params = OrderedDict(named_params)
def create_running_graphs(classifier):
    """Rebuild the classifier's prediction graph from a saved main loop.

    Parameters
    ----------
    classifier : str
        Path of the saved main loop. It is first passed to ``load``
        directly; if that raises ``AttributeError`` it is opened in
        binary mode and the file object is passed instead.

    Returns
    -------
    ComputationGraph
        Graph whose single output is the MLP applied to the flattened
        convnet output of a ``tensor4`` named ``'features'``.

    """
    try:
        main_loop = load(classifier)
        classifier_model = Model(main_loop.algorithm.cost)
    except AttributeError:
        # newer version of blocks
        with open(classifier, 'rb') as src:
            main_loop = load(src)
            classifier_model = Model(main_loop.algorithm.cost)

    selector = Selector(classifier_model.top_bricks)
    [convnet] = selector.select('/convnet').bricks
    [mlp] = selector.select('/mlp').bricks

    x = tensor.tensor4('features')
    conv_features = convnet.apply(x).flatten(ndim=2)
    y_hat = mlp.apply(conv_features)

    return ComputationGraph([y_hat])
def sample_at(self, z):
    """Decode the latent values `z` with the model's decoder bricks.

    `z` is wrapped with ``shared_floatx``, so the compiled function takes
    no arguments beyond the graph's own inputs.

    Returns
    -------
    The result of calling the compiled function, i.e. the convnet output
    for `z`.

    """
    selector = Selector(self.model.top_bricks)
    [decoder_mlp] = selector.select("/decoder_mlp").bricks
    [decoder_convnet] = selector.select("/decoder_convnet").bricks

    print("Building computation graph...")
    sz = shared_floatx(z)
    hidden = decoder_mlp.apply(sz)
    convnet_input_shape = (-1,) + decoder_convnet.get_dim("input_")
    mu_theta = decoder_convnet.apply(hidden.reshape(convnet_input_shape))
    computation_graph = ComputationGraph([mu_theta])

    print("Compiling sampling function...")
    sampling_function = theano.function(computation_graph.inputs,
                                        computation_graph.outputs[0])

    print("Sampling...")
    return sampling_function()
def test_selector_get_parameters_uniqueness():
    """``get_parameters`` must raise when two siblings share a name."""
    twins = [MockBrickBottom(name="bottom"), MockBrickBottom(name="bottom")]
    parent = MockBrickTop(twins, name="top")
    assert_raises(ValueError, Selector([parent]).get_parameters)
def get_decoder_function(model):
    """Compile a Theano function that runs the decoder of `model`.

    ``/decoder_mlp`` and ``/decoder_convnet`` are selected from
    ``model.top_bricks``. A symbolic matrix ``z`` goes through the MLP,
    is reshaped to ``(-1,) + decoder_convnet.get_dim('input_')`` and then
    through the convnet.

    Returns
    -------
    A Theano function built from the inputs and outputs of the graph
    whose outputs are ``[z, mu_theta]``.

    """
    bricks = Selector(model.top_bricks)
    [decoder_mlp] = bricks.select('/decoder_mlp').bricks
    [decoder_convnet] = bricks.select('/decoder_convnet').bricks

    print('Building computation graph...')
    z = tensor.matrix()
    mlp_output = decoder_mlp.apply(z)
    target_shape = (-1, ) + decoder_convnet.get_dim('input_')
    mu_theta = decoder_convnet.apply(mlp_output.reshape(target_shape))
    graph = ComputationGraph([z, mu_theta])

    print('Compiling sampling function...')
    return theano.function(graph.inputs, graph.outputs)
def get_gradients(self, X, Y, weights=1.):
    """Differentiate the negated weighted log-probability.

    Parameters
    ----------
    X, Y
        Symbolic variables given to ``self.log_prob``; held constant
        during differentiation.
    weights
        Python float (default ``1.``) or a symbolic variable. A symbolic
        weight is held constant as well.

    Returns
    -------
    OrderedDict
        Gradient for every parameter selected from ``self``.

    """
    cost = -(weights * self.log_prob(X, Y)).sum()

    params = Selector(self).get_parameters()

    held_constant = [X, Y]
    if not isinstance(weights, float):
        # Only a symbolic weight can be declared constant.
        held_constant.append(weights)

    gradients = OrderedDict()
    # .values() works on Python 2 and 3 (iteritems() is Python 2 only).
    for param in params.values():
        gradients[param] = tensor.grad(
            cost, param, consider_constant=list(held_constant))
    return gradients
def print_parameteters(models):
    """Log the shape and name of every parameter, then the total count.

    Parameters
    ----------
    models : iterable
        Objects accepted by ``Selector``; their parameter dictionaries
        are combined with ``merge``.

    """
    param_dict = merge(*[Selector(model).get_parameters()
                         for model in models])
    number_of_parameters = 0
    logger.info("Parameter names: ")
    for name, value in param_dict.items():
        shape = value.get_value().shape
        # ``np.product`` was removed in NumPy 2.0; ``np.prod`` is the
        # supported spelling.
        number_of_parameters += np.prod(shape)
        # ``!s`` converts the shape tuple to text before padding; a bare
        # ``{:15}`` on a tuple raises TypeError on Python 3.4+.
        logger.info(' {!s:15}: {}'.format(shape, name))
    logger.info("Total number of parameters: {}".format(number_of_parameters))
def get_image_encoder_function(model):
    """Compile a function that samples a latent code for a batch of images.

    The encoder MLP output is split at ``encoder_mlp.output_dim // 2``
    into ``mu_phi`` and ``log_sigma_phi``, and
    ``z = mu_phi + epsilon * exp(log_sigma_phi)`` with ``epsilon`` drawn
    from a normal distribution.

    Returns
    -------
    A Theano function built from the inputs and outputs of the graph
    whose outputs are ``[x, z]``.

    """
    selector = Selector(model.top_bricks)
    [encoder_convnet] = selector.select("/encoder_convnet").bricks
    [encoder_mlp] = selector.select("/encoder_mlp").bricks

    print("Building computation graph...")
    x = tensor.tensor4("features")
    conv_features = encoder_convnet.apply(x).flatten(ndim=2)
    phi = encoder_mlp.apply(conv_features)

    # First half of the columns: mean; second half: log-sigma.
    nlat = encoder_mlp.output_dim // 2
    mu_phi, log_sigma_phi = phi[:, :nlat], phi[:, nlat:]

    epsilon = Random().theano_rng.normal(size=mu_phi.shape,
                                         dtype=mu_phi.dtype)
    z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
    computation_graph = ComputationGraph([x, z])

    print("Compiling reconstruction function...")
    return theano.function(computation_graph.inputs,
                           computation_graph.outputs)
def get_image_encoder_function(model):
    """Compile a Theano function that encodes images into sampled codes.

    ``/encoder_convnet`` and ``/encoder_mlp`` are selected from
    ``model.top_bricks``. The MLP output is split in two halves of
    ``encoder_mlp.output_dim // 2`` columns, used as mean and log-sigma
    of the sampled code ``z``.

    Returns
    -------
    A Theano function built from the inputs and outputs of the graph
    whose outputs are ``[x, z]``.

    """
    bricks = Selector(model.top_bricks)
    [encoder_convnet] = bricks.select('/encoder_convnet').bricks
    [encoder_mlp] = bricks.select('/encoder_mlp').bricks

    print('Building computation graph...')
    x = tensor.tensor4('features')
    flat_features = encoder_convnet.apply(x).flatten(ndim=2)
    phi = encoder_mlp.apply(flat_features)

    nlat = encoder_mlp.output_dim // 2
    mu_phi = phi[:, :nlat]
    log_sigma_phi = phi[:, nlat:]

    rng = Random().theano_rng
    epsilon = rng.normal(size=mu_phi.shape, dtype=mu_phi.dtype)
    z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
    graph = ComputationGraph([x, z])

    print('Compiling reconstruction function...')
    return theano.function(graph.inputs, graph.outputs)
def sample_at(self, z):
    """Run the decoder bricks of ``self.model`` on the latent values `z`.

    `z` is stored in a shared variable via ``shared_floatx``. The MLP
    output is reshaped to ``(-1,) + decoder_convnet.get_dim('input_')``
    before entering the convnet.

    Returns
    -------
    The value produced by the compiled sampling function.

    """
    bricks = Selector(self.model.top_bricks)
    [decoder_mlp] = bricks.select('/decoder_mlp').bricks
    [decoder_convnet] = bricks.select('/decoder_convnet').bricks

    print('Building computation graph...')
    sz = shared_floatx(z)
    mlp_output = decoder_mlp.apply(sz)
    target_shape = (-1,) + decoder_convnet.get_dim('input_')
    mu_theta = decoder_convnet.apply(mlp_output.reshape(target_shape))
    graph = ComputationGraph([mu_theta])

    print('Compiling sampling function...')
    sampling_function = theano.function(graph.inputs, graph.outputs[0])

    print('Sampling...')
    return sampling_function()
def __init__(self, bricks, cost):
    """Store the bricks and cost, normalizing their types.

    Parameters
    ----------
    bricks
        A ``Selector``, or anything ``Selector`` accepts; wrapped in a
        ``Selector`` when it is not one already.
    cost
        A ``ComputationGraph``, or a ``Variable`` that is wrapped in one.

    """
    selector = bricks if isinstance(bricks, Selector) else Selector(bricks)
    graph = ComputationGraph(cost) if isinstance(cost, Variable) else cost

    self.bricks = selector
    self.cost = graph
    # Both start empty for every instance.
    self.properties = []
    self.updates = []
def extract_parameter_values(bricks):
    """Extract parameter values from a bricks hierarchy.

    Parameters
    ----------
    bricks : (list of) :class:`.Brick`, or :class:`.Selector`
        The top bricks.

    Returns
    -------
    An ordered dictionary of (parameter name, numpy array) pairs. The
    arrays are obtained with ``get_value(borrow=True)``.

    """
    if isinstance(bricks, Brick):
        selector = Selector([bricks])
    elif isinstance(bricks, Selector):
        selector = bricks
    else:
        selector = Selector(bricks)

    values = OrderedDict()
    for name, variable in selector.get_params().items():
        values[name] = variable.get_value(borrow=True)
    return values
def preprocess_svhn(main_loop, save_path):
    """Encode SVHN with the ALI encoder and store the result as HDF5.

    For both the ``'train'`` and ``'test'`` sets, every batch of 100
    examples is mapped to the concatenation of the flattened ``mu`` (the
    first ``ali.encoder._nlat`` columns of the encoder output) and the
    flattened OUTPUT-role variables of encoder layers ``-9``, ``-6`` and
    ``-3``. Features and targets are written to `save_path`.

    Parameters
    ----------
    main_loop
        Main loop whose ``model.top_bricks`` contains the ``/ali`` brick.
    save_path : str
        Destination HDF5 file; opened with mode ``'w'``.

    """
    h5file = h5py.File(save_path, mode='w')
    # try/finally so the file is closed even if encoding fails midway.
    try:
        ali, = Selector(main_loop.model.top_bricks).select('/ali').bricks

        x = tensor.tensor4('features')
        y = tensor.imatrix('targets')
        params = ali.encoder.apply(x)
        mu = params[:, :ali.encoder._nlat]

        acts = [mu]
        acts += VariableFilter(
            bricks=[ali.encoder.layers[-9], ali.encoder.layers[-6],
                    ali.encoder.layers[-3]],
            roles=[OUTPUT])(ComputationGraph([mu]).variables)
        output = tensor.concatenate(
            [act.flatten(ndim=2) for act in acts], axis=1)
        preprocess = theano.function([x, y], [output.flatten(ndim=2), y])

        def encode_split(which_set):
            # Run ``preprocess`` over one SVHN split in sequential batches
            # of 100 and stack the per-batch features and targets.
            dataset = SVHN(2, which_sets=(which_set, ),
                           sources=('features', 'targets'))
            stream = DataStream.default_stream(
                dataset,
                iteration_scheme=SequentialScheme(dataset.num_examples, 100))
            batches = [preprocess(*batch)
                       for batch in stream.get_epoch_iterator()]
            features, targets = map(numpy.vstack, list(zip(*batches)))
            return features, targets

        train_features, train_targets = encode_split('train')
        test_features, test_targets = encode_split('test')

        data = (('train', 'features', train_features),
                ('test', 'features', test_features),
                ('train', 'targets', train_targets),
                ('test', 'targets', test_targets))
        fill_hdf5_file(h5file, data)

        for i, label in enumerate(('batch', 'feature')):
            h5file['features'].dims[i].label = label
        for i, label in enumerate(('batch', 'index')):
            h5file['targets'].dims[i].label = label

        h5file.flush()
    finally:
        h5file.close()
def __init__(self, langs, info_data, postfix_manager,
             parameter_unifications_include,
             parameter_unifications_exclude, **net_config):
    """Set up the multilingual recognizer named ``'recognizer'``.

    Parameters
    ----------
    langs, info_data, postfix_manager
        Stored on the instance unchanged.
    parameter_unifications_include, parameter_unifications_exclude
        Iterables of regular-expression strings; stored compiled.
    **net_config
        Forwarded to ``self.init_recognizers``.

    """
    super(MultilangDependencyRecognizer, self).__init__(name='recognizer')

    self.langs = langs
    self.info_data = info_data
    self.postfix_manager = postfix_manager

    self.parameter_unifications_include = [
        re.compile(pattern) for pattern in parameter_unifications_include]
    self.parameter_unifications_exclude = [
        re.compile(pattern) for pattern in parameter_unifications_exclude]

    self.init_recognizers(**net_config)
    self.selector = Selector(self)

    # One pattern per child: the child's postfix at the very end of the
    # string or followed by '_'.
    postfix_patterns = []
    for chld in self.children:
        postfix_patterns.append(
            re.compile('.*' + chld.names_postfix + '($|_.*)'))
    self.child_postfix_regexp = postfix_patterns
def inject_parameter_values(bricks, param_values):
    """Inject parameter values into a bricks hierarchy.

    Parameters
    ----------
    bricks : :class:`.Brick` or :class:`.Selector` or list of :class:`Brick`
        The top bricks.
    param_values : dict of (parameter name, :class:`~numpy.ndarray`) pairs
        The parameter values.

    Raises
    ------
    ValueError
        If a name in `param_values` does not select exactly one
        parameter. An unknown name is logged as an error first.

    Notes
    -----
    Parameters of the hierarchy that have no entry in `param_values` are
    only reported through ``logger.error``.

    """
    if isinstance(bricks, Brick):
        bricks = Selector([bricks])
    if not isinstance(bricks, Selector):
        bricks = Selector(bricks)

    for name, value in param_values.items():
        selected = bricks.select(name)
        if len(selected) == 0:
            logger.error("Unknown parameter {}".format(name))
        if not len(selected) == 1:
            # Same exception type as before, now saying what went wrong.
            raise ValueError(
                "expected exactly one parameter for {}, found {}".format(
                    name, len(selected)))
        selected = selected[0]

        assert selected.get_value(
            borrow=True, return_internal_type=True).shape == value.shape
        selected.set_value(value)

    params = bricks.get_params()
    for name in params.keys():
        if name not in param_values:
            logger.error(
                "No value is provided for the parameter {}".format(name))
def make_sampling_computation_graph(model_path, num_samples):
    """Build a sampling graph for a pickled model with upsampling bricks.

    ``h2`` is sampled from the output of ``/decoder_network2`` and
    ``h1_tilde`` from the output of ``/decoder_network1``; each output is
    split in half into a mean and a log-variance-like term. Then
    ``h1 = upsample_network1(h2) + h1_tilde`` and the result of
    ``upsample_network2(h1)`` is reshaped to ``(num_samples, 28, 28)``.

    Parameters
    ----------
    model_path : str
        Path of a pickled model exposing ``top_bricks``.
    num_samples : int
        Number of rows drawn.

    Returns
    -------
    ComputationGraph
        Graph with the reshaped output as its only output.

    """
    # ``open`` in a ``with`` block replaces the Python 2-only ``file``
    # builtin and closes the handle even if unpickling fails.
    # NOTE(review): unpickling runs arbitrary code -- only load trusted
    # model files.
    with open(model_path, 'rb') as f:
        model = cPickle.load(f)

    selector = Selector(model.top_bricks)
    decoder_mlp2, = selector.select('/decoder_network2').bricks
    decoder_mlp1, = selector.select('/decoder_network1').bricks
    upsample_mlp2, = selector.select('/upsample_network2').bricks
    upsample_mlp1, = selector.select('/upsample_network1').bricks
    theano_rng = Random().theano_rng

    z2 = theano_rng.normal(size=(num_samples, decoder_mlp2.input_dim),
                           dtype=theano.config.floatX)
    h2_params = decoder_mlp2.apply(z2)
    # The split point is read from an evaluated output: half its width.
    length = int(h2_params.eval().shape[1]/2)
    h2_mu = h2_params[:, :length]
    h2_lognu = h2_params[:, length:]
    h2 = h2_mu + theano.tensor.exp(0.5 * h2_lognu) * \
        theano_rng.normal(size=h2_mu.shape, dtype=h2_mu.dtype)

    z1 = theano_rng.normal(size=(num_samples, decoder_mlp1.input_dim),
                           dtype=theano.config.floatX)
    h1_tilde_params = decoder_mlp1.apply(z1)
    length = int(h1_tilde_params.eval().shape[1]/2)
    h1_tilde_mu = h1_tilde_params[:, :length]
    h1_tilde_lognu = h1_tilde_params[:, length:]
    h1_tilde = h1_tilde_mu + theano.tensor.exp(0.5 * h1_tilde_lognu) * \
        theano_rng.normal(size=h1_tilde_mu.shape, dtype=h1_tilde_mu.dtype)

    # A leftover ``pdb.set_trace()`` breakpoint used to sit here and
    # halted every call; it has been removed.
    h1 = upsample_mlp1.apply(h2) + h1_tilde

    p = upsample_mlp2.apply(h1).reshape((num_samples, 28, 28))

    return ComputationGraph([p])
def def_reading_parameters(self):
    """Return the parameters of the definition reader and the combiner.

    When ``self._reuse_word_embeddings`` is true, parameters that also
    belong to ``self._lookup`` are left out.

    Returns
    -------
    list
        Reader parameters followed by combiner parameters.

    """
    # Materialize a list: on Python 3 ``.values()`` is a view without
    # ``extend``.
    parameters = list(Selector(self._def_reader).get_parameters().values())
    parameters.extend(Selector(self._combiner).get_parameters().values())
    if self._reuse_word_embeddings:
        lookup_parameters = list(
            Selector(self._lookup).get_parameters().values())
        parameters = [p for p in parameters if p not in lookup_parameters]
    return parameters
def load_params(bricks, path):
    """Load brick parameters.

    Loads parameters from an .npz file where they are saved under their
    paths, with "/" stored as "-".

    Parameters
    ----------
    bricks : Brick or Selector
        The bricks.
    path : str or file
        Source for loading.

    Notes
    -----
    Unknown names in the file and parameters missing from the file are
    reported with ``logger.error``. Selection count and shape agreement
    are checked with ``assert``.

    """
    if isinstance(bricks, Brick):
        bricks = Selector([bricks])
    assert isinstance(bricks, Selector)

    # Undo the "/" -> "-" substitution applied when the file was written.
    param_values = {}
    for stored_name, array in numpy.load(path).items():
        param_values[stored_name.replace("-", "/")] = array

    for param_path, array in param_values.items():
        matches = bricks.select(param_path)
        if len(matches) == 0:
            logger.error("Unknown parameter {}".format(param_path))
        assert len(matches) == 1
        target = matches[0]

        current = target.get_value(borrow=True, return_internal_type=True)
        assert current.shape == array.shape
        target.set_value(array)

    for param_path in bricks.get_params().keys():
        if param_path not in param_values:
            logger.error(
                "No value is provided for the parameter {}".format(
                    param_path))
def get_gradients(self, X, Y, weights=1.):
    """Compute scaled gradients for the MLP and output-layer parameters.

    Parameters
    ----------
    X, Y
        Symbolic variables; both are held constant when differentiating.
    weights
        The default ``1.`` or a symbolic per-example weight that is
        broadcast over the second axis of the cost.

    Returns
    -------
    OrderedDict
        Parameter -> gradient. Parameters of ``self.mlp`` use the cost
        multiplied by the per-row sum of ``sigma ** 2``; ``W_mean``,
        ``b_mean``, ``W_ls`` and ``b_ls`` use the cost multiplied
        elementwise by ``sigma ** 2``.

    """
    W_mean, W_ls, b_mean, b_ls = self.parameters

    mean, log_sigma = self.sample_expected(Y)
    sigma = tensor.exp(log_sigma)

    cost = -log_sigma - 0.5 * (X - mean)**2 / tensor.exp(2 * log_sigma)
    # NOTE(review): only the weighted branch flips the sign of the cost
    # -- confirm this is intended.
    if weights != 1.:
        cost = -weights.dimshuffle(0, 'x') * cost

    variance = sigma**2
    cost_scaled = variance * cost
    cost_gscale = variance.sum(axis=1).dimshuffle([0, 'x'])
    cost_gscale = cost_gscale * cost

    mlp_objective = cost_gscale.sum()
    output_objective = cost_scaled.sum()

    gradients = OrderedDict()
    # .values() works on Python 2 and 3 (iteritems() is Python 2 only).
    for param in Selector(self.mlp).get_parameters().values():
        gradients[param] = tensor.grad(mlp_objective, param,
                                       consider_constant=[X, Y])

    # Same insertion order as before: W_mean, b_mean, W_ls, b_ls.
    for param in (W_mean, b_mean, W_ls, b_ls):
        gradients[param] = tensor.grad(output_objective, param,
                                       consider_constant=[X, Y])
    return gradients
def test_selector():
    """Check brick selection, parameter selection and parameter listing."""

    class MockBrickTop(Brick):
        # Container brick: holds the given children, has no parameters.
        def __init__(self, children, **kwargs):
            super(MockBrickTop, self).__init__(**kwargs)
            self.children = children
            self.params = []

    class MockBrickBottom(Brick):
        # Leaf brick with two shared variables named "V" and "W".
        def __init__(self, **kwargs):
            super(MockBrickBottom, self).__init__(**kwargs)
            self.params = [theano.shared(0, "V"), theano.shared(0, "W")]

    bottom1 = MockBrickBottom(name="b1")
    bottom2 = MockBrickBottom(name="b2")
    bottom3 = MockBrickBottom(name="b3")
    top1 = MockBrickTop([bottom1, bottom2], name="t1")
    top2 = MockBrickTop([bottom2, bottom3], name="t2")

    # Selecting below a single top brick.
    single = Selector([top1])
    for path, expected_brick in (("/t1/b1", bottom1), ("/t1", top1)):
        chosen = single.select(path)
        assert chosen.bricks[0] == expected_brick
        assert len(chosen.bricks) == 1

    # A brick shared by two parents is reachable through the second one.
    both = Selector([top1, top2])
    shared = both.select("/t2/b2")
    assert shared.bricks[0] == bottom2
    assert len(shared.bricks) == 1

    # A path ending in ".<name>" selects a parameter.
    assert both.select("/t2/b2.V")[0] == bottom2.params[0]

    params = list(single.get_params().items())
    expected = [("/t1/b1.V", bottom1.params[0]),
                ("/t1/b1.W", bottom1.params[1]),
                ("/t1/b2.V", bottom2.params[0]),
                ("/t1/b2.W", bottom2.params[1])]
    for position, (path, variable) in enumerate(expected):
        assert params[position][0] == path
        assert params[position][1] == variable
def save_params(bricks, path):
    """Save bricks parameters.

    Saves parameters with their paths into an .npz file.

    Parameters
    ----------
    bricks : Brick or Selector
        The bricks.
    path : str or file
        Destination for saving.

    """
    selector = Selector([bricks]) if isinstance(bricks, Brick) else bricks
    assert isinstance(selector, Selector)

    # numpy.savez is vulnerable to slashes in names, so every "/" in a
    # parameter path is replaced by "-" before it is used as an array name.
    param_values = {}
    for name, param in selector.get_params().items():
        param_values[name.replace("/", "-")] = param.get_value()
    numpy.savez(path, **param_values)
def main(config, tr_stream, dev_stream, source_vocab, target_vocab,
         use_bokeh=False):
    """Build the initial-context encoder-decoder and run the training loop.

    Parameters
    ----------
    config : dict
        Experiment configuration.  Keys read here include the model sizes
        (``src_vocab_size``, ``enc_embed``, ``enc_nhids``, ...), the
        regularization switches (``l2_regularization``, ``dropout``,
        ``weight_noise_ff``), the output directory ``saveto`` and the
        sampling / validation options (``hook_samples``, ``bleu_script``,
        ``meteor_directory``, ...).
    tr_stream : data stream
        Training stream handed to the main loop and to the sampler.
    dev_stream : data stream
        Validation stream used by the BLEU and Meteor validators.
    source_vocab, target_vocab
        Vocabularies forwarded to the sampler and validators.
    use_bokeh : bool, optional
        Add a live ``Plot`` extension when bokeh is available.
    """
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    initial_context = tensor.matrix('initial_context')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])

    # Let the user specify the target transition class name in config,
    # eval it and pass it to the decoder.
    # NOTE(security): ``eval`` executes arbitrary code; only safe while the
    # config comes from a trusted source.
    target_transition_name = config.get(
        'target_transition', 'GRUInitialStateWithInitialStateSumContext')
    target_transition = eval(target_transition_name)
    logger.info('Using target transition: {}'.format(target_transition_name))

    decoder = InitialContextDecoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, config['context_dim'], target_transition)

    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask,
        initial_context)
    cost.name = 'decoder_cost'

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    # Recurrent weights are overridden after the push so that they keep the
    # orthogonal scheme instead of the Gaussian default.
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    # TODO: validate performance with/without regularization
    if config.get('l2_regularization', False) is True:
        l2_reg_alpha = config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())
        # The additions above create a new, unnamed variable, so the cost
        # has to be named again.
        cost.name = 'decoder_cost_cost'
        cg = ComputationGraph(cost)

    # Apply dropout for regularization
    if config['dropout'] < 1.0:
        # Dropout is applied to the output of maxout in ghog.  The value is
        # the probability of dropping out, so it should usually be <= 0.5.
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        # ``!s`` converts the tuple first: a width spec applied directly to
        # a tuple raises TypeError on Python 3.
        logger.info(' {!s:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info(' {!s:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Create the training directory, and copy this config there if the
    # directory doesn't exist yet.
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions
    # TODO: add checking for existing model and loading
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Create the theano variables that we need for the sampling graph
    sampling_input = tensor.lmatrix('input')
    sampling_context = tensor.matrix('context_input')

    wants_samples = config['hook_samples'] >= 1
    wants_bleu = config.get('bleu_script', None) is not None
    wants_meteor = config.get('meteor_directory', None) is not None

    # Set up beam search and sampling computation graphs if necessary.
    # The Meteor validator needs ``samples`` and ``search_model`` as well,
    # so it must also trigger building the sampling graph.
    if wants_samples or wants_bleu or wants_meteor:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation,
                                     sampling_context)
        search_model = Model(generated)
        # generated[1] is next_outputs
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))

    # Add sampling
    if wants_samples:
        logger.info("Building sampler")
        extensions.append(
            Sampler(
                model=search_model,
                data_stream=tr_stream,
                hook_samples=config['hook_samples'],
                every_n_batches=config['sampling_freq'],
                src_vocab=source_vocab,
                trg_vocab=target_vocab,
                src_vocab_size=config['src_vocab_size'],
            ))

    # Add early stopping based on bleu
    if wants_bleu:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input,
                          sampling_context,
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Add early stopping based on Meteor
    if wants_meteor:
        logger.info("Building meteor validator")
        extensions.append(
            MeteorValidator(sampling_input,
                            sampling_context,
                            samples=samples,
                            config=config,
                            model=search_model,
                            data_stream=dev_stream,
                            src_vocab=source_vocab,
                            trg_vocab=target_vocab,
                            normalize=config['normalized_bleu'],
                            every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[[
                     'decoder_cost', 'validation_set_bleu_score',
                     'validation_set_meteor_score'
                 ]],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # If there is dropout or random noise, we need to use the output of the
    # modified graph.
    if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0:
        training_cost = cg.outputs[0]
    else:
        training_cost = cost
    # NOTE(security): the step rule class is also resolved with ``eval``.
    step_rule = CompositeRule([
        StepClipping(config['step_clipping']),
        eval(config['step_rule'])()
    ])
    algorithm = GradientDescent(cost=training_cost,
                                parameters=cg.parameters,
                                step_rule=step_rule)

    # Enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
def discriminator_parameters(self):
    """Return the parameters found under ``self.discriminator`` as a list."""
    selector = Selector([self.discriminator])
    parameter_dict = selector.get_parameters()
    return list(parameter_dict.values())
def generator_parameters(self):
    """Return the parameters of ``self.encoder`` and ``self.decoder`` as a list."""
    selector = Selector([self.encoder, self.decoder])
    parameter_dict = selector.get_parameters()
    return list(parameter_dict.values())
def train(config, save_path, bokeh_name,
          params, bokeh_server,
          test_tag, use_load_ext,
          load_log, fast_start,
          validation_epochs, validation_batches,
          per_epochs, per_batches):
    """Build the speech recognizer, its training algorithm and run training.

    Parameters
    ----------
    config : dict
        Experiment configuration; the ``data``, ``net``, ``initialization``,
        ``regularization`` and ``training`` sections are read here.
    save_path : str
        Checkpoint destination.  "_best" / "_best_ll" variants are derived
        from it for the best-PER and best-likelihood models.
    bokeh_name : str or None
        Plot name; falls back to the basename of `save_path`.
    params : str or None
        Path of previously saved parameters.  When given they are loaded
        into the recognizer and used by the ``Load`` / ``LoadLog``
        extensions.
    bokeh_server : str
        URL passed to the ``Plot`` extension.
    test_tag : bool
        Attach a training batch as Theano test values.
    use_load_ext : bool
        Add the ``Load`` extension (requires `params`).
    load_log : bool
        Add the ``LoadLog`` extension (requires `params`).
    fast_start : bool
        Skip validation and checkpointing before the first epoch.
    validation_epochs, validation_batches
        Frequency of likelihood validation.
    per_epochs, per_batches
        Frequency of phoneme-error-rate validation.
    """
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])

    # Build the main brick and initialize all parameters.
    recognizer = SpeechRecognizer(
        data.recordings_source, data.labels_source,
        data.eos_label,
        data.num_features, data.num_labels,
        name="recognizer",
        data_prepend_eos=data.prepend_eos,
        character_map=data.character_map,
        **config["net"])
    # NOTE(review): this sorts the deepest brick paths first, whereas the
    # sibling ``create_model`` applies the shallowest first.  The original
    # order is kept here -- confirm which one is intended.
    for brick_path, attribute_dict in sorted(
            config['initialization'].items(),
            key=lambda item: -item[0].count('/')):
        for attribute, value in attribute_dict.items():
            brick, = Selector(recognizer).select(brick_path).bricks
            setattr(brick, attribute, value)
            brick.push_initialization_config()
    recognizer.initialize()

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = list(Selector(attention).get_parameters().values())

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        # Recursively collect every ``*_init`` attribute of a brick tree.
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    if params:
        logger.info("Load parameters from " + params)
        recognizer.load_params(params)

    if test_tag:
        tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__
        test_stream = data.get_stream("train")
        test_batch = next(test_stream.get_epoch_iterator(as_dict=True))
        recognizer.recordings.tag.test_value = test_batch[
            data.recordings_source]
        recognizer.recordings_mask.tag.test_value = test_batch[
            data.recordings_source + '_mask']
        recognizer.labels.tag.test_value = test_batch[data.labels_source]
        recognizer.labels_mask.tag.test_value = test_batch[
            data.labels_source + '_mask']
        theano.config.compute_test_value = 'warn'

    batch_cost = recognizer.get_cost_graph().sum()
    batch_size = named_copy(recognizer.recordings.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_log_likelihood"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(
        applications=[r.generator.readout.readout], name="output_0")(
            cost_cg)
    bottom_output, = VariableFilter(
        applications=[r.bottom.apply], name="output")(
            cost_cg)
    attended, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended")(
            cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended_mask")(
            cost_cg)
    weights, = VariableFilter(
        applications=[r.generator.evaluate], name="weights")(
            cost_cg)
    max_recording_length = named_copy(r.recordings.shape[0],
                                      "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = named_copy(attended_mask.shape[0],
                                          "max_attended_mask_length")
    max_attended_length = named_copy(attended.shape[0],
                                     "max_attended_length")
    max_num_phonemes = named_copy(r.labels.shape[0],
                                  "max_num_phonemes")
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_attended = named_copy(abs(attended).mean(),
                               "mean_attended")
    mean_bottom_output = named_copy(abs(bottom_output).mean(),
                                    "mean_bottom_output")
    weights_penalty = named_copy(monotonicity_penalty(weights, r.labels_mask),
                                 "weights_penalty")
    weights_entropy = named_copy(entropy(weights, r.labels_mask),
                                 "weights_entropy")
    mask_density = named_copy(r.labels_mask.mean(),
                              "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy,
        min_energy, max_energy,
        mean_attended, mean_bottom_output,
        batch_size, max_num_phonemes,
        mask_density])

    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters
                          if p not in attention_params]
        # Start from ``regularized_cg`` so that dropout is not silently
        # discarded when both dropout and noise are enabled.
        regularized_cg = apply_noise(regularized_cg, noise_subjects,
                                     reg_config['noise'])
    regularized_cost = regularized_cg.outputs[0]
    regularized_weights_penalty = regularized_cg.outputs[1]

    # Model is weird class, we spend lots of time arguing with Bart
    # what it should be. However it can already do nice things, e.g.
    # extract all the parameters from the computation graphs
    # and give them hierarchical names. This helps to notice when,
    # because of some bug, a parameter is not in the computation
    # graph.
    model = SpeechModel(regularized_cost)
    # Kept under its own name: rebinding ``params`` here used to clobber
    # the parameter *path* argument needed by Load/LoadLog below.
    model_params = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, model_params[key].get_value().shape)
                     for key in sorted(model_params.keys())],
                    width=120))

    # Define the training algorithm.
    train_conf = config['training']
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'],
                                   train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(AdaDelta(train_conf['decay_rate'],
                                   train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False):
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n"
                    + pprint.pformat([name
                                      for name, p in model_params.items()
                                      if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n"
                    + pprint.pformat([name
                                      for name, p in model_params.items()
                                      if p not in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)]
    algorithm = GradientDescent(
        cost=regularized_cost
        + reg_config.get("penalty_coof", .0)
        * regularized_weights_penalty / batch_size
        + reg_config.get("decay", .0)
        * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2,
        # A real list: ``dict.values()`` is only a view on Python 3.
        parameters=list(model_params.values()),
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)]))

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    # Copy the list so that the graph's own ``outputs`` is not extended
    # in place.
    observables = list(regularized_cg.outputs)
    observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    for name, param in model_params.items():
        # ``numpy.prod`` replaces the alias ``numpy.product`` (removed in
        # NumPy 2.0).
        num_elements = numpy.prod(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm,
                             step_norm / grad_norm)
        stats.name = name + '_stats'
        observables.append(stats)

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(named_copy(aggregation.mean(var, batch_size),
                                         'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(named_copy(aggregation.mean(
                    var, recognizer.labels_mask.sum()),
                    'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(Load(params, load_iteration_state=True,
                               load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
    ]
    extensions.append(TrainingDataMonitoring(
        [observables[0], algorithm.total_gradient_norm,
         algorithm.total_step_norm, clipping.threshold,
         max_recording_length,
         max_attended_length, max_attended_mask_length],
        after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes([cost, weights_entropy, weights_penalty]),
        data.get_stream("valid"), prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=validation_epochs,
            every_n_batches=validation_batches,
            after_training=False)
    extensions.append(validation)
    recognizer.init_beam_search(10)
    per = PhonemeErrorRate(recognizer, data.get_dataset("valid"))
    per_monitoring = DataStreamMonitoring(
        [per], data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=per_epochs,
            every_n_batches=per_batches,
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_likelihood = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_likelihood, track_the_best_per]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf['num_batches'],
                    after_n_epochs=train_conf['num_epochs'])
        .add_condition(["after_batch"], _gradient_norm_is_none),
        # Live plotting: requires launching `bokeh-server`
        # and allows to see what happens online.
        Plot(bokeh_name
             if bokeh_name
             else os.path.basename(save_path),
             [
                 # Plot 1: training and validation costs
                 [average_monitoring.record_name(regularized_cost),
                  validation.record_name(cost)],
                 # Plot 2: gradient norm,
                 [average_monitoring.record_name(
                     algorithm.total_gradient_norm),
                  average_monitoring.record_name(clipping.threshold)],
                 # Plot 3: phoneme error rate
                 [per_monitoring.record_name(per)],
                 # Plot 4: training and validation mean weight entropy
                 [average_monitoring._record_name(
                     'weights_entropy_per_label'),
                  validation._record_name('weights_entropy_per_label')],
                 # Plot 5: training and validation monotonicity penalty
                 [average_monitoring._record_name(
                     'weights_penalty_per_recording'),
                  validation._record_name(
                      'weights_penalty_per_recording')]],
             every_n_batches=10,
             server_url=bokeh_server),
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_likelihood.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar(),
        Printing(every_n_batches=1,
                 attribute_filter=PrintingFilterList()
                 )]

    # Save the config into the status
    log = TrainingLog()
    log.status['_config'] = repr(config)
    main_loop = MainLoop(
        model=model, log=log, algorithm=algorithm,
        data_stream=data.get_stream("train"),
        extensions=extensions)
    main_loop.run()
def create_training_computation_graphs(
    z_dim,
    image_size,
    net_depth,
    discriminative_regularization,
    classifer,
    vintage,
    reconstruction_factor,
    kl_factor,
    discriminative_factor,
    disc_weights,
):
    """Create the inference and batch-normalized training graphs.

    The graph outputs are, in order: the scalar cost, the scaled KL term,
    the scaled reconstruction term, the summed discriminative term and one
    weighted discriminative term per regularized classifier layer.

    Returns
    -------
    cg : ComputationGraph
        Graph built outside the ``batch_normalization`` context.
    bn_cg : ComputationGraph
        Same graph built inside ``batch_normalization`` for the encoder
        and decoder bricks.
    variance_parameters : list
        ``log_sigma_theta`` followed by one log-sigma shared variable per
        regularized classifier layer.
    """
    x = tensor.tensor4("features")
    pi = numpy.cast[theano.config.floatX](numpy.pi)

    encoder_convnet, encoder_mlp, decoder_convnet, decoder_mlp = (
        create_model_bricks(z_dim=z_dim, image_size=image_size,
                            depth=net_depth))

    if discriminative_regularization:
        if vintage:
            # Vintage checkpoints are loaded straight from the path.
            saved_main_loop = load(classifer)
        else:
            with open(classifer, "rb") as src:
                saved_main_loop = load(src)
        classifier_model = Model(saved_main_loop.algorithm.cost)
        selector = Selector(classifier_model.top_bricks)
        classifier_convnet, = selector.select("/convnet").bricks
        classifier_mlp, = selector.select("/mlp").bricks

    random_brick = Random()

    # Initialize conditional variances
    log_sigma_theta = shared_floatx(
        numpy.zeros((3, image_size, image_size)), name="log_sigma_theta")
    add_role(log_sigma_theta, PARAMETER)
    variance_parameters = [log_sigma_theta]

    num_disc_layers = 0
    if discriminative_regularization:
        # We add discriminative regularization for the batch-normalized
        # output of the strided layers of the classifier.  (A log-sigma for
        # the classifier MLP output is currently disabled.)
        for strided_layer in classifier_convnet.layers[1::3]:
            layer_log_sigma = shared_floatx(
                numpy.zeros(strided_layer.get_dim("output")),
                name="{}_log_sigma".format(strided_layer.name))
            add_role(layer_log_sigma, PARAMETER)
            variance_parameters.append(layer_log_sigma)

        # diagnostic
        num_disc_layers = len(variance_parameters) - 1
        print("Applying discriminative regularization on {} layers".format(num_disc_layers))

    # Computation graph creation is encapsulated within this function in
    # order to allow selecting which parts of the graph will use batch
    # statistics for batch normalization and which parts will use
    # population statistics.  Specifically, we'd like to use population
    # statistics for the classifier even in the training graph.
    def create_computation_graph():
        def log_density(target, mean, log_std, axes):
            # Diagonal-Gaussian log-density summed over ``axes``.
            return -0.5 * (
                tensor.log(2 * pi) + 2 * log_std +
                (target - mean) ** 2 / tensor.exp(2 * log_std)
            ).sum(axis=axes)

        # Encode
        phi = encoder_mlp.apply(encoder_convnet.apply(x).flatten(ndim=2))
        nlat = encoder_mlp.output_dim // 2
        mu_phi = phi[:, :nlat]
        log_sigma_phi = phi[:, nlat:]

        # Sample from the approximate posterior
        epsilon = random_brick.theano_rng.normal(size=mu_phi.shape,
                                                 dtype=mu_phi.dtype)
        z = mu_phi + epsilon * tensor.exp(log_sigma_phi)

        # Decode
        decoder_input_shape = (-1,) + decoder_convnet.get_dim("input_")
        mu_theta = decoder_convnet.apply(
            decoder_mlp.apply(z).reshape(decoder_input_shape))
        pixel_log_sigma = log_sigma_theta.dimshuffle("x", 0, 1, 2)

        # Compute KL and reconstruction terms
        kl_term = 0.5 * (
            tensor.exp(2 * log_sigma_phi) + mu_phi ** 2 -
            2 * log_sigma_phi - 1
        ).sum(axis=1)
        reconstruction_term = log_density(
            x, mu_theta, pixel_log_sigma, [1, 2, 3])

        # Every per-layer term starts out as zeros shaped like the KL term.
        discriminative_layer_terms = [
            tensor.zeros_like(kl_term) for _ in range(num_disc_layers)]
        discriminative_term = tensor.zeros_like(kl_term)

        if discriminative_regularization:
            # Propagate both the input and the reconstruction through the
            # classifier
            acts_cg = ComputationGraph([classifier_mlp.apply(
                classifier_convnet.apply(x).flatten(ndim=2))])
            acts_hat_cg = ComputationGraph([classifier_mlp.apply(
                classifier_convnet.apply(mu_theta).flatten(ndim=2))])

            # Retrieve activations of interest and compute discriminative
            # regularization reconstruction terms (classifier MLP disabled)
            layer_pairs = zip(classifier_convnet.layers[1::3],
                              variance_parameters[1:])
            for i, (layer, layer_log_sigma) in enumerate(layer_pairs):
                variable_filter = VariableFilter(roles=[OUTPUT],
                                                 bricks=[layer])
                d, = variable_filter(acts_cg)
                d_hat, = variable_filter(acts_hat_cg)

                # TODO: this conditional could be less brittle
                if "mlp" in layer.name.lower():
                    layer_log_sigma = layer_log_sigma.dimshuffle("x", 0)
                    sumaxis = [1]
                else:
                    layer_log_sigma = layer_log_sigma.dimshuffle(
                        "x", 0, 1, 2)
                    sumaxis = [1, 2, 3]

                unweighted_term = log_density(
                    d, d_hat, layer_log_sigma, sumaxis)
                discriminative_layer_terms[i] = (
                    discriminative_factor * disc_weights[i] *
                    unweighted_term
                )
                discriminative_term = (
                    discriminative_term + discriminative_layer_terms[i])

        # scale terms (disc is prescaled by layer)
        reconstruction_term = reconstruction_factor * reconstruction_term
        kl_term = kl_factor * kl_term

        # total_reconstruction_term is reconstruction + discriminative
        total_reconstruction_term = reconstruction_term + discriminative_term

        # cost is mean(kl - total reconstruction)
        cost = (kl_term - total_reconstruction_term).mean()

        return ComputationGraph(
            [cost, kl_term, reconstruction_term, discriminative_term] +
            discriminative_layer_terms)

    cg = create_computation_graph()
    with batch_normalization(encoder_convnet, encoder_mlp,
                             decoder_convnet, decoder_mlp):
        bn_cg = create_computation_graph()

    return cg, bn_cg, variance_parameters
def create_training_computation_graphs(discriminative_regularization):
    """Create the inference and batch-normalized training graphs.

    Parameters
    ----------
    discriminative_regularization : bool
        When true, the classifier stored in ``celeba_classifier.zip`` is
        loaded and reconstruction terms on its activations are added.

    Returns
    -------
    cg : ComputationGraph
        Graph built outside the ``batch_normalization`` context, with
        outputs ``[cost, kl_term, reconstruction_term]``.
    bn_cg : ComputationGraph
        Same graph built inside ``batch_normalization`` for the encoder
        and decoder bricks.
    variance_parameters : list
        ``log_sigma_theta`` followed by one log-sigma shared variable per
        regularized classifier layer.
    """
    x = tensor.tensor4('features')
    pi = numpy.cast[theano.config.floatX](numpy.pi)

    encoder_convnet, encoder_mlp, decoder_convnet, decoder_mlp = (
        create_model_bricks())

    if discriminative_regularization:
        saved_main_loop = load('celeba_classifier.zip')
        classifier_model = Model(saved_main_loop.algorithm.cost)
        selector = Selector(classifier_model.top_bricks)
        classifier_convnet, = selector.select('/convnet').bricks

    random_brick = Random()

    # Initialize conditional variances
    log_sigma_theta = shared_floatx(
        numpy.zeros((3, 64, 64)), name='log_sigma_theta')
    add_role(log_sigma_theta, PARAMETER)
    variance_parameters = [log_sigma_theta]

    if discriminative_regularization:
        # We add discriminative regularization for the batch-normalized
        # output of the strided layers of the classifier.
        for strided_layer in classifier_convnet.layers[4::6]:
            layer_log_sigma = shared_floatx(
                numpy.zeros(strided_layer.get_dim('output')),
                name='{}_log_sigma'.format(strided_layer.name))
            add_role(layer_log_sigma, PARAMETER)
            variance_parameters.append(layer_log_sigma)

    # Computation graph creation is encapsulated within this function in
    # order to allow selecting which parts of the graph will use batch
    # statistics for batch normalization and which parts will use
    # population statistics.  Specifically, we'd like to use population
    # statistics for the classifier even in the training graph.
    def create_computation_graph():
        def log_density(target, mean, log_std):
            # Diagonal-Gaussian log-density summed over axes 1, 2 and 3.
            return -0.5 * (
                tensor.log(2 * pi) + 2 * log_std +
                (target - mean) ** 2 / tensor.exp(2 * log_std)
            ).sum(axis=[1, 2, 3])

        # Encode
        phi = encoder_mlp.apply(encoder_convnet.apply(x).flatten(ndim=2))
        nlat = encoder_mlp.output_dim // 2
        mu_phi = phi[:, :nlat]
        log_sigma_phi = phi[:, nlat:]

        # Sample from the approximate posterior
        epsilon = random_brick.theano_rng.normal(
            size=mu_phi.shape, dtype=mu_phi.dtype)
        z = mu_phi + epsilon * tensor.exp(log_sigma_phi)

        # Decode
        decoder_input_shape = (-1,) + decoder_convnet.get_dim('input_')
        mu_theta = decoder_convnet.apply(
            decoder_mlp.apply(z).reshape(decoder_input_shape))
        pixel_log_sigma = log_sigma_theta.dimshuffle('x', 0, 1, 2)

        # Compute KL and reconstruction terms
        kl_term = 0.5 * (
            tensor.exp(2 * log_sigma_phi) + mu_phi ** 2 -
            2 * log_sigma_phi - 1
        ).sum(axis=1)
        reconstruction_term = log_density(x, mu_theta, pixel_log_sigma)
        total_reconstruction_term = reconstruction_term

        if discriminative_regularization:
            # Propagate both the input and the reconstruction through the
            # classifier
            acts_cg = ComputationGraph([classifier_convnet.apply(x)])
            acts_hat_cg = ComputationGraph(
                [classifier_convnet.apply(mu_theta)])

            # Retrieve activations of interest and compute discriminative
            # regularization reconstruction terms
            layer_pairs = zip(classifier_convnet.layers[4::6],
                              variance_parameters[1:])
            for layer, layer_log_sigma in layer_pairs:
                variable_filter = VariableFilter(roles=[OUTPUT],
                                                 bricks=[layer])
                d, = variable_filter(acts_cg)
                d_hat, = variable_filter(acts_hat_cg)
                total_reconstruction_term += log_density(
                    d, d_hat, layer_log_sigma.dimshuffle('x', 0, 1, 2))

        cost = (kl_term - total_reconstruction_term).mean()

        return ComputationGraph([cost, kl_term, reconstruction_term])

    cg = create_computation_graph()
    with batch_normalization(encoder_convnet, encoder_mlp,
                             decoder_convnet, decoder_mlp):
        bn_cg = create_computation_graph()

    return cg, bn_cg, variance_parameters
def get_zdim(self):
    """Return ``input_dim`` of the brick selected by ``/decoder_mlp``."""
    top_bricks = self.model.top_bricks
    # Tuple unpacking enforces that exactly one brick matches the path.
    decoder_mlp, = Selector(top_bricks).select("/decoder_mlp").bricks
    return decoder_mlp.input_dim
def run(batch_size, save_path, z_dim, oldmodel, discriminative_regularization,
        classifier, vintage, monitor_every, monitor_before, checkpoint_every,
        dataset, color_convert, image_size, net_depth, subdir,
        reconstruction_factor, kl_factor, discriminative_factor, disc_weights,
        num_epochs):
    """Assemble the training main loop for the model and run it.

    Parameters
    ----------
    batch_size : int
        Batch size for the training and monitoring streams.
    save_path : str
        Checkpoint destination.
    z_dim, image_size, net_depth
        Forwarded to ``create_training_computation_graphs``.
    oldmodel : str or None
        Path of a saved main loop whose parameter values initialize the
        new model.
    discriminative_regularization, classifier, vintage
        Forwarded to ``create_training_computation_graphs``.
    monitor_every : int
        Monitor every this many epochs.
    monitor_before : bool
        Also monitor before the first epoch.
    checkpoint_every : int
        Checkpoint every this many epochs.
    dataset : str or None
        Custom dataset file; the CelebA streams are used when falsy.
    color_convert
        Forwarded to ``create_custom_streams``.
    subdir : str
        Sub-directory used by ``SampleCheckpoint``.
    reconstruction_factor, kl_factor, discriminative_factor, disc_weights
        Loss weights forwarded to ``create_training_computation_graphs``.
    num_epochs : int
        Training stops after this many epochs.
    """
    if dataset:
        streams = create_custom_streams(filename=dataset,
                                        training_batch_size=batch_size,
                                        monitoring_batch_size=batch_size,
                                        include_targets=False,
                                        color_convert=color_convert)
    else:
        streams = create_celeba_streams(training_batch_size=batch_size,
                                        monitoring_batch_size=batch_size,
                                        include_targets=False)

    main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3]

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    rval = create_training_computation_graphs(
        z_dim, image_size, net_depth, discriminative_regularization,
        classifier, vintage, reconstruction_factor, kl_factor,
        discriminative_factor, disc_weights)
    cg, bn_cg, variance_parameters = rval

    pop_updates = list(
        set(get_batch_normalization_updates(bn_cg, allow_duplicates=True)))
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    model = Model(bn_cg.outputs[0])

    selector = Selector(
        find_bricks(
            model.top_bricks,
            lambda brick: brick.name in ('encoder_convnet', 'encoder_mlp',
                                         'decoder_convnet', 'decoder_mlp')))
    parameters = list(selector.get_parameters().values()) + variance_parameters

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_cg.outputs[0],
                                parameters=parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    sys.setrecursionlimit(1000000)

    monitored_quantities_list = []
    for graph in [bn_cg, cg]:
        # The first four outputs are fixed; any further outputs are the
        # per-layer discriminative terms.
        cost, kl_term, reconstruction_term, discriminative_term = (
            graph.outputs[:4])
        discriminative_layer_terms = graph.outputs[4:]

        cost.name = 'nll_upper_bound'
        avg_kl_term = kl_term.mean(axis=0)
        avg_kl_term.name = 'avg_kl_term'
        avg_reconstruction_term = -reconstruction_term.mean(axis=0)
        avg_reconstruction_term.name = 'avg_reconstruction_term'
        avg_discriminative_term = discriminative_term.mean(axis=0)
        avg_discriminative_term.name = 'avg_discriminative_term'

        avg_discriminative_layer_terms = []
        for i, layer_term in enumerate(discriminative_layer_terms):
            avg_layer_term = layer_term.mean(axis=0)
            avg_layer_term.name = (
                "avg_discriminative_term_layer_{:02d}".format(i))
            avg_discriminative_layer_terms.append(avg_layer_term)

        monitored_quantities_list.append(
            [cost, avg_kl_term, avg_reconstruction_term,
             avg_discriminative_term] + avg_discriminative_layer_terms)

    # The training monitor (batch-normalized graph) also applies the
    # population-statistics updates; the validation monitor does not.
    train_monitoring = DataStreamMonitoring(
        monitored_quantities_list[0], train_monitor_stream, prefix="train",
        updates=extra_updates, after_epoch=False,
        before_first_epoch=monitor_before, every_n_epochs=monitor_every)
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities_list[1], valid_monitor_stream, prefix="valid",
        after_epoch=False, before_first_epoch=monitor_before,
        every_n_epochs=monitor_every)

    # Prepare checkpoint
    checkpoint = Checkpoint(save_path, every_n_epochs=checkpoint_every,
                            before_training=True, use_cpickle=True)

    # TODO: why does z_dim=foo become foo/2?
    # Floor division keeps the dimension an integer on Python 3, where
    # ``z_dim / 2`` would yield a float.
    sample_checkpoint = SampleCheckpoint(
        interface=DiscGenModel, z_dim=z_dim // 2,
        image_size=(image_size, image_size), channels=3,
        dataset=dataset, split="valid", save_subdir=subdir,
        before_training=True, after_epoch=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs),
                  checkpoint, sample_checkpoint, train_monitoring,
                  valid_monitoring, Printing(), ProgressBar()]
    main_loop = MainLoop(model=model, data_stream=main_loop_stream,
                         algorithm=algorithm, extensions=extensions)

    if oldmodel is not None:
        print("Initializing parameters with old model {}".format(oldmodel))
        try:
            saved_model = load(oldmodel)
        except AttributeError:
            # newer version of blocks expects an open file object
            with open(oldmodel, 'rb') as src:
                saved_model = load(src)
        main_loop.model.set_parameter_values(
            saved_model.model.get_parameter_values())
        # Release the loaded main loop before training starts.
        del saved_model

    main_loop.run()
def create_model(config, data, load_path=None, test_tag=False):
    """Build the main brick and initialize or load all parameters.

    Parameters
    ----------
    config : dict
        The configuration dict. ``config["net"]`` supplies the keyword
        arguments of the recognizer; ``config["initialization"]`` maps
        brick paths to ``{attribute: value}`` dicts and is only read when
        `load_path` is not given.
    data : object of class Data
        The dataset creation object.
    load_path : str or None
        If given a string, it will be used to load model parameters.
        Otherwise the initialization config is applied and the parameters
        are initialized by calling ``recognizer.initialize()``.
    test_tag : bool
        If true, tag the input variables with test values taken from the
        first batch of the "train" stream.

    Returns
    -------
    recognizer : SpeechRecognizer
        The constructed recognizer brick.

    """
    # First tell the recognizer about required data sources
    net_config = dict(config["net"])
    bottom_class = net_config['bottom']['bottom_class']
    input_dims = {
        source: data.num_features(source)
        for source in bottom_class.vector_input_sources}
    input_num_chars = {
        source: len(data.character_map(source))
        for source in bottom_class.discrete_input_sources}

    recognizer = SpeechRecognizer(
        input_dims=input_dims,
        input_num_chars=input_num_chars,
        eos_label=data.eos_label,
        num_phonemes=data.num_labels,
        name="recognizer",
        data_prepend_eos=data.prepend_eos,
        character_map=data.character_map('labels'),
        **net_config)

    if load_path:
        recognizer.load_params(load_path)
    else:
        # Visit brick paths with fewer '/' first, so attributes set on
        # deeper paths are applied after those set on shallower ones.
        # The key indexes the (path, attributes) pair instead of using a
        # tuple-unpacking lambda, which is a syntax error on Python 3.
        for brick_path, attribute_dict in sorted(
                config['initialization'].items(),
                key=lambda item: item[0].count('/')):
            for attribute, value in attribute_dict.items():
                # The single-element unpacking raises unless the path
                # selects exactly one brick.
                brick, = Selector(recognizer).select(brick_path).bricks
                setattr(brick, attribute, value)
                brick.push_initialization_config()
        recognizer.initialize()

    if test_tag:
        # fails with newest theano
        # tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__
        train_stream = data.get_stream("train")
        first_batch = next(train_stream.get_epoch_iterator(as_dict=True))
        # Each input variable is looked up in the batch by its own name.
        for input_var in recognizer.inputs.values():
            input_var.tag.test_value = first_batch[input_var.name]
        theano.config.compute_test_value = 'warn'
    return recognizer
def main():
    """Build the three-block PoemModel and train it.

    Reads the configuration from ``configurations.get_config_cs2en``,
    creates three ``PoemBlock`` bricks whose costs are chained through the
    hidden state and representations returned by the previous blocks, sums
    the three costs, optionally applies dropout, and runs a Blocks
    ``MainLoop`` with gradient descent. A sampler extension is added when
    ``config["hook_samples"] >= 1`` and a reload extension when
    ``config["reload"]`` is set.
    """
    # set para
    config = configurations.get_config_cs2en()
    logger.info("Model options:\n{}".format(pprint.pformat(config)))
    tr_stream = get_tr_stream(**config)

    # Create Theano variables
    logger.info("Creating theano variables")
    source_sentence0 = tensor.lmatrix("source0")
    source_sentence_mask0 = tensor.matrix("source0_mask")
    target_sentence0 = tensor.lmatrix("target0")
    target_sentence_mask0 = tensor.matrix("target0_mask")

    source_sentence1 = tensor.lmatrix("source1")
    source_sentence_mask1 = tensor.matrix("source1_mask")
    target_sentence1 = tensor.lmatrix("target1")
    target_sentence_mask1 = tensor.matrix("target1_mask")

    source_sentence2 = tensor.lmatrix("source2")
    source_sentence_mask2 = tensor.matrix("source2_mask")
    target_sentence2 = tensor.lmatrix("target2")
    target_sentence_mask2 = tensor.matrix("target2_mask")

    sampling_input0 = tensor.lmatrix("input0")
    sampling_input1 = tensor.lmatrix("input1")
    sampling_input2 = tensor.lmatrix("input2")

    sampling_hstates0 = tensor.fmatrix("hstates0")
    sampling_hstates1 = tensor.fmatrix("hstates1")
    sampling_hstates2 = tensor.fmatrix("hstates2")

    sampling_lastrep0 = tensor.tensor3("lastrep0")
    sampling_lastrep1 = tensor.tensor3("lastrep1")

    # Zero-valued shared state handed to the first block.
    hstates = theano.shared(
        value=numpy.zeros((config["enc_nhids"]), dtype=theano.config.floatX),
        name="hstates")

    # Get vocab
    # NOTE(review): src_vocab / trg_vocab are not used further in this
    # function; kept so the stream lookup still happens as before.
    sources = get_attr_rec(tr_stream, "data_stream")
    src_vocab = sources.data_streams[0].dataset.dictionary
    trg_vocab = sources.data_streams[1].dataset.dictionary

    # Construct model
    logger.info("Building PoemModel")
    block0 = PoemBlock(config=config, blockid="block0", name="poemblock0")
    block1 = PoemBlock(config=config, blockid="block1", name="poemblock1")
    block2 = PoemBlock(config=config, blockid="block2", name="poemblock2")

    # Each block receives the hidden state of the previous block and the
    # representations produced by the earlier blocks.
    cost0, hsta0, rep0 = block0.cost(
        source_sentence0,
        source_sentence_mask0,
        source_sentence_mask1,
        source_sentence_mask0,
        target_sentence0,
        target_sentence_mask0,
        hstates,
        lastrep0=None,
        lastrep1=None,
    )

    cost1, hsta1, rep1 = block1.cost(
        source_sentence1,
        source_sentence_mask0,
        source_sentence_mask1,
        source_sentence_mask1,
        target_sentence1,
        target_sentence_mask1,
        hsta0,
        lastrep0=rep0,
        lastrep1=None,
    )

    cost2, hsta2, rep2 = block2.cost(
        source_sentence2,
        source_sentence_mask0,
        source_sentence_mask1,
        source_sentence_mask2,
        target_sentence2,
        target_sentence_mask2,
        hsta1,
        lastrep0=rep0,
        lastrep1=rep1,
    )

    cost = cost0 + cost1 + cost2
    cost.name = "total_cost"

    logger.info("Creating computational graph")
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info("Initializing model")
    for block in (block0, block1, block2):
        block.set_initw(IsotropicGaussian(config["weight_scale"]))
        block.set_initb(Constant(0))
        block.push_initialization_config()
        # The special initializers are set after the push, as before.
        block.set_specialinit(Orthogonal(), Orthogonal())
        block.initialize()

    # apply dropout for regularization
    if config["dropout"] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info("Applying dropout")
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == "maxout_apply_output"]
        cg = apply_dropout(cg, dropout_inputs, config["dropout"])
        # apply_dropout returns a new graph; its output must replace the
        # cost, otherwise training keeps using the dropout-free expression.
        cost = cg.outputs[0]
        cost.name = "total_cost"

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        # `!s` converts the shape tuple to text before padding; a bare
        # `{:15}` on a tuple raises TypeError on Python 3.
        logger.info(" {!s:15}: {}".format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names (of the first block only)
    param_dict = Selector(block0).get_parameters()
    logger.info("Parameter names: ")
    for name, value in param_dict.items():
        logger.info(" {!s:15}: {}".format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(len(param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config["finish_after"]),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config["saveto"], every_n_batches=config["save_freq"]),
    ]

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # NOTE(review): eval() executes whatever expression the configuration
    # holds in "step_rule"; only acceptable while the config is trusted.
    step_rule_class = eval(config["step_rule"])
    algorithm = GradientDescent(
        cost=cost,
        parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config["step_clipping"]),
                                 step_rule_class()]),
    )

    # Reload model if necessary
    if config["reload"]:
        extensions.append(LoadNMT(config["saveto"]))

    # Add sampling
    if config["hook_samples"] >= 1:
        logger.info("Building sampler")

        generated0 = block0.mygenerate(sampling_input0, sampling_hstates0)
        search_model0 = Model(generated0)

        generated1 = block1.mygenerate(sampling_input1, sampling_hstates1,
                                       sampling_lastrep0)
        search_model1 = Model(generated1)

        generated2 = block2.mygenerate(sampling_input2, sampling_hstates2,
                                       sampling_lastrep0, sampling_lastrep1)
        search_model2 = Model(generated2)

        extensions.append(
            Sampler(
                config=config,
                model0=search_model0,
                model1=search_model1,
                model2=search_model2,
                data_stream=tr_stream,
                hook_samples=config["hook_samples"],
                every_n_batches=config["sampling_freq"],
                src_vocab_size=config["src_vocab_size"],
            )
        )

        logger.info("End of building sampler")

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model, algorithm=algorithm,
                         data_stream=tr_stream, extensions=extensions)

    # Train!
    main_loop.run()
def get_gradients(self, features, n_samples):
    """Perform inference and calculate gradients.

    Parameters
    ----------
    features : tensor
        Input batch; its first dimension is taken as the batch size.
    n_samples : int or scalar
        Number of samples drawn per datapoint; the batch is replicated
        this many times before sampling.

    Returns
    -------
    log_px : T.fvector
    log_psx : T.fvector
    gradients : OrderedDict

    Notes
    -----
    Also stores ``log_p_bound``, ``log_p`` and ``log_ph`` on ``self``.
    """
    p_layers = self.p_layers
    q_layers = self.q_layers
    n_layers = len(p_layers)

    batch_size = features.shape[0]

    x = replicate_batch(features, n_samples)

    # Get Q-samples
    samples, log_p, log_q = self.sample_q(x)

    # Reshape and sum
    samples = unflatten_values(samples, batch_size, n_samples)
    log_p = unflatten_values(log_p, batch_size, n_samples)
    log_q = unflatten_values(log_q, batch_size, n_samples)

    log_p_all = sum(log_p)
    log_q_all = sum(log_q)
    log_ratio = log_p_all - log_q_all

    # Approximate log p(x)
    log_px_bound = log_p_all[:, 0] - log_q_all[:, 0]
    log_px = logsumexp(log_ratio, axis=-1) - tensor.log(n_samples)
    log_psx = (logsumexp(log_ratio / 2, axis=-1)
               - tensor.log(n_samples)) * 2.

    # Calculate IS weights
    w = self.importance_weights(log_p, log_q)

    # Both networks use the flattened weights; the Q-side weights are
    # additionally shifted by 1/n_samples.
    wp = w.reshape((batch_size * n_samples, ))
    wq = wp - (1. / n_samples)

    samples = flatten_values(samples, batch_size * n_samples)

    gradients = OrderedDict()
    # `range` instead of the Python-2-only `xrange`.
    for layer_idx in range(n_layers - 1):
        gradients = merge_gradients(
            gradients,
            p_layers[layer_idx].get_gradients(
                samples[layer_idx], samples[layer_idx + 1], weights=wp))
        gradients = merge_gradients(
            gradients,
            q_layers[layer_idx].get_gradients(
                samples[layer_idx + 1], samples[layer_idx], weights=wq))
    # The last P layer takes only its own samples.
    gradients = merge_gradients(
        gradients,
        p_layers[-1].get_gradients(samples[-1], weights=wp))

    if (self.l1reg > 0.) or (self.l2reg > 0.):
        reg_gradients = OrderedDict()
        params = Selector(self).get_parameters()
        # Only the parameter objects are needed, so iterate the values
        # (`iteritems` does not exist on Python 3).
        for param in params.values():
            if has_roles(param, (WEIGHT,)):
                reg_cost = (self.l1reg * tensor.sum(abs(param))
                            + self.l2reg * tensor.sum(param ** 2))
                reg_gradients[param] = tensor.grad(reg_cost, param)
        gradients = merge_gradients(gradients, reg_gradients)

    self.log_p_bound = log_px_bound
    self.log_p = log_px
    self.log_ph = log_psx

    return log_px, log_psx, gradients
# Encode every sentence of the dataset with a newly initialized
# bidirectional encoder and collect the results in `reps`.
# The print calls use the single-argument parenthesized form, which
# behaves the same on Python 2 and Python 3.
print('Parsing dataset file...')
vocab = Vocab(dataset_path=args.dataset_path)

source_sentence = tensor.lmatrix('source')

encoder = BidirectionalEncoder(vocab.sequenceLength(), args.embed,
                               args.nhidden)

encoder.weights_init = IsotropicGaussian(args.weight_scale)
encoder.biases_init = Constant(0)
encoder.push_initialization_config()
# Set after the push so that it overrides the pushed weights_init.
encoder.bidir.prototype.weights_init = Orthogonal()
encoder.initialize()

print('Parameter names: ')
enc_param_dict = Selector(encoder).get_params()
for name, value in enc_param_dict.items():
    # `!s` turns the shape tuple into text before padding; a bare `{:15}`
    # on a tuple raises TypeError on Python 3.
    print(' {!s:15}: {}'.format(value.get_value().shape, name))

representation = encoder.apply(source_sentence)

print('Compiling theano function')
f = theano.function([source_sentence], representation)

# One object slot per sentence, because each result is stored as a
# separate array.
reps = np.empty(len(vocab.dataset), dtype=object)

bar = Bar('Encoding', max=len(vocab.dataset))
for idx, sentence in enumerate(vocab.dataset):
    # Reorder the encoder output axes from (0, 1, 2) to (1, 2, 0).
    reps[idx] = f(sentence).transpose((1, 2, 0))
    bar.next()
bar.finish()