Example #1
def create_model(config, data, test_tag):

    # Build the main brick and initialize all parameters.
    recognizer = SpeechRecognizer(data.recordings_source,
                                  data.labels_source,
                                  data.eos_label,
                                  data.num_features,
                                  data.num_labels,
                                  name="recognizer",
                                  data_prepend_eos=data.prepend_eos,
                                  character_map=data.character_map,
                                  **config["net"])
    for brick_path, attribute_dict in sorted(config['initialization'].items(),
                                             key=lambda (k, v): k.count('/')):
        for attribute, value in attribute_dict.items():
            brick, = Selector(recognizer).select(brick_path).bricks
            setattr(brick, attribute, value)
            brick.push_initialization_config()
    recognizer.initialize()

    if test_tag:
        tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__
        __stream = data.get_stream("train")
        __data = next(__stream.get_epoch_iterator(as_dict=True))
        recognizer.recordings.tag.test_value = __data[data.recordings_source]
        recognizer.recordings_mask.tag.test_value = __data[
            data.recordings_source + '_mask']
        recognizer.labels.tag.test_value = __data[data.labels_source]
        recognizer.labels_mask.tag.test_value = __data[data.labels_source +
                                                       '_mask']
        theano.config.compute_test_value = 'warn'
    return recognizer
Example #2
def build_model(images, labels):
    
    vgg = VGG(layer='conv4_4')
    vgg.push_initialization_config()
    vgg.initialize()

    tdb = top_direction_block()
    tdb.push_initialization_config()
    tdb.initialize()

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([vgg.apply, tdb.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()
    
    prediction = ss_seq.apply(images)
    cost       = StructuredCost().apply(labels, theano.tensor.clip(prediction, 1e-5, 1 - 1e-5))

    cg           = ComputationGraph(cost)
    cg_dropout   = apply_dropout(cg, [VariableFilter(roles=[OUTPUT])(cg.variables)[0]], .5)
    cost_dropout = cg_dropout.outputs[0]

    # define learned parameters
    selector = Selector([ss_seq])
    W         = selector.get_parameters()
    parameters = []
    parameters += [v for k, v in W.items()]

    return cost_dropout, parameters 
Example #3
def build_model(images, labels):
    
    # Construct a bottom convolutional sequence
    bottom_conv_sequence = convolutional_sequence((3,3), 16, (160, 160))
    bottom_conv_sequence._push_allocation_config()
    
    # Flatten layer
    flattener = Flattener()

    # Construct a top MLP
    conv_out_dim = numpy.prod(bottom_conv_sequence.get_dim('output'))
    #top_mlp = MLP([Rectifier(name='non_linear_9'), Softmax(name='non_linear_11')], [conv_out_dim, 1024, 10], weights_init=IsotropicGaussian(), biases_init=Constant(0))
    top_mlp = BatchNormalizedMLP([Rectifier(name='non_linear_9'), Softmax(name='non_linear_11')], [conv_out_dim, 1024, 10], weights_init=IsotropicGaussian(), biases_init=Constant(0))
    
    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([bottom_conv_sequence.apply, flattener.apply, top_mlp.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()
    
    prediction = ss_seq.apply(images)
    cost_noreg = CategoricalCrossEntropy().apply(labels.flatten(), prediction)

    # add regularization
    selector = Selector([top_mlp])
    Ws = selector.get_parameters('W')
    mlp_brick_name = 'batchnormalizedmlp'
    W0 = Ws['/%s/linear_0.W' % mlp_brick_name]
    W1 = Ws['/%s/linear_1.W' % mlp_brick_name]

    cost = cost_noreg + .01 * (W0 ** 2).mean() + .01 * (W1 ** 2).mean()


    return cost
Example #4
def create_model(config, data, test_tag):

    # Build the main brick and initialize all parameters.
    recognizer = SpeechRecognizer(
        data.recordings_source, data.labels_source,
        data.eos_label,
        data.num_features, data.num_labels,
        name="recognizer",
        data_prepend_eos=data.prepend_eos,
        character_map=data.character_map,
        **config["net"])
    for brick_path, attribute_dict in sorted(
            config['initialization'].items(),
            key=lambda (k, v): k.count('/')):
        for attribute, value in attribute_dict.items():
            brick, = Selector(recognizer).select(brick_path).bricks
            setattr(brick, attribute, value)
            brick.push_initialization_config()
    recognizer.initialize()

    if test_tag:
        tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__
        __stream = data.get_stream("train")
        __data = next(__stream.get_epoch_iterator(as_dict=True))
        recognizer.recordings.tag.test_value = __data[data.recordings_source]
        recognizer.recordings_mask.tag.test_value = __data[data.recordings_source + '_mask']
        recognizer.labels.tag.test_value = __data[data.labels_source]
        recognizer.labels_mask.tag.test_value = __data[data.labels_source + '_mask']
        theano.config.compute_test_value = 'warn'
    return recognizer
Example #5
    def get_gradients(self, X, Y, weights=1.0):
        W_mean, W_ls, b_mean, b_ls = self.parameters

        mean, log_sigma = self.sample_expected(Y)
        sigma = tensor.exp(log_sigma)

        cost = -log_sigma - 0.5 * (X - mean) ** 2 / tensor.exp(2 * log_sigma)
        if weights != 1.0:
            cost = -weights.dimshuffle(0, "x") * cost

        cost_scaled = sigma ** 2 * cost
        cost_gscale = (sigma ** 2).sum(axis=1).dimshuffle([0, "x"])
        cost_gscale = cost_gscale * cost

        gradients = OrderedDict()

        params = Selector(self.mlp).get_parameters()
        for pname, param in params.iteritems():
            gradients[param] = tensor.grad(cost_gscale.sum(), param, consider_constant=[X, Y])

        gradients[W_mean] = tensor.grad(cost_scaled.sum(), W_mean, consider_constant=[X, Y])
        gradients[b_mean] = tensor.grad(cost_scaled.sum(), b_mean, consider_constant=[X, Y])

        gradients[W_ls] = tensor.grad(cost_scaled.sum(), W_ls, consider_constant=[X, Y])
        gradients[b_ls] = tensor.grad(cost_scaled.sum(), b_ls, consider_constant=[X, Y])

        return gradients
Example #6
    def unify_parameters(self, source_id, dest_id):
        source = self.children[source_id]
        source_name = self.children[source_id].name
        source_prefix = '/' + source_name + '/'
        dest_name = self.children[dest_id].name
        dest_prefix = '/' + self.name + '/' + dest_name + '/'

        source_params = Selector(source).get_parameters()

        replaced = []

        self.unified_parameters = []

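        # For every source parameter matching an include pattern (and no
        # exclude pattern), re-point the corresponding destination parameter
        # to the source's shared variable (via replace_parameter), so the two
        # children share weights.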
        for param, var in source_params.iteritems():
            if not param.startswith(source_prefix):
                continue
            source_param = '/' + self.name + param
            param = param[len(source_prefix):]
            for unification in self.parameter_unifications_include:
                if unification.match(param):
                    exclude = False
                    for ex_unification in self.parameter_unifications_exclude:
                        if ex_unification.match(param):
                            exclude = True
                            break
                    if exclude:
                        continue
                    self.replace_parameter(dest_prefix + param, var)
                    replaced += [dest_prefix + param]
                    self.unified_parameters += [source_param]
        self.unified_parameters = self.convert_names_to_bricks(
            set(self.unified_parameters) | set(replaced))
        return replaced
Example #7
def make_sampling_computation_graph(model_path, num_samples):
    f = file(model_path, 'rb')
    model = cPickle.load(f)#main_loop = load(model_path)#
    f.close()
    #model = main_loop.model
    selector = Selector(model.top_bricks)
    decoder_mlp1, = selector.select('/decoder_network1').bricks
    decoder_mlp2, = selector.select('/decoder_network2').bricks
    decoder_mlp3, = selector.select('/decoder_network3').bricks
    theano_rng = Random().theano_rng

    z1 = theano_rng.normal(size=(num_samples, decoder_mlp1.input_dim),
                           dtype=theano.config.floatX)

    z2 = decoder_mlp1.apply(z1)
    z2 = z2[:, :40]# + theano.tensor.exp(0.5 * z2[:, 40:]) * theano_rng.normal(size=(num_samples, 40),
                    #                                                          dtype=theano.config.floatX)

    z3 = decoder_mlp2.apply(z2)
    z3 = z3[:, :100] + theano.tensor.exp(0.5 * z3[:, 100:]) * theano_rng.normal(size=(num_samples, 100),
                                                                                dtype=theano.config.floatX)

    p = decoder_mlp3.apply(z3).reshape((num_samples, 28, 28))

    return ComputationGraph([p])
Example #8
def make_sampling_computation_graph(model_path, num_samples):
    f = file(model_path, 'rb')
    model = cPickle.load(f)#main_loop = load(model_path)#
    f.close()
    #model = main_loop.model
    selector = Selector(model.top_bricks)
    decoder_mlp1, = selector.select('/decoder_network1').bricks
    decoder_mlp2, = selector.select('/decoder_network2').bricks
    decoder_mlp3, = selector.select('/decoder_network3').bricks
    theano_rng = Random().theano_rng

    z2 = theano_rng.normal(size=(num_samples, decoder_mlp1.input_dim),
                           dtype=theano.config.floatX)

    h2 = decoder_mlp1.apply(z2) 
    h2 = h2[:, :50] + theano.tensor.exp(0.5 * h2[:, 50:]) * theano_rng.normal(size=(num_samples, 50),
                                                                              dtype=theano.config.floatX)


    z1 = theano_rng.normal(size=(num_samples, 10),
                           dtype=theano.config.floatX)

    h1 = decoder_mlp2.apply(theano.tensor.concatenate([h2, z1], axis=1)) 
    h1 = h1[:, :50] + theano.tensor.exp(0.5 * h1[:, 50:]) * theano_rng.normal(size=(num_samples, 50),
                                                                              dtype=theano.config.floatX)

    p = decoder_mlp3.apply(theano.tensor.concatenate([h1, h2], axis=1)).reshape((num_samples, 28, 28))

    return ComputationGraph([p])
Example #9
def run(discriminative_regularization=True):
    streams = create_celeba_streams(training_batch_size=100,
                                    monitoring_batch_size=500,
                                    include_targets=False)
    main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3]

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    rval = create_training_computation_graphs(discriminative_regularization)
    cg, bn_cg, variance_parameters = rval
    pop_updates = list(
        set(get_batch_normalization_updates(bn_cg, allow_duplicates=True)))
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    model = Model(bn_cg.outputs[0])
    selector = Selector(
        find_bricks(
            model.top_bricks,
            lambda brick: brick.name in ('encoder_convnet', 'encoder_mlp',
                                         'decoder_convnet', 'decoder_mlp')))
    parameters = list(selector.get_parameters().values()) + variance_parameters

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_cg.outputs[0],
                                parameters=parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    monitored_quantities_list = []
    for graph in [bn_cg, cg]:
        cost, kl_term, reconstruction_term = graph.outputs
        cost.name = 'nll_upper_bound'
        avg_kl_term = kl_term.mean(axis=0)
        avg_kl_term.name = 'avg_kl_term'
        avg_reconstruction_term = -reconstruction_term.mean(axis=0)
        avg_reconstruction_term.name = 'avg_reconstruction_term'
        monitored_quantities_list.append(
            [cost, avg_kl_term, avg_reconstruction_term])
    train_monitoring = DataStreamMonitoring(
        monitored_quantities_list[0], train_monitor_stream, prefix="train",
        updates=extra_updates, after_epoch=False, before_first_epoch=False,
        every_n_epochs=5)
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities_list[1], valid_monitor_stream, prefix="valid",
        after_epoch=False, before_first_epoch=False, every_n_epochs=5)

    # Prepare checkpoint
    save_path = 'celeba_vae_{}regularization.zip'.format(
        '' if discriminative_regularization else 'no_')
    checkpoint = Checkpoint(save_path, every_n_epochs=5, use_cpickle=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=75), train_monitoring,
                  valid_monitoring, checkpoint, Printing(), ProgressBar()]
    main_loop = MainLoop(data_stream=main_loop_stream,
                         algorithm=algorithm, extensions=extensions)
    main_loop.run()
Example #10
    def get_gradients(self, features, n_samples):
        """Perform inference and calculate gradients.

        Returns
        -------
        log_px : T.fvector
        log_psx : T.fvector
        gradients : OrderedDict
        """
        p_layers = self.p_layers
        q_layers = self.q_layers
        n_layers = len(p_layers)

        batch_size = features.shape[0]

        x = replicate_batch(features, n_samples)

        # Get Q-samples
        samples, log_p, log_q = self.sample_q(x)

        # Reshape and sum
        samples = unflatten_values(samples, batch_size, n_samples)
        log_p = unflatten_values(log_p, batch_size, n_samples)
        log_q = unflatten_values(log_q, batch_size, n_samples)

        log_p_all = sum(log_p)
        log_q_all = sum(log_q)

        # Approximate log(p(x))
        log_px = logsumexp(log_p_all - log_q_all, axis=-1) - tensor.log(n_samples)
        log_psx = (logsumexp((log_p_all - log_q_all) / 2, axis=-1) - tensor.log(n_samples)) * 2.

        # Approximate log p(x) and calculate IS weights
        w = self.importance_weights(log_p, log_q)

        wp = w.reshape((batch_size * n_samples, ))
        wq = w.reshape((batch_size * n_samples, ))
        wq = wq - (1. / n_samples)

        samples = flatten_values(samples, batch_size * n_samples)

        gradients = OrderedDict()
        for l in xrange(n_layers - 1):
            gradients = merge_gradients(gradients, p_layers[l].get_gradients(samples[l], samples[l + 1], weights=wp))
            gradients = merge_gradients(gradients, q_layers[l].get_gradients(samples[l + 1], samples[l], weights=wq))
        gradients = merge_gradients(gradients, p_layers[-1].get_gradients(samples[-1], weights=wp))

        if (self.l1reg > 0.) or (self.l2reg > 0.):
            reg_gradients = OrderedDict()
            params = Selector(self).get_parameters()
            for pname, param in params.iteritems():
                if has_roles(param, (WEIGHT,)):
                    reg_cost = self.l1reg * tensor.sum(abs(param)) + self.l2reg * tensor.sum(param ** 2)
                    reg_gradients[param] = tensor.grad(reg_cost, param)
            gradients = merge_gradients(gradients, reg_gradients)

        return log_px, log_psx, gradients
Example #11
    def get_gradients(self, features, n_samples):
        log_p_bound = self.log_likelihood_bound(features, n_samples)

        gradients = OrderedDict()
        params = Selector(self).get_parameters()
        for pname, param in params.iteritems():
            cost = -log_p_bound.mean() + self.l2reg * tensor.sum(param ** 2)
            gradients[param] = tensor.grad(cost, param)

        return log_p_bound, gradients
Example #12
def test_selector():
    b1 = MockBrickBottom(name="b1")
    b2 = MockBrickBottom(name="b2")
    b3 = MockBrickBottom(name="b3")
    t1 = MockBrickTop([b1, b2], name="t1")
    t2 = MockBrickTop([b2, b3], name="t2")

    s1 = Selector([t1])
    s11 = s1.select("/t1/b1")
    assert s11.bricks[0] == b1
    assert len(s11.bricks) == 1
    s12 = s1.select("/t1")
    assert s12.bricks[0] == t1
    assert len(s12.bricks) == 1

    s2 = Selector([t1, t2])
    s21 = s2.select("/t2/b2")
    assert s21.bricks[0] == b2
    assert len(s21.bricks) == 1

    assert s2.select("/t2/b2.V")[0] == b2.parameters[0]

    parameters = list(s1.get_parameters().items())
    assert parameters[0][0] == "/t1/b1.V"
    assert parameters[0][1] == b1.parameters[0]
    assert parameters[1][0] == "/t1/b1.W"
    assert parameters[1][1] == b1.parameters[1]
    assert parameters[2][0] == "/t1/b2.V"
    assert parameters[2][1] == b2.parameters[0]
    assert parameters[3][0] == "/t1/b2.W"
    assert parameters[3][1] == b2.parameters[1]
Example #13
def create_recognizer(config, net_config, langs, info_dataset,
              postfix_manager, load_path=None, mask_path=None):
    if 'dependency' in net_config:
        net_config.pop('dependency')
    unification_include = []
    unification_exclude = []
    if 'unification_rules' in net_config:
        ur = net_config.pop('unification_rules')
        unification_include = ur.get('include', [])
        unification_exclude = ur.get('exclude', [])

        
    recognizer = MultilangDependencyRecognizer(langs, info_dataset, postfix_manager, unification_include, unification_exclude, **net_config)

    if recognizer.children[0].soft_pointer:
        global data_params_valid
        global data_params_train
        data_params_valid = {'soften_distributions': {'pointers': (0.0, None)}}
        data_params_train = {'soften_distributions':
                                {'pointers':
                                    (recognizer.children[0].soft_pointer_val,
                                     None)}}

    if load_path:
        recognizer.load_params(load_path)
        unifications = []
        for dest_id in xrange(1, len(recognizer.children)):
            unifications += recognizer.unify_parameters(0, dest_id)
        logger.info("Unified parameters: \n"+
                    pprint.pformat(unifications))
    else:
        for brick_path, attribute_dict in sorted(
                config['initialization'].items(),
                key=lambda (k, v): k.count('/')):
            for attribute, value in attribute_dict.items():
                brick, = Selector(recognizer).select(brick_path).bricks
                setattr(brick, attribute, value)
                brick.push_initialization_config()
        recognizer.initialize()
        unifications = []
        for dest_id in xrange(1, len(recognizer.children)):
            unifications += recognizer.unify_parameters(0, dest_id)
        logger.info("Unified parameters: \n"+
                    pprint.pformat(unifications))
    if mask_path:
        with open(mask_path, 'r') as f:
            mask_dict = pickle.load(f)
            recognizer.activate_masks(mask_dict)


    return recognizer
Example #14
def get_decoder_function(model):
    selector = Selector(model.top_bricks)
    decoder_mlp, = selector.select("/decoder_mlp").bricks
    decoder_convnet, = selector.select("/decoder_convnet").bricks

    print("Building computation graph...")
    z = tensor.matrix()
    mu_theta = decoder_convnet.apply(decoder_mlp.apply(z).reshape((-1,) + decoder_convnet.get_dim("input_")))
    computation_graph = ComputationGraph([z, mu_theta])

    print("Compiling sampling function...")
    decoder_function = theano.function(computation_graph.inputs, computation_graph.outputs)

    return decoder_function
Example #15
    def get_gradients(self, X, Y, weights=1.0):
        cost = -(weights * self.log_prob(X, Y)).sum()

        params = Selector(self).get_parameters()

        gradients = OrderedDict()
        if isinstance(weights, float):
            for pname, param in params.iteritems():
                gradients[param] = tensor.grad(cost, param, consider_constant=[X, Y])
        else:
            for pname, param in params.iteritems():
                gradients[param] = tensor.grad(cost, param, consider_constant=[X, Y, weights])

        return gradients
Example #16
def make_sampling_computation_graph(model_path, num_samples):
    f = file(model_path, 'rb')
    model = cPickle.load(f)#main_loop = load(model_path)#
    f.close()
    #model = main_loop.model
    selector = Selector(model.top_bricks)
    decoder_mlp, = selector.select('/decoder_network').bricks
    theano_rng = Random().theano_rng

    z = theano_rng.normal(size=(num_samples, decoder_mlp.input_dim),
                          dtype=theano.config.floatX)
    p = decoder_mlp.apply(z).reshape((num_samples, 28, 28))

    return ComputationGraph([p])
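A hedged usage sketch for the graph builder above: the returned ComputationGraph has no symbolic inputs (the latent z comes from the Theano RNG), so it can be compiled directly into a zero-argument sampling function. The pickle path is a placeholder and assumes a saved model exposing a '/decoder_network' top brick, as in the example.

import theano

cg = make_sampling_computation_graph('model.pkl', num_samples=16)
# No inputs are needed; z is generated inside the graph.
sample = theano.function(cg.inputs, cg.outputs[0])
images = sample()  # numpy array of shape (16, 28, 28)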
Example #17
    def __init__(self, outputs):
        super(Model, self).__init__(outputs)
        if len(self.outputs) > 1:
            logger.warning("model with multiple output " + multiple_message)

        bricks = [
            get_brick(var) for var in self.variables + self.scan_variables
            if get_brick(var)
        ]
        children = set(chain(*(brick.children for brick in bricks)))
        # Quadratic complexity: we should not have thousands of
        # top-level bricks.
        self.top_bricks = []
        for brick in bricks:
            if brick not in children and brick not in self.top_bricks:
                self.top_bricks.append(brick)
        names = Counter([brick.name for brick in self.top_bricks])
        repeated_names = [name for name, count in names.items() if count > 1]
        if repeated_names:
            raise ValueError("top bricks with the same name:"
                             " {}".format(', '.join(repeated_names)))
        brick_parameter_names = {
            v: k
            for k, v in Selector(self.top_bricks).get_parameters().items()
        }
        parameter_list = []
        for parameter in self.parameters:
            if parameter in brick_parameter_names:
                parameter_list.append(
                    (brick_parameter_names[parameter], parameter))
            else:
                parameter_list.append((parameter.name, parameter))
        self._parameter_dict = OrderedDict(parameter_list)
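A minimal sketch of what the constructor above produces, assuming a recent Blocks release where Model exposes get_parameter_dict(); the small MLP and the cost are illustrative only.

from theano import tensor
from blocks.bricks import MLP, Tanh
from blocks.initialization import IsotropicGaussian, Constant
from blocks.model import Model

x = tensor.matrix('features')
mlp = MLP(activations=[Tanh()], dims=[10, 2],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
          name='mlp')
mlp.initialize()
model = Model(mlp.apply(x).sum())

# Parameters reachable from a top brick get Selector-style paths,
# e.g. '/mlp/linear_0.W'; free-floating shared variables keep their own name.
print(list(model.get_parameter_dict().keys()))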
Example #18
    def __init__(self, outputs):
        super(Model, self).__init__(outputs)
        if len(self.outputs) > 1:
            logger.warning("model with multiple output " + multiple_message)

        bricks = [
            get_brick(var) for var in self.variables + self.scan_variables
            if get_brick(var)
        ]
        children = set(chain(*(brick.children for brick in bricks)))
        # Quadratic complexity: we should not have thousands of
        # top-level bricks.
        self.top_bricks = []
        for brick in bricks:
            if brick not in children and brick not in self.top_bricks:
                self.top_bricks.append(brick)
        if len(set(b.name for b in self.top_bricks)) < len(self.top_bricks):
            raise ValueError("top bricks with the same name")

        brick_param_names = {
            v: k
            for k, v in Selector(self.top_bricks).get_params().items()
        }
        self.params = []
        for param in VariableFilter(roles=[PARAMETER])(self.shared_variables):
            if param in brick_param_names:
                self.params.append((brick_param_names[param], param))
            else:
                self.params.append((param.name, param))
        self.params = OrderedDict(self.params)
Example #19
def create_running_graphs(classifier):
    try:
        classifier_model = Model(load(classifier).algorithm.cost)
    except AttributeError:
        # newer version of blocks
        with open(classifier, 'rb') as src:
            classifier_model = Model(load(src).algorithm.cost)

    selector = Selector(classifier_model.top_bricks)
    convnet, = selector.select('/convnet').bricks
    mlp, = selector.select('/mlp').bricks

    x = tensor.tensor4('features')
    y_hat = mlp.apply(convnet.apply(x).flatten(ndim=2))
    cg = ComputationGraph([y_hat])
    return cg
Example #20
def create_running_graphs(classifier):
    try:
        classifier_model = Model(load(classifier).algorithm.cost)
    except AttributeError:
        # newer version of blocks
        with open(classifier, 'rb') as src:
            classifier_model = Model(load(src).algorithm.cost)

    selector = Selector(classifier_model.top_bricks)
    convnet, = selector.select('/convnet').bricks
    mlp, = selector.select('/mlp').bricks

    x = tensor.tensor4('features')
    y_hat = mlp.apply(convnet.apply(x).flatten(ndim=2))
    cg = ComputationGraph([y_hat])
    return cg
Example #21
    def sample_at(self, z):
        selector = Selector(self.model.top_bricks)
        decoder_mlp, = selector.select("/decoder_mlp").bricks
        decoder_convnet, = selector.select("/decoder_convnet").bricks

        print("Building computation graph...")
        sz = shared_floatx(z)
        mu_theta = decoder_convnet.apply(decoder_mlp.apply(sz).reshape((-1,) + decoder_convnet.get_dim("input_")))
        computation_graph = ComputationGraph([mu_theta])

        print("Compiling sampling function...")
        sampling_function = theano.function(computation_graph.inputs, computation_graph.outputs[0])

        print("Sampling...")
        samples = sampling_function()
        return samples
Example #22
def test_selector_get_parameters_uniqueness():
    top = MockBrickTop(
        [MockBrickBottom(name="bottom"),
         MockBrickBottom(name="bottom")],
        name="top")

    selector = Selector([top])
    assert_raises(ValueError, selector.get_parameters)
Example #23
def get_decoder_function(model):
    selector = Selector(model.top_bricks)
    decoder_mlp, = selector.select('/decoder_mlp').bricks
    decoder_convnet, = selector.select('/decoder_convnet').bricks

    print('Building computation graph...')
    z = tensor.matrix()
    mu_theta = decoder_convnet.apply(
        decoder_mlp.apply(z).reshape((-1, ) +
                                     decoder_convnet.get_dim('input_')))
    computation_graph = ComputationGraph([z, mu_theta])

    print('Compiling sampling function...')
    decoder_function = theano.function(computation_graph.inputs,
                                       computation_graph.outputs)

    return decoder_function
Example #24
    def get_gradients(self, X, Y, weights=1.):
        cost = -(weights * self.log_prob(X, Y)).sum()

        params = Selector(self).get_parameters()

        gradients = OrderedDict()
        if isinstance(weights, float):
            for pname, param in params.iteritems():
                gradients[param] = tensor.grad(cost,
                                               param,
                                               consider_constant=[X, Y])
        else:
            for pname, param in params.iteritems():
                gradients[param] = tensor.grad(
                    cost, param, consider_constant=[X, Y, weights])

        return gradients
Example #25
def test_selector():
    b1 = MockBrickBottom(name="b1")
    b2 = MockBrickBottom(name="b2")
    b3 = MockBrickBottom(name="b3")
    t1 = MockBrickTop([b1, b2], name="t1")
    t2 = MockBrickTop([b2, b3], name="t2")

    s1 = Selector([t1])
    s11 = s1.select("/t1/b1")
    assert s11.bricks[0] == b1
    assert len(s11.bricks) == 1
    s12 = s1.select("/t1")
    assert s12.bricks[0] == t1
    assert len(s12.bricks) == 1

    s2 = Selector([t1, t2])
    s21 = s2.select("/t2/b2")
    assert s21.bricks[0] == b2
    assert len(s21.bricks) == 1

    assert s2.select("/t2/b2.V")[0] == b2.parameters[0]

    parameters = list(s1.get_parameters().items())
    assert parameters[0][0] == "/t1/b1.V"
    assert parameters[0][1] == b1.parameters[0]
    assert parameters[1][0] == "/t1/b1.W"
    assert parameters[1][1] == b1.parameters[1]
    assert parameters[2][0] == "/t1/b2.V"
    assert parameters[2][1] == b2.parameters[0]
    assert parameters[3][0] == "/t1/b2.W"
    assert parameters[3][1] == b2.parameters[1]
Example #26
def print_parameteters(models):
    param_dict = merge(*[Selector(model).get_parameters() for model in models])
    number_of_parameters = 0

    logger.info("Parameter names: ")
    for name, value in param_dict.items():
        number_of_parameters += np.product(value.get_value().shape)
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(number_of_parameters))
Example #27
def get_image_encoder_function(model):
    selector = Selector(model.top_bricks)
    encoder_convnet, = selector.select("/encoder_convnet").bricks
    encoder_mlp, = selector.select("/encoder_mlp").bricks

    print("Building computation graph...")
    x = tensor.tensor4("features")
    phi = encoder_mlp.apply(encoder_convnet.apply(x).flatten(ndim=2))
    nlat = encoder_mlp.output_dim // 2
    mu_phi = phi[:, :nlat]
    log_sigma_phi = phi[:, nlat:]
    epsilon = Random().theano_rng.normal(size=mu_phi.shape, dtype=mu_phi.dtype)
    z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
    computation_graph = ComputationGraph([x, z])

    print("Compiling reconstruction function...")
    encoder_function = theano.function(computation_graph.inputs, computation_graph.outputs)
    return encoder_function
Example #28
def get_image_encoder_function(model):
    selector = Selector(model.top_bricks)
    encoder_convnet, = selector.select('/encoder_convnet').bricks
    encoder_mlp, = selector.select('/encoder_mlp').bricks

    print('Building computation graph...')
    x = tensor.tensor4('features')
    phi = encoder_mlp.apply(encoder_convnet.apply(x).flatten(ndim=2))
    nlat = encoder_mlp.output_dim // 2
    mu_phi = phi[:, :nlat]
    log_sigma_phi = phi[:, nlat:]
    epsilon = Random().theano_rng.normal(size=mu_phi.shape, dtype=mu_phi.dtype)
    z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
    computation_graph = ComputationGraph([x, z])

    print('Compiling reconstruction function...')
    encoder_function = theano.function(
        computation_graph.inputs, computation_graph.outputs)
    return encoder_function
Example #29
    def sample_at(self, z):
        selector = Selector(self.model.top_bricks)
        decoder_mlp, = selector.select('/decoder_mlp').bricks
        decoder_convnet, = selector.select('/decoder_convnet').bricks

        print('Building computation graph...')
        sz = shared_floatx(z)
        mu_theta = decoder_convnet.apply(
            decoder_mlp.apply(sz).reshape(
                (-1,) + decoder_convnet.get_dim('input_')))
        computation_graph = ComputationGraph([mu_theta])

        print('Compiling sampling function...')
        sampling_function = theano.function(
            computation_graph.inputs, computation_graph.outputs[0])

        print('Sampling...')
        samples = sampling_function()
        return samples
Example #30
    def __init__(self, bricks, cost):
        if not isinstance(bricks, Selector):
            bricks = Selector(bricks)
        if isinstance(cost, Variable):
            cost = ComputationGraph(cost)
        self.bricks = bricks
        self.cost = cost

        self.properties = []
        self.updates = []
Example #31
def extract_parameter_values(bricks):
    """Extract parameter values from a bricks hierarchy.

    Parameters
    ----------
    bricks : (list of) :class:`.Brick`, or :class:`.Selector`
        The top bricks.

    Returns
    -------
    A dictionary of (parameter name, numpy array) pairs.

    """
    if isinstance(bricks, Brick):
        bricks = Selector([bricks])
    if not isinstance(bricks, Selector):
        bricks = Selector(bricks)
    return OrderedDict([(name, variable.get_value(borrow=True))
                        for name, variable in bricks.get_params().items()])
Example #32
def preprocess_svhn(main_loop, save_path):
    h5file = h5py.File(save_path, mode='w')

    ali, = Selector(main_loop.model.top_bricks).select('/ali').bricks
    x = tensor.tensor4('features')
    y = tensor.imatrix('targets')
    params = ali.encoder.apply(x)
    mu = params[:, :ali.encoder._nlat]
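    # Collect the encoder mean plus three intermediate encoder layer outputs
    # and concatenate them, flattened, into one feature vector per example.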
    acts = []
    acts += [mu]
    acts += VariableFilter(bricks=[
        ali.encoder.layers[-9], ali.encoder.layers[-6], ali.encoder.layers[-3]
    ],
                           roles=[OUTPUT])(ComputationGraph([mu]).variables)
    output = tensor.concatenate([act.flatten(ndim=2) for act in acts], axis=1)
    preprocess = theano.function([x, y], [output.flatten(ndim=2), y])

    train_set = SVHN(2,
                     which_sets=('train', ),
                     sources=('features', 'targets'))
    train_stream = DataStream.default_stream(train_set,
                                             iteration_scheme=SequentialScheme(
                                                 train_set.num_examples, 100))
    train_features, train_targets = map(
        numpy.vstack,
        list(
            zip(*[
                preprocess(*batch)
                for batch in train_stream.get_epoch_iterator()
            ])))

    test_set = SVHN(2, which_sets=('test', ), sources=('features', 'targets'))
    test_stream = DataStream.default_stream(test_set,
                                            iteration_scheme=SequentialScheme(
                                                test_set.num_examples, 100))
    test_features, test_targets = map(
        numpy.vstack,
        list(
            zip(*[
                preprocess(*batch)
                for batch in test_stream.get_epoch_iterator()
            ])))

    data = (('train', 'features', train_features), ('test', 'features',
                                                    test_features),
            ('train', 'targets', train_targets), ('test', 'targets',
                                                  test_targets))
    fill_hdf5_file(h5file, data)
    for i, label in enumerate(('batch', 'feature')):
        h5file['features'].dims[i].label = label
    for i, label in enumerate(('batch', 'index')):
        h5file['targets'].dims[i].label = label

    h5file.flush()
    h5file.close()
Example #33
    def __init__(self, langs, info_data, postfix_manager,
                 parameter_unifications_include,
                 parameter_unifications_exclude, **net_config):
        super(MultilangDependencyRecognizer, self).__init__(name='recognizer')
        self.langs = langs
        self.info_data = info_data
        self.postfix_manager = postfix_manager
        self.parameter_unifications_include = [
            re.compile(unification)
            for unification in parameter_unifications_include
        ]
        self.parameter_unifications_exclude = [
            re.compile(unification)
            for unification in parameter_unifications_exclude
        ]
        self.init_recognizers(**net_config)
        self.selector = Selector(self)
        self.child_postfix_regexp = [
            re.compile('.*' + chld.names_postfix + '($|_.*)')
            for chld in self.children
        ]
Example #34
def inject_parameter_values(bricks, param_values):
    """Inject parameter values into a bricks hierarchy.

    Parameters
    ----------
    bricks : :class:`.Brick` or :class:`.Selector`, or list of :class:`.Brick`
        The top bricks.
    param_values : dict of (parameter name, :class:`~numpy.ndarray`) pairs
        The parameter values.

    """
    if isinstance(bricks, Brick):
        bricks = Selector([bricks])
    if not isinstance(bricks, Selector):
        bricks = Selector(bricks)

    for name, value in param_values.items():
        selected = bricks.select(name)
        if len(selected) == 0:
            logger.error("Unknown parameter {}".format(name))
        if not len(selected) == 1:
            raise ValueError
        selected = selected[0]

        assert selected.get_value(
            borrow=True, return_internal_type=True).shape == value.shape
        selected.set_value(value)

    params = bricks.get_params()
    for name in params.keys():
        if name not in param_values:
            logger.error(
                "No value is provided for the parameter {}".format(name))
Example #35
def make_sampling_computation_graph(model_path, num_samples):
    f = file(model_path, 'rb')
    model = cPickle.load(f)#main_loop = load(model_path)#
    f.close()
    #model = main_loop.model
    selector = Selector(model.top_bricks)
    decoder_mlp2, = selector.select('/decoder_network2').bricks
    decoder_mlp1, = selector.select('/decoder_network1').bricks
    upsample_mlp2, = selector.select('/upsample_network2').bricks
    upsample_mlp1, = selector.select('/upsample_network1').bricks
    theano_rng = Random().theano_rng

    z2 = theano_rng.normal(size=(num_samples, decoder_mlp2.input_dim),
                           dtype=theano.config.floatX)

    h2_params = decoder_mlp2.apply(z2)
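    # Split h2_params into mean and log-variance halves and sample h2 via
    # the reparameterization trick.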
    length = int(h2_params.eval().shape[1]/2)
    h2_mu = h2_params[:, :length]
    h2_lognu = h2_params[:, length:]
    h2 = h2_mu + theano.tensor.exp(0.5 * h2_lognu) * theano_rng.normal(size=h2_mu.shape,
                                                                       dtype=h2_mu.dtype)
    
    z1 = theano_rng.normal(size=(num_samples, decoder_mlp1.input_dim),
                           dtype=theano.config.floatX)

    h1_tilde_params = decoder_mlp1.apply(z1)
    length = int(h1_tilde_params.eval().shape[1]/2)
    h1_tilde_mu = h1_tilde_params[:, :length]
    h1_tilde_lognu = h1_tilde_params[:, length:]
    h1_tilde = h1_tilde_mu + theano.tensor.exp(0.5 * h1_tilde_lognu) * theano_rng.normal(size=h1_tilde_mu.shape,
                                                                                         dtype=h1_tilde_mu.dtype)


    import pdb; pdb.set_trace()
    h1 = upsample_mlp1.apply(h2) + h1_tilde
  
    p = upsample_mlp2.apply(h1).reshape((num_samples, 28, 28))

    return ComputationGraph([p])
Example #36
    def def_reading_parameters(self):
        parameters = Selector(self._def_reader).get_parameters().values()
        parameters.extend(Selector(self._combiner).get_parameters().values())
        if self._reuse_word_embeddings:
            lookup_parameters = Selector(
                self._lookup).get_parameters().values()
            parameters = [p for p in parameters if p not in lookup_parameters]
        return parameters
Example #37
def load_params(bricks, path):
    """Load brick parameters.

    Loads parameters from an .npz file where they are saved with their paths.

    Parameters
    ----------
    bricks : Brick or Selector
        The bricks.
    path : str or file
        Source for loading.

    """
    if isinstance(bricks, Brick):
        bricks = Selector([bricks])
    assert isinstance(bricks, Selector)

    param_values = {
        name.replace("-", "/"): value
        for name, value in numpy.load(path).items()
    }
    for name, value in param_values.items():
        selected = bricks.select(name)
        if len(selected) == 0:
            logger.error("Unknown parameter {}".format(name))
        assert len(selected) == 1
        selected = selected[0]

        assert selected.get_value(
            borrow=True, return_internal_type=True).shape == value.shape
        selected.set_value(value)

    params = bricks.get_params()
    for name in params.keys():
        if name not in param_values:
            logger.error(
                "No value is provided for the parameter {}".format(name))
Example #38
    def get_gradients(self, X, Y, weights=1.):
        W_mean, W_ls, b_mean, b_ls = self.parameters

        mean, log_sigma = self.sample_expected(Y)
        sigma = tensor.exp(log_sigma)

        cost = -log_sigma - 0.5 * (X - mean)**2 / tensor.exp(2 * log_sigma)
        if weights != 1.:
            cost = -weights.dimshuffle(0, 'x') * cost

        cost_scaled = sigma**2 * cost
        cost_gscale = (sigma**2).sum(axis=1).dimshuffle([0, 'x'])
        cost_gscale = cost_gscale * cost

        gradients = OrderedDict()

        params = Selector(self.mlp).get_parameters()
        for pname, param in params.iteritems():
            gradients[param] = tensor.grad(cost_gscale.sum(),
                                           param,
                                           consider_constant=[X, Y])

        gradients[W_mean] = tensor.grad(cost_scaled.sum(),
                                        W_mean,
                                        consider_constant=[X, Y])
        gradients[b_mean] = tensor.grad(cost_scaled.sum(),
                                        b_mean,
                                        consider_constant=[X, Y])

        gradients[W_ls] = tensor.grad(cost_scaled.sum(),
                                      W_ls,
                                      consider_constant=[X, Y])
        gradients[b_ls] = tensor.grad(cost_scaled.sum(),
                                      b_ls,
                                      consider_constant=[X, Y])

        return gradients
Example #39
def test_selector():
    class MockBrickTop(Brick):
        def __init__(self, children, **kwargs):
            super(MockBrickTop, self).__init__(**kwargs)
            self.children = children
            self.params = []

    class MockBrickBottom(Brick):
        def __init__(self, **kwargs):
            super(MockBrickBottom, self).__init__(**kwargs)
            self.params = [theano.shared(0, "V"), theano.shared(0, "W")]

    b1 = MockBrickBottom(name="b1")
    b2 = MockBrickBottom(name="b2")
    b3 = MockBrickBottom(name="b3")
    t1 = MockBrickTop([b1, b2], name="t1")
    t2 = MockBrickTop([b2, b3], name="t2")

    s1 = Selector([t1])
    s11 = s1.select("/t1/b1")
    assert s11.bricks[0] == b1
    assert len(s11.bricks) == 1
    s12 = s1.select("/t1")
    assert s12.bricks[0] == t1
    assert len(s12.bricks) == 1

    s2 = Selector([t1, t2])
    s21 = s2.select("/t2/b2")
    assert s21.bricks[0] == b2
    assert len(s21.bricks) == 1

    assert s2.select("/t2/b2.V")[0] == b2.params[0]

    params = list(s1.get_params().items())
    assert params[0][0] == "/t1/b1.V"
    assert params[0][1] == b1.params[0]
    assert params[1][0] == "/t1/b1.W"
    assert params[1][1] == b1.params[1]
    assert params[2][0] == "/t1/b2.V"
    assert params[2][1] == b2.params[0]
    assert params[3][0] == "/t1/b2.W"
    assert params[3][1] == b2.params[1]
Example #40
def save_params(bricks, path):
    """Save bricks parameters.

    Saves parameters with their paths into an .npz file.

    Parameters
    ----------
    bricks : Brick or Selector
        The bricks.
    path : str or file
        Destination for saving.

    """
    if isinstance(bricks, Brick):
        bricks = Selector([bricks])
    assert isinstance(bricks, Selector)

    params = bricks.get_params()
    # numpy.savez is vulnerable to slashes in names
    param_values = {
        name.replace("/", "-"): param.get_value()
        for name, param in params.items()
    }
    numpy.savez(path, **param_values)
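A small sketch pairing save_params with the load_params helper from Example #37, under the same assumption of the older Selector.get_params() API; the brick and file name are placeholders.

from blocks.bricks import MLP, Tanh
from blocks.initialization import IsotropicGaussian, Constant

mlp = MLP(activations=[Tanh()], dims=[784, 100],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
          name='mlp')
mlp.initialize()

save_params(mlp, 'mlp_params.npz')  # '/' in parameter paths is stored as '-'
load_params(mlp, 'mlp_params.npz')  # '-' is mapped back to '/' before selection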
Example #41
def test_selector():
    class MockBrickTop(Brick):

        def __init__(self, children, **kwargs):
            super(MockBrickTop, self).__init__(**kwargs)
            self.children = children
            self.params = []

    class MockBrickBottom(Brick):

        def __init__(self, **kwargs):
            super(MockBrickBottom, self).__init__(**kwargs)
            self.params = [theano.shared(0, "V"), theano.shared(0, "W")]

    b1 = MockBrickBottom(name="b1")
    b2 = MockBrickBottom(name="b2")
    b3 = MockBrickBottom(name="b3")
    t1 = MockBrickTop([b1, b2], name="t1")
    t2 = MockBrickTop([b2, b3], name="t2")

    s1 = Selector([t1])
    s11 = s1.select("/t1/b1")
    assert s11.bricks[0] == b1
    assert len(s11.bricks) == 1
    s12 = s1.select("/t1")
    assert s12.bricks[0] == t1
    assert len(s12.bricks) == 1

    s2 = Selector([t1, t2])
    s21 = s2.select("/t2/b2")
    assert s21.bricks[0] == b2
    assert len(s21.bricks) == 1

    assert s2.select("/t2/b2.V")[0] == b2.params[0]

    params = list(s1.get_params().items())
    assert params[0][0] == "/t1/b1.V"
    assert params[0][1] == b1.params[0]
    assert params[1][0] == "/t1/b1.W"
    assert params[1][1] == b1.params[1]
    assert params[2][0] == "/t1/b2.V"
    assert params[2][1] == b2.params[0]
    assert params[3][0] == "/t1/b2.W"
    assert params[3][1] == b2.params[1]
Example #42
def main(config,
         tr_stream,
         dev_stream,
         source_vocab,
         target_vocab,
         use_bokeh=False):

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    initial_context = tensor.matrix('initial_context')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])

    # let user specify the target transition class name in config,
    # eval it and pass to decoder
    target_transition_name = config.get(
        'target_transition', 'GRUInitialStateWithInitialStateSumContext')
    target_transition = eval(target_transition_name)

    logger.info('Using target transition: {}'.format(target_transition_name))
    decoder = InitialContextDecoder(config['trg_vocab_size'],
                                    config['dec_embed'], config['dec_nhids'],
                                    config['enc_nhids'] * 2,
                                    config['context_dim'], target_transition)

    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask, initial_context)

    cost.name = 'decoder_cost'

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    # TODO: validate performance with/without regularization
    if config.get('l2_regularization', False) is True:
        l2_reg_alpha = config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)

        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

        # why do we need to name the cost variable? Where did the original name come from?
        cost.name = 'decoder_cost_cost'

    cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions

    # TODO: add checking for existing model and loading
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Create the theano variables that we need for the sampling graph
    sampling_input = tensor.lmatrix('input')
    sampling_context = tensor.matrix('context_input')

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config.get('bleu_script',
                                                 None) is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))

        generated = decoder.generate(sampling_input, sampling_representation,
                                     sampling_context)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(
                model=search_model,
                data_stream=tr_stream,
                hook_samples=config['hook_samples'],
                every_n_batches=config['sampling_freq'],
                src_vocab=source_vocab,
                trg_vocab=target_vocab,
                src_vocab_size=config['src_vocab_size'],
            ))

    # Add early stopping based on bleu
    if config.get('bleu_script', None) is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input,
                          sampling_context,
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Add early stopping based on Meteor
    if config.get('meteor_directory', None) is not None:
        logger.info("Building meteor validator")
        extensions.append(
            MeteorValidator(sampling_input,
                            sampling_context,
                            samples=samples,
                            config=config,
                            model=search_model,
                            data_stream=dev_stream,
                            src_vocab=source_vocab,
                            trg_vocab=target_vocab,
                            normalize=config['normalized_bleu'],
                            every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[[
                     'decoder_cost', 'validation_set_bleu_score',
                     'validation_set_meteor_score'
                 ]],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # if there is dropout or random noise, we need to use the output of the modified graph
    if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0:
        algorithm = GradientDescent(cost=cg.outputs[0],
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))
    else:
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Example #43
    def discriminator_parameters(self):
        return list(Selector([self.discriminator]).get_parameters().values())
Example #44
    def generator_parameters(self):
        return list(
            Selector([self.encoder, self.decoder]).get_parameters().values())
Example #45
def train(config, save_path, bokeh_name,
          params, bokeh_server, test_tag, use_load_ext,
          load_log, fast_start, validation_epochs, validation_batches,
          per_epochs, per_batches):
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])

    # Build the main brick and initialize all parameters.
    recognizer = SpeechRecognizer(
        data.recordings_source, data.labels_source,
        data.eos_label,
        data.num_features, data.num_labels,
        name="recognizer",
        data_prepend_eos=data.prepend_eos,
        character_map=data.character_map,
        **config["net"])
    for brick_path, attribute_dict in sorted(
            config['initialization'].items(),
            key=lambda (k, v): -k.count('/')):
        for attribute, value in attribute_dict.items():
            brick, = Selector(recognizer).select(brick_path).bricks
            setattr(brick, attribute, value)
            brick.push_initialization_config()
    recognizer.initialize()

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    if params:
        logger.info("Load parameters from " + params)
        recognizer.load_params(params)

    if test_tag:
        tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__
        __stream = data.get_stream("train")
        __data = next(__stream.get_epoch_iterator(as_dict=True))
        recognizer.recordings.tag.test_value = __data[data.recordings_source]
        recognizer.recordings_mask.tag.test_value = __data[data.recordings_source + '_mask']
        recognizer.labels.tag.test_value = __data[data.labels_source]
        recognizer.labels_mask.tag.test_value = __data[data.labels_source + '_mask']
        theano.config.compute_test_value = 'warn'

    batch_cost = recognizer.get_cost_graph().sum()
    batch_size = named_copy(recognizer.recordings.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_log_likelihood"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it is currently impossible to propagate the effect of
    # regularization to their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(
        applications=[r.generator.readout.readout], name="output_0")(
                cost_cg)
    bottom_output, = VariableFilter(
        applications=[r.bottom.apply], name="output")(
                cost_cg)
    attended, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended")(
                cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended_mask")(
                cost_cg)
    weights, = VariableFilter(
        applications=[r.generator.evaluate], name="weights")(
                cost_cg)
    max_recording_length = named_copy(r.recordings.shape[0],
                                      "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = named_copy(attended_mask.shape[0],
                                          "max_attended_mask_length")
    max_attended_length = named_copy(attended.shape[0],
                                     "max_attended_length")
    max_num_phonemes = named_copy(r.labels.shape[0],
                                  "max_num_phonemes")
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_attended = named_copy(abs(attended).mean(),
                               "mean_attended")
    mean_bottom_output = named_copy(abs(bottom_output).mean(),
                                    "mean_bottom_output")
    weights_penalty = named_copy(monotonicity_penalty(weights, r.labels_mask),
                                 "weights_penalty")
    weights_entropy = named_copy(entropy(weights, r.labels_mask),
                                 "weights_entropy")
    mask_density = named_copy(r.labels_mask.mean(),
                              "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy,
        min_energy, max_energy,
        mean_attended, mean_bottom_output,
        batch_size, max_num_phonemes,
        mask_density])

    # Regularization. It is applied explicitly to all variables
    # of interest; it cannot be applied to the cost alone, as that
    # would have no effect on the auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters if p not in attention_params]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])
    regularized_cost = regularized_cg.outputs[0]
    regularized_weights_penalty = regularized_cg.outputs[1]
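    # apply_dropout and apply_noise return a new graph with the same outputs
    # in the same order, so outputs[0] and outputs[1] are still the cost and
    # the weights penalty, now with the transformations applied.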

    # Model is a weird class; we spent a lot of time arguing with Bart
    # about what it should be. However, it can already do nice things,
    # e.g. extract all the parameters from the computation graph and
    # give them hierarchical names. This helps to notice when, because
    # of some bug, a parameter is missing from the computation graph.
    model = SpeechModel(regularized_cost)
    params = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, params[key].get_value().shape) for key
                        in sorted(params.keys())],
                    width=120))

    # Define the training algorithm.
    train_conf = config['training']
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'], train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False):
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in params.items()
                                        if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in params.items()
                                        if not p in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                        maxnorm_subjects)]
    algorithm = GradientDescent(
        cost=regularized_cost +
            reg_config.get("penalty_coof", .0) * regularized_weights_penalty / batch_size +
            reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2,
        parameters=params.values(),
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)]))

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    observables = regularized_cg.outputs
    observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
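    # Per-parameter diagnostics: the RMS of the parameter values, of their
    # gradients and of the update steps, plus the step-to-gradient ratio.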
    for name, param in params.items():
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        observables.append(stats)

    def attach_aggregation_schemes(variables):
        # The aggregation schemes have to be attached in a separate
        # function because they must be applied at the very last stage,
        # separately for the training and the validation observables.
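        # aggregation.mean(numerator, denominator) reports the ratio of the
        # two sums accumulated over the monitored batches, which yields the
        # per-recording and per-label averages below.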
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(named_copy(aggregation.mean(var, batch_size),
                                            'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(named_copy(aggregation.mean(
                    var, recognizer.labels_mask.sum()), 'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
        ]
    extensions.append(TrainingDataMonitoring(
        [observables[0], algorithm.total_gradient_norm,
            algorithm.total_step_norm, clipping.threshold,
            max_recording_length,
            max_attended_length, max_attended_mask_length], after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes([cost, weights_entropy, weights_penalty]),
        data.get_stream("valid"), prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=validation_epochs,
            every_n_batches=validation_batches,
            after_training=False)
    extensions.append(validation)
    recognizer.init_beam_search(10)
    per = PhonemeErrorRate(recognizer, data.get_dataset("valid"))
    per_monitoring = DataStreamMonitoring(
        [per], data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=per_epochs,
            every_n_batches=per_batches,
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_likelihood = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_likelihood, track_the_best_per]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf['num_batches'],
                    after_n_epochs=train_conf['num_epochs'])
        .add_condition(["after_batch"], _gradient_norm_is_none),
        # Live plotting: requires launching `bokeh-server`
        # and lets you watch what happens during training online.
        Plot(bokeh_name
             if bokeh_name
             else os.path.basename(save_path),
             [# Plot 1: training and validation costs
             [average_monitoring.record_name(regularized_cost),
             validation.record_name(cost)],
             # Plot 2: gradient norm,
             [average_monitoring.record_name(algorithm.total_gradient_norm),
             average_monitoring.record_name(clipping.threshold)],
             # Plot 3: phoneme error rate
             [per_monitoring.record_name(per)],
             # Plot 4: training and validation mean weight entropy
             [average_monitoring._record_name('weights_entropy_per_label'),
             validation._record_name('weights_entropy_per_label')],
             # Plot 5: training and validation monotonicity penalty
             [average_monitoring._record_name('weights_penalty_per_recording'),
             validation._record_name('weights_penalty_per_recording')]],
             every_n_batches=10,
             server_url=bokeh_server),
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_likelihood.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar(),
        Printing(every_n_batches=1,
                    attribute_filter=PrintingFilterList()
                    )]

    # Save the config into the status
    log = TrainingLog()
    log.status['_config'] = repr(config)
    main_loop = MainLoop(
        model=model, log=log, algorithm=algorithm,
        data_stream=data.get_stream("train"),
        extensions=extensions)
    main_loop.run()
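A minimal sketch of the "track the best, checkpoint on improvement" pattern used above, assuming only the Blocks extensions already used in this example; the record name 'valid_cost' and the file names are placeholders. TrackTheBest writes a notification record to the training log, and Checkpoint.add_condition saves an extra copy of the model whenever that record appears.

from blocks.extensions.predicates import OnLogRecord
from blocks.extensions.saveload import Checkpoint
from blocks.extensions.training import TrackTheBest

# Track the best validation cost seen so far.
track_cost = TrackTheBest('valid_cost').set_conditions(after_epoch=True)
# Save 'model_best.zip' whenever the tracked record improves.
checkpoint = Checkpoint('model.zip', after_epoch=True).add_condition(
    ['after_epoch'],
    OnLogRecord(track_cost.notification_name),
    ('model_best.zip',))
extensions = [track_cost, checkpoint]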
Beispiel #46
0
def create_training_computation_graphs(
    z_dim,
    image_size,
    net_depth,
    discriminative_regularization,
    classifier,
    vintage,
    reconstruction_factor,
    kl_factor,
    discriminative_factor,
    disc_weights,
):
    x = tensor.tensor4("features")
    pi = numpy.cast[theano.config.floatX](numpy.pi)

    bricks = create_model_bricks(z_dim=z_dim, image_size=image_size, depth=net_depth)
    encoder_convnet, encoder_mlp, decoder_convnet, decoder_mlp = bricks
    if discriminative_regularization:
        if vintage:
            classifier_model = Model(load(classifier).algorithm.cost)
        else:
            with open(classifier, "rb") as src:
                classifier_model = Model(load(src).algorithm.cost)
        selector = Selector(classifier_model.top_bricks)
        classifier_convnet, = selector.select("/convnet").bricks
        classifier_mlp, = selector.select("/mlp").bricks

    random_brick = Random()

    # Initialize conditional variances
    log_sigma_theta = shared_floatx(numpy.zeros((3, image_size, image_size)), name="log_sigma_theta")
    add_role(log_sigma_theta, PARAMETER)
    variance_parameters = [log_sigma_theta]
    num_disc_layers = 0
    if discriminative_regularization:
        # We add discriminative regularization for the batch-normalized output
        # of the strided layers of the classifier.
        for layer in classifier_convnet.layers[1::3]:
            log_sigma = shared_floatx(numpy.zeros(layer.get_dim("output")), name="{}_log_sigma".format(layer.name))
            add_role(log_sigma, PARAMETER)
            variance_parameters.append(log_sigma)
        # include mlp
        # DISABLED
        # log_sigma = shared_floatx(
        #     numpy.zeros([classifier_mlp.output_dim]),
        #     name='{}_log_sigma'.format("MLP"))
        # add_role(log_sigma, PARAMETER)
        # variance_parameters.append(log_sigma)
        # diagnostic
        num_disc_layers = len(variance_parameters) - 1
        print("Applying discriminative regularization on {} layers".format(num_disc_layers))

    # Computation graph creation is encapsulated within this function in order
    # to allow selecting which parts of the graph will use batch statistics for
    # batch normalization and which parts will use population statistics.
    # Specifically, we'd like to use population statistics for the classifier
    # even in the training graph.
    def create_computation_graph():
        # Encode
        phi = encoder_mlp.apply(encoder_convnet.apply(x).flatten(ndim=2))
        nlat = encoder_mlp.output_dim // 2
        mu_phi = phi[:, :nlat]
        log_sigma_phi = phi[:, nlat:]
        # Sample from the approximate posterior
        epsilon = random_brick.theano_rng.normal(size=mu_phi.shape, dtype=mu_phi.dtype)
        z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
        # Decode
        mu_theta = decoder_convnet.apply(decoder_mlp.apply(z).reshape((-1,) + decoder_convnet.get_dim("input_")))
        log_sigma = log_sigma_theta.dimshuffle("x", 0, 1, 2)

        # Compute KL and reconstruction terms
        kl_term = 0.5 * (tensor.exp(2 * log_sigma_phi) + mu_phi ** 2 - 2 * log_sigma_phi - 1).sum(axis=1)

        reconstruction_term = -0.5 * (
            tensor.log(2 * pi) + 2 * log_sigma + (x - mu_theta) ** 2 / tensor.exp(2 * log_sigma)
        ).sum(axis=[1, 2, 3])
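        # With a standard-normal prior and a diagonal-Gaussian posterior
        # N(mu_phi, exp(log_sigma_phi)^2), kl_term above is the closed-form KL
        # divergence; reconstruction_term is the log-density of x under
        # N(mu_theta, exp(log_sigma)^2), summed over channels and pixels.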

        discriminative_layer_terms = [None] * num_disc_layers
        for i in range(num_disc_layers):
            discriminative_layer_terms[i] = tensor.zeros_like(kl_term)
        discriminative_term = tensor.zeros_like(kl_term)
        if discriminative_regularization:
            # Propagate both the input and the reconstruction through the classifier
            acts_cg = ComputationGraph([classifier_mlp.apply(classifier_convnet.apply(x).flatten(ndim=2))])
            acts_hat_cg = ComputationGraph([classifier_mlp.apply(classifier_convnet.apply(mu_theta).flatten(ndim=2))])

            # Retrieve activations of interest and compute discriminative
            # regularization reconstruction terms
            cur_layer = 0
            # CLASSIFIER MLP DISABLED
            # for i, zip_pair in enumerate(zip(classifier_convnet.layers[1::3] + [classifier_mlp],
            for i, zip_pair in enumerate(zip(classifier_convnet.layers[1::3], variance_parameters[1:])):

                layer, log_sigma = zip_pair
                variable_filter = VariableFilter(roles=[OUTPUT], bricks=[layer])

                d, = variable_filter(acts_cg)
                d_hat, = variable_filter(acts_hat_cg)

                # TODO: this conditional could be less brittle
                if "mlp" in layer.name.lower():
                    log_sigma = log_sigma.dimshuffle("x", 0)
                    sumaxis = [1]
                else:
                    log_sigma = log_sigma.dimshuffle("x", 0, 1, 2)
                    sumaxis = [1, 2, 3]

                discriminative_layer_term_unweighted = -0.5 * (
                    tensor.log(2 * pi) + 2 * log_sigma + (d - d_hat) ** 2 / tensor.exp(2 * log_sigma)
                ).sum(axis=sumaxis)

                discriminative_layer_terms[i] = (
                    discriminative_factor * disc_weights[cur_layer] * discriminative_layer_term_unweighted
                )
                discriminative_term = discriminative_term + discriminative_layer_terms[i]

                cur_layer = cur_layer + 1

        # scale terms (disc is prescaled by layer)
        reconstruction_term = reconstruction_factor * reconstruction_term
        kl_term = kl_factor * kl_term

        # total_reconstruction_term is reconstruction + discriminative
        total_reconstruction_term = reconstruction_term + discriminative_term

        # cost is mean(kl - total reconstruction)
        cost = (kl_term - total_reconstruction_term).mean()

        return ComputationGraph([cost, kl_term, reconstruction_term, discriminative_term] + discriminative_layer_terms)

    cg = create_computation_graph()
    with batch_normalization(encoder_convnet, encoder_mlp, decoder_convnet, decoder_mlp):
        bn_cg = create_computation_graph()

    return cg, bn_cg, variance_parameters
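A minimal sketch of the two-graph batch-normalization pattern used above, assuming Blocks' helpers are importable from blocks.graph as in this example; `brick` stands for a hypothetical brick whose children include BatchNormalization layers. The graph built outside the context manager relies on the stored population statistics, the graph built inside uses minibatch statistics, and the returned update pairs let the training algorithm refresh the population statistics.

from blocks.graph import (ComputationGraph, batch_normalization,
                          get_batch_normalization_updates)

def make_graphs(brick, x):
    # Inference graph: BatchNormalization children use population statistics.
    inference_cg = ComputationGraph([brick.apply(x)])
    # Training graph: the context manager switches them to minibatch statistics.
    with batch_normalization(brick):
        training_cg = ComputationGraph([brick.apply(x)])
    # (population_variable, minibatch_statistic) pairs to turn into updates.
    pop_updates = get_batch_normalization_updates(training_cg)
    return inference_cg, training_cg, pop_updates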
Beispiel #47
0
def create_training_computation_graphs(discriminative_regularization):
    x = tensor.tensor4('features')
    pi = numpy.cast[theano.config.floatX](numpy.pi)

    bricks = create_model_bricks()
    encoder_convnet, encoder_mlp, decoder_convnet, decoder_mlp = bricks
    if discriminative_regularization:
        classifier_model = Model(load('celeba_classifier.zip').algorithm.cost)
        selector = Selector(classifier_model.top_bricks)
        classifier_convnet, = selector.select('/convnet').bricks
    random_brick = Random()

    # Initialize conditional variances
    log_sigma_theta = shared_floatx(
        numpy.zeros((3, 64, 64)), name='log_sigma_theta')
    add_role(log_sigma_theta, PARAMETER)
    variance_parameters = [log_sigma_theta]
    if discriminative_regularization:
        # We add discriminative regularization for the batch-normalized output
        # of the strided layers of the classifier.
        for layer in classifier_convnet.layers[4::6]:
            log_sigma = shared_floatx(
                numpy.zeros(layer.get_dim('output')),
                name='{}_log_sigma'.format(layer.name))
            add_role(log_sigma, PARAMETER)
            variance_parameters.append(log_sigma)

    # Computation graph creation is encapsulated within this function in order
    # to allow selecting which parts of the graph will use batch statistics for
    # batch normalization and which parts will use population statistics.
    # Specifically, we'd like to use population statistics for the classifier
    # even in the training graph.
    def create_computation_graph():
        # Encode
        phi = encoder_mlp.apply(encoder_convnet.apply(x).flatten(ndim=2))
        nlat = encoder_mlp.output_dim // 2
        mu_phi = phi[:, :nlat]
        log_sigma_phi = phi[:, nlat:]
        # Sample from the approximate posterior
        epsilon = random_brick.theano_rng.normal(
            size=mu_phi.shape, dtype=mu_phi.dtype)
        z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
        # Decode
        mu_theta = decoder_convnet.apply(
            decoder_mlp.apply(z).reshape(
                (-1,) + decoder_convnet.get_dim('input_')))
        log_sigma = log_sigma_theta.dimshuffle('x', 0, 1, 2)

        # Compute KL and reconstruction terms
        kl_term = 0.5 * (
            tensor.exp(2 * log_sigma_phi) + mu_phi ** 2 - 2 * log_sigma_phi - 1
        ).sum(axis=1)
        reconstruction_term = -0.5 * (
            tensor.log(2 * pi) + 2 * log_sigma +
            (x - mu_theta) ** 2 / tensor.exp(2 * log_sigma)
        ).sum(axis=[1, 2, 3])
        total_reconstruction_term = reconstruction_term

        if discriminative_regularization:
            # Propagate both the input and the reconstruction through the
            # classifier
            acts_cg = ComputationGraph([classifier_convnet.apply(x)])
            acts_hat_cg = ComputationGraph(
                [classifier_convnet.apply(mu_theta)])

            # Retrieve activations of interest and compute discriminative
            # regularization reconstruction terms
            for layer, log_sigma in zip(classifier_convnet.layers[4::6],
                                        variance_parameters[1:]):
                variable_filter = VariableFilter(roles=[OUTPUT],
                                                 bricks=[layer])
                d, = variable_filter(acts_cg)
                d_hat, = variable_filter(acts_hat_cg)
                log_sigma = log_sigma.dimshuffle('x', 0, 1, 2)

                total_reconstruction_term += -0.5 * (
                    tensor.log(2 * pi) + 2 * log_sigma +
                    (d - d_hat) ** 2 / tensor.exp(2 * log_sigma)
                ).sum(axis=[1, 2, 3])

        cost = (kl_term - total_reconstruction_term).mean()

        return ComputationGraph([cost, kl_term, reconstruction_term])

    cg = create_computation_graph()
    with batch_normalization(encoder_convnet, encoder_mlp,
                             decoder_convnet, decoder_mlp):
        bn_cg = create_computation_graph()

    return cg, bn_cg, variance_parameters
Beispiel #48
0
    def get_zdim(self):
        selector = Selector(self.model.top_bricks)
        decoder_mlp, = selector.select("/decoder_mlp").bricks
        return decoder_mlp.input_dim
Beispiel #49
0
def run(batch_size, save_path, z_dim, oldmodel, discriminative_regularization,
        classifier, vintage, monitor_every, monitor_before, checkpoint_every, dataset, color_convert,
        image_size, net_depth, subdir,
        reconstruction_factor, kl_factor, discriminative_factor, disc_weights,
        num_epochs):

    if dataset:
        streams = create_custom_streams(filename=dataset,
                                        training_batch_size=batch_size,
                                        monitoring_batch_size=batch_size,
                                        include_targets=False,
                                        color_convert=color_convert)
    else:
        streams = create_celeba_streams(training_batch_size=batch_size,
                                        monitoring_batch_size=batch_size,
                                        include_targets=False)

    main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3]

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    rval = create_training_computation_graphs(
                z_dim, image_size, net_depth, discriminative_regularization, classifier,
                vintage, reconstruction_factor, kl_factor, discriminative_factor, disc_weights)
    cg, bn_cg, variance_parameters = rval

    pop_updates = list(
        set(get_batch_normalization_updates(bn_cg, allow_duplicates=True)))
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]
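    # Each population statistic p is moved towards its minibatch estimate m
    # by an exponential moving average:
    # p <- decay_rate * m + (1 - decay_rate) * p.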

    model = Model(bn_cg.outputs[0])

    selector = Selector(
        find_bricks(
            model.top_bricks,
            lambda brick: brick.name in ('encoder_convnet', 'encoder_mlp',
                                         'decoder_convnet', 'decoder_mlp')))
    parameters = list(selector.get_parameters().values()) + variance_parameters

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_cg.outputs[0],
                                parameters=parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    sys.setrecursionlimit(1000000)

    monitored_quantities_list = []
    for graph in [bn_cg, cg]:
        # cost, kl_term, reconstruction_term, discriminative_term = graph.outputs
        cost, kl_term, reconstruction_term, discriminative_term = graph.outputs[:4]
        discriminative_layer_terms = graph.outputs[4:]

        cost.name = 'nll_upper_bound'
        avg_kl_term = kl_term.mean(axis=0)
        avg_kl_term.name = 'avg_kl_term'
        avg_reconstruction_term = -reconstruction_term.mean(axis=0)
        avg_reconstruction_term.name = 'avg_reconstruction_term'
        avg_discriminative_term = discriminative_term.mean(axis=0)
        avg_discriminative_term.name = 'avg_discriminative_term'

        num_layer_terms = len(discriminative_layer_terms)
        avg_discriminative_layer_terms = [None] * num_layer_terms
        for i, term in enumerate(discriminative_layer_terms):
            avg_discriminative_layer_terms[i] = discriminative_layer_terms[i].mean(axis=0)
            avg_discriminative_layer_terms[i].name = "avg_discriminative_term_layer_{:02d}".format(i)

        monitored_quantities_list.append(
            [cost, avg_kl_term, avg_reconstruction_term,
             avg_discriminative_term] + avg_discriminative_layer_terms)

    train_monitoring = DataStreamMonitoring(
        monitored_quantities_list[0], train_monitor_stream, prefix="train",
        updates=extra_updates, after_epoch=False, before_first_epoch=monitor_before,
        every_n_epochs=monitor_every)
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities_list[1], valid_monitor_stream, prefix="valid",
        after_epoch=False, before_first_epoch=monitor_before,
        every_n_epochs=monitor_every)

    # Prepare checkpoint
    checkpoint = Checkpoint(save_path, every_n_epochs=checkpoint_every,
                            before_training=True, use_cpickle=True)

    sample_checkpoint = SampleCheckpoint(interface=DiscGenModel, z_dim=z_dim/2,
                            image_size=(image_size, image_size), channels=3,
                            dataset=dataset, split="valid", save_subdir=subdir,
                            before_training=True, after_epoch=True)
    # TODO: why does z_dim=foo become foo/2?
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  checkpoint,
                  sample_checkpoint,
                  train_monitoring, valid_monitoring, 
                  Printing(),
                  ProgressBar()]
    main_loop = MainLoop(model=model, data_stream=main_loop_stream,
                         algorithm=algorithm, extensions=extensions)

    if oldmodel is not None:
        print("Initializing parameters with old model {}".format(oldmodel))
        try:
            saved_model = load(oldmodel)
        except AttributeError:
            # newer version of blocks
            with open(oldmodel, 'rb') as src:
                saved_model = load(src)
        main_loop.model.set_parameter_values(
            saved_model.model.get_parameter_values())
        del saved_model

    main_loop.run()
Beispiel #50
0
def create_model(config, data,
                 load_path=None,
                 test_tag=False):
    """
    Build the main brick and initialize or load all parameters.

    Parameters
    ----------

    config : dict
        the configuration dict

    data : object of class Data
        the dataset creation object

    load_path : str or None
        if given a string, it will be used to load model parameters. Else,
        the parameters will be randomly initialized by calling
        recognizer.initialize()

    test_tag : bool
        if True, the input variables will be tagged with test values

    """
    # First tell the recognizer about required data sources
    net_config = dict(config["net"])
    bottom_class = net_config['bottom']['bottom_class']
    input_dims = {
        source: data.num_features(source)
        for source in bottom_class.vector_input_sources}
    input_num_chars = {
        source: len(data.character_map(source))
        for source in bottom_class.discrete_input_sources}

    recognizer = SpeechRecognizer(
        input_dims=input_dims,
        input_num_chars=input_num_chars,
        eos_label=data.eos_label,
        num_phonemes=data.num_labels,
        name="recognizer",
        data_prepend_eos=data.prepend_eos,
        character_map=data.character_map('labels'),
        **net_config)
    if load_path:
        recognizer.load_params(load_path)
    else:
        for brick_path, attribute_dict in sorted(
                config['initialization'].items(),
                key=lambda (k, v): k.count('/')):
            for attribute, value in attribute_dict.items():
                brick, = Selector(recognizer).select(brick_path).bricks
                setattr(brick, attribute, value)
                brick.push_initialization_config()
        recognizer.initialize()

    if test_tag:
        # fails with newest theano
        # tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__
        __stream = data.get_stream("train")
        __data = next(__stream.get_epoch_iterator(as_dict=True))
        for __var in recognizer.inputs.values():
            __var.tag.test_value = __data[__var.name]
        theano.config.compute_test_value = 'warn'
    return recognizer
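A minimal usage sketch; the config loading is a placeholder, and the Data construction is assumed to work as in the train() example earlier (Beispiel #45):

config = load_config("config.yaml")   # hypothetical config loader
data = Data(**config['data'])
recognizer = create_model(config, data, load_path=None, test_tag=False)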
Beispiel #51
0
def main():

    # Set up the configuration and the training stream
    config = getattr(configurations, "get_config_cs2en")()
    logger.info("Model options:\n{}".format(pprint.pformat(config)))
    tr_stream = get_tr_stream(**config)

    # Create Theano variables
    logger.info("Creating theano variables")

    source_sentence0 = tensor.lmatrix("source0")
    source_sentence_mask0 = tensor.matrix("source0_mask")
    target_sentence0 = tensor.lmatrix("target0")
    target_sentence_mask0 = tensor.matrix("target0_mask")

    source_sentence1 = tensor.lmatrix("source1")
    source_sentence_mask1 = tensor.matrix("source1_mask")
    target_sentence1 = tensor.lmatrix("target1")
    target_sentence_mask1 = tensor.matrix("target1_mask")

    source_sentence2 = tensor.lmatrix("source2")
    source_sentence_mask2 = tensor.matrix("source2_mask")
    target_sentence2 = tensor.lmatrix("target2")
    target_sentence_mask2 = tensor.matrix("target2_mask")

    sampling_input0 = tensor.lmatrix("input0")
    sampling_input1 = tensor.lmatrix("input1")
    sampling_input2 = tensor.lmatrix("input2")

    sampling_hstates0 = tensor.fmatrix("hstates0")
    sampling_hstates1 = tensor.fmatrix("hstates1")
    sampling_hstates2 = tensor.fmatrix("hstates2")

    sampling_lastrep0 = tensor.tensor3("lastrep0")
    sampling_lastrep1 = tensor.tensor3("lastrep1")

    hstates = theano.shared(value=numpy.zeros((config["enc_nhids"]), dtype=theano.config.floatX), name="hstates")

    # Get vocab
    sources = get_attr_rec(tr_stream, "data_stream")
    src_vocab = sources.data_streams[0].dataset.dictionary
    trg_vocab = sources.data_streams[1].dataset.dictionary

    # Construct model
    logger.info("Building PoemModel")

    block0 = PoemBlock(config=config, blockid="block0", name="poemblock0")
    block1 = PoemBlock(config=config, blockid="block1", name="poemblock1")
    block2 = PoemBlock(config=config, blockid="block2", name="poemblock2")

    cost0, hsta0, rep0 = block0.cost(
        source_sentence0,
        source_sentence_mask0,
        source_sentence_mask1,
        source_sentence_mask0,
        target_sentence0,
        target_sentence_mask0,
        hstates,
        lastrep0=None,
        lastrep1=None,
    )

    cost1, hsta1, rep1 = block1.cost(
        source_sentence1,
        source_sentence_mask0,
        source_sentence_mask1,
        source_sentence_mask1,
        target_sentence1,
        target_sentence_mask1,
        hsta0,
        lastrep0=rep0,
        lastrep1=None,
    )

    cost2, hsta2, rep2 = block2.cost(
        source_sentence2,
        source_sentence_mask0,
        source_sentence_mask1,
        source_sentence_mask2,
        target_sentence2,
        target_sentence_mask2,
        hsta1,
        lastrep0=rep0,
        lastrep1=rep1,
    )

    cost = cost0 + cost1 + cost2
    cost.name = "total_cost"

    logger.info("Creating computational graph")

    cg = ComputationGraph(cost)

    # Initialize model
    logger.info("Initializing model")
    block0.set_initw(IsotropicGaussian(config["weight_scale"]))
    block0.set_initb(Constant(0))
    block0.push_initialization_config()
    block0.set_specialinit(Orthogonal(), Orthogonal())
    block0.initialize()

    block1.set_initw(IsotropicGaussian(config["weight_scale"]))
    block1.set_initb(Constant(0))
    block1.push_initialization_config()
    block1.set_specialinit(Orthogonal(), Orthogonal())
    block1.initialize()

    block2.set_initw(IsotropicGaussian(config["weight_scale"]))
    block2.set_initb(Constant(0))
    block2.push_initialization_config()
    block2.set_specialinit(Orthogonal(), Orthogonal())
    block2.initialize()

    # apply dropout for regularization
    if config["dropout"] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info("Applying dropout")
        dropout_inputs = [x for x in cg.intermediary_variables if x.name == "maxout_apply_output"]
        cg = apply_dropout(cg, dropout_inputs, config["dropout"])

    # Print shapes

    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info("    {:15}: {}".format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names

    param_dict = Selector(block0).get_parameters()
    logger.info("Parameter names: ")
    for name, value in param_dict.items():
        logger.info("    {:15}: {}".format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(len(param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # logger.info(cg.auxiliary_variables)
    # logger.info("______________________________")

    """
    weights = ""
    for va in cg.auxiliary_variables:
        if va.name == "sequence_generator_block0_cost_matrix_weighted_averages":
            weights = va

    weightsize = weights.shape
    weightsize.name = "weightsize"

    states = ""
    for va in cg.auxiliary_variables:
        if va.name == "sequence_generator_block0_cost_matrix_states":
            states = va

    statesize = states.shape
    statesize.name = "statesize"

    rep = ""
    for va in cg.auxiliary_variables:
        if va.name == "poemblock0_cost_block0hstatesRepeat":
            rep = va

    repsize = rep.shape
    repsize.name = "repsize"

    """

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config["finish_after"]),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config["saveto"], every_n_batches=config["save_freq"]),
    ]

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost,
        parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config["step_clipping"]), eval(config["step_rule"])()]),
    )
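    # eval(config["step_rule"])() instantiates the step-rule class named in
    # the config; StepClipping rescales the gradients first whenever their
    # total norm exceeds the threshold.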

    # Reload model if necessary
    if config["reload"]:
        extensions.append(LoadNMT(config["saveto"]))

    # Add sampling

    if config["hook_samples"] >= 1:
        logger.info("Building sampler")

        generated0 = block0.mygenerate(sampling_input0, sampling_hstates0)
        search_model0 = Model(generated0)

        generated1 = block1.mygenerate(sampling_input1, sampling_hstates1, sampling_lastrep0)
        search_model1 = Model(generated1)

        generated2 = block2.mygenerate(sampling_input2, sampling_hstates2, sampling_lastrep0, sampling_lastrep1)
        search_model2 = Model(generated2)

        extensions.append(
            Sampler(
                config=config,
                model0=search_model0,
                model1=search_model1,
                model2=search_model2,
                data_stream=tr_stream,
                hook_samples=config["hook_samples"],
                every_n_batches=config["sampling_freq"],
                src_vocab_size=config["src_vocab_size"],
            )
        )

        logger.info("End of building sampler")

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions)

    # Train!
    main_loop.run()
Beispiel #52
0
    def get_gradients(self, features, n_samples):
        """Perform inference and calculate gradients.

        Returns
        -------
            log_px : T.fvector
            log_psx : T.fvector
            gradients : OrderedDict
        """
        p_layers = self.p_layers
        q_layers = self.q_layers
        n_layers = len(p_layers)

        batch_size = features.shape[0]

        x = replicate_batch(features, n_samples)

        # Get Q-samples
        samples, log_p, log_q = self.sample_q(x)

        # Reshape and sum
        samples = unflatten_values(samples, batch_size, n_samples)
        log_p = unflatten_values(log_p, batch_size, n_samples)
        log_q = unflatten_values(log_q, batch_size, n_samples)

        log_p_all = sum(log_p)
        log_q_all = sum(log_q)

        # Approximate log p(x)
        log_px_bound = log_p_all[:,0] - log_q_all[:,0]
        log_px  = logsumexp(log_p_all-log_q_all, axis=-1) - tensor.log(n_samples)
        log_psx = (logsumexp((log_p_all-log_q_all)/2, axis=-1) - tensor.log(n_samples)) * 2.

        # Calculate IS weights
        w = self.importance_weights(log_p, log_q)

        wp = w.reshape( (batch_size*n_samples, ) )
        wq = w.reshape( (batch_size*n_samples, ) )
        wq = wq - (1./n_samples)

        samples = flatten_values(samples, batch_size*n_samples)

        gradients = OrderedDict()
        for l in xrange(n_layers-1):
            gradients = merge_gradients(gradients, p_layers[l].get_gradients(samples[l], samples[l+1], weights=wp))
            gradients = merge_gradients(gradients, q_layers[l].get_gradients(samples[l+1], samples[l], weights=wq))
        gradients = merge_gradients(gradients, p_layers[-1].get_gradients(samples[-1], weights=wp))

        if (self.l1reg > 0.) or (self.l2reg > 0.):
            reg_gradients = OrderedDict()
            params = Selector(self).get_parameters()
            for pname, param in params.iteritems():
                if has_roles(param, (WEIGHT,)):
                    reg_cost = self.l1reg * tensor.sum(abs(param)) + self.l2reg * tensor.sum(param**2)
                    reg_gradients[param] = tensor.grad(reg_cost, param)
            gradients = merge_gradients(gradients, reg_gradients)

        self.log_p_bound = log_px_bound
        self.log_p = log_px
        self.log_ph = log_psx

        return log_px, log_psx, gradients
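A small numpy sketch, with made-up numbers, of the importance-sampling estimate computed above: log p(x) is approximated by a log-sum-exp of the K per-sample log importance weights, minus log K.

import numpy as np
from scipy.special import logsumexp

log_p = np.array([-10.0, -9.5, -11.0])   # log p(x, h_k) for K = 3 samples
log_q = np.array([-3.0, -2.5, -3.5])     # log q(h_k | x) for the same samples
log_px = logsumexp(log_p - log_q) - np.log(len(log_p))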
Beispiel #53
0
    print 'Parsing dataset file...'
    vocab = Vocab(dataset_path=args.dataset_path)

    source_sentence = tensor.lmatrix('source')
    
    encoder = BidirectionalEncoder(vocab.sequenceLength(), args.embed, args.nhidden)

    encoder.weights_init = IsotropicGaussian(args.weight_scale)
    encoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    encoder.initialize()

    print 'Parameter names: '
    enc_param_dict = Selector(encoder).get_params()
    for name, value in enc_param_dict.iteritems():
        print '    {:15}: {}'.format(value.get_value().shape, name)

    representation = encoder.apply(source_sentence)

    print 'Compiling theano function'
    f = theano.function([source_sentence], representation)

    reps = np.empty(len(vocab.dataset), dtype=object)

    bar = Bar('Encoding', max=len(vocab.dataset))
    for idx, sentence in enumerate(vocab.dataset):
        reps[idx] = f(sentence).transpose((1, 2, 0))
        bar.next()
    bar.finish()