Example 1
    def __init__(
        self,
        nvis,
        nhid,
        epsilon,
        batch_size,
        noise_scaling=1.0,
        lateral_x=False,
        lateral_h=False,
        debug=0,
        n_inference_steps=3,
        initial_noise=0.1,
        **kwargs
    ):
        super(FivEM, self).__init__(**kwargs)
        self.nvis = nvis
        self.nhid = nhid
        self.lateral_x = lateral_x
        self.lateral_h = lateral_h
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.states_init = Constant(0)
        self.rho = Rho()
        self.noise_scaling = noise_scaling
        self.children = [self.rho]
        self.debug = debug
        self.n_inference_steps = n_inference_steps
        self.initial_noise = initial_noise
Example 2
    def _initialize(self):
        # TODO: figure out what to do once Blocks #740 is resolved
        if self.recurrent_weights_init is None:
            self.recurrent_weights_init = self.weights_init
        if self.initial_states_init is None:
            self.initial_states_init = Constant(0.0)
        self.recurrent_weights_init.initialize(self.state_to_state, self.rng)
        state_to_update = self.weights_init.generate(
            self.rng, (self.dim, self.dim))
        state_to_reset = self.weights_init.generate(
            self.rng, (self.dim, self.dim))
        self.state_to_gates.set_value(
            numpy.hstack([state_to_update, state_to_reset]))
        self.initial_states_init.initialize(self.parameters.initial_state,
                                            self.rng)
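
The hstack above is the key layout decision: the update and reset weights are packed into a single state_to_gates matrix of shape (dim, 2 * dim), and the brick's apply method (shown in full in Example 13) recovers them by slicing the first and second halves of the columns. A minimal NumPy sketch of that layout, with illustrative sizes:

import numpy

dim, batch = 3, 5
rng = numpy.random.RandomState(0)
state_to_update = rng.normal(size=(dim, dim))
state_to_reset = rng.normal(size=(dim, dim))

# One (dim, 2 * dim) parameter: the first dim columns drive the update
# gate, the remaining dim columns drive the reset gate.
state_to_gates = numpy.hstack([state_to_update, state_to_reset])
assert state_to_gates.shape == (dim, 2 * dim)

states = rng.normal(size=(batch, dim))
gate_preact = states.dot(state_to_gates)   # (batch, 2 * dim)
update_preact = gate_preact[:, :dim]
reset_preact = gate_preact[:, dim:]
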
Example 3
from ali.algorithms import ali_algorithm
from ali.bricks import (ALI, GaussianConditional, DeterministicConditional,
                        XZJointDiscriminator)
from ali.streams import create_handbags_shoes_data_streams
from ali.utils import get_log_odds, conv_brick, conv_transpose_brick, bn_brick
from ali.backup_model import BackupModel, PlotLoss, PlotAccuracy
from ali.miriam_logger import Logger

BATCH_SIZE = 100
MONITORING_BATCH_SIZE = 500
NUM_EPOCHS = 150
IMAGE_SIZE = (64, 64)
NUM_CHANNELS = 3
NLAT = 256
GAUSSIAN_INIT = IsotropicGaussian(std=0.01)
ZERO_INIT = Constant(0)
LEARNING_RATE = 1e-4
BETA1 = 0.5
LEAK = 0.02
dropout = 0.4


def create_model_brick():
    layers = [
        conv_brick(2, 1, 64),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_brick(7, 2, 128),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_brick(5, 2, 256),
Example 4
    def setUp(self):
        self.lstm = LSTM(dim=3,
                         weights_init=Constant(2),
                         biases_init=Constant(0))
        self.lstm.initialize()
Example 5
class FivEM(Initializable, Random):
    """Implementation of the 5EM model.

    The model this brick represents is a simple bipartite, energy-based,
    undirected graphical model.

    Parameters
    ----------
    nvis : int
        Number of visible units.
    nhid : int
        Number of hidden units.
    epsilon : float
        Step size.
    batch_size : int
        Batch size, used for initializing the persistent states h_prev
        and h.

    """

    @lazy(allocation=["nvis", "nhid"])
    def __init__(
        self,
        nvis,
        nhid,
        epsilon,
        batch_size,
        noise_scaling=1.0,
        lateral_x=False,
        lateral_h=False,
        debug=0,
        n_inference_steps=3,
        initial_noise=0.1,
        **kwargs
    ):
        super(FivEM, self).__init__(**kwargs)
        self.nvis = nvis
        self.nhid = nhid
        self.lateral_x = lateral_x
        self.lateral_h = lateral_h
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.states_init = Constant(0)
        self.rho = Rho()
        self.noise_scaling = noise_scaling
        self.children = [self.rho]
        self.debug = debug
        self.n_inference_steps = n_inference_steps
        self.initial_noise = initial_noise

    def pp(self, var, name, level=1):
        if self.debug >= level:
            return theano.printing.Print(name)(var)
        else:
            return var

    def _allocate(self):
        Wxh = shared_floatx_nans((self.nvis, self.nhid), name="Wxh")
        self.parameters.append(Wxh)
        add_role(Wxh, WEIGHT)
        b = shared_floatx_nans((self.nhid,), name="b")
        self.parameters.append(b)
        add_role(b, BIAS)
        c = shared_floatx_nans((self.nvis,), name="c")
        self.parameters.append(c)
        add_role(c, BIAS)
        Whh = shared_floatx_nans((self.nhid, self.nhid), name="Whh")
        self.parameters.append(Whh)
        add_role(Whh, WEIGHT)
        Wxx = shared_floatx_nans((self.nvis, self.nvis), name="Wxx")
        self.parameters.append(Wxx)
        add_role(Wxx, WEIGHT)
        self.h = shared_floatx_nans((self.batch_size, self.nhid), name="h")
        x = tensor.matrix()
        h = tensor.matrix()
        self.generate_step_f = theano.function(inputs=[x, h], outputs=self.langevin_update(x, h, update_x=True))

    def _initialize(self):
        Wxh, b, c, Whh, Wxx = self.parameters
        self.weights_init.initialize(Wxh, self.rng)
        self.biases_init.initialize(b, self.rng)
        self.biases_init.initialize(c, self.rng)
        self.weights_init.initialize(Whh, self.rng)
        self.weights_init.initialize(Wxx, self.rng)
        self.states_init.initialize(self.h, self.rng)

    @property
    def Wxh(self):
        return self.parameters[0]

    @property
    def b(self):
        return self.parameters[1]

    @property
    def c(self):
        return self.parameters[2]

    @property
    def Whh(self):
        return self.parameters[3]

    @property
    def Wxx(self):
        return self.parameters[4]

    def energy(self, x, h):
        """Computes the energy function.

        Parameters
        ----------
        x : tensor variable
            Batch of visible states.
        h : tensor variable
            Batch of hidden states.

        """
        rx = self.rho.apply(x)
        rh = self.rho.apply(h)
        energy = (
            0.5 * ((x * x).sum(axis=1) + (h * h).sum(axis=1))
            - (tensor.dot(rx, tensor.tanh(self.Wxh)) * rh).sum(axis=1)
            - tensor.dot(rx, self.c)
            + tensor.dot(rh, self.b)
        )
        if self.lateral_x:
            energy = energy + (tensor.dot(rx, tensor.tanh(self.Wxx)) * rx).sum(axis=1)
        if self.lateral_h:
            energy = energy + (tensor.dot(rh, tensor.tanh(self.Whh)) * rh).sum(axis=1)
        return energy

    def langevin_update(self, x, h, update_x=False):
        """Computes state updates according to Langevin dynamics.

        Parameters
        ----------
        x : tensor variable
            Batch of visible states.
        h : tensor variable
            Batch of hidden states.
        update_x : bool, optional
            Whether to return updates for visible states as well. Defaults
            to `False`.

        """
        if update_x:
            return (
                self.corrupt(x) - self.epsilon * tensor.grad(self.energy(x, h).sum(), x),
                self.corrupt(h) - self.epsilon * tensor.grad(self.energy(x, h).sum(), h),
            )
        else:
            return self.corrupt(h) - self.epsilon * tensor.grad(self.energy(x, h).sum(), h)

    def map_update(self, x, h):
        """Computes h update going down the energy gradient, given x.

        Parameters
        ----------
        x : tensor variable
            Batch of visible states.
        h : tensor variable
            Batch of hidden states.
        """
        return h - self.epsilon * tensor.grad(self.energy(x, h).sum(), h)

    def corrupt(self, var):
        """Adds zero-mean gaussian noise to the input variable.

        Parameters
        ----------
        var : tensor variable
            Input.

        """
        return var + 2 * self.epsilon * self.noise_scaling * self.theano_rng.normal(size=var.shape, dtype=var.dtype)

    @application(inputs=["given_x"], outputs=["value"])
    def cost(self, given_x, application_call):
        """Computes the loss function.

        Parameters
        ----------
        given_x : tensor variable
            Batch of given visible states from dataset.

        Notes
        -----
        The `application_call` argument is an effect of the `application`
        decorator and isn't visible to users. It's used internally to
        set an updates dictionary for `h` that is discoverable by
        `ComputationGraph`.

        """
        x = given_x
        h_prev = self.h + self.initial_noise * self.theano_rng.normal(size=self.h.shape, dtype=self.h.dtype)
        h = h_next = h_prev
        old_energy = self.pp(self.energy(x, h).sum(), "old_energy", 1)
        for iteration in range(self.n_inference_steps):
            h_prev = h
            h = h_next
            h_next = self.pp(
                disconnected_grad(self.langevin_update(self.pp(x, "x", 3), self.pp(h_next, "h", 2))), "h_next", 2
            )
            new_energy = self.pp(self.energy(x, h_next).sum(), "new_energy", 1)
            delta_energy = self.pp(old_energy - new_energy, "delta_energy", 1)
            old_energy = new_energy
            h_prediction_residual = (
                h_next - self.pp(h_prev, "h_prev", 3) + self.epsilon * tensor.grad(self.energy(x, h_prev).sum(), h_prev)
            )
            J_h = self.pp((h_prediction_residual * h_prediction_residual).sum(axis=1).mean(axis=0), "J_h", 1)
            x_prediction_residual = self.pp(tensor.grad(self.energy(given_x, h_prev).sum(), given_x), "x_residual", 2)
            J_x = self.pp((x_prediction_residual * x_prediction_residual).sum(axis=1).mean(axis=0), "J_x", 1)
            if self.debug > 1:
                application_call.add_auxiliary_variable(J_x, name="J_x" + str(iteration))
                application_call.add_auxiliary_variable(J_h, name="J_h" + str(iteration))
            if iteration == 0:
                total_cost = J_h + J_x
            else:
                total_cost = total_cost + J_h + J_x

        per_iteration_cost = total_cost / self.n_inference_steps

        updates = OrderedDict([(self.h, h_next)])
        application_call.updates = dict_union(application_call.updates, updates)

        if self.debug > 0:
            application_call.add_auxiliary_variable(per_iteration_cost, name="per_iteration_cost")
        if self.debug > 1:
            application_call.add_auxiliary_variable(self.Wxh * 1.0, name="Wxh")
            application_call.add_auxiliary_variable(self.Whh * 1.0, name="Whh")
            application_call.add_auxiliary_variable(self.Wxx * 1.0, name="Wxx")
            application_call.add_auxiliary_variable(self.b * 1, name="b")
            application_call.add_auxiliary_variable(self.c * 1, name="c")

        return self.pp(total_cost, "total_cost")
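
langevin_update above is "corrupt, then step down the energy gradient". A self-contained NumPy sketch of one hidden-state update, using a toy quadratic energy E(h) = 0.5 * (h ** 2).sum() whose gradient is simply h; the sizes and the energy are illustrative stand-ins, not the FivEM model itself:

import numpy

rng = numpy.random.RandomState(0)
epsilon, noise_scaling = 0.1, 1.0
h = rng.normal(size=(4, 8))   # (batch_size, nhid)

def corrupt(var):
    # Zero-mean Gaussian noise, scaled as in FivEM.corrupt
    return var + 2 * epsilon * noise_scaling * rng.normal(size=var.shape)

grad_energy = h               # gradient of 0.5 * (h ** 2).sum() w.r.t. h
h_next = corrupt(h) - epsilon * grad_energy
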
Example 6
dim_embeddings = [
    ('week_of_year', 52, 10),
    ('day_of_week', 7, 10),
    ('qhour_of_day', 24 * 4, 10),
    ('day_type', 3, 10),
]

embed_weights_init = IsotropicGaussian(0.001)

class MLPConfig(object):
    __slots__ = ('dim_input', 'dim_hidden', 'dim_output', 'weights_init',
                 'biases_init', 'embed_weights_init', 'dim_embeddings')

prefix_encoder = MLPConfig()
prefix_encoder.dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
prefix_encoder.dim_hidden = [100, 100]
prefix_encoder.weights_init = IsotropicGaussian(0.01)
prefix_encoder.biases_init = Constant(0.001)
prefix_encoder.embed_weights_init = embed_weights_init
prefix_encoder.dim_embeddings = dim_embeddings

candidate_encoder = MLPConfig()
candidate_encoder.dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
candidate_encoder.dim_hidden = [100, 100]
candidate_encoder.weights_init = IsotropicGaussian(0.01)
candidate_encoder.biases_init = Constant(0.001)
candidate_encoder.embed_weights_init = embed_weights_init
candidate_encoder.dim_embeddings = dim_embeddings

representation_size = 100
representation_activation = Tanh

normalize_representation = True
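
As a sanity check on the encoder input size: dim_input counts 2-D coordinates for n_begin_end_pts points at each end of the trajectory (hence the factor 2 * 2) plus one learned embedding per categorical feature. A worked computation using only the entries visible above; n_begin_end_pts is defined elsewhere in the config, so 5 here is purely illustrative:

dim_embeddings = [
    ('week_of_year', 52, 10), ('day_of_week', 7, 10),
    ('qhour_of_day', 24 * 4, 10), ('day_type', 3, 10),
]
n_begin_end_pts = 5  # illustrative value, not the real config setting
dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
print(dim_input)  # 20 + 40 = 60
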
Example 7
    def train(self):

        x = self.sharedBatch['x']
        x.name = 'x_myinput'
        x_mask = self.sharedBatch['x_mask']
        x_mask.name = 'x_mask_myinput'
        y = self.sharedBatch['y']
        y.name = 'y_myinput'

        if self.usePro:
            proportion = self.sharedBatch['pro']
            proportion.name = 'pro'

        # The LSTM layer needs input of size 4 * lstm_dim; see the LSTM
        # layer documentation for the explanation.
        x_to_h = Linear(self.input_dimx1,
                        self.dim * 4,
                        name='x_to_h',
                        weights_init=IsotropicGaussian(),
                        biases_init=Constant(0.0))
        lstm = LSTM(self.dim,
                    name='lstm',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
        h_to_o = Linear(self.dim,
                        1,
                        name='h_to_o',
                        weights_init=IsotropicGaussian(),
                        biases_init=Constant(0.0))

        x_transform = x_to_h.apply(x)
        h, c = lstm.apply(x_transform, mask=x_mask)

        # only values of hidden units of the last timeframe are used for
        # the classification
        y_hat = h_to_o.apply(h[-1])
        y_hat = Logistic().apply(y_hat)

        if self.usePro:
            cost = BinaryCrossEntropyProp().apply(y, y_hat, proportion)
        else:
            cost = BinaryCrossEntropy().apply(y, y_hat)

        cost.name = 'cost'

        lstm.initialize()
        x_to_h.initialize()
        h_to_o.initialize()

        self.f = theano.function(inputs=[], outputs=y_hat)
        self.lastH = theano.function(inputs=[], outputs=h[-1])
        self.cg = ComputationGraph(cost)
        m = Model(cost)

        algorithm = GradientDescent(cost=cost,
                                    parameters=self.cg.parameters,
                                    step_rule=RMSProp(learning_rate=0.01),
                                    on_unused_sources='ignore')
        valid_monitor = DataStreamMonitoringShared(
            variables=[cost],
            data_stream=self.stream_valid_int,
            prefix="valid",
            sharedBatch=self.sharedBatch,
            sharedData=self.sharedData)
        train_monitor = TrainingDataMonitoring(variables=[cost],
                                               prefix="train",
                                               after_epoch=True)

        sharedVarMonitor = SwitchSharedReferences(self.sharedBatch,
                                                  self.sharedData)
        tBest = self.track_best('valid_cost', self.cg)
        self.tracker = tBest[0]
        extensions = [sharedVarMonitor, valid_monitor] + tBest

        if self.debug:
            extensions.append(Printing())

        self.algorithm = algorithm
        self.extensions = extensions
        self.model = m
        self.mainloop = MainLoop(self.algorithm,
                                 self.stream_train_int,
                                 extensions=self.extensions,
                                 model=self.model)
        self.main_loop(True)
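
The comment about 4 * lstm_dim is the central shape constraint in this pipeline: Blocks' LSTM brick expects its input to already contain the stacked pre-activations of all four gates, so the upstream Linear must map input_dim to 4 * dim. A minimal standalone sketch of that wiring, with illustrative sizes:

import theano.tensor as tensor
from blocks.bricks import Linear
from blocks.bricks.recurrent import LSTM
from blocks.initialization import IsotropicGaussian, Constant

dim = 16
x = tensor.tensor3('x')   # (time, batch, features)
x_to_h = Linear(input_dim=10, output_dim=dim * 4, name='x_to_h',
                weights_init=IsotropicGaussian(), biases_init=Constant(0.0))
lstm = LSTM(dim=dim, name='lstm',
            weights_init=IsotropicGaussian(), biases_init=Constant(0.0))
x_to_h.initialize()
lstm.initialize()
h, c = lstm.apply(x_to_h.apply(x))   # hidden states and memory cells
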
Example 8
def task_ID_layers(x, recurrent_in_size):
    mlp = MLP([Rectifier()] * (len(task_ID_FF_dims) - 1), task_ID_FF_dims,
              name='task_ID_mlp', weights_init=Uniform(width=.2),
              biases_init=Constant(0))
    mlp.push_initialization_config()
    mlp.initialize()
    out_size = task_ID_FF_dims[-1] + recurrent_in_size - len(game_tasks)
    zero_padded_task_IDs = T.concatenate(
        [x[:, :, -len(game_tasks):],
         T.zeros((x.shape[0], x.shape[1],
                  task_ID_FF_dims[0] - len(game_tasks)))], axis=2)
    mlp_out = mlp.apply(zero_padded_task_IDs)
    task_ID_out = T.concatenate([x[:, :, :-len(game_tasks)], mlp_out], axis=2)
    return task_ID_out, out_size
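
The slicing convention here assumes the last len(game_tasks) features of x hold a one-hot task ID: that slice is zero-padded up to the MLP's input width, transformed, and spliced back next to the untouched features. The padding and splicing bookkeeping in plain NumPy, with illustrative sizes and the MLP stubbed out:

import numpy

time, batch, n_tasks, n_other, ff_in = 7, 2, 3, 10, 8
x = numpy.zeros((time, batch, n_other + n_tasks))

task_ids = x[:, :, -n_tasks:]
pad = numpy.zeros((time, batch, ff_in - n_tasks))
zero_padded_task_IDs = numpy.concatenate([task_ids, pad], axis=2)
assert zero_padded_task_IDs.shape == (time, batch, ff_in)

mlp_out = zero_padded_task_IDs   # stand-in for mlp.apply(zero_padded_task_IDs)
task_ID_out = numpy.concatenate([x[:, :, :-n_tasks], mlp_out], axis=2)
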
Example 9
                     k=k,
                     name="emitter")

source_names = [name for name in transition.apply.states if 'states' in name]
readout = Readout(readout_dim=hidden_size_recurrent,
                  source_names=source_names,
                  emitter=emitter,
                  feedback_brick=feedback,
                  name="readout")

generator = SequenceGenerator(readout=readout,
                              transition=transition,
                              name="generator")

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.)
generator.push_initialization_config()

generator.transition.biases_init = IsotropicGaussian(0.01, 1)
generator.transition.push_initialization_config()

generator.initialize()

states = generator.transition.apply.outputs

states = {
    name: shared_floatx_zeros((batch_size, hidden_size_recurrent))
    for name in states
}
Example 10
def default_init(brick):
    brick.weights_init = Uniform(width=0.08)
    brick.biases_init = Constant(0)
    brick.initialize()
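
A typical use is to run each freshly constructed brick through the helper before applying it, e.g. (assuming Blocks' Linear brick; the sizes are arbitrary):

from blocks.bricks import Linear

linear = Linear(input_dim=10, output_dim=5, name='demo_linear')
default_init(linear)  # sets Uniform(width=0.08)/Constant(0) and initializes
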
Example 11
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of language modeling with RNN",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode",
        choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
        " and `sample` to sample a sequence generated by an"
        " existing one.")
    parser.add_argument("prefix",
                        default="sine",
                        help="The prefix for model, timing and state files")
    parser.add_argument("state",
                        nargs="?",
                        default="",
                        help="Changes to Groundhog state")
    parser.add_argument("--path", help="Path to a language dataset")
    parser.add_argument("--dict", help="Path to the dataset dictionary")
    parser.add_argument("--restart", help="Start anew")
    parser.add_argument("--reset",
                        action="store_true",
                        default=False,
                        help="Reset the hidden state between batches")
    parser.add_argument("--steps",
                        type=int,
                        default=100,
                        help="Number of steps to plot for the 'sample' mode"
                        " OR training sequence length for the 'train' mode.")
    args = parser.parse_args()
    logger.debug("Args:\n" + str(args))

    dim = 200
    num_chars = 50

    transition = GatedRecurrent(name="transition",
                                activation=Tanh(),
                                dim=dim,
                                weights_init=Orthogonal())
    generator = SequenceGenerator(LinearReadout(
        readout_dim=num_chars,
        source_names=["states"],
        emitter=SoftmaxEmitter(name="emitter"),
        feedbacker=LookupFeedback(num_chars, dim, name='feedback'),
        name="readout"),
                                  transition,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  name="generator")
    generator.allocate()
    logger.debug("Parameters:\n" + pprint.pformat(
        [(key, value.get_value().shape)
         for key, value in Selector(generator).get_params().items()],
        width=120))

    if args.mode == "train":
        batch_size = 1
        seq_len = args.steps

        generator.initialize()

        # Build cost computation graph that uses the saved hidden states.
        # An issue: for Groundhog this is completely transparent, which is
        # why it does not carry the hidden state over the period when
        # validation is done. We should find a way to fix this in the future.
        x = tensor.lmatrix('x')
        init_states = shared_floatx_zeros((batch_size, dim),
                                          name='init_states')
        reset = tensor.scalar('reset')
        cost = ComputationGraph(
            generator.cost(x, states=init_states * reset).sum())
        # TODO: better search routine
        states = [
            v for v in cost.variables if hasattr(v.tag, 'application_call')
            and v.tag.application_call.brick == generator.transition and
            (v.tag.application_call.application == generator.transition.apply)
            and v.tag.role == VariableRole.OUTPUT and v.tag.name == 'states'
        ]
        assert len(states) == 1
        states = states[0]

        gh_model = GroundhogModel(generator, cost)
        gh_model.properties.append(
            ('bpc', cost.outputs[0] * numpy.log(2) / seq_len))
        gh_model.properties.append(('mean_init_state', init_states.mean()))
        gh_model.properties.append(('reset', reset))
        if not args.reset:
            gh_model.updates.append((init_states, states[-1]))

        state = GroundhogState(args.prefix, batch_size,
                               learning_rate=0.0001).as_dict()
        changes = eval("dict({})".format(args.state))
        state.update(changes)

        def output_format(x, y, reset):
            return dict(x=x[:, None], reset=reset)

        train, valid, test = [
            LMIterator(batch_size=batch_size,
                       use_infinite_loop=mode == 'train',
                       path=args.path,
                       seq_len=seq_len,
                       mode=mode,
                       chunks='chars',
                       output_format=output_format,
                       can_fit=True) for mode in ['train', 'valid', 'test']
        ]

        trainer = SGD(gh_model, state, train)
        state['on_nan'] = 'warn'
        state['cutoff'] = 1.

        main_loop = MainLoop(train, valid, None, gh_model, trainer, state,
                             None)
        if not args.restart:
            main_loop.load()
        main_loop.main()
    elif args.mode == "sample":
        load_params(generator, args.prefix + "model.npz")

        chars = numpy.load(args.dict)['unique_chars']

        sample = ComputationGraph(
            generator.generate(n_steps=args.steps, batch_size=10,
                               iterate=True)).function()

        states, outputs, costs = sample()

        for i in range(10):
            print("Generation cost: {}".format(costs[:, i].sum()))
            print("".join([chars[o] for o in outputs[:, i]]))
    else:
        assert False
Example 12
def main(config):
    print('working on it ...')
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)
    # Extensions
    extensions = []
    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Set up beam search and sampling computation graphs if necessary
    if config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    logger.info("Building sampler")
    global samplers_ob
    samplers_ob = Sampler(model=search_model, data_stream=input_sentence_mask,
                          hook_samples=config['hook_samples'],
                          every_n_batches=config['sampling_freq'],
                          src_vocab_size=config['src_vocab_size'])

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=None,
        data_stream=None,
        extensions=extensions
    )

    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')
Example 13
class GatedRecurrent(BaseRecurrent, Initializable):
    u"""Gated recurrent neural network.

    Gated recurrent neural network (GRNN) as introduced in [CvMG14]_. Every
    unit of a GRNN is equipped with update and reset gates that facilitate
    better gradient propagation.

    Parameters
    ----------
    dim : int
        The dimension of the hidden state.
    activation : :class:`.Brick` or None
        The brick to apply as activation. If ``None`` a
        :class:`.Tanh` brick is used.
    gate_activation : :class:`.Brick` or None
        The brick to apply as activation for gates. If ``None`` a
        :class:`.Logistic` brick is used.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    .. [CvMG14] Kyunghyun Cho, Bart van Merriënboer, Çağlar Gülçehre,
        Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua
        Bengio, *Learning Phrase Representations using RNN Encoder-Decoder
        for Statistical Machine Translation*, EMNLP (2014), pp. 1724-1734.

    """
    @lazy(allocation=['dim'])
    def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
        self.dim = dim

        self.recurrent_weights_init = None
        self.initial_states_init = None

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        children = [activation, gate_activation] + kwargs.get('children', [])
        super(GatedRecurrent, self).__init__(children=children, **kwargs)

    @property
    def state_to_state(self):
        return self.parameters[0]

    @property
    def state_to_gates(self):
        return self.parameters[1]

    @property
    def initial_states_(self):
        return self.parameters[2]

    def get_dim(self, name):
        if name == 'mask':
            return 0
        if name in ['inputs', 'states']:
            return self.dim
        if name == 'gate_inputs':
            return 2 * self.dim
        return super(GatedRecurrent, self).get_dim(name)

    def _allocate(self):
        self.parameters.append(
            shared_floatx_nans((self.dim, self.dim), name='state_to_state'))
        add_role(self.parameters[-1], WEIGHT)

        self.parameters.append(
            shared_floatx_nans((self.dim, 2 * self.dim),
                               name='state_to_gates'))
        add_role(self.parameters[-1], WEIGHT)

        self.parameters.append(
            shared_floatx_nans((self.dim, ), name="initial_state"))
        add_role(self.parameters[-1], INITIAL_STATE)

    def _initialize(self):
        # TODO: figure out what to do once Blocks #740 is resolved
        if self.recurrent_weights_init is None:
            self.recurrent_weights_init = self.weights_init
        if self.initial_states_init is None:
            self.initial_states_init = Constant(0.0)
        self.recurrent_weights_init.initialize(self.state_to_state, self.rng)
        state_to_update = self.weights_init.generate(self.rng,
                                                     (self.dim, self.dim))
        state_to_reset = self.weights_init.generate(self.rng,
                                                    (self.dim, self.dim))
        self.state_to_gates.set_value(
            numpy.hstack([state_to_update, state_to_reset]))
        self.initial_states_init.initialize(self.parameters.initial_state,
                                            self.rng)

    @recurrent(sequences=['mask', 'inputs', 'gate_inputs'],
               states=['states'],
               outputs=['states'],
               contexts=[])
    def apply(self, inputs, gate_inputs, states, mask=None):
        """Apply the gated recurrent transition.

        Parameters
        ----------
        states : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of current states in the shape
            (batch_size, dim). Required for `one_step` usage.
        inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs in the shape (batch_size,
            dim)
        gate_inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs to the gates in the
            shape (batch_size, 2 * dim).
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array in the shape (batch,) which is 1 if there is
            data available, 0 if not. Assumed to be 1-s only if not given.

        Returns
        -------
        output : :class:`~tensor.TensorVariable`
            Next states of the network.

        """
        gate_values = self.gate_activation.apply(
            states.dot(self.state_to_gates) + gate_inputs)
        update_values = gate_values[:, :self.dim]
        reset_values = gate_values[:, self.dim:]
        states_reset = states * reset_values
        next_states = self.activation.apply(
            states_reset.dot(self.state_to_state) + inputs)
        next_states = (next_states * update_values + states *
                       (1 - update_values))
        if mask:
            next_states = (mask[:, None] * next_states +
                           (1 - mask[:, None]) * states)
        return next_states

    @application(outputs=apply.states)
    def initial_states(self, batch_size, *args, **kwargs):
        return [
            tensor.repeat(self.parameters.initial_state[None, :], batch_size,
                          0)
        ]
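
The arithmetic inside apply is the standard GRU step, using the column layout that _initialize set up. The same transition in plain NumPy, with sigmoid and tanh standing in for the default gate_activation and activation bricks and illustrative sizes:

import numpy

def sigmoid(z):
    return 1.0 / (1.0 + numpy.exp(-z))

dim, batch = 3, 5
rng = numpy.random.RandomState(0)
state_to_state = rng.normal(size=(dim, dim))
state_to_gates = rng.normal(size=(dim, 2 * dim))   # [update | reset]
states = rng.normal(size=(batch, dim))
inputs = rng.normal(size=(batch, dim))
gate_inputs = rng.normal(size=(batch, 2 * dim))

gate_values = sigmoid(states.dot(state_to_gates) + gate_inputs)
update_values = gate_values[:, :dim]
reset_values = gate_values[:, dim:]
candidate = numpy.tanh((states * reset_values).dot(state_to_state) + inputs)
next_states = candidate * update_values + states * (1 - update_values)
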
Example 14
    def __init__(self, ref_data, output_dim):
        if pca_dims is not None:
            covmat = numpy.dot(ref_data.T, ref_data)
            ev, evec = numpy.linalg.eig(covmat)
            best_i = ev.argsort()[-pca_dims:]
            best_evecs = evec[:, best_i]
            best_evecs = best_evecs / numpy.sqrt(
                (best_evecs ** 2).sum(axis=0))  # normalize
            ref_data = numpy.dot(ref_data, best_evecs)

        input_dim = ref_data.shape[1]

        ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32),
                                    name='ref_data')

        # Construct the model
        j = tensor.lvector('j')
        r = ref_data_sh[j, :]
        x = tensor.fmatrix('x')
        y = tensor.ivector('y')

        # input_dim must be nr
        mlp = MLP(activations=activation_functions,
                  dims=[input_dim] + hidden_dims + [n_inter],
                  name='inter_gen')
        mlp2 = MLP(activations=activation_functions_2 + [None],
                   dims=[n_inter] + hidden_dims_2 + [output_dim],
                   name='end_mlp')

        inter_weights = mlp.apply(r)

        if inter_bias is None:
            ibias = Bias(n_inter)
            ibias.biases_init = Constant(0)
            ibias.initialize()
            inter = ibias.apply(tensor.dot(x, inter_weights))
        else:
            inter = tensor.dot(x, inter_weights) - inter_bias
        inter = inter_act_fun.apply(inter)

        final = mlp2.apply(inter)

        cost = Softmax().categorical_cross_entropy(y, final)
        confidence = Softmax().apply(final)

        pred = final.argmax(axis=1)
        # error_rate = tensor.neq(y, pred).mean()
        ber = balanced_error_rate.ber(y, pred)

        # Initialize parameters
        for brick in [mlp, mlp2]:
            brick.weights_init = IsotropicGaussian(0.01)
            brick.biases_init = Constant(0.001)
            brick.initialize()

        # apply regularization
        cg = ComputationGraph([cost, ber])

        if r_dropout != 0:
            # - dropout on input vector r : r_dropout
            cg = apply_dropout(cg, [r], r_dropout)

        if x_dropout != 0:
            cg = apply_dropout(cg, [x], x_dropout)

        if s_dropout != 0:
            # - dropout on intermediate layers of first mlp : s_dropout
            s_dropout_vars = list(
                set(
                    VariableFilter(bricks=[Tanh], name='output')
                    (ComputationGraph([inter_weights]))) -
                set([inter_weights]))
            cg = apply_dropout(cg, s_dropout_vars, s_dropout)

        if i_dropout != 0:
            # - dropout on input to second mlp : i_dropout
            cg = apply_dropout(cg, [inter], i_dropout)

        if a_dropout != 0:
            # - dropout on hidden layers of second mlp : a_dropout
            a_dropout_vars = list(
                set(
                    VariableFilter(bricks=[Tanh], name='output')
                    (ComputationGraph([final]))) - set([inter_weights]) -
                set(s_dropout_vars))
            cg = apply_dropout(cg, a_dropout_vars, a_dropout)

        if r_noise_std != 0:
            cg = apply_noise(cg, [r], r_noise_std)

        if w_noise_std != 0:
            # - apply noise on weight variables
            weight_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, weight_vars, w_noise_std)

        [cost_reg, ber_reg] = cg.outputs

        if s_l1pen != 0:
            s_weights = VariableFilter(bricks=mlp.linear_transformations,
                                       roles=[WEIGHT])(cg)
            cost_reg = cost_reg + s_l1pen * sum(
                abs(w).sum() for w in s_weights)
        if i_l1pen != 0:
            cost_reg = cost_reg + i_l1pen * abs(inter).sum()
        if a_l1pen != 0:
            a_weights = VariableFilter(bricks=mlp2.linear_transformations,
                                       roles=[WEIGHT])(cg)
            cost_reg = cost_reg + a_l1pen * sum(
                abs(w).sum() for w in a_weights)

        self.cost = cost
        self.cost_reg = cost_reg
        self.ber = ber
        self.ber_reg = ber_reg
        self.pred = pred
        self.confidence = confidence
Example 15
def main_rnn(config):

    x = tensor.tensor3('features')
    y = tensor.matrix('targets')

#    if 'LSTM' in config['model'] :
#        from models import getLSTMstack
#        y_hat = getLSTMstack(input_dim=13, input_var=x, depth=int(config['model'][-1]))
#    else :
#        raise Exception("These are not the LSTM we are looking for")

#    y_hat = model.apply(x)
    

    emitter = TestEmitter()
#    emitter = TrivialEmitter(readout_dim=config['lstm_hidden_size'])

#    cost_func = SquaredError()

 #   @application
 #   def qwe(self, readouts, outputs=None):
 #       print(type(self), type(readouts))
 #       x = cost_func.apply(readouts,outputs)
 #       return x
    print(type(emitter.cost))
 #   emitter.cost = qwe
  #  print(type(qwe))

    steps = 2
    n_samples = config['target_size']

    transition = [LSTM(config['lstm_hidden_size']) for _ in range(4)]
    transition = RecurrentStack(transition,
            name="transition", skip_connections=False)

    source_names = [name for name in transition.apply.states if 'states' in name]

    readout = Readout(emitter, readout_dim=config['lstm_hidden_size'],
                      source_names=source_names, feedback_brick=None,
                      merge=None, merge_prototype=None, post_merge=None,
                      merged_dim=None)

    seqgen = SequenceGenerator(readout, transition, attention=None, add_contexts=False)
    seqgen.weights_init = IsotropicGaussian(0.01)
    seqgen.biases_init = Constant(0.)
    seqgen.push_initialization_config()

    seqgen.transition.biases_init = IsotropicGaussian(0.01, 1)
    seqgen.transition.push_initialization_config()
    seqgen.initialize()

    states = seqgen.transition.apply.outputs
    print('states',states)
    states = {name: shared_floatx_zeros((n_samples, config['lstm_hidden_size']))
        for name in states}

    cost_matrix = seqgen.cost_matrix(x, **states)
    cost = cost_matrix.mean()
    cost.name = "nll"

    cg = ComputationGraph(cost)
    model = Model(cost)
    #Cost
#    cost = SquaredError().apply(y_hat ,y)
    #cost = CategoricalCrossEntropy().apply(T.flatten(),Y)
 #   

        #for sampling
    #cg = ComputationGraph(seqgen.generate(n_steps=steps,batch_size=n_samples, iterate=True))
  

    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=config['learning_rate']))



    #Getting the stream
    train_stream = MFCC.get_stream(config['batch_size'], config['source_size'],
                                   config['target_size'], config['num_examples'])


    #Monitoring stuff
    extensions = [Timing(),
                  FinishAfter(after_n_batches=config['num_batches']),
                  #DataStreamMonitoring([cost, error_rate],test_stream,prefix="test"),
                  TrainingDataMonitoring([cost], prefix="train", every_n_batches=1),
                  #Checkpoint(save_to),
                  ProgressBar(),
                  Printing(every_n_batches=1)]
   

    main_loop = MainLoop(
        algorithm,
        train_stream,
 #       model=model,
        extensions=extensions)

    main_loop.run()
Example 16
def main(save_to, hist_file):
    batch_size = 365
    feature_maps = [6, 16]
    mlp_hiddens = [120, 84]
    conv_sizes = [5, 5]
    pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # The above are from LeCun's paper. The blocks example had:
    #    feature_maps = [20, 50]
    #    mlp_hiddens = [500]

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations,
                    1,
                    image_size,
                    filter_sizes=list(zip(conv_sizes, conv_sizes)),
                    feature_maps=feature_maps,
                    pooling_sizes=list(zip(pool_sizes, pool_sizes)),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='valid',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info(
        "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    mnist_test = MNIST(("test", ), sources=['features', 'targets'])

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    error_rate = (MisclassificationRate().apply(y.flatten(),
                                                probs).copy(name='error_rate'))
    confusion = (ConfusionMatrix().apply(y.flatten(),
                                         probs).copy(name='confusion'))
    confusion.tag.aggregation_scheme = Sum(confusion)

    model = Model([error_rate, confusion])

    # Load it with trained parameters
    params = load_parameters(open(save_to, 'rb'))
    model.set_parameter_values(params)

    def full_brick_name(brick):
        return '/'.join([''] + [b.name for b in brick.get_unique_path()])

    # Find layer outputs to probe
    outs = OrderedDict(
        (full_brick_name(get_brick(out)), out) for out in VariableFilter(
            roles=[OUTPUT], bricks=[Convolutional, Linear])(model.variables))

    # Load histogram information
    with open(hist_file, 'rb') as handle:
        histograms = pickle.load(handle)

    # Corpora
    mnist_train = MNIST(("train", ))
    mnist_train_stream = DataStream.default_stream(
        mnist_train,
        iteration_scheme=ShuffledScheme(mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test", ))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size))

    # Probe the given layer
    target_layer = '/lenet/mlp/linear_0'
    next_layer_param = '/lenet/mlp/linear_1.W'
    sample = extract_sample(outs[target_layer], mnist_test_stream)
    print('sample shape', sample.shape)

    # Figure neurons to ablate
    hist = histograms[('linear_1', 'b')]
    targets = [i for i in range(hist.shape[1]) if hist[2, i] * hist[7, i] < 0]
    print('ablating', len(targets), ':', targets)

    # Now adjust the next layer weights based on the probe
    param = model.get_parameter_dict()[next_layer_param]
    print('param shape', param.get_value().shape)

    new_weights = ablate_inputs(targets,
                                sample,
                                param.get_value(),
                                compensate=False)
    param.set_value(new_weights)

    # Evaluation pass
    evaluator = DatasetEvaluator([error_rate, confusion])
    print(evaluator.evaluate(mnist_test_stream))
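
ablate_inputs itself is not shown in this example. A plausible reading of the surrounding code is that ablating a set of units means zeroing the rows of the next layer's weight matrix that read from them, optionally compensating with statistics from the probed sample. A hedged NumPy sketch of the no-compensation case; this is an assumption about the helper, not its actual implementation:

import numpy

def ablate_inputs_sketch(targets, W):
    # W: (n_inputs, n_outputs); row i holds unit i's outgoing weights.
    # Zeroing those rows silences the ablated units downstream.
    new_W = W.copy()
    new_W[list(targets), :] = 0.0
    return new_W

W = numpy.ones((4, 3))
print(ablate_inputs_sketch([1, 3], W))  # rows 1 and 3 become zero
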
Example 17
def main(mode, save_path, steps, num_batches):
    num_states = MarkovChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition",
                                    activation=Tanh(),
                                    dim=dim)
        generator = SequenceGenerator(LinearReadout(
            readout_dim=num_states,
            source_names=["states"],
            emitter=SoftmaxEmitter(name="emitter"),
            feedbacker=LookupFeedback(num_states,
                                      feedback_dim,
                                      name='feedback'),
            name="readout"),
                                      transition,
                                      weights_init=IsotropicGaussian(0.01),
                                      biases_init=Constant(0),
                                      name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Give an idea of what's going on.
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape)
             for key, value in Selector(generator).get_params().items()],
            width=120))
        logger.info("Markov chain entropy: {}".format(
            MarkovChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -MarkovChainDataset.entropy * seq_len))

        # Build the cost computation graph.
        x = tensor.lmatrix('data')
        cost = aggregation.mean(generator.cost(x[:, :]).sum(), x.shape[1])
        cost.name = "sequence_log_likelihood"

        algorithm = GradientDescent(
            cost=cost,
            params=list(Selector(generator).get_params().values()),
            step_rule=Scale(0.001))
        main_loop = MainLoop(algorithm=algorithm,
                             data_stream=DataStream(
                                 MarkovChainDataset(rng, seq_len),
                                 iteration_scheme=ConstantScheme(batch_size)),
                             model=Model(cost),
                             extensions=[
                                 FinishAfter(after_n_batches=num_batches),
                                 TrainingDataMonitoring(
                                     [cost],
                                     prefix="this_step",
                                     after_every_batch=True),
                                 TrainingDataMonitoring([cost],
                                                        prefix="average",
                                                        every_n_batches=100),
                                 SerializeMainLoop(save_path,
                                                   every_n_batches=500),
                                 Printing(every_n_batches=100)
                             ])
        main_loop.run()
    elif mode == "sample":
        main_loop = cPickle.load(open(save_path, "rb"))
        generator = main_loop.model

        sample = ComputationGraph(
            generator.generate(n_steps=steps, batch_size=1,
                               iterate=True)).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               MarkovChainDataset.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, MarkovChainDataset.trans_prob))
    else:
        assert False
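
The "expected min error" logged in the train branch is the chain's entropy rate times seq_len: no predictor can do better than that negative log-likelihood on average. A self-contained check of the quantity for a toy two-state transition matrix:

import numpy

trans_prob = numpy.array([[0.9, 0.1],
                          [0.4, 0.6]])
# Stationary distribution: left eigenvector of trans_prob for eigenvalue 1.
evals, evecs = numpy.linalg.eig(trans_prob.T)
pi = numpy.real(evecs[:, numpy.argmin(numpy.abs(evals - 1))])
pi = pi / pi.sum()
# Entropy rate in nats per symbol.
entropy = -sum(pi[i] * trans_prob[i, j] * numpy.log(trans_prob[i, j])
               for i in range(2) for j in range(2))
seq_len = 100
print("expected min NLL:", entropy * seq_len)
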
Example 18
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code,
                               level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream,
                              reverse_words,
                              add_sources=("targets", ))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(chars, chars_mask, targets,
                                   targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat([(key, value.get_value().shape)
                                    for key, value in parameters.items()],
                                   width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule(
                                        [StepClipping(10.0),
                                         Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies, ) = VariableFilter(applications=[generator.readout.readout],
                                      name_regex="output")(cg.variables)
        (activations, ) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(
            abs(activations).mean(), "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation, batch_size,
            max_length, cost_per_character, algorithm.total_step_norm,
            algorithm.total_gradient_norm
        ]
        for name, parameter in parameters.items():
            observables.append(named_copy(parameter.norm(2), name + "_norm"))
            observables.append(
                named_copy(algorithm.gradients[parameter].norm(2),
                           name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(observables,
                                                    prefix="average",
                                                    every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path,
                           every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)
            ])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(bricks=[reverser.generator],
                                          name="outputs")(ComputationGraph(
                                              generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search({chars: input_},
                                                    char2code['</S>'],
                                                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n"
                       if mode == "sample" else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [
                char2code.get(char, char2code["<UNK>"])
                for char in line.lower().strip()
            ]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size,
                             axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
Example 19
def construct_model(input_dim, out_dim):
    # Construct the model
    r = tensor.fmatrix('r')
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    nx = x.shape[0]
    nj = x.shape[1]  # also is r.shape[0]
    nr = r.shape[1]

    # r is nj x nr
    # x is nx x nj
    # y is nx

    # r_rep is nx x nj x nr
    r_rep = r[None, :, :].repeat(axis=0, repeats=nx)
    # x3 is nx x nj x 1
    x3 = x[:, :, None]

    # concat is nx x nj x (nr + 1)
    concat = tensor.concatenate([r_rep, x3], axis=2)

    # Change concat from Batch x Time x Features to T X B x F
    rnn_input = concat.dimshuffle(1, 0, 2)

    if use_ensembling:
        # Split time dimension into batches of size num_feats
        # Join that dimension with the B dimension
        ens_shape = (num_feats, rnn_input.shape[0] // num_feats,
                     rnn_input.shape[1])
        rnn_input = rnn_input.reshape(ens_shape + (input_dim + 1, ))
        rnn_input = rnn_input.reshape(
            (ens_shape[0], ens_shape[1] * ens_shape[2], input_dim + 1))

    linear = Linear(input_dim=input_dim + 1,
                    output_dim=4 * hidden_dim,
                    name="input_linear")
    lstm = LSTM(dim=hidden_dim,
                activation=activation_function,
                name="hidden_recurrent")
    top_linear = Linear(input_dim=hidden_dim,
                        output_dim=out_dim,
                        name="out_linear")

    pre_rnn = linear.apply(rnn_input)
    states = lstm.apply(pre_rnn)[0]
    activations = top_linear.apply(states)

    if use_ensembling:
        activations = activations.reshape(ens_shape + (out_dim, ))
        # Unsplit batches (ensembling)
        activations = tensor.mean(activations, axis=1)
    # Mean over time
    activations = tensor.mean(activations, axis=0)

    cost = Softmax().categorical_cross_entropy(y, activations)

    pred = activations.argmax(axis=1)
    error_rate = tensor.neq(y, pred).mean()

    # Initialize parameters

    for brick in (linear, lstm, top_linear):
        brick.weights_init = IsotropicGaussian(0.1)
        brick.biases_init = Constant(0.)
        brick.initialize()

    # apply noise
    cg = ComputationGraph([cost, error_rate])
    noise_vars = VariableFilter(roles=[WEIGHT])(cg.variables)
    # apply_noise returns a new ComputationGraph; it must be reassigned,
    # otherwise cg.outputs below would come from the un-noised graph
    cg = apply_noise(cg, noise_vars, noise_std)
    [cost_reg, error_rate_reg] = cg.outputs

    return cost_reg, error_rate_reg, cost, error_rate
Example n. 20
0
    def main(self):
        import itertools

        import numpy
        from theano import tensor
        from blocks.algorithms import Adam
        from blocks.bricks import MLP, Rectifier, Identity, LinearMaxout, Linear
        from blocks.bricks.bn import BatchNormalization
        from blocks.bricks.sequences import Sequence
        from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar
        from blocks.extensions.monitoring import DataStreamMonitoring
        from blocks.extensions.saveload import Checkpoint
        from blocks.graph import ComputationGraph, apply_dropout
        from blocks.graph.bn import (batch_normalization,
                                     get_batch_normalization_updates)
        from blocks.filter import VariableFilter
        from blocks.initialization import IsotropicGaussian, Constant
        from blocks.model import Model
        from blocks.main_loop import MainLoop
        from blocks.roles import INPUT

        from ali.algorithms import ali_algorithm
        from ali.streams import create_gaussian_mixture_data_streams
        from ali.bricks import (ALI, COVConditional, DeterministicConditional,
                                XZJointDiscriminator)
        from ali.utils import as_array

        from blocks.select import Selector

        import logging
        import argparse

        from pacgan.extensions import ModelLogger, GraphLogger, MetricLogger
        import fuel
        from math import cos, sin

        seed = random.randint(1, 100000)
        fuelrc_path = os.path.join(self._work_dir, ".fuelrc")
        with open(fuelrc_path, "w") as f:
            f.write("default_seed: {}\n".format(seed))
        fuel.config.default_seed = seed

        INPUT_DIM = 2
        NLAT = 2
        GEN_HIDDEN = 400
        DISC_HIDDEN = 200
        GEN_ACTIVATION = Rectifier
        MAXOUT_PIECES = 5
        GAUSSIAN_INIT = IsotropicGaussian(std=0.02)
        ZERO_INIT = Constant(0.0)

        NUM_EPOCHS = 400
        LEARNING_RATE = 1e-4
        BETA1 = 0.8
        BATCH_SIZE = 100
        MONITORING_BATCH_SIZE = 500
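        # Target distribution: a mixture of 8 isotropic Gaussians with
        # std 0.01, their means evenly spaced on the unit circle.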
        MEANS = [
            numpy.array(
                [cos(i * numpy.pi * 2 / 8),
                 sin(i * numpy.pi * 2 / 8)]) for i in range(8)
        ]
        VARIANCES = [0.01**2 * numpy.eye(len(mean)) for mean in MEANS]
        PRIORS = None

        def create_model_brick():
            encoder_mapping = MLP(
                dims=[2 * INPUT_DIM, GEN_HIDDEN, GEN_HIDDEN, NLAT],
                activations=[
                    Sequence([
                        BatchNormalization(GEN_HIDDEN).apply,
                        GEN_ACTIVATION().apply
                    ],
                             name='encoder_h1'),
                    Sequence([
                        BatchNormalization(GEN_HIDDEN).apply,
                        GEN_ACTIVATION().apply
                    ],
                             name='encoder_h2'),
                    Identity(name='encoder_out')
                ],
                use_bias=False,
                name='encoder_mapping')
            encoder = COVConditional(encoder_mapping, (INPUT_DIM, ),
                                     name='encoder')

            decoder_mapping = MLP(dims=[
                NLAT, GEN_HIDDEN, GEN_HIDDEN, GEN_HIDDEN, GEN_HIDDEN, INPUT_DIM
            ],
                                  activations=[
                                      Sequence([
                                          BatchNormalization(GEN_HIDDEN).apply,
                                          GEN_ACTIVATION().apply
                                      ],
                                               name='decoder_h1'),
                                      Sequence([
                                          BatchNormalization(GEN_HIDDEN).apply,
                                          GEN_ACTIVATION().apply
                                      ],
                                               name='decoder_h2'),
                                      Sequence([
                                          BatchNormalization(GEN_HIDDEN).apply,
                                          GEN_ACTIVATION().apply
                                      ],
                                               name='decoder_h3'),
                                      Sequence([
                                          BatchNormalization(GEN_HIDDEN).apply,
                                          GEN_ACTIVATION().apply
                                      ],
                                               name='decoder_h4'),
                                      Identity(name='decoder_out')
                                  ],
                                  use_bias=False,
                                  name='decoder_mapping')
            decoder = DeterministicConditional(decoder_mapping, name='decoder')

            x_discriminator = Identity(name='x_discriminator')
            z_discriminator = Identity(name='z_discriminator')
            joint_discriminator = Sequence(application_methods=[
                LinearMaxout(input_dim=INPUT_DIM + NLAT,
                             output_dim=DISC_HIDDEN,
                             num_pieces=MAXOUT_PIECES,
                             weights_init=GAUSSIAN_INIT,
                             biases_init=ZERO_INIT,
                             name='discriminator_h1').apply,
                LinearMaxout(input_dim=DISC_HIDDEN,
                             output_dim=DISC_HIDDEN,
                             num_pieces=MAXOUT_PIECES,
                             weights_init=GAUSSIAN_INIT,
                             biases_init=ZERO_INIT,
                             name='discriminator_h2').apply,
                LinearMaxout(input_dim=DISC_HIDDEN,
                             output_dim=DISC_HIDDEN,
                             num_pieces=MAXOUT_PIECES,
                             weights_init=GAUSSIAN_INIT,
                             biases_init=ZERO_INIT,
                             name='discriminator_h3').apply,
                Linear(input_dim=DISC_HIDDEN,
                       output_dim=1,
                       weights_init=GAUSSIAN_INIT,
                       biases_init=ZERO_INIT,
                       name='discriminator_out').apply
            ],
                                           name='joint_discriminator')
            discriminator = XZJointDiscriminator(x_discriminator,
                                                 z_discriminator,
                                                 joint_discriminator,
                                                 name='discriminator')

            ali = ALI(encoder=encoder,
                      decoder=decoder,
                      discriminator=discriminator,
                      weights_init=GAUSSIAN_INIT,
                      biases_init=ZERO_INIT,
                      name='ali')
            ali.push_allocation_config()
            encoder_mapping.linear_transformations[-1].use_bias = True
            decoder_mapping.linear_transformations[-1].use_bias = True
            ali.initialize()

            print("Number of parameters in discriminator: {}".format(
                numpy.sum([
                    numpy.prod(v.shape.eval()) for v in Selector(
                        ali.discriminator).get_parameters().values()
                ])))
            print("Number of parameters in encoder: {}".format(
                numpy.sum([
                    numpy.prod(v.shape.eval())
                    for v in Selector(ali.encoder).get_parameters().values()
                ])))
            print("Number of parameters in decoder: {}".format(
                numpy.sum([
                    numpy.prod(v.shape.eval())
                    for v in Selector(ali.decoder).get_parameters().values()
                ])))

            return ali

        def create_models():
            ali = create_model_brick()
            x = tensor.matrix('features')
            z = ali.theano_rng.normal(size=(x.shape[0], NLAT))

            def _create_model(with_dropout):
                cg = ComputationGraph(ali.compute_losses(x, z))
                if with_dropout:
                    inputs = VariableFilter(bricks=ali.discriminator.
                                            joint_discriminator.children[1:],
                                            roles=[INPUT])(cg.variables)
                    cg = apply_dropout(cg, inputs, 0.5)
                    inputs = VariableFilter(
                        bricks=[ali.discriminator.joint_discriminator],
                        roles=[INPUT])(cg.variables)
                    cg = apply_dropout(cg, inputs, 0.2)
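                    # Net effect: 0.2 dropout on the joint discriminator's
                    # input, 0.5 on the inputs of its later layers.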
                return Model(cg.outputs)

            model = _create_model(with_dropout=False)
            with batch_normalization(ali):
                bn_model = _create_model(with_dropout=False)

            pop_updates = list(
                set(
                    get_batch_normalization_updates(bn_model,
                                                    allow_duplicates=True)))
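            # Population statistics are tracked as an exponential moving
            # average of the minibatch statistics: p <- 0.95 * p + 0.05 * m.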
            bn_updates = [(p, m * 0.05 + p * 0.95) for p, m in pop_updates]

            return model, bn_model, bn_updates

        def create_main_loop():
            model, bn_model, bn_updates = create_models()
            ali, = bn_model.top_bricks
            discriminator_loss, generator_loss = bn_model.outputs
            step_rule = Adam(learning_rate=LEARNING_RATE, beta1=BETA1)
            algorithm = ali_algorithm(discriminator_loss,
                                      ali.discriminator_parameters, step_rule,
                                      generator_loss, ali.generator_parameters,
                                      step_rule)
            algorithm.add_updates(bn_updates)
            streams = create_gaussian_mixture_data_streams(
                batch_size=BATCH_SIZE,
                monitoring_batch_size=MONITORING_BATCH_SIZE,
                means=MEANS,
                variances=VARIANCES,
                priors=PRIORS)
            main_loop_stream, train_monitor_stream, valid_monitor_stream = streams
            bn_monitored_variables = ([
                v for v in bn_model.auxiliary_variables if 'norm' not in v.name
            ] + bn_model.outputs)
            monitored_variables = (
                [v
                 for v in model.auxiliary_variables if 'norm' not in v.name] +
                model.outputs)
            extensions = [
                Timing(),
                FinishAfter(after_n_epochs=NUM_EPOCHS),
                DataStreamMonitoring(bn_monitored_variables,
                                     train_monitor_stream,
                                     prefix="train",
                                     updates=bn_updates),
                DataStreamMonitoring(monitored_variables,
                                     valid_monitor_stream,
                                     prefix="valid"),
                Checkpoint(os.path.join(self._work_dir, "main_loop.tar"),
                           after_epoch=True,
                           after_training=True,
                           use_cpickle=True),
                ProgressBar(),
                Printing(),

                #ModelLogger(folder=self._work_dir, after_epoch=True),
                GraphLogger(num_modes=1,
                            num_samples=2500,
                            dimension=2,
                            r=0,
                            std=1,
                            folder=self._work_dir,
                            after_epoch=True,
                            after_training=True),
                MetricLogger(means=MEANS,
                             variances=VARIANCES,
                             folder=self._work_dir,
                             after_epoch=True)
            ]
            main_loop = MainLoop(model=bn_model,
                                 data_stream=main_loop_stream,
                                 algorithm=algorithm,
                                 extensions=extensions)
            return main_loop

        main_loop = create_main_loop()
        main_loop.run()
Example n. 21
0
def initialize(to_init, weights_init=Uniform(width=0.08), biases_init=Constant(0)):
    for brick in to_init:
        brick.weights_init = weights_init
        brick.biases_init = biases_init
        brick.initialize()
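
A minimal usage sketch for this helper (the bricks below are hypothetical; any Blocks bricks exposing `weights_init`/`biases_init` work the same way):

    from blocks.bricks import Linear, Tanh
    from blocks.bricks.recurrent import SimpleRecurrent

    proj = Linear(input_dim=100, output_dim=50, name='proj')
    rnn = SimpleRecurrent(dim=50, activation=Tanh(), name='rnn')
    initialize([proj, rnn])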
Example n. 22
0
def main(name, dataset, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim, oldmodel):

    image_size, channels, data_train, data_valid, data_test = datasets.get_data(
        dataset)

    train_stream = Flatten(
        DataStream.default_stream(data_train,
                                  iteration_scheme=SequentialScheme(
                                      data_train.num_examples, batch_size)))
    valid_stream = Flatten(
        DataStream.default_stream(data_valid,
                                  iteration_scheme=SequentialScheme(
                                      data_valid.num_examples, batch_size)))
    test_stream = Flatten(
        DataStream.default_stream(data_test,
                                  iteration_scheme=SequentialScheme(
                                      data_test.num_examples, batch_size)))

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = channels * img_height * img_width

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # Configure attention mechanism
    if attention != "":
        read_N, write_N = attention.split(',')

        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * channels * read_N**2
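        # The attention reader returns two N x N patches per channel (one
        # from the image, one from the error image), hence the factor 2.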

        reader = AttentionReader(x_dim=x_dim,
                                 dec_dim=dec_dim,
                                 channels=channels,
                                 width=img_width,
                                 height=img_height,
                                 N=read_N,
                                 **inits)
        writer = AttentionWriter(input_dim=dec_dim,
                                 output_dim=x_dim,
                                 channels=channels,
                                 width=img_width,
                                 height=img_height,
                                 N=write_N,
                                 **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

        attention_tag = "full"

    #----------------------------------------------------------------------


    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation. E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = name + "-" + time.strftime("%Y%m%d-%H%M%S")
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print("               dataset: %s" % dataset)
    print("          subdirectory: %s" % subdir)
    print("         learning rate: %g" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print("            batch size: %d" % batch_size)
    print("                epochs: %d" % epochs)
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc",
                      **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec",
                      **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(n_iter,
                     reader=reader,
                     encoder_mlp=encoder_mlp,
                     encoder_rnn=encoder_rnn,
                     sampler=q_sampler,
                     decoder_mlp=decoder_mlp,
                     decoder_rnn=decoder_rnn,
                     writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    x_recons, kl_terms = draw.reconstruct(x)

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_epoch=True),
            #            DataStreamMonitoring(
            #                monitors,
            #                valid_stream,
            ##                updates=scan_updates,
            #                prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
                #                updates=scan_updates,
                prefix="test"),
            PartsOnlyCheckpoint("{}/{}".format(subdir, name),
                                before_training=True,
                                after_epoch=True,
                                save_separately=['log', 'model']),
            SampleCheckpoint(image_size=image_size[0],
                             channels=channels,
                             save_subdir=subdir,
                             before_training=True,
                             after_epoch=True),
            # Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
Example n. 23
0
def check_constant(const, shape, ground_truth):
    # rng unused, so pass None.
    init = Constant(const).generate(None, ground_truth.shape)
    assert_(ground_truth.dtype == theano.config.floatX)
    assert_(ground_truth.shape == init.shape)
    assert_equal(ground_truth, init)
Example n. 24
0
import data
from model.time_mlp import Model, Stream

n_begin_end_pts = 5  # how many points we consider at the beginning and end of the known trajectory

dim_embeddings = [
    ('origin_call', data.origin_call_train_size, 10),
    ('origin_stand', data.stands_size, 10),
    ('week_of_year', 52, 10),
    ('day_of_week', 7, 10),
    ('qhour_of_day', 24 * 4, 10),
    ('day_type', 3, 10),
    ('taxi_id', 448, 10),
]

dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
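# 5 points at each end of the known trajectory, 2 ends, 2 coordinates each:
# 5 * 2 * 2 = 20 inputs, plus one input per embedding dimension above.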
dim_hidden = [500, 100]
dim_output = 1

embed_weights_init = IsotropicGaussian(0.001)
mlp_weights_init = IsotropicGaussian(0.01)
mlp_biases_init = Constant(0.001)

exp_base = 1.5

learning_rate = 0.00001
momentum = 0.99
batch_size = 32

max_splits = 100
Example n. 25
0
def test_attention_recurrent():
    rng = numpy.random.RandomState(1234)

    dim = 5
    batch_size = 4
    input_length = 20

    attended_dim = 10
    attended_length = 15

    wrapped = SimpleRecurrent(dim, Identity())
    attention = SequenceContentAttention(
        state_names=wrapped.apply.states,
        attended_dim=attended_dim, match_dim=attended_dim)
    recurrent = AttentionRecurrent(wrapped, attention, seed=1234)
    recurrent.weights_init = IsotropicGaussian(0.5)
    recurrent.biases_init = Constant(0)
    recurrent.initialize()

    attended = tensor.tensor3("attended")
    attended_mask = tensor.matrix("attended_mask")
    inputs = tensor.tensor3("inputs")
    inputs_mask = tensor.matrix("inputs_mask")
    outputs = recurrent.apply(
        inputs=inputs, mask=inputs_mask,
        attended=attended, attended_mask=attended_mask)
    states, glimpses, weights = outputs
    assert states.ndim == 3
    assert glimpses.ndim == 3
    assert weights.ndim == 3

    # For values.
    def rand(size):
        return rng.uniform(size=size).astype(theano.config.floatX)

    # For masks.
    def generate_mask(length, batch_size):
        mask = numpy.ones((length, batch_size), dtype=theano.config.floatX)
        # To make it look like read data
        for i in range(batch_size):
            mask[1 + rng.randint(0, length - 1):, i] = 0.0
        return mask

    input_vals = rand((input_length, batch_size, dim))
    input_mask_vals = generate_mask(input_length, batch_size)
    attended_vals = rand((attended_length, batch_size, attended_dim))
    attended_mask_vals = generate_mask(attended_length, batch_size)

    func = theano.function([inputs, inputs_mask, attended, attended_mask],
                           [states, glimpses, weights])
    states_vals, glimpses_vals, weight_vals = func(
        input_vals, input_mask_vals,
        attended_vals, attended_mask_vals)
    assert states_vals.shape == (input_length, batch_size, dim)
    assert glimpses_vals.shape == (input_length, batch_size, attended_dim)

    assert (len(ComputationGraph(outputs).shared_variables) ==
            len(Selector(recurrent).get_parameters()))

    # weights for masked-out positions must be zero
    assert numpy.all(weight_vals * (1 - attended_mask_vals.T) == 0)
    # weights for unmasked positions must be non-zero
    assert numpy.all(abs(weight_vals + (1 - attended_mask_vals.T)) > 1e-5)
    # weights from different steps should be noticeably different
    assert (abs(weight_vals[0] - weight_vals[1])).sum() > 1e-2
    # weights for all steps after the last unmasked position should be the same
    for i in range(batch_size):
        last = int(input_mask_vals[:, i].sum())
        for j in range(last, input_length):
            assert_allclose(weight_vals[last, i], weight_vals[j, i], 1e-5)

    # freeze sums
    assert_allclose(weight_vals.sum(), input_length * batch_size, 1e-5)
    assert_allclose(states_vals.sum(), 113.429, rtol=1e-5)
    assert_allclose(glimpses_vals.sum(), 415.901, rtol=1e-5)
Example n. 26
0
        act = Rectifier
    elif activation_function == 'tanh':
        act = Tanh
    elif activation_function == 'sigmoid':
        act = Logistic
    elif activation_function == 'softplus':
        act = Softplus
    layers_act = [act('layer_%d' % i) for i in range(len(hidden_size))]
    NADE_CF_model = tabula_NADE(activations=layers_act,
                                input_dim0=input_dim0,
                                input_dim1=input_dim1,
                                C_dim=C_dim,
                                other_dims=hidden_size,
                                batch_size=batch_size,
                                weights_init=Uniform(std=0.05),
                                biases_init=Constant(0.0)
                                )
    NADE_CF_model.push_initialization_config()
    dims = [input_dim0] + hidden_size + [input_dim0]
    linear_layers = [layer for layer in NADE_CF_model.children
                     if 'linear' in layer.name]
    assert len(linear_layers) == len(dims) - 1
    for i in range(len(linear_layers)):
        H1 = dims[i]
        H2 = dims[i + 1]
        width = 2 * np.sqrt(6) / np.sqrt(H1 + H2)
#         std = np.sqrt(2. / dim)
        linear_layers[i].weights_init = Uniform(width=width)
    
    
#     NADE_CF_model.children[0].weights_init = Constant(1)
Example n. 27
0
from blocks.roles import WEIGHT
from blocks.graph import ComputationGraph
from blocks.filter import VariableFilter
cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
cost.name = 'cost_with_regularization'


from blocks.bricks import MLP
mlp = MLP(activations=[Rectifier(), Softmax()], dims=[784, 100, 10]).apply(x)


from blocks.initialization import IsotropicGaussian, Constant
input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian(0.01)
input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
input_to_hidden.initialize()
hidden_to_output.initialize()


from fuel.datasets import MNIST
mnist = MNIST(("train",))


from fuel.streams import DataStream
from fuel.schemes import SequentialScheme
from fuel.transformers import Flatten


data_stream = Flatten(DataStream.default_stream(
        mnist,
        iteration_scheme=SequentialScheme(mnist.num_examples, batch_size=256)))
Example n. 28
0
def build_and_run(label, config):
    ############## CREATE THE NETWORK ###############
    #Define the parameters
    num_epochs = config['num_epochs']
    num_batches = config['num_batches']
    num_channels = config['num_channels']
    image_shape = config['image_shape']
    filter_size = config['filter_size']
    num_filter = config['num_filter']
    pooling_sizes = config['pooling_sizes']
    mlp_hiddens = config['mlp_hiddens']
    output_size = config['output_size']
    batch_size = config['batch_size']
    activation = config['activation']
    mlp_activation = config['mlp_activation']
    #    print(num_epochs, num_channels, image_shape, filter_size, num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size, activation, mlp_activation)
    lambda_l1 = 0.000025
    lambda_l2 = 0.000025

    print("Building model")
    #Create the symbolics variable
    x = T.tensor4('image_features')
    y = T.lmatrix('targets')

    #Get the parameters
    conv_parameters = zip(filter_size, num_filter)

    #Create the convolutions layers
    conv_layers = list(
        interleave([(Convolutional(filter_size=filter_size,
                                   num_filters=num_filter,
                                   name='conv_{}'.format(i))
                     for i, (filter_size,
                             num_filter) in enumerate(conv_parameters)),
                    (activation),
                    (MaxPooling(size, name='pool_{}'.format(i))
                     for i, size in enumerate(pooling_sizes))]))
    #    (AveragePooling(size, name='pool_{}'.format(i)) for i, size in enumerate(pooling_sizes))]))

    #Create the sequence
    conv_sequence = ConvolutionalSequence(conv_layers,
                                          num_channels,
                                          image_size=image_shape,
                                          weights_init=Uniform(width=0.2),
                                          biases_init=Constant(0.))
    #Initialize the convnet
    conv_sequence.initialize()
    #Add the MLP
    top_mlp_dims = [np.prod(conv_sequence.get_dim('output'))
                    ] + mlp_hiddens + [output_size]
    out = Flattener().apply(conv_sequence.apply(x))
    mlp = MLP(mlp_activation,
              top_mlp_dims,
              weights_init=Uniform(0, 0.2),
              biases_init=Constant(0.))
    #Initialisze the MLP
    mlp.initialize()
    #Get the output
    predict = mlp.apply(out)

    cost = CategoricalCrossEntropy().apply(y.flatten(),
                                           predict).copy(name='cost')
    error = MisclassificationRate().apply(y.flatten(), predict)

    #Little trick to plot the error rate in two different plots (we can't
    #reuse the same variable in two plots, for an unknown reason)
    error_rate = error.copy(name='error_rate')
    error_rate2 = error.copy(name='error_rate2')

    ########### REGULARIZATION ##################
    cg = ComputationGraph([cost])
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    biases = VariableFilter(roles=[BIAS])(cg.variables)
    # # l2_penalty_weights = T.sum([i*lambda_l2/len(weights) * (W ** 2).sum() for i,W in enumerate(weights)]) # Gradually increase penalty for layer
    l2_penalty = T.sum([
        lambda_l2 * (W ** 2).sum() for W in weights + biases
    ])  # constant L2 penalty on all weights and biases
    # # #l2_penalty_bias = T.sum([lambda_l2*(B **2).sum() for B in biases])
    # # #l2_penalty = l2_penalty_weights + l2_penalty_bias
    l2_penalty.name = 'l2_penalty'
    l1_penalty = T.sum([lambda_l1 * T.abs_(z).sum() for z in weights + biases])
    #  l1_penalty_weights = T.sum([i*lambda_l1/len(weights) * T.abs_(W).sum() for i,W in enumerate(weights)]) # Gradually increase penalty for layer
    #  l1_penalty_biases = T.sum([lambda_l1 * T.abs_(B).sum() for B in biases])
    #  l1_penalty = l1_penalty_biases + l1_penalty_weights
    l1_penalty.name = 'l1_penalty'
    costreg = cost + l2_penalty + l1_penalty
    costreg.name = 'costreg'

    ########### DEFINE THE ALGORITHM #############
    #  algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Momentum())
    algorithm = GradientDescent(cost=costreg,
                                parameters=cg.parameters,
                                step_rule=Adam())

    ########### GET THE DATA #####################
    istest = 'test' in config
    train_stream, valid_stream, test_stream = get_stream(batch_size,
                                                         image_shape,
                                                         test=istest)

    ########### INITIALIZING EXTENSIONS ##########
    checkpoint = Checkpoint('models/best_' + label + '.tar')
    checkpoint.add_condition(
        ['after_epoch'], predicate=OnLogRecord('valid_error_rate_best_so_far'))
    #Adding a live plot with the bokeh server
    plot = Plot(
        label,
        channels=[
            ['train_error_rate', 'valid_error_rate'],
            ['valid_cost', 'valid_error_rate2'],
            # ['train_costreg','train_grad_norm']], #
            [
                'train_costreg', 'train_total_gradient_norm',
                'train_l2_penalty', 'train_l1_penalty'
            ]
        ],
        server_url="http://hades.calculquebec.ca:5042")

    grad_norm = aggregation.mean(algorithm.total_gradient_norm)
    grad_norm.name = 'grad_norm'

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
        DataStreamMonitoring([cost, error_rate, error_rate2],
                             valid_stream,
                             prefix="valid"),
        TrainingDataMonitoring([
            costreg, error_rate, error_rate2, grad_norm, l2_penalty, l1_penalty
        ],
                               prefix="train",
                               after_epoch=True),
        plot,
        ProgressBar(),
        Printing(),
        TrackTheBest('valid_error_rate', min),  #Keep best
        checkpoint,  #Save best
        FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=4)
    ]  # Early-stopping
    model = Model(cost)
    main_loop = MainLoop(algorithm,
                         data_stream=train_stream,
                         model=model,
                         extensions=extensions)
    main_loop.run()
Example n. 29
0
def setUp(self):
    self.simple = SimpleRecurrent(dim=3,
                                  weights_init=Constant(2),
                                  activation=Tanh())
    self.simple.initialize()
Example n. 30
0
                           name="lstm")
else:
    lstm = LSTM(activation=Tanh(),
                dim=h_dim,
                bias=bias,
                name="lstm")
h, c = lstm.apply(x_transform)
h_to_o = Linear(name='h_to_o',
                input_dim=h_dim,
                output_dim=o_dim)
o = h_to_o.apply(h)
o = NDimensionalSoftmax().apply(o, extra_ndim=1)

for brick in (lstm, x_to_h, h_to_o):
    brick.weights_init = Glorot()
    brick.biases_init = Constant(0)
    brick.initialize()

cost = CategoricalCrossEntropy().apply(y, o)
cost.name = 'CE'

print('Building training process...')
shapes = []
for param in ComputationGraph(cost).parameters:
    # shapes.append((param.name, param.eval().shape))
    shapes.append(np.prod(list(param.eval().shape)))
print "Total number of parameters: " + str(np.sum(shapes))

if not os.path.exists(save_path):
    os.makedirs(save_path)
log_path = save_path + '/log.txt'
Example n. 31
0
    def create_model(self):
        input_dim = self.input_dim
        x = self.x
        y = self.y
        p = self.p
        mask = self.mask
        hidden_dim = self.hidden_dim
        embedding_dim = self.embedding_dim
        lookup = LookupTable(self.dict_size,
                             embedding_dim,
                             weights_init=IsotropicGaussian(0.001),
                             name='LookupTable')
        x_to_h = Linear(embedding_dim,
                        hidden_dim * 4,
                        name='x_to_h',
                        weights_init=IsotropicGaussian(0.001),
                        biases_init=Constant(0.0))
        lstm = LSTM(hidden_dim,
                    name='lstm',
                    weights_init=IsotropicGaussian(0.001),
                    biases_init=Constant(0.0))
        h_to_o = MLP([Logistic()], [hidden_dim, 1],
                     weights_init=IsotropicGaussian(0.001),
                     biases_init=Constant(0),
                     name='h_to_o')

        lookup.initialize()
        x_to_h.initialize()
        lstm.initialize()
        h_to_o.initialize()

        embed = lookup.apply(x).reshape(
            (x.shape[0], x.shape[1], self.embedding_dim))
        embed.name = "embed_vec"
        x_transform = x_to_h.apply(embed.transpose(1, 0, 2))
        x_transform.name = "Transformed X"
        self.lookup = lookup
        self.x_to_h = x_to_h
        self.lstm = lstm
        self.h_to_o = h_to_o

        #if mask is None:
        h, c = lstm.apply(x_transform)
        #else:
        #h, c = lstm.apply(x_transform, mask=mask)
        h.name = "hidden_state"
        c.name = "cell state"
        # only values of hidden units of the last timeframe are used for
        # the classification
        indices = T.sum(mask, axis=0) - 1
        rel_hid = h[indices, T.arange(h.shape[1])]
        out = self.h_to_o.apply(rel_hid)

        probs = 1 - out
        probs.name = "probability"
        y = y.dimshuffle(0, 'x')
        # Class-weighted cost: positive examples are weighted by 1 / p,
        # negative examples by 1 / (1 - p)
        pos_ex = (y * probs) / p
        neg_ex = (1 - y) * (1 - probs) / np.float32(1 - p)
        reward = pos_ex + neg_ex
        cost = reward
        cost.name = "cost"
        return cost
Example n. 32
0
def create_layers(layer_spec,
                  data_dim,
                  deterministic_layers=0,
                  deterministic_act=None,
                  deterministic_size=1.):
    """
    Parameters
    ----------
    layer_spec : str
        A specification for the layers to construct; typically takes a string
        like "100,50,25,10" and create P- and Q-models with  4 hidden layers
        of specified size.
    data_dim : int
        Dimensionality of the trainig/test data. The bottom-most layers
        will work with thgis dimension.
    deterministic_layers : int
        Dont want to talk about it.
    deterministic_act : 
    deterministic_size : float

    Returns
    -------
    p_layers : list
        List of ProbabilisticLayers with a ProbabilisticTopLayer on top.
    q_layers : list
        List of ProbabilisticLayers
    """
    inits = {
        'weights_init': RWSInitialization(factor=1.),
        #        'weights_init': IsotropicGaussian(0.1),
        'biases_init': Constant(-1.0),
    }

    m = re.match(r"(\d*\.?\d*)x-(\d+)l-(\d+)", layer_spec)
    if m:
        first = int(data_dim * float(m.groups()[0]))
        last = float(m.groups()[2])
        n_layers = int(m.groups()[1])

        base = numpy.exp(numpy.log(first / last) / (n_layers - 1))
        layer_sizes = [data_dim] + [
            int(last * base**i) for i in reversed(range(n_layers))
        ]
        print(layer_sizes)
    else:
        layer_sizes = [data_dim] + [int(i) for i in layer_spec.split(",")]

    p_layers = []
    q_layers = []
    for l, (size_lower,
            size_upper) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        """
        if size_upper < 0:
            lower_before_repeat = size_lower
            p = BernoulliLayer(
                    MLP([Sigmoid()], [size_lower, size_lower], **rinits), 
                    name="p_layer%d"%l)
            q = BernoulliLayer(
                    MLP([Sigmoid()], [size_lower, size_lower], **rinits), 
                    name="q_layer%d"%l)
            for r in xrange(-size_upper):
                p_layers.append(p)
                q_layers.append(q)
            continue
        elif size_lower < 0:
            size_lower = lower_before_repeat
        """
        size_mid = int(deterministic_size * (size_upper + size_lower) // 2)

        p_layers.append(
            BernoulliLayer(MLP(
                [deterministic_act()
                 for i in range(deterministic_layers)] + [Logistic()],
                [size_upper] + [size_mid
                                for i in range(deterministic_layers)] +
                [size_lower], **inits),
                           name="p_layer%d" % l))
        q_layers.append(
            BernoulliLayer(MLP(
                [deterministic_act()
                 for i in range(deterministic_layers)] + [Logistic()],
                [size_lower] + [size_mid
                                for i in range(deterministic_layers)] +
                [size_upper], **inits),
                           name="q_layer%d" % l))

    p_layers.append(
        BernoulliTopLayer(layer_sizes[-1], name="p_top_layer", **inits))

    return p_layers, q_layers
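
For illustration, a sketch of what the two `layer_spec` formats above produce, assuming `data_dim = 100` (sizes follow from the code above; the geometric case is subject to integer truncation):

    # "100,50,25,10" -> layer_sizes = [100, 100, 50, 25, 10]
    # "2x-3l-10"     -> first = 200, last = 10, base = sqrt(200 / 10),
    #                   giving layer_sizes close to [100, 200, 44, 10]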
Example n. 33
0
def testing_init(brick):
    brick.weights_init = Identity()
    brick.biases_init = Constant(0)
    brick.initialize()
Example n. 34
0
class GatedRecurrent(BaseRecurrent, Initializable):
    u"""Gated recurrent neural network.

    Gated recurrent neural network (GRNN) as introduced in [CvMG14]_. Every
    unit of a GRNN is equipped with update and reset gates that facilitate
    better gradient propagation.

    Parameters
    ----------
    dim : int
        The dimension of the hidden state.
    activation : :class:`.Brick` or None
        The brick to apply as activation. If ``None`` a
        :class:`.Tanh` brick is used.
    gate_activation : :class:`.Brick` or None
        The brick to apply as activation for gates. If ``None`` a
        :class:`.Logistic` brick is used.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    .. [CvMG14] Kyunghyun Cho, Bart van Merriënboer, Çağlar Gülçehre,
        Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua
        Bengio, *Learning Phrase Representations using RNN Encoder-Decoder
        for Statistical Machine Translation*, EMNLP (2014), pp. 1724-1734.

    """
    @lazy(allocation=['dim'])
    def __init__(self, dim, activation=None, gate_activation=None,
                 **kwargs):
        super(GatedRecurrent, self).__init__(**kwargs)
        self.dim = dim

        self.recurrent_weights_init = None
        self.initial_states_init = None

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        self.children = [activation, gate_activation]

    @property
    def state_to_state(self):
        return self.parameters[0]

    @property
    def state_to_gates(self):
        return self.parameters[1]

    @property
    def initial_states_(self):
        return self.parameters[2]

    def get_dim(self, name):
        if name == 'mask':
            return 0
        if name in ['inputs', 'states']:
            return self.dim
        if name == 'gate_inputs':
            return 2 * self.dim
        return super(GatedRecurrent, self).get_dim(name)

    def _allocate(self):
        self.parameters.append(
            shared_floatx_nans((self.dim, self.dim), name='state_to_state'))
        add_role(self.parameters[-1], WEIGHT)

        self.parameters.append(
            shared_floatx_nans((self.dim, 2 * self.dim),
                               name='state_to_gates'))
        add_role(self.parameters[-1], WEIGHT)

        self.parameters.append(
            shared_floatx_nans((self.dim,), name='initial_state'))
        add_role(self.parameters[-1], INITIAL_STATE)

    def _initialize(self):
        # TODO: revisit once Blocks #740 is resolved
        if self.recurrent_weights_init is None:
            self.recurrent_weights_init = self.weights_init
        if self.initial_states_init is None:
            self.initial_states_init = Constant(0.0)
        self.recurrent_weights_init.initialize(self.state_to_state, self.rng)
        state_to_update = self.weights_init.generate(
            self.rng, (self.dim, self.dim))
        state_to_reset = self.weights_init.generate(
            self.rng, (self.dim, self.dim))
        self.state_to_gates.set_value(
            numpy.hstack([state_to_update, state_to_reset]))
        self.initial_states_init.initialize(self.initial_states_, self.rng)

    @recurrent(sequences=['mask', 'inputs', 'gate_inputs'],
               states=['states'], outputs=['states'], contexts=[])
    def apply(self, inputs, gate_inputs, states, mask=None):
        """Apply the gated recurrent transition.

        Parameters
        ----------
        states : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of current states in the shape
            (batch_size, dim). Required for `one_step` usage.
        inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs in the shape (batch_size,
            dim)
        gate_inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs to the gates in the
            shape (batch_size, 2 * dim).
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array in the shape (batch,) which is 1 if there is
            data available, 0 if not. Assumed to be 1-s only if not given.

        Returns
        -------
        output : :class:`~tensor.TensorVariable`
            Next states of the network.

        """
        gate_values = self.gate_activation.apply(
            states.dot(self.state_to_gates) + gate_inputs)
        update_values = gate_values[:, :self.dim]
        reset_values = gate_values[:, self.dim:]
        states_reset = states * reset_values
        next_states = self.activation.apply(
            states_reset.dot(self.state_to_state) + inputs)
        next_states = (next_states * update_values +
                       states * (1 - update_values))
        if mask:
            next_states = (mask[:, None] * next_states +
                           (1 - mask[:, None]) * states)
        return next_states

    @application(outputs=apply.states)
    def initial_states(self, batch_size, *args, **kwargs):
        return [tensor.repeat(self.initial_states_[None, :], batch_size, 0)]
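
For reference, a minimal usage sketch of this brick (a sketch only, assuming the usual Blocks imports; the caller is expected to project its data to `dim` and `2 * dim` features for `inputs` and `gate_inputs`, e.g. with `Linear` bricks):

    gru = GatedRecurrent(dim=3, weights_init=IsotropicGaussian(0.1))
    gru.initialize()
    inputs = tensor.tensor3('inputs')            # (time, batch, dim)
    gate_inputs = tensor.tensor3('gate_inputs')  # (time, batch, 2 * dim)
    states = gru.apply(inputs=inputs, gate_inputs=gate_inputs)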