Example #1
def main(save_to, num_batches, continue_=False):
    mlp = MLP([Tanh(), Identity()], [1, 10, 1],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0),
              seed=1)
    mlp.initialize()
    x = tensor.vector('numbers')
    y = tensor.vector('roots')
    cost = SquaredError().apply(y[:, None], mlp.apply(x[:, None]))
    cost.name = "cost"

    main_loop = MainLoop(
        GradientDescent(cost=cost,
                        params=ComputationGraph(cost).parameters,
                        step_rule=Scale(learning_rate=0.001)),
        get_data_stream(range(100)),
        model=Model(cost),
        extensions=([LoadFromDump(save_to)] if continue_ else []) + [
            Timing(),
            FinishAfter(after_n_batches=num_batches),
            DataStreamMonitoring(
                [cost], get_data_stream(range(100, 200)), prefix="test"),
            TrainingDataMonitoring([cost], after_epoch=True),
            Dump(save_to),
            Printing()
        ])
    main_loop.run()
    return main_loop
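
Example #1 calls a get_data_stream helper that the snippet does not include. Below is a minimal sketch of what it could look like, assuming a Fuel IterableDataset that pairs each number with its square root (matching the 'numbers' and 'roots' sources expected above); the helper name comes from the call site, but this body is hypothetical:

import numpy
from fuel.datasets import IterableDataset
from fuel.streams import DataStream

def get_data_stream(iterable):
    # Hypothetical helper: serve each number together with its square root,
    # under the source names the Theano variables above expect.
    numbers = numpy.asarray(list(iterable), dtype='float32')
    dataset = IterableDataset({'numbers': numbers, 'roots': numbers ** 0.5})
    return DataStream(dataset)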
Example #2
def main(name, epochs, batch_size, learning_rate, dim, mix_dim, old_model_name,
         max_length, bokeh, GRU, dropout, depth, max_grad, step_method,
         epsilon, sample):

    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """ Convert a positive float into a short tag-usable string
             E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2
        """
        return '0' if x <= 0 else '%s%d' % (
            ("%e" % x)[0], -np.floor(np.log10(x)))

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (
        datasource, depth, dim, mix_dim, int(
            dropout * 10), shnum(learning_rate), batch_size, shnum(epsilon))
    if max_length != 600:
        jobname += '-L%d' % max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g' % max_grad
    if step_method != 'adam':
        jobname += step_method

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)

    #----------------------------------------------------------------------
    if depth > 1:
        transition = LSTMstack(dim=dim,
                               depth=depth,
                               name="transition",
                               lstm_name="transition")
        assert not GRU
    elif GRU:
        transition = GatedRecurrent(dim=dim, name="transition")
    else:
        transition = LSTM(dim=dim, name="transition")

    emitter = SketchEmitter(mix_dim=mix_dim, epsilon=epsilon, name="emitter")
    readout = Readout(readout_dim=emitter.get_dim('inputs'),
                      source_names=['states'],
                      emitter=emitter,
                      name="readout")
    normal_inputs = [
        name for name in transition.apply.sequences if 'mask' not in name
    ]
    fork = Fork(normal_inputs, prototype=Linear(use_bias=True))
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  fork=fork)

    # Initialization settings
    generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps,batch_size, 3]
    x = T.tensor3('features', dtype=floatX)[:max_length, :, :]
    x.tag.test_value = np.ones((max_length, batch_size, 3)).astype(np.float32)
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_params()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, value.get_value().shape)
                                for key, value in params.items()],
                               width=120))
    model_size = 0
    for v in params.values():
        s = v.get_value().shape
        model_size += s[0] * (s[1] if len(s) > 1 else 1)
    logger.info("Total number of parameters %d" % model_size)

    #------------------------------------------------------------
    extensions = []
    if old_model_name == 'continue':
        extensions.append(LoadFromDump(jobname))
    elif old_model_name:
        # or you can just load the weights without state using:
        old_params = LoadFromDump(old_model_name).manager.load_parameters()
        model.set_param_values(old_params)
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path='.').do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=[transition],
                                        name_regex='states')(cg.variables)
        cg = apply_dropout(cg, dropout_target, dropout)
        cost = cg.outputs[0]

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate=0.1)
    else:
        raise Exception('Unknown step method %s' % step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies, ) = VariableFilter(applications=[generator.readout.readout],
                                  name_regex="output")(cg.variables)
    (activations, ) = VariableFilter(
        applications=[generator.transition.apply],
        name=generator.transition.apply.states[0])(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_activation = named_copy(abs(activations).mean(), "mean_activation")
    observables += [min_energy, max_energy, mean_activation]

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(named_copy(param.norm(2), name + "_norm"))
        observables.append(
            named_copy(algorithm.gradients[param].norm(2),
                       name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource + '.hdf5')

    train_ds = H5PYDataset(
        datasource_fname,  #max_length=max_length,
        which_set='train',
        sources=('features', ),
        load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(
        datasource_fname,  #max_length=max_length,
        which_set='test',
        sources=('features', ),
        load_in_memory=True)
    test_stream = DataStream(test_ds,
                             iteration_scheme=SequentialScheme(
                                 test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(stream, label):
        itr = stream.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' %
              (label, batch_count, examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [
        Timing(every_n_batches=10),
        TrainingDataMonitoring(observables, prefix="train",
                               every_n_batches=10),
        DataStreamMonitoring(
            [cost],
            test_stream,
            prefix="test",
            on_resumption=True,
            after_epoch=False,  # by default this is True
            every_n_batches=100),
        # all monitored data is ready so print it...
        # (next steps may take more time and we want to see the
        # results as soon as possible so print as soon as you can)
        Printing(every_n_batches=10),
        # perform multiple dumps at different intervals
        # so if one of them breaks (has nan) we can hopefully
        # find a model from few batches ago in the other
        Dump(jobname, every_n_batches=11),
        Dump(jobname + '.test', every_n_batches=100),
        Sample(generator,
               steps=max_length,
               path=jobname + '.test',
               every_n_batches=100),
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs)
        # This shows a way to handle NaN emerging during
        # training: simply finish it.
        .add_condition("after_batch", _is_nan),
    ]

    if bokeh:
        extensions.append(Plot('sketch', channels=[
            ['cost'],
        ]))

    # Construct the main loop and start training!
    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
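
Example #2 maps _transpose over both streams and stops training on _is_nan, but neither helper is shown. Plausible sketches, assuming batches arrive batch-major and must become time-major, and that NaN is detected in the monitored training cost (both bodies are hypothetical):

import math
import numpy as np

def _transpose(data):
    # Hypothetical: swap the batch and time axes of every array in the batch.
    return tuple(np.swapaxes(array, 0, 1) for array in data)

def _is_nan(log):
    # Hypothetical: assumes the log row behaves like a dict holding the
    # channel written by TrainingDataMonitoring with prefix "train".
    return math.isnan(
        log.current_row.get('train_sequence_log_likelihood', 0.0))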
Example #3
def main(config, tr_stream, dev_stream):

    # Create Theano variables
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)
    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask)

    # Initialize model
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        enc_params = list(Selector(encoder.lookup).get_params().values())
        enc_params += list(Selector(encoder.fwd_fork).get_params().values())
        enc_params += list(Selector(encoder.back_fork).get_params().values())
        dec_params = list(Selector(
            decoder.sequence_generator.readout).get_params().values())
        dec_params += list(Selector(
            decoder.sequence_generator.fork).get_params().values())
        dec_params += list(Selector(
            decoder.transition.initial_transformer).get_params().values())
        cg = apply_noise(cg, enc_params + dec_params,
                         config['weight_noise_ff'])

    cost = cg.outputs[0]

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {!s:15}: {}'.format(shape, count))
    logger.info("Total number of parameter tensors: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_params(),
        Selector(decoder).get_params())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {!s:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameter tensors: {}".format(
        len(enc_dec_param_dict)))

    # Set up training algorithm
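    # NOTE: `args` below is not a parameter of main(); it is assumed here to
    # be a module-level object holding parsed command-line options.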
    if args.subtensor_fix:
        assert config['step_rule'] == 'AdaDelta'
        from subtensor_gradient import GradientDescent_SubtensorFix, AdaDelta_SubtensorFix, subtensor_params
        lookups = subtensor_params(cg, [
            encoder.lookup,
            decoder.sequence_generator.readout.feedback_brick.lookup
        ])
        algorithm = GradientDescent_SubtensorFix(
            subtensor_params=lookups,
            cost=cost,
            params=cg.parameters,
            step_rule=CompositeRule([
                StepClipping(config['step_clipping']),
                RemoveNotFinite(0.9),
                AdaDelta_SubtensorFix(subtensor_params=lookups)
            ]))
    else:
        algorithm = GradientDescent(cost=cost,
                                    params=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        RemoveNotFinite(0.9),
                                        eval(config['step_rule'])()
                                    ]))

    # Set up beam search and sampling computation graphs
    sampling_representation = encoder.apply(sampling_input,
                                            tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)
    samples, = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(ComputationGraph(
            generated[1]))  # generated[1] is the next_outputs

    # Set up training model
    training_model = Model(cost)

    # Set extensions
    extensions = [
        Sampler(model=search_model,
                config=config,
                data_stream=tr_stream,
                src_eos_idx=config['src_eos_idx'],
                trg_eos_idx=config['trg_eos_idx'],
                every_n_batches=config['sampling_freq']),
        BleuValidator(sampling_input,
                      samples=samples,
                      config=config,
                      model=search_model,
                      data_stream=dev_stream,
                      src_eos_idx=config['src_eos_idx'],
                      trg_eos_idx=config['trg_eos_idx'],
                      every_n_batches=config['bleu_val_freq']),
        TrainingDataMonitoring([cost], after_batch=True),
        #Plot('En-Fr', channels=[['decoder_cost_cost']],
        #     after_batch=True),
        Printing(after_batch=True),
        Dump(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Reload model if necessary
    if config['reload']:
        extensions += [LoadFromDumpWMT15(config['saveto'])]

    # Initialize main loop
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
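
Example #3 instantiates its step rule with eval(config['step_rule'])(). A sketch of an eval-free alternative using an explicit whitelist of Blocks step rules (make_step_rule is a hypothetical helper, not part of the example):

from blocks.algorithms import AdaDelta, AdaGrad, Adam, RMSProp, Scale

STEP_RULES = {'AdaDelta': AdaDelta, 'AdaGrad': AdaGrad, 'Adam': Adam,
              'RMSProp': RMSProp, 'Scale': Scale}

def make_step_rule(name):
    # Hypothetical replacement for eval(config['step_rule'])(): look the
    # class up in a whitelist instead of evaluating config text as code.
    try:
        return STEP_RULES[name]()
    except KeyError:
        raise ValueError('Unknown step rule: %s' % name)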
Example #4
def main(save, load, sample, path, **kwargs):
    input_dim = 784
    hidden_dim = 2
    batch_size = 100

    features = tensor.matrix('features')

    vae = VariationalAutoEncoder(input_dim, hidden_dim,
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0.))
    vae.initialize()

    mu, logsigma, x_hat = vae.apply(features)
    cost = vae.cost(features)
    cost.name = 'cost'
    regularization_cost = vae.regularization_cost(mu, logsigma).mean()
    regularization_cost.name = 'regularization_cost'
    reconstruction_cost = vae.reconstruction_cost(features, x_hat).mean()
    reconstruction_cost.name = 'reconstruction_cost'

    cg = ComputationGraph([cost, reconstruction_cost, regularization_cost])
    model = Model(cost)

    algorithm = GradientDescent(step_rule=RMSProp(1e-4), params=cg.parameters,
                                cost=cost)

    extensions = []
    if load:
        extensions.append(LoadFromDump(path))
    if save:
        extensions.append(Dump(path, after_epoch=True))
    extensions.append(FinishAfter(after_n_epochs=6001))

    train_dataset = MNIST('train', binary=False, sources=('features',))
    train_stream = DataStream(train_dataset,
                              iteration_scheme=ShuffledScheme(
                                  examples=train_dataset.num_examples,
                                  batch_size=batch_size))
    train_monitor = TrainingDataMonitoring(
        [cost, regularization_cost, reconstruction_cost],
        prefix='train', after_epoch=True)

    test_dataset = MNIST('test', binary=True, sources=('features',))
    test_stream = DataStream(test_dataset,
                             iteration_scheme=ShuffledScheme(
                                 examples=test_dataset.num_examples,
                                 batch_size=batch_size))
    test_monitor = DataStreamMonitoring([cost], test_stream, prefix='test')
    extensions.extend([train_monitor, test_monitor])
    extensions.extend([Timing(), Printing()])
    main_loop = MainLoop(model=model, algorithm=algorithm,
                         data_stream=train_stream,
                         extensions=extensions)
    if not sample:
        main_loop.run()
    else:
        parameters = load_parameter_values(path + '/params.npz')
        model.set_param_values(parameters)

        num_samples = 10
        samples = vae.sample(num_samples)
        samples = function([], samples)()
        z = tensor.matrix('z')
        decode_z = function([z], vae.decoder.apply(z))

        from matplotlib import pyplot as plt

        size = 40
        z_val = numpy.zeros((size ** 2, 2))
        for i in range(size):
            for j in range(size):
                z_val[i * size + j, :] = numpy.array(
                    [i / float(0.3 * size) - .5 / .3,
                     j / float(0.3 * size) - .5 / .3])
        samples = decode_z(z_val)
        samples = samples.reshape((size, size, 28, 28))
        samples = numpy.concatenate(samples, axis=1)
        samples = numpy.concatenate(samples, axis=1)
        plt.imshow(samples, cmap=plt.get_cmap('Greys'))
        plt.show()
        f = function([features], x_hat)
        for data in train_stream.get_epoch_iterator():
            data_hat = f(data[0])
            for image, image_hat in zip(data[0], data_hat):
                im = numpy.concatenate([image_hat.reshape((28, 28)),
                                        image.reshape((28, 28))])
                plt.imshow(im, cmap=plt.get_cmap('Greys'))
                plt.show()
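
The nested loop in Example #4 fills z_val with a uniform grid over the 2-D latent space. For reference, a vectorized sketch that builds the same grid (make_latent_grid is hypothetical; the scaling matches the loop above):

import numpy

def make_latent_grid(size):
    # Hypothetical vectorized version of the z_val loop above:
    # coords[i] = i / (0.3 * size) - 0.5 / 0.3 on both axes.
    coords = numpy.arange(size) / (0.3 * size) - 0.5 / 0.3
    ii, jj = numpy.meshgrid(coords, coords, indexing='ij')
    return numpy.column_stack([ii.ravel(), jj.ravel()])  # shape (size**2, 2)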
Example #5
def main(model_path, recurrent_type):
    dataset_options = dict(dictionary=char2code, level="character",
                           preprocess=_lower)
    dataset = OneBillionWord("training", [99], **dataset_options)
    data_stream = dataset.get_example_stream()
    data_stream = Filter(data_stream, _filter_long)
    data_stream = Mapping(data_stream, _make_target,
                          add_sources=('target',))
    data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(100))
    data_stream = Padding(data_stream)
    data_stream = Mapping(data_stream, _transpose)

    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    target = tensor.lmatrix('target')
    target_mask = tensor.matrix('target_mask')

    dim = 100
    lookup = LookupTable(len(all_chars), dim,
                         weights_init=IsotropicGaussian(0.01),
                         biases_init=Constant(0.))

    if recurrent_type == 'lstm':
        rnn = LSTM(dim // 4, Tanh(),
                   weights_init=IsotropicGaussian(0.01),
                   biases_init=Constant(0.))
        rnn_output_dim = dim // 4
    elif recurrent_type == 'simple':
        rnn = SimpleRecurrent(dim, Tanh())
        rnn = Bidirectional(rnn,
                            weights_init=IsotropicGaussian(0.01),
                            biases_init=Constant(0.))
        rnn_output_dim = 2 * dim
    else:
        raise ValueError('Unknown RNN type')
    rnn.initialize()
    lookup.initialize()
    y_hat = rnn.apply(lookup.apply(features), mask=features_mask)

    print(len(all_chars))
    linear = Linear(rnn_output_dim, len(all_chars),
                    weights_init=IsotropicGaussian(0.01),
                    biases_init=Constant(0.))
    linear.initialize()
    y_hat = linear.apply(y_hat)
    seq_length = y_hat.shape[0]
    batch_size = y_hat.shape[1]
    y_hat = Softmax().apply(
        y_hat.reshape((seq_length * batch_size, -1))).reshape(y_hat.shape)
    cost = CategoricalCrossEntropy().apply(
        target.flatten(),
        y_hat.reshape((-1, len(all_chars)))) * seq_length * batch_size
    cost.name = 'cost'
    cost_per_character = cost / features_mask.sum()
    cost_per_character.name = 'cost_per_character'

    cg = ComputationGraph([cost, cost_per_character])
    model = Model(cost)
    algorithm = GradientDescent(step_rule=Adam(), cost=cost,
                                params=cg.parameters)

    train_monitor = TrainingDataMonitoring(
        [cost, cost_per_character], prefix='train',
        after_batch=True)
    extensions = [train_monitor, Printing(every_n_batches=40),
                  Dump(model_path, every_n_batches=200),
                  #Checkpoint('rnn.pkl', every_n_batches=200)
                  ]
    main_loop = MainLoop(model=model, algorithm=algorithm,
                         data_stream=data_stream, extensions=extensions)
    main_loop.run()
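
Example #5 pipes the stream through four helpers that are not shown: _lower, _filter_long, _make_target, and _transpose. Plausible sketches, assuming character-level next-step prediction with a 100-character length cap (all four bodies are hypothetical):

import numpy

def _lower(s):
    # Hypothetical preprocessing: lowercase the raw text.
    return s.lower()

def _filter_long(data):
    # Hypothetical filter: keep only examples of at most 100 characters.
    return len(data[0]) <= 100

def _make_target(data):
    # Hypothetical target builder: the target is the input shifted one
    # step, i.e. the network predicts the next character.
    return (numpy.roll(data[0], -1),)

def _transpose(data):
    # Hypothetical: transpose padded (batch, time) arrays to time-major.
    return tuple(array.T for array in data)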