def run(discriminative_regularization=True):
    streams = create_celeba_streams(training_batch_size=100,
                                    monitoring_batch_size=500,
                                    include_targets=False)
    main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3]

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    rval = create_training_computation_graphs(discriminative_regularization)
    cg, bn_cg, variance_parameters = rval
    pop_updates = list(
        set(get_batch_normalization_updates(bn_cg, allow_duplicates=True)))
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    model = Model(bn_cg.outputs[0])
    selector = Selector(
        find_bricks(
            model.top_bricks,
            lambda brick: brick.name in ('encoder_convnet', 'encoder_mlp',
                                         'decoder_convnet', 'decoder_mlp')))
    parameters = list(selector.get_parameters().values()) + variance_parameters

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_cg.outputs[0],
                                parameters=parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    monitored_quantities_list = []
    for graph in [bn_cg, cg]:
        cost, kl_term, reconstruction_term = graph.outputs
        cost.name = 'nll_upper_bound'
        avg_kl_term = kl_term.mean(axis=0)
        avg_kl_term.name = 'avg_kl_term'
        avg_reconstruction_term = -reconstruction_term.mean(axis=0)
        avg_reconstruction_term.name = 'avg_reconstruction_term'
        monitored_quantities_list.append(
            [cost, avg_kl_term, avg_reconstruction_term])
    train_monitoring = DataStreamMonitoring(
        monitored_quantities_list[0], train_monitor_stream, prefix="train",
        updates=extra_updates, after_epoch=False, before_first_epoch=False,
        every_n_epochs=5)
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities_list[1], valid_monitor_stream, prefix="valid",
        after_epoch=False, before_first_epoch=False, every_n_epochs=5)

    # Prepare checkpoint
    save_path = 'celeba_vae_{}regularization.zip'.format(
        '' if discriminative_regularization else 'no_')
    checkpoint = Checkpoint(save_path, every_n_epochs=5, use_cpickle=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=75), train_monitoring,
                  valid_monitoring, checkpoint, Printing(), ProgressBar()]
    main_loop = MainLoop(data_stream=main_loop_stream,
                         algorithm=algorithm, extensions=extensions)
    main_loop.run()
Beispiel #2
0
def train_rnnrbm(train, rnnrbm, epochs=1000, test=None, bokeh=True,
                 load_path=None):
    cdk = theano.shared(10)
    lr = theano.shared(float32(0.004))

    cost, v_sample = rnnrbm.cost(examples=x, mask=x_mask, k=cdk)

    error_rate = MismulitclassificationRate().apply(x, v_sample[-1], x_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(x, v_sample[-1], x_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'rbm_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])
    step_rule = CompositeRule(
        [RemoveNotFinite(), StepClipping(30.0), Adam(learning_rate=lr), StepClipping(6.0),
         RemoveNotFinite()])  # Scale(0.01)
    gradients = dict(equizip(cg.parameters, T.grad(cost, cg.parameters, consider_constant=[v_sample])))
    algorithm = GradientDescent(step_rule=step_rule, gradients=gradients, cost=cost,
                                params=cg.parameters)
    algorithm.add_updates(cg.updates)
    extensions = [
        SharedVariableModifier(parameter=cdk,
                               function=lambda n, v: rnnrbm_cdk[n] if rnnrbm_cdk.get(n) else v),
        SharedVariableModifier(parameter=lr,
                               function=lambda n, v: float32(0.78 * v) if n % (200 * 5) == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [cost, error_rate, mistake_rate, ],  # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)],  #+ params,
            prefix="train",
            after_epoch=False, every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()]
    if test is not None:
        extensions.append(DataStreamMonitoring(
            [cost, error_rate, mistake_rate],
            data_stream=test,
            updates=cg.updates,
            prefix="test", after_epoch=False, every_n_batches=40))
    if bokeh:
        extensions.append(Plot(
            'Training RNN-RBM',
            channels=[
                ['train_error on note as a whole', 'train_single error within note',
                 'test_error on note as a whole',
                 'test_single error within note'],
                ['train_final_cost'],
                # ['train_total_gradient_norm'],
            ]))

    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions
                         )
    return main_loop
def test_gradient_descent_finds_inputs_additional_updates():
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    n = shared_floatx(1)
    m = tensor.scalar('m')
    algorithm = GradientDescent(gradients=OrderedDict([(W, W + 1)]))
    algorithm.add_updates([(n, n + m)])
    algorithm.initialize()
    assert m in algorithm.inputs
Beispiel #4
0
def test_gradient_descent_finds_inputs_additional_updates():
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    n = shared_floatx(1)
    m = tensor.scalar('m')
    algorithm = GradientDescent(gradients=OrderedDict([(W, W + 1)]))
    algorithm.add_updates([(n, n + m)])
    algorithm.initialize()
    assert m in algorithm.inputs
def create_main_loop(dataset, nvis, nhid, num_epochs, debug_level=0, lrate=1e-3):
    seed = 188229
    n_inference_steps = 6
    num_examples = dataset.num_examples
    batch_size = num_examples

    train_loop_stream = Flatten(
        DataStream.default_stream(
            dataset=dataset,
            iteration_scheme=SequentialScheme(dataset.num_examples, batch_size)  # Repeat(
            # , n_inference_steps)
            #            ShuffledScheme(dataset.num_examples, batch_size), n_inference_steps))
        ),
        which_sources=("features",),
    )

    model_brick = FivEM(
        nvis=nvis,
        nhid=nhid,
        epsilon=0.01,
        batch_size=batch_size,
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0),
        noise_scaling=1,
        debug=debug_level,
        lateral_x=False,
        lateral_h=False,
        n_inference_steps=n_inference_steps,
    )
    model_brick.initialize()

    x = tensor.matrix("features")

    cost = model_brick.cost(x)
    computation_graph = ComputationGraph([cost])
    model = Model(cost)
    # step_rule = Adam(learning_rate=2e-5, beta1=0.1, beta2=0.001, epsilon=1e-8,
    #                 decay_factor=(1 - 1e-8))
    step_rule = Momentum(learning_rate=lrate, momentum=0.95)
    # step_rule = AdaDelta()
    # step_rule = RMSProp(learning_rate=0.01)
    # step_rule = AdaGrad(learning_rate=1e-4)
    algorithm = GradientDescent(cost=cost, params=computation_graph.parameters, step_rule=step_rule)
    algorithm.add_updates(computation_graph.updates)

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        TrainingDataMonitoring([cost] + computation_graph.auxiliary_variables, after_batch=False, after_epoch=True),
        #                       every_n_epochs=1),
        Printing(after_epoch=True, after_batch=False),  # every_n_epochs=1,
        # Checkpoint(path="./fivem.zip",every_n_epochs=10,after_training=True)
    ]
    main_loop = MainLoop(model=model, data_stream=train_loop_stream, algorithm=algorithm, extensions=extensions)
    return main_loop
Beispiel #6
0
def run():
    streams = create_celeba_streams(training_batch_size=100,
                                    monitoring_batch_size=500,
                                    include_targets=True)
    main_loop_stream = streams[0]
    train_monitor_stream = streams[1]
    valid_monitor_stream = streams[2]

    cg, bn_dropout_cg = create_training_computation_graphs()

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    pop_updates = get_batch_normalization_updates(bn_dropout_cg)
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_dropout_cg.outputs[0],
                                parameters=bn_dropout_cg.parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    cost = bn_dropout_cg.outputs[0]
    cost.name = 'cost'
    train_monitoring = DataStreamMonitoring(
        [cost], train_monitor_stream, prefix="train",
        before_first_epoch=False, after_epoch=False, after_training=True,
        updates=extra_updates)

    cost, accuracy = cg.outputs
    cost.name = 'cost'
    accuracy.name = 'accuracy'
    monitored_quantities = [cost, accuracy]
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities, valid_monitor_stream, prefix="valid",
        before_first_epoch=False, after_epoch=False, every_n_epochs=5)

    # Prepare checkpoint
    checkpoint = Checkpoint(
        'celeba_classifier.zip', every_n_epochs=5, use_cpickle=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=50), train_monitoring,
                  valid_monitoring, checkpoint, Printing(), ProgressBar()]
    main_loop = MainLoop(data_stream=main_loop_stream, algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
def train_language_model(new_training_job, config, save_path, params,
                         fast_start, fuel_server, seed):
    c = config
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    data, lm, retrieval = initialize_data_and_model(config)

    # full main loop can be saved...
    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    # or only state (log + params) which can be useful not to pickle embeddings
    state_path = os.path.join(save_path, 'training_state.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')
    best_tar_path = os.path.join(save_path, "best_model.tar")

    words = tensor.ltensor3('words')
    words_mask = tensor.matrix('words_mask')
    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', batch_size=4,
                            max_length=5).get_epoch_iterator())
        words.tag.test_value = test_value_data[0]
        words_mask.tag.test_value = test_value_data[1]

    costs, updates = lm.apply(words, words_mask)
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            cg.set_parameter_values(load_parameters(src))

    length = rename(words.shape[1], 'length')
    perplexity, = VariableFilter(name='perplexity')(cg)
    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    monitored_vars = [length, cost] + perplexities
    if c['dict_path']:
        num_definitions, = VariableFilter(name='num_definitions')(cg)
        monitored_vars.extend([num_definitions])

    parameters = cg.get_parameter_dict()
    trained_parameters = parameters.values()
    saved_parameters = parameters.values()
    if c['embedding_path']:
        logger.debug("Exclude word embeddings from the trained parameters")
        trained_parameters = [
            p for p in trained_parameters
            if not p == lm.get_def_embeddings_params()
        ]
        saved_parameters = [
            p for p in saved_parameters
            if not p == lm.get_def_embeddings_params()
        ]

    if c['cache_size'] != 0:
        logger.debug("Enable fake recursivity for looking up embeddings")
        trained_parameters = [
            p for p in trained_parameters if not p == lm.get_cache_params()
        ]

    logger.info("Cost parameters" + "\n" + pprint.pformat([
        " ".join(
            (key, str(parameters[key].get_value().shape),
             'trained' if parameters[key] in trained_parameters else 'frozen'))
        for key in sorted(parameters.keys())
    ],
                                                          width=120))

    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum']))
    algorithm = GradientDescent(cost=cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    if c['cache_size'] != 0:
        algorithm.add_updates(updates)

    train_monitored_vars = list(monitored_vars)
    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)

    word_emb_RMS, = VariableFilter(name='word_emb_RMS')(cg)
    main_rnn_in_RMS, = VariableFilter(name='main_rnn_in_RMS')(cg)
    train_monitored_vars.extend([word_emb_RMS, main_rnn_in_RMS])

    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    # We use a completely random seed on purpose. With Fuel server
    # it's currently not possible to restore the state of the training
    # stream. That's why it's probably better to just have it stateless.
    stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None
    training_stream = data.get_stream('train',
                                      batch_size=c['batch_size'],
                                      max_length=c['max_length'],
                                      seed=stream_seed)
    valid_stream = data.get_stream('valid',
                                   batch_size=c['batch_size_valid'],
                                   max_length=c['max_length'],
                                   seed=stream_seed)
    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)

    validation = DataStreamMonitoring(monitored_vars,
                                      valid_stream,
                                      prefix="valid").set_conditions(
                                          before_first_epoch=not fast_start,
                                          on_resumption=True,
                                          every_n_batches=c['mon_freq_valid'])
    track_the_best = TrackTheBest(validation.record_name(perplexity),
                                  choose_best=min).set_conditions(
                                      on_resumption=True,
                                      after_epoch=True,
                                      every_n_batches=c['mon_freq_valid'])

    # don't save them the entire main loop to avoid pickling everything
    if c['fast_checkpoint']:
        load = (LoadNoUnpickling(state_path,
                                 load_iteration_state=True,
                                 load_log=True).set_conditions(
                                     before_training=not new_training_job))
        cp_args = {
            'save_main_loop': False,
            'save_separately': ['log', 'iteration_state'],
            'parameters': saved_parameters
        }

        checkpoint = Checkpoint(state_path,
                                before_training=not fast_start,
                                every_n_batches=c['save_freq_batches'],
                                after_training=not fast_start,
                                **cp_args)

        if c['checkpoint_every_n_batches']:
            intermediate_cp = IntermediateCheckpoint(
                state_path,
                every_n_batches=c['checkpoint_every_n_batches'],
                after_training=False,
                **cp_args)
    else:
        load = (Load(main_loop_path, load_iteration_state=True,
                     load_log=True).set_conditions(
                         before_training=not new_training_job))
        cp_args = {
            'save_separately': ['iteration_state'],
            'parameters': saved_parameters
        }

        checkpoint = Checkpoint(main_loop_path,
                                before_training=not fast_start,
                                every_n_batches=c['save_freq_batches'],
                                after_training=not fast_start,
                                **cp_args)

        if c['checkpoint_every_n_batches']:
            intermediate_cp = IntermediateCheckpoint(
                main_loop_path,
                every_n_batches=c['checkpoint_every_n_batches'],
                after_training=False,
                **cp_args)

    checkpoint = checkpoint.add_condition(
        ['after_batch', 'after_epoch'],
        OnLogRecord(track_the_best.notification_name), (best_tar_path, ))

    extensions = [
        load,
        StartFuelServer(original_training_stream,
                        stream_path,
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq_train'])
    ]

    if retrieval:
        extensions.append(
            RetrievalPrintStats(retrieval=retrieval,
                                every_n_batches=c['mon_freq_train'],
                                before_training=not fast_start))

    extensions.extend([
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']),
        validation, track_the_best, checkpoint
    ])
    if c['checkpoint_every_n_batches']:
        extensions.append(intermediate_cp)
    extensions.extend([
        DumpTensorflowSummaries(save_path,
                                every_n_batches=c['mon_freq_train'],
                                after_training=True),
        Printing(on_resumption=True, every_n_batches=c['mon_freq_train']),
        FinishIfNoImprovementAfter(track_the_best.notification_name,
                                   iterations=50 * c['mon_freq_valid'],
                                   every_n_batches=c['mon_freq_valid']),
        FinishAfter(after_n_batches=c['n_batches'])
    ])

    logger.info("monitored variables during training:" + "\n" +
                pprint.pformat(train_monitored_vars, width=120))
    logger.info("monitored variables during valid:" + "\n" +
                pprint.pformat(monitored_vars, width=120))

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
Beispiel #8
0
if normalization == 'bn2':
    for m,s,var in statistics_list:
        parameters.extend([m,s])

algorithm = GradientDescent(
    cost=cost, parameters=parameters, step_rule=Adam(0.01))

#update the M and S with batch statistics
alpha = 0.1
updates = []
if normalization == 'bn2':
    for m,s,var in statistics_list:
        updates.append((m, cast(alpha*m + (1-alpha)*var.mean(axis=0), floatX)))
        updates.append((s, cast(alpha*s + (1-alpha)*var.std(axis=0) , floatX)))

algorithm.add_updates(updates)
# Since this line wont work with the extension to include parameters
# in the gradient descent. Here's an extension that will do the job.

from blocks.extensions import SimpleExtension
from theano import function

class UpdateExtraParams(SimpleExtension):
    """Adjusts shared variable parameter using some function.
    """
    def __init__(self, input1, input2, updates, **kwargs):
        kwargs.setdefault("after_batch", True)
        super(UpdateExtraParams, self).__init__(**kwargs)
        self.updates = updates
        self.func = function([input1, input2],[],
            updates = updates,
def main():
    nclasses = 27

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--length", type=int, default=180)
    parser.add_argument("--num-epochs", type=int, default=100)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=1e-3)
    parser.add_argument("--epsilon", type=float, default=1e-5)
    parser.add_argument("--num-hidden", type=int, default=1000)
    parser.add_argument("--baseline", action="store_true")
    parser.add_argument("--initialization", choices="identity glorot orthogonal uniform".split(), default="identity")
    parser.add_argument("--initial-gamma", type=float, default=1e-1)
    parser.add_argument("--initial-beta", type=float, default=0)
    parser.add_argument("--cluster", action="store_true")
    parser.add_argument("--activation", choices=list(activations.keys()), default="tanh")
    parser.add_argument("--optimizer", choices="sgdmomentum adam rmsprop", default="rmsprop")
    parser.add_argument("--continue-from")
    parser.add_argument("--evaluate")
    parser.add_argument("--dump-hiddens")
    args = parser.parse_args()

    np.random.seed(args.seed)
    blocks.config.config.default_seed = args.seed

    if args.continue_from:
        from blocks.serialization import load

        main_loop = load(args.continue_from)
        main_loop.run()
        sys.exit(0)

    graphs, extensions, updates = construct_graphs(args, nclasses)

    ### optimization algorithm definition
    if args.optimizer == "adam":
        optimizer = Adam(learning_rate=args.learning_rate)
    elif args.optimizer == "rmsprop":
        optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9)
    elif args.optimizer == "sgdmomentum":
        optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99)
    step_rule = CompositeRule([StepClipping(1.0), optimizer])
    algorithm = GradientDescent(
        cost=graphs["training"].outputs[0], parameters=graphs["training"].parameters, step_rule=step_rule
    )
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]

    # step monitor
    step_channels = []
    step_channels.extend(
        [
            algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
            for name, param in model.get_parameter_dict().items()
        ]
    )
    step_channels.append(algorithm.total_step_norm.copy(name="total_step_norm"))
    step_channels.append(algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
    step_channels.extend(graphs["training"].outputs)
    logger.warning("constructing training data monitor")
    extensions.append(TrainingDataMonitoring(step_channels, prefix="iteration", after_batch=True))

    # parameter monitor
    extensions.append(
        DataStreamMonitoring(
            [param.norm(2).copy(name="parameter.norm:%s" % name) for name, param in model.get_parameter_dict().items()],
            data_stream=None,
            after_epoch=True,
        )
    )

    validation_interval = 500
    # performance monitor
    for situation in "training inference".split():
        if situation == "inference" and not args.evaluate:
            # save time when we don't need the inference graph
            continue

        for which_set in "train valid test".split():
            logger.warning("constructing %s %s monitor" % (which_set, situation))
            channels = list(graphs[situation].outputs)
            extensions.append(
                DataStreamMonitoring(
                    channels,
                    prefix="%s_%s" % (which_set, situation),
                    every_n_batches=validation_interval,
                    data_stream=get_stream(
                        which_set=which_set, batch_size=args.batch_size, num_examples=10000, length=args.length
                    ),
                )
            )

    extensions.extend(
        [
            TrackTheBest("valid_training_error_rate", "best_valid_training_error_rate"),
            DumpBest("best_valid_training_error_rate", "best.zip"),
            FinishAfter(after_n_epochs=args.num_epochs),
            # FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50),
            Checkpoint("checkpoint.zip", on_interrupt=False, every_n_epochs=1, use_cpickle=True),
            DumpLog("log.pkl", after_epoch=True),
        ]
    )

    if not args.cluster:
        extensions.append(ProgressBar())

    extensions.extend([Timing(), Printing(every_n_batches=validation_interval), PrintingTo("log")])
    main_loop = MainLoop(
        data_stream=get_stream(which_set="train", batch_size=args.batch_size, length=args.length, augment=True),
        algorithm=algorithm,
        extensions=extensions,
        model=model,
    )

    if args.dump_hiddens:
        dump_hiddens(args, main_loop)
        return

    if args.evaluate:
        evaluate(args, main_loop)
        return

    main_loop.run()
Beispiel #10
0
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs,
          test_cost, experiment_path, initialization, init_width, weight_noise,
          z_prob, z_prob_states, z_prob_cells, drop_prob_igates,
          ogates_zoneout, batch_size, stoch_depth, share_mask, gaussian_drop,
          rnn_type, num_layers, norm_cost_coeff, penalty, testing, seq_len,
          decrease_lr_after_epoch, lr_decay, **kwargs):

    print '.. PTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    ###########################################
    #
    # LOAD DATA
    #
    ###########################################

    def onehot(x, numclasses=None):
        """ Convert integer encoding for class-labels (starting with 0 !)
            to one-hot encoding.
            The output is an array whose shape is the shape of the input array
            plus an extra dimension, containing the 'one-hot'-encoded labels.
        """
        if x.shape == ():
            x = x[None]
        if numclasses is None:
            numclasses = x.max() + 1
        result = numpy.zeros(list(x.shape) + [numclasses], dtype="int")
        z = numpy.zeros(x.shape, dtype="int")
        for c in range(numclasses):
            z *= 0
            z[numpy.where(x == c)] = 1
            result[..., c] += z
        return result.astype(theano.config.floatX)

    alphabetsize = 10000
    data = np.load('penntree_char_and_word.npz')
    trainset = data['train_words']
    validset = data['valid_words']
    testset = data['test_words']

    if testing:
        trainset = trainset[:3000]
        validset = validset[:3000]

    if share_mask:
        if not z_prob:
            raise ValueError('z_prob must be provided when using share_mask')
        if z_prob_cells or z_prob_states:
            raise ValueError(
                'z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)'
            )
        z_prob_cells = z_prob
        # we don't want to actually use these masks, so this is to debug
        z_prob_states = None
    else:
        if z_prob:
            raise ValueError('z_prob is only used with share_mask')
        z_prob_cells = z_prob_cells or '1'
        z_prob_states = z_prob_states or '1'


#    rng = np.random.RandomState(seed)

###########################################
#
# MAKE STREAMS
#
###########################################

    def prep_dataset(dataset):
        dataset = dataset[:(len(dataset) - (len(dataset) %
                                            (seq_len * batch_size)))]
        dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))

        stream = DataStream(
            IndexableDataset(indexables=OrderedDict([('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells,
                                   drop_prob_igates, layer_size, num_layers,
                                   False, stoch_depth, share_mask,
                                   gaussian_drop, alphabetsize)
        stream.sources = ('data', ) * 3 + stream.sources + (
            'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream, )

    train_stream, = prep_dataset(trainset)
    valid_stream, = prep_dataset(validset)
    test_stream, = prep_dataset(testset)

    ####################

    data = train_stream.get_epoch_iterator(as_dict=True).next()

    ####################

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print '.. building model'

    x = T.tensor3('data')
    y = x
    zoneouts_states = T.tensor3('zoneouts_states')
    zoneouts_cells = T.tensor3('zoneouts_cells')
    zoneouts_igates = T.tensor3('zoneouts_igates')

    x.tag.test_value = data['data']
    zoneouts_states.tag.test_value = data['zoneouts_states']
    zoneouts_cells.tag.test_value = data['zoneouts_cells']
    zoneouts_igates.tag.test_value = data['zoneouts_igates']

    if init_width and not initialization == 'uniform':
        raise ValueError('Width is only for uniform init, whassup?')

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=init_width)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    if rnn_type.lower() == 'lstm':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size * 4,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropLSTM(dim=layer_size,
                     weights_init=weights_init,
                     activation=Tanh(),
                     model_type=6,
                     name='rnn%d' % l,
                     ogates_zoneout=ogates_zoneout) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'gru':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size * 3,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropGRU(dim=layer_size,
                    weights_init=weights_init,
                    activation=Tanh(),
                    name='rnn%d' % l) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'srnn':  # FIXME!!! make ReLU
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropSimpleRecurrent(dim=layer_size,
                                weights_init=weights_init,
                                activation=Rectifier(),
                                name='rnn%d' % l) for l in range(num_layers)
        ]
    else:
        raise NotImplementedError

    hid_to_out = Linear(layer_size,
                        alphabetsize,
                        name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    for layer in in_to_hids:
        layer.initialize()
    for layer in recurrent_layers:
        layer.initialize()
    hid_to_out.initialize()

    layer_input = x  #in_to_hid.apply(x)

    init_updates = OrderedDict()
    for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)):
        rnn_embedding = in_to_hid.apply(layer_input)
        if rnn_type.lower() == 'lstm':
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            cells_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name, cells_init.name = "states_init", "cells_init"
            states, cells = layer.apply(
                rnn_embedding,
                zoneouts_states[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_cells[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_igates[:, :, l * layer_size:(l + 1) * layer_size],
                states_init, cells_init)
            init_updates.update([(states_init, states[-1]),
                                 (cells_init, cells[-1])])
        elif rnn_type.lower() in ['gru', 'srnn']:
            # untested!
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name = "states_init"
            states = layer.apply(rnn_embedding, zoneouts_states,
                                 zoneouts_igates, states_init)
            init_updates.update([(states_init, states[-1])])
        else:
            raise NotImplementedError
        layer_input = states

    y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1]))
    shape_ = y_hat_pre_softmax.shape
    y_hat = Softmax().apply(y_hat_pre_softmax.reshape((-1, alphabetsize)))

    ####################

    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################

    cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)),
                                           y_hat).copy('cost')

    bpc = (cost / np.log(2.0)).copy(name='bpr')
    perp = T.exp(cost).copy(name='perp')

    cost_train = cost.copy(name='train_cost')
    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # NORM STABILIZER
    #
    ###########################################
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        return T.sqrt(
            T.maximum(T.sqr(x).sum(axis=axis),
                      numpy.finfo(x.dtype).tiny))

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(
                T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
    elif penalty == 'hids':
        for l in range(num_layers):
            assert 'rnn%d_apply_states' % l in [
                o.name
                for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)
            ]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            for l in range(num_layers):
                if output.name == 'rnn%d_apply_states' % l:
                    norms = _magnitude(output)
                    norm_cost += T.mean(
                        T.sum((norms[1:] - norms[:-1])**2, axis=0) /
                        (seq_len - 1))

    norm_cost.name = 'norm_cost'
    #cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy(
        'cost_train')  #should this be cost_train.outputs[0]? no.

    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # WEIGHT NOISE
    #
    ###########################################

    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')

    model = Model(cost_train)

    learning_rate = float(learning_rate)
    clipping = StepClipping(threshold=np.cast[floatX](clipping))
    if algorithm == 'adam':
        adam = Adam(learning_rate=learning_rate)
        learning_rate = adam.learning_rate
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        learning_rate = rms_prop.learning_rate
        step_rule = CompositeRule([clipping, rms_prop])
    elif algorithm == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        learning_rate = sgd_momentum.learning_rate
        step_rule = CompositeRule([clipping, sgd_momentum])
    elif algorithm == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        learning_rate = sgd.learning_rate
        step_rule = CompositeRule([clipping, sgd])
    else:
        raise NotImplementedError
    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
    # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)})

    algorithm.add_updates(init_updates)

    def cond_number(x):
        _, _, sing_vals = T.nlinalg.svd(x, True, True)
        sing_mags = abs(sing_vals)
        return T.max(sing_mags) / T.min(sing_mags)

    def rms(x):
        return (x * x).mean().sqrt()

    whysplode_cond = []
    whysplode_rms = []
    for i, p in enumerate(init_updates):
        v = p.get_value()
        if p.get_value().shape == 2:
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str,
                                             p.get_value().shape)))))
        whysplode_rms.append(
            rms(p).copy('ini%d:%s_rms(%s)' %
                        (i, p.name, "x".join(map(str,
                                                 p.get_value().shape)))))
    for i, p in enumerate(cg_train.parameters):
        v = p.get_value()
        if p.get_value().shape == 2:
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str,
                                             p.get_value().shape)))))
        whysplode_rms.append(
            rms(p).copy('ini%d:%s_rms(%s)' %
                        (i, p.name, "x".join(map(str,
                                                 p.get_value().shape)))))

    observed_vars = [
        cost_train, cost, bpc, perp, learning_rate,
        aggregation.mean(
            algorithm.total_gradient_norm).copy("gradient_norm_mean")
    ]  # + whysplode_rms

    parameters = model.get_parameter_dict()
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name=name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm"))

    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

    dev_inits = [p.clone() for p in init_updates]
    cg_dev = ComputationGraph([cost, bpc, perp] +
                              init_updates.values()).replace(
                                  zip(init_updates.keys(), dev_inits))
    dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3]
    dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:]))

    dev_monitor = DataStreamMonitoring(variables=[dev_cost, dev_bpc, dev_perp],
                                       data_stream=valid_stream,
                                       prefix="dev",
                                       updates=dev_init_updates)

    # noone does this
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

    extensions = []
    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])
    if test_cost:
        test_inits = [p.clone() for p in init_updates]
        cg_test = ComputationGraph([cost, bpc, perp] +
                                   init_updates.values()).replace(
                                       zip(init_updates.keys(), test_inits))
        test_cost, test_bpc, test_perp = cg_test.outputs[:3]
        test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:]))

        test_monitor = DataStreamMonitoring(
            variables=[test_cost, test_bpc, test_perp],
            data_stream=test_stream,
            prefix="test",
            updates=test_init_updates)
        extensions.extend([test_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(
        SaveParams('dev_cost', model, experiment_path, every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    class RollsExtension(TrainingExtension):
        """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """
        def __init__(self, shvars):
            self.shvars = shvars

        def before_epoch(self):
            for v in self.shvars:
                v.set_value(np.roll(v.get_value(), 1, 0))

    extensions.append(
        RollsExtension(init_updates.keys() + dev_init_updates.keys() +
                       (test_init_updates.keys() if test_cost else [])))

    class LearningRateSchedule(TrainingExtension):
        """ Lets you set a number to divide learning rate by each epoch + when to start doing that """
        def __init__(self):
            self.epoch_number = 0

        def after_epoch(self):
            self.epoch_number += 1
            if self.epoch_number > decrease_lr_after_epoch:
                learning_rate.set_value(learning_rate.get_value() / lr_decay)

    if bool(lr_decay) != bool(decrease_lr_after_epoch):
        raise ValueError(
            'Need to define both lr_decay and decrease_lr_after_epoch')
    if lr_decay and decrease_lr_after_epoch:
        extensions.append(LearningRateSchedule())

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)

    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
def train(algorithm, learning_rate, clipping, momentum,
          layer_size, epochs, test_cost, experiment_path,
          initialization, init_width, weight_noise, z_prob,
          z_prob_states, z_prob_cells, drop_prob_igates, ogates_zoneout,
          batch_size, stoch_depth, share_mask, gaussian_drop, rnn_type,
          num_layers, norm_cost_coeff, penalty, testing, seq_len, 
          decrease_lr_after_epoch, lr_decay,
          **kwargs):

    print '.. PTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    ###########################################
    #
    # LOAD DATA
    #
    ###########################################

    def onehot(x, numclasses=None):
        """ Convert integer encoding for class-labels (starting with 0 !)
            to one-hot encoding.
            The output is an array whose shape is the shape of the input array
            plus an extra dimension, containing the 'one-hot'-encoded labels.
        """
        if x.shape == ():
            x = x[None]
        if numclasses is None:
            numclasses = x.max() + 1
        result = numpy.zeros(list(x.shape) + [numclasses], dtype="int")
        z = numpy.zeros(x.shape, dtype="int")
        for c in range(numclasses):
            z *= 0
            z[numpy.where(x == c)] = 1
            result[..., c] += z
        return result.astype(theano.config.floatX)

    alphabetsize = 10000
    data = np.load('penntree_char_and_word.npz')
    trainset = data['train_words']
    validset = data['valid_words']
    testset = data['test_words']

    if testing:
        trainset = trainset[:3000]
        validset = validset[:3000]

    if share_mask:
        if not z_prob:
            raise ValueError('z_prob must be provided when using share_mask')
        if z_prob_cells or z_prob_states:
            raise ValueError('z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)')
        z_prob_cells = z_prob
        # we don't want to actually use these masks, so this is to debug
        z_prob_states = None
    else:
        if z_prob:
            raise ValueError('z_prob is only used with share_mask')
        z_prob_cells = z_prob_cells or '1'
        z_prob_states = z_prob_states or '1'

#    rng = np.random.RandomState(seed)



    ###########################################
    #
    # MAKE STREAMS
    #
    ###########################################

    def prep_dataset(dataset):
        dataset = dataset[:(len(dataset) - (len(dataset) % (seq_len * batch_size)))]
        dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))


        stream = DataStream(IndexableDataset(indexables=OrderedDict([
            ('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(
          stream, z_prob_states, z_prob_cells, drop_prob_igates,
          layer_size, num_layers, False, stoch_depth, share_mask,
          gaussian_drop, alphabetsize)
        stream.sources = ('data',) * 3 + stream.sources + ('zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream,)
    train_stream, = prep_dataset(trainset)
    valid_stream, = prep_dataset(validset)
    test_stream, = prep_dataset(testset)


    ####################


    data = train_stream.get_epoch_iterator(as_dict=True).next()


    ####################


    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print '.. building model'

    x = T.tensor3('data')
    y = x
    zoneouts_states = T.tensor3('zoneouts_states')
    zoneouts_cells = T.tensor3('zoneouts_cells')
    zoneouts_igates = T.tensor3('zoneouts_igates')

    x.tag.test_value = data['data']
    zoneouts_states.tag.test_value = data['zoneouts_states']
    zoneouts_cells.tag.test_value = data['zoneouts_cells']
    zoneouts_igates.tag.test_value = data['zoneouts_igates']

    if init_width and not initialization == 'uniform':
        raise ValueError('Width is only for uniform init, whassup?')

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=init_width)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    if rnn_type.lower() == 'lstm':
        in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size*4, name='in_to_hid%d'%l,
                           weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)]
        recurrent_layers = [DropLSTM(dim=layer_size, weights_init=weights_init, activation=Tanh(), model_type=6, name='rnn%d'%l, ogates_zoneout=ogates_zoneout) for l in range(num_layers)]
    elif rnn_type.lower() == 'gru':
        in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size*3, name='in_to_hid%d'%l,
                           weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)]
        recurrent_layers = [DropGRU(dim=layer_size, weights_init=weights_init, activation=Tanh(), name='rnn%d'%l) for l in range(num_layers)]
    elif rnn_type.lower() == 'srnn':  # FIXME!!! make ReLU
        in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size, name='in_to_hid%d'%l,
                           weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)]
        recurrent_layers = [DropSimpleRecurrent(dim=layer_size, weights_init=weights_init, activation=Rectifier(), name='rnn%d'%l) for l in range(num_layers)]
    else:
        raise NotImplementedError

    hid_to_out = Linear(layer_size, alphabetsize, name='hid_to_out',
                        weights_init=weights_init, biases_init=Constant(0.0))

    for layer in in_to_hids:
        layer.initialize()
    for layer in recurrent_layers:
        layer.initialize()
    hid_to_out.initialize()

    layer_input = x #in_to_hid.apply(x)

    init_updates = OrderedDict()
    for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)):
        rnn_embedding = in_to_hid.apply(layer_input)
        if rnn_type.lower() == 'lstm':
            states_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX))
            cells_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name, cells_init.name = "states_init", "cells_init"
            states, cells = layer.apply(rnn_embedding,
                                        zoneouts_states[:, :, l * layer_size : (l + 1) * layer_size],
                                        zoneouts_cells[:, :, l * layer_size : (l + 1) * layer_size],
                                        zoneouts_igates[:, :, l * layer_size : (l + 1) * layer_size],
                                        states_init,
                                        cells_init)
            init_updates.update([(states_init, states[-1]), (cells_init, cells[-1])])
        elif rnn_type.lower() in ['gru', 'srnn']:
            # untested!
            states_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name = "states_init"
            states = layer.apply(rnn_embedding, zoneouts_states, zoneouts_igates, states_init)
            init_updates.update([(states_init, states[-1])])
        else:
            raise NotImplementedError
        layer_input = states

    y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1]))
    shape_ = y_hat_pre_softmax.shape
    y_hat = Softmax().apply(
        y_hat_pre_softmax.reshape((-1, alphabetsize)))

    ####################


    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################

    cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)), y_hat).copy('cost')

    bpc = (cost/np.log(2.0)).copy(name='bpr')
    perp = T.exp(cost).copy(name='perp')


    cost_train = cost.copy(name='train_cost')
    cg_train = ComputationGraph([cost_train])


    ###########################################
    #
    # NORM STABILIZER
    #
    ###########################################
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        return T.sqrt(T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny))

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
    elif penalty == 'hids':
        for l in range(num_layers):
            assert 'rnn%d_apply_states'%l in [o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            for l in range(num_layers):
                if output.name == 'rnn%d_apply_states'%l:
                    norms = _magnitude(output)
                    norm_cost += T.mean(T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))

    norm_cost.name = 'norm_cost'
    #cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy('cost_train') #should this be cost_train.outputs[0]? no.

    cg_train = ComputationGraph([cost_train])


    ###########################################
    #
    # WEIGHT NOISE
    #
    ###########################################

    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')

    model = Model(cost_train)

    learning_rate = float(learning_rate)
    clipping = StepClipping(threshold=np.cast[floatX](clipping))
    if algorithm == 'adam':
        adam = Adam(learning_rate=learning_rate)
        learning_rate = adam.learning_rate
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        learning_rate = rms_prop.learning_rate
        step_rule = CompositeRule([clipping, rms_prop])
    elif algorithm == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        learning_rate = sgd_momentum.learning_rate
        step_rule = CompositeRule([clipping, sgd_momentum])
    elif algorithm == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        learning_rate = sgd.learning_rate
        step_rule = CompositeRule([clipping, sgd])
    else:
        raise NotImplementedError
    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
                                # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)})

    algorithm.add_updates(init_updates)

    def cond_number(x):
        _, _, sing_vals = T.nlinalg.svd(x, True, True)
        sing_mags = abs(sing_vals)
        return T.max(sing_mags) / T.min(sing_mags)
    def rms(x):
        return (x*x).mean().sqrt()

    whysplode_cond = []
    whysplode_rms = []
    for i, p in enumerate(init_updates):
        v = p.get_value()
        if p.get_value().shape == 2:
            whysplode_cond.append(cond_number(p).copy('ini%d:%s_cond(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape)))))
        whysplode_rms.append(rms(p).copy('ini%d:%s_rms(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape)))))
    for i, p in enumerate(cg_train.parameters):
        v = p.get_value()
        if p.get_value().shape == 2:
            whysplode_cond.append(cond_number(p).copy('ini%d:%s_cond(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape)))))
        whysplode_rms.append(rms(p).copy('ini%d:%s_rms(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape)))))

    observed_vars = [cost_train, cost, bpc, perp, learning_rate,
                     aggregation.mean(algorithm.total_gradient_norm).copy("gradient_norm_mean")] # + whysplode_rms

    parameters = model.get_parameter_dict()
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name=name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm"))
    
    train_monitor = TrainingDataMonitoring(
        variables=observed_vars,
        prefix="train", after_epoch=True
    )

    dev_inits = [p.clone() for p in init_updates]
    cg_dev = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace(zip(init_updates.keys(), dev_inits))
    dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3]
    dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:]))

    dev_monitor = DataStreamMonitoring(
        variables=[dev_cost, dev_bpc, dev_perp],
        data_stream=valid_stream, prefix="dev",
        updates=dev_init_updates
    )

    # noone does this
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name
    
    extensions = []
    extensions.extend([FinishAfter(after_n_epochs=epochs),
                       train_monitor, dev_monitor])
    if test_cost:
        test_inits = [p.clone() for p in init_updates]
        cg_test = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace(zip(init_updates.keys(), test_inits))
        test_cost, test_bpc, test_perp = cg_test.outputs[:3]
        test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:]))

        test_monitor = DataStreamMonitoring(
            variables=[test_cost, test_bpc, test_perp],
            data_stream=test_stream,
            prefix="test",
            updates=test_init_updates
        )
        extensions.extend([test_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(SaveParams('dev_cost', model, experiment_path,
                                 every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    class RollsExtension(TrainingExtension):
        """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """
        def __init__(self, shvars):
            self.shvars = shvars
        def before_epoch(self):
            for v in self.shvars:
                v.set_value(np.roll(v.get_value(), 1, 0))
    extensions.append(RollsExtension(init_updates.keys() + dev_init_updates.keys() + (test_init_updates.keys() if test_cost else [])))

    class LearningRateSchedule(TrainingExtension):
        """ Lets you set a number to divide learning rate by each epoch + when to start doing that """
        def __init__(self):
            self.epoch_number = 0
        def after_epoch(self):
            self.epoch_number += 1
            if self.epoch_number > decrease_lr_after_epoch:
                learning_rate.set_value(learning_rate.get_value()/lr_decay)
    if bool(lr_decay) != bool(decrease_lr_after_epoch):
        raise ValueError('Need to define both lr_decay and decrease_lr_after_epoch')
    if lr_decay and decrease_lr_after_epoch:
        extensions.append(LearningRateSchedule())


    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)

    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
Beispiel #12
0
def construct_main_loop(name, task_name, batch_size, max_epochs,
                        patience_epochs, learning_rate, hyperparameters,
                        **kwargs):
    task = tasks.get_task(**hyperparameters)
    hyperparameters["n_channels"] = task.n_channels

    extensions = []

    print "constructing graphs..."
    graphs, outputs, updates = construct_graphs(task=task, **hyperparameters)

    print "setting up main loop..."

    from blocks.model import Model
    model = Model(outputs["train"]["cost"])

    from blocks.algorithms import GradientDescent, CompositeRule, StepClipping, Adam
    algorithm = GradientDescent(cost=outputs["train"]["cost"],
                                parameters=graphs["train"].parameters,
                                step_rule=CompositeRule([
                                    StepClipping(1e1),
                                    Adam(learning_rate=learning_rate),
                                    StepClipping(1e2)
                                ]),
                                on_unused_sources="warn")
    algorithm.add_updates(updates["train"])

    extensions.extend(
        construct_monitors(algorithm=algorithm,
                           task=task,
                           model=model,
                           graphs=graphs,
                           outputs=outputs,
                           **hyperparameters))

    from blocks.extensions import FinishAfter, Printing, ProgressBar, Timing
    from blocks.extensions.stopping import FinishIfNoImprovementAfter
    from blocks.extensions.training import TrackTheBest
    from blocks.extensions.saveload import Checkpoint
    from dump import DumpBest, LightCheckpoint, PrintingTo
    extensions.extend([
        TrackTheBest("valid_error_rate", "best_valid_error_rate"),
        FinishIfNoImprovementAfter("best_valid_error_rate",
                                   epochs=patience_epochs),
        FinishAfter(after_n_epochs=max_epochs),
        DumpBest("best_valid_error_rate", name + "_best.zip"),
        Checkpoint(hyperparameters["checkpoint_save_path"],
                   on_interrupt=False,
                   every_n_epochs=5,
                   before_training=True,
                   use_cpickle=True),
        ProgressBar(),
        Timing(),
        Printing(),
        PrintingTo(name + "_log")
    ])

    from blocks.main_loop import MainLoop
    main_loop = MainLoop(data_stream=task.get_stream("train"),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    # note blocks will crash and burn because it cannot deal with an
    # already-initialized Algorithm, so this should be enabled only for
    # debugging
    if False:
        with open("graph", "w") as graphfile:
            algorithm.initialize()
            theano.printing.debugprint(algorithm._function, file=graphfile)

    from tabulate import tabulate
    print "parameter sizes:"
    print tabulate(
        (key, "x".join(map(str,
                           value.get_value().shape)), value.get_value().size)
        for key, value in main_loop.model.get_parameter_dict().items())

    return main_loop
# param_nans = 0
# for i, p in enumerate(params):
# # cost += reg * abs(p).sum()
# cost += reg * (p ** 2).sum()
# param_nans += T.isnan(p).sum()
# name = params[i].name
#     params[i] = params[i].mean()
#     params[i].name = name + str(i)
cost.name = "final_cost"

# param_nans.name = 'params_nans'

## Training algorithm

# algorithm.initialize()
algorithm.add_updates(cg.updates)

extensions = [
    FinishAfter(after_n_epochs=5000),
    DataStreamMonitoring([cost, error_rate, mistake_rate],
                         data_stream=test,
                         updates=cg.updates,
                         prefix="test"),
    TrainingDataMonitoring(
        [
            cost,  # hidden_states, debug_val, param_nans,
            aggregation.mean(algorithm.total_gradient_norm)
        ],  #+ params,
        prefix="train",
        after_epoch=True),
    Timing(),
Beispiel #14
0
def train(cli_params):
    cli_params['save_dir'] = prepare_dir(cli_params['save_to'])
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data, whiten, cnorm = setup_data(p, test_set=False)
    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim, ) + p.encoder_layers

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    training_algorithm = GradientDescent(
        cost=ladder.costs.total,
        parameters=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    short_prints = {
        "train": {
            'T_C_class': ladder.costs.class_corr,
            'T_C_de': ladder.costs.denois.values(),
        },
        "valid_approx":
        OrderedDict([
            ('V_C_class', ladder.costs.class_clean),
            ('V_E', ladder.error.clean),
            ('V_C_de', ladder.costs.denois.values()),
        ]),
        "valid_error":
        OrderedDict([
            ('VE_C_class', ladder.costs.class_clean),
            ('VE_E', ladder.error.clean),
            ('VE_C_de', ladder.costs.denois.values()),
        ]),
    }

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(data.train,
                        data.train_ind,
                        p.batch_size,
                        n_labeled=p.labeled_samples,
                        n_unlabeled=p.unlabeled_samples,
                        whiten=whiten,
                        cnorm=cnorm),
        model=Model(ladder.costs.total),
        extensions=[
            #FinishAfter(after_n_epochs=p.num_epochs),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            #ApproxTestMonitoring(
            #    [ladder.costs.class_clean, ladder.error.clean]
            #    + ladder.costs.denois.values(),
            #    make_datastream(data.valid, data.valid_ind,
            #                    p.valid_batch_size, whiten=whiten, cnorm=cnorm,
            #                    scheme=ShuffledScheme),
            #    prefix="valid_approx"),

            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean] +
                ladder.costs.denois.values(),
                make_datastream(data.train,
                                data.train_ind,
                                p.batch_size,
                                n_labeled=p.labeled_samples,
                                whiten=whiten,
                                cnorm=cnorm,
                                scheme=ShuffledScheme),
                make_datastream(data.valid,
                                data.valid_ind,
                                p.valid_batch_size,
                                n_labeled=len(data.valid_ind),
                                whiten=whiten,
                                cnorm=cnorm,
                                scheme=ShuffledScheme),
                prefix="valid_error",
                every_n_epochs=1),
            TrainingDataMonitoring([
                ladder.costs.total, ladder.costs.class_corr,
                training_algorithm.total_gradient_norm
            ] + ladder.costs.denois.values(),
                                   prefix="train",
                                   after_epoch=True),

            #SaveParams(None, all_params, p.save_dir, after_epoch=True),
            #SaveExpParams(p, p.save_dir, before_training=True),
            #SaveLog(p.save_dir, after_training=True),
            #ShortPrinting(short_prints),
            #ProgressBar(),
            StopAfterNoImprovementValidation('valid_error_error_rate_clean',
                                             10, all_params, p, p.save_dir),
            #LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs,
            #        after_epoch=True),
        ])
    main_loop.run()

    # Get results
    df = DataFrame.from_dict(main_loop.log, orient='index')
    col = 'valid_error_error_rate_clean'
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return df
Beispiel #15
0
def run():

    # Load Model
    net_size = 256  #Hard-code instead of loading model (takes too long to set up network)
    #net = vaegan.VAEGAN()
    #network_saver = saver.NetworkSaver('vaegan/models/', net=net)
    #network_saver.load()

    # DATA
    train_stream = get_stream(hdf5_file, 'train', batch_size)  #TODO jonathan ?
    test_stream = get_stream(hdf5_file, 'test', batch_size)  #TODO jonathan ?

    # MODEL
    x = T.TensorType('floatX', [False] * 3)('features')
    y = T.tensor3('targets', dtype='floatX')
    train_flag = [theano.shared(0)]
    x = x.swapaxes(0, 1)
    y = y.swapaxes(0, 1)

    # More Config
    out_size = len(output_columns) - 1  # code_mode=RL-MDN
    latent_size = net_size
    in_size = latent_size + len(input_columns)

    # NN fprop
    y_hat, cost, cells = nn_fprop(x, y, in_size, out_size, hidden_size,
                                  num_recurrent_layers, train_flag)

    # COST
    cg = ComputationGraph(cost)
    extra_updates = []

    # RMS Prop training optimizer
    step_rules = [
        RMSProp(learning_rate=learning_rate, decay_rate=decay_rate),
        StepClipping(step_clipping)
    ]

    parameters_to_update = cg.parameters
    algorithm = GradientDescent(cost=cg.outputs[0],
                                parameters=parameters_to_update,
                                step_rule=CompositeRule(step_rules))
    algorithm.add_updates(
        extra_updates)  # TODO jonathan what is this, is this needed?

    # Extensions
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [
        cost, step_rules[0].learning_rate, gradient_norm, step_norm
    ]

    test_monitor = DataStreamMonitoring(variables=[cost],
                                        after_epoch=True,
                                        before_first_epoch=True,
                                        data_stream=test_stream,
                                        prefix="test")
    train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                           after_epoch=True,
                                           before_first_epoch=True,
                                           prefix='train')

    set_train_flag = SetTrainFlag(after_epoch=True,
                                  before_epoch=True,
                                  flag=train_flag)

    # plot = Plot('Plotting example', channels=[['cost']], after_batch=True, open_browser=True)
    extensions = [
        set_train_flag,
        test_monitor,
        train_monitor,
        Timing(),
        Printing(after_epoch=True),
        FinishAfter(after_n_epochs=nepochs),
        saveload.Load(load_path),
        saveload.Checkpoint(last_path, every_n_epochs=10000),
    ] + track_best('test_cost',
                   save_path)  #+ track_best('train_cost', last_path)

    if learning_rate_decay not in (0, 1):
        extensions.append(
            SharedVariableModifier(step_rules[0].learning_rate,
                                   lambda n, lr: np.cast[theano.config.floatX]
                                   (learning_rate_decay * lr),
                                   after_epoch=False,
                                   every_n_epochs=lr_decay_every_n_epochs,
                                   after_batch=False))

    print 'number of parameters in the model: ' + str(
        T.sum([p.size for p in cg.parameters]).eval())
    # Finally build the main loop and train the model
    mainLoop = MainLoop(data_stream=train_stream,
                        algorithm=algorithm,
                        model=Model(cost),
                        extensions=extensions)
    mainLoop.run()
Beispiel #16
0
def main(nvis, nhid, encoding_lstm_dim, decoding_lstm_dim, T=1):
    x = tensor.matrix('features')

    # Construct and initialize model
    encoding_mlp = MLP([Tanh()], [None, None])
    decoding_mlp = MLP([Tanh()], [None, None])
    encoding_lstm = LSTM(dim=encoding_lstm_dim)
    decoding_lstm = LSTM(dim=decoding_lstm_dim)
    draw = DRAW(nvis=nvis,
                nhid=nhid,
                T=T,
                encoding_mlp=encoding_mlp,
                decoding_mlp=decoding_mlp,
                encoding_lstm=encoding_lstm,
                decoding_lstm=decoding_lstm,
                biases_init=Constant(0),
                weights_init=Orthogonal())
    draw.push_initialization_config()
    encoding_lstm.weights_init = IsotropicGaussian(std=0.001)
    decoding_lstm.weights_init = IsotropicGaussian(std=0.001)
    draw.initialize()

    # Compute cost
    cost = -draw.log_likelihood_lower_bound(x).mean()
    cost.name = 'nll_upper_bound'
    model = Model(cost)

    # Datasets and data streams
    mnist_train = BinarizedMNIST('train')
    train_loop_stream = ForceFloatX(
        DataStream(dataset=mnist_train,
                   iteration_scheme=SequentialScheme(mnist_train.num_examples,
                                                     100)))
    train_monitor_stream = ForceFloatX(
        DataStream(dataset=mnist_train,
                   iteration_scheme=SequentialScheme(mnist_train.num_examples,
                                                     500)))
    mnist_valid = BinarizedMNIST('valid')
    valid_monitor_stream = ForceFloatX(
        DataStream(dataset=mnist_valid,
                   iteration_scheme=SequentialScheme(mnist_valid.num_examples,
                                                     500)))
    mnist_test = BinarizedMNIST('test')
    test_monitor_stream = ForceFloatX(
        DataStream(dataset=mnist_test,
                   iteration_scheme=SequentialScheme(mnist_test.num_examples,
                                                     500)))

    # Get parameters and monitoring channels
    computation_graph = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(computation_graph.variables)
    monitoring_channels = dict([
        ('avg_' + channel.tag.name, channel.mean())
        for channel in VariableFilter(
            name='.*term$')(computation_graph.auxiliary_variables)
    ])
    for name, channel in monitoring_channels.items():
        channel.name = name
    monitored_quantities = monitoring_channels.values() + [cost]

    # Training loop
    step_rule = RMSProp(learning_rate=1e-3, decay_rate=0.95)
    algorithm = GradientDescent(cost=cost, params=params, step_rule=step_rule)
    algorithm.add_updates(computation_graph.updates)
    main_loop = MainLoop(
        model=model,
        data_stream=train_loop_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            SerializeMainLoop('vae.pkl', save_separately=['model']),
            FinishAfter(after_n_epochs=200),
            DataStreamMonitoring(monitored_quantities,
                                 train_monitor_stream,
                                 prefix="train",
                                 updates=computation_graph.updates),
            DataStreamMonitoring(monitored_quantities,
                                 valid_monitor_stream,
                                 prefix="valid",
                                 updates=computation_graph.updates),
            DataStreamMonitoring(monitored_quantities,
                                 test_monitor_stream,
                                 prefix="test",
                                 updates=computation_graph.updates),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()
Beispiel #17
0
def main(num_epochs=50, batch_normalized=True, alpha=0.1):
    """Run the example.

    Parameters
    ----------
    num_epochs : int, optional
        Number of epochs for which to train.

    batch_normalized : bool, optional
        Batch-normalize the training graph. Defaults to `True`.

    alpha : float, optional
        Weight to apply to a new sample when calculating running
        averages for population statistics (1 - alpha weight is
        given to the existing average).

    """
    if batch_normalized:
        # Add an extra keyword argument that only BatchNormalizedMLP takes,
        # in order to speed things up at the cost of a bit of extra memory.
        mlp_class = BatchNormalizedMLP
        extra_kwargs = {'conserve_memory': False}
    else:
        mlp_class = MLP
        extra_kwargs = {}
    mlp = mlp_class([Logistic(), Logistic(),
                     Logistic(), Softmax()], [2, 5, 5, 5, 3],
                    weights_init=IsotropicGaussian(0.2),
                    biases_init=Constant(0.),
                    **extra_kwargs)
    mlp.initialize()

    # Generate a dataset with 3 spiral arms, using 8000 examples for
    # training and 2000 for testing.
    dataset = Spiral(num_examples=10000,
                     classes=3,
                     sources=['features', 'label'],
                     noise=0.05)
    train_stream = DataStream(dataset,
                              iteration_scheme=ShuffledScheme(examples=8000,
                                                              batch_size=20))
    test_stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(
                                 examples=list(range(8000, 10000)),
                                 batch_size=2000))

    # Build a cost graph; this contains BatchNormalization bricks that will
    # by default run in inference mode.
    features = tensor.matrix('features')
    label = tensor.lvector('label')
    prediction = mlp.apply(features)
    cost = CategoricalCrossEntropy().apply(label, prediction)
    misclass = MisclassificationRate().apply(label, prediction)
    misclass.name = 'misclass'  # The default name for this is annoyingly long
    original_cg = ComputationGraph([cost, misclass])

    if batch_normalized:
        cg = apply_batch_normalization(original_cg)
        # Add updates for population parameters
        pop_updates = get_batch_normalization_updates(cg)
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in pop_updates]
    else:
        cg = original_cg
        extra_updates = []

    algorithm = GradientDescent(step_rule=Adam(0.001),
                                cost=cg.outputs[0],
                                parameters=cg.parameters)
    algorithm.add_updates(extra_updates)

    main_loop = MainLoop(
        algorithm=algorithm,
        data_stream=train_stream,
        # Use the original cost and misclass variables so
        # that we monitor the (original) inference-mode graph.
        extensions=[
            DataStreamMonitoring([cost, misclass],
                                 train_stream,
                                 prefix='train'),
            DataStreamMonitoring([cost, misclass], test_stream, prefix='test'),
            Printing(),
            FinishAfter(after_n_epochs=num_epochs)
        ])
    main_loop.run()
    return main_loop
Beispiel #18
0
def construct_main_loop(name, task_name, patch_shape, batch_size,
                        n_spatial_dims, n_patches, max_epochs,
                        patience_epochs, learning_rate, gradient_limiter,
                        hyperparameters, **kwargs):
    task = tasks.get_task(**hyperparameters)
    hyperparameters["n_channels"] = task.n_channels

    extensions = []

    # let theta noise decay as training progresses
    for key in "location_std scale_std".split():
        hyperparameters[key] = theano.shared(hyperparameters[key], name=key)
        extensions.append(util.ExponentialDecay(
            hyperparameters[key],
            hyperparameters["%s_decay" % key],
            after_batch=True))

    print "constructing graphs..."
    graphs, outputs, updates = construct_graphs(task=task, **hyperparameters)

    print "setting up main loop..."

    from blocks.model import Model
    model = Model(outputs["train"]["cost"])

    from blocks.algorithms import GradientDescent, CompositeRule, StepClipping, Adam, RMSProp
    from extensions import Compressor
    if gradient_limiter == "clip":
        limiter = StepClipping(1.)
    elif gradient_limiter == "compress":
        limiter = Compressor()
    else:
        raise ValueError()

    algorithm = GradientDescent(
        cost=outputs["train"]["cost"],
        parameters=graphs["train"].parameters,
        step_rule=CompositeRule([limiter, Adam(learning_rate=learning_rate)]))
    algorithm.add_updates(updates["train"])

    extensions.extend(construct_monitors(
        algorithm=algorithm, task=task, model=model, graphs=graphs,
        outputs=outputs, updates=updates, **hyperparameters))

    from blocks.extensions import FinishAfter, Printing, ProgressBar, Timing
    from blocks.extensions.stopping import FinishIfNoImprovementAfter
    from blocks.extensions.training import TrackTheBest
    from blocks.extensions.saveload import Checkpoint
    from dump import DumpBest, LightCheckpoint, PrintingTo, DumpGraph, DumpLog
    extensions.extend([
        TrackTheBest("valid_error_rate", "best_valid_error_rate"),
        FinishIfNoImprovementAfter("best_valid_error_rate", epochs=patience_epochs),
        FinishAfter(after_n_epochs=max_epochs),
        DumpBest("best_valid_error_rate", name+"_best.zip"),
        Checkpoint(hyperparameters["checkpoint_save_path"],
                   on_interrupt=False, every_n_epochs=10,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True),
        ProgressBar(),
        Timing(),
        Printing(), PrintingTo(name+"_log"),
        DumpGraph(name+"_grad_graph")])

    from blocks.main_loop import MainLoop
    main_loop = MainLoop(data_stream=task.get_stream("train"),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    from tabulate import tabulate
    print "parameter sizes:"
    print tabulate((key, "x".join(map(str, value.get_value().shape)), value.get_value().size)
                   for key, value in main_loop.model.get_parameter_dict().items())

    return main_loop
Beispiel #19
0
def train_model(cost,
                cross_entropy,
                updates,
                train_stream,
                valid_stream,
                args,
                gate_values=None):

    step_rule = learning_algorithm(args)
    cg = ComputationGraph(cost)

    # ADD REGULARIZATION
    # WEIGHT NOISE
    weight_noise = args.weight_noise
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cg_train = apply_noise(cg, weights, weight_noise)
        cost = cg_train.outputs[0]
    cost.name = "cost_with_weight_noise"
    cg = ComputationGraph(cost)

    logger.info(cg.parameters)

    algorithm = GradientDescent(cost=cost,
                                step_rule=step_rule,
                                params=cg.parameters)
    algorithm.add_updates(updates)

    # extensions to be added
    extensions = []
    if args.load_path is not None:
        extensions.append(Load(args.load_path))

    outputs = [
        variable for variable in cg.variables if variable.name == "presoft"
    ]

    if args.generate:
        extensions.append(
            TextGenerationExtension(
                outputs=outputs,
                generation_length=args.generated_text_lenght,
                initial_text_length=args.initial_text_length,
                every_n_batches=args.monitoring_freq,
                ploting_path=os.path.join(args.save_path, 'prob_plot.png'),
                softmax_sampling=args.softmax_sampling,
                dataset=args.dataset,
                updates=updates,
                interactive_mode=args.interactive_mode))
    extensions.extend([
        TrainingDataMonitoring([cost],
                               prefix='train',
                               every_n_batches=args.monitoring_freq,
                               after_epoch=True),
        DataStreamMonitoring([cost, cross_entropy],
                             valid_stream,
                             args.mini_batch_size_valid,
                             state_updates=updates,
                             prefix='valid',
                             before_first_epoch=not (args.visualize_gates),
                             every_n_batches=args.monitoring_freq),
        ResetStates([v for v, _ in updates], every_n_batches=100),
        ProgressBar()
    ])
    # Creating directory for saving model.
    if not args.interactive_mode:
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        else:
            raise Exception('Directory already exists')
    early_stopping = EarlyStopping('valid_cross_entropy',
                                   args.patience,
                                   args.save_path,
                                   every_n_batches=args.monitoring_freq)

    # Visualizing extensions
    if args.interactive_mode:
        extensions.append(InteractiveMode())
    if args.visualize_gates and (gate_values is not None):
        if args.rnn_type == "lstm":
            extensions.append(
                VisualizeGateLSTM(gate_values,
                                  updates,
                                  args.dataset,
                                  ploting_path=None))
        elif args.rnn_type == "soft":
            extensions.append(
                VisualizeGateSoft(gate_values,
                                  updates,
                                  args.dataset,
                                  ploting_path=None))
        else:
            assert (False)

    extensions.append(early_stopping)
    extensions.append(Printing(every_n_batches=args.monitoring_freq))

    main_loop = MainLoop(model=Model(cost),
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
def main():
    nclasses = 27

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--length", type=int, default=180)
    parser.add_argument("--num-epochs", type=int, default=100)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=1e-3)
    parser.add_argument("--epsilon", type=float, default=1e-5)
    parser.add_argument("--num-hidden", type=int, default=1000)
    parser.add_argument("--baseline", action="store_true")
    parser.add_argument("--initialization",
                        choices="identity glorot orthogonal uniform".split(),
                        default="identity")
    parser.add_argument("--initial-gamma", type=float, default=1e-1)
    parser.add_argument("--initial-beta", type=float, default=0)
    parser.add_argument("--cluster", action="store_true")
    parser.add_argument("--activation",
                        choices=list(activations.keys()),
                        default="tanh")
    parser.add_argument("--optimizer",
                        choices="sgdmomentum adam rmsprop",
                        default="rmsprop")
    parser.add_argument("--continue-from")
    parser.add_argument("--evaluate")
    parser.add_argument("--dump-hiddens")
    args = parser.parse_args()

    np.random.seed(args.seed)
    blocks.config.config.default_seed = args.seed

    if args.continue_from:
        from blocks.serialization import load
        main_loop = load(args.continue_from)
        main_loop.run()
        sys.exit(0)

    graphs, extensions, updates = construct_graphs(args, nclasses)

    ### optimization algorithm definition
    if args.optimizer == "adam":
        optimizer = Adam(learning_rate=args.learning_rate)
    elif args.optimizer == "rmsprop":
        optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9)
    elif args.optimizer == "sgdmomentum":
        optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99)
    step_rule = CompositeRule([
        StepClipping(1.),
        optimizer,
    ])
    algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                                parameters=graphs["training"].parameters,
                                step_rule=step_rule)
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]

    # step monitor
    step_channels = []
    step_channels.extend([
        algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
        for name, param in model.get_parameter_dict().items()
    ])
    step_channels.append(
        algorithm.total_step_norm.copy(name="total_step_norm"))
    step_channels.append(
        algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
    step_channels.extend(graphs["training"].outputs)
    logger.warning("constructing training data monitor")
    extensions.append(
        TrainingDataMonitoring(step_channels,
                               prefix="iteration",
                               after_batch=True))

    # parameter monitor
    extensions.append(
        DataStreamMonitoring([
            param.norm(2).copy(name="parameter.norm:%s" % name)
            for name, param in model.get_parameter_dict().items()
        ],
                             data_stream=None,
                             after_epoch=True))

    validation_interval = 500
    # performance monitor
    for situation in "training inference".split():
        if situation == "inference" and not args.evaluate:
            # save time when we don't need the inference graph
            continue

        for which_set in "train valid test".split():
            logger.warning("constructing %s %s monitor" %
                           (which_set, situation))
            channels = list(graphs[situation].outputs)
            extensions.append(
                DataStreamMonitoring(channels,
                                     prefix="%s_%s" % (which_set, situation),
                                     every_n_batches=validation_interval,
                                     data_stream=get_stream(
                                         which_set=which_set,
                                         batch_size=args.batch_size,
                                         num_examples=10000,
                                         length=args.length)))

    extensions.extend([
        TrackTheBest("valid_training_error_rate",
                     "best_valid_training_error_rate"),
        DumpBest("best_valid_training_error_rate", "best.zip"),
        FinishAfter(after_n_epochs=args.num_epochs),
        #FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50),
        Checkpoint("checkpoint.zip",
                   on_interrupt=False,
                   every_n_epochs=1,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True)
    ])

    if not args.cluster:
        extensions.append(ProgressBar())

    extensions.extend([
        Timing(),
        Printing(every_n_batches=validation_interval),
        PrintingTo("log"),
    ])
    main_loop = MainLoop(data_stream=get_stream(which_set="train",
                                                batch_size=args.batch_size,
                                                length=args.length,
                                                augment=True),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    if args.dump_hiddens:
        dump_hiddens(args, main_loop)
        return

    if args.evaluate:
        evaluate(args, main_loop)
        return

    main_loop.run()
Beispiel #21
0
def main(num_epochs=50, batch_normalized=True, alpha=0.1):
    """Run the example.

    Parameters
    ----------
    num_epochs : int, optional
        Number of epochs for which to train.

    batch_normalized : bool, optional
        Batch-normalize the training graph. Defaults to `True`.

    alpha : float, optional
        Weight to apply to a new sample when calculating running
        averages for population statistics (1 - alpha weight is
        given to the existing average).

    """
    if batch_normalized:
        # Add an extra keyword argument that only BatchNormalizedMLP takes,
        # in order to speed things up at the cost of a bit of extra memory.
        mlp_class = BatchNormalizedMLP
        extra_kwargs = {'conserve_memory': False}
    else:
        mlp_class = MLP
        extra_kwargs = {}
    mlp = mlp_class([Logistic(), Logistic(), Logistic(), Softmax()],
                    [2, 5, 5, 5, 3],
                    weights_init=IsotropicGaussian(0.2),
                    biases_init=Constant(0.), **extra_kwargs)
    mlp.initialize()

    # Generate a dataset with 3 spiral arms, using 8000 examples for
    # training and 2000 for testing.
    dataset = Spiral(num_examples=10000, classes=3,
                     sources=['features', 'label'],
                     noise=0.05)
    train_stream = DataStream(dataset,
                              iteration_scheme=ShuffledScheme(examples=8000,
                                                              batch_size=20))
    test_stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(
                                 examples=list(range(8000, 10000)),
                                 batch_size=2000))

    # Build a cost graph; this contains BatchNormalization bricks that will
    # by default run in inference mode.
    features = tensor.matrix('features')
    label = tensor.lvector('label')
    prediction = mlp.apply(features)
    cost = CategoricalCrossEntropy().apply(label, prediction)
    misclass = MisclassificationRate().apply(label, prediction)
    misclass.name = 'misclass'  # The default name for this is annoyingly long
    original_cg = ComputationGraph([cost, misclass])

    if batch_normalized:
        cg = apply_batch_normalization(original_cg)
        # Add updates for population parameters
        pop_updates = get_batch_normalization_updates(cg)
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in pop_updates]
    else:
        cg = original_cg
        extra_updates = []

    algorithm = GradientDescent(step_rule=Adam(0.001),
                                cost=cg.outputs[0],
                                parameters=cg.parameters)
    algorithm.add_updates(extra_updates)

    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train_stream,
                         # Use the original cost and misclass variables so
                         # that we monitor the (original) inference-mode graph.
                         extensions=[DataStreamMonitoring([cost, misclass],
                                                          train_stream,
                                                          prefix='train'),
                                     DataStreamMonitoring([cost, misclass],
                                                          test_stream,
                                                          prefix='test'),
                                     Printing(),
                                     FinishAfter(after_n_epochs=num_epochs)])
    main_loop.run()
    return main_loop
Beispiel #22
0
def main(save_to, num_epochs,
         regularization=0.0001, subset=None, num_batches=None,
         batch_size=None, histogram=None, resume=False):
    output_size = 10
    convnet = create_res_net()

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
            .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                  .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                  .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)

    # train_cost, train_error_rate, train_components = train_cg.outputs

    with batch_normalization(convnet):
        train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(y.flatten(),
                train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(y.flatten(),
                train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                train_error_rate, train_components])
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                for p, m in population_updates]

    # Apply regularization to the cost
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    l2_norm = sum([(W ** 2).sum() for W in weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = regularization * l2_norm
    l2_regularization.name = 'l2_regularization'
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + regularization * l2_norm
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
        which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 500
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test,
        iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=train_cg.parameters,
        step_rule=momentum)
    algorithm.add_updates(extra_updates)

    #,
    #    theano_func_kwargs={
    #        'mode': NanGuardMode(
    #            nan_is_error=True, inf_is_error=True, big_is_error=True)})

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),   # Warm up with 0.01 learning rate
                      (1, 0.1),    # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  DataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       momentum.learning_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                      # after_epoch=True),
                  Plot('Training performance for ' + save_to,
                      channels=[
                          ['train_cost_with_regularization',
                           'train_cost_without_regularization',
                           'train_l2_regularization'],
                          ['train_error_rate'],
                          ['train_total_gradient_norm'],
                      ],
                      every_n_batches=17),
                  Plot('Test performance for ' + save_to,
                      channels=[[
                          'train_error_rate',
                          'test_error_rate',
                          ]],
                      after_epoch=True),
                  Checkpoint(save_to, use_cpickle=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            parameters=cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(save_to, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
Beispiel #23
0
def train_ladder(cli_params, dataset=None, save_to='results/ova_all_full'):
    cli_params['save_dir'] = prepare_dir(save_to)
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    training_algorithm = GradientDescent(
        cost=ladder.costs.total,
        params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    short_prints = {
        "train": {
            'T_C_class': ladder.costs.class_corr,
            'T_C_de': ladder.costs.denois.values(),
        },
        "valid_approx":
        OrderedDict([
            ('V_C_class', ladder.costs.class_clean),
            ('V_E', ladder.error.clean),
            ('V_C_de', ladder.costs.denois.values()),
        ]),
        "valid_final":
        OrderedDict([
            ('VF_C_class', ladder.costs.class_clean),
            ('VF_E', ladder.error.clean),
            ('VF_C_de', ladder.costs.denois.values()),
        ]),
    }

    ovadataset = dataset['ovadataset']
    train_indexes = dataset['train_indexes']
    val_indexes = dataset['val_indexes']

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(ovadataset,
                        train_indexes,
                        p.batch_size,
                        scheme=ShuffledScheme),
        model=Model(ladder.costs.total),
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean] +
                ladder.costs.denois.values(),
                make_datastream(ovadataset, val_indexes, p.batch_size),
                prefix="valid_approx"),

            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean_mc] +
                ladder.costs.denois.values(),
                make_datastream(ovadataset, train_indexes, p.batch_size),
                make_datastream(ovadataset, val_indexes, p.batch_size),
                prefix="valid_final",
                after_n_epochs=p.num_epochs),
            TrainingDataMonitoring([
                ladder.costs.total, ladder.costs.class_corr,
                training_algorithm.total_gradient_norm
            ] + ladder.costs.denois.values(),
                                   prefix="train",
                                   after_epoch=True),
            ShortPrinting(short_prints),
            LRDecay(ladder.lr,
                    p.num_epochs * p.lrate_decay,
                    p.num_epochs,
                    after_epoch=True),
        ])
    main_loop.run()

    # Get results
    df = main_loop.log.to_dataframe()
    col = 'valid_final_error_matrix_cost'
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    ds = make_datastream(ovadataset, val_indexes, p.batch_size)
    outputs = ladder.act.clean.labeled.h[len(ladder.layers) - 1]
    outputreplacer = TestMonitoring()
    _, _, outputs = outputreplacer._get_bn_params(outputs)

    cg = ComputationGraph(outputs)
    f = cg.get_theano_function()

    it = ds.get_epoch_iterator(as_dict=True)
    res = []
    inputs = {
        'features_labeled': [],
        'targets_labeled': [],
        'features_unlabeled': []
    }
    # Loop over one epoch
    for d in it:
        # Store all inputs
        for k, v in d.iteritems():
            inputs[k] += [v]
        # Store outputs
        res += [f(*[d[str(inp)] for inp in cg.inputs])]

    # Concatenate all minibatches
    res = [numpy.vstack(minibatches) for minibatches in zip(*res)]
    inputs = {k: numpy.vstack(v) for k, v in inputs.iteritems()}

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return res[0], inputs
Beispiel #24
0
def construct_main_loop(name, task_name, patch_shape, batch_size,
                        n_spatial_dims, n_patches, max_epochs, patience_epochs,
                        learning_rate, gradient_limiter, hyperparameters,
                        **kwargs):
    task = tasks.get_task(**hyperparameters)
    hyperparameters["n_channels"] = task.n_channels

    extensions = []

    # let theta noise decay as training progresses
    for key in "location_std scale_std".split():
        hyperparameters[key] = theano.shared(hyperparameters[key], name=key)
        extensions.append(
            util.ExponentialDecay(hyperparameters[key],
                                  hyperparameters["%s_decay" % key],
                                  after_batch=True))

    print "constructing graphs..."
    graphs, outputs, updates = construct_graphs(task=task, **hyperparameters)

    print "setting up main loop..."

    from blocks.model import Model
    model = Model(outputs["train"]["cost"])

    from blocks.algorithms import GradientDescent, CompositeRule, StepClipping, Adam, RMSProp
    from extensions import Compressor
    if gradient_limiter == "clip":
        limiter = StepClipping(1.)
    elif gradient_limiter == "compress":
        limiter = Compressor()
    else:
        raise ValueError()

    algorithm = GradientDescent(
        cost=outputs["train"]["cost"],
        parameters=graphs["train"].parameters,
        step_rule=CompositeRule([limiter,
                                 Adam(learning_rate=learning_rate)]))
    algorithm.add_updates(updates["train"])

    extensions.extend(
        construct_monitors(algorithm=algorithm,
                           task=task,
                           model=model,
                           graphs=graphs,
                           outputs=outputs,
                           updates=updates,
                           **hyperparameters))

    from blocks.extensions import FinishAfter, Printing, ProgressBar, Timing
    from blocks.extensions.stopping import FinishIfNoImprovementAfter
    from blocks.extensions.training import TrackTheBest
    from blocks.extensions.saveload import Checkpoint
    from dump import DumpBest, LightCheckpoint, PrintingTo, DumpGraph, DumpLog
    extensions.extend([
        TrackTheBest("valid_error_rate", "best_valid_error_rate"),
        FinishIfNoImprovementAfter("best_valid_error_rate",
                                   epochs=patience_epochs),
        FinishAfter(after_n_epochs=max_epochs),
        DumpBest("best_valid_error_rate", name + "_best.zip"),
        Checkpoint(hyperparameters["checkpoint_save_path"],
                   on_interrupt=False,
                   every_n_epochs=10,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True),
        ProgressBar(),
        Timing(),
        Printing(),
        PrintingTo(name + "_log"),
        DumpGraph(name + "_grad_graph")
    ])

    from blocks.main_loop import MainLoop
    main_loop = MainLoop(data_stream=task.get_stream("train"),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    from tabulate import tabulate
    print "parameter sizes:"
    print tabulate(
        (key, "x".join(map(str,
                           value.get_value().shape)), value.get_value().size)
        for key, value in main_loop.model.get_parameter_dict().items())

    return main_loop
def run(batch_size, classifier, oldmodel, monitor_every, checkpoint_every, final_epoch,
        dataset, color_convert, image_size, net_depth, allowed, stretch):

    streams = create_custom_streams(filename=dataset,
                                    training_batch_size=batch_size,
                                    monitoring_batch_size=batch_size,
                                    include_targets=True,
                                    color_convert=color_convert,
                                    allowed=allowed,
                                    stretch=stretch)

    main_loop_stream = streams[0]
    train_monitor_stream = streams[1]
    valid_monitor_stream = streams[2]

    cg, bn_dropout_cg = create_training_computation_graphs(image_size, net_depth)

    model = Model(bn_dropout_cg.outputs[0])

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    pop_updates = get_batch_normalization_updates(bn_dropout_cg)
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_dropout_cg.outputs[0],
                                parameters=bn_dropout_cg.parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    cost = bn_dropout_cg.outputs[0]
    cost.name = 'cost'
    train_monitoring = DataStreamMonitoring(
        [cost], train_monitor_stream, prefix="train",
        before_first_epoch=False, after_epoch=False, after_training=True,
        updates=extra_updates)

    cost, accuracy = cg.outputs
    cost.name = 'cost'
    accuracy.name = 'accuracy'
    monitored_quantities = [cost, accuracy]
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities, valid_monitor_stream, prefix="valid",
        before_first_epoch=True, after_epoch=False,
        every_n_epochs=monitor_every)

    # Prepare checkpoint
    checkpoint = Checkpoint(classifier, every_n_epochs=checkpoint_every,
                            use_cpickle=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=final_epoch), train_monitoring,
                  valid_monitoring, checkpoint, Printing(), ProgressBar()]
    main_loop = MainLoop(model=model, data_stream=main_loop_stream,
                         algorithm=algorithm, extensions=extensions)

    if oldmodel is not None:
        print("Initializing parameters with old model {}".format(oldmodel))
        with open(oldmodel, 'rb') as src:
            saved_model = load(src)
            main_loop.model.set_parameter_values(
                saved_model.model.get_parameter_values())
            del saved_model

    main_loop.run()
Beispiel #26
0
def train(cli_params):
    cli_params['save_dir'] = prepare_dir(cli_params['save_to'])
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data, whiten, cnorm = setup_data(p, test_set=False)
    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim,) + p.encoder_layers

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    training_algorithm = GradientDescent(
        cost=ladder.costs.total, parameters=all_params,
        step_rule=Adam(learning_rate=ladder.lr.get_value()))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    short_prints = {
        "train": {
            'T_C_class': ladder.costs.class_corr,
            'T_C_de': ladder.costs.denois.values(),
        },
        "valid_approx": OrderedDict([
            ('V_C_class', ladder.costs.class_clean),
            ('V_E', ladder.error.clean),
            ('V_C_de', ladder.costs.denois.values()),
        ]),
        "valid_final": OrderedDict([
            ('VF_C_class', ladder.costs.class_clean),
            ('VF_E', ladder.error.clean),
            ('VF_C_de', ladder.costs.denois.values()),
        ]),
    }

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(data.train, data.train_ind,
                        p.batch_size,
                        n_labeled=p.labeled_samples,
                        n_unlabeled=p.unlabeled_samples,
                        whiten=whiten,
                        cnorm=cnorm),
        model=Model(ladder.costs.total),
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean]
                + ladder.costs.denois.values(),
                make_datastream(data.valid, data.valid_ind,
                                p.valid_batch_size, whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                prefix="valid_approx"),

            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean]
                + ladder.costs.denois.values(),
                make_datastream(data.train, data.train_ind,
                                p.batch_size,
                                n_labeled=p.labeled_samples,
                                whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                make_datastream(data.valid, data.valid_ind,
                                p.valid_batch_size,
                                n_labeled=len(data.valid_ind),
                                whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                prefix="valid_final",
                after_n_epochs=p.num_epochs),

            TrainingDataMonitoring(
                [ladder.costs.total, ladder.costs.class_corr,
                 training_algorithm.total_gradient_norm]
                + ladder.costs.denois.values(),
                prefix="train", after_epoch=True),

            SaveParams(None, all_params, p.save_dir, after_epoch=True),
            SaveExpParams(p, p.save_dir, before_training=True),
            SaveLog(p.save_dir, after_training=True),
            ShortPrinting(short_prints),
            LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs,
                    after_epoch=True),
        ])
    main_loop.run()

    # Get results
    df = DataFrame.from_dict(main_loop.log, orient='index')
    col = 'valid_final_error_rate_clean'
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return df
Beispiel #27
0
def main(mode, save_to, num_epochs, load_params=None,
         feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, stride=None, repeat_times=None,
         batch_size=None, num_batches=None, algo=None,
         test_set=None, valid_examples=None,
         dropout=None, max_norm=None, weight_decay=None,
         batch_norm=None):
    if feature_maps is None:
        feature_maps = [20, 50, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2, 2]
    if repeat_times is None:
        repeat_times = [1, 1, 1]
    if batch_size is None:
        batch_size = 500
    if valid_examples is None:
        valid_examples = 2500
    if stride is None:
        stride = 1
    if test_set is None:
        test_set = 'test'
    if algo is None:
        algo = 'rmsprop'
    if batch_norm is None:
        batch_norm = False

    image_size = (128, 128)
    output_size = 2

    if (len(feature_maps) != len(conv_sizes) or
        len(feature_maps) != len(pool_sizes) or
        len(feature_maps) != len(repeat_times)):
        raise ValueError("OMG, inconsistent arguments")

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 3, image_size,
                    stride=stride,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    repeat_times=repeat_times,
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    batch_norm=batch_norm,
                    weights_init=Glorot(),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))


    single_x = tensor.tensor3('image_features')
    x = tensor.tensor4('image_features')
    single_y = tensor.lvector('targets')
    y = tensor.lmatrix('targets')

    # Training
    with batch_normalization(convnet):
        probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))

    cg = ComputationGraph([cost, error_rate])
    extra_updates = []

    if batch_norm: # batch norm:
        logger.debug("Apply batch norm")
        pop_updates = get_batch_normalization_updates(cg)
        # p stands for population mean
        # m stands for minibatch
        alpha = 0.005
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in pop_updates]
        population_statistics = [p for p, m in extra_updates]
    if dropout:
        relu_outputs = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg)
        cg = apply_dropout(cg, relu_outputs, dropout)
    cost, error_rate = cg.outputs
    if weight_decay:
        logger.debug("Apply weight decay {}".format(weight_decay))
        cost += weight_decay * l2_norm(cg.parameters)
        cost.name = 'cost'

    # Validation
    valid_probs = convnet.apply_5windows(single_x)
    valid_cost = (CategoricalCrossEntropy().apply(single_y, valid_probs)
            .copy(name='cost'))
    valid_error_rate = (MisclassificationRate().apply(
        single_y, valid_probs).copy(name='error_rate'))

    model = Model([cost, error_rate])
    if load_params:
        logger.info("Loaded params from {}".format(load_params))
        with open(load_params, 'r') as src:
            model.set_parameter_values(load_parameters(src))

    # Training stream with random cropping
    train = DogsVsCats(("train",), subset=slice(None, 25000 - valid_examples, None))
    train_str =  DataStream(
        train, iteration_scheme=ShuffledScheme(train.num_examples, batch_size))
    train_str = add_transformers(train_str, random_crop=True)

    # Validation stream without cropping
    valid = DogsVsCats(("train",), subset=slice(25000 - valid_examples, None, None))
    valid_str = DataStream(
        valid, iteration_scheme=SequentialExampleScheme(valid.num_examples))
    valid_str = add_transformers(valid_str)

    if mode == 'train':
        directory, _ = os.path.split(sys.argv[0])
        env = dict(os.environ)
        env['THEANO_FLAGS'] = 'floatX=float32'
        port = numpy.random.randint(1025, 10000)
        server = subprocess.Popen(
            [directory + '/server.py',
             str(25000 - valid_examples), str(batch_size), str(port)],
            env=env, stderr=subprocess.STDOUT)
        train_str = ServerDataStream(
            ('image_features', 'targets'), produces_examples=False,
            port=port)

        save_to_base, save_to_extension = os.path.splitext(save_to)

        # Train with simple SGD
        if algo == 'rmsprop':
            step_rule = RMSProp(decay_rate=0.999, learning_rate=0.0003)
        elif algo == 'adam':
            step_rule = Adam()
        else:
            assert False
        if max_norm:
            conv_params = VariableFilter(bricks=[Convolutional], roles=[WEIGHT])(cg)
            linear_params = VariableFilter(bricks=[Linear], roles=[WEIGHT])(cg)
            step_rule = CompositeRule(
                [step_rule,
                 Restrict(VariableClipping(max_norm, axis=0), linear_params),
                 Restrict(VariableClipping(max_norm, axis=(1, 2, 3)), conv_params)])

        algorithm = GradientDescent(
            cost=cost, parameters=model.parameters,
            step_rule=step_rule)
        algorithm.add_updates(extra_updates)
        # `Timing` extension reports time for reading data, aggregating a batch
        # and monitoring;
        # `ProgressBar` displays a nice progress bar during training.
        extensions = [Timing(every_n_batches=100),
                    FinishAfter(after_n_epochs=num_epochs,
                                after_n_batches=num_batches),
                    DataStreamMonitoring(
                        [valid_cost, valid_error_rate],
                        valid_str,
                        prefix="valid"),
                    TrainingDataMonitoring(
                        [cost, error_rate,
                        aggregation.mean(algorithm.total_gradient_norm)],
                        prefix="train",
                        after_epoch=True),
                    TrackTheBest("valid_error_rate"),
                    Checkpoint(save_to, save_separately=['log'],
                               parameters=cg.parameters +
                               (population_statistics if batch_norm else []),
                               before_training=True, after_epoch=True)
                        .add_condition(
                            ['after_epoch'],
                            OnLogRecord("valid_error_rate_best_so_far"),
                            (save_to_base + '_best' + save_to_extension,)),
                    Printing(every_n_batches=100)]

        model = Model(cost)

        main_loop = MainLoop(
            algorithm,
            train_str,
            model=model,
            extensions=extensions)
        try:
            main_loop.run()
        finally:
            server.terminate()
    elif mode == 'test':
        classify = theano.function([single_x], valid_probs.argmax())
        test = DogsVsCats((test_set,))
        test_str = DataStream(
            test, iteration_scheme=SequentialExampleScheme(test.num_examples))
        test_str = add_transformers(test_str)
        correct = 0
        with open(save_to, 'w') as dst:
            print("id", "label", sep=',', file=dst)
            for index, example in enumerate(test_str.get_epoch_iterator()):
                image = example[0]
                prediction = classify(image)
                print(index + 1, classify(image), sep=',', file=dst)
                if len(example) > 1 and prediction == example[1]:
                    correct += 1
        print(correct / float(test.num_examples))
    else:
        assert False
Beispiel #28
0
def train(cli_params):
    fn = 'noname'
    if 'save_to' in nodefaultargs or not cli_params.get('load_from'):
        fn = cli_params['save_to']
    cli_params['save_dir'] = prepare_dir(fn)
    nodefaultargs.append('save_dir')

    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)

    in_dim, data, whiten, cnorm = setup_data(p, test_set=False)
    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim, ) + p.encoder_layers

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    # you can turn off BN by setting is_normalizing = False in ladder.py
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert not bn_updates or 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    training_algorithm = GradientDescent(
        cost=ladder.costs.total,
        parameters=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    if bn_updates:
        training_algorithm.add_updates(bn_updates)

    short_prints = {
        "train":
        OrderedDict([
            ('T_E', ladder.error.clean),
            ('T_O', ladder.oos.clean),
            ('T_C_class', ladder.costs.class_corr),
            ('T_C_de', ladder.costs.denois.values()),
            ('T_T', ladder.costs.total),
        ]),
        "valid_approx":
        OrderedDict([
            ('V_C_class', ladder.costs.class_clean),
            ('V_E', ladder.error.clean),
            ('V_O', ladder.oos.clean),
            ('V_C_de', ladder.costs.denois.values()),
            ('V_T', ladder.costs.total),
        ]),
        "valid_final":
        OrderedDict([
            ('VF_C_class', ladder.costs.class_clean),
            ('VF_E', ladder.error.clean),
            ('VF_O', ladder.oos.clean),
            ('VF_C_de', ladder.costs.denois.values()),
            ('V_T', ladder.costs.total),
        ]),
    }

    if len(data.valid_ind):
        main_loop = MainLoop(
            training_algorithm,
            # Datastream used for training
            make_datastream(data.train,
                            data.train_ind,
                            p.batch_size,
                            n_labeled=p.labeled_samples,
                            n_unlabeled=p.unlabeled_samples,
                            whiten=whiten,
                            cnorm=cnorm,
                            balanced_classes=p.balanced_classes,
                            dseed=p.dseed),
            model=Model(ladder.costs.total),
            extensions=[
                FinishAfter(after_n_epochs=p.num_epochs),

                # This will estimate the validation error using
                # running average estimates of the batch normalization
                # parameters, mean and variance
                ApproxTestMonitoring([
                    ladder.costs.class_clean, ladder.error.clean,
                    ladder.oos.clean, ladder.costs.total
                ] + ladder.costs.denois.values(),
                                     make_datastream(
                                         data.valid,
                                         data.valid_ind,
                                         p.valid_batch_size,
                                         whiten=whiten,
                                         cnorm=cnorm,
                                         balanced_classes=p.balanced_classes,
                                         scheme=ShuffledScheme),
                                     prefix="valid_approx"),

                # This Monitor is slower, but more accurate since it will first
                # estimate batch normalization parameters from training data and
                # then do another pass to calculate the validation error.
                FinalTestMonitoring(
                    [
                        ladder.costs.class_clean, ladder.error.clean,
                        ladder.oos.clean, ladder.costs.total
                    ] + ladder.costs.denois.values(),
                    make_datastream(data.train,
                                    data.train_ind,
                                    p.batch_size,
                                    n_labeled=p.labeled_samples,
                                    whiten=whiten,
                                    cnorm=cnorm,
                                    balanced_classes=p.balanced_classes,
                                    scheme=ShuffledScheme),
                    make_datastream(data.valid,
                                    data.valid_ind,
                                    p.valid_batch_size,
                                    n_labeled=len(data.valid_ind),
                                    whiten=whiten,
                                    cnorm=cnorm,
                                    balanced_classes=p.balanced_classes,
                                    scheme=ShuffledScheme),
                    prefix="valid_final",
                    after_n_epochs=p.num_epochs,
                    after_training=True),
                TrainingDataMonitoring([
                    ladder.error.clean, ladder.oos.clean, ladder.costs.total,
                    ladder.costs.class_corr,
                    training_algorithm.total_gradient_norm
                ] + ladder.costs.denois.values(),
                                       prefix="train",
                                       after_epoch=True),
                # ladder.costs.class_clean - save model whenever we have best validation result another option `('train',ladder.costs.total)`
                SaveParams(('valid_approx', ladder.error.clean),
                           all_params,
                           p.save_dir,
                           after_epoch=True),
                SaveExpParams(p, p.save_dir, before_training=True),
                SaveLog(p.save_dir, after_training=True),
                ShortPrinting(short_prints),
                LRDecay(ladder.lr,
                        p.num_epochs * p.lrate_decay,
                        p.num_epochs,
                        lrmin=p.lrmin,
                        after_epoch=True),
            ])
    else:
        main_loop = MainLoop(
            training_algorithm,
            # Datastream used for training
            make_datastream(data.train,
                            data.train_ind,
                            p.batch_size,
                            n_labeled=p.labeled_samples,
                            n_unlabeled=p.unlabeled_samples,
                            whiten=whiten,
                            cnorm=cnorm,
                            balanced_classes=p.balanced_classes,
                            dseed=p.dseed),
            model=Model(ladder.costs.total),
            extensions=[
                FinishAfter(after_n_epochs=p.num_epochs),
                TrainingDataMonitoring([
                    ladder.error.clean, ladder.oos.clean, ladder.costs.total,
                    ladder.costs.class_corr,
                    training_algorithm.total_gradient_norm
                ] + ladder.costs.denois.values(),
                                       prefix="train",
                                       after_epoch=True),
                # ladder.costs.class_clean - save model whenever we have best validation result another option `('train',ladder.costs.total)`
                SaveParams(('train', ladder.error.clean),
                           all_params,
                           p.save_dir,
                           after_epoch=True),
                SaveExpParams(p, p.save_dir, before_training=True),
                SaveLog(p.save_dir, after_training=True),
                ShortPrinting(short_prints),
                LRDecay(ladder.lr,
                        p.num_epochs * p.lrate_decay,
                        p.num_epochs,
                        lrmin=p.lrmin,
                        after_epoch=True),
            ])
    main_loop.run()

    # Get results
    if len(data.valid_ind) == 0:
        return None

    df = DataFrame.from_dict(main_loop.log, orient='index')
    col = 'valid_final_error_rate_clean'
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return df
Beispiel #29
0
def run(model_name, port_train, port_valid):

    running_on_laptop = socket.gethostname() == 'yop'

    X = tensor.tensor4('image_features', dtype='float32')
    T = tensor.matrix('targets', dtype='float32')

    image_border_size = (100, 100)

    if running_on_laptop:
        host_plot = 'http://*****:*****@ %s' %
             (model_name, datetime.datetime.now(), socket.gethostname()),
             channels=[['loss'], ['error', 'valid_error']],
             after_epoch=True,
             server_url=host_plot),
        Printing(),
        Checkpoint('/tmp/train_bn2')
    ]

    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)
    main_loop.run()
def main(name, epochs, batch_size, learning_rate, window_size, conv_sizes, num_filters, fc_dim, enc_dim, dec_dim, step, num_digits, num_classes,
         oldmodel, live_plotting):
    channels, img_height, img_width = 1, 100, 100

    rnninits = {
        'weights_init': Uniform(width=0.02),
        'biases_init': Constant(0.),
    }

    inits = {
        'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }

    rec_inits = {
        'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }

    convinits = {
        'weights_init': Uniform(width=.2),
        'biases_init': Constant(0.),
    }

    n_iter = step * num_digits
    filter_size1, filter_size2 = zip(conv_sizes, conv_sizes)[:]
    w_height, w_width = window_size.split(',')
    w_height = int(w_height)
    w_width = int(w_width)

    subdir = time.strftime("%Y-%m-%d") + "-" + name
    if not os.path.exists(subdir):
        os.makedirs(subdir)

    lines = ["\n                Running experiment",
             "          subdirectory: %s" % subdir,
             "         learning rate: %g" % learning_rate,
             "        attention size: %s" % window_size,
             "          n_iterations: %d" % n_iter,
             "     encoder dimension: %d" % enc_dim,
             "     decoder dimension: %d" % dec_dim,
             "            batch size: %d" % batch_size,
             "                epochs: %d" % epochs,
             ]
    for line in lines:
        print(line)
    print()

    rectifier = Rectifier()
    conv1 = Convolutional(filter_size=filter_size2, num_filters=int(num_filters / 2), num_channels=channels, image_size=(w_height, w_width), border_mode='half',
                          name='conv1', **convinits)
    conv1_bn = SpatialBatchNormalization(input_dim=(64, 26, 26), conserve_memory=False, n_iter=n_iter, name='conv1_bn')
    conv2 = Convolutional(filter_size=filter_size2, num_channels=int(num_filters / 2), num_filters=int(num_filters / 2), image_size=(26, 26), name='conv2',
                          **convinits)
    conv2_bn = SpatialBatchNormalization(input_dim=(64, 24, 24), conserve_memory=False, n_iter=n_iter, name='conv2_bn')
    max_pooling = MaxPooling(pooling_size=(2, 2), step=(2, 2))
    conv3 = Convolutional(filter_size=filter_size2, num_filters=num_filters, num_channels=int(num_filters / 2), image_size=(12, 12), border_mode='half',
                          name='conv3', **convinits)
    conv3_bn = SpatialBatchNormalization(input_dim=(128, 12, 12), conserve_memory=False, n_iter=n_iter, name='conv3_bn')
    conv4 = Convolutional(filter_size=filter_size2, num_filters=num_filters, num_channels=num_filters, image_size=(12, 12), border_mode='half', name='conv4',
                          **convinits)
    conv4_bn = SpatialBatchNormalization(input_dim=(128, 12, 12), conserve_memory=False, n_iter=n_iter, name='conv4_bn')
    # Max Pooling
    conv5 = Convolutional(filter_size=filter_size2, num_filters=160, num_channels=num_filters, image_size=(6, 6), border_mode='half', name='conv5',
                          **convinits)
    conv5_bn = SpatialBatchNormalization(input_dim=(160, 6, 6), conserve_memory=False, n_iter=n_iter, name='conv5_bn')
    conv6 = Convolutional(filter_size=filter_size2, num_filters=192, num_channels=160, image_size=(6, 6), name='conv6', **convinits)
    conv6_bn = SpatialBatchNormalization(input_dim=(192, 4, 4), conserve_memory=False, n_iter=n_iter, name='conv6_bn')

    conv_mlp = MLP(activations=[Identity()], dims=[3072, fc_dim], name="MLP_conv", **inits)
    conv_mlp_bn = BatchNormalization(input_dim=fc_dim, conserve_memory=False, n_iter=n_iter, name='conv_mlp_bn')
    loc_mlp = MLP(activations=[Identity()], dims=[6, fc_dim], name="MLP_loc", **inits)
    loc_mlp_bn = BatchNormalization(input_dim=fc_dim, conserve_memory=False, n_iter=n_iter, name='loc_mlp_bn')
    encoder_mlp = MLP([Identity()], [fc_dim, 4 * enc_dim], name="MLP_enc", **rec_inits)
    decoder_mlp = MLP([Identity()], [enc_dim, 4 * dec_dim], name="MLP_dec", **rec_inits)
    encoder_rnn = LSTM(activation=Tanh(), dim=enc_dim, name="RNN_enc", **rnninits)

    conv_init = ConvolutionalSequence(
        [Convolutional(filter_size=filter_size1, num_filters=int(num_filters / 8), name='conv1_init'),
         SpatialBatchNormalization(conserve_memory=False, name='conv1_bn_init'),
         Convolutional(filter_size=filter_size2, num_filters=int(num_filters / 8), name='conv2_init'),
         SpatialBatchNormalization(conserve_memory=False, name='conv2_bn_init'),
         Convolutional(filter_size=filter_size2, num_filters=int(num_filters / 4), name='conv3_init'),
         SpatialBatchNormalization(conserve_memory=False, name='conv3_bn_init'),
         ], image_size=(12, 12), num_channels=channels, name='conv_seq_init', **convinits)

    decoder_rnn = LSTM(activation=Tanh(), dim=dec_dim, name="RNN_dec", **rnninits)
    emit_mlp = MLP(activations=[Tanh()], dims=[dec_dim, 6], name='emit_mlp', weights_init=Constant(0.),
                   biases_init=Constant((1., 0., 0., 0., 1., 0.)))

    classification_mlp1 = MLP(activations=[Identity()], dims=[enc_dim, fc_dim], name='MPL_class1', **inits)
    classification_mlp1_bn = BatchNormalization(input_dim=fc_dim, conserve_memory=False, n_iter=n_iter, name='classification_mlp1_bn')
    classification_mlp2 = MLP(activations=[Identity()], dims=[fc_dim, fc_dim], name='MPL_class2', **inits)
    classification_mlp2_bn = BatchNormalization(input_dim=fc_dim, conserve_memory=False, n_iter=n_iter, name='classification_mlp2_bn')
    classification_mlp3 = MLP(activations=[Softmax()], dims=[fc_dim, num_classes], name='MPL_class3', **inits)

    edram = EDRAM(channels=channels, out_height=w_height, out_width=w_width, n_iter=n_iter, num_classes=num_classes, rectifier=rectifier, conv1=conv1,
                  conv1_bn=conv1_bn, conv2=conv2, conv2_bn=conv2_bn, max_pooling=max_pooling, conv3=conv3, conv3_bn=conv3_bn, conv4=conv4, conv4_bn=conv4_bn,
                  conv5=conv5, conv5_bn=conv5_bn, conv6=conv6, conv6_bn=conv6_bn, conv_mlp=conv_mlp, conv_mlp_bn=conv_mlp_bn,
                  loc_mlp=loc_mlp, loc_mlp_bn=loc_mlp_bn, conv_init=conv_init, encoder_mlp=encoder_mlp, encoder_rnn=encoder_rnn, decoder_mlp=decoder_mlp,
                  decoder_rnn=decoder_rnn, classification_mlp1=classification_mlp1, classification_mlp1_bn=classification_mlp1_bn,
                  classification_mlp2=classification_mlp2, classification_mlp2_bn=classification_mlp2_bn, classification_mlp3=classification_mlp3,
                  emit_mlp=emit_mlp)
    edram.initialize()

    # ------------------------------------------------------------------------
    x = T.ftensor4('features')
    x_coarse = T.ftensor4('features_coarse')
    y = T.ivector('labels')
    wr = T.fmatrix('locations')

    with batch_normalization(edram):
        bn_p, bn_l, m_c1_bn, s_c1_bn, m_c2_bn, s_c2_bn, m_c3_bn, s_c3_bn, m_c4_bn, s_c4_bn, m_c5_bn, s_c5_bn, m_c6_bn, s_c6_bn, \
        m_c_bn, s_c_bn, m_l_bn, s_l_bn, m_cl1_bn, s_cl1_bn, m_cl2_bn, s_cl2_bn = edram.calculate_train(x, x_coarse)

    def compute_cost(p, wr, y, l):
        cost_where = T.dot(T.sqr(wr - l), [1, 0.5, 1, 0.5, 1, 1])
        cost_y = T.stack([T.nnet.categorical_crossentropy(T.maximum(p[i, :], 1e-7), y) for i in range(0, n_iter)])

        return cost_where, cost_y

    cost_where, cost_y = compute_cost(bn_p, wr, y, bn_l)
    bn_cost = cost_y + cost_where
    bn_cost = bn_cost.sum(axis=0)
    bn_cost = bn_cost.mean()
    bn_cost.name = 'cost'

    bn_error_rate = MisclassificationRate().apply(y, bn_p[-1])
    bn_error_rate.name = 'error_rate'

    # ------------------------------------------------------------
    bn_cg = ComputationGraph([bn_cost, bn_error_rate])

    # Prepare algorithm
    algorithm = GradientDescent(
        cost=bn_cg.outputs[0],
        on_unused_sources='ignore',
        parameters=bn_cg.parameters,
        step_rule=CompositeRule([
            RemoveNotFinite(),
            StepClipping(10.),
            Adam(learning_rate)
        ])
    )

    pop_updates = get_batch_normalization_updates(bn_cg)
    update_params = [conv1_bn.population_mean, conv1_bn.population_stdev, conv2_bn.population_mean, conv2_bn.population_stdev, conv3_bn.population_mean,
                     conv3_bn.population_stdev, conv4_bn.population_mean, conv4_bn.population_stdev, conv5_bn.population_mean, conv5_bn.population_stdev,
                     conv6_bn.population_mean, conv6_bn.population_stdev, conv_mlp_bn.population_mean, conv_mlp_bn.population_stdev,
                     loc_mlp_bn.population_mean, loc_mlp_bn.population_stdev, classification_mlp1_bn.population_mean, classification_mlp1_bn.population_stdev,
                     classification_mlp2_bn.population_mean, classification_mlp2_bn.population_stdev]
    update_values = [m_c1_bn, s_c1_bn, m_c2_bn, s_c2_bn, m_c3_bn, s_c3_bn, m_c4_bn, s_c4_bn, m_c5_bn, s_c5_bn, m_c6_bn, s_c6_bn, m_c_bn, s_c_bn, m_l_bn, s_l_bn,
                     m_cl1_bn, s_cl1_bn, m_cl2_bn, s_cl2_bn]

    pop_updates.extend([(p, m) for p, m in zip(update_params, update_values)])

    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate)) for p, m in pop_updates]
    algorithm.add_updates(extra_updates)

    # ------------------------------------------------------------------------
    # Setup monitors

    p, l = edram.calculate_test(x, x_coarse)
    cost_where, cost_y = compute_cost(p, wr, y, l)
    cost = cost_y + cost_where
    cost = cost.sum(axis=0)
    cost = cost.mean()
    cost.name = 'cost'

    error_rate = MisclassificationRate().apply(y, p[-1])
    error_rate.name = 'error_rate'
    monitors = [cost, error_rate]

    plotting_extensions = []
    # Live plotting...
    if live_plotting:
        plot_channels = [
            ['train_cost', 'test_cost'],
            ['train_error_rate', 'test_error_rate'],
        ]
        plotting_extensions = [
            Plot(subdir, channels=plot_channels, server_url='http://155.69.150.60:80/')
        ]

    # ------------------------------------------------------------

    mnist_cluttered_train = MNISTCluttered(which_sets=['train'], sources=('features', 'locations', 'labels'))
    mnist_cluttered_test = MNISTCluttered(which_sets=['test'], sources=('features', 'locations', 'labels'))

    main_loop = MainLoop(
        model=Model([bn_cost]),
        data_stream=DataStream.default_stream(mnist_cluttered_train, iteration_scheme=ShuffledScheme(mnist_cluttered_train.num_examples, batch_size)),
        algorithm=algorithm,
        extensions=[Timing(),
                    FinishAfter(after_n_epochs=epochs),
                    DataStreamMonitoring(monitors,
                                         DataStream.default_stream(mnist_cluttered_train, iteration_scheme=SequentialScheme(mnist_cluttered_train.num_examples,
                                                                                                                            batch_size)), prefix='train'),
                    DataStreamMonitoring(monitors,
                                         DataStream.default_stream(mnist_cluttered_test,
                                                                   iteration_scheme=SequentialScheme(mnist_cluttered_test.num_examples, batch_size)),
                                         prefix="test"),
                    PartsOnlyCheckpoint("{}/{}".format(subdir, name), before_training=False, after_epoch=True, save_separately=['log', ]),
                    TrackTheBest('test_error_rate', 'best_test_error_rate'),
                    BestCheckpount("{}/{}".format(subdir, name), 'best_test_error_rate', save_separately=['model', ]),
                    Printing(),
                    ProgressBar(),
                    PrintingTo("\n".join(lines), "{}/{}_log.txt".format(subdir, name)), ] + plotting_extensions)
    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_parameter_values(oldmodel.get_parameter_values())

            main_loop.model.get_top_bricks()[0].conv1_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv1_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv1_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv1_bn.population_stdev.get_value())

            main_loop.model.get_top_bricks()[0].conv2_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv2_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv2_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv2_bn.population_stdev.get_value())

            main_loop.model.get_top_bricks()[0].conv3_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv3_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv3_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv3_bn.population_stdev.get_value())

            main_loop.model.get_top_bricks()[0].conv4_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv4_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv4_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv4_bn.population_stdev.get_value())

            main_loop.model.get_top_bricks()[0].conv5_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv5_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv5_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv5_bn.population_stdev.get_value())

            main_loop.model.get_top_bricks()[0].conv6_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv6_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv6_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv6_bn.population_stdev.get_value())

            main_loop.model.get_top_bricks()[0].loc_mlp_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].loc_mlp_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].loc_mlp_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].loc_mlp_bn.population_stdev.get_value())

            main_loop.model.get_top_bricks()[0].conv_mlp_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv_mlp_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv_mlp_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv_mlp_bn.population_stdev.get_value())

            main_loop.model.get_top_bricks()[0].classification_mlp1_bn.population_mean.set_value(
                oldmodel.get_top_bricks()[0].classification_mlp1_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].classification_mlp1_bn.population_stdev.set_value(
                oldmodel.get_top_bricks()[0].classification_mlp1_bn.population_stdev.get_value())

            main_loop.model.get_top_bricks()[0].classification_mlp2_bn.population_mean.set_value(
                oldmodel.get_top_bricks()[0].classification_mlp2_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].classification_mlp2_bn.population_stdev.set_value(
                oldmodel.get_top_bricks()[0].classification_mlp2_bn.population_stdev.get_value())

            main_loop.model.get_top_bricks()[0].conv_init.layers[1].population_mean.set_value(
                oldmodel.get_top_bricks()[0].conv_init.layers[1].population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv_init.layers[1].population_stdev.set_value(
                oldmodel.get_top_bricks()[0].conv_init.layers[1].population_stdev.get_value())

            main_loop.model.get_top_bricks()[0].conv_init.layers[3].population_mean.set_value(
                oldmodel.get_top_bricks()[0].conv_init.layers[3].population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv_init.layers[3].population_stdev.set_value(
                oldmodel.get_top_bricks()[0].conv_init.layers[3].population_stdev.get_value())

            main_loop.model.get_top_bricks()[0].conv_init.layers[5].population_mean.set_value(
                oldmodel.get_top_bricks()[0].conv_init.layers[5].population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv_init.layers[5].population_stdev.set_value(
                oldmodel.get_top_bricks()[0].conv_init.layers[5].population_stdev.get_value())
        del oldmodel
    main_loop.run()
Beispiel #31
0
def train(cli_params):
    cli_params['save_dir'] = prepare_dir(cli_params['save_to'])
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data, whiten, cnorm = setup_data(p, test_set=False)

    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim, ) + p.encoder_layers

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    training_algorithm = GradientDescent(
        cost=ladder.costs.total,
        params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    model = Model(ladder.costs.total)

    monitored_variables = [
        ladder.costs.class_corr,
        ladder.costs.class_clean,
        ladder.error,
        #         training_algorithm.total_gradient_norm,
        ladder.costs.total] \
#         + ladder.costs.denois.values()

    # Make a global history recorder so that we can get summary at end of
    # training when we write to Sentinel
    # global_history records all relevant monitoring vars
    # updated by SaveLog every time
    global_history = {}

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(data.train,
                        data.train_ind,
                        p.batch_size,
                        n_labeled=p.labeled_samples,
                        n_unlabeled=p.unlabeled_samples,
                        whiten=whiten,
                        cnorm=cnorm),
        model=model,
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),

            # write out to sentinel file for experiment automator to work
            SentinelWhenFinish(save_dir=p.save_dir,
                               global_history=global_history),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(monitored_variables,
                                 make_datastream(data.valid,
                                                 data.valid_ind,
                                                 p.valid_batch_size,
                                                 whiten=whiten,
                                                 cnorm=cnorm,
                                                 scheme=ShuffledScheme),
                                 prefix="valid_approx"),

            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(monitored_variables,
                                make_datastream(data.train,
                                                data.train_ind,
                                                p.batch_size,
                                                n_labeled=p.labeled_samples,
                                                whiten=whiten,
                                                cnorm=cnorm,
                                                scheme=ShuffledScheme),
                                make_datastream(data.valid,
                                                data.valid_ind,
                                                p.valid_batch_size,
                                                n_labeled=len(data.valid_ind),
                                                whiten=whiten,
                                                cnorm=cnorm,
                                                scheme=ShuffledScheme),
                                prefix="valid_final",
                                after_n_epochs=p.num_epochs),
            TrainingDataMonitoring(variables=monitored_variables,
                                   prefix="train",
                                   after_epoch=True),
            SaveParams('valid_approx_cost_class_corr', model, p.save_dir),
            #             SaveParams(None, all_params, p.save_dir, after_epoch=True),
            SaveExpParams(p, p.save_dir, before_training=True),
            SaveLog(save_dir=p.save_dir,
                    after_epoch=True,
                    global_history=global_history),
            Printing(),
            #             ShortPrinting(short_prints),
            LRDecay(ladder.lr,
                    p.num_epochs * p.lrate_decay,
                    p.num_epochs,
                    after_epoch=True),
        ])
    main_loop.run()

    # Get results
    df = main_loop.log.to_dataframe()
    col = 'valid_final_error_rate'
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return df
Beispiel #32
0
def run(batch_size, save_path, z_dim, oldmodel, discriminative_regularization,
        classifier, vintage, monitor_every, monitor_before, checkpoint_every, dataset, color_convert,
        image_size, net_depth, subdir,
        reconstruction_factor, kl_factor, discriminative_factor, disc_weights,
        num_epochs):

    if dataset:
        streams = create_custom_streams(filename=dataset,
                                        training_batch_size=batch_size,
                                        monitoring_batch_size=batch_size,
                                        include_targets=False,
                                        color_convert=color_convert)
    else:
        streams = create_celeba_streams(training_batch_size=batch_size,
                                        monitoring_batch_size=batch_size,
                                        include_targets=False)

    main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3]

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    rval = create_training_computation_graphs(
                z_dim, image_size, net_depth, discriminative_regularization, classifier,
                vintage, reconstruction_factor, kl_factor, discriminative_factor, disc_weights)
    cg, bn_cg, variance_parameters = rval

    pop_updates = list(
        set(get_batch_normalization_updates(bn_cg, allow_duplicates=True)))
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    model = Model(bn_cg.outputs[0])

    selector = Selector(
        find_bricks(
            model.top_bricks,
            lambda brick: brick.name in ('encoder_convnet', 'encoder_mlp',
                                         'decoder_convnet', 'decoder_mlp')))
    parameters = list(selector.get_parameters().values()) + variance_parameters

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_cg.outputs[0],
                                parameters=parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    sys.setrecursionlimit(1000000)

    monitored_quantities_list = []
    for graph in [bn_cg, cg]:
        # cost, kl_term, reconstruction_term, discriminative_term = graph.outputs
        cost, kl_term, reconstruction_term, discriminative_term = graph.outputs[:4]
        discriminative_layer_terms = graph.outputs[4:]

        cost.name = 'nll_upper_bound'
        avg_kl_term = kl_term.mean(axis=0)
        avg_kl_term.name = 'avg_kl_term'
        avg_reconstruction_term = -reconstruction_term.mean(axis=0)
        avg_reconstruction_term.name = 'avg_reconstruction_term'
        avg_discriminative_term = discriminative_term.mean(axis=0)
        avg_discriminative_term.name = 'avg_discriminative_term'

        num_layer_terms = len(discriminative_layer_terms)
        avg_discriminative_layer_terms = [None] * num_layer_terms
        for i, term in enumerate(discriminative_layer_terms):
            avg_discriminative_layer_terms[i] = discriminative_layer_terms[i].mean(axis=0)
            avg_discriminative_layer_terms[i].name = "avg_discriminative_term_layer_{:02d}".format(i)

        monitored_quantities_list.append(
            [cost, avg_kl_term, avg_reconstruction_term,
             avg_discriminative_term] + avg_discriminative_layer_terms)

    train_monitoring = DataStreamMonitoring(
        monitored_quantities_list[0], train_monitor_stream, prefix="train",
        updates=extra_updates, after_epoch=False, before_first_epoch=monitor_before,
        every_n_epochs=monitor_every)
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities_list[1], valid_monitor_stream, prefix="valid",
        after_epoch=False, before_first_epoch=monitor_before,
        every_n_epochs=monitor_every)

    # Prepare checkpoint
    checkpoint = Checkpoint(save_path, every_n_epochs=checkpoint_every,
                            before_training=True, use_cpickle=True)

    sample_checkpoint = SampleCheckpoint(interface=DiscGenModel, z_dim=z_dim/2,
                            image_size=(image_size, image_size), channels=3,
                            dataset=dataset, split="valid", save_subdir=subdir,
                            before_training=True, after_epoch=True)
    # TODO: why does z_dim=foo become foo/2?
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  checkpoint,
                  sample_checkpoint,
                  train_monitoring, valid_monitoring, 
                  Printing(),
                  ProgressBar()]
    main_loop = MainLoop(model=model, data_stream=main_loop_stream,
                         algorithm=algorithm, extensions=extensions)

    if oldmodel is not None:
        print("Initializing parameters with old model {}".format(oldmodel))
        try:
            saved_model = load(oldmodel)
        except AttributeError:
            # newer version of blocks
            with open(oldmodel, 'rb') as src:
                saved_model = load(src)
        main_loop.model.set_parameter_values(
            saved_model.model.get_parameter_values())
        del saved_model

    main_loop.run()
Beispiel #33
0
def train(ladder,
          batch_size=100,
          num_train_examples=60000,
          num_epochs=150,
          lrate_decay=0.67):
    # Setting Logger
    timestr = time.strftime("%Y_%m_%d_at_%H_%M")
    save_path = 'results/mnist_100_standard_' + timestr
    log_path = os.path.join(save_path, 'log.txt')
    os.makedirs(save_path)
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    # Training
    model = Model(ladder.costs.total)
    all_params = model.parameters
    print len(all_params)
    print all_params

    training_algorithm = GradientDescent(
        cost=ladder.costs.total,
        parameters=all_params,
        step_rule=Adam(learning_rate=ladder.lr))

    # Fetch all batch normalization updates. They are in the clean path.
    # In addition to actual training, also do BN variable approximations
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    training_algorithm.add_updates(bn_updates)

    monitored_variables = [
        ladder.costs.class_corr, ladder.costs.class_clean, ladder.error,
        training_algorithm.total_gradient_norm, ladder.costs.total
    ] + ladder.costs.denois.values()

    train_data_stream, test_data_stream = get_mixed_streams(batch_size)

    train_monitoring = TrainingDataMonitoring(variables=monitored_variables,
                                              prefix="train",
                                              after_epoch=True)

    valid_monitoring = DataStreamMonitoring(variables=monitored_variables,
                                            data_stream=test_data_stream,
                                            prefix="test",
                                            after_epoch=True)

    main_loop = MainLoop(algorithm=training_algorithm,
                         data_stream=train_data_stream,
                         model=model,
                         extensions=[
                             train_monitoring, valid_monitoring,
                             FinishAfter(after_n_epochs=num_epochs),
                             SaveParams('test_CE_corr', model, save_path),
                             SaveLog(save_path, after_epoch=True),
                             LRDecay(lr=ladder.lr,
                                     decay_first=num_epochs * lrate_decay,
                                     decay_last=num_epochs,
                                     after_epoch=True),
                             Printing()
                         ])
    main_loop.run()
Beispiel #34
0
def train_model(cost, cross_entropy, updates,
                train_stream, valid_stream, args, gate_values=None):

    step_rule = learning_algorithm(args)
    cg = ComputationGraph(cost)

    # ADD REGULARIZATION
    # WEIGHT NOISE
    weight_noise = args.weight_noise
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cg_train = apply_noise(cg, weights, weight_noise)
        cost = cg_train.outputs[0]
    cost.name = "cost_with_weight_noise"
    cg = ComputationGraph(cost)

    logger.info(cg.parameters)

    algorithm = GradientDescent(cost=cost, step_rule=step_rule,
                                params=cg.parameters)
    algorithm.add_updates(updates)

    # extensions to be added
    extensions = []
    if args.load_path is not None:
        extensions.append(Load(args.load_path))

    outputs = [
        variable for variable in cg.variables if variable.name == "presoft"]

    if args.generate:
        extensions.append(TextGenerationExtension(
            outputs=outputs,
            generation_length=args.generated_text_lenght,
            initial_text_length=args.initial_text_length,
            every_n_batches=args.monitoring_freq,
            ploting_path=os.path.join(args.save_path, 'prob_plot.png'),
            softmax_sampling=args.softmax_sampling,
            dataset=args.dataset,
            updates=updates,
            interactive_mode=args.interactive_mode))
    extensions.extend([
        TrainingDataMonitoring([cost], prefix='train',
                               every_n_batches=args.monitoring_freq,
                               after_epoch=True),
        DataStreamMonitoring([cost, cross_entropy],
                             valid_stream, args.mini_batch_size_valid,
                             state_updates=updates,
                             prefix='valid',
                             before_first_epoch=not(args.visualize_gates),
                             every_n_batches=args.monitoring_freq),
        ResetStates([v for v, _ in updates], every_n_batches=100),
        ProgressBar()])
    # Creating directory for saving model.
    if not args.interactive_mode:
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        else:
            raise Exception('Directory already exists')
    early_stopping = EarlyStopping('valid_cross_entropy',
                                   args.patience, args.save_path,
                                   every_n_batches=args.monitoring_freq)

    # Visualizing extensions
    if args.interactive_mode:
        extensions.append(InteractiveMode())
    if args.visualize_gates and (gate_values is not None):
        if args.rnn_type == "lstm":
            extensions.append(VisualizeGateLSTM(gate_values, updates,
                                                args.dataset,
                                                ploting_path=None))
        elif args.rnn_type == "soft":
            extensions.append(VisualizeGateSoft(gate_values, updates,
                                                args.dataset,
                                                ploting_path=None))
        else:
            assert(False)

    extensions.append(early_stopping)
    extensions.append(Printing(every_n_batches=args.monitoring_freq))

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions
    )
    main_loop.run()
Beispiel #35
0
def train(ladder, batch_size=100, labeled_samples=100,
          unlabeled_samples=50000, valid_set_size=10000,
          num_epochs=150, valid_batch_size=100, lrate_decay=0.67,
          save_path='results/mnist_100_full0'):
    # Setting Logger
    log_path = os.path.join(save_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info('Logging into %s' % log_path)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    training_algorithm = GradientDescent(
        cost=ladder.costs.total, params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))

    # Fetch all batch normalization updates. They are in the clean path.
    # In addition to actual training, also do BN variable approximations
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    training_algorithm.add_updates(bn_updates)

    monitored_variables = [
        ladder.costs.class_corr, ladder.costs.class_clean,
        ladder.error, training_algorithm.total_gradient_norm,
        ladder.costs.total] + ladder.costs.denois.values()

    data = get_mnist_data_dict(unlabeled_samples=unlabeled_samples,
                               valid_set_size=valid_set_size)

    train_data_stream = make_datastream(
        data.train, data.train_ind, batch_size,
        n_labeled=labeled_samples,
        n_unlabeled=unlabeled_samples)

    valid_data_stream = make_datastream(
        data.valid, data.valid_ind, valid_batch_size,
        n_labeled=len(data.valid_ind),
        n_unlabeled=len(data.valid_ind))

    train_monitoring = TrainingDataMonitoring(
        variables=monitored_variables,
        prefix="train",
        after_epoch=True)

    valid_monitoring = DataStreamMonitoring(
        variables=monitored_variables,
        data_stream=valid_data_stream,
        prefix="valid",
        after_epoch=True)

    main_loop = MainLoop(
        algorithm=training_algorithm,
        data_stream=train_data_stream,
        model=Model(ladder.costs.total),
        extensions=[
            train_monitoring,
            valid_monitoring,
            FinishAfter(after_n_epochs=num_epochs),
            SaveParams(None, all_params, save_path, after_epoch=True),
            SaveLog(save_path, after_training=True),
            LRDecay(lr=ladder.lr,
                    decay_first=num_epochs * lrate_decay,
                    decay_last=num_epochs,
                    after_epoch=True),
            Printing()])
    main_loop.run()
Beispiel #36
0
                                              batch_size=m)))

# uwaga: dyskryminator bierze 2m probek, pierwsze m to generowane, kolejne m to z danych

# observables.append(cost_discriminator)

generator_cost = theano.shared(value=np.array(0., dtype=np.float32), name='g_cost')
discriminator_cost = theano.shared(value=np.array(0., dtype=np.float32), name='d_cost')

generator_step_norm = theano.shared(value=np.array(0., dtype=np.float32), name='g_step_norm')
generator_grad_norm = theano.shared(value=np.array(0., dtype=np.float32), name='g_grad_norm')
discriminator_step_norm = theano.shared(value=np.array(0., dtype=np.float32), name='d_step_norm')
discriminator_grad_norm = theano.shared(value=np.array(0., dtype=np.float32), name='d_grad_norm')

discriminator_descent.add_updates([
    (discriminator_cost, ComputationGraph(cost_discriminator).outputs[0]),
    (discriminator_step_norm, discriminator_descent.total_step_norm),
    (discriminator_grad_norm, discriminator_descent.total_gradient_norm)])

generator_descent.add_updates([
    (generator_cost, ComputationGraph(cost_generator).outputs[0]),
    (generator_step_norm, generator_descent.total_step_norm),
    (generator_grad_norm, generator_descent.total_gradient_norm)])

observables = []
observables.append(generator_cost)
observables.append(discriminator_cost)
observables.append(generator_step_norm)
observables.append(generator_grad_norm)
observables.append(discriminator_step_norm)
observables.append(discriminator_grad_norm)
Beispiel #37
0
    # Build datastream
    train_stream = datastream.setup_datastream(config.dataset,
                                               config.num_seqs,
                                               config.seq_len,
                                               config.seq_div_size)

    # Build model
    m = config.Model(config)

    # Train the model
    cg = Model(m.sgd_cost)
    algorithm = GradientDescent(cost=m.sgd_cost,
                                step_rule=config.step_rule,
                                parameters=cg.parameters)

    algorithm.add_updates(m.states)

    monitor_vars = list(set(v for p in m.monitor_vars for v in p))
    extensions = [
            ProgressBar(),
            TrainingDataMonitoring(
                monitor_vars,
                prefix='train', every_n_batches=config.monitor_freq),
            Printing(every_n_batches=config.monitor_freq, after_epoch=False),

            ResetStates([v for v, _ in m.states], after_epoch=True)
    ]
    if plot_avail:
        plot_channels = [['train_' + v.name for v in p] for p in m.monitor_vars]
        extensions.append(
            Plot(document='text_'+model_name,
Beispiel #38
0
def train_model(cost,
                unregularized_cost,
                updates,
                train_stream,
                valid_stream,
                args,
                gate_values=None):

    step_rule = learning_algorithm(args)
    cg = ComputationGraph(cost)

    # ADD REGULARIZATION
    # WEIGHT NOISE
    weight_noise = args.weight_noise
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cg_train = apply_noise(cg, weights, weight_noise)
        cost = cg_train.outputs[0]
    cost.name = "cost_with_weight_noise"
    cg = ComputationGraph(cost)

    logger.info(cg.parameters)

    # Define algorithm
    algorithm = GradientDescent(cost=cost,
                                step_rule=step_rule,
                                parameters=cg.parameters)
    # Add the updates to carry the hidden state
    algorithm.add_updates(updates)

    # Extensions to be added
    extensions = []

    # Load from a dumped model
    if args.load_path is not None:
        if args.fine_tuning:
            cost = fine_tuning(cost, args)
        else:
            extensions.append(Load(args.load_path))

    # Generation extension
    if args.generate:
        extensions.append(
            TextGenerationExtension(
                cost=cost,
                generation_length=args.generated_text_lenght,
                initial_text_length=args.initial_text_length,
                every_n_batches=1,
                ploting_path=os.path.join(args.save_path, 'prob_plot.png'),
                softmax_sampling=args.softmax_sampling,
                dataset=args.dataset,
                updates=updates,
                interactive_mode=args.interactive_mode))

    # Training and Validation score monitoring
    extensions.extend([
        TrainingDataMonitoring([cost],
                               prefix='train',
                               every_n_batches=args.monitoring_freq),
        DataStreamMonitoring([cost, unregularized_cost],
                             valid_stream,
                             args.mini_batch_size_valid,
                             args.dataset,
                             state_updates=updates,
                             prefix='valid',
                             before_first_epoch=(args.visualize is None),
                             every_n_batches=args.monitoring_freq)
    ])

    # Creating directory for saving model.
    if not args.interactive_mode:
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        elif 'test' in args.save_path:
            print("Rewriting in " + args.save_path)
        else:
            raise Exception('Directory already exists')

    # Early stopping
    extensions.append(
        EarlyStopping('valid_' + unregularized_cost.name,
                      args.patience,
                      args.save_path,
                      every_n_batches=args.monitoring_freq))

    # Printing
    extensions.append(ProgressBar())
    extensions.append(Printing(every_n_batches=args.monitoring_freq))

    # Reset the initial states
    if args.dataset == "sine":
        reset_frequency = 1
    else:
        reset_frequency = 100
    extensions.append(
        ResetStates([v for v, _ in updates], every_n_batches=reset_frequency))

    # Visualizing extensions
    if args.interactive_mode:
        extensions.append(InteractiveMode())

    main_loop = MainLoop(model=Model(cost),
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    # This is where the magic happens!
    main_loop.run()
Beispiel #39
0
def train_rnnrbm(train,
                 rnnrbm,
                 epochs=1000,
                 test=None,
                 bokeh=True,
                 load_path=None):
    cdk = theano.shared(10)
    lr = theano.shared(float32(0.004))

    cost, v_sample = rnnrbm.cost(examples=x, mask=x_mask, k=cdk)

    error_rate = MismulitclassificationRate().apply(x, v_sample[-1], x_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(x, v_sample[-1], x_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'rbm_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])
    step_rule = CompositeRule([
        RemoveNotFinite(),
        StepClipping(30.0),
        Adam(learning_rate=lr),
        StepClipping(6.0),
        RemoveNotFinite()
    ])  # Scale(0.01)
    gradients = dict(
        equizip(cg.parameters,
                T.grad(cost, cg.parameters, consider_constant=[v_sample])))
    algorithm = GradientDescent(step_rule=step_rule,
                                gradients=gradients,
                                cost=cost,
                                params=cg.parameters)
    algorithm.add_updates(cg.updates)
    extensions = [
        SharedVariableModifier(parameter=cdk,
                               function=lambda n, v: rnnrbm_cdk[n]
                               if rnnrbm_cdk.get(n) else v),
        SharedVariableModifier(parameter=lr,
                               function=lambda n, v: float32(0.78 * v)
                               if n % (200 * 5) == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [
                cost,
                error_rate,
                mistake_rate,
            ],  # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)],  #+ params,
            prefix="train",
            after_epoch=False,
            every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()
    ]
    if test is not None:
        extensions.append(
            DataStreamMonitoring([cost, error_rate, mistake_rate],
                                 data_stream=test,
                                 updates=cg.updates,
                                 prefix="test",
                                 after_epoch=False,
                                 every_n_batches=40))
    if bokeh:
        extensions.append(
            Plot(
                'Training RNN-RBM',
                channels=[
                    [
                        'train_error on note as a whole',
                        'train_single error within note',
                        'test_error on note as a whole',
                        'test_single error within note'
                    ],
                    ['train_final_cost'],
                    # ['train_total_gradient_norm'],
                ]))

    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions)
    return main_loop
Beispiel #40
0
def train(cli_params):
    cli_params['save_dir'] = prepare_dir(cli_params['save_to'])
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data = setup_data(p, test_set=False)
    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim, ) + p.encoder_layers

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert 'counter' in [u.name for u in list(bn_updates.keys())], \
        'No batch norm params in graph - the graph has been cut?'

    training_algorithm = GradientDescent(
        cost=ladder.costs.total,
        params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    short_prints = {
        "train": {
            'T_C_class': ladder.costs.class_corr,
            'T_C_de': list(ladder.costs.denois.values()),
        },
        "valid_approx":
        OrderedDict([
            ('V_C_class', ladder.costs.class_clean),
            ('V_E', ladder.error.clean),
            ('V_C_de', list(ladder.costs.denois.values())),
        ]),
        "valid_final":
        OrderedDict([
            ('VF_C_class', ladder.costs.class_clean),
            ('VF_E', ladder.error.clean),
            ('VF_C_de', list(ladder.costs.denois.values())),
        ]),
    }

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(data.train,
                        data.train_ind,
                        p.batch_size,
                        n_labeled=p.labeled_samples,
                        n_unlabeled=p.unlabeled_samples),
        model=Model(theano.tensor.cast(ladder.costs.total, "float32")),
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean] +
                list(ladder.costs.denois.values()),
                make_datastream(data.valid,
                                data.valid_ind,
                                p.valid_batch_size,
                                scheme=ShuffledScheme),
                prefix="valid_approx"),
            TrainingDataMonitoring([
                ladder.costs.total, ladder.costs.class_corr,
                training_algorithm.total_gradient_norm
            ] + list(ladder.costs.denois.values()),
                                   prefix="train",
                                   after_epoch=True),
            SaveParams(None, all_params, p.save_dir, after_epoch=True),
            SaveExpParams(p, p.save_dir, before_training=True),
            ShortPrinting(short_prints),
            LRDecay(ladder.lr,
                    p.num_epochs * p.lrate_decay,
                    p.num_epochs,
                    after_epoch=True),
        ])
    main_loop.run()

    # Get results
    df = main_loop.log.to_dataframe()
    col = 'valid_final_error_rate_clean'
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return df
Beispiel #41
0
def train_model(cost, unregularized_cost, updates,
                train_stream, valid_stream, args, gate_values=None):

    step_rule = learning_algorithm(args)
    cg = ComputationGraph(cost)

    # ADD REGULARIZATION
    # WEIGHT NOISE
    weight_noise = args.weight_noise
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cg_train = apply_noise(cg, weights, weight_noise)
        cost = cg_train.outputs[0]
    cost.name = "cost_with_weight_noise"
    cg = ComputationGraph(cost)

    logger.info(cg.parameters)

    # Define algorithm
    algorithm = GradientDescent(cost=cost, step_rule=step_rule,
                                parameters=cg.parameters)
    # Add the updates to carry the hidden state
    algorithm.add_updates(updates)

    # Extensions to be added
    extensions = []

    # Load from a dumped model
    if args.load_path is not None:
        extensions.append(Load(args.load_path))

    # Generation extension
    if args.generate:
        extensions.append(TextGenerationExtension(
            cost=cost,
            generation_length=args.generated_text_lenght,
            initial_text_length=args.initial_text_length,
            every_n_batches=1,
            ploting_path=os.path.join(args.save_path, 'prob_plot.png'),
            softmax_sampling=args.softmax_sampling,
            dataset=args.dataset,
            updates=updates,
            interactive_mode=args.interactive_mode))

    # Training and Validation score monitoring
    extensions.extend([
        TrainingDataMonitoring([cost], prefix='train',
                               every_n_batches=args.monitoring_freq),
        DataStreamMonitoring([cost, unregularized_cost],
                             valid_stream, args.mini_batch_size_valid,
                             args.dataset,
                             state_updates=updates,
                             prefix='valid',
                             before_first_epoch=(args.visualize == "nothing"),
                             every_n_batches=args.monitoring_freq)])

    # Creating directory for saving model.
    if not args.interactive_mode:
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        elif 'test' in args.save_path:
            print "Rewriting in " + args.save_path
        else:
            raise Exception('Directory already exists')

    # Early stopping
    extensions.append(EarlyStopping('valid_' + unregularized_cost.name,
                                    args.patience, args.save_path,
                                    every_n_batches=args.monitoring_freq))

    # Printing
    extensions.append(ProgressBar())
    extensions.append(Printing(every_n_batches=args.monitoring_freq))

    # Reset the initial states
    if args.dataset == "sine":
        reset_frequency = 1
    else:
        reset_frequency = 100
    extensions.append(ResetStates([v for v, _ in updates],
                                  every_n_batches=reset_frequency))

    # Visualizing extensions
    if args.interactive_mode:
        extensions.append(InteractiveMode())

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions
    )
    main_loop.run()
Beispiel #42
0
def main(nvis, nhid, encoding_lstm_dim, decoding_lstm_dim, T=1):
    x = tensor.matrix('features')

    # Construct and initialize model
    encoding_mlp = MLP([Tanh()], [None, None])
    decoding_mlp = MLP([Tanh()], [None, None])
    encoding_lstm = LSTM(dim=encoding_lstm_dim)
    decoding_lstm = LSTM(dim=decoding_lstm_dim)
    draw = DRAW(nvis=nvis, nhid=nhid, T=T, encoding_mlp=encoding_mlp,
                decoding_mlp=decoding_mlp, encoding_lstm=encoding_lstm,
                decoding_lstm=decoding_lstm, biases_init=Constant(0),
                weights_init=Orthogonal())
    draw.push_initialization_config()
    encoding_lstm.weights_init = IsotropicGaussian(std=0.001)
    decoding_lstm.weights_init = IsotropicGaussian(std=0.001)
    draw.initialize()

    # Compute cost
    cost = -draw.log_likelihood_lower_bound(x).mean()
    cost.name = 'nll_upper_bound'
    model = Model(cost)

    # Datasets and data streams
    mnist_train = BinarizedMNIST('train')
    train_loop_stream = ForceFloatX(DataStream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 100)))
    train_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 500)))
    mnist_valid = BinarizedMNIST('valid')
    valid_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_valid,
        iteration_scheme=SequentialScheme(mnist_valid.num_examples, 500)))
    mnist_test = BinarizedMNIST('test')
    test_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples, 500)))

    # Get parameters and monitoring channels
    computation_graph = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(computation_graph.variables)
    monitoring_channels = dict([
        ('avg_' + channel.tag.name, channel.mean()) for channel in
        VariableFilter(name='.*term$')(computation_graph.auxiliary_variables)])
    for name, channel in monitoring_channels.items():
        channel.name = name
    monitored_quantities = monitoring_channels.values() + [cost]

    # Training loop
    step_rule = RMSProp(learning_rate=1e-3, decay_rate=0.95)
    algorithm = GradientDescent(cost=cost, params=params, step_rule=step_rule)
    algorithm.add_updates(computation_graph.updates)
    main_loop = MainLoop(
        model=model, data_stream=train_loop_stream, algorithm=algorithm,
        extensions=[
            Timing(),
            SerializeMainLoop('vae.pkl', save_separately=['model']),
            FinishAfter(after_n_epochs=200),
            DataStreamMonitoring(
                monitored_quantities, train_monitor_stream, prefix="train",
                updates=computation_graph.updates),
            DataStreamMonitoring(
                monitored_quantities, valid_monitor_stream, prefix="valid",
                updates=computation_graph.updates),
            DataStreamMonitoring(
                monitored_quantities, test_monitor_stream, prefix="test",
                updates=computation_graph.updates),
            ProgressBar(),
            Printing()])
    main_loop.run()
Beispiel #43
0
def train(ladder, batch_size=100, num_train_examples=60000,
          num_epochs=150, lrate_decay=0.67):
    # Setting Logger
    timestr = time.strftime("%Y_%m_%d_at_%H_%M")
    save_path = 'results/mnist_100_standard_' + timestr
    log_path = os.path.join(save_path, 'log.txt')
    os.makedirs(save_path)
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    # Training
    model = Model(ladder.costs.total)
    all_params = model.parameters
    print len(all_params)
    print all_params

    training_algorithm = GradientDescent(
        cost=ladder.costs.total, parameters=all_params,
        step_rule=Adam(learning_rate=ladder.lr))

    # Fetch all batch normalization updates. They are in the clean path.
    # In addition to actual training, also do BN variable approximations
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    training_algorithm.add_updates(bn_updates)

    monitored_variables = [
        ladder.costs.class_corr, ladder.costs.class_clean,
        ladder.error, training_algorithm.total_gradient_norm,
        ladder.costs.total] + ladder.costs.denois.values()

    train_data_stream, test_data_stream = get_mixed_streams(
        batch_size)

    train_monitoring = TrainingDataMonitoring(
        variables=monitored_variables,
        prefix="train",
        after_epoch=True)

    valid_monitoring = DataStreamMonitoring(
        variables=monitored_variables,
        data_stream=test_data_stream,
        prefix="test",
        after_epoch=True)

    main_loop = MainLoop(
        algorithm=training_algorithm,
        data_stream=train_data_stream,
        model=model,
        extensions=[
            train_monitoring,
            valid_monitoring,
            FinishAfter(after_n_epochs=num_epochs),
            SaveParams('test_CE_corr', model, save_path),
            SaveLog(save_path, after_epoch=True),
            LRDecay(lr=ladder.lr,
                    decay_first=num_epochs * lrate_decay,
                    decay_last=num_epochs,
                    after_epoch=True),
            Printing()])
    main_loop.run()
    graphs, extensions, updates = construct_graphs(args, nclasses,
                                                   sequence_length)
    #graph, extension, update = construct_graphs(args, nclasses, sequence_length)

    ### optimization algorithm definition
    step_rule = CompositeRule([
        StepClipping(1.),
        #Momentum(learning_rate=args.learning_rate, momentum=0.9),
        RMSProp(learning_rate=args.learning_rate, decay_rate=0.5),
    ])

    algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                                parameters=graphs["training"].parameters,
                                step_rule=step_rule)
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]

    # step monitor (after epoch to limit the log size)
    step_channels = []
    step_channels.extend([
        algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
        for name, param in model.get_parameter_dict().items()
    ])
    step_channels.append(
        algorithm.total_step_norm.copy(name="total_step_norm"))
    step_channels.append(
        algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
    step_channels.extend(graphs["training"].outputs)
    logger.warning("constructing training data monitor")
Beispiel #45
0
def train(cli_params):
    cli_params['save_dir'] = prepare_dir(cli_params['save_to'])
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data, whiten, cnorm = setup_data(p, test_set=True)
    
    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim,) + p.encoder_layers

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    training_algorithm = GradientDescent(
        cost=ladder.costs.total, params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    model=Model(ladder.costs.total)

    monitored_variables = [
        ladder.costs.class_corr, 
        ladder.costs.class_clean,
        ladder.error, 
#         training_algorithm.total_gradient_norm,
        ladder.costs.total] \
#         + ladder.costs.denois.values()

    # Make a global history recorder so that we can get summary at end of 
    # training when we write to Sentinel
    # global_history records all relevant monitoring vars
    # updated by SaveLog every time
    global_history = {}

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(data.train, data.train_ind,
                        p.batch_size,
                        n_labeled=p.labeled_samples,
                        n_unlabeled=p.unlabeled_samples,
                        whiten=whiten,
                        cnorm=cnorm),
        model=model,
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),
            
            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(
                monitored_variables,
                make_datastream(data.valid, data.valid_ind,
                                p.valid_batch_size, whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                prefix="valid_approx"),

            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(
                monitored_variables,
                make_datastream(data.train, data.train_ind,
                                p.batch_size,
                                n_labeled=p.labeled_samples,
                                whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                # DEPREC: we directly test now
#                 make_datastream(data.valid, data.valid_ind,
#                                 p.valid_batch_size,
#                                 n_labeled=len(data.valid_ind),
#                                 whiten=whiten, cnorm=cnorm,
#                                 scheme=ShuffledScheme),
#                 prefix="valid_final",
                make_datastream(data.test, data.test_ind,
                                p.batch_size,
                                n_labeled=len(data.test_ind),
                                whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                prefix="final_test",
                after_n_epochs=p.num_epochs),

            TrainingDataMonitoring(
                variables=monitored_variables,
                prefix="train", after_epoch=True),

            # write out to sentinel file for experiment automator to work
            # REMOVE THIS if you're running test mode with early stopping immediately after
            SentinelWhenFinish(save_dir=p.save_dir,
                               global_history=global_history),

            # originally use 'valid_approx_cost_class_clean'
            # turns out should use ER as early stopping
            # use CE as a fallback (secondary early stopvar) if ER is the same
#             SaveParams(('valid_approx_error_rate', 'valid_approx_cost_class_clean'),
#                         model, p.save_dir),
            # doesn't do early stopping now
            SaveParams(None, model, p.save_dir, after_epoch=True),
            SaveExpParams(p, p.save_dir, before_training=True),
            SaveLog(save_dir=p.save_dir, 
                    after_epoch=True,
                    global_history=global_history),
            Printing(),
#             ShortPrinting(short_prints),
            LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs,
                    after_epoch=True),
        ])
    main_loop.run()
    
    # ================= Add testing at end of training =================
    # DEPREC don't do early stopping anymore
    if False:
        p.load_from = p.save_dir
        ladder = setup_model(p)
        
        logger.info('Start testing on trained_params_best')
        main_loop = DummyLoop(
            extensions=[
                # write to global history
                SaveLog(save_dir=p.save_dir, 
                        after_training=True,
                        global_history=global_history),

                # write out to sentinel file for experiment automator to work
                SentinelWhenFinish(save_dir=p.save_dir,
                                   global_history=global_history),

                FinalTestMonitoring(
                    [ladder.costs.class_clean, ladder.error]
                    + ladder.costs.denois.values(),
                    make_datastream(data.train, data.train_ind,
                                    # These need to match with the training
                                    p.batch_size,
                                    n_labeled=p.labeled_samples,
                                    n_unlabeled=len(data.train_ind),
                                    cnorm=cnorm,
                                    whiten=whiten, scheme=ShuffledScheme),
                    make_datastream(data.test, data.test_ind,
                                    p.batch_size,
                                    n_labeled=len(data.test_ind),
                                    n_unlabeled=len(data.test_ind),
                                    cnorm=cnorm,
                                    whiten=whiten, scheme=ShuffledScheme),
                    prefix="test", 
                    before_training=True)
            ])
        main_loop.run()

    # Get results
    df = main_loop.log.to_dataframe()
#     col = 'valid_final_error_rate'
#     logger.info('%s %g' % (col, df[col].iloc[-1]))

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return df
Beispiel #46
0
def main(name, epochs, batch_size, learning_rate, attention, n_iter, enc_dim,
         dec_dim, z_dim):

    if name is None:
        tag = "watt" if attention else "woatt"
        name = "%s-t%d-enc%d-dec%d-z%d" % (tag, n_iter, enc_dim, dec_dim,
                                           z_dim)

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print()

    #------------------------------------------------------------------------

    x_dim = 28 * 28
    img_height, img_width = (28, 28)

    rnninits = {
        'weights_init': Orthogonal(),
        #'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': Orthogonal(),
        #'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    prior_mu = T.zeros([z_dim])
    prior_log_sigma = T.zeros([z_dim])

    if attention:
        read_N = 4
        write_N = 6
        read_dim = 2 * read_N**2

        reader = AttentionReader(x_dim=x_dim,
                                 dec_dim=dec_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=read_N,
                                 **inits)
        writer = AttentionWriter(input_dim=dec_dim,
                                 output_dim=x_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=read_N,
                                 **inits)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

    encoder = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Tanh()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc",
                      **inits)
    decoder_mlp = MLP([Tanh()], [z_dim, 4 * dec_dim], name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    for brick in [
            reader, writer, encoder, decoder, encoder_mlp, decoder_mlp,
            q_sampler
    ]:
        brick.allocate()
        brick.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    # This is one iteration
    def one_iteration(c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec,
                      x):
        x_hat = x - T.nnet.sigmoid(c)
        r = reader.apply(x, x_hat, h_dec)
        i_enc = encoder_mlp.apply(T.concatenate([r, h_dec], axis=1))
        h_enc, c_enc = encoder.apply(states=h_enc,
                                     cells=c_enc,
                                     inputs=i_enc,
                                     iterate=False)
        z_mean, z_log_sigma, z = q_sampler.apply(h_enc)
        i_dec = decoder_mlp.apply(z)
        h_dec, c_dec = decoder.apply(states=h_dec,
                                     cells=c_dec,
                                     inputs=i_dec,
                                     iterate=False)
        c = c + writer.apply(h_dec)
        return c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec

    outputs_info = [
        T.zeros([batch_size, x_dim]),  # c
        T.zeros([batch_size, enc_dim]),  # h_enc
        T.zeros([batch_size, enc_dim]),  # c_enc
        T.zeros([batch_size, z_dim]),  # z_mean
        T.zeros([batch_size, z_dim]),  # z_log_sigma
        T.zeros([batch_size, z_dim]),  # z
        T.zeros([batch_size, dec_dim]),  # h_dec
        T.zeros([batch_size, dec_dim]),  # c_dec
    ]

    outputs, scan_updates = theano.scan(fn=one_iteration,
                                        sequences=[],
                                        outputs_info=outputs_info,
                                        non_sequences=[x],
                                        n_steps=n_iter)

    c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec = outputs

    kl_terms = (prior_log_sigma - z_log_sigma + 0.5 *
                (tensor.exp(2 * z_log_sigma) +
                 (z_mean - prior_mu)**2) / tensor.exp(2 * prior_log_sigma) -
                0.5).sum(axis=-1)

    x_recons = T.nnet.sigmoid(c[-1, :, :])
    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            #StepClipping(3.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        x_recons_t = T.nnet.sigmoid(c[t, :, :])
        recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        recons_term_t = recons_term_t.mean()
        recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t, recons_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [["train_nll_bound", "test_nll_bound"],
                     ["train_kl_term_%d" % t for t in range(n_iter)],
                     ["train_recons_term_%d" % t for t in range(n_iter)],
                     ["train_total_gradient_norm", "train_total_step_norm"]]

    #------------------------------------------------------------

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])
    #mnist_train = MNIST("train", binary=True, sources=['features'])
    #mnist_test = MNIST("test", binary=True, sources=['features'])

    main_loop = MainLoop(
        model=None,
        data_stream=ForceFloatX(
            DataStream(mnist_train,
                       iteration_scheme=SequentialScheme(
                           mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(
                    DataStream(mnist_test,
                               iteration_scheme=SequentialScheme(
                                   mnist_test.num_examples, batch_size))),
                updates=scan_updates,
                prefix="test"),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()
Beispiel #47
0
def train_snli_model(new_training_job,
                     config,
                     save_path,
                     params,
                     fast_start,
                     fuel_server,
                     seed,
                     model='simple'):
    if config['exclude_top_k'] > config['num_input_words'] and config[
            'num_input_words'] > 0:
        raise Exception("Some words have neither word nor def embedding")
    c = config
    logger = configure_logger(name="snli_baseline_training",
                              log_file=os.path.join(save_path, "log.txt"))
    if not os.path.exists(save_path):
        logger.info("Start a new job")
        os.mkdir(save_path)
    else:
        logger.info("Continue an existing job")
    with open(os.path.join(save_path, "cmd.txt"), "w") as f:
        f.write(" ".join(sys.argv))

    # Make data paths nice
    for path in [
            'dict_path', 'embedding_def_path', 'embedding_path', 'vocab',
            'vocab_def', 'vocab_text'
    ]:
        if c.get(path, ''):
            if not os.path.isabs(c[path]):
                c[path] = os.path.join(fuel.config.data_path[0], c[path])

    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    main_loop_best_val_path = os.path.join(save_path, 'main_loop_best_val.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')

    # Save config to save_path
    json.dump(config, open(os.path.join(save_path, "config.json"), "w"))

    if model == 'simple':
        nli_model, data, used_dict, used_retrieval, _ = _initialize_simple_model_and_data(
            c)
    elif model == 'esim':
        nli_model, data, used_dict, used_retrieval, _ = _initialize_esim_model_and_data(
            c)
    else:
        raise NotImplementedError()

    # Compute cost
    s1, s2 = T.lmatrix('sentence1'), T.lmatrix('sentence2')

    if c['dict_path']:
        assert os.path.exists(c['dict_path'])
        s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix(
            'sentence2_def_map')
        def_mask = T.fmatrix("def_mask")
        defs = T.lmatrix("defs")
    else:
        s1_def_map, s2_def_map = None, None
        def_mask = None
        defs = None

    s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask')
    y = T.ivector('label')

    cg = {}
    for train_phase in [True, False]:
        # NOTE: Please don't change outputs of cg
        if train_phase:
            with batch_normalization(nli_model):
                pred = nli_model.apply(s1,
                                       s1_mask,
                                       s2,
                                       s2_mask,
                                       def_mask=def_mask,
                                       defs=defs,
                                       s1_def_map=s1_def_map,
                                       s2_def_map=s2_def_map,
                                       train_phase=train_phase)
        else:
            pred = nli_model.apply(s1,
                                   s1_mask,
                                   s2,
                                   s2_mask,
                                   def_mask=def_mask,
                                   defs=defs,
                                   s1_def_map=s1_def_map,
                                   s2_def_map=s2_def_map,
                                   train_phase=train_phase)

        cost = CategoricalCrossEntropy().apply(y.flatten(), pred)
        error_rate = MisclassificationRate().apply(y.flatten(), pred)
        cg[train_phase] = ComputationGraph([cost, error_rate])

    # Weight decay (TODO: Make it less bug prone)
    if model == 'simple':
        weights_to_decay = VariableFilter(
            bricks=[dense for dense, relu, bn in nli_model._mlp],
            roles=[WEIGHT])(cg[True].variables)
        weight_decay = np.float32(c['l2']) * sum(
            (w**2).sum() for w in weights_to_decay)
    elif model == 'esim':
        weight_decay = 0.0
    else:
        raise NotImplementedError()

    final_cost = cg[True].outputs[0] + weight_decay
    final_cost.name = 'final_cost'

    # Add updates for population parameters

    if c.get("bn", True):
        pop_updates = get_batch_normalization_updates(cg[True])
        extra_updates = [(p, m * 0.1 + p * (1 - 0.1)) for p, m in pop_updates]
    else:
        pop_updates = []
        extra_updates = []

    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            loaded_params = load_parameters(src)
            cg[True].set_parameter_values(loaded_params)
            for param, m in pop_updates:
                param.set_value(loaded_params[get_brick(
                    param).get_hierarchical_name(param)])

    if os.path.exists(os.path.join(save_path, "main_loop.tar")):
        logger.warning("Manually loading BN stats :(")
        with open(os.path.join(save_path, "main_loop.tar")) as src:
            loaded_params = load_parameters(src)

        for param, m in pop_updates:
            param.set_value(
                loaded_params[get_brick(param).get_hierarchical_name(param)])

    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', batch_size=4).get_epoch_iterator())
        s1.tag.test_value = test_value_data[0]
        s1_mask.tag.test_value = test_value_data[1]
        s2.tag.test_value = test_value_data[2]
        s2_mask.tag.test_value = test_value_data[3]
        y.tag.test_value = test_value_data[4]

    # Freeze embeddings
    if not c['train_emb']:
        frozen_params = [
            p for E in nli_model.get_embeddings_lookups() for p in E.parameters
        ]
        train_params = [p for p in cg[True].parameters]
        assert len(set(frozen_params) & set(train_params)) > 0
    else:
        frozen_params = []
    if not c.get('train_def_emb', 1):
        frozen_params_def = [
            p for E in nli_model.get_def_embeddings_lookups()
            for p in E.parameters
        ]
        train_params = [p for p in cg[True].parameters]
        assert len(set(frozen_params_def) & set(train_params)) > 0
        frozen_params += frozen_params_def
    train_params = [p for p in cg[True].parameters if p not in frozen_params]
    train_params_keys = [
        get_brick(p).get_hierarchical_name(p) for p in train_params
    ]

    # Optimizer
    algorithm = GradientDescent(cost=final_cost,
                                on_unused_sources='ignore',
                                parameters=train_params,
                                step_rule=Adam(learning_rate=c['lr']))
    algorithm.add_updates(extra_updates)
    m = Model(final_cost)

    parameters = m.get_parameter_dict()  # Blocks version mismatch
    logger.info("Trainable parameters" + "\n" +
                pprint.pformat([(key, parameters[key].get_value().shape)
                                for key in sorted(train_params_keys)],
                               width=120))
    logger.info("# of parameters {}".format(
        sum([
            np.prod(parameters[key].get_value().shape)
            for key in sorted(train_params_keys)
        ])))

    ### Monitored args ###
    train_monitored_vars = [final_cost] + cg[True].outputs
    monitored_vars = cg[False].outputs
    val_acc = monitored_vars[1]
    to_monitor_names = [
        'def_unk_ratio', 's1_merged_input_rootmean2', 's1_def_mean_rootmean2',
        's1_gate_rootmean2', 's1_compose_gate_rootmean2'
    ]
    for k in to_monitor_names:
        train_v, valid_v = VariableFilter(name=k)(
            cg[True]), VariableFilter(name=k)(cg[False])
        if len(train_v):
            logger.info("Adding {} tracking".format(k))
            train_monitored_vars.append(train_v[0])
            monitored_vars.append(valid_v[0])
        else:
            logger.warning("Didnt find {} in cg".format(k))

    if c['monitor_parameters']:
        for name in train_params_keys:
            param = parameters[name]
            num_elements = numpy.product(param.get_value().shape)
            norm = param.norm(2) / num_elements
            grad_norm = algorithm.gradients[param].norm(2) / num_elements
            step_norm = algorithm.steps[param].norm(2) / num_elements
            stats = tensor.stack(norm, grad_norm, step_norm,
                                 step_norm / grad_norm)
            stats.name = name + '_stats'
            train_monitored_vars.append(stats)

    regular_training_stream = data.get_stream('train',
                                              batch_size=c['batch_size'],
                                              seed=seed)

    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=regular_training_stream.sources,
            hwm=100,
            produces_examples=regular_training_stream.produces_examples)
    else:
        training_stream = regular_training_stream

    ### Build extensions ###

    extensions = [
        # Load(main_loop_path, load_iteration_state=True, load_log=True)
        #     .set_conditions(before_training=not new_training_job),
        StartFuelServer(regular_training_stream,
                        stream_path,
                        hwm=100,
                        script_path=os.path.join(
                            os.path.dirname(__file__),
                            "../bin/start_fuel_server.py"),
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq']),
        ProgressBar(),
        RetrievalPrintStats(retrieval=used_retrieval,
                            every_n_batches=c['mon_freq_valid'],
                            before_training=not fast_start),
        Timestamp(),
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq']),
    ]

    if c['layout'] == 'snli':
        validation = DataStreamMonitoring(monitored_vars,
                                          data.get_stream('valid',
                                                          batch_size=14,
                                                          seed=seed),
                                          before_training=not fast_start,
                                          on_resumption=True,
                                          after_training=True,
                                          every_n_batches=c['mon_freq_valid'],
                                          prefix='valid')
        extensions.append(validation)
    elif c['layout'] == 'mnli':
        validation = DataStreamMonitoring(monitored_vars,
                                          data.get_stream('valid_matched',
                                                          batch_size=14,
                                                          seed=seed),
                                          every_n_batches=c['mon_freq_valid'],
                                          on_resumption=True,
                                          after_training=True,
                                          prefix='valid_matched')
        validation_mismatched = DataStreamMonitoring(
            monitored_vars,
            data.get_stream('valid_mismatched', batch_size=14, seed=seed),
            every_n_batches=c['mon_freq_valid'],
            before_training=not fast_start,
            on_resumption=True,
            after_training=True,
            prefix='valid_mismatched')
        extensions.extend([validation, validation_mismatched])
    else:
        raise NotImplementedError()

    # Similarity trackers for embeddings
    if len(c.get('vocab_def', '')):
        retrieval_vocab = Vocabulary(c['vocab_def'])
    else:
        retrieval_vocab = data.vocab

    retrieval_all = Retrieval(vocab_text=retrieval_vocab,
                              dictionary=used_dict,
                              max_def_length=c['max_def_length'],
                              exclude_top_k=0,
                              max_def_per_word=c['max_def_per_word'])

    for name in [
            's1_word_embeddings', 's1_dict_word_embeddings',
            's1_translated_word_embeddings'
    ]:
        variables = VariableFilter(name=name)(cg[False])
        if len(variables):
            s1_emb = variables[0]
            logger.info("Adding similarity tracking for " + name)
            # A bit sloppy about downcast

            if "dict" in name:
                embedder = construct_dict_embedder(theano.function(
                    [s1, defs, def_mask, s1_def_map],
                    s1_emb,
                    allow_input_downcast=True),
                                                   vocab=data.vocab,
                                                   retrieval=retrieval_all)
                extensions.append(
                    SimilarityWordEmbeddingEval(
                        embedder=embedder,
                        prefix=name,
                        every_n_batches=c['mon_freq_valid'],
                        before_training=not fast_start))
            else:
                embedder = construct_embedder(theano.function(
                    [s1], s1_emb, allow_input_downcast=True),
                                              vocab=data.vocab)
                extensions.append(
                    SimilarityWordEmbeddingEval(
                        embedder=embedder,
                        prefix=name,
                        every_n_batches=c['mon_freq_valid'],
                        before_training=not fast_start))

    track_the_best = TrackTheBest(validation.record_name(val_acc),
                                  before_training=not fast_start,
                                  every_n_epochs=c['save_freq_epochs'],
                                  after_training=not fast_start,
                                  every_n_batches=c['mon_freq_valid'],
                                  choose_best=min)
    extensions.append(track_the_best)

    # Special care for serializing embeddings
    if len(c.get('embedding_path', '')) or len(c.get('embedding_def_path',
                                                     '')):
        extensions.insert(
            0,
            LoadNoUnpickling(main_loop_path,
                             load_iteration_state=True,
                             load_log=True).set_conditions(
                                 before_training=not new_training_job))
        extensions.append(
            Checkpoint(main_loop_path,
                       parameters=train_params + [p for p, m in pop_updates],
                       save_main_loop=False,
                       save_separately=['log', 'iteration_state'],
                       before_training=not fast_start,
                       every_n_epochs=c['save_freq_epochs'],
                       after_training=not fast_start).add_condition(
                           ['after_batch', 'after_epoch'],
                           OnLogRecord(track_the_best.notification_name),
                           (main_loop_best_val_path, )))
    else:
        extensions.insert(
            0,
            Load(main_loop_path, load_iteration_state=True,
                 load_log=True).set_conditions(
                     before_training=not new_training_job))
        extensions.append(
            Checkpoint(main_loop_path,
                       parameters=cg[True].parameters +
                       [p for p, m in pop_updates],
                       before_training=not fast_start,
                       every_n_epochs=c['save_freq_epochs'],
                       after_training=not fast_start).add_condition(
                           ['after_batch', 'after_epoch'],
                           OnLogRecord(track_the_best.notification_name),
                           (main_loop_best_val_path, )))

    extensions.extend([
        DumpCSVSummaries(save_path,
                         every_n_batches=c['mon_freq_valid'],
                         after_training=True),
        DumpTensorflowSummaries(save_path,
                                after_epoch=True,
                                every_n_batches=c['mon_freq_valid'],
                                after_training=True),
        Printing(every_n_batches=c['mon_freq_valid']),
        PrintMessage(msg="save_path={}".format(save_path),
                     every_n_batches=c['mon_freq']),
        FinishAfter(after_n_batches=c['n_batches']).add_condition(
            ['after_batch'],
            OnLogStatusExceed('iterations_done', c['n_batches']))
    ])

    logger.info(extensions)

    ### Run training ###

    if "VISDOM_SERVER" in os.environ:
        print("Running visdom server")
        ret = subprocess.Popen([
            os.path.join(os.path.dirname(__file__), "../visdom_plotter.py"),
            "--visdom-server={}".format(os.environ['VISDOM_SERVER']),
            "--folder={}".format(save_path)
        ])
        time.sleep(0.1)
        if ret.returncode is not None:
            raise Exception()
        atexit.register(lambda: os.kill(ret.pid, signal.SIGINT))

    model = Model(cost)
    for p, m in pop_updates:
        model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=model,
                         extensions=extensions)

    assert os.path.exists(save_path)
    main_loop.run()
Beispiel #48
0
def main(save_to, num_epochs,
         weight_decay=0.0001, noise_pressure=0, subset=None, num_batches=None,
         batch_size=None, histogram=None, resume=False):
    output_size = 10

    prior_noise_level = -10
    noise_step_rule = Scale(1e-6)
    noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX))
    convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True,
            noise_rate=noise_rate,
            prior_noise_level=prior_noise_level)

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
            .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                  .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                  .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)

    # train_cost, train_error_rate, train_components = train_cg.outputs

    with batch_normalization(convnet):
        with training_noise(convnet):
            train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(y.flatten(),
                train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(y.flatten(),
                train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                train_error_rate, train_components])
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                for p, m in population_updates]

    # for annealing
    nit_penalty = theano.shared(numpy.asarray(noise_pressure, dtype=theano.config.floatX))
    nit_penalty.name = 'nit_penalty'

    # Compute noise rates for training graph
    train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables)
    train_mean_log_sigma = tensor.concatenate([n.flatten() for n in train_logsigma]).mean()
    train_mean_log_sigma.name = 'mean_log_sigma'
    train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables)
    train_nit_rate = tensor.concatenate([n.flatten() for n in train_nits]).mean()
    train_nit_rate.name = 'nit_rate'
    train_nit_regularization = nit_penalty * train_nit_rate
    train_nit_regularization.name = 'nit_regularization'

    # Apply regularization to the cost
    trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])(
            train_cg.parameters)
    mask_parameters = [p for p in trainable_parameters
            if get_brick(p).name == 'mask']
    noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters)
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    nonmask_weights = [p for p in weights if get_brick(p).name != 'mask']
    l2_norm = sum([(W ** 2).sum() for W in nonmask_weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = weight_decay * l2_norm
    l2_regularization.name = 'l2_regularization'

    # testversion
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + l2_regularization + train_nit_regularization
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
        which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 128
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test,
        iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # Create a step rule that reduces the learning rate of noise
    scale_mask = Restrict(noise_step_rule, mask_parameters)
    step_rule = CompositeRule([scale_mask, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=trainable_parameters,
        step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    #,
    #    theano_func_kwargs={
    #        'mode': NanGuardMode(
    #            nan_is_error=True, inf_is_error=True, big_is_error=True)})

    exp_name = save_to.replace('.%d', '')

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),     # Warm up with 0.01 learning rate
                      (50, 0.1),     # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  EpochSchedule(noise_step_rule.learning_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4)
                  ]),
                  EpochSchedule(noise_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4),
                      # (6, 3e-4),
                      # (8, 1e-3), # Causes nit rate to jump
                      # (10, 3e-3),
                      # (12, 1e-2),
                      # (15, 3e-2),
                      # (19, 1e-1),
                      # (24, 3e-1),
                      # (30, 1)
                  ]),
                  NoiseExtension(
                      noise_parameters=noise_parameters),
                  NoisyDataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      noise_parameters=noise_parameters,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate, train_nit_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       train_nit_regularization,
                       momentum.learning_rate,
                       train_mean_log_sigma,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                      # after_epoch=True),
                  Plot('Training performance for ' + exp_name,
                      channels=[
                          ['train_cost_with_regularization',
                           'train_cost_without_regularization',
                           'train_nit_regularization',
                           'train_l2_regularization'],
                          ['train_error_rate'],
                          ['train_total_gradient_norm'],
                          ['train_mean_log_sigma'],
                      ],
                      every_n_batches=17),
                  Plot('Test performance for ' + exp_name,
                      channels=[[
                          'train_error_rate',
                          'test_error_rate',
                          ]],
                      after_epoch=True),
                  EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            parameters=cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(exp_name, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
Beispiel #49
0
def run(model_name, port_train, port_valid):

	running_on_laptop = socket.gethostname() == 'yop'

	X = tensor.tensor4('image_features', dtype='float32')
	T = tensor.matrix('targets', dtype='float32')

	image_border_size = (100, 100)

	if running_on_laptop:
		host_plot = 'http://*****:*****@ %s' % (model_name, datetime.datetime.now(), socket.gethostname()), channels=[['loss'], ['error', 'valid_error']], after_epoch=True, server_url=host_plot),
		Printing(),
		Checkpoint('/tmp/train_bn2')
	]

	main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
	                     extensions=extensions, model=model)
	main_loop.run()
Beispiel #50
0
    mean_sigma, min_sigma, min_mu, max_mu, mean_mu, max_sigma, mean_coeff,
    min_coeff, max_coeff
]

#################
# Algorithm
#################

n_batches = 200

algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=CompositeRule(
                                [StepClipping(10.0),
                                 Adam(lr)]))
algorithm.add_updates(extra_updates)

train_monitor = TrainingDataMonitoring(variables=monitoring_variables +
                                       data_monitoring,
                                       every_n_batches=n_batches,
                                       prefix="train")

valid_monitor = DataStreamMonitoring(monitoring_variables,
                                     valid_stream,
                                     every_n_batches=n_batches,
                                     prefix="valid")

extensions = [
    ProgressBar(),
    Timing(every_n_batches=n_batches), train_monitor, valid_monitor,
    TrackTheBest('valid_nll', every_n_batches=n_batches),
Beispiel #51
0
def run(discriminative_regularization=True):
    streams = create_celeba_streams(training_batch_size=100,
                                    monitoring_batch_size=500,
                                    include_targets=False)
    main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3]

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    rval = create_training_computation_graphs(discriminative_regularization)
    cg, bn_cg, variance_parameters = rval
    pop_updates = list(
        set(get_batch_normalization_updates(bn_cg, allow_duplicates=True)))
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    model = Model(bn_cg.outputs[0])
    selector = Selector(
        find_bricks(
            model.top_bricks, lambda brick: brick.name in
            ('encoder_convnet', 'encoder_mlp', 'decoder_convnet', 'decoder_mlp'
             )))
    parameters = list(selector.get_parameters().values()) + variance_parameters

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_cg.outputs[0],
                                parameters=parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    monitored_quantities_list = []
    for graph in [bn_cg, cg]:
        cost, kl_term, reconstruction_term = graph.outputs
        cost.name = 'nll_upper_bound'
        avg_kl_term = kl_term.mean(axis=0)
        avg_kl_term.name = 'avg_kl_term'
        avg_reconstruction_term = -reconstruction_term.mean(axis=0)
        avg_reconstruction_term.name = 'avg_reconstruction_term'
        monitored_quantities_list.append(
            [cost, avg_kl_term, avg_reconstruction_term])
    train_monitoring = DataStreamMonitoring(monitored_quantities_list[0],
                                            train_monitor_stream,
                                            prefix="train",
                                            updates=extra_updates,
                                            after_epoch=False,
                                            before_first_epoch=False,
                                            every_n_epochs=5)
    valid_monitoring = DataStreamMonitoring(monitored_quantities_list[1],
                                            valid_monitor_stream,
                                            prefix="valid",
                                            after_epoch=False,
                                            before_first_epoch=False,
                                            every_n_epochs=5)

    # Prepare checkpoint
    save_path = 'celeba_vae_{}regularization.zip'.format(
        '' if discriminative_regularization else 'no_')
    checkpoint = Checkpoint(save_path, every_n_epochs=5, use_cpickle=True)

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=75), train_monitoring, valid_monitoring,
        checkpoint,
        Printing(),
        ProgressBar()
    ]
    main_loop = MainLoop(data_stream=main_loop_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
Beispiel #52
0
min_binary = binary.min().copy(name="binary_min")
mean_binary = binary.mean().copy(name="binary_mean")
max_binary = binary.max().copy(name="binary_max")

data_monitoring += [mean_sigma, min_sigma, min_mu, max_mu, mean_mu, max_sigma, mean_binary, min_binary, max_binary]

#################
# Algorithm
#################

n_batches = 200

algorithm = GradientDescent(
    cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(10.0), Adam(lr)])
)
algorithm.add_updates(extra_updates)

train_monitor = TrainingDataMonitoring(
    variables=monitoring_variables + data_monitoring, every_n_batches=n_batches, prefix="train"
)

valid_monitor = DataStreamMonitoring(
    monitoring_variables + data_monitoring, valid_stream, every_n_batches=n_batches, prefix="valid"
)

extensions = [
    ProgressBar(),
    Timing(every_n_batches=n_batches),
    train_monitor,
    valid_monitor,
    TrackTheBest("valid_nll", every_n_batches=n_batches),
# # cost += reg * abs(p).sum()
# cost += reg * (p ** 2).sum()
# param_nans += T.isnan(p).sum()
# name = params[i].name
#     params[i] = params[i].mean()
#     params[i].name = name + str(i)
cost.name = "final_cost"

# param_nans.name = 'params_nans'

## Training algorithm



# algorithm.initialize()
algorithm.add_updates(cg.updates)

extensions = [FinishAfter(after_n_epochs=5000),
              DataStreamMonitoring(
                  [cost, error_rate, mistake_rate],
                  data_stream=test,
                  updates=cg.updates,
                  prefix="test"),
              TrainingDataMonitoring(
                  [cost,  # hidden_states, debug_val, param_nans,
                   aggregation.mean(algorithm.total_gradient_norm)],  #+ params,
                  prefix="train",
                  after_epoch=True),
              Timing(),
              Printing(),
              ProgressBar()]
        main_loop.run()
        sys.exit(0)

    graphs, extensions, updates = construct_graphs(args, nclasses, sequence_length)

    ### optimization algorithm definition
    step_rule = CompositeRule([
        StepClipping(1.),
        #Momentum(learning_rate=args.learning_rate, momentum=0.9),
        RMSProp(learning_rate=args.learning_rate, decay_rate=0.5),
    ])

    algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                                parameters=graphs["training"].parameters,
                                step_rule=step_rule)
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]


    # step monitor (after epoch to limit the log size)
    step_channels = []
    step_channels.extend([
        algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
        for name, param in model.get_parameter_dict().items()])
    step_channels.append(algorithm.total_step_norm.copy(name="total_step_norm"))
    step_channels.append(algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
    step_channels.extend(graphs["training"].outputs)
    logger.warning("constructing training data monitor")
    extensions.append(TrainingDataMonitoring(
        step_channels, prefix="iteration", after_batch=False))
Beispiel #55
0
def train(cli_params):
    cli_params["save_dir"] = prepare_dir(cli_params["save_to"])
    logfile = os.path.join(cli_params["save_dir"], "log.txt")

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    logger.info("Logging into %s" % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data, whiten, cnorm = setup_data(p, test_set=False)

    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim,) + p.encoder_layers

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info("Found the following parameters: %s" % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert "counter" in [u.name for u in bn_updates.keys()], "No batch norm params in graph - the graph has been cut?"

    training_algorithm = GradientDescent(
        cost=ladder.costs.total, params=all_params, step_rule=Adam(learning_rate=ladder.lr)
    )
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    model = Model(ladder.costs.total)

    monitored_variables = [
        ladder.costs.class_corr,
        ladder.costs.class_clean,
        ladder.error,
        #         training_algorithm.total_gradient_norm,
        ladder.costs.total,
    ]
    #         + ladder.costs.denois.values()

    # Make a global history recorder so that we can get summary at end of
    # training when we write to Sentinel
    # global_history records all relevant monitoring vars
    # updated by SaveLog every time
    global_history = {}

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(
            data.train,
            data.train_ind,
            p.batch_size,
            n_labeled=p.labeled_samples,
            n_unlabeled=p.unlabeled_samples,
            whiten=whiten,
            cnorm=cnorm,
        ),
        model=model,
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),
            # write out to sentinel file for experiment automator to work
            SentinelWhenFinish(save_dir=p.save_dir, global_history=global_history),
            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(
                monitored_variables,
                make_datastream(
                    data.valid, data.valid_ind, p.valid_batch_size, whiten=whiten, cnorm=cnorm, scheme=ShuffledScheme
                ),
                prefix="valid_approx",
            ),
            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(
                monitored_variables,
                make_datastream(
                    data.train,
                    data.train_ind,
                    p.batch_size,
                    n_labeled=p.labeled_samples,
                    whiten=whiten,
                    cnorm=cnorm,
                    scheme=ShuffledScheme,
                ),
                make_datastream(
                    data.valid,
                    data.valid_ind,
                    p.valid_batch_size,
                    n_labeled=len(data.valid_ind),
                    whiten=whiten,
                    cnorm=cnorm,
                    scheme=ShuffledScheme,
                ),
                prefix="valid_final",
                after_n_epochs=p.num_epochs,
            ),
            TrainingDataMonitoring(variables=monitored_variables, prefix="train", after_epoch=True),
            SaveParams("valid_approx_cost_class_corr", model, p.save_dir),
            #             SaveParams(None, all_params, p.save_dir, after_epoch=True),
            SaveExpParams(p, p.save_dir, before_training=True),
            SaveLog(save_dir=p.save_dir, after_epoch=True, global_history=global_history),
            Printing(),
            #             ShortPrinting(short_prints),
            LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs, after_epoch=True),
        ],
    )
    main_loop.run()

    # Get results
    df = main_loop.log.to_dataframe()
    col = "valid_final_error_rate"
    logger.info("%s %g" % (col, df[col].iloc[-1]))

    if main_loop.log.status["epoch_interrupt_received"]:
        return None
    return df