Example #1
def test_shared_variable_modifier():
    weights = numpy.array([-1, 1], dtype=theano.config.floatX)
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    cost = ((x * W).sum() - y) ** 2
    cost.name = 'cost'

    step_rule = Scale(0.001)
    sgd = GradientDescent(cost=cost, parameters=[W],
                          step_rule=step_rule)
    main_loop = MainLoop(
        model=None, data_stream=dataset.get_example_stream(),
        algorithm=sgd,
        extensions=[
            FinishAfter(after_n_epochs=1),
            SharedVariableModifier(
                step_rule.learning_rate,
                lambda n: numpy.cast[theano.config.floatX](10. / n)
            )])

    main_loop.run()

    assert_allclose(step_rule.learning_rate.get_value(),
                    numpy.cast[theano.config.floatX](10. / n_batches))
Example #2
def test_shared_variable_modifier_two_params():
    weights = numpy.array([-1, 1], dtype=floatX)
    features = [numpy.array(f, dtype=floatX) for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = ContainerDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    cost = ((x * W).sum() - y)**2
    cost.name = 'cost'

    step_rule = Scale(0.001)
    sgd = GradientDescent(cost=cost, params=[W], step_rule=step_rule)
    modifier = SharedVariableModifier(
        step_rule.learning_rate, lambda _, val: numpy.cast[floatX](val * 0.2))
    main_loop = MainLoop(model=None,
                         data_stream=dataset.get_default_stream(),
                         algorithm=sgd,
                         extensions=[FinishAfter(after_n_epochs=1), modifier])

    main_loop.run()

    new_value = step_rule.learning_rate.get_value()
    assert_allclose(new_value, 0.001 * 0.2**n_batches, atol=1e-5)
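Examples #1 and #2 together show the two callback signatures SharedVariableModifier accepts: a one-argument function of the number of iterations done, or a two-argument function that also receives the variable's current value. A minimal sketch of both forms (the imports and the lr variable below are illustrative, not taken from either test):

import numpy
import theano
from blocks.extensions.training import SharedVariableModifier

floatX = theano.config.floatX
lr = theano.shared(numpy.cast[floatX](0.1), name='learning_rate')

# One-argument form (as in Example #1): the new value is computed from the
# number of iterations done.
anneal = SharedVariableModifier(lr, lambda n: numpy.cast[floatX](10. / n))

# Two-argument form (as in Example #2): the callback also receives the
# current value, here applying a multiplicative decay per batch.
decay = SharedVariableModifier(lr, lambda n, val: numpy.cast[floatX](val * 0.2))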
Example #3
    def decay_learning_rate(self, learning_rate_decay):
        """Decay learning rate after each epoch

        :learning_rate_decay: decay coeff.
        """
        if learning_rate_decay not in (0, 1):
            learning_rate = self.step_rules[0].learning_rate
            self.extensions.append(
                SharedVariableModifier(
                    learning_rate,
                    lambda n, lr: numpy.cast[theano.config.floatX]
                    (learning_rate_decay * lr),
                    after_epoch=True,
                    after_batch=False))
Example #4
step_rules = [RMSProp(learning_rate=learning_rate, decay_rate=decay_rate),
              StepClipping(step_clipping)]
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules))

# Extensions
gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
step_norm = aggregation.mean(algorithm.total_step_norm)
monitored_vars = [cost, gradient_norm, step_norm]

dev_monitor = DataStreamMonitoring(variables=[cost], after_epoch=True,
                                   before_first_epoch=True,
                                   data_stream=dev_stream, prefix="dev")
train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                       after_batch=True,
                                       before_first_epoch=True, prefix='tra')

extensions = [dev_monitor, train_monitor, Timing(), Printing(after_batch=True),
              FinishAfter(after_n_epochs=nepochs),
              saveload.Load(load_path),
              saveload.Checkpoint(last_path),
              ] + track_best('dev_cost', save_path)

if learning_rate_decay not in (0, 1):
    extensions.append(SharedVariableModifier(
        step_rules[0].learning_rate,
        lambda n, lr: numpy.cast[theano.config.floatX](learning_rate_decay * lr),
        after_epoch=True, after_batch=False))

print('number of parameters in the model: ' +
      str(tensor.sum([p.size for p in cg.parameters]).eval()))
# Finally build the main loop and train the model
main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                     model=Model(cost), extensions=extensions)
main_loop.run()
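Example #4 (and the later examples that reuse this pattern) calls a user-defined track_best helper that is not shown. A plausible minimal sketch of such a helper, assuming it pairs Blocks' TrackTheBest with a Checkpoint that fires when the tracked channel improves; this reconstruction is an assumption, not the original code (whose signature also differs in Example #5):

from blocks.extensions.predicates import OnLogRecord
from blocks.extensions.saveload import Checkpoint
from blocks.extensions.training import TrackTheBest


def track_best(channel, save_path):
    # Hypothetical reconstruction: track the best (lowest) value of `channel`
    # and checkpoint whenever the corresponding *_best_so_far record appears.
    tracker = TrackTheBest(channel, choose_best=min)
    checkpoint = Checkpoint(save_path, after_training=False, use_cpickle=True)
    checkpoint.add_condition(
        ['after_epoch'], predicate=OnLogRecord(channel + '_best_so_far'))
    return [tracker, checkpoint]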
Example #5
def train(args, trial=11, no_valid=False):
    # Creating unique strings to save for experiments.
    data_valid = "data/"+args.data_name+"_trial_"+str(trial)+"_valid_size_"+str(args.train_size)+\
    "_transitions_"+str(args.transitions)
    data_test = data_valid.replace("_valid_size", "_test_size")
    # If we want validation set to match modData of test set
    if modDataValid == 1:
        data_valid = data_valid.replace("_trial_", "_" + modData + "_trial_")
        data_test = data_test.replace("_trial_", "_" + modData + "_trial_")

    # By default, it is m0
    data_train = "data/"+args.data_name+"_trial_"+str(trial)+"_train_size_"+str(args.train_size)+\
    "_transitions_"+str(args.transitions)

    subStr = "rnn_type_"+args.rnn_type + "_trial_"+str(trial) + "_hiddenSize_"+str(args.hidden_size)+\
    "_numLayers_"+str(args.num_layers)+ \
    "_dropout_"+str(args.dropout)+"_train_size_"+str(args.train_size) + "_transitions_"+str(args.transitions)+\
    "_novalid_"+str(args.no_valid)

    if modData == "m1":
        data_train = data_train.replace("_trial_", "_m1_trial_")
        subStr = subStr.replace("_trial_", "_m1_trial_")
    elif modData == "m3":
        data_train = data_train.replace("_trial_", "_m3_trial_")
        subStr = subStr.replace("_trial_", "_m3_trial_")

        data_valid = "data/"+args.data_name+"_m3_trial_"+str(trial)+"_valid_size_"+str(args.train_size)+\
        "_transitions_"+str(args.transitions)
        data_test = "data/"+args.data_name+"_m3_trial_"+str(trial)+"_test_size_"+str(args.train_size)+\
        "_transitions_"+str(args.transitions)

    print("on test: " + subStr)
    # Perform folder prefixing
    prefix_path = models_folder + args.data_name + "/" + subStr +"_tgrad_"+str(args.truncate_gradient)+\
    "_boost_"+bStr(args.boosting)

    load_path2 = prefix + load_path
    save_path2 = prefix + save_path
    last_path2 = prefix + last_path

    plots_output2 = plots_output + args.data_name + "/" + subStr +"_tgrad_"+str(args.truncate_gradient)+\
    "_boost_"+bStr(args.boosting)

    # obtain vocabulary size
    ix_to_char, char_to_ix, vocab_size = get_metadata(
        data_test.replace("_test", ""))
    print("vocab_size: " + str(vocab_size))

    # Get train, valid, test streams
    sharedDataTrain, train_stream = get_stream_inGPU(data_train,
                                                     sharedName='sharedData')
    train_streamCopy = copy.deepcopy(train_stream)
    sharedDataValid, dev_stream = get_stream_inGPU(data_valid,
                                                   sharedName='sharedData')
    valid_streamCopy = copy.deepcopy(dev_stream)
    sharedDataTest, test_stream = get_stream_inGPU(data_test,
                                                   sharedName='sharedData')
    test_streamCopy = copy.deepcopy(test_stream)

    # Create dummy sums
    sharedMRRSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedTOTSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedSUMVARs = {
        'sharedMRRSUM': sharedMRRSUM,
        'sharedTOTSUM': sharedTOTSUM
    }

    # Initialize batches
    batch_index_From = T.scalar('int_stream_From', dtype='int32')
    batch_index_To = T.scalar('int_stream_To', dtype='int32')

    # Index theano variables
    x = sharedDataTrain['x'][:, batch_index_From:batch_index_To]
    x.name = 'x'

    x_mask = sharedDataTrain['x_mask'][:, batch_index_From:batch_index_To]
    x_mask.name = 'x_mask'

    x_mask_o = sharedDataTrain['x_mask_o'][:, batch_index_From:batch_index_To]
    x_mask_o.name = 'x_mask_o'

    x_mask_o_mask = sharedDataTrain[
        'x_mask_o_mask'][:, batch_index_From:batch_index_To]
    x_mask_o_mask.name = 'x_mask_o_mask'

    y = sharedDataTrain['y'][:, batch_index_From:batch_index_To]
    y.name = 'y'

    y_mask = sharedDataTrain['y_mask'][:, batch_index_From:batch_index_To]
    y_mask.name = 'y_mask'

    y_mask_o = sharedDataTrain['y_mask_o'][:, batch_index_From:batch_index_To]
    y_mask_o.name = 'y_mask_o'

    y_mask_o_mask = sharedDataTrain[
        'y_mask_o_mask'][:, batch_index_From:batch_index_To]
    y_mask_o_mask.name = 'y_mask_o_mask'

    lens = sharedDataTrain['lens'][:, batch_index_From:batch_index_To]
    lens.name = 'lens'

    # Generate temp shared vars
    tempSharedData = {}
    tempSharedData[theano.config.floatX] = [
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX))
    ]

    tempSharedData['uint8'] = [
        shared(np.array([[0], [0]], dtype='uint8')),
        shared(np.array([[0], [0]], dtype='uint8')),
        shared(np.array([[0], [0]], dtype='uint8'))
    ]

    # Final mask is due to the generated mask and the input mask
    x_mask_final = x_mask * x_mask_o * x_mask_o_mask
    y_mask_final = y_mask * y_mask_o * y_mask_o_mask

    # Build neural network
    linear_output, cost = nn_fprop(
        x,
        x_mask_final,
        y,
        y_mask_final,
        lens,
        vocab_size,
        hidden_size,
        num_layers,
        rnn_type,
        boosting=boosting,
        scan_kwargs={'truncate_gradient': truncate_gradient})

    # Keep a constant in gpu memory
    constant1 = shared(np.float32(1.0))
    cost_int, ymasksum = RR_cost(y, linear_output, y_mask_final, constant1)

    # Validation calculations
    fRR = function(inputs=[
        theano.In(batch_index_From, borrow=True),
        theano.In(batch_index_To, borrow=True)
    ],
                   updates=[(sharedMRRSUM, sharedMRRSUM + cost_int),
                            (sharedTOTSUM, sharedTOTSUM + ymasksum)])

    # COST
    cg = ComputationGraph(cost)

    if dropout > 0:
        # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015)
        inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(
            cg.variables)
        cg = apply_dropout(cg, inputs, dropout)
        cost = cg.outputs[0]

    # Learning algorithm
    step_rules = [
        RMSProp(learning_rate=rmsPropLearnRate, decay_rate=decay_rate),
        StepClipping(step_clipping)
    ]
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule(step_rules))

    # Extensions

    # This is for tracking our best result
    trackbest = track_best('valid_MRR', save_path2, last_path2, num_epochs,
                           nepochs, maxIterations, epsilon, tempSharedData)

    if onlyPlots:
        prefixes = ["train_cross", "valid_cross", "test_cross"]
        gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
        step_norm = aggregation.mean(algorithm.total_step_norm)
        monitored_vars = [cost, gradient_norm, step_norm]
        #this is faster
        train_monitor = myTrainingDataMonitoring(
            variables=monitored_vars,
            prefix=prefixes[0],
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        #train_monitor = DataStreamMonitoringPlot(variables=[cost],
        #                    data_stream=train_streamCopy, prefix=prefixes[0], sharedDataTrain=sharedDataTrain, sharedDataActualTest=sharedDataTrain, after_batch=True, saveEveryXIteration = saveEveryXIteration)
        valid_monitor = DataStreamMonitoringPlot(
            variables=[cost],
            data_stream=valid_streamCopy,
            prefix=prefixes[1],
            sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataValid,
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        test_monitor = DataStreamMonitoringPlot(
            variables=[cost],
            data_stream=test_streamCopy,
            prefix=prefixes[2],
            sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataTest,
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        trackbest = [trackbest[0], trackbest[2], trackbest[3], trackbest[4]]
        plot = Plot('Live Plotting',
                    saveFolder=plots_output2,
                    channels=[
                        'train_cross_cost', 'valid_cross_cost',
                        'test_cross_cost'
                    ],
                    numProcesses=numProcesses,
                    saveEveryXIteration=saveEveryXIteration,
                    after_batch=True)
        extensions = [
            train_monitor,
            valid_monitor,
            test_monitor,
            plot,
            Printing(),
            ProgressBar(),
        ] + trackbest
    else:
        dev_monitor = myDataStreamMonitoring(after_epoch=True,
                                             before_epoch=False,
                                             data_stream=dev_stream,
                                             prefix="valid",
                                             fRR=fRR,
                                             sharedVars=sharedSUMVARs,
                                             sharedDataTrain=sharedDataTrain,
                                             sharedDataValid=sharedDataValid)
        extensions = [
            dev_monitor,
            Printing(),
            ProgressBar(),
        ] + trackbest

    if learning_rate_decay not in (0, 1):
        extensions.append(
            SharedVariableModifier(step_rules[0].learning_rate,
                                   lambda n, lr: np.cast[theano.config.floatX]
                                   (learning_rate_decay * lr),
                                   after_epoch=True,
                                   after_batch=False))

    print('number of parameters in the model: ' +
          str(T.sum([p.size for p in cg.parameters]).eval()))
    # Finally build the main loop and train the model
    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
Example #6
# plot = Plot('Plotting example', channels=[['cost']], after_batch=True, open_browser=True)
extensions = [
    set_train_flag,
    test_monitor,
    train_monitor,
    Timing(),
    Printing(after_epoch=True),
    FinishAfter(after_n_epochs=nepochs),
    saveload.Load(load_path),
    saveload.Checkpoint(last_path, every_n_epochs=10000),
] + track_best('test_cost', save_path)  #+ track_best('train_cost', last_path)

if learning_rate_decay not in (0, 1):
    extensions.append(
        SharedVariableModifier(step_rules[0].learning_rate,
                               lambda n, lr: np.cast[theano.config.floatX]
                               (learning_rate_decay * lr),
                               after_epoch=False,
                               every_n_epochs=lr_decay_every_n_epochs,
                               after_batch=False))

print('number of parameters in the model: ' +
      str(T.sum([p.size for p in cg.parameters]).eval()))
# Finally build the main loop and train the model
main_loop = MainLoop(data_stream=train_stream,
                     algorithm=algorithm,
                     model=Model(cost),
                     extensions=extensions)
main_loop.run()
Example #7
        # DEBUG this triggers an error on my machine
        # apply dropout to all the input variables
        inputs = VariableFilter(roles=[INPUT])(cg_nodropout.variables)
        # dropconnect
        # inputs = VariableFilter(roles=[PARAMETER])(cg_nodropout.variables)
        cg = apply_dropout(cg_nodropout, inputs, args.dropout_rate)
    else:
        cg = cg_nodropout
    step_compute = RMSProp(learning_rate=args.lr, max_scaling=1e10)
    algorithm = GradientDescent(step_rule=CompositeRule([RemoveNotFinite(),
        step_compute]),
        parameters=cg.parameters, cost=cost)
    extension_list = []
    extension_list.append(
        SharedVariableModifier(step_compute.learning_rate,
            extensions.decay_learning_rate,
            after_batch=False,
            every_n_batches=batches_per_epoch, ))
    extension_list.append(FinishAfter(after_n_epochs=100001))

    ## logging of test set performance
    extension_list.append(extensions.LogLikelihood(dpm, test_stream, scl,
        every_n_batches=args.ext_every_n*batches_per_epoch, before_training=False))

    ## set up logging
    extension_list.extend([Timing(), Printing()])
    model_dir = util.create_log_dir(args, dpm.name + '_' + args.dataset)
    model_save_name = os.path.join(model_dir, 'model.pkl')
    extension_list.append(
        Checkpoint(model_save_name, every_n_batches=args.ext_every_n*batches_per_epoch, save_separately=['log']))
    # generate plots
    extension_list.append(extensions.PlotMonitors(model_dir,
Example #8
def pretrain_rnn(train, rnnrbm, test=None, epochs=1000, bokeh=True):
    lr = theano.shared(float32(0.1))

    probs, _, _, _ = rnnrbm.rnn_pretrain_pred(x, x_mask)
    cost = NegativeLogLikelihood().apply(y, probs, y_mask)

    error_rate = MismulitclassificationRate().apply(y, probs, y_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(y, probs, y_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'final_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])
    step_rule = CompositeRule([
        RemoveNotFinite(),
        StepClipping(30.0),
        Adam(learning_rate=lr),
        StepClipping(6.0),
        RemoveNotFinite()
    ])
    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost,
                                params=cg.parameters)
    extensions = [
        SharedVariableModifier(parameter=lr,
                               function=lambda n, v: float32(0.7 * v)
                               if n % 700 == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [
                cost,
                error_rate,
                mistake_rate,
            ],  # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)],  #+ params,
            prefix="train",
            after_epoch=False,
            every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()
    ]
    if test is not None:
        extensions.append(
            DataStreamMonitoring([cost, error_rate, mistake_rate],
                                 data_stream=test,
                                 updates=cg.updates,
                                 prefix="test",
                                 after_epoch=False,
                                 every_n_batches=40))

    if bokeh:
        extensions.append(
            Plot(
                'Pretrain RNN',
                channels=[
                    [
                        'train_error on note as a whole',
                        'train_single error within note',
                        'test_error on note as a whole',
                        'test_single error within note'
                    ],
                    ['train_rbm_cost'],
                    # ['train_total_gradient_norm'],
                ]))

    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions)
    return main_loop
Example #9
def train_rnnrbm(train,
                 rnnrbm,
                 epochs=1000,
                 test=None,
                 bokeh=True,
                 load_path=None):
    cdk = theano.shared(10)
    lr = theano.shared(float32(0.004))

    cost, v_sample = rnnrbm.cost(examples=x, mask=x_mask, k=cdk)

    error_rate = MismulitclassificationRate().apply(x, v_sample[-1], x_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(x, v_sample[-1], x_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'rbm_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])
    step_rule = CompositeRule([
        RemoveNotFinite(),
        StepClipping(30.0),
        Adam(learning_rate=lr),
        StepClipping(6.0),
        RemoveNotFinite()
    ])  # Scale(0.01)
    gradients = dict(
        equizip(cg.parameters,
                T.grad(cost, cg.parameters, consider_constant=[v_sample])))
    algorithm = GradientDescent(step_rule=step_rule,
                                gradients=gradients,
                                cost=cost,
                                params=cg.parameters)
    algorithm.add_updates(cg.updates)
    extensions = [
        SharedVariableModifier(parameter=cdk,
                               function=lambda n, v: rnnrbm_cdk[n]
                               if rnnrbm_cdk.get(n) else v),
        SharedVariableModifier(parameter=lr,
                               function=lambda n, v: float32(0.78 * v)
                               if n % (200 * 5) == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [
                cost,
                error_rate,
                mistake_rate,
            ],  # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)],  #+ params,
            prefix="train",
            after_epoch=False,
            every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()
    ]
    if test is not None:
        extensions.append(
            DataStreamMonitoring([cost, error_rate, mistake_rate],
                                 data_stream=test,
                                 updates=cg.updates,
                                 prefix="test",
                                 after_epoch=False,
                                 every_n_batches=40))
    if bokeh:
        extensions.append(
            Plot(
                'Training RNN-RBM',
                channels=[
                    [
                        'train_error on note as a whole',
                        'train_single error within note',
                        'test_error on note as a whole',
                        'test_single error within note'
                    ],
                    ['train_final_cost'],
                    # ['train_total_gradient_norm'],
                ]))

    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions)
    return main_loop
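Example #9 schedules the number of contrastive-divergence Gibbs steps by looking up the iteration count in a dict rnnrbm_cdk defined elsewhere in that code base. A hypothetical schedule of the expected shape (keys and values below are illustrative only):

# Hypothetical CD-k schedule: keys are iteration counts at which the shared
# `cdk` variable is updated, values are the new number of Gibbs steps.
rnnrbm_cdk = {
    5000: 15,
    20000: 25,
}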
Example #10
def run():

    # Load Model
    net_size = 256  #Hard-code instead of loading model (takes too long to set up network)
    #net = vaegan.VAEGAN()
    #network_saver = saver.NetworkSaver('vaegan/models/', net=net)
    #network_saver.load()

    # DATA
    train_stream = get_stream(hdf5_file, 'train', batch_size)  #TODO jonathan ?
    test_stream = get_stream(hdf5_file, 'test', batch_size)  #TODO jonathan ?

    # MODEL
    x = T.TensorType('floatX', [False] * 3)('features')
    y = T.tensor3('targets', dtype='floatX')
    train_flag = [theano.shared(0)]
    x = x.swapaxes(0, 1)
    y = y.swapaxes(0, 1)

    # More Config
    out_size = len(output_columns) - 1  # code_mode=RL-MDN
    latent_size = net_size
    in_size = latent_size + len(input_columns)

    # NN fprop
    y_hat, cost, cells = nn_fprop(x, y, in_size, out_size, hidden_size,
                                  num_recurrent_layers, train_flag)

    # COST
    cg = ComputationGraph(cost)
    extra_updates = []

    # RMS Prop training optimizer
    step_rules = [
        RMSProp(learning_rate=learning_rate, decay_rate=decay_rate),
        StepClipping(step_clipping)
    ]

    parameters_to_update = cg.parameters
    algorithm = GradientDescent(cost=cg.outputs[0],
                                parameters=parameters_to_update,
                                step_rule=CompositeRule(step_rules))
    algorithm.add_updates(
        extra_updates)  # TODO jonathan what is this, is this needed?

    # Extensions
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [
        cost, step_rules[0].learning_rate, gradient_norm, step_norm
    ]

    test_monitor = DataStreamMonitoring(variables=[cost],
                                        after_epoch=True,
                                        before_first_epoch=True,
                                        data_stream=test_stream,
                                        prefix="test")
    train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                           after_epoch=True,
                                           before_first_epoch=True,
                                           prefix='train')

    set_train_flag = SetTrainFlag(after_epoch=True,
                                  before_epoch=True,
                                  flag=train_flag)

    # plot = Plot('Plotting example', channels=[['cost']], after_batch=True, open_browser=True)
    extensions = [
        set_train_flag,
        test_monitor,
        train_monitor,
        Timing(),
        Printing(after_epoch=True),
        FinishAfter(after_n_epochs=nepochs),
        saveload.Load(load_path),
        saveload.Checkpoint(last_path, every_n_epochs=10000),
    ] + track_best('test_cost',
                   save_path)  #+ track_best('train_cost', last_path)

    if learning_rate_decay not in (0, 1):
        extensions.append(
            SharedVariableModifier(step_rules[0].learning_rate,
                                   lambda n, lr: np.cast[theano.config.floatX]
                                   (learning_rate_decay * lr),
                                   after_epoch=False,
                                   every_n_epochs=lr_decay_every_n_epochs,
                                   after_batch=False))

    print('number of parameters in the model: ' +
          str(T.sum([p.size for p in cg.parameters]).eval()))
    # Finally build the main loop and train the model
    mainLoop = MainLoop(data_stream=train_stream,
                        algorithm=algorithm,
                        model=Model(cost),
                        extensions=extensions)
    mainLoop.run()