Example #1
    def check_dataset(name):
        try:
            img_shape, channels, data_train, data_valid, data_test = datasets.get_data(name)
        except IOError as e:
            # Skip this check if the dataset files are not available locally.
            raise SkipTest(str(e))

        for ds in (data_train, data_valid, data_test):
            features, = ds.get_data(None, slice(0, 100))

            features = features.reshape([100, -1])
            assert (features >= 0).all()
            assert (features <= 1).all()
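
For context, a checker like this is usually driven by a test generator that yields one call per dataset name. A minimal sketch, assuming hypothetical dataset names that the local datasets module understands:

    def test_datasets():
        for name in ('mnist', 'bmnist'):  # hypothetical names; adjust to your datasets module
            yield check_dataset, name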
Example #2
    def check_dataset(name):
        try:
            img_shape, data_train, data_valid, data_test = datasets.get_data(
                name)
        except IOError as e:
            # Skip this check if the dataset files are not available locally.
            raise SkipTest(str(e))

        for ds in (data_train, data_valid, data_test):
            features, = ds.get_data(None, slice(0, 100))

            features = features.reshape([100, -1])
            assert (features >= 0).all()
            assert (features <= 1).all()
Example #3
    def check_dataset(name):
        try:
            img_shape, channels, data_train, data_valid, data_test = datasets.get_data(name)
        except IOError as e:
            # Skip this check if the dataset files are not available locally.
            raise SkipTest(str(e))

        for ds in (data_train, data_valid, data_test):
            features, = ds.get_data(None, slice(0, 100))
            
            img_height, img_width = img_shape
            x_dim = img_height * img_width

            features = features.reshape([100, -1])
            assert features.shape == (100, x_dim)
Example #4
    def check_dataset(name):
        try:
            img_shape, data_train, data_valid, data_test = datasets.get_data(
                name)
        except IOError as e:
            # Skip this check if the dataset files are not available locally.
            raise SkipTest(str(e))

        for ds in (data_train, data_valid, data_test):
            features, = ds.get_data(None, slice(0, 100))

            img_height, img_width = img_shape
            x_dim = img_height * img_width

            features = features.reshape([100, -1])
            assert features.shape == (100, x_dim)
Example #5
def main(name, dataset, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim, oldmodel):

    image_size, data_train, data_valid, data_test = datasets.get_data(dataset)

    train_stream = Flatten(
        DataStream(data_train,
                   iteration_scheme=SequentialScheme(data_train.num_examples,
                                                     batch_size)))
    valid_stream = Flatten(
        DataStream(data_valid,
                   iteration_scheme=SequentialScheme(data_valid.num_examples,
                                                     batch_size)))
    test_stream = Flatten(
        DataStream(data_test,
                   iteration_scheme=SequentialScheme(data_test.num_examples,
                                                     batch_size)))

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = img_height * img_width

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    if attention != "":
        read_N, write_N = attention.split(',')

        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * read_N**2

        reader = AttentionReader(x_dim=x_dim,
                                 dec_dim=dec_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=read_N,
                                 **inits)
        writer = AttentionWriter(input_dim=dec_dim,
                                 output_dim=x_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=write_N,
                                 **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

        attention_tag = "full"

    #----------------------------------------------------------------------

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation. E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = time.strftime("%Y%m%d-%H%M%S") + "-" + name
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print("               dataset: %s" % dataset)
    print("          subdirectory: %s" % subdir)
    print("         learning rate: %g" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print("            batch size: %d" % batch_size)
    print("                epochs: %d" % epochs)
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc",
                      **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec",
                      **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(n_iter,
                     reader=reader,
                     encoder_mlp=encoder_mlp,
                     encoder_rnn=encoder_rnn,
                     sampler=q_sampler,
                     decoder_mlp=decoder_mlp,
                     decoder_rnn=decoder_rnn,
                     writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    #x_recons = 1. + x
    x_recons, kl_terms = draw.reconstruct(x)
    #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100)
    #x_recons = x_recons[-1,:,:]

    #samples = draw.sample(100)
    #x_recons = samples[-1, :, :]

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    #algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_epoch=True),
            #            DataStreamMonitoring(
            #                monitors,
            #                valid_stream,
            ##                updates=scan_updates,
            #                prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
                #                updates=scan_updates,
                prefix="test"),
            Checkpoint(name,
                       before_training=False,
                       after_epoch=True,
                       save_separately=['log', 'model']),
            #Checkpoint(image_size=image_size, save_subdir=subdir, path=pickle_file, before_training=False, after_epoch=True, save_separately=['log', 'model']),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
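
As a quick sanity check of the lr_tag helper above (expected values come straight from its docstring; lr_tag is defined inside main here, so this sketch assumes it is lifted to module scope):

    assert lr_tag(0.1) == "11"
    assert lr_tag(0.001) == "13"
    assert lr_tag(0.005) == "53"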
Example #6
def main(name, dataset, epochs, batch_size, learning_rate, attention, 
            n_iter, enc_dim, dec_dim, z_dim, oldmodel, live_plotting):

    image_size, channels, data_train, data_valid, data_test = datasets.get_data(dataset)

    train_stream = Flatten(DataStream.default_stream(data_train, iteration_scheme=SequentialScheme(data_train.num_examples, batch_size)))
    valid_stream = Flatten(DataStream.default_stream(data_valid, iteration_scheme=SequentialScheme(data_valid.num_examples, batch_size)))
    test_stream  = Flatten(DataStream.default_stream(data_test,  iteration_scheme=SequentialScheme(data_test.num_examples, batch_size)))

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = channels * img_height * img_width

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # Configure attention mechanism
    if attention != "":
        read_N, write_N = attention.split(',')
    
        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * channels * read_N ** 2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 channels=channels, width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 channels=channels, width=img_width, height=img_height,
                                 N=write_N, **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2*x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

        attention_tag = "full"

    #----------------------------------------------------------------------

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation. E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e"%value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = name + "-" + time.strftime("%Y%m%d-%H%M%S")
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print("               dataset: %s" % dataset)
    print("          subdirectory: %s" % subdir)
    print("         learning rate: %g" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print("            batch size: %d" % batch_size)
    print("                epochs: %d" % epochs)
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim+dec_dim), 4*enc_dim], name="MLP_enc", **inits)
    decoder_mlp = MLP([Identity()], [             z_dim, 4*dec_dim], name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
                n_iter, 
                reader=reader,
                encoder_mlp=encoder_mlp,
                encoder_rnn=encoder_rnn,
                sampler=q_sampler,
                decoder_mlp=decoder_mlp,
                decoder_rnn=decoder_rnn,
                writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')
    
    x_recons, kl_terms = draw.reconstruct(x)

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost, 
        parameters=params,
        step_rule=CompositeRule([
            StepClipping(10.), 
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t,:].mean()
        kl_term_t.name = "kl_term_%d" % t

        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    plotting_extensions = []
    if live_plotting:
        plotting_extensions = [
            Plot(name, channels=plot_channels)
        ]

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(
                train_monitors, 
                prefix="train",
                after_epoch=True),
#            DataStreamMonitoring(
#                monitors,
#                valid_stream,
##                updates=scan_updates,
#                prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
#                updates=scan_updates, 
                prefix="test"),
            #Checkpoint(name, before_training=False, after_epoch=True, save_separately=['log', 'model']),
            PartsOnlyCheckpoint("{}/{}".format(subdir,name), before_training=True, after_epoch=True, save_separately=['log', 'model']),
            SampleCheckpoint(image_size=image_size[0], channels=channels, save_subdir=subdir, before_training=True, after_epoch=True),
            ProgressBar(),
            Printing()] + plotting_extensions)

    if oldmodel is not None:
        print("Initializing parameters with old model %s"%oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
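
For reference, a hypothetical invocation of this entry point (the argument names come from the signature above; the values are purely illustrative, not defaults from the source):

    main(name=None, dataset='bmnist', epochs=100, batch_size=100,
         learning_rate=1e-3, attention='2,5', n_iter=10,
         enc_dim=256, dec_dim=256, z_dim=100,
         oldmodel=None, live_plotting=False)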
Example #7
from PIL import Image

import unittest

# Imports required by the snippet below (the Fuel data pipeline plus the
# local datasets module used throughout these examples).
from fuel.schemes import SequentialScheme
from fuel.streams import DataStream
from fuel.transformers import Flatten

import datasets
from draw.attention import *

import SVRT_analysis_helper_functions

# Initialize vars and theano function
num_estimates = 20
num_training = 6  # per evaluation
num_testing = 1  # per evaluation
batch_size = num_estimates * num_training + num_estimates * num_testing  # get a bunch just to be sure

image_size, channels, data_train, data_valid, data_test = datasets.get_data('sketch')
rows = 10
cols = 20
N_iter = 64

# Load images
train_stream  = Flatten(DataStream.default_stream(data_train, iteration_scheme=SequentialScheme(data_train.num_examples, batch_size)))
train_batch = train_stream.get_epoch_iterator()
train_image = train_batch.data_stream.get_data()[0][:].reshape(batch_size,32,32)
train_labels = train_batch.data_stream.get_data()[1]
test_stream  = Flatten(DataStream.default_stream(data_test, iteration_scheme=SequentialScheme(data_test.num_examples, batch_size)))
test_batch = test_stream.get_epoch_iterator()
test_image = test_batch.data_stream.get_data()[0][:].reshape(batch_size,32,32)
test_labels = test_batch.data_stream.get_data()[1]
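
Note that this snippet reaches through the epoch iterator into data_stream.get_data(); the more common Fuel pattern is to pull batches from the iterator itself. A minimal sketch, assuming the stream yields (features, targets) pairs with the shapes used above:

    train_iter = train_stream.get_epoch_iterator()
    features, targets = next(train_iter)
    train_image = features.reshape(batch_size, 32, 32)
    train_labels = targets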

Example #8
def main(name, dataset, epochs, batch_size, learning_rate, 
         attention, n_iter, enc_dim, dec_dim, z_dim, oldmodel,
         max_total_duration, results_root_dir, nosync,
         adam_args_json,
         rmsprop_args_json):

    image_size, channels, data_train, data_valid, data_test = datasets.get_data(dataset)

    # Sequential scheme as originally implemented.
    #train_stream = Flatten(DataStream.default_stream(data_train, iteration_scheme=SequentialScheme(data_train.num_examples, batch_size)))
    #valid_stream = Flatten(DataStream.default_stream(data_valid, iteration_scheme=SequentialScheme(data_valid.num_examples, batch_size)))
    #test_stream  = Flatten(DataStream.default_stream(data_test,  iteration_scheme=SequentialScheme(data_test.num_examples, batch_size)))

    # Shuffled version makes more sense for distributed training.
    train_stream = Flatten(DataStream.default_stream(data_train, iteration_scheme=ShuffledScheme(data_train.num_examples, batch_size)))
    valid_stream = Flatten(DataStream.default_stream(data_valid, iteration_scheme=ShuffledScheme(data_valid.num_examples, batch_size)))
    test_stream  = Flatten(DataStream.default_stream(data_test,  iteration_scheme=ShuffledScheme(data_test.num_examples, batch_size)))

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = channels * img_height * img_width

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # Configure attention mechanism
    if attention != "":
        read_N, write_N = attention.split(',')
    
        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * channels * read_N ** 2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 channels=channels, width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 channels=channels, width=img_width, height=img_height,
                                 N=write_N, **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2*x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

        attention_tag = "full"

    #----------------------------------------------------------------------

    if 'MOAB_JOBARRAYINDEX' in os.environ:
        name = name + ("-%0.3d" % int(os.environ['MOAB_JOBARRAYINDEX']))

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation. E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e"%value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = name + "-" + time.strftime("%Y%m%d-%H%M%S")

    if results_root_dir is not None:
        # Place the timestamped experiment directory under the results root.
        subdir = os.path.join(results_root_dir, subdir)

    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print("               dataset: %s" % dataset)
    print("          subdirectory: %s" % subdir)
    print("         learning rate: %g" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print("            batch size: %d" % batch_size)
    print("                epochs: %d" % epochs)
    print("    max_total_duration: %d" % max_total_duration)
    print("                nosync: %s" % str(nosync))
    print("        adam_args_json: %s" % str(adam_args_json))
    print("     rmsprop_args_json: %s" % str(rmsprop_args_json))
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim+dec_dim), 4*enc_dim], name="MLP_enc", **inits)
    decoder_mlp = MLP([Identity()], [             z_dim, 4*dec_dim], name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
                n_iter, 
                reader=reader,
                encoder_mlp=encoder_mlp,
                encoder_rnn=encoder_rnn,
                sampler=q_sampler,
                decoder_mlp=decoder_mlp,
                decoder_rnn=decoder_rnn,
                writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')
    
    x_recons, kl_terms = draw.reconstruct(x)

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    if adam_args_json is not None:

        print("Setup for Adam with arguments passed by command-line.")
        import json
        adam_args = json.loads(adam_args_json)
        if learning_rate is not None:
            adam_args['learning_rate'] = learning_rate

        algorithm = GradientDescent(
            cost=cost, 
            parameters=params,
            step_rule=CompositeRule([
                StepClipping(10.), 
                Adam(**adam_args),
            ])
        )

    elif rmsprop_args_json is not None:

        print("Setup for RMSProp with arguments passed by command-line.")
        import json
        rmsprop_args = json.loads(rmsprop_args_json)
        if learning_rate is not None:
            rmsprop_args['learning_rate'] = learning_rate

        algorithm = GradientDescent(
            cost=cost, 
            parameters=params,
            step_rule=CompositeRule([
                StepClipping(10.), 
                RMSProp(**rmsprop_args),
            ])
        )

    else:

        print("Setup for Adam by default.")
        # default original behavior
        algorithm = GradientDescent(
            cost=cost, 
            parameters=params,
            step_rule=CompositeRule([
                StepClipping(10.), 
                Adam(learning_rate),
            ])
            #step_rule=RMSProp(learning_rate),
            #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
        )
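
    # Illustrative JSON for the two switches above (a sketch: the key names
    # follow Blocks' Adam/RMSProp keyword arguments, and the values are
    # assumptions, not defaults taken from this script), e.g.:
    #   adam_args_json    = '{"learning_rate": 0.001, "beta1": 0.9, "beta2": 0.999}'
    #   rmsprop_args_json = '{"learning_rate": 0.001, "decay_rate": 0.9}'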

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t,:].mean()
        kl_term_t.name = "kl_term_%d" % t

        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------------------
    # Setup for legion

    cg = ComputationGraph(cost)
    params_to_sync = {}
    #cg.variables
    counter = 0
    print("---- cg.parameters ----")
    for p in cg.parameters:
        # `p` is of type theano.sandbox.cuda.var.CudaNdarraySharedVariable

        # Warning. This is not as deterministic as we would want.
        # For now, however, we don't have much of a choice.
        new_name = p.name
        while new_name in params_to_sync:
            counter += 1
            new_name = p.name + ("_%d" % counter)

        params_to_sync[new_name] = p
        print("Parameter %s now referred to as %s." % (p.name, new_name))
        #import pdb; pdb.set_trace()
    print("---- --.---------- ----")

    #------------------------------------------------------------

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    extensions = [
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(
                train_monitors, 
                prefix="train",
                after_epoch=True),
#            DataStreamMonitoring(
#                monitors,
#                valid_stream,
##                updates=scan_updates,
#                prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
#                updates=scan_updates, 
                prefix="test"),
            PartsOnlyCheckpoint("{}/{}".format(subdir, name), before_training=True, after_epoch=True, save_separately=['log', 'model'])
            #SampleCheckpoint(image_size=image_size[0], channels=channels, save_subdir=subdir, before_training=True, after_epoch=True),
            ]

    if not nosync:
        # With parameter sync on legion,
        #extensions.append(SharedParamsRateLimited(params=params_to_sync, before_training=True, alpha=1.0, beta=0.0, maximum_rate=1.0))
        #extensions.append(SharedParamsRateLimited(params=params_to_sync, before_training=True, every_n_batches=4, alpha=0.5, beta=0.5, maximum_rate=0.2, want_sync_timing_log=True))
        extensions.append(SharedParamsRateLimited(params=params_to_sync, before_training=True, every_n_batches=1, alpha=0.99, beta=0.01, maximum_rate=0.2, want_sync_timing_log=True))


    extensions = extensions + [StopAfterTimeElapsed(every_n_batches=4, total_duration=max_total_duration),
                               # Timing information to facilitate plotting.
                               Timing(every_n_epochs=1),
                               Timestamp(every_n_batches=4),
                               # Plot(name, channels=plot_channels),
                               ProgressBar(),
                               Printing()]

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions)


    if oldmodel is not None:
        print("Initializing parameters with old model %s"%oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_parameter_values(oldmodel.get_parameter_values())
        del oldmodel

    main_loop.run()