Example #1
0
                  source_names=source_names,
                  emitter=emitter,
                  feedback_brick=feedback,
                  name="readout")

attention = SimpleSequenceAttention(state_names=source_names,
                                    state_dims=[hidden_size_recurrent],
                                    attended_dim=context_size)

generator = SequenceGenerator(readout=readout,
                              transition=transition,
                              attention=attention,
                              name="generator")

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.)
generator.initialize()

mlp_context.weights_init = IsotropicGaussian(0.01)
mlp_context.biases_init = Constant(0.)
mlp_context.initialize()

#ipdb.set_trace()
cost_matrix = generator.cost_matrix(x,
                                    x_mask,
                                    attended=mlp_context.apply(context))
cost = cost_matrix.sum() / x_mask.sum()
cost.name = "sequence_log_likelihood"

cg = ComputationGraph(cost)
model = Model(cost)
Example #2
0
attention = SimpleSequenceAttention(
    state_names=source_names, state_dims=[hidden_size_recurrent], attended_dim=context_size, name="attention"
)

readout = Readout(
    readout_dim=hidden_size_recurrent,
    source_names=source_names + ["feedback"] + ["glimpses"],
    emitter=emitter,
    feedback_brick=feedback,
    name="readout",
)

generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, name="generator")

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.0)
generator.push_initialization_config()

generator.transition.biases_init = IsotropicGaussian(0.01, 1)
generator.transition.push_initialization_config()

generator.initialize()

lookup.weights_init = IsotropicGaussian(0.001)
lookup.biases_init = Constant(0.0)
lookup.initialize()

# states = {}
states = [state for state in generator.transition.apply.outputs if state != "step"]

# ipdb.set_trace()
Example #3
0
def main(name, epochs, batch_size, learning_rate,
         dim, mix_dim, old_model_name, max_length, bokeh, GRU, dropout,
         depth, max_grad, step_method, epsilon, sample, skip, uniform, top):

    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """ Convert a positive float into a short tag-usable string
             E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2
        """
        return '0' if x <= 0 else '%s%d' % (("%e"%x)[0], -np.floor(np.log10(x)))

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (datasource, depth, dim, mix_dim,
                                           int(dropout*10),
                                           shnum(learning_rate), batch_size,
                                           shnum(epsilon))
    if max_length != 600:
        jobname += '-L%d'%max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g'%max_grad
    if step_method != 'adam':
        jobname += step_method
    if skip:
        jobname += 'D'
        assert depth > 1
    if top:
        jobname += 'T'
        assert depth > 1
    if uniform > 0.:
        jobname += 'u%d'%int(uniform*100)

    if debug:
        jobname += ".debug"

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)
    if old_model_name:
        print("starting from model %s"%old_model_name)

    #----------------------------------------------------------------------
    transitions = [GatedRecurrent(dim=dim) if GRU else LSTM(dim=dim)
                   for _ in range(depth)]
    if depth > 1:
        transition = RecurrentStack(transitions, name="transition",
                                    fast=True, skip_connections=skip or top)
        if skip:
            source_names=['states'] + ['states_%d'%d for d in range(1,depth)]
        else:
            source_names=['states_%d'%(depth-1)]
    else:
        transition = transitions[0]
        transition.name = "transition"
        source_names=['states']

    emitter = SketchEmitter(mix_dim=mix_dim,
                            epsilon=epsilon,
                            name="emitter")
    readout = Readout(
        readout_dim=emitter.get_dim('inputs'),
        source_names=source_names,
        emitter=emitter,
        name="readout")
    normal_inputs = [name for name in transition.apply.sequences
                     if 'mask' not in name]
    fork = Fork(normal_inputs, prototype=Linear(use_bias=True))
    generator = SequenceGenerator(readout=readout, transition=transition,
                                  fork=fork)

    # Initialization settings
    if uniform > 0.:
        generator.weights_init = Uniform(width=uniform*2.)
    else:
        generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps, batch_size, 3]
    x = T.tensor3('features', dtype=floatX)
    if debug:
        x.tag.test_value = np.ones((max_length,batch_size,3)).astype(floatX)
    x = x[:max_length,:,:]  # has to be after setting test_value
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_params()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, value.get_value().shape) for key, value
                     in params.items()],
                    width=120))
    model_size = 0
    for v in params.itervalues():
        s = v.get_value().shape
        model_size += s[0] * (s[1] if len(s) > 1 else 1)
    logger.info("Total number of parameters %d"%model_size)

    #------------------------------------------------------------
    extensions = []
    if old_model_name == 'continue':
        extensions.append(LoadFromDump(jobname))
    elif old_model_name:
        # or you can just load the weights without state using:
        old_params = LoadFromDump(old_model_name).manager.load_parameters()
        model.set_param_values(old_params)
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path=old_model_name).do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=transitions,
                                        name_regex='states')(cg.variables)
        print('# dropout %d' % len(dropout_target))
        cg = apply_dropout(cg, dropout_target, dropout)
        opt_cost = cg.outputs[0]
    else:
        opt_cost = cost

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate)
    else:
        raise Exception('Unknown sttep method %s'%step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(
        cost=opt_cost, params=cg.parameters,
        step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies,) = VariableFilter(
        applications=[generator.readout.readout],
        name_regex="output")(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    observables += [min_energy, max_energy]

    # (activations,) = VariableFilter(
    #     applications=[generator.transition.apply],
    #     name=generator.transition.apply.states[0])(cg.variables)
    # mean_activation = named_copy(abs(activations).mean(),
    #                              "mean_activation")
    # observables.append(mean_activation)

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(named_copy(
            param.norm(2), name + "_norm"))
        observables.append(named_copy(
            algorithm.gradients[param].norm(2), name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource+'.hdf5')

    train_ds = H5PYDataset(datasource_fname, #max_length=max_length,
                             which_set='train', sources=('features',),
                             load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(datasource_fname, #max_length=max_length,
                            which_set='test', sources=('features',),
                            load_in_memory=True)
    test_stream  = DataStream(test_ds,
                              iteration_scheme=SequentialScheme(
                                  test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' %
              (label, batch_count, examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [Timing(every_n_batches=10),
                   TrainingDataMonitoring(
                       observables, prefix="train",
                       every_n_batches=10),
                   DataStreamMonitoring(
                       [cost],  # without dropout
                       test_stream,
                       prefix="test",
                       on_resumption=True,
                       after_epoch=False,  # by default this is True
                       every_n_batches=100),
                   # all monitored data is ready so print it...
                   # (next steps may take more time and we want to see the
                   # results as soon as possible so print as soon as you can)
                   Printing(every_n_batches=10),
                   # perform multiple dumps at different intervals
                   # so if one of them breaks (has nan) we can hopefully
                   # find a model from few batches ago in the other
                   Dump(jobname, every_n_batches=11),
                   Dump(jobname+'.test', every_n_batches=100),
                   Sample(generator, steps=max_length,
                          path=jobname+'.test',
                          every_n_batches=100),
                   ProgressBar(),
                   FinishAfter(after_n_epochs=epochs)
                    # This shows a way to handle NaN emerging during
                    # training: simply finish it.
                    .add_condition("after_batch", _is_nan),
                   ]

    if bokeh:
        from blocks.extensions.plot import Plot
        extensions.append(Plot(
            'sketch',
            channels=[
                ['cost'],]))

    # Construct the main loop and start training!
    main_loop = MainLoop(
        model=model,
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions
        )

    main_loop.run()
Example #4
0
    def __init__(self, config, vocab_size):
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        context_bag = to_bag(context, vocab_size)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='embed')
        embed.weights_init = IsotropicGaussian(0.01)
        #embeddings_initial_value = init_embedding_table(filename='embeddings/vocab_embeddings.txt')
        #embed.weights_init = Constant(embeddings_initial_value)

        # Calculate context encoding (concatenate layer1)
        cembed = embed.apply(context)
        clstms, chidden_list = make_bidir_lstm_stack(
            cembed, config.embed_size,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2 * config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Build the encoder bricks
        transition = GatedRecurrent(activation=Tanh(),
                                    dim=config.generator_lstm_size,
                                    name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=cenc_dim,
            match_dim=config.generator_lstm_size,
            name="attention")
        readout = Readout(readout_dim=vocab_size,
                          source_names=[
                              transition.apply.states[0],
                              attention.take_glimpses.outputs[0]
                          ],
                          emitter=MaskedSoftmaxEmitter(context_bag=context_bag,
                                                       name='emitter'),
                          feedback_brick=LookupFeedback(
                              vocab_size, config.feedback_size),
                          name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      name="generator")

        cost = generator.cost(answer,
                              answer_mask.astype(theano.config.floatX),
                              attended=cenc,
                              attended_mask=context_mask.astype(
                                  theano.config.floatX),
                              name="cost")
        self.predictions = generator.generate(
            n_steps=7,
            batch_size=config.batch_size,
            attended=cenc,
            attended_mask=context_mask.astype(theano.config.floatX),
            iterate=True)[1]

        # Apply dropout
        cg = ComputationGraph([cost])

        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # initialize new stuff manually (change!)
        generator.weights_init = IsotropicGaussian(0.01)
        generator.biases_init = Constant(0)
        generator.push_allocation_config()
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Example #5
0
def main(name, epochs, batch_size, learning_rate, dim, mix_dim, old_model_name,
         max_length, bokeh, GRU, dropout, depth, max_grad, step_method,
         epsilon, sample):

    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """ Convert a positive float into a short tag-usable string
             E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2
        """
        return '0' if x <= 0 else '%s%d' % (
            ("%e" % x)[0], -np.floor(np.log10(x)))

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (
        datasource, depth, dim, mix_dim, int(
            dropout * 10), shnum(learning_rate), batch_size, shnum(epsilon))
    if max_length != 600:
        jobname += '-L%d' % max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g' % max_grad
    if step_method != 'adam':
        jobname += step_method

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)

    #----------------------------------------------------------------------
    if depth > 1:
        transition = LSTMstack(dim=dim,
                               depth=depth,
                               name="transition",
                               lstm_name="transition")
        assert not GRU
    elif GRU:
        transition = GatedRecurrent(dim=dim, name="transition")
    else:
        transition = LSTM(dim=dim, name="transition")

    emitter = SketchEmitter(mix_dim=mix_dim, epsilon=epsilon, name="emitter")
    readout = Readout(readout_dim=emitter.get_dim('inputs'),
                      source_names=['states'],
                      emitter=emitter,
                      name="readout")
    normal_inputs = [
        name for name in transition.apply.sequences if 'mask' not in name
    ]
    fork = Fork(normal_inputs, prototype=Linear(use_bias=True))
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  fork=fork)

    # Initialization settings
    generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps,batch_size, 3]
    x = T.tensor3('features', dtype=floatX)[:max_length, :, :]
    x.tag.test_value = np.ones((max_length, batch_size, 3)).astype(np.float32)
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_params()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, value.get_value().shape)
                                for key, value in params.items()],
                               width=120))
    model_size = 0
    for v in params.itervalues():
        s = v.get_value().shape
        model_size += s[0] * (s[1] if len(s) > 1 else 1)
    logger.info("Total number of parameters %d" % model_size)

    #------------------------------------------------------------
    extensions = []
    if old_model_name == 'continue':
        extensions.append(LoadFromDump(jobname))
    elif old_model_name:
        # or you can just load the weights without state using:
        old_params = LoadFromDump(old_model_name).manager.load_parameters()
        model.set_param_values(old_params)
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path='.').do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=[transition],
                                        name_regex='states')(cg.variables)
        cg = apply_dropout(cg, dropout_target, dropout)
        cost = cg.outputs[0]

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate=0.1)
    else:
        raise Exception('Unknown sttep method %s' % step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies, ) = VariableFilter(applications=[generator.readout.readout],
                                  name_regex="output")(cg.variables)
    (activations, ) = VariableFilter(
        applications=[generator.transition.apply],
        name=generator.transition.apply.states[0])(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_activation = named_copy(abs(activations).mean(), "mean_activation")
    observables += [min_energy, max_energy, mean_activation]

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(named_copy(param.norm(2), name + "_norm"))
        observables.append(
            named_copy(algorithm.gradients[param].norm(2),
                       name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource + '.hdf5')

    train_ds = H5PYDataset(
        datasource_fname,  #max_length=max_length,
        which_set='train',
        sources=('features', ),
        load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(
        datasource_fname,  #max_length=max_length,
        which_set='test',
        sources=('features', ),
        load_in_memory=True)
    test_stream = DataStream(test_ds,
                             iteration_scheme=SequentialScheme(
                                 test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' %
              (label, batch_count, examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [
        Timing(every_n_batches=10),
        TrainingDataMonitoring(observables, prefix="train",
                               every_n_batches=10),
        DataStreamMonitoring(
            [cost],
            test_stream,
            prefix="test",
            on_resumption=True,
            after_epoch=False,  # by default this is True
            every_n_batches=100),
        # all monitored data is ready so print it...
        # (next steps may take more time and we want to see the
        # results as soon as possible so print as soon as you can)
        Printing(every_n_batches=10),
        # perform multiple dumps at different intervals
        # so if one of them breaks (has nan) we can hopefully
        # find a model from few batches ago in the other
        Dump(jobname, every_n_batches=11),
        Dump(jobname + '.test', every_n_batches=100),
        Sample(generator,
               steps=max_length,
               path=jobname + '.test',
               every_n_batches=100),
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs)
        # This shows a way to handle NaN emerging during
        # training: simply finish it.
        .add_condition("after_batch", _is_nan),
    ]

    if bokeh:
        extensions.append(Plot('sketch', channels=[
            ['cost'],
        ]))

    # Construct the main loop and start training!
    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
Example #6
0
def main_rnn(config):

    x = tensor.tensor3('features')
    y = tensor.matrix('targets')

#    if 'LSTM' in config['model'] :
#        from models import getLSTMstack
#        y_hat = getLSTMstack(input_dim=13, input_var=x, depth=int(config['model'][-1]))
#    else :
#        raise Exception("These are not the LSTM we are looking for")

#    y_hat = model.apply(x)
    

    emitter = TestEmitter()
#    emitter = TrivialEmitter(readout_dim=config['lstm_hidden_size'])

#    cost_func = SquaredError()

 #   @application
 #   def qwe(self, readouts, outputs=None):
 #       print(type(self), type(readouts))
 #       x = cost_func.apply(readouts,outputs)
 #       return x
    print(type(emitter.cost))
 #   emitter.cost = qwe
  #  print(type(qwe))

    steps = 2 
    n_samples= config['target_size']

    transition = [LSTM(config['lstm_hidden_size']) for _ in range(4)]
    transition = RecurrentStack(transition,
            name="transition", skip_connections=False)

    source_names = [name for name in transition.apply.states if 'states' in name]

    readout = Readout(emitter, readout_dim=config['lstm_hidden_size'], source_names=source_names,feedback_brick=None, merge=None, merge_prototype=None, post_merge=None, merged_dim=None)

    seqgen = SequenceGenerator(readout, transition, attention=None, add_contexts=False)
    seqgen.weights_init = IsotropicGaussian(0.01)
    seqgen.biases_init = Constant(0.)
    seqgen.push_initialization_config()

    seqgen.transition.biases_init = IsotropicGaussian(0.01,1)
    seqgen.transition.push_initialization_config()
    seqgen.initialize()

    states = seqgen.transition.apply.outputs
    print('states',states)
    states = {name: shared_floatx_zeros((n_samples, config['lstm_hidden_size']))
        for name in states}

    cost_matrix = seqgen.cost_matrix(x, **states)
    cost = cost_matrix.mean()
    cost.name = "nll"

    cg = ComputationGraph(cost)
    model = Model(cost)
    #Cost
#    cost = SquaredError().apply(y_hat ,y)
    #cost = CategoricalCrossEntropy().apply(T.flatten(),Y)
 #   

        #for sampling
    #cg = ComputationGraph(seqgen.generate(n_steps=steps,batch_size=n_samples, iterate=True))
  

    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=config['learning_rate']))



    #Getting the stream
    train_stream = MFCC.get_stream(config['batch_size'],config['source_size'],config['target_size'],config['num_examples'])


    #Monitoring stuff
    extensions = [Timing(),
                  FinishAfter(after_n_batches=config['num_batches']),
                  #DataStreamMonitoring([cost, error_rate],test_stream,prefix="test"),
                  TrainingDataMonitoring([cost], prefix="train", every_n_batches=1),
                  #Checkpoint(save_to),
                  ProgressBar(),
                  Printing(every_n_batches=1)]
   

    main_loop = MainLoop(
        algorithm,
        train_stream,
 #       model=model,
        extensions=extensions)

    main_loop.run()