Example 1
def test_step_clipping_no_threshold_regression():
    """Test regression for #1145, incorrect output when threshold=None."""
    rule1 = StepClipping()
    assert rule1.threshold is None
    gradients = {0: shared_floatx(3.0), 1: shared_floatx(4.0)}
    clipped1, updates = rule1.compute_steps(gradients)
    assert len(updates) == 0
    assert clipped1 == gradients
Example 3
def test_step_clipping():
    rule1 = StepClipping(4)
    rule2 = StepClipping(5)

    gradients = {0: shared_floatx(3.0), 1: shared_floatx(4.0)}
    clipped1, _ = rule1.compute_steps(gradients)
    assert_allclose(clipped1[0].eval(), 12 / 5.0)
    assert_allclose(clipped1[1].eval(), 16 / 5.0)
    clipped2, _ = rule2.compute_steps(gradients)
    assert_allclose(clipped2[0].eval(), 3.0)
    assert_allclose(clipped2[1].eval(), 4.0)
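For reference, the arithmetic behind the expected values above, as a plain numpy sketch (clip_by_global_norm is a hypothetical helper, not part of blocks):

import numpy as np

def clip_by_global_norm(steps, threshold):
    # Rescale every step by threshold / norm when the global L2 norm of all
    # steps exceeds the threshold; otherwise leave the steps untouched.
    norm = np.sqrt(sum(np.sum(s ** 2) for s in steps))
    scale = threshold / norm if norm > threshold else 1.0
    return [s * scale for s in steps]

print(clip_by_global_norm([3.0, 4.0], 4))  # [2.4, 3.2], i.e. 12/5 and 16/5
print(clip_by_global_norm([3.0, 4.0], 5))  # [3.0, 4.0]; the norm is exactly 5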
Example 4
def learning_algorithm(args):
    name = args.algorithm
    learning_rate = float(args.learning_rate)
    momentum = args.momentum
    clipping_threshold = args.clipping
    clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
    if name == 'adam':
        adam = Adam(learning_rate=learning_rate)
        step_rule = CompositeRule([adam, clipping])
        learning_rate = adam.learning_rate
    elif name == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        step_rule = CompositeRule([clipping, rms_prop])
        learning_rate = rms_prop.learning_rate
    elif name == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        step_rule = CompositeRule([clipping, sgd_momentum])
        learning_rate = sgd_momentum.learning_rate
    elif name == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        step_rule = CompositeRule([clipping, sgd])
        learning_rate = sgd.learning_rate
    else:
        raise NotImplementedError
    return step_rule, learning_rate
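A hypothetical usage sketch for the helper above; the args attribute names (algorithm, learning_rate, momentum, clipping) are assumptions read off the function body:

import argparse

args = argparse.Namespace(algorithm='adam', learning_rate=0.001,
                          momentum=0.9, clipping=1.0)
step_rule, learning_rate = learning_algorithm(args)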
Example 5
def learning_algorithm(args):
    name = args.algorithm
    learning_rate = float(args.learning_rate)
    momentum = args.momentum
    clipping_threshold = args.clipping
    if name == 'adam':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        adam = Adam(learning_rate=learning_rate)
        step_rule = CompositeRule([adam, clipping])
    elif name == 'rms_prop':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        rms_prop = RMSProp(learning_rate=learning_rate)
        step_rule = CompositeRule([clipping, rms_prop])
    else:
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        step_rule = CompositeRule([clipping, sgd_momentum])
    return step_rule
Example 6
def learning_algorithm(args):
    name = args.algorithm
    learning_rate = float(args.learning_rate)
    momentum = args.momentum
    clipping_threshold = args.clipping
    if name == 'adam':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        adam = Adam(learning_rate=learning_rate)
        # [adam, clipping] means 'step clipping'
        # [clipping, adam] means 'gradient clipping'
        step_rule = CompositeRule([adam, clipping])
    elif name == 'rms_prop':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        rms_prop = RMSProp(learning_rate=learning_rate)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, rms_prop, rm_non_finite])
    else:
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, sgd_momentum, rm_non_finite])
    return step_rule
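The ordering comments above can be checked directly. A minimal sketch, assuming blocks and theano are installed, with Scale standing in for a full optimizer:

from blocks.algorithms import CompositeRule, Scale, StepClipping
from blocks.utils import shared_floatx

gradients = {0: shared_floatx(3.0), 1: shared_floatx(4.0)}  # global norm 5

# [clipping, optimizer]: raw gradients are clipped to norm 1, then scaled.
clipped_first, _ = CompositeRule(
    [StepClipping(1.0), Scale(0.1)]).compute_steps(gradients)
print(clipped_first[0].eval(), clipped_first[1].eval())  # 0.06 0.08

# [optimizer, clipping]: the scaled steps have norm 0.5, already below the
# threshold, so the clipping rule leaves them unchanged.
clipped_last, _ = CompositeRule(
    [Scale(0.1), StepClipping(1.0)]).compute_steps(gradients)
print(clipped_last[0].eval(), clipped_last[1].eval())  # 0.3 0.4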
Example 7
def learning_algorithm(learning_rate,
                       momentum=0.0,
                       clipping_threshold=100,
                       algorithm='sgd'):
    if algorithm == 'adam':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        adam = Adam(learning_rate=learning_rate)
        # [adam, clipping] means 'step clipping'
        # [clipping, adam] means 'gradient clipping'
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        rms_prop = RMSProp(learning_rate=learning_rate)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, rms_prop, rm_non_finite])
    elif algorithm == 'sgd':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, sgd_momentum, rm_non_finite])
    else:
        raise NotImplementedError
    return step_rule
Example 8
def learning_algorithm(args):
    name = args.algorithm
    learning_rate = float(args.learning_rate)
    momentum = args.momentum
    clipping_threshold = args.clipping
    if name == 'adam':
        adam = Adam(learning_rate=learning_rate)
        step_rule = adam
    elif name == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate, decay_rate=0.9)
        step_rule = CompositeRule([StepClipping(1.), rms_prop])
    else:
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        step_rule = sgd_momentum
    return step_rule
Example 10
def test_composite_rule():
    rule = CompositeRule([StepClipping(4), Scale(0.1)])
    gradients = {0: shared_floatx(3.0), 1: shared_floatx(4.0)}
    result, _ = rule.compute_steps(gradients)
    assert_allclose(result[0].eval(), 12 / 50.0)
    assert_allclose(result[1].eval(), 16 / 50.0)

    class RuleWithUpdates(StepRule):
        def __init__(self, updates):
            self.updates = updates

        def compute_steps(self, previous_steps):
            return previous_steps, self.updates

    rule = CompositeRule([RuleWithUpdates([(1, 2)]),
                          RuleWithUpdates([(3, 4)])])
    assert rule.compute_steps(None)[1] == [(1, 2), (3, 4)]
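As RuleWithUpdates suggests, a custom rule only has to produce (steps, updates). A minimal sketch of a per-parameter rule, assuming blocks' StepRule interface where compute_step returns the transformed step plus a list of extra updates:

from theano import tensor
from blocks.algorithms import StepRule

class SignSteps(StepRule):
    """Toy rule for illustration: keep only the sign of each step."""
    def compute_step(self, parameter, previous_step):
        # No extra updates are needed, hence the empty list.
        return tensor.sgn(previous_step), []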
Example 11
    def train(self, data_file, output_data_file, n_epochs=0):

        training_data = dataset.T_H5PYDataset(data_file, which_sets=('train',))
        test_data = dataset.T_H5PYDataset(data_file, which_sets=('test',))

        session = Session(root_url='http://localhost:5006')

        if self.MainLoop is None:
            step_rules = [RMSProp(learning_rate=0.2, decay_rate=0.95), StepClipping(1)]

            algorithm = GradientDescent(cost=self.Cost,
                                        parameters=self.ComputationGraph.parameters,
                                        step_rule=CompositeRule(step_rules),
                                        on_unused_sources='ignore')

            train_stream = DataStream.default_stream(
                training_data, iteration_scheme=SequentialScheme(
                    training_data.num_examples, batch_size=100))

            test_stream = DataStream.default_stream(
                test_data, iteration_scheme=SequentialScheme(
                    test_data.num_examples, batch_size=100))

            self.MainLoop = MainLoop(
                model=Model(self.Cost),
                data_stream=train_stream,
                algorithm=algorithm,
                extensions=[
                    FinishAfter(after_n_epochs=n_epochs),
                    Printing(),
                    Checkpoint(output_data_file, every_n_epochs=50),
                    TrainingDataMonitoring([self.Cost], after_batch=True, prefix='train'),
                    DataStreamMonitoring([self.Cost], after_batch=True, data_stream=test_stream, prefix='test'),
                    Plot(output_data_file, channels=[['train_cost', 'test_cost']])
                ])

        self.MainLoop.run()
Example 12
    def train(self, training_data):

        step_rules = [Adam(), StepClipping(1.0)]

        algorithm = GradientDescent(
            cost=self.Cost,
            parameters=self.ComputationGraph.parameters,
            step_rule=CompositeRule(step_rules))

        train_stream = DataStream.default_stream(
            training_data,
            iteration_scheme=SequentialScheme(training_data.num_examples,
                                              batch_size=20))

        main = MainLoop(model=Model(self.Cost),
                        data_stream=train_stream,
                        algorithm=algorithm,
                        extensions=[
                            FinishAfter(),
                            Printing(),
                            Checkpoint('trainingdata.tar', every_n_epochs=10)
                        ])

        main.run()
Example 13
def setup_algorithms(cost, cg, method, type="ff"):
    """Setup training algorithm.

    Parameters
    ----------
    cost : expression
        Cost expression.
    cg : ComputationGraph
        Computation graph.
    method : string
        Training method: "sgd", "momentum", "adagrad" or "rmsprop".
    type : string
        Network type; "RNN" additionally applies StepClipping(1.0).

    Returns
    -------
    algorithm : GradientDescent
        Gradient descent algorithm based on the chosen optimization method.
    """
    if method == "sgd":
        step_rule = Scale(learning_rate=0.01)
    elif method == "momentum":
        step_rule = Momentum(learning_rate=0.01, momentum=0.95)
    elif method == "adagrad":
        step_rule = AdaGrad()
    elif method == "rmsprop":
        step_rule = RMSProp()

    if type == "RNN":
        step_rule = CompositeRule([StepClipping(1.0), step_rule])

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=step_rule)

    return algorithm
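A hypothetical call matching the branches above, for an RNN trained with RMSProp plus step clipping:

algorithm = setup_algorithms(cost, cg, "rmsprop", type="RNN")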
Example 14
emit = gmm_emitter.emit(h[-2])
emit.name = 'emitter'

cg = ComputationGraph(cost)
model = Model(cost)

#################
# Algorithm
#################

n_batches = 139*16

algorithm = GradientDescent(
    cost=cost, parameters=cg.parameters,
    step_rule=CompositeRule([StepClipping(10.0), Adam(lr)]))

train_monitor = TrainingDataMonitoring(
    variables=[cost],
    every_n_batches=n_batches,
    prefix="train")

valid_monitor = DataStreamMonitoring(
    [cost],
    valid_stream,
    after_epoch=True,
    # before_first_epoch=False,
    prefix="valid")

extensions = [
    Timing(every_n_batches=n_batches),
Example 15
                                   data_stream=data_stream_val,
                                   prefix='validation',
                                   after_epoch=True)

monitor_test = DataStreamMonitoring(variables=[error],
                                    data_stream=data_stream_test,
                                    prefix='test',
                                    after_epoch=True)

learning_rate = 0.00008
n_epochs = 100
algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            on_unused_sources='ignore',
                            step_rule=CompositeRule([
                                StepClipping(10.),
                                Adam(learning_rate),
                            ]))
main_loop = MainLoop(model=Model(cost),
                     data_stream=data_stream_train,
                     algorithm=algorithm,
                     extensions=[
                         Timing(),
                         FinishAfter(after_n_epochs=n_epochs), monitor,
                         monitor_val, monitor_test,
                         saveSnapshot(
                             '/home/xuehongyang/checkpoints_read/snapshot',
                             save_main_loop=False,
                             after_epoch=True,
                             save_separately=['log', 'model']),
                         ProgressBar(),
Example 16
def construct_main_loop(name, task_name, patch_shape, batch_size,
                        n_spatial_dims, n_patches, max_epochs, patience_epochs,
                        learning_rate, gradient_limiter, hyperparameters,
                        **kwargs):
    task = tasks.get_task(**hyperparameters)
    hyperparameters["n_channels"] = task.n_channels

    extensions = []

    # let theta noise decay as training progresses
    for key in "location_std scale_std".split():
        hyperparameters[key] = theano.shared(hyperparameters[key], name=key)
        extensions.append(
            util.ExponentialDecay(hyperparameters[key],
                                  hyperparameters["%s_decay" % key],
                                  after_batch=True))

    print "constructing graphs..."
    graphs, outputs, updates = construct_graphs(task=task, **hyperparameters)

    print "setting up main loop..."

    from blocks.model import Model
    model = Model(outputs["train"]["cost"])

    from blocks.algorithms import GradientDescent, CompositeRule, StepClipping, Adam, RMSProp
    from extensions import Compressor
    if gradient_limiter == "clip":
        limiter = StepClipping(1.)
    elif gradient_limiter == "compress":
        limiter = Compressor()
    else:
        raise ValueError("unknown gradient_limiter: %s" % gradient_limiter)

    algorithm = GradientDescent(
        cost=outputs["train"]["cost"],
        parameters=graphs["train"].parameters,
        step_rule=CompositeRule([limiter,
                                 Adam(learning_rate=learning_rate)]))
    algorithm.add_updates(updates["train"])

    extensions.extend(
        construct_monitors(algorithm=algorithm,
                           task=task,
                           model=model,
                           graphs=graphs,
                           outputs=outputs,
                           updates=updates,
                           **hyperparameters))

    from blocks.extensions import FinishAfter, Printing, ProgressBar, Timing
    from blocks.extensions.stopping import FinishIfNoImprovementAfter
    from blocks.extensions.training import TrackTheBest
    from blocks.extensions.saveload import Checkpoint
    from dump import DumpBest, LightCheckpoint, PrintingTo, DumpGraph, DumpLog
    extensions.extend([
        TrackTheBest("valid_error_rate", "best_valid_error_rate"),
        FinishIfNoImprovementAfter("best_valid_error_rate",
                                   epochs=patience_epochs),
        FinishAfter(after_n_epochs=max_epochs),
        DumpBest("best_valid_error_rate", name + "_best.zip"),
        Checkpoint(hyperparameters["checkpoint_save_path"],
                   on_interrupt=False,
                   every_n_epochs=10,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True),
        ProgressBar(),
        Timing(),
        Printing(),
        PrintingTo(name + "_log"),
        DumpGraph(name + "_grad_graph")
    ])

    from blocks.main_loop import MainLoop
    main_loop = MainLoop(data_stream=task.get_stream("train"),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    from tabulate import tabulate
    print "parameter sizes:"
    print tabulate(
        (key, "x".join(map(str,
                           value.get_value().shape)), value.get_value().size)
        for key, value in main_loop.model.get_parameter_dict().items())

    return main_loop
Example 17
x = tensor.matrix('features', dtype='uint8')
y = tensor.matrix('targets', dtype='uint8')
y_hat, cost, cells = nn_fprop(x, y, vocab_size, hidden_size, num_layers, model)

# COST
cg = ComputationGraph(cost)

if dropout > 0:
    # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015)
    inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(cg.variables)
    cg = apply_dropout(cg, inputs, dropout)
    cost = cg.outputs[0]

# Learning algorithm
step_rules = [RMSProp(learning_rate=learning_rate, decay_rate=decay_rate),
              StepClipping(step_clipping)]
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules))

# Extensions
gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
step_norm = aggregation.mean(algorithm.total_step_norm)
monitored_vars = [cost, gradient_norm, step_norm]

dev_monitor = DataStreamMonitoring(variables=[cost], after_epoch=True,
                                   before_first_epoch=True, data_stream=dev_stream, prefix="dev")
train_monitor = TrainingDataMonitoring(variables=monitored_vars, after_batch=True,
                                       before_first_epoch=True, prefix='tra')

extensions = [dev_monitor, train_monitor, Timing(), Printing(after_batch=True),
              FinishAfter(after_n_epochs=nepochs),
Example 18
def main(config, tr_stream, dev_stream, use_bokeh=False, the_task=None, the_track=None):

    config['the_task'] = the_task
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        # end_embed is dimension of word embedding matrix in encoder; enc_nhids number of hidden units in encoder GRU
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, config['use_attention'], cost_type=config['error_fct'])
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)
    testVar = decoder.getTestVar(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)
   
    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    my_rng = numpy.random.RandomState(config['rng_value']) 
    if config['identity_init']:
        encoder.weights_init = decoder.weights_init = Identity()
    else:
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.rng = decoder.rng = my_rng
    
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    encoder.bidir.prototype.rng = my_rng
    decoder.transition.weights_init = Orthogonal()
    decoder.transition.rng = my_rng
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff'], seed=my_rng)

    cost = cg.outputs[0]

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))
    


    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))


    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)    

    # Set extensions
    logger.info("Initializing extensions")
    # Track 2 trains on more data, so it gets half the epoch budget. The
    # extension list is the same whether or not early stopping is enabled.
    finish_after = config['finish_after']
    if 'track2' in config['saveto']:
        finish_after //= 2
    extensions = [
        FinishAfter(after_n_epochs=finish_after),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    
    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    #every_n_batches=1,
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=8))
                    #src_vocab_size=config['src_vocab_size']))
    
    # Add early stopping based on bleu
    if config['val_set'] is not None:
        logger.info("Building accuracy validator")
        extensions.append(
            AccuracyValidator(sampling_input, samples=samples, config=config,
                          model=search_model, data_stream=dev_stream,
                          after_training=True,
                          #after_epoch=True))
                          every_n_epochs=5))
    else:
        logger.info("No validation set given for this language")
    
    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))
       
    
    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()])
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )
    
    # Train!
    main_loop.run()
Example 19
def mainPredict(config, data_to_predict_stream, use_ensemble, lang=None, et_version=False, use_bokeh=False, the_track=None):
    # Create Theano variables
    assert the_track is not None
    
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, cost_type=config['error_fct'])
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))
    
    
    # Set extensions
    logger.info("Initializing (empty) extensions")
    extensions = [
    ]

    logger.info("Building sampling model")
    sampling_representation = encoder.apply(
        sampling_input, tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)

    _, samples = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[1]))  # generated[1] is next_outputs
    
    # Reload the model (as this is prediction, it is 100% necessary):
    if config['reload']:
        #extensions.append(LoadOnlyBestModel(config['saveto'])) # without early stopping use LoadOnlyModel here!
        extensions.append(LoadOnlyModel(config['saveto'])) # without early stopping use LoadOnlyModel here!
    else:
        raise Exception('No model available for prediction! (Check config[\'reload\'] variable)')

    
    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()])
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=search_model,
        algorithm=algorithm,
        #algorithm=None,
        data_stream=data_to_predict_stream,
        extensions=extensions
    )

    predictByHand(main_loop, decoder, data_to_predict_stream, use_ensemble, lang, et_version, config, the_track=the_track)
Example 20
def main(config, tr_stream, dev_stream):
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']

    # Construct model
    logger.info('Building RNN encoder-decoder')

    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'],
                                   config['src_dgru_nhids'],
                                   config['enc_nhids'],
                                   config['src_dgru_depth'],
                                   config['bidir_encoder_depth'])

    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['transition_depth'], config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx,
                      target_bos_idx)

    representation = encoder.apply(source_char_seq, source_sample_matrix,
                                   source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq,
                        target_sample_matrix, target_resample_matrix,
                        target_char_aux, target_char_mask, target_word_mask,
                        target_prev_char_seq, target_prev_char_aux)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    for layer_n in range(config['src_dgru_depth']):
        encoder.decimator.dgru.transitions[layer_n].weights_init = Orthogonal()
    for layer_n in range(config['bidir_encoder_depth']):
        encoder.children[
            1 + layer_n].prototype.recurrent.weights_init = Orthogonal()
    if config['trg_igru_depth'] == 1:
        decoder.interpolator.igru.weights_init = Orthogonal()
    else:
        for layer_n in range(config['trg_igru_depth']):
            decoder.interpolator.igru.transitions[
                layer_n].weights_init = Orthogonal()
    for layer_n in range(config['trg_dgru_depth']):
        decoder.interpolator.feedback_brick.dgru.transitions[
            layer_n].weights_init = Orthogonal()
    for layer_n in range(config['transition_depth']):
        decoder.transition.transitions[layer_n].weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)
    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()
                                ]))

    # Set extensions
    logger.info("Initializing extensions")
    # Extensions
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    train_monitor = CostCurve([cost, gradient_norm, step_norm],
                              config=config,
                              after_batch=True,
                              before_first_epoch=True,
                              prefix='tra')
    extensions = [
        train_monitor,
        Timing(),
        Printing(every_n_batches=config['print_freq']),
        FinishAfter(after_n_batches=config['finish_after']),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        generated = decoder.generate(representation, source_word_mask)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[config['transition_depth']])
            )  # generated[transition_depth] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model,
                    data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    transition_depth=config['transition_depth'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(source_char_seq,
                          source_sample_matrix,
                          source_char_aux,
                          source_word_mask,
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Example 21
def train(args, trial=11, no_valid=False):
    # Creating unique strings to save for experiments.
    data_valid = "data/"+args.data_name+"_trial_"+str(trial)+"_valid_size_"+str(args.train_size)+\
    "_transitions_"+str(args.transitions)
    data_test = data_valid.replace("_valid_size", "_test_size")
    # If we want validation set to match modData of test set
    if modDataValid == 1:
        data_valid = data_valid.replace("_trial_", "_" + modData + "_trial_")
        data_test = data_test.replace("_trial_", "_" + modData + "_trial_")

    # By default, it is m0
    data_train = "data/"+args.data_name+"_trial_"+str(trial)+"_train_size_"+str(args.train_size)+\
    "_transitions_"+str(args.transitions)

    subStr = "rnn_type_"+args.rnn_type + "_trial_"+str(trial) + "_hiddenSize_"+str(args.hidden_size)+\
    "_numLayers_"+str(args.num_layers)+ \
    "_dropout_"+str(args.dropout)+"_train_size_"+str(args.train_size) + "_transitions_"+str(args.transitions)+\
    "_novalid_"+str(args.no_valid)

    if modData == "m1":
        data_train = data_train.replace("_trial_", "_m1_trial_")
        subStr = subStr.replace("_trial_", "_m1_trial_")
    elif modData == "m3":
        data_train = data_train.replace("_trial_", "_m3_trial_")
        subStr = subStr.replace("_trial_", "_m3_trial_")

        data_valid = "data/"+args.data_name+"_m3_trial_"+str(trial)+"_valid_size_"+str(args.train_size)+\
        "_transitions_"+str(args.transitions)
        data_test = "data/"+args.data_name+"_m3_trial_"+str(trial)+"_test_size_"+str(args.train_size)+\
        "_transitions_"+str(args.transitions)

    print("on test: " + subStr)
    # Perform folder prefixing
    prefix_path = models_folder + args.data_name + "/" + subStr +"_tgrad_"+str(args.truncate_gradient)+\
    "_boost_"+bStr(args.boosting)

    load_path2 = prefix + load_path
    save_path2 = prefix + save_path
    last_path2 = prefix + last_path

    plots_output2 = plots_output + args.data_name + "/" + subStr +"_tgrad_"+str(args.truncate_gradient)+\
    "_boost_"+bStr(args.boosting)

    # obtain vocabulary size
    ix_to_char, char_to_ix, vocab_size = get_metadata(
        data_test.replace("_test", ""))
    print("vocab_size: " + str(vocab_size))

    # Get train, valid, test streams
    sharedDataTrain, train_stream = get_stream_inGPU(data_train,
                                                     sharedName='sharedData')
    train_streamCopy = copy.deepcopy(train_stream)
    sharedDataValid, dev_stream = get_stream_inGPU(data_valid,
                                                   sharedName='sharedData')
    valid_streamCopy = copy.deepcopy(dev_stream)
    sharedDataTest, test_stream = get_stream_inGPU(data_test,
                                                   sharedName='sharedData')
    test_streamCopy = copy.deepcopy(test_stream)

    # Create dummy sums
    sharedMRRSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedTOTSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedSUMVARs = {
        'sharedMRRSUM': sharedMRRSUM,
        'sharedTOTSUM': sharedTOTSUM
    }

    # Initialize batches
    batch_index_From = T.scalar('int_stream_From', dtype='int32')
    batch_index_To = T.scalar('int_stream_To', dtype='int32')

    # Index theano variables
    x = sharedDataTrain['x'][:, batch_index_From:batch_index_To]
    x.name = 'x'

    x_mask = sharedDataTrain['x_mask'][:, batch_index_From:batch_index_To]
    x_mask.name = 'x_mask'

    x_mask_o = sharedDataTrain['x_mask_o'][:, batch_index_From:batch_index_To]
    x_mask_o.name = 'x_mask_o'

    x_mask_o_mask = sharedDataTrain[
        'x_mask_o_mask'][:, batch_index_From:batch_index_To]
    x_mask_o_mask.name = 'x_mask_o_mask'

    y = sharedDataTrain['y'][:, batch_index_From:batch_index_To]
    y.name = 'y'

    y_mask = sharedDataTrain['y_mask'][:, batch_index_From:batch_index_To]
    y_mask.name = 'y_mask'

    y_mask_o = sharedDataTrain['y_mask_o'][:, batch_index_From:batch_index_To]
    y_mask_o.name = 'y_mask_o'

    y_mask_o_mask = sharedDataTrain[
        'y_mask_o_mask'][:, batch_index_From:batch_index_To]
    y_mask_o_mask.name = 'y_mask_o_mask'

    lens = sharedDataTrain['lens'][:, batch_index_From:batch_index_To]
    lens.name = 'lens'

    # Generate temp shared vars
    tempSharedData = {}
    tempSharedData[theano.config.floatX] = [
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX))
    ]

    tempSharedData['uint8'] = [
        shared(np.array([[0], [0]], dtype='uint8')),
        shared(np.array([[0], [0]], dtype='uint8')),
        shared(np.array([[0], [0]], dtype='uint8'))
    ]

    # Final mask is due to the generated mask and the input mask
    x_mask_final = x_mask * x_mask_o * x_mask_o_mask
    y_mask_final = y_mask * y_mask_o * y_mask_o_mask

    # Build neural network
    linear_output, cost = nn_fprop(
        x,
        x_mask_final,
        y,
        y_mask_final,
        lens,
        vocab_size,
        hidden_size,
        num_layers,
        rnn_type,
        boosting=boosting,
        scan_kwargs={'truncate_gradient': truncate_gradient})

    # Keep a constant in gpu memory
    constant1 = shared(np.float32(1.0))
    cost_int, ymasksum = RR_cost(y, linear_output, y_mask_final, constant1)

    # Validation calculations
    fRR = function(inputs=[
        theano.In(batch_index_From, borrow=True),
        theano.In(batch_index_To, borrow=True)
    ],
                   updates=[(sharedMRRSUM, sharedMRRSUM + cost_int),
                            (sharedTOTSUM, sharedTOTSUM + ymasksum)])

    # COST
    cg = ComputationGraph(cost)

    if dropout > 0:
        # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015)
        inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(
            cg.variables)
        cg = apply_dropout(cg, inputs, dropout)
        cost = cg.outputs[0]

    # Learning algorithm
    step_rules = [
        RMSProp(learning_rate=rmsPropLearnRate, decay_rate=decay_rate),
        StepClipping(step_clipping)
    ]
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule(step_rules))

    # Extensions

    # This is for tracking our best result
    trackbest = track_best('valid_MRR', save_path2, last_path2, num_epochs,
                           nepochs, maxIterations, epsilon, tempSharedData)

    if onlyPlots:
        prefixes = ["train_cross", "valid_cross", "test_cross"]
        gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
        step_norm = aggregation.mean(algorithm.total_step_norm)
        monitored_vars = [cost, gradient_norm, step_norm]
        #this is faster
        train_monitor = myTrainingDataMonitoring(
            variables=monitored_vars,
            prefix=prefixes[0],
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        #train_monitor = DataStreamMonitoringPlot(variables=[cost],
        #                    data_stream=train_streamCopy, prefix=prefixes[0], sharedDataTrain=sharedDataTrain, sharedDataActualTest=sharedDataTrain, after_batch=True, saveEveryXIteration = saveEveryXIteration)
        valid_monitor = DataStreamMonitoringPlot(
            variables=[cost],
            data_stream=valid_streamCopy,
            prefix=prefixes[1],
            sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataValid,
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        test_monitor = DataStreamMonitoringPlot(
            variables=[cost],
            data_stream=test_streamCopy,
            prefix=prefixes[2],
            sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataTest,
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        trackbest = [trackbest[0], trackbest[2], trackbest[3], trackbest[4]]
        plot = Plot('Live Plotting',
                    saveFolder=plots_output2,
                    channels=[
                        'train_cross_cost', 'valid_cross_cost',
                        'test_cross_cost'
                    ],
                    numProcesses=numProcesses,
                    saveEveryXIteration=saveEveryXIteration,
                    after_batch=True)
        extensions = [
            train_monitor,
            valid_monitor,
            test_monitor,
            plot,
            Printing(),
            ProgressBar(),
        ] + trackbest
    else:
        dev_monitor = myDataStreamMonitoring(after_epoch=True,
                                             before_epoch=False,
                                             data_stream=dev_stream,
                                             prefix="valid",
                                             fRR=fRR,
                                             sharedVars=sharedSUMVARs,
                                             sharedDataTrain=sharedDataTrain,
                                             sharedDataValid=sharedDataValid)
        extensions = [
            dev_monitor,
            Printing(),
            ProgressBar(),
        ] + trackbest

    if learning_rate_decay not in (0, 1):
        extensions.append(
            SharedVariableModifier(step_rules[0].learning_rate,
                                   lambda n, lr: np.cast[theano.config.floatX]
                                   (learning_rate_decay * lr),
                                   after_epoch=True,
                                   after_batch=False))

    print 'number of parameters in the model: ' + str(
        T.sum([p.size for p in cg.parameters]).eval())
    # Finally build the main loop and train the model
    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
Example 22
def main(config,
         tr_stream,
         dev_stream,
         source_vocab,
         target_vocab,
         use_bokeh=False):

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    initial_context = tensor.matrix('initial_context')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])

    # let user specify the target transition class name in config,
    # eval it and pass to decoder
    target_transition_name = config.get(
        'target_transition', 'GRUInitialStateWithInitialStateSumContext')
    target_transition = eval(target_transition_name)

    logger.info('Using target transition: {}'.format(target_transition_name))
    decoder = InitialContextDecoder(config['trg_vocab_size'],
                                    config['dec_embed'], config['dec_nhids'],
                                    config['enc_nhids'] * 2,
                                    config['context_dim'], target_transition)

    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask, initial_context)

    cost.name = 'decoder_cost'

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    # TODO: validate performance with/without regularization
    if config.get('l2_regularization', False) is True:
        l2_reg_alpha = config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)

        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

        # summing in the penalty creates a new, unnamed Theano variable, so
        # it must be re-named for the monitoring extensions to find it
        cost.name = 'decoder_cost_cost'

    cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions

    # TODO: add checking for existing model and loading
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Create the theano variables that we need for the sampling graph
    sampling_input = tensor.lmatrix('input')
    sampling_context = tensor.matrix('context_input')

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config.get('bleu_script',
                                                 None) is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))

        generated = decoder.generate(sampling_input, sampling_representation,
                                     sampling_context)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(
                model=search_model,
                data_stream=tr_stream,
                hook_samples=config['hook_samples'],
                every_n_batches=config['sampling_freq'],
                src_vocab=source_vocab,
                trg_vocab=target_vocab,
                src_vocab_size=config['src_vocab_size'],
            ))

    # Add early stopping based on bleu
    if config.get('bleu_script', None) is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input,
                          sampling_context,
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Add early stopping based on Meteor
    if config.get('meteor_directory', None) is not None:
        logger.info("Building meteor validator")
        extensions.append(
            MeteorValidator(sampling_input,
                            sampling_context,
                            samples=samples,
                            config=config,
                            model=search_model,
                            data_stream=dev_stream,
                            src_vocab=source_vocab,
                            trg_vocab=target_vocab,
                            normalize=config['normalized_bleu'],
                            every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[[
                     'decoder_cost', 'validation_set_bleu_score',
                     'validation_set_meteor_score'
                 ]],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # if there is dropout or random noise, we need to use the output of the modified graph
    if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0:
        algorithm = GradientDescent(cost=cg.outputs[0],
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))
    else:
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Example 23
def main(name, dataset, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim, oldmodel):

    image_size, data_train, data_valid, data_test = datasets.get_data(dataset)

    train_stream = Flatten(
        DataStream(data_train,
                   iteration_scheme=SequentialScheme(data_train.num_examples,
                                                     batch_size)))
    valid_stream = Flatten(
        DataStream(data_valid,
                   iteration_scheme=SequentialScheme(data_valid.num_examples,
                                                     batch_size)))
    test_stream = Flatten(
        DataStream(data_test,
                   iteration_scheme=SequentialScheme(data_test.num_examples,
                                                     batch_size)))

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = img_height * img_width

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    if attention != "":
        read_N, write_N = attention.split(',')

        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * read_N**2
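        # The factor of 2: DRAW's attentive reader returns an N x N glimpse
        # of both the input image and the error image at every iteration.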

        reader = AttentionReader(x_dim=x_dim,
                                 dec_dim=dec_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=read_N,
                                 **inits)
        writer = AttentionWriter(input_dim=dec_dim,
                                 output_dim=x_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=write_N,
                                 **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

        attention_tag = "full"

    #----------------------------------------------------------------------

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation. E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = time.strftime("%Y%m%d-%H%M%S") + "-" + name
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print("               dataset: %s" % dataset)
    print("          subdirectory: %s" % subdir)
    print("         learning rate: %g" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print("            batch size: %d" % batch_size)
    print("                epochs: %d" % epochs)
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc",
                      **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec",
                      **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(n_iter,
                     reader=reader,
                     encoder_mlp=encoder_mlp,
                     encoder_rnn=encoder_rnn,
                     sampler=q_sampler,
                     decoder_mlp=decoder_mlp,
                     decoder_rnn=decoder_rnn,
                     writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    #x_recons = 1. + x
    x_recons, kl_terms = draw.reconstruct(x)
    #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100)
    #x_recons = x_recons[-1,:,:]

    #samples = draw.sample(100)
    #x_recons = samples[-1, :, :]

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    #algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_epoch=True),
            #            DataStreamMonitoring(
            #                monitors,
            #                valid_stream,
            ##                updates=scan_updates,
            #                prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
                #                updates=scan_updates,
                prefix="test"),
            Checkpoint(name,
                       before_training=False,
                       after_epoch=True,
                       save_separately=['log', 'model']),
            #Checkpoint(image_size=image_size, save_subdir=subdir, path=pickle_file, before_training=False, after_epoch=True, save_separately=['log', 'model']),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
Example no. 24
0
def main(exp_config, source_vocab, target_vocab, dev_stream, use_bokeh=True):

    # def setup_model_and_stream(exp_config, source_vocab, target_vocab):
    train_encoder, train_decoder, theano_sampling_source_input, theano_sampling_context_input, generated, masked_stream = setup_model_and_stream(
        exp_config, source_vocab, target_vocab)
    cost = create_model(train_encoder, train_decoder,
                        exp_config.get('imt_smoothing_constant', 0.005))

    # Set up training model
    logger.info("Building model")
    train_model = Model(cost)

    # Set the parameters from a trained model (.npz file)
    logger.info("Loading parameters from model: {}".format(
        exp_config['saved_parameters']))
    # Note the brick delimiter='-' is here for legacy reasons because blocks changed the serialization API
    param_values = LoadNMT.load_parameter_values(
        exp_config['saved_parameters'],
        brick_delimiter=exp_config.get('brick_delimiter', None))
    LoadNMT.set_model_parameters(train_model, param_values)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    if exp_config.get('l2_regularization', False) is True:
        l2_reg_alpha = exp_config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)

        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

        # why do we need to rename the cost variable? Where did the original name come from?
        cost.name = 'decoder_cost_cost'

    cg = ComputationGraph(cost)

    # apply dropout for regularization
    # Note dropout variables are hard-coded here
    if exp_config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, exp_config['dropout'])

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(exp_config['saveto']):
        os.makedirs(exp_config['saveto'])
        # TODO: mv the actual config file once we switch to .yaml for min-risk
        shutil.copy(exp_config['config_file'], exp_config['saveto'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=exp_config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(exp_config['saveto'],
                      every_n_batches=exp_config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    # TODO: change the if statement here
    if exp_config['hook_samples'] >= 1 or exp_config['bleu_script'] is not None:
        logger.info("Building sampling model")
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[train_decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling -- TODO: sampling is broken for min-risk
    #if config['hook_samples'] >= 1:
    #    logger.info("Building sampler")
    #    extensions.append(
    #        Sampler(model=search_model, data_stream=tr_stream,
    #                hook_samples=config['hook_samples'],
    #                every_n_batches=config['sampling_freq'],
    #                src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    # TODO: use multimodal meteor and BLEU validator
    # TODO: add 'validator' key to IMT config
    if exp_config.get('bleu_script', None) is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(theano_sampling_source_input,
                          theano_sampling_context_input,
                          samples=samples,
                          config=exp_config,
                          model=search_model,
                          data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=exp_config['normalized_bleu'],
                          every_n_batches=exp_config['bleu_val_freq']))

    if exp_config.get('imt_f1_validation', False) is not False:
        logger.info("Building imt F1 validator")
        extensions.append(
            IMT_F1_Validator(theano_sampling_source_input,
                             theano_sampling_context_input,
                             samples=samples,
                             config=exp_config,
                             model=search_model,
                             data_stream=dev_stream,
                             src_vocab=source_vocab,
                             trg_vocab=target_vocab,
                             normalize=exp_config['normalized_bleu'],
                             every_n_batches=exp_config['bleu_val_freq']))

    # Add early stopping based on Meteor
    # if exp_config.get('meteor_directory', None) is not None:
    #     logger.info("Building meteor validator")
    #     extensions.append(
    #         MeteorValidator(theano_sampling_source_input, theano_sampling_context_input,
    #                         samples=samples,
    #                         config=config,
    #                         model=search_model, data_stream=dev_stream,
    #                         src_vocab=src_vocab,
    #                         trg_vocab=trg_vocab,
    #                         normalize=config['normalized_bleu'],
    #                         every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if exp_config['reload']:
        extensions.append(LoadNMT(exp_config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(exp_config['model_save_directory'],
                 channels=[[
                     'decoder_cost_cost', 'validation_set_imt_f1_score',
                     'validation_set_bleu_score', 'validation_set_meteor_score'
                 ]],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")

    # if there is l2_regularization, dropout or random noise, we need to use the output of the modified graph
    # WORKING: try to catch and fix nan
    if exp_config['dropout'] < 1.0:
        if exp_config.get('nan_guard', False):
            from theano.compile.nanguardmode import NanGuardMode
            algorithm = GradientDescent(cost=cg.outputs[0],
                                        parameters=cg.parameters,
                                        step_rule=CompositeRule([
                                            StepClipping(
                                                exp_config['step_clipping']),
                                            eval(exp_config['step_rule'])()
                                        ]),
                                        on_unused_sources='warn',
                                        theano_func_kwargs={
                                            'mode':
                                            NanGuardMode(nan_is_error=True,
                                                         inf_is_error=True)
                                        })
        else:
            algorithm = GradientDescent(cost=cg.outputs[0],
                                        parameters=cg.parameters,
                                        step_rule=CompositeRule([
                                            StepClipping(
                                                exp_config['step_clipping']),
                                            eval(exp_config['step_rule'])()
                                        ]),
                                        on_unused_sources='warn')
    else:
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(
                                            exp_config['step_clipping']),
                                        eval(exp_config['step_rule'])()
                                    ]),
                                    on_unused_sources='warn')

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=train_model,
                         algorithm=algorithm,
                         data_stream=masked_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Example no. 25
0
softmax = NDimensionalSoftmax(name='ndim_softmax')

activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)

cost = softmax.categorical_cross_entropy(y, activation_output,
                                         extra_ndim=1).mean()
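# extra_ndim=1 tells NDimensionalSoftmax to treat the leading axis as extra
# batch structure, i.e. the (time, batch, vocab) activations are flattened
# to (time * batch, vocab) before the softmax / cross-entropy is applied.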

from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Adam, RMSProp, StepClipping, CompositeRule

cg = ComputationGraph([cost])

step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95), StepClipping(1.0)]
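# Ordering matters inside CompositeRule: with StepClipping listed after
# RMSProp, it is the scaled RMSProp step, not the raw gradient, that gets
# clipped to norm 1.0.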

algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules),
                            on_unused_sources='ignore')

from blocks.extensions import Timing, FinishAfter, Printing, ProgressBar
from blocks.extensions.monitoring import TrainingDataMonitoring
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme
from blocks.main_loop import MainLoop
from blocks.extensions.saveload import Checkpoint

from blocks.model import Model
Example no. 26
0
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs,
          test_cost, experiment_path, initialization, init_width, weight_noise,
          z_prob, z_prob_states, z_prob_cells, drop_prob_igates,
          ogates_zoneout, batch_size, stoch_depth, share_mask, gaussian_drop,
          rnn_type, num_layers, norm_cost_coeff, penalty, testing, seq_len,
          decrease_lr_after_epoch, lr_decay, **kwargs):

    print('.. PTB experiment')
    print('.. arguments:', ' '.join(sys.argv))
    t0 = time.time()

    ###########################################
    #
    # LOAD DATA
    #
    ###########################################

    def onehot(x, numclasses=None):
        """ Convert integer encoding for class-labels (starting with 0 !)
            to one-hot encoding.
            The output is an array whose shape is the shape of the input array
            plus an extra dimension, containing the 'one-hot'-encoded labels.
        """
        if x.shape == ():
            x = x[None]
        if numclasses is None:
            numclasses = x.max() + 1
        result = numpy.zeros(list(x.shape) + [numclasses], dtype="int")
        z = numpy.zeros(x.shape, dtype="int")
        for c in range(numclasses):
            z *= 0
            z[numpy.where(x == c)] = 1
            result[..., c] += z
        return result.astype(theano.config.floatX)
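    # For integer-valued x this is equivalent to the usual numpy idiom,
    # e.g. numpy.eye(numclasses, dtype=theano.config.floatX)[x].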

    alphabetsize = 10000
    data = np.load('penntree_char_and_word.npz')
    trainset = data['train_words']
    validset = data['valid_words']
    testset = data['test_words']

    if testing:
        trainset = trainset[:3000]
        validset = validset[:3000]

    if share_mask:
        if not z_prob:
            raise ValueError('z_prob must be provided when using share_mask')
        if z_prob_cells or z_prob_states:
            raise ValueError(
                'z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)'
            )
        z_prob_cells = z_prob
        # we don't want to actually use these masks, so this is to debug
        z_prob_states = None
    else:
        if z_prob:
            raise ValueError('z_prob is only used with share_mask')
        z_prob_cells = z_prob_cells or '1'
        z_prob_states = z_prob_states or '1'


    #    rng = np.random.RandomState(seed)

    ###########################################
    #
    # MAKE STREAMS
    #
    ###########################################

    def prep_dataset(dataset):
        dataset = dataset[:(len(dataset) - (len(dataset) %
                                            (seq_len * batch_size)))]
        dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))
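        # Shape after trim/reshape/transpose: (num_batches, batch_size,
        # seq_len). Lane b of consecutive minibatches continues the same
        # stretch of corpus, which is why hidden states are carried over
        # between batches (see init_updates below).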

        stream = DataStream(
            IndexableDataset(indexables=OrderedDict([('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells,
                                   drop_prob_igates, layer_size, num_layers,
                                   False, stoch_depth, share_mask,
                                   gaussian_drop, alphabetsize)
        stream.sources = ('data', ) * 3 + stream.sources + (
            'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream, )

    train_stream, = prep_dataset(trainset)
    valid_stream, = prep_dataset(validset)
    test_stream, = prep_dataset(testset)

    ####################

    data = next(train_stream.get_epoch_iterator(as_dict=True))

    ####################

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print('.. building model')

    x = T.tensor3('data')
    y = x
    zoneouts_states = T.tensor3('zoneouts_states')
    zoneouts_cells = T.tensor3('zoneouts_cells')
    zoneouts_igates = T.tensor3('zoneouts_igates')

    x.tag.test_value = data['data']
    zoneouts_states.tag.test_value = data['zoneouts_states']
    zoneouts_cells.tag.test_value = data['zoneouts_cells']
    zoneouts_igates.tag.test_value = data['zoneouts_igates']

    if init_width and not initialization == 'uniform':
        raise ValueError('Width is only for uniform init, whassup?')

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=init_width)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    if rnn_type.lower() == 'lstm':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size * 4,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropLSTM(dim=layer_size,
                     weights_init=weights_init,
                     activation=Tanh(),
                     model_type=6,
                     name='rnn%d' % l,
                     ogates_zoneout=ogates_zoneout) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'gru':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size * 3,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropGRU(dim=layer_size,
                    weights_init=weights_init,
                    activation=Tanh(),
                    name='rnn%d' % l) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'srnn':  # FIXME!!! make ReLU
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropSimpleRecurrent(dim=layer_size,
                                weights_init=weights_init,
                                activation=Rectifier(),
                                name='rnn%d' % l) for l in range(num_layers)
        ]
    else:
        raise NotImplementedError

    hid_to_out = Linear(layer_size,
                        alphabetsize,
                        name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    for layer in in_to_hids:
        layer.initialize()
    for layer in recurrent_layers:
        layer.initialize()
    hid_to_out.initialize()

    layer_input = x  #in_to_hid.apply(x)

    init_updates = OrderedDict()
    for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)):
        rnn_embedding = in_to_hid.apply(layer_input)
        if rnn_type.lower() == 'lstm':
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            cells_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name, cells_init.name = "states_init", "cells_init"
            states, cells = layer.apply(
                rnn_embedding,
                zoneouts_states[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_cells[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_igates[:, :, l * layer_size:(l + 1) * layer_size],
                states_init, cells_init)
            init_updates.update([(states_init, states[-1]),
                                 (cells_init, cells[-1])])
        elif rnn_type.lower() in ['gru', 'srnn']:
            # untested!
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name = "states_init"
            states = layer.apply(rnn_embedding, zoneouts_states,
                                 zoneouts_igates, states_init)
            init_updates.update([(states_init, states[-1])])
        else:
            raise NotImplementedError
        layer_input = states

    y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1]))
    shape_ = y_hat_pre_softmax.shape
    y_hat = Softmax().apply(y_hat_pre_softmax.reshape((-1, alphabetsize)))
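    # Next-step prediction: prepend the carried-over initial state and drop
    # the final one, so the logits at time t are computed from the state at
    # time t - 1.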

    ####################

    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################

    cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)),
                                           y_hat).copy('cost')

    bpc = (cost / np.log(2.0)).copy(name='bpc')
    perp = T.exp(cost).copy(name='perp')

    cost_train = cost.copy(name='train_cost')
    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # NORM STABILIZER
    #
    ###########################################
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        return T.sqrt(
            T.maximum(T.sqr(x).sum(axis=axis),
                      numpy.finfo(x.dtype).tiny))
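    # The maximum with the dtype's tiny value keeps the sqrt argument
    # strictly positive, so the gradient cannot become NaN for all-zero
    # activations.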

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(
                T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
    elif penalty == 'hids':
        for l in range(num_layers):
            assert 'rnn%d_apply_states' % l in [
                o.name
                for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)
            ]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            for l in range(num_layers):
                if output.name == 'rnn%d_apply_states' % l:
                    norms = _magnitude(output)
                    norm_cost += T.mean(
                        T.sum((norms[1:] - norms[:-1])**2, axis=0) /
                        (seq_len - 1))
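    # Both branches implement the norm stabilizer penalty,
    # mean_b sum_t (||h_t|| - ||h_{t-1}||)^2 / (T - 1),
    # which is added to the training cost with weight norm_cost_coeff below.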

    norm_cost.name = 'norm_cost'
    #cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy(
        'cost_train')  #should this be cost_train.outputs[0]? no.

    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # WEIGHT NOISE
    #
    ###########################################

    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')

    model = Model(cost_train)

    learning_rate = float(learning_rate)
    clipping = StepClipping(threshold=np.cast[floatX](clipping))
    if algorithm == 'adam':
        adam = Adam(learning_rate=learning_rate)
        learning_rate = adam.learning_rate
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        learning_rate = rms_prop.learning_rate
        step_rule = CompositeRule([clipping, rms_prop])
    elif algorithm == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        learning_rate = sgd_momentum.learning_rate
        step_rule = CompositeRule([clipping, sgd_momentum])
    elif algorithm == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        learning_rate = sgd.learning_rate
        step_rule = CompositeRule([clipping, sgd])
    else:
        raise NotImplementedError
    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
    # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)})

    algorithm.add_updates(init_updates)

    def cond_number(x):
        # svd returns (u, s, v) in numpy order; the singular values are the
        # second output
        _, sing_vals, _ = T.nlinalg.svd(x, True, True)
        sing_mags = abs(sing_vals)
        return T.max(sing_mags) / T.min(sing_mags)

    def rms(x):
        return (x * x).mean().sqrt()

    whysplode_cond = []
    whysplode_rms = []
    for i, p in enumerate(init_updates):
        v = p.get_value()
        if v.ndim == 2:  # condition numbers only make sense for matrices
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str,
                                             p.get_value().shape)))))
        whysplode_rms.append(
            rms(p).copy('ini%d:%s_rms(%s)' %
                        (i, p.name, "x".join(map(str,
                                                 p.get_value().shape)))))
    for i, p in enumerate(cg_train.parameters):
        v = p.get_value()
        if v.ndim == 2:  # condition numbers only make sense for matrices
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str,
                                             p.get_value().shape)))))
        whysplode_rms.append(
            rms(p).copy('ini%d:%s_rms(%s)' %
                        (i, p.name, "x".join(map(str,
                                                 p.get_value().shape)))))

    observed_vars = [
        cost_train, cost, bpc, perp, learning_rate,
        aggregation.mean(
            algorithm.total_gradient_norm).copy("gradient_norm_mean")
    ]  # + whysplode_rms

    parameters = model.get_parameter_dict()
    for name, param in parameters.items():
        observed_vars.append(param.norm(2).copy(name=name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm"))

    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

    dev_inits = [p.clone() for p in init_updates]
    cg_dev = ComputationGraph([cost, bpc, perp] +
                              list(init_updates.values())).replace(
                                  dict(zip(init_updates.keys(), dev_inits)))
    dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3]
    dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:]))
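    # The clone/replace above gives validation its own copies of the
    # initial-state variables, so monitoring on the dev stream cannot
    # clobber the hidden states carried across training batches.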

    dev_monitor = DataStreamMonitoring(variables=[dev_cost, dev_bpc, dev_perp],
                                       data_stream=valid_stream,
                                       prefix="dev",
                                       updates=dev_init_updates)

    # no one does this
    if 'load_path' in kwargs:
        with open(kwargs['load_path'], 'rb') as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print('Found: ' + param_name)
                    param.set_value(loaded[param_name])
                else:
                    print('Not found: ' + param_name)

    extensions = []
    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])
    if test_cost:
        test_inits = [p.clone() for p in init_updates]
        cg_test = ComputationGraph([cost, bpc, perp] +
                                   list(init_updates.values())).replace(
                                       dict(zip(init_updates.keys(), test_inits)))
        test_cost, test_bpc, test_perp = cg_test.outputs[:3]
        test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:]))

        test_monitor = DataStreamMonitoring(
            variables=[test_cost, test_bpc, test_perp],
            data_stream=test_stream,
            prefix="test",
            updates=test_init_updates)
        extensions.extend([test_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(
        SaveParams('dev_cost', model, experiment_path, every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    class RollsExtension(TrainingExtension):
        """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """
        def __init__(self, shvars):
            self.shvars = shvars

        def before_epoch(self):
            for v in self.shvars:
                v.set_value(np.roll(v.get_value(), 1, 0))

    extensions.append(
        RollsExtension(list(init_updates.keys()) +
                       list(dev_init_updates.keys()) +
                       (list(test_init_updates.keys()) if test_cost else [])))

    class LearningRateSchedule(TrainingExtension):
        """ Lets you set a number to divide learning rate by each epoch + when to start doing that """
        def __init__(self):
            self.epoch_number = 0

        def after_epoch(self):
            self.epoch_number += 1
            if self.epoch_number > decrease_lr_after_epoch:
                learning_rate.set_value(learning_rate.get_value() / lr_decay)

    if bool(lr_decay) != bool(decrease_lr_after_epoch):
        raise ValueError(
            'Need to define both lr_decay and decrease_lr_after_epoch')
    if lr_decay and decrease_lr_after_epoch:
        extensions.append(LearningRateSchedule())

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)

    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
    np.random.seed(args.seed)
    blocks.config.config.default_seed = args.seed

    if args.continue_from:
        from blocks.serialization import load
        main_loop = load(args.continue_from)
        main_loop.run()
        sys.exit(0)

    graphs, extensions, updates = construct_graphs(args, nclasses,
                                                   sequence_length)
    #graph, extension, update = construct_graphs(args, nclasses, sequence_length)

    ### optimization algorithm definition
    step_rule = CompositeRule([
        StepClipping(1.),
        #Momentum(learning_rate=args.learning_rate, momentum=0.9),
        RMSProp(learning_rate=args.learning_rate, decay_rate=0.5),
    ])

    algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                                parameters=graphs["training"].parameters,
                                step_rule=step_rule)
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]

    # step monitor (after epoch to limit the log size)
    step_channels = []
    step_channels.extend([
        algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
Example no. 28
0
def main(config,
         tr_stream,
         dev_stream,
         use_bokeh=False,
         slim_iteration_state=False,
         switch_controller=None,
         reset_epoch=False):
    """This method largely corresponds to the ``main`` method in the
    original Blocks implementation in blocks-examples and most of the
    code is copied from there. The following modifications have been made:
    
    - Support fixing word embedding during training
    - Dropout fix https://github.com/mila-udem/blocks-examples/issues/46
    - If necessary, add the exp3s extension
    
    Args:
        config (dict): NMT config
        tr_stream (DataStream): Training data stream
        dev_stream (DataStream): Validation data stream
        use_bokeh (bool): Whether to use bokeh for plotting
        slim_iteration_state (bool): Whether to store the full iteration
                                     state or only the epoch iterator
                                     without data stream state
        switch_controller (SourceSwitchController): Controlling strategy
                                                    if monolingual data
                                                    is used as well
        reset_epoch (bool): Set epoch_started in main loop status to
                            false. Sometimes required if you change
                            training parameters such as 
                            mono_data_integration
    """

    nmt_model = NMTModel(config)
    nmt_model.set_up()

    # Set extensions
    logging.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([nmt_model.cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      slim_iteration_state,
                      every_n_batches=config['save_freq'])
    ]

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logging.info("Building bleu validator")
        extensions.append(
            BleuValidator(nmt_model.sampling_input,
                          samples=nmt_model.samples,
                          config=config,
                          model=nmt_model.search_model,
                          data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          store_full_main_loop=config['store_full_main_loop'],
                          every_n_batches=config['bleu_val_freq']))

    if switch_controller:
        switch_controller.beam_search = BeamSearch(samples=nmt_model.samples)
        switch_controller.src_sentence = nmt_model.sampling_input
        extensions.append(switch_controller)

    # Reload model if necessary
    if config['reload']:
        extensions.append(
            LoadNMT(config['saveto'], slim_iteration_state, reset_epoch))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('Decoding cost',
                 channels=[['decoder_cost_cost']],
                 after_batch=True))

    # Add an extension for correct handling of SIGTERM and SIGINT
    extensions.append(AlwaysEpochInterrupt(every_n_batches=1))

    # Set up training algorithm
    logging.info("Initializing training algorithm")
    # https://github.com/mila-udem/blocks-examples/issues/46
    train_params = nmt_model.cg.parameters
    # fs439: fix embeddings?
    if config['fix_embeddings']:
        train_params = []
        embedding_params = [
            'softmax1', 'softmax0', 'maxout_bias', 'embeddings', 'lookuptable',
            'transform_feedback'
        ]
        for p in nmt_model.cg.parameters:
            add_param = True
            for ann in p.tag.annotations:
                if ann.name in embedding_params:
                    logging.info("Do not train %s due to annotation %s" %
                                 (p, ann))
                    add_param = False
                    break
            if add_param:
                train_params.append(p)
    # Change cost=cost to cg.outputs[0] ?
    algorithm = GradientDescent(cost=nmt_model.cg.outputs[0]
                                if config['dropout'] < 1.0 else nmt_model.cost,
                                parameters=train_params,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()
                                ]))

    # Initialize main loop
    logging.info("Initializing main loop")
    main_loop = MainLoop(model=nmt_model.training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Reset epoch
    if reset_epoch:
        main_loop.status['epoch_started'] = False

    # Train!
    main_loop.run()
Example no. 29
0
def test_step_clipping_broadcastable():
    verify_broadcastable_handling(StepClipping(0.4))
Example no. 30
0
def train_language_model(new_training_job, config, save_path, params,
                         fast_start, fuel_server, seed):
    c = config
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    data, lm, retrieval = initialize_data_and_model(config)

    # full main loop can be saved...
    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    # or only state (log + params) which can be useful not to pickle embeddings
    state_path = os.path.join(save_path, 'training_state.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')
    best_tar_path = os.path.join(save_path, "best_model.tar")

    words = tensor.ltensor3('words')
    words_mask = tensor.matrix('words_mask')
    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', batch_size=4,
                            max_length=5).get_epoch_iterator())
        words.tag.test_value = test_value_data[0]
        words_mask.tag.test_value = test_value_data[1]

    costs, updates = lm.apply(words, words_mask)
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            cg.set_parameter_values(load_parameters(src))

    length = rename(words.shape[1], 'length')
    perplexity, = VariableFilter(name='perplexity')(cg)
    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    monitored_vars = [length, cost] + perplexities
    if c['dict_path']:
        num_definitions, = VariableFilter(name='num_definitions')(cg)
        monitored_vars.extend([num_definitions])

    parameters = cg.get_parameter_dict()
    trained_parameters = parameters.values()
    saved_parameters = parameters.values()
    if c['embedding_path']:
        logger.debug("Exclude word embeddings from the trained parameters")
        trained_parameters = [
            p for p in trained_parameters
            if not p == lm.get_def_embeddings_params()
        ]
        saved_parameters = [
            p for p in saved_parameters
            if not p == lm.get_def_embeddings_params()
        ]

    if c['cache_size'] != 0:
        logger.debug("Enable fake recursivity for looking up embeddings")
        trained_parameters = [
            p for p in trained_parameters if not p == lm.get_cache_params()
        ]

    logger.info("Cost parameters" + "\n" + pprint.pformat([
        " ".join(
            (key, str(parameters[key].get_value().shape),
             'trained' if parameters[key] in trained_parameters else 'frozen'))
        for key in sorted(parameters.keys())
    ],
                                                          width=120))

    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum']))
    algorithm = GradientDescent(cost=cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    if c['cache_size'] != 0:
        algorithm.add_updates(updates)

    train_monitored_vars = list(monitored_vars)
    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)

    word_emb_RMS, = VariableFilter(name='word_emb_RMS')(cg)
    main_rnn_in_RMS, = VariableFilter(name='main_rnn_in_RMS')(cg)
    train_monitored_vars.extend([word_emb_RMS, main_rnn_in_RMS])

    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    # We use a completely random seed on purpose. With Fuel server
    # it's currently not possible to restore the state of the training
    # stream. That's why it's probably better to just have it stateless.
    stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None
    training_stream = data.get_stream('train',
                                      batch_size=c['batch_size'],
                                      max_length=c['max_length'],
                                      seed=stream_seed)
    valid_stream = data.get_stream('valid',
                                   batch_size=c['batch_size_valid'],
                                   max_length=c['max_length'],
                                   seed=stream_seed)
    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)

    validation = DataStreamMonitoring(monitored_vars,
                                      valid_stream,
                                      prefix="valid").set_conditions(
                                          before_first_epoch=not fast_start,
                                          on_resumption=True,
                                          every_n_batches=c['mon_freq_valid'])
    track_the_best = TrackTheBest(validation.record_name(perplexity),
                                  choose_best=min).set_conditions(
                                      on_resumption=True,
                                      after_epoch=True,
                                      every_n_batches=c['mon_freq_valid'])

    # don't save the entire main loop, to avoid pickling everything
    if c['fast_checkpoint']:
        load = (LoadNoUnpickling(state_path,
                                 load_iteration_state=True,
                                 load_log=True).set_conditions(
                                     before_training=not new_training_job))
        cp_args = {
            'save_main_loop': False,
            'save_separately': ['log', 'iteration_state'],
            'parameters': saved_parameters
        }

        checkpoint = Checkpoint(state_path,
                                before_training=not fast_start,
                                every_n_batches=c['save_freq_batches'],
                                after_training=not fast_start,
                                **cp_args)

        if c['checkpoint_every_n_batches']:
            intermediate_cp = IntermediateCheckpoint(
                state_path,
                every_n_batches=c['checkpoint_every_n_batches'],
                after_training=False,
                **cp_args)
    else:
        load = (Load(main_loop_path, load_iteration_state=True,
                     load_log=True).set_conditions(
                         before_training=not new_training_job))
        cp_args = {
            'save_separately': ['iteration_state'],
            'parameters': saved_parameters
        }

        checkpoint = Checkpoint(main_loop_path,
                                before_training=not fast_start,
                                every_n_batches=c['save_freq_batches'],
                                after_training=not fast_start,
                                **cp_args)

        if c['checkpoint_every_n_batches']:
            intermediate_cp = IntermediateCheckpoint(
                main_loop_path,
                every_n_batches=c['checkpoint_every_n_batches'],
                after_training=False,
                **cp_args)

    checkpoint = checkpoint.add_condition(
        ['after_batch', 'after_epoch'],
        OnLogRecord(track_the_best.notification_name), (best_tar_path, ))
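    # i.e. whenever TrackTheBest logs that validation perplexity improved,
    # the current parameters are also written out to best_tar_path.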

    extensions = [
        load,
        StartFuelServer(original_training_stream,
                        stream_path,
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq_train'])
    ]

    if retrieval:
        extensions.append(
            RetrievalPrintStats(retrieval=retrieval,
                                every_n_batches=c['mon_freq_train'],
                                before_training=not fast_start))

    extensions.extend([
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']),
        validation, track_the_best, checkpoint
    ])
    if c['checkpoint_every_n_batches']:
        extensions.append(intermediate_cp)
    extensions.extend([
        DumpTensorflowSummaries(save_path,
                                every_n_batches=c['mon_freq_train'],
                                after_training=True),
        Printing(on_resumption=True, every_n_batches=c['mon_freq_train']),
        FinishIfNoImprovementAfter(track_the_best.notification_name,
                                   iterations=50 * c['mon_freq_valid'],
                                   every_n_batches=c['mon_freq_valid']),
        FinishAfter(after_n_batches=c['n_batches'])
    ])

    logger.info("monitored variables during training:" + "\n" +
                pprint.pformat(train_monitored_vars, width=120))
    logger.info("monitored variables during valid:" + "\n" +
                pprint.pformat(monitored_vars, width=120))

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
Example no. 31
0
                                    x_mask,
                                    attended=mlp_context.apply(context))
cost = cost_matrix.sum() / x_mask.sum()
cost.name = "sequence_log_likelihood"

cg = ComputationGraph(cost)
model = Model(cost)

#################
# Algorithm
#################

algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=CompositeRule(
                                [StepClipping(10.0),
                                 Adam(lr)]))

train_monitor = TrainingDataMonitoring(variables=[cost],
                                       after_epoch=True,
                                       prefix="train")

extensions = [
    train_monitor,
    TrackTheBest('train_sequence_log_likelihood'),
    Printing(after_epoch=True)
]

main_loop = MainLoop(model=model,
                     data_stream=data_stream,
                     algorithm=algorithm,
                     extensions=extensions)

main_loop.run()
Example no. 32
0
def main():
    nclasses = 27

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--length", type=int, default=180)
    parser.add_argument("--num-epochs", type=int, default=100)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=1e-3)
    parser.add_argument("--epsilon", type=float, default=1e-5)
    parser.add_argument("--num-hidden", type=int, default=1000)
    parser.add_argument("--baseline", action="store_true")
    parser.add_argument("--initialization",
                        choices="identity glorot orthogonal uniform".split(),
                        default="identity")
    parser.add_argument("--initial-gamma", type=float, default=1e-1)
    parser.add_argument("--initial-beta", type=float, default=0)
    parser.add_argument("--cluster", action="store_true")
    parser.add_argument("--activation",
                        choices=list(activations.keys()),
                        default="tanh")
    parser.add_argument("--optimizer",
                        choices="sgdmomentum adam rmsprop",
                        default="rmsprop")
    parser.add_argument("--continue-from")
    parser.add_argument("--evaluate")
    parser.add_argument("--dump-hiddens")
    args = parser.parse_args()

    np.random.seed(args.seed)
    blocks.config.config.default_seed = args.seed

    if args.continue_from:
        from blocks.serialization import load
        main_loop = load(args.continue_from)
        main_loop.run()
        sys.exit(0)

    graphs, extensions, updates = construct_graphs(args, nclasses)

    ### optimization algorithm definition
    if args.optimizer == "adam":
        optimizer = Adam(learning_rate=args.learning_rate)
    elif args.optimizer == "rmsprop":
        optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9)
    elif args.optimizer == "sgdmomentum":
        optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99)
    step_rule = CompositeRule([
        StepClipping(1.),
        optimizer,
    ])
    algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                                parameters=graphs["training"].parameters,
                                step_rule=step_rule)
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]

    # step monitor
    step_channels = []
    step_channels.extend([
        algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
        for name, param in model.get_parameter_dict().items()
    ])
    step_channels.append(
        algorithm.total_step_norm.copy(name="total_step_norm"))
    step_channels.append(
        algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
    step_channels.extend(graphs["training"].outputs)
    logger.warning("constructing training data monitor")
    extensions.append(
        TrainingDataMonitoring(step_channels,
                               prefix="iteration",
                               after_batch=True))

    # parameter monitor
    extensions.append(
        DataStreamMonitoring([
            param.norm(2).copy(name="parameter.norm:%s" % name)
            for name, param in model.get_parameter_dict().items()
        ],
                             data_stream=None,
                             after_epoch=True))

    validation_interval = 500
    # performance monitor
    for situation in "training inference".split():
        if situation == "inference" and not args.evaluate:
            # save time when we don't need the inference graph
            continue

        for which_set in "train valid test".split():
            logger.warning("constructing %s %s monitor" %
                           (which_set, situation))
            channels = list(graphs[situation].outputs)
            extensions.append(
                DataStreamMonitoring(channels,
                                     prefix="%s_%s" % (which_set, situation),
                                     every_n_batches=validation_interval,
                                     data_stream=get_stream(
                                         which_set=which_set,
                                         batch_size=args.batch_size,
                                         num_examples=10000,
                                         length=args.length)))

    extensions.extend([
        TrackTheBest("valid_training_error_rate",
                     "best_valid_training_error_rate"),
        DumpBest("best_valid_training_error_rate", "best.zip"),
        FinishAfter(after_n_epochs=args.num_epochs),
        #FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50),
        Checkpoint("checkpoint.zip",
                   on_interrupt=False,
                   every_n_epochs=1,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True)
    ])

    if not args.cluster:
        extensions.append(ProgressBar())

    extensions.extend([
        Timing(),
        Printing(every_n_batches=validation_interval),
        PrintingTo("log"),
    ])
    main_loop = MainLoop(data_stream=get_stream(which_set="train",
                                                batch_size=args.batch_size,
                                                length=args.length,
                                                augment=True),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    if args.dump_hiddens:
        dump_hiddens(args, main_loop)
        return

    if args.evaluate:
        evaluate(args, main_loop)
        return

    main_loop.run()