Beispiel #1
0
def main(save_to, num_epochs):
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST(("train", ))
    mnist_test = MNIST(("test", ))

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate],
                             Flatten(DataStream.default_stream(
                                 mnist_test,
                                 iteration_scheme=SequentialScheme(
                                     mnist_test.num_examples, 500)),
                                     which_sources=('features', )),
                             prefix="test"),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        Printing()
    ]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(
            Plot('MNIST example',
                 channels=[[
                     'test_final_cost',
                     'test_misclassificationrate_apply_error_rate'
                 ], ['train_total_gradient_norm']]))

    main_loop = MainLoop(algorithm,
                         Flatten(DataStream.default_stream(
                             mnist_train,
                             iteration_scheme=SequentialScheme(
                                 mnist_train.num_examples, 50)),
                                 which_sources=('features', )),
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
Beispiel #2
0
 def train_base_model(self, train_data, test_data, input_dim):
     x = T.matrix('features')
     y = T.matrix('targets')
     mlp, cost, mis_cost = self.create_base_model(x, y, input_dim)
     cg = ComputationGraph([cost])
     inputs = VariableFilter(roles=[INPUT])(cg.variables)
     cg = apply_dropout(cg, inputs, 0.2)
     algorithm = GradientDescent(cost=cost,
                                 parameters=cg.parameters,
                                 step_rule=Adam(learning_rate=0.001))
     data_stream = train_data
     data_stream_test = test_data
     monitor = DataStreamMonitoring(variables=[mis_cost],
                                    data_stream=data_stream_test,
                                    prefix="test")
     plot_ext = Plot('F1-measure',
                     channels=[['test_MisclassificationRate']],
                     after_batch=True)
     main_loop = MainLoop(data_stream=data_stream,
                          algorithm=algorithm,
                          extensions=[
                              monitor,
                              FinishAfter(after_n_epochs=50),
                              Printing(), plot_ext
                          ])
     main_loop.run()
     return mlp
Beispiel #3
0
def run(model_name):

    running_on_laptop = socket.gethostname() == 'yop'

    X = tensor.tensor4('image_features', dtype='float32')
    T = tensor.matrix('targets', dtype='float32')

    image_border_size = 100

    if running_on_laptop:
        host_plot = 'http://*****:*****@ %s' %
             (model_name, datetime.datetime.now(), socket.gethostname()),
             channels=[['loss', 'valid_loss_test'], ['valid_error']],
             after_epoch=True,
             server_url=host_plot),
        Printing(),
        Checkpoint('train2')
    ]

    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
Beispiel #4
0
def test_plot():
    class Writer(SimpleExtension):
        def do(self, *args, **kwargs):
            self.main_loop.log.current_row['channel'] = (
                self.main_loop.status['iterations_done']**2)

    main_loop = MockMainLoop(extensions=[
        Writer(after_batch=True),
        Plot('test', [['channel']]).set_conditions(after_batch=True),
        FinishAfter(after_n_batches=11)
    ])
    main_loop.run()
Beispiel #5
0
    def train(self, data_file, output_data_file, n_epochs=0):

        training_data = dataset.T_H5PYDataset(data_file, which_sets=('train',))
        test_data = dataset.T_H5PYDataset(data_file, which_sets=('test',))

        session = Session(root_url='http://localhost:5006')

        if self.MainLoop is None:
            step_rules = [RMSProp(learning_rate=0.2, decay_rate=0.95), StepClipping(1)]

            algorithm = GradientDescent(cost=self.Cost,
                                        parameters=self.ComputationGraph.parameters,
                                        step_rule=CompositeRule(step_rules),
                                        on_unused_sources='ignore')

            train_stream = DataStream.default_stream(
                training_data, iteration_scheme=SequentialScheme(
                    training_data.num_examples, batch_size=100))

            test_stream = DataStream.default_stream(
                test_data, iteration_scheme=SequentialScheme(
                    test_data.num_examples, batch_size=100))

            self.MainLoop = MainLoop(
                model=Model(self.Cost),
                data_stream=train_stream,
                algorithm=algorithm,
                extensions=[
                    FinishAfter(after_n_epochs=n_epochs),
                    Printing(),
                    Checkpoint(output_data_file, every_n_epochs=50),
                    TrainingDataMonitoring([self.Cost], after_batch=True, prefix='train'),
                    DataStreamMonitoring([self.Cost], after_batch=True, data_stream=test_stream, prefix='test'),
                    Plot(output_data_file, channels=[['train_cost', 'test_cost']])
                ])

        self.MainLoop.run()
def main():
    feature_maps = [20, 50]
    mlp_hiddens = [50]
    conv_sizes = [5, 5]
    pool_sizes = [3, 3]
    save_to = "DvC.pkl"
    batch_size = 500
    image_size = (32, 32)
    output_size = 2
    learningRate = 0.1
    num_epochs = 10
    num_batches = None
    host_plot = 'http://*****:*****@ %s' %
             ('CNN ', datetime.datetime.now(), socket.gethostname()),
             channels=[['valid_cost', 'valid_error_rate'],
                       ['train_total_gradient_norm']],
             after_epoch=True,
             server_url=host_plot))

    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         stream_data_train,
                         model=model,
                         extensions=extensions)

    main_loop.run()
def main():
    mlp_hiddens = [1000]
    filter_sizes = [(9, 9), (5, 5), (5, 5)]
    feature_maps = [80, 50, 20]
    pooling_sizes = [(3, 3), (2, 2), (2, 2)]
    save_to = "DvC.pkl"
    image_size = (128, 128)
    output_size = 2
    learningRate = 0.1
    num_epochs = 300
    num_batches = None
    if socket.gethostname() == 'tim-X550JX':
        host_plot = 'http://*****:*****@ %s' %
             ('CNN ', datetime.datetime.now(), socket.gethostname()),
             channels=[['train_error_rate', 'valid_error_rate'],
                       ['train_total_gradient_norm']],
             after_epoch=True,
             server_url=host_plot))

    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         stream_data_train,
                         model=model,
                         extensions=extensions)

    main_loop.run()
Beispiel #8
0
def run(model_name, port_train, port_valid):

    running_on_laptop = socket.gethostname() == 'yop'

    X = tensor.tensor4('image_features', dtype='float32')
    T = tensor.matrix('targets', dtype='float32')

    image_border_size = (100, 100)

    if running_on_laptop:
        host_plot = 'http://*****:*****@ %s' %
             (model_name, datetime.datetime.now(), socket.gethostname()),
             channels=[['loss'], ['error', 'valid_error']],
             after_epoch=True,
             server_url=host_plot),
        Printing(),
        Checkpoint('/tmp/train_bn2')
    ]

    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)
    main_loop.run()
def main(config, tr_stream, dev_stream, use_bokeh=False):

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)
    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        cg = apply_noise(cg, enc_params + dec_params,
                         config['weight_noise_ff'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model,
                    data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input,
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('Cs-En', channels=[['decoder_cost_cost']], after_batch=True))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()
                                ]))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Beispiel #10
0
def main(config,
         tr_stream,
         dev_stream,
         source_vocab,
         target_vocab,
         use_bokeh=False):

    # add the tags from this function to the IMT datastream
    # prediction function signature
    # [target_suffix, source_mask, source, target_prefix_mask, target_prefix, target_suffix_mask]
    prediction_function = get_prediction_function(exp_config=config)

    tr_stream = Mapping(
        tr_stream,
        CallPredictionFunctionOnStream(prediction_function,
                                       [1, 0, 5, 4, 7, 6]),
        #tr_stream = Mapping(tr_stream, CallFunctionOnStream(prediction_function, [6, 1, 0, 5, 4, 7]),
        add_sources=('predictions', 'orig_readouts', 'prediction_tags'))

    # now datastream has 11 things
    import ipdb
    ipdb.set_trace()

    # WORKING: call prediction function twice to get new readouts on predictions instead of reference suffs
    # the only difference is the index of the suffix
    tr_stream = Mapping(tr_stream,
                        CallPredictionFunctionOnStream(prediction_function,
                                                       [1, 0, 5, 4, 7, 8]),
                        add_sources=('dummy_predictions', 'readouts',
                                     'dummy_prediction_tags'))

    import ipdb
    ipdb.set_trace()

    # Create the prediction confidence model
    # the first draft of this model uses the readout output (before the post-merge step) as the per-timestep state vector

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')

    # Note that the _names_ are changed from normal NMT
    # for IMT training, we use only the suffix as the reference
    target_sentence = tensor.lmatrix('target_suffix')
    target_sentence_mask = tensor.matrix('target_suffix_mask')

    target_prefix = tensor.lmatrix('target_prefix')
    target_prefix_mask = tensor.matrix('target_prefix_mask')

    # symbolic variable which tags each timestep as GOOD/BAD
    # Note: later this might be tags for a hypothesis i.e. from TER(p), right now the timesteps are actually determined by the reference
    # By zipping the confidence model output with the reference, we get the model's confidence that this reference word
    # will be predicted correctly
    prediction_tags = tensor.matrix('prediction_tags')
    readouts = tensor.tensor3('readouts')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])

    decoder = NMTPrefixDecoder(config['trg_vocab_size'],
                               config['dec_embed'],
                               config['dec_nhids'],
                               config['enc_nhids'] * 2,
                               loss_function='cross_entropy')

    # rename to match baseline NMT systems
    decoder.name = 'decoder'

    cost = decoder.confidence_cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask,
        target_prefix, target_prefix_mask, readouts, prediction_tags)

    # WORKING: add l2 regularization

    logger.info('Creating computational graph')
    # working: implement cost for confidence model
    cg = ComputationGraph(cost)

    # INITIALIZATION
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    #cost_cg = ComputationGraph(cost)
    if config['l2_reg']:
        l2_reg_alpha = config['l2_reg_alpha']
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())
        # do we need to name the cost variable again?
        cost.name = 'cost'
        cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables if x.name in set([
                'confidence_model1_apply_output',
                'confidence_model2_apply_output',
                'confidence_model3_apply_output'
            ])
        ]
        # if x.name == 'maxout_apply_output']
        # if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # WORKING: implement confidence -- remove all params except output model
    cost_model = Model(cost)

    model_params = cost_model.get_parameter_dict()
    trainable_params = cg.parameters
    import ipdb
    ipdb.set_trace()
    print('trainable params')
    #params_to_remove = [model_params[k] for k in model_params.keys() if 'confidence' not in k]
    #for p in params_to_remove:
    #    trainable_params.remove(p)

    # target_embeddings = model.get_parameter_dict()['/target_recurrent_lm_with_alignments/target_embeddings.W']
    # trainable_params.remove(source_embeddings)
    # trainable_params.remove(target_embeddings)
    # END WORKING: implement confidence -- remove all params except output model

    # TODO: fixed dropout mask for recurrent params?
    # Print shapes
    # shapes = [param.get_value().shape for param in cg.parameters]
    # logger.info("Parameter shapes: ")
    # for shape, count in Counter(shapes).most_common():
    #     logger.info('    {:15}: {}'.format(shape, count))
    # logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    # enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
    #                            Selector(decoder).get_parameters())
    # logger.info("Parameter names: ")
    # for name, value in enc_dec_param_dict.items():
    #     logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    # logger.info("Total number of parameters: {}"
    #             .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        # TrainingDataMonitoring(trainable_params, after_batch=True),
        # Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # WORKING: confidence prediction
    #monitor everything that could possibly be relevant

    # Set up the sampling graph for validation during training
    # Theano variables for the sampling graph
    # Note this also loads the model parameters
    sampling_vars = load_params_and_get_beam_search(config,
                                                    encoder=encoder,
                                                    decoder=decoder)
    beam_search, search_model, samples, sampling_input, sampling_prefix = sampling_vars

    #if config['hook_samples'] >= 1:
    #    logger.info("Building sampler")
    #    extensions.append(
    #        Sampler(model=search_model, data_stream=tr_stream,
    #                hook_samples=config['hook_samples'],
    #                every_n_batches=config['sampling_freq'],
    #                src_vocab=source_vocab,
    #                trg_vocab=target_vocab,
    #                src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    #if config['bleu_script'] is not None:
    #    logger.info("Building bleu validator")
    #    extensions.append(
    #        BleuValidator(sampling_input, sampling_prefix, samples=samples, config=config,
    #                      model=search_model, data_stream=dev_stream,
    #                      src_vocab=source_vocab,
    #                      trg_vocab=target_vocab,
    #                      normalize=config['normalized_bleu'],
    #                      every_n_batches=config['bleu_val_freq']))

    # TODO: add first-word accuracy validation
    # TODO: add IMT meteor early stopping
    #if config.get('imt_f1_validation', None) is not None:
    #    logger.info("Building imt F1 validator")
    #    extensions.append(
    #        IMT_F1_Validator(sampling_input, sampling_prefix,
    #                         samples=samples,
    #                         config=config,
    #                         model=search_model, data_stream=dev_stream,
    #                         src_vocab=source_vocab,
    #                         trg_vocab=target_vocab,
    #                         normalize=config['normalized_bleu'],
    #                         every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # TODO: hacking here: get the predictions of the confidence model using the `readouts` source of the data_stream

    # Note that the parameters of this model must be pretrained, otherwise this doesn't make sense
    # confidence_predictions = decoder.get_confidence(readouts)
    # confidence_prediction_model = Model(confidence_predictions)
    #
    # confidence_param_values = LoadNMT.load_parameter_values(config['confidence_saved_parameters'], brick_delimiter=None)
    # LoadNMT.set_model_parameters(confidence_prediction_model, confidence_param_values)
    #
    # confidence_prediction_func = confidence_prediction_model.get_theano_function()

    # import ipdb; ipdb.set_trace()

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            # Plot(config['model_save_directory'], channels=[['decoder_confidence_cost_cost']],
            Plot(config['model_save_directory'],
                 channels=[['cost']],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")

    # WORKING: implement confidence model
    # if there is dropout or random noise, we need to use the output of the modified graph
    algorithm = GradientDescent(
        cost=cg.outputs[0],
        parameters=trainable_params,
        step_rule=CompositeRule([
            StepClipping(config['step_clipping']),
            eval(config['step_rule'])()
        ]),
        # eval(config['step_rule'])(), RemoveNotFinite()]),
        # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]),
        on_unused_sources='warn')
    #if config['dropout'] < 1.0:
    #   algorithm = GradientDescent(
    #       cost=cg.outputs[0], parameters=trainable_params,
    #       step_rule=CompositeRule([StepClipping(config['step_clipping']),
    #                         eval(config['step_rule'])(), RemoveNotFinite()]),
    #       # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]),
    #       on_unused_sources='warn'
    #   )
    #else:
    #   algorithm = GradientDescent(
    #       cost=cost, parameters=cg.parameters,
    #       step_rule=CompositeRule([StepClipping(config['step_clipping']),
    #                                eval(config['step_rule'])()]),
    #       on_unused_sources='warn'
    #   )
    # END WORKING: implement confidence model

    import ipdb
    ipdb.set_trace()

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # WORKING: debugging confidence
    # get theano function from model
    # WORKING: implement word-level confidence cost
    #   @application(inputs=['representation', 'source_sentence_mask',
    #                                'target_sentence_mask', 'target_sentence', 'target_prefix_mask', 'target_prefix'],
    #                                                 outputs=['cost'])
    #       def confidence_cost(self, representation, source_sentence_mask,
    #                            target_sentence, target_sentence_mask, target_prefix, target_prefix_mask):

    logger.info('Creating theano variables')

    # WORKING: 26.9.16 -- get confidence outputs directly from (source, prefix, suffix) inputs
    # This is equivalent to forced alignment --> confidence scores
    # Note: but this section should probably be in "evaluate" mode, not here in "train"

    # source_sentence = tensor.lmatrix('source')
    # source_sentence_mask = tensor.matrix('source_mask')

    # Note that the _names_ are changed from normal NMT
    # for IMT training, we use only the suffix as the reference
    #target_sentence = tensor.lmatrix('target_suffix')
    #target_sentence_mask = tensor.matrix('target_suffix_mask')
    # TODO: change names back to *_suffix, there is currently a theano function name error
    # TODO: in the GradientDescent Algorithm

    #target_prefix = tensor.lmatrix('target_prefix')
    #target_prefix_mask = tensor.matrix('target_prefix_mask')

    # confidence_output = decoder.confidence_cost(
    #     encoder.apply(source_sentence, source_sentence_mask),
    #     source_sentence_mask, target_sentence, target_sentence_mask,
    #     target_prefix, target_prefix_mask)

    # confidence_model = Model(confidence_output)

    # t_cost_func = confidence_model.get_theano_function()
    # inputs
    # [source_mask, source, target_prefix_mask, target_prefix, target_suffix_mask, target_suffix]

    #import ipdb;ipdb.set_trace()

    # get the right args from the datastream
    # TODO: just print source, prefix, suffix, prediction, correct to new files -- this makes sure everything is aligned
    # OUTPUT_DIR = '/media/1tb_drive/imt_models/word_prediction_accuracy_experiments/en-de/exp_1'
    # for the_file in os.listdir(OUTPUT_DIR):
    #     file_path = os.path.join(OUTPUT_DIR, the_file)
    #     try:
    #         if os.path.isfile(file_path):
    #             os.unlink(file_path)
    #     except Exception as e:
    #         print(e)
    #
    # def write_file_truncate_mask(filename, data, mask, mode='a'):
    #     ''' data is list of list '''
    #
    #     assert len(data) == len(mask)
    #     with codecs.open(filename, mode, encoding='utf8') as out:
    #         for l, m in zip(data, mask):
    #             output = u' '.join(l[:int(m.sum())]) + u'\n'
    #             out.write(output)
    #     logger.info('Wrote file: {}'.format(filename))
    #
    #
    # target_ivocab = {k:v.decode('utf8') for v,k in target_vocab.items()}
    # source_ivocab = {k:v.decode('utf8') for v,k in source_vocab.items()}
    # import ipdb; ipdb.set_trace()
    # tag_ivocab = {1: 'True', 0: 'False'}
    #
    # test_iter = tr_stream.get_epoch_iterator()
    # it = 0
    # for t_source, t_source_mask, t_target, t_target_mask, t_target_prefix, t_target_prefix_mask, t_target_suffix, t_target_suffix_mask in test_iter:
    #     if it <= 1000:
    #         it += 1
    #         t_cost = t_cost_func(t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask, t_target_suffix)
    #         readouts = t_cost[0]
    #         preds = readouts.argmax(axis=2)
    #         correct = preds.T == t_target_suffix
    #
    #
    #         source_output = os.path.join(OUTPUT_DIR,'sources.en')
    #         prefix_output = os.path.join(OUTPUT_DIR,'prefixes.de')
    #         suffix_output = os.path.join(OUTPUT_DIR,'suffixes.de')
    #         prediction_output = os.path.join(OUTPUT_DIR,'predictions.de')
    #         correct_output = os.path.join(OUTPUT_DIR,'prefix_word_prediction_acc.out')
    #
    #         source_text = [[source_ivocab[w] for w in s] for s in t_source]
    #         prefix_text = [[target_ivocab[w] for w in s] for s in t_target_prefix]
    #         suffix_text = [[target_ivocab[w] for w in s] for s in t_target_suffix]
    #         pred_text = [[target_ivocab[w] for w in s] for s in preds.T]
    #         correct_text = [[tag_ivocab[w] for w in s] for s in correct]
    #
    #
    #         for triple in zip([source_output, prefix_output, suffix_output, prediction_output, correct_output],
    #                           [source_text, prefix_text, suffix_text, pred_text, correct_text],
    #                           [t_source_mask, t_target_prefix_mask, t_target_suffix_mask, t_target_suffix_mask, t_target_suffix_mask]):
    #             write_file_truncate_mask(*triple)
    #     else:
    #         break
    #
    # import ipdb; ipdb.set_trace()

    #t_cost = t_cost_func(t_source, t_target_prefix)
    #t_cost = t_cost_func(t_target_suffix, t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask)
    #t_cost = t_cost_func(t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask, t_target_suffix)

    #    return confidence_cost, flat_y, confidence_logits, readouts

    #predictions = t_cost[0].argmax(axis=2)

    # TODO: next step -- print gradients and weights during training find out where nan is coming from
    # TODO: look at the gradient of this function with respect to parameters? -- see here: http://deeplearning.net/software/theano/tutorial/gradients.html

    # TODO: function which adds right/wrong tags for model predictions to the datastream. In this case we can learn a simple linear model as a baseline
    # TODO: print predictions for each batch for each timestep to file -- _dont shuffle_ so that we get the right order

    # import ipdb;ipdb.set_trace()

    # from blocks reverse_words example
    # observables = [
    #     cost, min_energy, max_energy, mean_activation,
    #     batch_size, max_length, cost_per_character,
    #     algorithm.total_step_norm, algorithm.total_gradient_norm]
    # for name, parameter in trainable_params.items():
    #     observables.append(parameter.norm(2).copy(name + "_norm"))
    #     observables.append(algorithm.gradients[parameter].norm(2).copy(
    #         name + "_grad_norm"))

    for i, (k, v) in enumerate(algorithm.updates):
        v.name = k.name + '_{}'.format(i)

    aux_vars = [v for v in cg.auxiliary_variables[-3:]]
    # import ipdb; ipdb.set_trace()

    extensions.extend([
        TrainingDataMonitoring([cost], after_batch=True),
        # TrainingDataMonitoring([v for k,v in algorithm.updates[:2]], after_batch=True),
        # TrainingDataMonitoring(aux_vars, after_batch=True),
        # TrainingDataMonitoring(trainable_params, after_batch=True),
        Printing(after_batch=True)
    ])

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)
    import ipdb
    ipdb.set_trace()

    # Train!
    main_loop.run()
Beispiel #11
0
def main(config,
         tr_stream,
         dev_stream,
         source_vocab,
         target_vocab,
         use_bokeh=False):

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    initial_context = tensor.matrix('initial_context')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])

    # let user specify the target transition class name in config,
    # eval it and pass to decoder
    target_transition_name = config.get(
        'target_transition', 'GRUInitialStateWithInitialStateSumContext')
    target_transition = eval(target_transition_name)

    logger.info('Using target transition: {}'.format(target_transition_name))
    decoder = InitialContextDecoder(config['trg_vocab_size'],
                                    config['dec_embed'], config['dec_nhids'],
                                    config['enc_nhids'] * 2,
                                    config['context_dim'], target_transition)

    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask, initial_context)

    cost.name = 'decoder_cost'

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    # TODO: validate performance with/without regularization
    if config.get('l2_regularization', False) is True:
        l2_reg_alpha = config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)

        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

        # why do we need to name the cost variable? Where did the original name come from?
        cost.name = 'decoder_cost_cost'

    cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions

    # TODO: add checking for existing model and loading
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Create the theano variables that we need for the sampling graph
    sampling_input = tensor.lmatrix('input')
    sampling_context = tensor.matrix('context_input')

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config.get('bleu_script',
                                                 None) is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))

        generated = decoder.generate(sampling_input, sampling_representation,
                                     sampling_context)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(
                model=search_model,
                data_stream=tr_stream,
                hook_samples=config['hook_samples'],
                every_n_batches=config['sampling_freq'],
                src_vocab=source_vocab,
                trg_vocab=target_vocab,
                src_vocab_size=config['src_vocab_size'],
            ))

    # Add early stopping based on bleu
    if config.get('bleu_script', None) is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input,
                          sampling_context,
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Add early stopping based on Meteor
    if config.get('meteor_directory', None) is not None:
        logger.info("Building meteor validator")
        extensions.append(
            MeteorValidator(sampling_input,
                            sampling_context,
                            samples=samples,
                            config=config,
                            model=search_model,
                            data_stream=dev_stream,
                            src_vocab=source_vocab,
                            trg_vocab=target_vocab,
                            normalize=config['normalized_bleu'],
                            every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[[
                     'decoder_cost', 'validation_set_bleu_score',
                     'validation_set_meteor_score'
                 ]],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # if there is dropout or random noise, we need to use the output of the modified graph
    if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0:
        algorithm = GradientDescent(cost=cg.outputs[0],
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))
    else:
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Beispiel #12
0
                                             batch_size=128))

test_set = H5PYDataset('mushrooms.hdf5', which_sets=('test', ))
test_stream = DataStream.default_stream(test_set,
                                        iteration_scheme=SequentialScheme(
                                            test_set.num_examples,
                                            batch_size=128))

main = MainLoop(model=Model(cost),
                data_stream=train_stream,
                algorithm=algorithm,
                extensions=[
                    FinishAfter(after_n_epochs=10),
                    Printing(),
                    TrainingDataMonitoring([cost, error_rate],
                                           after_batch=True,
                                           prefix='train'),
                    DataStreamMonitoring([cost, error_rate],
                                         after_batch=True,
                                         data_stream=test_stream,
                                         prefix='test'),
                    Plot('Train',
                         channels=[['train_cost', 'test_cost'],
                                   ['train_error_rate', 'test_error_rate']])
                ])

main.run()

hinton(W1.get_value())
hinton(W2.get_value())
Beispiel #13
0
import socket
import datetime
import time

host_plot = 'http://tfjgeorge.com:5006'
cost.name = 'cost'
valid_cost.name = 'valid_cost'

extensions = [
    Timing(),
    TrainingDataMonitoring([cost], after_epoch=True, prefix='train'),
    DataStreamMonitoring(variables=[valid_cost], data_stream=valid_stream),
    Plot('%s %s' % (
        socket.gethostname(),
        datetime.datetime.now(),
    ),
         channels=[['train_cost', 'valid_cost']],
         after_epoch=True,
         server_url=host_plot),
    TrackTheBest('valid_cost'),
    Checkpoint('model', save_separately=["model", "log"]),
    FinishIfNoImprovementAfter('valid_cost_best_so_far', epochs=5),
    #FinishAfter(after_n_epochs=100),
    Printing()
]

from blocks.main_loop import MainLoop

main_loop = MainLoop(model=model,
                     data_stream=train_stream,
                     algorithm=algorithm,
Beispiel #14
0
def main(exp_config, source_vocab, target_vocab, dev_stream, use_bokeh=True):

    # def setup_model_and_stream(exp_config, source_vocab, target_vocab):
    # def setup_model_and_stream(exp_config, source_vocab, target_vocab):
    train_encoder, train_decoder, theano_sampling_source_input, theano_sampling_context_input, generated, masked_stream = setup_model_and_stream(
        exp_config, source_vocab, target_vocab)
    cost = create_model(train_encoder, train_decoder,
                        exp_config.get('imt_smoothing_constant', 0.005))

    # Set up training model
    logger.info("Building model")
    train_model = Model(cost)

    # Set the parameters from a trained models (.npz file)
    logger.info("Loading parameters from model: {}".format(
        exp_config['saved_parameters']))
    # Note the brick delimeter='-' is here for legacy reasons because blocks changed the serialization API
    param_values = LoadNMT.load_parameter_values(
        exp_config['saved_parameters'],
        brick_delimiter=exp_config.get('brick_delimiter', None))
    LoadNMT.set_model_parameters(train_model, param_values)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    if exp_config.get('l2_regularization', False) is True:
        l2_reg_alpha = exp_config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)

        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

        # why do we need to rename the cost variable? Where did the original name come from?
        cost.name = 'decoder_cost_cost'

    cg = ComputationGraph(cost)

    # apply dropout for regularization
    # Note dropout variables are hard-coded here
    if exp_config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, exp_config['dropout'])

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(exp_config['saveto']):
        os.makedirs(exp_config['saveto'])
        # TODO: mv the actual config file once we switch to .yaml for min-risk
        shutil.copy(exp_config['config_file'], exp_config['saveto'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=exp_config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(exp_config['saveto'],
                      every_n_batches=exp_config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    # TODO: change the if statement here
    if exp_config['hook_samples'] >= 1 or exp_config['bleu_script'] is not None:
        logger.info("Building sampling model")
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[train_decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling -- TODO: sampling is broken for min-risk
    #if config['hook_samples'] >= 1:
    #    logger.info("Building sampler")
    #    extensions.append(
    #        Sampler(model=search_model, data_stream=tr_stream,
    #                hook_samples=config['hook_samples'],
    #                every_n_batches=config['sampling_freq'],
    #                src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    # TODO: use multimodal meteor and BLEU validator
    # TODO: add 'validator' key to IMT config
    # Add early stopping based on bleu
    if exp_config.get('bleu_script', None) is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(theano_sampling_source_input,
                          theano_sampling_context_input,
                          samples=samples,
                          config=exp_config,
                          model=search_model,
                          data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=exp_config['normalized_bleu'],
                          every_n_batches=exp_config['bleu_val_freq']))

    if exp_config.get('imt_f1_validation', False) is not False:
        logger.info("Building imt F1 validator")
        extensions.append(
            IMT_F1_Validator(theano_sampling_source_input,
                             theano_sampling_context_input,
                             samples=samples,
                             config=exp_config,
                             model=search_model,
                             data_stream=dev_stream,
                             src_vocab=source_vocab,
                             trg_vocab=target_vocab,
                             normalize=exp_config['normalized_bleu'],
                             every_n_batches=exp_config['bleu_val_freq']))

    # Add early stopping based on Meteor
    # if exp_config.get('meteor_directory', None) is not None:
    #     logger.info("Building meteor validator")
    #     extensions.append(
    #         MeteorValidator(theano_sampling_source_input, theano_sampling_context_input,
    #                         samples=samples,
    #                         config=config,
    #                         model=search_model, data_stream=dev_stream,
    #                         src_vocab=src_vocab,
    #                         trg_vocab=trg_vocab,
    #                         normalize=config['normalized_bleu'],
    #                         every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if exp_config['reload']:
        extensions.append(LoadNMT(exp_config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(exp_config['model_save_directory'],
                 channels=[[
                     'decoder_cost_cost', 'validation_set_imt_f1_score',
                     'validation_set_bleu_score', 'validation_set_meteor_score'
                 ]],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")

    # if there is l2_regularization, dropout or random noise, we need to use the output of the modified graph
    # WORKING: try to catch and fix nan
    if exp_config['dropout'] < 1.0:
        if exp_config.get('nan_guard', False):
            from theano.compile.nanguardmode import NanGuardMode
            algorithm = GradientDescent(cost=cg.outputs[0],
                                        parameters=cg.parameters,
                                        step_rule=CompositeRule([
                                            StepClipping(
                                                exp_config['step_clipping']),
                                            eval(exp_config['step_rule'])()
                                        ]),
                                        on_unused_sources='warn',
                                        theano_func_kwargs={
                                            'mode':
                                            NanGuardMode(nan_is_error=True,
                                                         inf_is_error=True)
                                        })
        else:
            algorithm = GradientDescent(cost=cg.outputs[0],
                                        parameters=cg.parameters,
                                        step_rule=CompositeRule([
                                            StepClipping(
                                                exp_config['step_clipping']),
                                            eval(exp_config['step_rule'])()
                                        ]),
                                        on_unused_sources='warn')
    else:
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(
                                            exp_config['step_clipping']),
                                        eval(exp_config['step_rule'])()
                                    ]),
                                    on_unused_sources='warn')

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=train_model,
                         algorithm=algorithm,
                         data_stream=masked_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Beispiel #15
0
def main(model, cost, config, tr_stream, dev_stream, use_bokeh=False):

    # Set the parameters from a trained models (.npz file)
    logger.info("Loading parameters from model: {}".format(
        exp_config['saved_parameters']))
    # Note the brick delimeter='-' is here for legacy reasons because blocks changed the serialization API
    param_values = LoadNMT.load_parameter_values(
        exp_config['saved_parameters'],
        brick_delimiter=exp_config.get('brick_delimiter', None))
    LoadNMT.set_model_parameters(model, param_values)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    if config.get('l2_regularization', False) is True:
        l2_reg_alpha = config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)

        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

        # why do we need to rename the cost variable? Where did the original name come from?
        cost.name = 'decoder_cost_cost'

    cg = ComputationGraph(cost)

    # apply dropout for regularization
    # Note dropout variables are hard-coded here
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        # TODO: mv the actual config file once we switch to .yaml for min-risk
        # shutil.copy(config['config_file'], config['saveto'])
        # shutil.copy(config['config_file'], config['saveto'])

        # TODO: this breaks when we directly reference a class in the config obj instead of using reflection
        with codecs.open(os.path.join(config['saveto'], 'config.yaml'),
                         'w',
                         encoding='utf8') as yaml_out:
            yaml_out.write(yaml.dump(config))

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    # TODO: change the if statement here
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = train_encoder.apply(
            theano_sampling_source_input,
            tensor.ones(theano_sampling_source_input.shape))
        # TODO: the generated output actually contains several more values, ipdb to see what they are
        generated = train_decoder.generate(theano_sampling_source_input,
                                           sampling_representation,
                                           theano_sampling_context_input)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[train_decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling -- TODO: sampling is broken for min-risk
    #if config['hook_samples'] >= 1:
    #    logger.info("Building sampler")
    #    extensions.append(
    #        Sampler(model=search_model, data_stream=tr_stream,
    #                hook_samples=config['hook_samples'],
    #                every_n_batches=config['sampling_freq'],
    #                src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    # TODO: use multimodal meteor and BLEU validator
    # Add early stopping based on bleu
    if config.get('bleu_script', None) is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(theano_sampling_source_input,
                          theano_sampling_context_input,
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          src_vocab=src_vocab,
                          trg_vocab=trg_vocab,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Add early stopping based on Meteor
    if config.get('meteor_directory', None) is not None:
        logger.info("Building meteor validator")
        extensions.append(
            MeteorValidator(theano_sampling_source_input,
                            theano_sampling_context_input,
                            samples=samples,
                            config=config,
                            model=search_model,
                            data_stream=dev_stream,
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab,
                            normalize=config['normalized_bleu'],
                            every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[[
                     'decoder_cost_cost', 'validation_set_bleu_score',
                     'validation_set_meteor_score'
                 ]],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")

    # if there is l2_regularization, dropout or random noise, we need to use the output of the modified graph
    if config['dropout'] < 1.0:
        algorithm = GradientDescent(cost=cg.outputs[0],
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]),
                                    on_unused_sources='warn')
    else:
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]),
                                    on_unused_sources='warn')

    #algorithm = GradientDescent(
    #    cost=cost, parameters=cg.parameters,
    #    step_rule=CompositeRule([StepClipping(config['step_clipping']),
    #                             eval(config['step_rule'])()],
    #                           ),
    #    on_unused_sources='warn'
    #)

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Beispiel #16
0
def main(save_to,
         num_epochs,
         regularization=0.001,
         subset=None,
         num_batches=None,
         batch_size=None,
         histogram=None,
         resume=False):
    output_size = 10
    convnet = create_all_conv_net()

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(),
                                                 probs).copy(name='cost'))
    test_components = (ComponentwiseCrossEntropy().apply(
        y.flatten(), probs).copy(name='components'))
    test_error_rate = (MisclassificationRate().apply(
        y.flatten(), probs).copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(),
                                              probs).copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate, test_components])

    # Apply dropout to all layer outputs except final softmax
    dropout_vars = VariableFilter(
        roles=[OUTPUT],
        bricks=[Convolutional],
        theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_8_apply_output$")(drop_cg.variables)
    # train_cg = apply_dropout(drop_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(drop_cg, [x], 0.2)
    train_cg = drop_cg
    # train_cg = test_cg

    train_cost, train_error_rate, train_components = train_cg.outputs

    # Apply regularization to the cost
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    l2_norm = sum([(W**2).sum() for W in weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = regularization * l2_norm
    l2_regularization.name = 'l2_regularization'
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + regularization * l2_norm
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train", ))
    #cifar10_train_stream = RandomPadCropFlip(
    #    NormalizeBatchLevels(DataStream.default_stream(
    #        cifar10_train, iteration_scheme=ShuffledScheme(
    #            cifar10_train.num_examples, batch_size)),
    #    which_sources=('features',)),
    #    (32, 32), pad=5, which_sources=('features',))
    cifar10_train_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_train,
        iteration_scheme=ShuffledScheme(cifar10_train.num_examples,
                                        batch_size)),
                                                which_sources=('features', ))

    test_batch_size = 1000
    cifar10_test = CIFAR10(("test", ))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test,
        iteration_scheme=ShuffledScheme(cifar10_test.num_examples,
                                        test_batch_size)),
                                               which_sources=('features', ))

    momentum = Momentum(0.002, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])
    # step_rule = CompositeRule([StepClipping(100), momentum])
    step_rule = momentum

    # Train with simple SGD
    algorithm = GradientDescent(cost=train_cost,
                                parameters=train_cg.parameters,
                                step_rule=step_rule)

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
        EpochSchedule(momentum.learning_rate, [(1, 0.005), (3, 0.01),
                                               (5, 0.02), (200, 0.002),
                                               (250, 0.0002), (300, 0.00002)]),
        DataStreamMonitoring([test_cost, test_error_rate, test_confusion],
                             cifar10_test_stream,
                             prefix="test"),
        TrainingDataMonitoring([
            train_cost, train_error_rate, train_cost_without_regularization,
            l2_regularization, momentum.learning_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               every_n_batches=10),
        # after_epoch=True),
        Plot('Training performance for ' + save_to,
             channels=[
                 [
                     'train_cost_with_regularization',
                     'train_cost_without_regularization',
                     'train_l2_regularization'
                 ],
                 ['train_error_rate'],
                 ['train_total_gradient_norm'],
             ],
             every_n_batches=10),
        # after_batch=True),
        Plot('Test performance for ' + save_to,
             channels=[[
                 'train_error_rate',
                 'test_error_rate',
             ]],
             after_epoch=True),
        Checkpoint(save_to),
        ProgressBar(),
        Printing()
    ]

    if histogram:
        attribution = AttributionExtension(components=train_components,
                                           parameters=cg.parameters,
                                           components_size=output_size,
                                           after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(save_to, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(algorithm,
                         cifar10_train_stream,
                         model=model,
                         extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
Beispiel #17
0
def main(config,
         tr_stream,
         dev_stream,
         source_vocab,
         target_vocab,
         use_bokeh=False):

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')

    # Note that the _names_ are changed from normal NMT
    # for IMT training, we use only the suffix as the reference
    target_sentence = tensor.lmatrix('target_suffix')
    target_sentence_mask = tensor.matrix('target_suffix_mask')
    # TODO: change names back to *_suffix, there is currently a theano function name error
    # TODO: in the GradientDescent Algorithm

    target_prefix = tensor.lmatrix('target_prefix')
    target_prefix_mask = tensor.matrix('target_prefix_mask')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])

    decoder = NMTPrefixDecoder(config['trg_vocab_size'],
                               config['dec_embed'],
                               config['dec_nhids'],
                               config['enc_nhids'] * 2,
                               loss_function='cross_entropy')

    # rename to match baseline NMT systems
    decoder.name = 'decoder'

    # TODO: change the name of `target_sentence` to `target_suffix` for more clarity
    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask, target_prefix,
                        target_prefix_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # INITIALIZATION
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    trainable_params = cg.parameters

    # target_embeddings = model.get_parameter_dict()['/target_recurrent_lm_with_alignments/target_embeddings.W']
    # trainable_params.remove(source_embeddings)
    # trainable_params.remove(target_embeddings)

    # TODO: fixed dropout mask for recurrent params?
    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        # TrainingDataMonitoring(trainable_params, after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Set up the sampling graph for validation during training
    # Theano variables for the sampling graph
    sampling_vars = load_params_and_get_beam_search(config,
                                                    encoder=encoder,
                                                    decoder=decoder)
    beam_search, search_model, samples, sampling_input, sampling_prefix = sampling_vars

    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model,
                    data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab=source_vocab,
                    trg_vocab=target_vocab,
                    src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input,
                          sampling_prefix,
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # TODO: add first-word accuracy validation
    # TODO: add IMT meteor early stopping
    if config.get('imt_f1_validation', None) is not None:
        logger.info("Building imt F1 validator")
        extensions.append(
            IMT_F1_Validator(sampling_input,
                             sampling_prefix,
                             samples=samples,
                             config=config,
                             model=search_model,
                             data_stream=dev_stream,
                             src_vocab=source_vocab,
                             trg_vocab=target_vocab,
                             normalize=config['normalized_bleu'],
                             every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[['decoder_cost_cost'],
                           [
                               'validation_set_bleu_score',
                               'validation_set_imt_f1_score'
                           ]],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")

    # WORKING: implement confidence model
    # if there is dropout or random noise, we need to use the output of the modified graph
    if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0:
        algorithm = GradientDescent(
            cost=cg.outputs[0],
            parameters=trainable_params,
            step_rule=CompositeRule([
                StepClipping(config['step_clipping']),
                eval(config['step_rule'])()
            ]),
            # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]),
            on_unused_sources='warn')
    else:
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]),
                                    on_unused_sources='warn')
    # END WORKING: implement confidence model

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # for i, (k,v) in enumerate(algorithm.updates):
    #     v.name = k.name + '_{}'.format(i)
    #
    # aux_vars = [v for v in cg.auxiliary_variables[-3:]]
    # import ipdb; ipdb.set_trace()

    extensions.extend([
        TrainingDataMonitoring([cost], after_batch=True),
        # TrainingDataMonitoring([v for k,v in algorithm.updates[:2]], after_batch=True),
        # TrainingDataMonitoring(aux_vars, after_batch=True),
        TrainingDataMonitoring(trainable_params, after_batch=True),
        Printing(after_batch=True)
    ])

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Beispiel #18
0
import time

host_plot = 'http://tfjgeorge.com:5006'
cost_dropout.name = 'cost'

extensions = [
    Timing(every_n_batches=50),
    TrainingDataMonitoring([cost_dropout], prefix='train'),
    DataStreamMonitoring(variables=[cost_dropout],
                         data_stream=valid_stream,
                         prefix="valid",
                         every_n_batches=50),
    Plot('%s %s' % (
        socket.gethostname(),
        datetime.datetime.now(),
    ),
         channels=[['train_cost', 'valid_cost']],
         every_n_batches=50,
         server_url=host_plot),
    TrackTheBest('valid_cost'),
    Checkpoint('train', save_separately=["model", "log"]),
    FinishIfNoImprovementAfter('valid_cost_best_so_far', epochs=5),
    #FinishAfter(every_n_epochs=100),
    Printing(every_n_batches=50)
]

# MAIN LOOP
from blocks.main_loop import MainLoop

main_loop = MainLoop(model=model,
                     data_stream=train_stream,
Beispiel #19
0
def build_and_run(save_to,modelconfig,experimentconfig):
    """ part of this is adapted from lasagne tutorial""" 

    n, num_filters, image_size, num_blockstack = modelconfig['depth'], modelconfig['num_filters'], modelconfig['image_size'], modelconfig['num_blockstack']
    
    print("Amount of bottlenecks: %d" % n)

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('image_features')
    #target_value = T.ivector('targets')
    target_var = T.lmatrix('targets')
    target_vec = T.extra_ops.to_one_hot(target_var[:,0],2)
    #target_var = T.matrix('targets')
    # Create residual net model
    print("Building model...")
    network = build_cnn(input_var, image_size, n, num_blockstack, num_filters)
    get_info(network)
    prediction = lasagne.utils.as_theano_expression(lasagne.layers.get_output(network))
    test_prediction = lasagne.utils.as_theano_expression(lasagne.layers.get_output(network,deterministic=True))

    # Loss function -> The objective to minimize 
    print("Instanciation of loss function...")
 
    #loss = CategoricalCrossEntropy().apply(target_var.flatten(), prediction)
    #test_loss = CategoricalCrossEntropy().apply(target_var.flatten(), test_prediction)
 #   loss = lasagne.objectives.categorical_crossentropy(prediction, target_var.flatten()).mean()
  #  test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var.flatten()).mean()
    loss = lasagne.objectives.squared_error(prediction,target_vec).mean()
    test_loss = lasagne.objectives.squared_error(test_prediction,target_vec).mean()
  #  loss = tensor.nnet.binary_crossentropy(prediction, target_var).mean()
  #  test_loss = tensor.nnet.binary_crossentropy(test_prediction, target_var).mean()
    test_loss.name = "loss"

#    loss.name = 'x-ent_error'
#    loss.name = 'sqr_error'
    layers = lasagne.layers.get_all_layers(network)

    #l1 and l2 regularization
    #pondlayers = {x:0.000025 for i,x in enumerate(layers)}
    #l1_penality = lasagne.regularization.regularize_layer_params_weighted(pondlayers, lasagne.regularization.l2)
    #l2_penality = lasagne.regularization.regularize_layer_params(layers[len(layers)/4:], lasagne.regularization.l1) * 25e-6
    #reg_penalty = l1_penality + l2_penality
    #reg_penalty.name = 'reg_penalty'
    #loss = loss + reg_penalty
    loss.name = 'reg_loss'
    error_rate = MisclassificationRate().apply(target_var.flatten(), test_prediction).copy(
            name='error_rate')

    
    # Load the dataset
    print("Loading data...")
    istest = 'test' in experimentconfig.keys()
    if istest:
        print("Using test stream")
    train_stream, valid_stream, test_stream = get_stream(experimentconfig['batch_size'],image_size,test=istest)

    # Defining step rule and algorithm
    if 'step_rule' in experimentconfig.keys() and not experimentconfig['step_rule'] is None :
        step_rule = experimentconfig['step_rule'](learning_rate=experimentconfig['learning_rate'])
    else :
        step_rule=Scale(learning_rate=experimentconfig['learning_rate'])

    params = map(lasagne.utils.as_theano_expression,lasagne.layers.get_all_params(network, trainable=True))
    print("Initializing algorithm")
    algorithm = GradientDescent(
                cost=loss, gradients={var:T.grad(loss,var) for var in params},#parameters=cg.parameters, #params
                step_rule=step_rule)

    #algorithm.add_updates(extra_updates)


    grad_norm = aggregation.mean(algorithm.total_gradient_norm)
    grad_norm.name = "grad_norm"

    print("Initializing extensions...")
    plot = Plot(save_to, channels=[['train_loss','valid_loss'], 
['train_grad_norm'],
#['train_grad_norm','train_reg_penalty'],
['train_error_rate','valid_error_rate']], server_url='http://hades.calculquebec.ca:5042')    

    checkpoint = Checkpoint('models/best_'+save_to+'.tar')
  #  checkpoint.add_condition(['after_n_batches=25'],

    checkpoint.add_condition(['after_epoch'],
                         predicate=OnLogRecord('valid_error_rate_best_so_far'))

    #Defining extensions
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=experimentconfig['num_epochs'],
                              after_n_batches=experimentconfig['num_batches']),
                  TrainingDataMonitoring([test_loss, error_rate, grad_norm], # reg_penalty],
                  prefix="train", after_epoch=True), #after_n_epochs=1
                  DataStreamMonitoring([test_loss, error_rate],valid_stream,prefix="valid", after_epoch=True), #after_n_epochs=1
                  plot,
                  #Checkpoint(save_to,after_n_epochs=5),
                  #ProgressBar(),
             #     Plot(save_to, channels=[['train_loss','valid_loss'], ['train_error_rate','valid_error_rate']], server_url='http://hades.calculquebec.ca:5042'), #'grad_norm'
                  #       after_batch=True),
                  Printing(after_epoch=True),
                  TrackTheBest('valid_error_rate',min), #Keep best
                  checkpoint,  #Save best
                  FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=5)] # Early-stopping

 #   model = Model(loss)
 #   print("Model",model)


    main_loop = MainLoop(
        algorithm,
        train_stream,
       # model=model,
        extensions=extensions)
    print("Starting main loop...")

    main_loop.run()
Beispiel #20
0
def main(save_to, num_epochs,
         weight_decay=0.0001, noise_pressure=0, subset=None, num_batches=None,
         batch_size=None, histogram=None, resume=False):
    output_size = 10

    prior_noise_level = -10
    noise_step_rule = Scale(1e-6)
    noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX))
    convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True,
            noise_rate=noise_rate,
            prior_noise_level=prior_noise_level)

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
            .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                  .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                  .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)

    # train_cost, train_error_rate, train_components = train_cg.outputs

    with batch_normalization(convnet):
        with training_noise(convnet):
            train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(y.flatten(),
                train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(y.flatten(),
                train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                train_error_rate, train_components])
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                for p, m in population_updates]

    # for annealing
    nit_penalty = theano.shared(numpy.asarray(noise_pressure, dtype=theano.config.floatX))
    nit_penalty.name = 'nit_penalty'

    # Compute noise rates for training graph
    train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables)
    train_mean_log_sigma = tensor.concatenate([n.flatten() for n in train_logsigma]).mean()
    train_mean_log_sigma.name = 'mean_log_sigma'
    train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables)
    train_nit_rate = tensor.concatenate([n.flatten() for n in train_nits]).mean()
    train_nit_rate.name = 'nit_rate'
    train_nit_regularization = nit_penalty * train_nit_rate
    train_nit_regularization.name = 'nit_regularization'

    # Apply regularization to the cost
    trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])(
            train_cg.parameters)
    mask_parameters = [p for p in trainable_parameters
            if get_brick(p).name == 'mask']
    noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters)
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    nonmask_weights = [p for p in weights if get_brick(p).name != 'mask']
    l2_norm = sum([(W ** 2).sum() for W in nonmask_weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = weight_decay * l2_norm
    l2_regularization.name = 'l2_regularization'

    # testversion
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + l2_regularization + train_nit_regularization
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
        which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 128
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test,
        iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # Create a step rule that reduces the learning rate of noise
    scale_mask = Restrict(noise_step_rule, mask_parameters)
    step_rule = CompositeRule([scale_mask, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=trainable_parameters,
        step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    #,
    #    theano_func_kwargs={
    #        'mode': NanGuardMode(
    #            nan_is_error=True, inf_is_error=True, big_is_error=True)})

    exp_name = save_to.replace('.%d', '')

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),     # Warm up with 0.01 learning rate
                      (50, 0.1),     # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  EpochSchedule(noise_step_rule.learning_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4)
                  ]),
                  EpochSchedule(noise_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4),
                      # (6, 3e-4),
                      # (8, 1e-3), # Causes nit rate to jump
                      # (10, 3e-3),
                      # (12, 1e-2),
                      # (15, 3e-2),
                      # (19, 1e-1),
                      # (24, 3e-1),
                      # (30, 1)
                  ]),
                  NoiseExtension(
                      noise_parameters=noise_parameters),
                  NoisyDataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      noise_parameters=noise_parameters,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate, train_nit_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       train_nit_regularization,
                       momentum.learning_rate,
                       train_mean_log_sigma,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                      # after_epoch=True),
                  Plot('Training performance for ' + exp_name,
                      channels=[
                          ['train_cost_with_regularization',
                           'train_cost_without_regularization',
                           'train_nit_regularization',
                           'train_l2_regularization'],
                          ['train_error_rate'],
                          ['train_total_gradient_norm'],
                          ['train_mean_log_sigma'],
                      ],
                      every_n_batches=17),
                  Plot('Test performance for ' + exp_name,
                      channels=[[
                          'train_error_rate',
                          'test_error_rate',
                          ]],
                      after_epoch=True),
                  EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            parameters=cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(exp_name, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
Beispiel #21
0
x1 = encoder.apply(f1)
x2 = encoder.apply(f2)

from cost import ContrastiveLoss

cost = ContrastiveLoss(q=dims[-1]).apply(x1=x1, x2=x2, y1=y1, y2=y2)
cost.name = 'contrastive_loss'
cost_test = cost.copy('contrastive_loss_test')

cg = ComputationGraph(cost)

algorithm = GradientDescent(cost=cost,
                            step_rule=AdaDelta(),
                            parameters=cg.parameters)

main_loop = MainLoop(algorithm=algorithm,
                     data_stream=train_stream,
                     extensions=[
                         TrainingDataMonitoring(variables=[cost],
                                                after_epoch=True),
                         DataStreamMonitoring(data_stream=test_stream,
                                              variables=[cost_test],
                                              after_epoch=True),
                         Printing(after_epoch=True),
                         FinishAfter(after_n_epochs=50),
                         Plot(document='siamese network larger',
                              channels=[[cost.name, cost_test.name]],
                              start_server=True,
                              after_epoch=True)
                     ])
Beispiel #22
0
def main(config,
         tr_stream,
         dev_stream,
         use_bokeh=False,
         src_vocab=None,
         trg_vocab=None):

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)
    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING

    # TODO: allow user to remove some params from the graph, for example if embeddings should be kept static
    if config.get('l2_regularization', False) is True:
        l2_reg_alpha = config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)

        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

        # why do we need to name the cost variable? Where did the original name come from?
        cost.name = 'decoder_cost_cost'

    cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # allow user to externally initialize some params
    model_params = training_model.get_parameter_dict()
    if config.get('external_embeddings', None) is not None:
        for key in config['external_embeddings']:
            path_to_params = config['external_embeddings'][key]
            logger.info(
                'Replacing {} parameters with external params at: {}'.format(
                    key, path_to_params))
            external_params = numpy.load(path_to_params)
            len_external_idx = external_params.shape[0]
            print(external_params.shape)
            # Working: look in the dictionary and overwrite the correct rows
            existing_params = model_params[key].get_value()
            if key == '/bidirectionalencoder/embeddings.W':
                vocab = src_vocab
            elif key == '/decoder/sequencegenerator/readout/lookupfeedbackwmt15/lookuptable.W':
                vocab = trg_vocab
            else:
                raise KeyError(
                    'Unknown embedding parameter key: {}'.format(key))
            for k, i in vocab.items():
                if i < len_external_idx:
                    existing_params[i] = external_params[i]

            # model_params_shape = model_params[key].get_value().shape
            # assert model_params[key].get_value().shape == external_params.shape, ("Parameter dims must not change,"
            #                                                                       "shapes {} and {} do not match".
            #                                                                       format(model_params_shape,
            #                                                                              external_params.shape))
            model_params[key].set_value(existing_params)

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = []

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        # note that generated containes several different outputs
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    # Note: this is broken for unicode chars
    #if config['hook_samples'] >= 1:
    #    logger.info("Building sampler")
    #    extensions.append(
    #        Sampler(model=search_model, data_stream=tr_stream,
    #                hook_samples=config['hook_samples'],
    #                every_n_batches=config['sampling_freq'],
    #                src_vocab_size=config['src_vocab_size']))

    # WORKING: remove these validators in favor of Async
    # TODO: implement burn-in in the validation extension (don't fire until we're past the burn-in iteration)
    # Add early stopping based on bleu
    # if config.get('bleu_script', None) is not None:
    #     logger.info("Building bleu validator")
    #     extensions.append(
    #         BleuValidator(sampling_input, samples=samples, config=config,
    #                       model=search_model, data_stream=dev_stream,
    #                       normalize=config['normalized_bleu'],
    #                       every_n_batches=config['bleu_val_freq']))

    # Add early stopping based on Meteor
    # if config.get('meteor_directory', None) is not None:
    #     logger.info("Building meteor validator")
    #     extensions.append(
    #         MeteorValidator(sampling_input, samples=samples, config=config,
    #                       model=search_model, data_stream=dev_stream,
    #                       normalize=config['normalized_bleu'],
    #                       every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # if there is dropout or random noise, we need to use the output of the modified graph
    if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0:
        algorithm = GradientDescent(cost=cg.outputs[0],
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))
    else:
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))

    # enrich the logged information
    extensions.extend([
        Timing(every_n_batches=100),
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ])

    # External non-blocking validation
    extensions.append(
        RunExternalValidation(config=config,
                              every_n_batches=config['bleu_val_freq']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[['decoder_cost_cost'],
                           ['validation_set_bleu_score'],
                           ['validation_set_meteor_score']],
                 every_n_batches=1))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Beispiel #23
0
    def _train_model(self, train_stream, valid_stream, load_from, save_to,
                     *args, **kwargs):
        # Build model
        self.model = self.config.Model(self.config,
                                       self.dataset)  # with word2id

        cg = Model(self.model.cg_generator)

        algorithm = GradientDescent(cost=self.model.cg_generator,
                                    step_rule=self.config.step_rule,
                                    parameters=cg.parameters,
                                    on_unused_sources='ignore')

        if plot_avail:
            extensions = [
                FinishAfter(after_n_epochs=1),
                TrainingDataMonitoring(
                    [v for l in self.model.monitor_train_vars for v in l],
                    prefix='train',
                    every_n_batches=self.config.print_freq),
                Plot('Training Process',
                     channels=[
                         v.name for l in self.model.monitor_train_vars
                         for v in l
                     ],
                     after_batch=True)
            ]
        else:
            extensions = [
                TrainingDataMonitoring(
                    [v for l in self.model.monitor_train_vars for v in l],
                    prefix='train',
                    every_n_batches=self.config.print_freq)
            ]

        saver_loader = self.model_save_loader(load_from=load_from,
                                              save_to=save_to,
                                              model=cg,
                                              dataset=self.dataset)
        saver_loader.do_load()

        n_batches = numpy.ceil(self.n_samples /
                               self.config.batch_size).astype('int32')
        n_valid_batches = numpy.ceil(n_batches *
                                     self.config.valid_freq).astype('int32')
        extensions += [
            EvaluatorWithEarlyStop(
                coverage=1.,
                tolerate_time=self.config.tolerate_time,
                variables=[
                    v for l in self.model.monitor_valid_vars for v in l
                ],
                monitor_variable=self.model.stop_monitor_var,
                data_stream=valid_stream,
                saver=saver_loader,
                prefix='valid',
                every_n_batches=n_valid_batches)
        ]

        extensions += [
            Printing(every_n_batches=self.config.print_freq, after_epoch=True),
            ProgressBar()
        ]
        extensions += [EpochMonitor(1)]
        main_loop = MainLoop(model=cg,
                             data_stream=train_stream,
                             algorithm=algorithm,
                             extensions=extensions)
        # Run the model!
        main_loop.run()
Beispiel #24
0
def initialaze_algorithm(config, save_path, bokeh_name, params, bokeh_server,
                         bokeh, use_load_ext, load_log, fast_start, 
                         recognizer, data, model, cg, regularized_cg,
                         cost, train_cost, parameters, 
                         max_norm_rules, observables,
                         batch_size, batch_cost, weights_entropy, 
                         labels_mask, labels,  gradients=None):
    primary_observables = observables
    secondary_observables = []
    validation_observables = []
    root_path, extension = os.path.splitext(save_path)
    train_conf = config['training']
    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'], train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    if 'adam' in rule_names:
        assert len(rule_names) == 1
        logger.info("Using Adam for training")
        core_rules.append(
            Adam(learning_rate=train_conf.get('scale', 0.002),
                 beta1=train_conf.get('beta1', 0.1),
                 beta2=train_conf.get('beta2', 0.001),
                 epsilon=train_conf.get('epsilon', 1e-8),
                 decay_factor=train_conf.get('decay_rate', (1 - 1e-8))))
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(
            BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')
        #theano_func_kwargs={'mode':NanGuardMode(nan_is_error=True)})

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if not 'train_cost' in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    for name, param in parameters.items():
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost,
        algorithm.total_gradient_norm,
        algorithm.total_step_norm, clipping.threshold]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances')] + weights_entropy


    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name.startswith('weights_entropy'):
                chld_id = recognizer.child_id_from_postfix(var.name)
                result.append(rename(aggregation.mean(var, labels_mask[chld_id].sum()),
                                     'weights_entropy_per_label'+
                                     recognizer.children[chld_id].names_postfix))
            elif var.name.endswith('_nll'):
                chld_id = recognizer.child_id_from_postfix(var.name)
                result.append(rename(aggregation.mean(var.sum(),
                                                      labels_mask[chld_id].sum()),
                                     var.name+'_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']
    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(TrainingDataMonitoring(
        primary_observables, after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables),
        data.get_stream("valid", shuffle=False, **data_params_valid), prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)

    additional_patience_notifiers = []
    uas = DependencyErrorRate(recognizer.children[0], data,
                              **config['monitoring']['search'])
    las = AuxiliaryErrorRates(uas, name='LAS')
    lab = AuxiliaryErrorRates(uas, name='LAB')
    per_monitoring = DataStreamMonitoring(
        [uas, las, lab], data.get_one_stream("valid", data.langs[0], batches=False, shuffle=False, **data_params_valid)[0],
        prefix="valid").set_conditions(
                before_first_epoch=not fast_start,
                every_n_epochs=mon_conf['search_every_epochs'],
                every_n_batches=mon_conf['search_every_batches'],
                after_training=False)
    extensions.append(per_monitoring)
    track_the_best_uas = TrackTheBest(
        per_monitoring.record_name(uas)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_las = TrackTheBest(
        per_monitoring.record_name(las)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_lab = TrackTheBest(
        per_monitoring.record_name(lab)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_uas,
                   track_the_best_las,
                   track_the_best_lab,
                   ]
    per = uas
    track_the_best_per = track_the_best_uas
    additional_patience_notifiers = [track_the_best_lab,
                                     track_the_best_las]
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_cost]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500,
        num_stds=train_conf.get('clip_stds', 1.0)))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf['num_batches'],
                    after_n_epochs=train_conf['num_epochs']),
            # .add_condition(["after_batch"], _gradient_norm_is_none),
    ]
    main_postfix = recognizer.children[0].names_postfix
    channels = [
        # Plot 1: training and validation costs
        [average_monitoring.record_name(train_cost),
         validation.record_name(cost)],
        # Plot 2: gradient norm,
        [average_monitoring.record_name(algorithm.total_gradient_norm),
         average_monitoring.record_name(clipping.threshold)],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [average_monitoring._record_name('weights_entropy_per_label'+main_postfix),
         validation._record_name('weights_entropy_per_label'+main_postfix)],
        # Plot 5: training and validation monotonicity penalty
        [average_monitoring._record_name('weights_penalty_per_recording'+main_postfix),
         validation._record_name('weights_penalty_per_recording'+main_postfix)]]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name
                 else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),]
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_cost.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar()]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name] + additional_patience_notifiers
        extensions.append(Patience(**patience_conf))

    if train_conf.get('min_performance_stops'):
        extensions.append(EarlyTermination(
            param_name=track_the_best_per.best_name,
            min_performance_by_epoch=train_conf['min_performance_stops']))

    extensions.append(Printing(every_n_batches=1,
                               attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
Beispiel #25
0
    TrainingDataMonitoring(
        [cost, error_rate,
         aggregation.mean(algorithm.total_gradient_norm)],
        prefix="train",
        after_epoch=True),
    DataStreamMonitoring([cost, error_rate, error_rate2],
                         stream_valid,
                         prefix="valid",
                         after_epoch=True),
    Checkpoint("google_Ortho2_pretrain2_l0001.pkl", after_epoch=True),
    ProgressBar(),
    Printing()
]

#Adding a live plot with the bokeh server
extensions.append(
    Plot('CatsVsDogs160_GoogleNet_Reload2_l0001',
         channels=[['train_error_rate', 'valid_error_rate'],
                   ['valid_cost', 'valid_error_rate2'],
                   ['train_total_gradient_norm']],
         after_batch=True))

params = load_parameter_values('GoogleParameters.pkl')
model = Model(cost)
model.set_parameter_values(params)
main_loop = MainLoop(algorithm,
                     data_stream=stream_train,
                     model=model,
                     extensions=extensions)
main_loop.run()
Beispiel #26
0
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh,
                   test_tag, use_load_ext, load_log, fast_start):
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result

    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(batch=True,
                                   prediction=prediction,
                                   prediction_mask=prediction_mask)
    labels, = VariableFilter(applications=[recognizer.cost], name='labels')(cg)
    labels_mask, = VariableFilter(applications=[recognizer.cost],
                                  name='labels_mask')(cg)

    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(rename(gain_matrix.min(), 'min_gain'))
        primary_observables.append(rename(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(recognizer.labels.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(applications=[r.generator.readout.readout],
                               name="output_0")(cost_cg)
    bottom_output = VariableFilter(
        # We need name_regex instead of name because LookupTable calls itsoutput output_0
        applications=[r.bottom.apply],
        name_regex="output")(cost_cg)[-1]
    attended, = VariableFilter(applications=[r.generator.transition.apply],
                               name="attended")(cost_cg)
    attended_mask, = VariableFilter(applications=[
        r.generator.transition.apply
    ],
                                    name="attended_mask")(cost_cg)
    weights, = VariableFilter(applications=[r.generator.evaluate],
                              name="weights")(cost_cg)

    from blocks.roles import AUXILIARY
    l2_cost, = VariableFilter(roles=[AUXILIARY],
                              theano_name='l2_cost_aux')(cost_cg)
    cost_forward, = VariableFilter(roles=[AUXILIARY],
                                   theano_name='costs_forward_aux')(cost_cg)

    max_recording_length = rename(bottom_output.shape[0],
                                  "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = rename(attended_mask.shape[0],
                                      "max_attended_mask_length")
    max_attended_length = rename(attended.shape[0], "max_attended_length")
    max_num_phonemes = rename(labels.shape[0], "max_num_phonemes")
    min_energy = rename(energies.min(), "min_energy")
    max_energy = rename(energies.max(), "max_energy")
    mean_attended = rename(abs(attended).mean(), "mean_attended")
    mean_bottom_output = rename(
        abs(bottom_output).mean(), "mean_bottom_output")
    weights_penalty = rename(monotonicity_penalty(weights, labels_mask),
                             "weights_penalty")
    weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy")
    mask_density = rename(labels_mask.mean(), "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy, min_energy, max_energy,
        mean_attended, mean_bottom_output, batch_size, max_num_phonemes,
        mask_density
    ])
    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config.get('regularization', dict())
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [
            p for p in cg.parameters if p not in attention_params
        ]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost + reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (
            train_cost + reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2)

    train_cost = rename(train_cost, 'train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0)
                or (reg_config.get("decay", .0) > 0)):
            logger.error('using  adaptive noise with alignment weight panalty '
                         'or weight decay is probably stupid')
        train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise(
            cg,
            cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=Model(
                regularized_cg.outputs[0]).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise'))
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0], 'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] + regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]
        ]  # model prior variance

    model = Model(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that dies not have
        # shapred variables added by adaptive weight noise
        with open(params, 'r') as src:
            param_values = load_parameters(src)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, parameters[key].get_value().shape)
                                for key in sorted(parameters.keys())],
                               width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'],
                                   train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(
            AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [
                v for v in maxnorm_subjects
                if not isinstance(get_brick(v), LookupTable)
            ]
        logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat(
            [name for name, p in parameters.items() if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([
            name for name, p in parameters.items() if not p in maxnorm_subjects
        ]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)
        ]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if not 'train_cost' in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold
    ]
    for name, param in parameters.items():
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements**0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements**0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm,
        clipping.threshold, max_recording_length, max_attended_length,
        max_attended_mask_length
    ]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty
    ]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(
                    rename(aggregation.mean(var, batch_size),
                           'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(
                    rename(aggregation.mean(var, labels_mask.sum()),
                           'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(
            Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(
        TrainingDataMonitoring(primary_observables + [l2_cost, cost_forward],
                               after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average",
        every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables +
                                   [l2_cost, cost_forward]),
        data.get_stream("valid", shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)
    per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search'])
    per_monitoring = DataStreamMonitoring(
        [per],
        data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(before_first_epoch=True,
                                                     after_epoch=True)
    extensions += [track_the_best_cost, track_the_best_per]
    extensions.append(
        AdaptiveClipping(algorithm.total_gradient_norm.name,
                         clipping,
                         train_conf['gradient_threshold'],
                         decay_rate=0.998,
                         burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf.get('num_batches'),
                    after_n_epochs=train_conf.get('num_epochs')).add_condition(
                        ["after_batch"], _gradient_norm_is_none),
    ]
    channels = [
        # Plot 1: training and validation costs
        [
            average_monitoring.record_name(train_cost),
            validation.record_name(cost)
        ],
        # Plot 2: gradient norm,
        [
            average_monitoring.record_name(algorithm.total_gradient_norm),
            average_monitoring.record_name(clipping.threshold)
        ],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [
            average_monitoring._record_name('weights_entropy_per_label'),
            validation._record_name('weights_entropy_per_label')
        ],
        # Plot 5: training and validation monotonicity penalty
        [
            average_monitoring._record_name('weights_penalty_per_recording'),
            validation._record_name('weights_penalty_per_recording')
        ]
    ]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),
        ]
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start,
                   after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True).add_condition(
                       ['after_epoch'],
                       OnLogRecord(track_the_best_per.notification_name),
                       (root_path + "_best" + extension, )).add_condition(
                           ['after_epoch'],
                           OnLogRecord(track_the_best_cost.notification_name),
                           (root_path + "_best_ll" + extension, )),
        ProgressBar()
    ]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))
    if config['net']['criterion']['name'].startswith('mse'):
        extensions.append(
            LogInputsGains(labels, cg, recognizer.generator.readout.emitter,
                           data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name
            ]
        extensions.append(Patience(**patience_conf))

    extensions.append(
        Printing(every_n_batches=1, attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
Beispiel #27
0
algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            on_unused_sources='ignore',
                            step_rule=CompositeRule([
                                StepClipping(10.),
                                Adam(learning_rate),
                            ]))
main_loop = MainLoop(model=Model(cost),
                     data_stream=data_stream_train,
                     algorithm=algorithm,
                     extensions=[
                         Timing(),
                         FinishAfter(after_n_epochs=n_epochs), monitor,
                         monitor_val, monitor_test,
                         saveSnapshot(
                             '/home/xuehongyang/checkpoints_read/snapshot',
                             save_main_loop=False,
                             after_epoch=True,
                             save_separately=['log', 'model']),
                         ProgressBar(),
                         Printing(every_n_batches=500),
                         Plot('videoqa_open_rereading',
                              channels=[['train_cost']],
                              every_n_batches=500,
                              after_batch=True)
                     ])

print('starting...')

main_loop.run()
Beispiel #28
0
def run(get_model, model_name):
    train_stream = ServerDataStream(
        ('cases', 'image_features', 'image_targets', 'multiplier'),
        False,
        hwm=10)
    valid_stream = ServerDataStream(
        ('cases', 'image_features', 'image_targets', 'multiplier'),
        False,
        hwm=10,
        port=5558)

    input_var = tensor.tensor4('image_features')
    target_var = tensor.tensor4('image_targets')
    multiply_var = tensor.matrix('multiplier')
    multiply_var = T.addbroadcast(multiply_var, 1)

    test_prediction, prediction, params = get_model(input_var, target_var,
                                                    multiply_var)

    loss = binary_crossentropy(prediction, target_var).mean()

    loss.name = 'loss'

    valid_error = T.neq((test_prediction > 0.5) * 1., target_var).mean()
    valid_error.name = 'error'

    scale = Scale(0.1)
    algorithm = GradientDescent(
        cost=loss,
        parameters=params,
        step_rule=scale,
        #step_rule=Adam(),
        on_unused_sources='ignore')

    host_plot = 'http://localhost:5006'

    extensions = [
        Timing(),
        TrainingDataMonitoring([loss], after_epoch=True),
        DataStreamMonitoring(variables=[loss, valid_error],
                             data_stream=valid_stream,
                             prefix="valid"),
        Plot('%s %s %s' %
             (model_name, datetime.date.today(), time.strftime('%H:%M')),
             channels=[['loss', 'valid_loss'], ['valid_error']],
             after_epoch=True,
             server_url=host_plot),
        Printing(),
        # Checkpoint('train'),
        FinishAfter(after_n_epochs=10)
    ]

    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    cg = ComputationGraph(test_prediction)
    while True:
        main_loop.run()
        scale.learning_rate.set_value(
            numpy.float32(scale.learning_rate.get_value() * 0.7))
        numpy.savez('best_weights.npz',
                    [param.get_value() for param in cg.shared_variables])
Beispiel #29
0
extensions = [
    Timing(),
    FinishAfter(after_n_epochs=num_epochs),
    DataStreamMonitoring([cost, error_rate, error_rate2],
                         stream_valid,
                         prefix="valid"),
    TrainingDataMonitoring(
        [cost, error_rate,
         aggregation.mean(algorithm.total_gradient_norm)],
        prefix="train",
        after_epoch=True),
    Checkpoint("catsVsDogs256.pkl"),
    ProgressBar(),
    Printing()
]
#Adding a live plot with the bokeh server

extensions.append(
    Plot('CatsVsDogs_256',
         channels=[['train_error_rate', 'valid_error_rate'],
                   ['valid_cost', 'valid_error_rate2'],
                   ['train_total_gradient_norm']],
         after_epoch=True))

model = Model(cost)
main_loop = MainLoop(algorithm,
                     stream_train,
                     model=model,
                     extensions=extensions)
main_loop.run()
Beispiel #30
0
def run(get_model, model_name):
    train_stream = ServerDataStream(('cases', 'image_position', 'multiplier',
                                     'sax', 'sax_features', 'targets'),
                                    False,
                                    hwm=10)
    valid_stream = ServerDataStream(('cases', 'image_position', 'multiplier',
                                     'sax', 'sax_features', 'targets'),
                                    False,
                                    hwm=10,
                                    port=5558)

    ftensor5 = tensor.TensorType('float32', (False, ) * 5)

    input_var = ftensor5('sax_features')
    target_var = tensor.matrix('targets')
    multiply_var = tensor.matrix('multiplier')
    multiply_var = T.addbroadcast(multiply_var, 1)

    prediction, test_prediction, test_pred_mid, params_bottom, params_top = get_model(
        input_var, multiply_var)

    # load parameters
    cg = ComputationGraph(test_pred_mid)
    params_val = numpy.load('sunnybrook/best_weights.npz')

    for p, value in zip(cg.shared_variables, params_val['arr_0']):
        p.set_value(value)

    crps = tensor.abs_(test_prediction - target_var).mean()

    loss = squared_error(prediction, target_var).mean()

    loss.name = 'loss'
    crps.name = 'crps'

    algorithm = GradientDescent(cost=loss,
                                parameters=params_top,
                                step_rule=Adam(),
                                on_unused_sources='ignore')

    host_plot = 'http://localhost:5006'

    extensions = [
        Timing(),
        TrainingDataMonitoring([loss], after_epoch=True),
        DataStreamMonitoring(variables=[crps, loss],
                             data_stream=valid_stream,
                             prefix="valid"),
        Plot('%s %s %s' %
             (model_name, datetime.date.today(), time.strftime('%H:%M')),
             channels=[['loss', 'valid_loss'], ['valid_crps']],
             after_epoch=True,
             server_url=host_plot),
        Printing(),
        Checkpoint('train'),
        FinishAfter(after_n_epochs=20)
    ]

    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()