Example #1
0
 def test_loading_exception(self):
     """Check that loading a file that is not a tar archive raises.

     A one-byte scratch file is written to disk and its name is handed to
     ``Load``; calling ``before_training`` is expected to raise
     ``tarfile.ReadError``.
     """
     import os  # local import: only needed to remove the scratch file

     with tempfile.NamedTemporaryFile(delete=False) as f:
         f.write("a".encode("utf-8"))
     # ``delete=False`` keeps the file on disk after the ``with`` block, so
     # it has to be removed explicitly or every test run leaks one file.
     self.addCleanup(os.remove, f.name)
     load = Load(f.name)
     load.main_loop = self.main_loop
     self.assertRaises(tarfile.ReadError, load.before_training)
Example #2
0
 def test_loading_exception(self):
     """Check that loading a file that is not a tar archive raises.

     A one-byte scratch file is written to disk and its name is handed to
     ``Load``; calling ``before_training`` is expected to raise
     ``tarfile.ReadError``.
     """
     import os  # local import: only needed to remove the scratch file

     with tempfile.NamedTemporaryFile(delete=False) as f:
         f.write('a'.encode('utf-8'))
     # ``delete=False`` keeps the file on disk after the ``with`` block, so
     # it has to be removed explicitly or every test run leaks one file.
     self.addCleanup(os.remove, f.name)
     load = Load(f.name)
     load.main_loop = self.main_loop
     self.assertRaises(tarfile.ReadError, load.before_training)
def prepare_opti(cost, test, *args):
    """Build the model, the optimiser and the main-loop extensions.

    Parameters
    ----------
    cost
        Cost variable; wrapped in a ``Model`` and minimised with
        ``GradientDescent`` using an ``Adam`` step rule.
    test
        Not referenced in the body; kept for interface compatibility.
    *args
        Extra variables monitored alongside ``algorithm.cost``.

    Returns
    -------
    tuple
        ``(model, algorithm, extensions)``.

    Notes
    -----
    Reads the names ``nb_epoch``, ``patience``, ``test_stream``, ``check``,
    ``save_every``, ``path``, ``dataset`` and ``resume`` from the enclosing
    scope.
    """
    model = Model(cost)
    logger.info("Model created")

    algorithm = GradientDescent(cost=cost,
                                parameters=model.parameters,
                                step_rule=Adam(learning_rate=0.0015),
                                on_unused_sources='ignore')

    # ``args`` is a tuple, so extending with an empty one is a no-op.
    to_monitor = [algorithm.cost]
    to_monitor.extend(args)

    extensions = [
        FinishAfter(after_n_epochs=nb_epoch),
        FinishIfNoImprovementAfter(notification_name='loglikelihood_nat',
                                   epochs=patience),
        TrainingDataMonitoring(to_monitor, prefix="train", after_epoch=True),
        DataStreamMonitoring(to_monitor, test_stream, prefix="test"),
        Printing(),
        ProgressBar(),
        ApplyMask(before_first_epoch=True, after_batch=True),
        Checkpoint(check, every_n_epochs=save_every),
        SaveModel(name=path + '/' + 'pixelcnn_{}'.format(dataset),
                  every_n_epochs=save_every),
        GenerateSamples(every_n_epochs=save_every),
    ]

    if resume:
        logger.info("Restoring from previous checkpoint")
        # Add the loader to the existing extensions instead of replacing
        # them: a resumed run still needs its stopping criteria, monitoring,
        # masking and checkpointing.
        extensions.append(Load(path + '/' + check))

    return model, algorithm, extensions
Example #4
0
def prepare_opti(cost, test):
    """Build the model, the optimiser and the main-loop extensions.

    Parameters
    ----------
    cost
        Cost variable; wrapped in a ``Model`` and minimised with
        ``GradientDescent`` using an ``RMSProp`` step rule.
    test
        Not referenced in the body; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(model, algorithm, extensions)``.

    Notes
    -----
    Reads the names ``nb_epoch``, ``patience``, ``test_stream``, ``resume``
    and ``path`` from the enclosing scope.
    """
    model = Model(cost)

    algorithm = GradientDescent(
        cost=cost,
        parameters=model.parameters,
        step_rule=RMSProp(),
        on_unused_sources='ignore'
    )

    extensions = [
        FinishAfter(after_n_epochs=nb_epoch),
        FinishIfNoImprovementAfter(notification_name='test_cross_entropy',
                                   epochs=patience),
        TrainingDataMonitoring(
            [algorithm.cost],
            prefix="train",
            after_epoch=True),
        DataStreamMonitoring(
            [algorithm.cost],
            test_stream,
            prefix="test"),
        Printing(),
        ProgressBar(),
    ]

    if resume:
        # Parenthesised single-argument form prints the same text on
        # Python 2 and is valid syntax on Python 3.
        print("Restoring from previous breakpoint")
        extensions.append(Load(path))
    return model, algorithm, extensions
Example #5
0
def test_load():
    """Check that ``Load`` restores checkpointed parameter values.

    A small main loop is run with a ``Checkpoint`` extension, the weights
    are then doubled, and a second main loop with a ``Load`` extension is
    expected to bring them back to the value they had before doubling.  A
    third main loop verifies that ``Load`` pointed at a path that was never
    written does not raise.
    """
    import os  # local import: only needed to remove the checkpoint file

    checkpoint_path = 'myweirdmodel.picklebarrel'

    # Create a main loop and checkpoint it
    mlp = MLP(activations=[None],
              dims=[10, 10],
              weights_init=Constant(1.),
              use_bias=False)
    mlp.initialize()
    W = mlp.linear_transformations[0].W
    x = tensor.vector('data')
    cost = mlp.apply(x).mean()
    data = numpy.random.rand(10, 10).astype(theano.config.floatX)
    data_stream = IterableDataset(data).get_example_stream()

    main_loop = MainLoop(data_stream=data_stream,
                         algorithm=GradientDescent(cost=cost, parameters=[W]),
                         extensions=[
                             FinishAfter(after_n_batches=5),
                             Checkpoint(checkpoint_path)
                         ])
    try:
        main_loop.run()

        # Load the parameters, log and iteration state
        old_value = W.get_value()
        W.set_value(old_value * 2)
        main_loop = MainLoop(model=Model(cost),
                             data_stream=data_stream,
                             algorithm=GradientDescent(cost=cost,
                                                       parameters=[W]),
                             extensions=[
                                 Load(checkpoint_path,
                                      load_iteration_state=True,
                                      load_log=True)
                             ])
        main_loop.extensions[0].main_loop = main_loop
        main_loop._run_extensions('before_training')
        assert_allclose(W.get_value(), old_value)

        # Make sure things work too if the model was never saved before
        main_loop = MainLoop(model=Model(cost),
                             data_stream=data_stream,
                             algorithm=GradientDescent(cost=cost,
                                                       parameters=[W]),
                             extensions=[
                                 Load('mynonexisting.picklebarrel',
                                      load_iteration_state=True,
                                      load_log=True)
                             ])
        main_loop.extensions[0].main_loop = main_loop
        main_loop._run_extensions('before_training')
    finally:
        # Do not leave the checkpoint behind in the working directory,
        # whether the assertions above passed or not.
        if os.path.exists(checkpoint_path):
            os.remove(checkpoint_path)
    def __init__(self, config_dict):
        """Rebuild the training setup, load a checkpoint and compile a cost.

        Datasets, model, algorithm and extensions are rebuilt from
        ``config_dict`` and assembled into a ``MainLoop``; the checkpoint at
        ``config_dict['checkpoint_path']`` is then loaded into it.  The
        pickled object at ``config_dict['dict_path']`` is stored as
        ``self.numbers_from_text`` and a Theano function computing the
        generator cost of an integer matrix is stored as ``self.cost_f``.
        """
        # Parenthesised single-argument form prints the same text on
        # Python 2 and is valid syntax on Python 3.
        print(config_dict)
        train, valid, alphabet = build_datasets(config_dict)
        generator, cost = build_model(len(alphabet), config_dict)
        algorithm = build_algorithm(generator, cost, config_dict)
        extensions = build_extensions(cost, algorithm, valid, config_dict)
        main_loop = MainLoop(algorithm=algorithm, data_stream=train,
                             model=Model(cost), extensions=extensions)
        ml = Load(config_dict['checkpoint_path'], load_log=True)
        ml.load_to(main_loop)
        # From here on use the brick taken from the loaded model rather
        # than the freshly built one.
        generator = main_loop.model.get_top_bricks()[-1]

        # Close the file deterministically instead of leaking the handle.
        # NOTE(review): unpickling executes arbitrary code; 'dict_path'
        # must only ever point at a trusted file.
        with open(config_dict['dict_path']) as dict_file:
            self.numbers_from_text = pickle.load(dict_file)

        x = tensor.lmatrix('sample')
        cost_cg = generator.cost(x)
        self.cost_f = theano.function([x], cost_cg)
Example #7
0
 def test_save_and_load(self):
     """Check that loading 'myweirdmodel.tar' restores the weights."""
     expected = self.W.get_value()
     # Perturb the weights so that a successful load is observable.
     self.W.set_value(expected * 2)
     restored_loop = MainLoop(
         model=self.model,
         data_stream=self.data_stream,
         algorithm=self.algorithm,
         extensions=[Load('myweirdmodel.tar')])
     # The extension is triggered by hand, so it has to be told which
     # main loop it belongs to first.
     loader = restored_loop.extensions[0]
     loader.main_loop = restored_loop
     restored_loop._run_extensions('before_training')
     assert_allclose(self.W.get_value(), expected)
Example #8
0
 def test_load_log_and_iteration_state(self):
     """Check that the log and the iteration state are loaded as well."""
     skip_if_configuration_set('log_backend', 'sqlite',
                               'Bug with log.status["resumed_from"]')
     new_main_loop = MainLoop(
         model=self.model,
         data_stream=self.data_stream,
         algorithm=self.algorithm,
         extensions=[Load('myweirdmodel.tar', True, True)])
     loader = new_main_loop.extensions[0]
     loader.main_loop = new_main_loop
     new_main_loop._run_extensions('before_training')

     # Check the log: walk both status mappings in sorted key order.
     new_keys = sorted(new_main_loop.log.status.keys())
     old_keys = sorted(self.main_loop.log.status.keys())
     for new_key, old_key in zip(new_keys, old_keys):
         assert new_key == old_key
         new_entry = new_main_loop.log.status[new_key]
         old_entry = self.main_loop.log.status[old_key]
         assert (new_entry == old_entry)

     # Check the iteration state: the next item of both must agree.
     new = next(new_main_loop.iteration_state[1])['data']
     old = next(self.main_loop.iteration_state[1])['data']
     assert_allclose(new, old)
Example #9
0
def initialaze_algorithm(config, save_path, bokeh_name, params, bokeh_server,
                         bokeh, use_load_ext, load_log, fast_start,
                         recognizer, data, model, cg, regularized_cg,
                         cost, train_cost, parameters,
                         max_norm_rules, observables,
                         batch_size, batch_cost, weights_entropy,
                         labels_mask, labels, gradients=None):
    """Create the training algorithm and the list of main-loop extensions.

    The step rule is assembled from ``config['training']`` (gradient
    clipping, then momentum / AdaDelta / Adam, then ``max_norm_rules``,
    ``RemoveNotFinite`` and an optional burn-in).  The extensions cover
    loading, timing, training and validation monitoring, dependency error
    rates, best-value tracking, adaptive clipping, optional plotting,
    checkpointing, patience, early termination and printing; which ones
    are added and how often they fire is driven by ``config['training']``,
    ``config['monitoring']`` and the boolean arguments.

    Returns
    -------
    tuple
        ``(model, algorithm, data, extensions)``; ``model`` and ``data`` are
        returned exactly as they were passed in.

    Notes
    -----
    NOTE(review): ``data_params_valid`` is used below but is not defined in
    this function; presumably it is a module-level name -- confirm.
    """
    # NOTE: this is an alias, not a copy -- if ``observables`` is a list,
    # the ``+=`` further down extends the caller's list in place.
    primary_observables = observables
    secondary_observables = []
    validation_observables = []
    root_path, extension = os.path.splitext(save_path)
    train_conf = config['training']
    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'], train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    if 'adam' in rule_names:
        # Adam is not combined with the other core rules.
        assert len(rule_names) == 1
        logger.info("Using Adam for training")
        core_rules.append(
            Adam(learning_rate=train_conf.get('scale', 0.002),
                 beta1=train_conf.get('beta1', 0.1),
                 beta2=train_conf.get('beta2', 0.001),
                 epsilon=train_conf.get('epsilon', 1e-8),
                 decay_factor=train_conf.get('decay_rate', (1 - 1e-8))))
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(
            BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if 'train_cost' not in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    for name, param in parameters.items():
        # ``numpy.prod`` replaces the deprecated alias ``numpy.product``.
        num_elements = numpy.prod(param.get_value().shape)
        # L2 norms divided by sqrt(number of elements).
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost,
        algorithm.total_gradient_norm,
        algorithm.total_step_norm, clipping.threshold]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances')] + weights_entropy

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name.startswith('weights_entropy'):
                chld_id = recognizer.child_id_from_postfix(var.name)
                result.append(rename(aggregation.mean(var, labels_mask[chld_id].sum()),
                                     'weights_entropy_per_label' +
                                     recognizer.children[chld_id].names_postfix))
            elif var.name.endswith('_nll'):
                chld_id = recognizer.child_id_from_postfix(var.name)
                result.append(rename(aggregation.mean(var.sum(),
                                                      labels_mask[chld_id].sum()),
                                     var.name + '_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']
    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
    ]
    extensions.append(TrainingDataMonitoring(
        primary_observables, after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables),
        data.get_stream("valid", shuffle=False, **data_params_valid),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)

    # Error rates computed for the first child of the recognizer on the
    # validation stream of the first language.
    uas = DependencyErrorRate(recognizer.children[0], data,
                              **config['monitoring']['search'])
    las = AuxiliaryErrorRates(uas, name='LAS')
    lab = AuxiliaryErrorRates(uas, name='LAB')
    per_monitoring = DataStreamMonitoring(
        [uas, las, lab],
        data.get_one_stream("valid", data.langs[0], batches=False,
                            shuffle=False, **data_params_valid)[0],
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_uas = TrackTheBest(
        per_monitoring.record_name(uas)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_las = TrackTheBest(
        per_monitoring.record_name(las)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_lab = TrackTheBest(
        per_monitoring.record_name(lab)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_uas,
                   track_the_best_las,
                   track_the_best_lab,
                   ]
    # UAS is used wherever the code below refers to "per".
    per = uas
    track_the_best_per = track_the_best_uas
    additional_patience_notifiers = [track_the_best_lab,
                                     track_the_best_las]
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_cost]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500,
        num_stds=train_conf.get('clip_stds', 1.0)))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf['num_batches'],
                    after_n_epochs=train_conf['num_epochs']),
    ]
    main_postfix = recognizer.children[0].names_postfix
    channels = [
        # Plot 1: training and validation costs
        [average_monitoring.record_name(train_cost),
         validation.record_name(cost)],
        # Plot 2: gradient norm,
        [average_monitoring.record_name(algorithm.total_gradient_norm),
         average_monitoring.record_name(clipping.threshold)],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [average_monitoring._record_name('weights_entropy_per_label' + main_postfix),
         validation._record_name('weights_entropy_per_label' + main_postfix)],
        # Plot 5: training and validation monotonicity penalty
        [average_monitoring._record_name('weights_penalty_per_recording' + main_postfix),
         validation._record_name('weights_penalty_per_recording' + main_postfix)]]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name
                 else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server), ]
    extensions += [
        # Besides the regular checkpoint, two extra conditions are attached
        # that use the "_best" / "_best_ll" paths when the corresponding
        # tracker's notification is in the log.
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_cost.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar()]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            # NOTE(review): the first two entries are notification-name
            # strings while `additional_patience_notifiers` holds the
            # TrackTheBest extension objects themselves -- confirm that
            # `Patience` expects this mix.
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name] + additional_patience_notifiers
        extensions.append(Patience(**patience_conf))

    if train_conf.get('min_performance_stops'):
        extensions.append(EarlyTermination(
            param_name=track_the_best_per.best_name,
            min_performance_by_epoch=train_conf['min_performance_stops']))

    extensions.append(Printing(every_n_batches=1,
                               attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
Example #10
0
# Monitor `monitoring_vars` on the training data every `args.save_every`
# batches; records are prefixed with "train".
train_monitor = TrainingDataMonitoring(
    variables=monitoring_vars,
    every_n_batches=args.save_every,
    prefix="train")

# Monitor the same variables on `valid_stream` at the same batch interval;
# records are prefixed with "valid".
valid_monitor = DataStreamMonitoring(
    monitoring_vars,
    valid_stream,
    every_n_batches=args.save_every,
    after_epoch=False,
    prefix="valid")

extensions = []

# When an experiment name is given, load "best_<name>.tar" from the "pkl"
# sub-directory of `save_dir`.
if args.load_experiment:
    extensions += [Load(os.path.join(
        save_dir, "pkl", "best_" + args.load_experiment + ".tar"))]

extensions += [
    Timing(every_n_batches=args.save_every),
    train_monitor]

extensions += [
    valid_monitor,
    TrackTheBest(
        'valid_' + cost_name,
        every_n_batches=args.save_every,
        before_first_epoch=True),
    Plot(
        os.path.join(save_dir, "progress", exp_name + ".png"),
        plot_names,
        every_n_batches=args.save_every,
Example #11
0
# Multi GPU
# `worker` stays None unless a platoon port was passed on the command line.
worker = None
if args.platoon_port:
    # Imported here so the synchronization packages are only needed when a
    # platoon port is actually given.
    from blocks_extras.extensions.synchronization import (
        Synchronize, SynchronizeWorker)
    from platoon.param_sync import ASGD

    sync_rule = ASGD()
    worker = SynchronizeWorker(
        sync_rule, control_port=args.platoon_port, socket_timeout=2000)

extensions = []

# Load a previous experiment only when there is no worker at all or this
# process is the main worker.
if args.load_experiment and (not worker or worker.is_main_worker):
    extensions += [Load(os.path.join(
        save_dir, "pkl", load_prefix + args.load_experiment + ".tar"))]

extensions += [
    Timing(every_n_batches=args.save_every),
    train_monitor]

if not worker or worker.is_main_worker:
    extensions += [
        valid_monitor,
        TrackTheBest(
            'valid_nll',
            every_n_batches=args.save_every,
            before_first_epoch=True),
        Plot(
            os.path.join(save_dir, "progress", exp_name + ".png"),
            plot_names,
Example #12
0
def train_model(new_training_job, config, save_path, params, fast_start,
                fuel_server, seed):
    """Build the cost, the optimiser and the extensions, then run training.

    Parameters
    ----------
    new_training_job
        When false, the load extension is enabled ``before_training`` so
        the run starts from the checkpoint under ``save_path``.
    config
        Mapping of hyper-parameters and monitoring / checkpoint settings.
    save_path
        Directory that holds the checkpoint files.
    params
        Optional path of a parameter file loaded into the model up front.
    fast_start
        When true, validation before the first epoch and the checkpoints
        before and after training are switched off.
    fuel_server
        When true, training batches are read through a ``ServerDataStream``
        and the underlying stream gets a random seed.
    seed
        When truthy, used as the default seed of Fuel and Blocks.

    Returns
    -------
    None
        The function ends by running the main loop.
    """
    c = config
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    data, model = initialize_data_and_model(config, train_phase=True)

    # full main loop can be saved...
    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    # or only state (log + params) which can be useful not to pickle embeddings
    state_path = os.path.join(save_path, 'training_state.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')
    best_tar_path = os.path.join(save_path, "best_model.tar")

    keys = tensor.lmatrix('keys')
    n_identical_keys = tensor.lvector('n_identical_keys')
    words = tensor.ltensor3('words')
    words_mask = tensor.matrix('words_mask')
    if theano.config.compute_test_value != 'off':
        #TODO
        test_value_data = next(
            data.get_stream('train', batch_size=4,
                            max_length=5).get_epoch_iterator())
        words.tag.test_value = test_value_data[0]
        words_mask.tag.test_value = test_value_data[1]

    if use_keys(c) and use_n_identical_keys(c):
        costs = model.apply(words,
                            words_mask,
                            keys,
                            n_identical_keys,
                            train_phase=True)
    elif use_keys(c):
        costs = model.apply(words, words_mask, keys, train_phase=True)
    else:
        costs = model.apply(words, words_mask, train_phase=True)
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            cg.set_parameter_values(load_parameters(src))

    length = rename(words.shape[1], 'length')
    perplexity, = VariableFilter(name='perplexity')(cg)
    monitored_vars = [length, cost, perplexity]
    if c['proximity_coef']:
        proximity_term, = VariableFilter(name='proximity_term')(cg)
        monitored_vars.append(proximity_term)

    # Same text as the Python 2 print statement, but also valid Python 3.
    print("inputs of the model: {}".format(cg.inputs))

    parameters = cg.get_parameter_dict()
    trained_parameters = parameters.values()
    saved_parameters = parameters.values()

    # Decide which embedding table, if any, is excluded from training and
    # from checkpoints.  ``to_freeze`` stays None when pretrained embeddings
    # are configured but neither flag is set; it used to be left unbound in
    # that case, which made the filtering below raise a NameError.
    to_freeze = None
    if c['embedding_path']:
        if c['freeze_pretrained']:
            logger.debug(
                "Exclude pretrained encoder embeddings from the trained parameters"
            )
            to_freeze = 'main'
        elif c['provide_targets']:
            logger.debug(
                "Exclude pretrained targets from the trained parameters")
            to_freeze = 'target'
    if to_freeze is not None:
        # Looked up once instead of once per parameter.
        frozen = model.get_def_embeddings_params(to_freeze)
        trained_parameters = [
            p for p in trained_parameters if not p == frozen
        ]
        saved_parameters = [
            p for p in saved_parameters if not p == frozen
        ]

    param_lines = [
        " ".join(
            (key, str(parameters[key].get_value().shape),
             'trained' if parameters[key] in trained_parameters else 'frozen'))
        for key in sorted(parameters.keys())
    ]
    logger.info("Cost parameters" + "\n" +
                pprint.pformat(param_lines, width=120))

    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum']))
    algorithm = GradientDescent(cost=cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    train_monitored_vars = list(monitored_vars)
    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)

    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    # We use a completely random seed on purpose. With Fuel server
    # it's currently not possible to restore the state of the training
    # stream. That's why it's probably better to just have it stateless.
    stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None
    training_stream = data.get_stream(
        'train',
        batch_size=c['batch_size'],
        max_length=c['max_length'],
        seed=stream_seed,
        remove_keys=not use_keys(c),
        remove_n_identical_keys=not use_n_identical_keys(c))
    print("trainin_stream will contains sources: {}".format(
        training_stream.sources))

    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)

    validate = c['mon_freq_valid'] > 0

    if validate:
        valid_stream = data.get_stream(
            'valid',
            batch_size=c['batch_size_valid'],
            max_length=c['max_length'],
            seed=stream_seed,
            remove_keys=not use_keys(c),
            remove_n_identical_keys=not use_n_identical_keys(c))
        validation = DataStreamMonitoring(
            monitored_vars, valid_stream,
            prefix="valid").set_conditions(before_first_epoch=not fast_start,
                                           on_resumption=True,
                                           every_n_batches=c['mon_freq_valid'])
        track_the_best = TrackTheBest(validation.record_name(cost),
                                      choose_best=min).set_conditions(
                                          on_resumption=True,
                                          after_epoch=True,
                                          every_n_batches=c['mon_freq_valid'])

    # don't save them the entire main loop to avoid pickling everything
    if c['fast_checkpoint']:
        cp_path = state_path
        load = (LoadNoUnpickling(cp_path,
                                 load_iteration_state=True,
                                 load_log=True).set_conditions(
                                     before_training=not new_training_job))
        cp_args = {
            'save_main_loop': False,
            'save_separately': ['log', 'iteration_state'],
            'parameters': saved_parameters
        }

    else:
        cp_path = main_loop_path
        load = (Load(cp_path, load_iteration_state=True,
                     load_log=True).set_conditions(
                         before_training=not new_training_job))
        cp_args = {
            'save_separately': ['iteration_state'],
            'parameters': saved_parameters
        }

    checkpoint = Checkpoint(cp_path,
                            before_training=not fast_start,
                            every_n_batches=c['save_freq_batches'],
                            after_training=not fast_start,
                            **cp_args)

    # Intermediate checkpoints are only created when at least one of the
    # two frequencies is positive.
    use_intermediate_cp = (c['checkpoint_every_n_batches'] > 0 or
                           c['checkpoint_every_n_epochs'] > 0)
    if use_intermediate_cp:
        intermediate_cp = IntermediateCheckpoint(
            cp_path,
            every_n_epochs=c['checkpoint_every_n_epochs'],
            every_n_batches=c['checkpoint_every_n_batches'],
            after_training=False,
            **cp_args)

    if validate:
        # Extra condition that uses `best_tar_path` when the tracker's
        # notification is in the log.
        checkpoint = checkpoint.add_condition(
            ['after_batch', 'after_epoch'],
            OnLogRecord(track_the_best.notification_name), (best_tar_path, ))

    extensions = [
        load,
        StartFuelServer(original_training_stream,
                        stream_path,
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq_train'])
    ]

    extensions.extend([
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']),
    ])
    if validate:
        extensions.extend([validation, track_the_best])

    extensions.append(checkpoint)
    if use_intermediate_cp:
        extensions.append(intermediate_cp)
    extensions.extend(
        [Printing(on_resumption=True, every_n_batches=c['mon_freq_train'])])

    if validate and c['n_valid_early'] > 0:
        extensions.append(
            FinishIfNoImprovementAfter(track_the_best.notification_name,
                                       iterations=c['n_valid_early'] *
                                       c['mon_freq_valid'],
                                       every_n_batches=c['mon_freq_valid']))
    extensions.append(FinishAfter(after_n_epochs=c['n_epochs']))

    logger.info("monitored variables during training:" + "\n" +
                pprint.pformat(train_monitored_vars, width=120))
    logger.info("monitored variables during valid:" + "\n" +
                pprint.pformat(monitored_vars, width=120))

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
Example #13
0
 def test_load_nonexisting(self):
     """Check behaviour when the file to load does not exist."""
     loader = Load("mynonexisting.tar")
     loader.main_loop = self.main_loop
     # No assertion follows: the test passes as long as this call returns
     # without raising.
     loader.before_training()
Example #14
0
def train_model(cost,
                cross_entropy,
                updates,
                train_stream,
                valid_stream,
                args,
                gate_values=None):
    """Assemble the optimiser and extensions for ``cost`` and run training.

    ``args`` supplies the options read below (weight noise, load and save
    paths, monitoring frequency, generation, interactive and gate
    visualisation settings).  ``updates`` is registered with the algorithm
    and passed on to the validation, state-reset, text-generation and
    gate-visualisation extensions.  Nothing is returned; the function ends
    by running the main loop.
    """
    step_rule = learning_algorithm(args)
    cg = ComputationGraph(cost)

    # Regularization: weight noise, applied only for a positive noise level.
    noise_level = args.weight_noise
    if noise_level > 0:
        weight_vars = VariableFilter(roles=[WEIGHT])(cg.variables)
        noisy_cg = apply_noise(cg, weight_vars, noise_level)
        cost = noisy_cg.outputs[0]
    # The name is set whether or not noise was applied.
    cost.name = "cost_with_weight_noise"
    cg = ComputationGraph(cost)

    logger.info(cg.parameters)

    algorithm = GradientDescent(cost=cost,
                                step_rule=step_rule,
                                params=cg.parameters)
    algorithm.add_updates(updates)

    # Extensions are collected in the order they are handed to the loop.
    extensions = []
    if args.load_path is not None:
        extensions.append(Load(args.load_path))

    outputs = [
        variable for variable in cg.variables if variable.name == "presoft"
    ]

    if args.generate:
        text_generation = TextGenerationExtension(
            outputs=outputs,
            generation_length=args.generated_text_lenght,
            initial_text_length=args.initial_text_length,
            every_n_batches=args.monitoring_freq,
            ploting_path=os.path.join(args.save_path, 'prob_plot.png'),
            softmax_sampling=args.softmax_sampling,
            dataset=args.dataset,
            updates=updates,
            interactive_mode=args.interactive_mode)
        extensions.append(text_generation)

    train_monitoring = TrainingDataMonitoring(
        [cost],
        prefix='train',
        every_n_batches=args.monitoring_freq,
        after_epoch=True)
    valid_monitoring = DataStreamMonitoring(
        [cost, cross_entropy],
        valid_stream,
        args.mini_batch_size_valid,
        state_updates=updates,
        prefix='valid',
        before_first_epoch=not (args.visualize_gates),
        every_n_batches=args.monitoring_freq)
    state_reset = ResetStates([v for v, _ in updates], every_n_batches=100)
    extensions += [train_monitoring, valid_monitoring, state_reset,
                   ProgressBar()]

    # Create the directory for saving the model; an existing directory is
    # treated as an error.  Skipped entirely in interactive mode.
    if not args.interactive_mode:
        if os.path.exists(args.save_path):
            raise Exception('Directory already exists')
        os.makedirs(args.save_path)
    early_stopping = EarlyStopping('valid_cross_entropy',
                                   args.patience,
                                   args.save_path,
                                   every_n_batches=args.monitoring_freq)

    # Visualizing extensions
    if args.interactive_mode:
        extensions.append(InteractiveMode())
    if args.visualize_gates and (gate_values is not None):
        if args.rnn_type == "lstm":
            gate_plot = VisualizeGateLSTM(gate_values,
                                          updates,
                                          args.dataset,
                                          ploting_path=None)
            extensions.append(gate_plot)
        elif args.rnn_type == "soft":
            gate_plot = VisualizeGateSoft(gate_values,
                                          updates,
                                          args.dataset,
                                          ploting_path=None)
            extensions.append(gate_plot)
        else:
            # Only the two rnn types above can be visualised.
            assert (False)

    extensions.append(early_stopping)
    extensions.append(Printing(every_n_batches=args.monitoring_freq))

    main_loop = MainLoop(model=Model(cost),
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
def train_language_model(new_training_job, config, save_path, params,
                         fast_start, fuel_server, seed):
    """Build the language-model training graph and run the main loop.

    Parameters
    ----------
    new_training_job : bool
        When false, the load extension is activated before training so
        that the state saved under `save_path` is restored.
    config : dict-like
        Experiment configuration. Keys read here: ``dict_path``,
        ``embedding_path``, ``cache_size``, ``grad_clip_threshold``,
        ``learning_rate``, ``momentum``, ``monitor_parameters``,
        ``batch_size``, ``batch_size_valid``, ``max_length``,
        ``mon_freq_train``, ``mon_freq_valid``, ``fast_checkpoint``,
        ``save_freq_batches``, ``checkpoint_every_n_batches`` and
        ``n_batches``. It is also forwarded to
        `initialize_data_and_model`.
    save_path : str
        Directory under which the checkpoint archives, the pickled
        stream and the summaries are placed.
    params : str or None
        Optional path of a parameter archive used to initialise the
        model before training.
    fast_start : bool
        When true, the validation before the first epoch, the
        checkpoints before/after training and the initial retrieval
        statistics are skipped.
    fuel_server : bool
        When true, training batches are read through a
        `ServerDataStream` and a random stream seed is drawn.
    seed : int or None
        If truthy, installed as the default seed of Fuel and Blocks.
    """
    c = config
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    data, lm, retrieval = initialize_data_and_model(config)

    # full main loop can be saved...
    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    # or only state (log + params) which can be useful not to pickle embeddings
    state_path = os.path.join(save_path, 'training_state.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')
    best_tar_path = os.path.join(save_path, "best_model.tar")

    words = tensor.ltensor3('words')
    words_mask = tensor.matrix('words_mask')
    if theano.config.compute_test_value != 'off':
        # Attach one tiny real batch as test values so that Theano can
        # check the graph while it is being built.
        test_value_data = next(
            data.get_stream('train', batch_size=4,
                            max_length=5).get_epoch_iterator())
        words.tag.test_value = test_value_data[0]
        words_mask.tag.test_value = test_value_data[1]

    costs, updates = lm.apply(words, words_mask)
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        # The archive is binary data: open it in binary mode so that the
        # read is correct on Python 3 and on platforms with newline
        # translation.
        with open(params, 'rb') as src:
            cg.set_parameter_values(load_parameters(src))

    length = rename(words.shape[1], 'length')
    perplexity, = VariableFilter(name='perplexity')(cg)
    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    monitored_vars = [length, cost] + perplexities
    if c['dict_path']:
        num_definitions, = VariableFilter(name='num_definitions')(cg)
        monitored_vars.extend([num_definitions])

    parameters = cg.get_parameter_dict()
    # Materialise the values: they are filtered, searched and handed to
    # the algorithm/checkpoint below, which a dict view does not support
    # uniformly across Python versions.
    trained_parameters = list(parameters.values())
    saved_parameters = list(parameters.values())
    if c['embedding_path']:
        logger.debug("Exclude word embeddings from the trained parameters")
        def_embeddings = lm.get_def_embeddings_params()
        trained_parameters = [
            p for p in trained_parameters if not p == def_embeddings
        ]
        saved_parameters = [
            p for p in saved_parameters if not p == def_embeddings
        ]

    if c['cache_size'] != 0:
        logger.debug("Enable fake recursivity for looking up embeddings")
        cache_params = lm.get_cache_params()
        trained_parameters = [
            p for p in trained_parameters if not p == cache_params
        ]

    parameter_report = [
        " ".join((key, str(parameters[key].get_value().shape),
                  'trained' if parameters[key] in trained_parameters
                  else 'frozen'))
        for key in sorted(parameters.keys())
    ]
    logger.info("Cost parameters" + "\n" +
                pprint.pformat(parameter_report, width=120))

    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum']))
    algorithm = GradientDescent(cost=cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    if c['cache_size'] != 0:
        # The updates returned by `lm.apply` are only registered when the
        # cache is enabled.
        algorithm.add_updates(updates)

    train_monitored_vars = list(monitored_vars)
    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)

    word_emb_RMS, = VariableFilter(name='word_emb_RMS')(cg)
    main_rnn_in_RMS, = VariableFilter(name='main_rnn_in_RMS')(cg)
    train_monitored_vars.extend([word_emb_RMS, main_rnn_in_RMS])

    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    # We use a completely random seed on purpose. With Fuel server
    # it's currently not possible to restore the state of the training
    # stream. That's why it's probably better to just have it stateless.
    stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None
    training_stream = data.get_stream('train',
                                      batch_size=c['batch_size'],
                                      max_length=c['max_length'],
                                      seed=stream_seed)
    valid_stream = data.get_stream('valid',
                                   batch_size=c['batch_size_valid'],
                                   max_length=c['max_length'],
                                   seed=stream_seed)
    # Keep a handle on the local stream: StartFuelServer receives it even
    # when the main loop reads from the server-backed proxy.
    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)

    validation = DataStreamMonitoring(monitored_vars,
                                      valid_stream,
                                      prefix="valid").set_conditions(
                                          before_first_epoch=not fast_start,
                                          on_resumption=True,
                                          every_n_batches=c['mon_freq_valid'])
    track_the_best = TrackTheBest(validation.record_name(perplexity),
                                  choose_best=min).set_conditions(
                                      on_resumption=True,
                                      after_epoch=True,
                                      every_n_batches=c['mon_freq_valid'])

    # Two checkpointing modes: the fast one stores only the training
    # state (log, iteration state, parameters) and avoids pickling the
    # entire main loop; the other one saves the main loop itself. Only
    # the load class, the target path and the Checkpoint keyword
    # arguments differ between them.
    if c['fast_checkpoint']:
        load_class = LoadNoUnpickling
        checkpoint_path = state_path
        cp_args = {
            'save_main_loop': False,
            'save_separately': ['log', 'iteration_state'],
            'parameters': saved_parameters
        }
    else:
        load_class = Load
        checkpoint_path = main_loop_path
        cp_args = {
            'save_separately': ['iteration_state'],
            'parameters': saved_parameters
        }

    load = load_class(checkpoint_path,
                      load_iteration_state=True,
                      load_log=True).set_conditions(
                          before_training=not new_training_job)
    checkpoint = Checkpoint(checkpoint_path,
                            before_training=not fast_start,
                            every_n_batches=c['save_freq_batches'],
                            after_training=not fast_start,
                            **cp_args)
    intermediate_cp = None
    if c['checkpoint_every_n_batches']:
        intermediate_cp = IntermediateCheckpoint(
            checkpoint_path,
            every_n_batches=c['checkpoint_every_n_batches'],
            after_training=False,
            **cp_args)

    # Additionally write to `best_tar_path` whenever TrackTheBest has
    # flagged the current log record.
    checkpoint = checkpoint.add_condition(
        ['after_batch', 'after_epoch'],
        OnLogRecord(track_the_best.notification_name), (best_tar_path, ))

    extensions = [
        load,
        StartFuelServer(original_training_stream,
                        stream_path,
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq_train'])
    ]

    if retrieval:
        extensions.append(
            RetrievalPrintStats(retrieval=retrieval,
                                every_n_batches=c['mon_freq_train'],
                                before_training=not fast_start))

    extensions.extend([
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']),
        validation, track_the_best, checkpoint
    ])
    if c['checkpoint_every_n_batches']:
        extensions.append(intermediate_cp)
    extensions.extend([
        DumpTensorflowSummaries(save_path,
                                every_n_batches=c['mon_freq_train'],
                                after_training=True),
        Printing(on_resumption=True, every_n_batches=c['mon_freq_train']),
        FinishIfNoImprovementAfter(track_the_best.notification_name,
                                   iterations=50 * c['mon_freq_valid'],
                                   every_n_batches=c['mon_freq_valid']),
        FinishAfter(after_n_batches=c['n_batches'])
    ])

    logger.info("monitored variables during training:" + "\n" +
                pprint.pformat(train_monitored_vars, width=120))
    logger.info("monitored variables during valid:" + "\n" +
                pprint.pformat(monitored_vars, width=120))

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
Example #16
0
# Stream over the test set, walked with a sequential scheme in batches of
# `bs` examples.
data_stream_test = DataStream.default_stream(
    data_test,
    iteration_scheme=SequentialScheme(data_test.num_examples, batch_size=bs))

# Optimiser settings. `n_epochs` is not referenced in this excerpt.
learning_rate = 0.0002
n_epochs = 100
algorithm = GradientDescent(
    cost=cost,
    parameters=cg.parameters,
    step_rule=CompositeRule([StepClipping(10.), Adam(learning_rate)]),
    on_unused_sources='ignore')

# Snapshot to restore, and the extension that writes predictions for the
# test stream; the latter is enabled only for the `before_training` hook.
load = Load('/home/xuehongyang/checkpoints_read/snapshot_10')
predictor = PredictDataStream(
    data_stream=data_stream_test,
    output_tensor=result,
    path='/home/xuehongyang/RESULT_READ',
    before_training=True,
    after_epoch=False,
    after_training=False)

main_loop = MainLoop(
    model=Model(cost),
    data_stream=data_stream_train,
    algorithm=algorithm,
    extensions=[
        Timing(),
        FinishAfter(after_n_epochs=1),
        load,
        predictor,
    ])

print('start prediction ...')
Example #17
0
 def test_load_nonexisting(self):
     """Run the Load extension's `do` against a missing archive path."""
     extension = Load('mynonexisting.tar')
     # Attach the fixture's main loop by hand, as the extension is not
     # registered through a MainLoop here.
     extension.main_loop = self.main_loop
     extension.do()
Example #18
0
 def test_load_nonexisting(self):
     """Fire `before_training` on a Load pointing at a missing archive."""
     extension = Load('mynonexisting.tar')
     # Attach the fixture's main loop by hand, as the extension is not
     # registered through a MainLoop here.
     extension.main_loop = self.main_loop
     extension.before_training()
Example #19
0
def main(config):
    """Build an attention-based LSTM encoder-decoder and train it.

    Parameters
    ----------
    config : dict
        Experiment settings. Keys read here: ``train_src``, ``dev_src``,
        ``test_src``, ``train_tgt``, ``dev_tgt`` (text files),
        ``embed_src``, ``hidden_src``, ``embed_tgt``, ``hidden_tgt``
        (layer sizes), ``dropout``, ``finish_after``, ``save_freq``,
        ``saveto``, ``reload``, ``step_clipping`` and ``step_rule``.
    """
    vocab_src, _ = text_to_dict([config['train_src'],
                                 config['dev_src'], config['test_src']])
    vocab_tgt, cabvo = text_to_dict([config['train_tgt'],
                                     config['dev_tgt']])

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    source_sentence.tag.test_value = [[13, 20, 0, 20, 0, 20, 0],
                                      [1, 4, 8, 4, 8, 4, 8]]
    source_sentence_mask.tag.test_value = [[0, 1, 0, 1, 0, 1, 0],
                                           [1, 0, 1, 0, 1, 0, 1]]
    target_sentence.tag.test_value = [[0, 1, 1, 5],
                                      [2, 0, 1, 0]]
    target_sentence_mask.tag.test_value = [[0, 1, 1, 0],
                                           [1, 1, 1, 0]]

    logger.info('Building RNN encoder-decoder')
    # --- Encoder ---
    embedder = LookupTable(
        length=len(vocab_src),
        dim=config['embed_src'],
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0.0),
        name='embedder')
    # Projects embeddings to 4 * hidden_src, the input width fed to the
    # LSTM below.
    transformer = Linear(
        config['embed_src'],
        config['hidden_src'] * 4,
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0.0),
        name='transformer')

    # Bias initial value: four consecutive chunks of size hidden_src, the
    # third one filled with 1.0 and the others with 0.0.
    # NOTE(review): presumably meant to bias one LSTM gate; confirm which
    # slice the third chunk maps to in the LSTM implementation.
    lstminit = np.asarray([0.0] * config['hidden_src'] +
                          [0.0] * config['hidden_src'] +
                          [1.0] * config['hidden_src'] +
                          [0.0] * config['hidden_src'])
    encoder = Bidirectional(
        LSTM(
            dim=config['hidden_src'],
            weights_init=IsotropicGaussian(0.01),
            biases_init=Constant(lstminit)),
        name='encoderBiLSTM')
    encoder.prototype.weights_init = Orthogonal()

    # --- Decoder ---
    lstminit = np.asarray([0.0] * config['hidden_tgt'] +
                          [0.0] * config['hidden_tgt'] +
                          [1.0] * config['hidden_tgt'] +
                          [0.0] * config['hidden_tgt'])
    transition = LSTM2GO(
        attended_dim=config['hidden_tgt'],
        dim=config['hidden_tgt'],
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(lstminit),
        name='decoderLSTM')

    attention = SequenceContentAttention(
        state_names=transition.apply.states,  # default activation is Tanh
        state_dims=[config['hidden_tgt']],
        # The bidirectional encoder yields 2 * hidden_src features.
        attended_dim=config['hidden_src'] * 2,
        match_dim=config['hidden_tgt'],
        name="attention")

    readout = Readout(
        source_names=['states',
                      'feedback',
                      attention.take_glimpses.outputs[0]],
        readout_dim=len(vocab_tgt),
        emitter=SoftmaxEmitter(name='emitter'),
        feedback_brick=LookupFeedback(
            num_outputs=len(vocab_tgt),
            feedback_dim=config['embed_tgt'],
            name='feedback'),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=config['hidden_tgt'],
                 name='softmax_bias').apply,
            Linear(input_dim=config['hidden_tgt'],
                   output_dim=config['embed_tgt'],
                   use_bias=False,
                   name='softmax0').apply,
            Linear(input_dim=config['embed_tgt'],
                   name='softmax1').apply]),
        merged_dim=config['hidden_tgt'])

    decoder = SequenceGenerator(
        readout=readout,
        transition=transition,
        attention=attention,
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0),
        name="generator",
        fork=Fork(
            [name for name in transition.apply.sequences if name != 'mask'],
            prototype=Linear()),
        add_contexts=True)
    decoder.transition.weights_init = Orthogonal()

    # Initialize model
    logger.info('Initializing model')
    embedder.initialize()
    transformer.initialize()
    encoder.initialize()
    decoder.initialize()

    # Apply model
    embedded = embedder.apply(source_sentence)
    transformed = transformer.apply(embedded)
    encoded = encoder.apply(transformed)[0]
    generated = decoder.generate(
        n_steps=2 * source_sentence.shape[1],
        batch_size=source_sentence.shape[0],
        attended=encoded.dimshuffle(1, 0, 2),
        attended_mask=tensor.ones(source_sentence.shape).T)
    # Single-argument call: prints the same text on Python 2 and 3.
    print('Generated:  {}'.format(generated))
    # generator_generate_outputs
    # samples = generated[1]  # For GRU
    samples = generated[2]  # For LSTM
    samples.name = 'samples'
    # samples_cost = generated[4]  # For GRU
    samples_cost = generated[5]  # For LSTM
    # Name the variable; the original rebound it to the string instead.
    samples_cost.name = 'sampling_cost'
    cost = decoder.cost(
        mask=target_sentence_mask.T,
        outputs=target_sentence.T,
        attended=encoded.dimshuffle(1, 0, 2),
        attended_mask=source_sentence_mask.T)
    cost.name = 'target_cost'
    cost.tag.aggregation_scheme = TakeLast(cost)
    model = Model(cost)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # apply dropout for regularization
    # NOTE(review): the graph returned by apply_dropout is used below only
    # for `cg.parameters`; the `cost` handed to GradientDescent is the
    # variable built before dropout. Confirm whether that is intended.
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        # str() first: a tuple rejects a non-empty format spec on Python 3.
        logger.info('\t{:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    printchildren(embedder, 1)
    printchildren(transformer, 1)
    printchildren(encoder, 1)
    printchildren(decoder, 1)

    # Training data
    train_stream = get_train_stream(
        config,
        [config['train_src']], [config['train_tgt']],
        vocab_src, vocab_tgt)
    dev_stream = get_dev_stream(
        [config['dev_src']], [config['dev_tgt']],
        vocab_src, vocab_tgt)
    test_stream = get_test_stream([config['test_src']], vocab_src)

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        ProgressBar(),
        TrainingDataMonitoring([cost],
                               prefix="tra",
                               after_batch=True),
        DataStreamMonitoring(variables=[cost],
                             data_stream=dev_stream,
                             prefix="dev",
                             after_batch=True),
        Sampler(
            model=Model(samples),
            data_stream=dev_stream,
            vocab=cabvo,
            saveto=config['saveto'] + 'dev',
            every_n_batches=config['save_freq']),
        Sampler(
            model=Model(samples),
            data_stream=test_stream,
            vocab=cabvo,
            saveto=config['saveto'] + 'test',
            after_n_batches=1,
            on_resumption=True,
            before_training=True),
        Plotter(saveto=config['saveto'], after_batch=True),
        Printing(after_batch=True),
        Checkpoint(
            path=config['saveto'],
            parameters=cg.parameters,
            save_main_loop=False,
            every_n_batches=config['save_freq'])]
    if BOKEH_AVAILABLE:
        # Register the plot; constructing it without appending left it
        # outside the main loop.
        extensions.append(
            Plot('Training cost', channels=[['target_cost']],
                 after_batch=True))
    if config['reload']:
        extensions.append(Load(path=config['saveto'],
                               load_iteration_state=False,
                               load_log=False))
    else:
        # Fresh run: create (or truncate) the companion '.txt' file.
        with open(config['saveto'] + '.txt', 'w'):
            pass

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # SECURITY NOTE: `config['step_rule']` is evaluated as Python code to
    # obtain the step-rule class; only use with trusted configurations.
    algorithm = GradientDescent(
        cost=cost,
        parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()]))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=model,
        algorithm=algorithm,
        data_stream=train_stream,
        extensions=extensions)
    main_loop.run()
Example #20
0
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh,
                   test_tag, use_load_ext, load_log, fast_start):
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result

    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(batch=True,
                                   prediction=prediction,
                                   prediction_mask=prediction_mask)
    labels, = VariableFilter(applications=[recognizer.cost], name='labels')(cg)
    labels_mask, = VariableFilter(applications=[recognizer.cost],
                                  name='labels_mask')(cg)

    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(rename(gain_matrix.min(), 'min_gain'))
        primary_observables.append(rename(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(recognizer.labels.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(applications=[r.generator.readout.readout],
                               name="output_0")(cost_cg)
    bottom_output = VariableFilter(
        # We need name_regex instead of name because LookupTable calls itsoutput output_0
        applications=[r.bottom.apply],
        name_regex="output")(cost_cg)[-1]
    attended, = VariableFilter(applications=[r.generator.transition.apply],
                               name="attended")(cost_cg)
    attended_mask, = VariableFilter(applications=[
        r.generator.transition.apply
    ],
                                    name="attended_mask")(cost_cg)
    weights, = VariableFilter(applications=[r.generator.evaluate],
                              name="weights")(cost_cg)

    from blocks.roles import AUXILIARY
    l2_cost, = VariableFilter(roles=[AUXILIARY],
                              theano_name='l2_cost_aux')(cost_cg)
    cost_forward, = VariableFilter(roles=[AUXILIARY],
                                   theano_name='costs_forward_aux')(cost_cg)

    max_recording_length = rename(bottom_output.shape[0],
                                  "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = rename(attended_mask.shape[0],
                                      "max_attended_mask_length")
    max_attended_length = rename(attended.shape[0], "max_attended_length")
    max_num_phonemes = rename(labels.shape[0], "max_num_phonemes")
    min_energy = rename(energies.min(), "min_energy")
    max_energy = rename(energies.max(), "max_energy")
    mean_attended = rename(abs(attended).mean(), "mean_attended")
    mean_bottom_output = rename(
        abs(bottom_output).mean(), "mean_bottom_output")
    weights_penalty = rename(monotonicity_penalty(weights, labels_mask),
                             "weights_penalty")
    weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy")
    mask_density = rename(labels_mask.mean(), "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy, min_energy, max_energy,
        mean_attended, mean_bottom_output, batch_size, max_num_phonemes,
        mask_density
    ])
    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config.get('regularization', dict())
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [
            p for p in cg.parameters if p not in attention_params
        ]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost + reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (
            train_cost + reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2)

    train_cost = rename(train_cost, 'train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0)
                or (reg_config.get("decay", .0) > 0)):
            logger.error('using  adaptive noise with alignment weight panalty '
                         'or weight decay is probably stupid')
        train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise(
            cg,
            cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=Model(
                regularized_cg.outputs[0]).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise'))
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0], 'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] + regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]
        ]  # model prior variance

    model = Model(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that dies not have
        # shapred variables added by adaptive weight noise
        with open(params, 'r') as src:
            param_values = load_parameters(src)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, parameters[key].get_value().shape)
                                for key in sorted(parameters.keys())],
                               width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'],
                                   train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(
            AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [
                v for v in maxnorm_subjects
                if not isinstance(get_brick(v), LookupTable)
            ]
        logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat(
            [name for name, p in parameters.items() if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([
            name for name, p in parameters.items() if not p in maxnorm_subjects
        ]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)
        ]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if not 'train_cost' in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold
    ]
    for name, param in parameters.items():
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements**0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements**0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm,
        clipping.threshold, max_recording_length, max_attended_length,
        max_attended_mask_length
    ]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty
    ]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(
                    rename(aggregation.mean(var, batch_size),
                           'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(
                    rename(aggregation.mean(var, labels_mask.sum()),
                           'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(
            Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(
        TrainingDataMonitoring(primary_observables + [l2_cost, cost_forward],
                               after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average",
        every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables +
                                   [l2_cost, cost_forward]),
        data.get_stream("valid", shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)
    per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search'])
    per_monitoring = DataStreamMonitoring(
        [per],
        data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(before_first_epoch=True,
                                                     after_epoch=True)
    extensions += [track_the_best_cost, track_the_best_per]
    extensions.append(
        AdaptiveClipping(algorithm.total_gradient_norm.name,
                         clipping,
                         train_conf['gradient_threshold'],
                         decay_rate=0.998,
                         burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf.get('num_batches'),
                    after_n_epochs=train_conf.get('num_epochs')).add_condition(
                        ["after_batch"], _gradient_norm_is_none),
    ]
    channels = [
        # Plot 1: training and validation costs
        [
            average_monitoring.record_name(train_cost),
            validation.record_name(cost)
        ],
        # Plot 2: gradient norm,
        [
            average_monitoring.record_name(algorithm.total_gradient_norm),
            average_monitoring.record_name(clipping.threshold)
        ],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [
            average_monitoring._record_name('weights_entropy_per_label'),
            validation._record_name('weights_entropy_per_label')
        ],
        # Plot 5: training and validation monotonicity penalty
        [
            average_monitoring._record_name('weights_penalty_per_recording'),
            validation._record_name('weights_penalty_per_recording')
        ]
    ]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),
        ]
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start,
                   after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True).add_condition(
                       ['after_epoch'],
                       OnLogRecord(track_the_best_per.notification_name),
                       (root_path + "_best" + extension, )).add_condition(
                           ['after_epoch'],
                           OnLogRecord(track_the_best_cost.notification_name),
                           (root_path + "_best_ll" + extension, )),
        ProgressBar()
    ]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))
    if config['net']['criterion']['name'].startswith('mse'):
        extensions.append(
            LogInputsGains(labels, cg, recognizer.generator.readout.emitter,
                           data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name
            ]
        extensions.append(Patience(**patience_conf))

    extensions.append(
        Printing(every_n_batches=1, attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
Example #21
0
from blocks.main_loop import MainLoop
from blocks.model import Model
from blocks.extensions.saveload import Load
import cPickle as pickle
from blocks.graph import ComputationGraph

config_dict = yaml.load(open(sys.argv[1], 'r'))
print config_dict

train, valid, alphabet = build_datasets(config_dict)
generator, cost = build_model(len(alphabet), config_dict)
algorithm = build_algorithm(generator, cost, config_dict)
extensions = build_extensions(cost, algorithm, valid, config_dict)
main_loop = MainLoop(algorithm=algorithm, data_stream=train,
                     model=Model(cost), extensions=extensions)
ml = Load(config_dict['checkpoint_path'], load_log=True)
print dir(ml)

ml.load_to(main_loop)
generator = main_loop.model.get_top_bricks()[-1]

sampler = ComputationGraph(generator.generate(
    n_steps=1000, batch_size=10, iterate=True)).get_theano_function()

samples = sampler()
outputs = samples[-2]
charset = pickle.load(open(config_dict['dict_path']))
new_charset = {}
for v in charset:
    new_charset[charset[v]] = v
charset = new_charset
Example #22
0
# Test stream: iterates `data_test` with a SequentialScheme over all of its
# examples in batches of `bs`.
data_stream_test = DataStream.default_stream(data_test,
                                             iteration_scheme=SequentialScheme(
                                                 data_test.num_examples,
                                                 batch_size=bs))

# Optimisation settings: step clipping at 10 composed with Adam.
learning_rate = 0.0002
n_epochs = 100
algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            on_unused_sources='ignore',
                            step_rule=CompositeRule([
                                StepClipping(10.),
                                Adam(learning_rate),
                            ]))
# NOTE(review): checkpoint and result locations are hard-coded absolute paths.
load = Load('/home/xuehongyang/checkpoints_watch/snapshot_12')
# Prediction extension over the test stream; configured to fire before
# training only (not after epochs, not after training).
predictor = PredictDataStream(data_stream=data_stream_test,
                              output_tensor=result,
                              path='/home/xuehongyang/RESULT_WATCH',
                              before_training=True,
                              after_epoch=False,
                              after_training=False)

# `load` is listed before `predictor` in the extension list.
# NOTE(review): presumably so the checkpoint is restored before predictions
# are written - confirm against the extension dispatch order.
main_loop = MainLoop(model=Model(cost),
                     data_stream=data_stream_train,
                     algorithm=algorithm,
                     extensions=[
                         Timing(),
                         FinishAfter(after_n_epochs=n_epochs), load, predictor
                     ])
Example #23
0
                            'question_features_reverse',
                            'visual_features',
                        ),
                        subset=slice(0, 32378 // bs * bs))

# Training stream: iterates `data_train` with a ShuffledScheme over all of its
# examples in batches of `bs`.
data_stream_train = DataStream.default_stream(data_train,
                                              iteration_scheme=ShuffledScheme(
                                                  data_train.num_examples,
                                                  batch_size=bs))

# Test stream: same batch size, but with a SequentialScheme.
data_stream_test = DataStream.default_stream(data_test,
                                             iteration_scheme=SequentialScheme(
                                                 data_test.num_examples,
                                                 batch_size=bs))

# NOTE(review): the checkpoint location is a hard-coded absolute path.
load = Load('/home/xuehongyang/checkpoints_seq/snapshot_22')
# Optimisation settings: step clipping at 10 composed with Adam.
learning_rate = 0.0002
n_epochs = 100
algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            on_unused_sources='ignore',
                            step_rule=CompositeRule([
                                StepClipping(10.),
                                Adam(learning_rate),
                            ]))

predictor = PredictDataStream(data_stream=data_stream_test,
                              output_tensor=result,
                              path='/home/xuehongyang/RESULT_BASE',
                              before_training=True,
                              after_epoch=False,
def train(step_rule, label_dim, state_dim, epochs, seed, dropout, test_cost,
          experiment_path, features, weight_noise, to_watch, patience,
          batch_size, batch_norm, **kwargs):
    """Build a CTC-trained LSTM on TIMIT and run the training main loop.

    Parameters as used by the body below:

    step_rule -- step rule handed to GradientDescent.
    label_dim -- the output Linear layer has ``label_dim + 1`` units.
    state_dim -- passed to TimLSTM and used as the output layer input size.
    epochs -- training stops after this many epochs (FinishAfter).
    seed -- seed of the numpy RandomState handed to the streams.
    test_cost -- when truthy, test-set monitors are added as well.
    features -- forwarded to the Timit dataset constructor.
    to_watch, patience -- forwarded to EarlyStopping.
    batch_size -- forwarded to construct_stream.
    batch_norm -- TimLSTM receives ``not batch_norm`` as its first argument.
    kwargs -- if it contains 'load_path', a Load extension is added for it.

    ``dropout``, ``experiment_path`` and ``weight_noise`` are accepted but
    not read by the active code in this function.

    The names ``phone_to_phoneme_dict`` and ``black_list`` are not defined
    here; they have to come from the enclosing module.

    Returns nothing; the function ends by calling ``main_loop.run()``.
    """

    print '.. TIMIT experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    # ------------------------------------------------------------------------
    # Streams

    # One RandomState is shared by all streams built below.
    rng = np.random.RandomState(seed)
    stream_args = dict(rng=rng, batch_size=batch_size)

    print '.. initializing iterators'
    train_dataset = Timit('train', features=features)
    train_stream = construct_stream(train_dataset, **stream_args)
    dev_dataset = Timit('dev', features=features)
    dev_stream = construct_stream(dev_dataset, **stream_args)
    test_dataset = Timit('test', features=features)
    test_stream = construct_stream(test_dataset, **stream_args)
    # NOTE(review): update_stream is built but not used later in this function.
    update_stream = construct_stream(train_dataset,
                                     n_batches=100,
                                     **stream_args)

    # Map each value of the dataset's phone dictionary through
    # phone_to_phoneme_dict when it has an entry there, then invert the result.
    phone_dict = train_dataset.get_phoneme_dict()
    phoneme_dict = {
        k: phone_to_phoneme_dict[v] if v in phone_to_phoneme_dict else v
        for k, v in phone_dict.iteritems()
    }
    ind_to_phoneme = {v: k for k, v in phoneme_dict.iteritems()}
    eol_symbol = ind_to_phoneme['<STOP>']

    # ------------------------------------------------------------------------
    # Graph

    print '.. building model'
    x = T.tensor3('features')
    y = T.matrix('phonemes')
    input_mask = T.matrix('features_mask')
    output_mask = T.matrix('phonemes_mask')

    # NOTE(review): this changes the process-wide Theano setting; test values
    # are still attached below even though their computation is switched off.
    theano.config.compute_test_value = 'off'
    x.tag.test_value = np.random.randn(100, 24, 123).astype(floatX)
    y.tag.test_value = np.ones((30, 24), dtype=floatX)
    input_mask.tag.test_value = np.ones((100, 24), dtype=floatX)
    output_mask.tag.test_value = np.ones((30, 24), dtype=floatX)

    # NOTE(review): seq_len and recurrent_init are assigned but never read.
    seq_len = 100
    input_dim = 123
    activation = Tanh()
    recurrent_init = IdentityInit(0.99)

    rec1 = TimLSTM(not batch_norm,
                   input_dim,
                   state_dim,
                   activation,
                   name='LSTM')
    rec1.initialize()
    l1 = Linear(state_dim,
                label_dim + 1,
                name='out_linear',
                weights_init=Orthogonal(),
                biases_init=Constant(0.0))
    l1.initialize()
    o1 = rec1.apply(x)
    y_hat_o = l1.apply(o1)

    # Softmax is applied over the last axis: flatten to 2D, apply, and restore
    # the original shape.
    shape = y_hat_o.shape
    y_hat = Softmax().apply(y_hat_o.reshape((-1, shape[-1]))).reshape(shape)

    y_mask = output_mask
    y_hat_mask = input_mask

    # ------------------------------------------------------------------------
    # Costs and Algorithm

    # The CTC op receives the pre-softmax outputs, the masks summed over axis
    # 0, and the labels shifted up by one.
    # NOTE(review): presumably index 0 is reserved for the CTC blank and the
    # mask sums are per-sequence lengths - confirm.
    ctc_cost = T.sum(
        ctc.cpu_ctc_th(y_hat_o, T.sum(y_hat_mask, axis=0), y + T.ones_like(y),
                       T.sum(y_mask, axis=0)))
    batch_cost = ctc_cost.copy(name='batch_cost')

    bs = y.shape[1]
    cost_train = aggregation.mean(batch_cost, bs).copy("sequence_cost")
    cost_per_character = aggregation.mean(
        batch_cost, output_mask.sum()).copy("character_cost")
    cg_train = ComputationGraph(cost_train)

    model = Model(cost_train)
    # NOTE(review): the numerator here is cost_train (already divided by bs),
    # whereas cost_per_character above uses batch_cost - confirm intended.
    train_cost_per_character = aggregation.mean(
        cost_train, output_mask.sum()).copy("train_character_cost")

    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters,
                                on_unused_sources='warn')

    # ------------------------------------------------------------------------
    # Monitoring and extensions

    # Besides the costs and the total gradient norm, monitor the L2 norm of
    # every parameter and of its gradient.
    parameters = model.get_parameter_dict()
    observed_vars = [
        cost_train, train_cost_per_character,
        aggregation.mean(algorithm.total_gradient_norm)
    ]
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name + "_grad_norm"))
    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

    dev_monitor = DataStreamMonitoring(
        variables=[cost_train, cost_per_character],
        data_stream=dev_stream,
        prefix="dev")
    train_ctc_monitor = CTCMonitoring(x,
                                      input_mask,
                                      y_hat,
                                      eol_symbol,
                                      train_stream,
                                      prefix='train',
                                      every_n_epochs=1,
                                      before_training=True,
                                      phoneme_dict=phoneme_dict,
                                      black_list=black_list,
                                      train=True)
    dev_ctc_monitor = CTCMonitoring(x,
                                    input_mask,
                                    y_hat,
                                    eol_symbol,
                                    dev_stream,
                                    prefix='dev',
                                    every_n_epochs=1,
                                    phoneme_dict=phoneme_dict,
                                    black_list=black_list)

    extensions = []
    # NOTE(review): only the presence of the key is checked; a caller passing
    # load_path=None would still get Load(None) - confirm callers.
    if 'load_path' in kwargs:
        extensions.append(Load(kwargs['load_path']))

    extensions.extend([
        FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor,
        train_ctc_monitor, dev_ctc_monitor
    ])

    if test_cost:
        test_monitor = DataStreamMonitoring(
            variables=[cost_train, cost_per_character],
            data_stream=test_stream,
            prefix="test")
        test_ctc_monitor = CTCMonitoring(x,
                                         input_mask,
                                         y_hat,
                                         eol_symbol,
                                         test_stream,
                                         prefix='test',
                                         every_n_epochs=1,
                                         phoneme_dict=phoneme_dict,
                                         black_list=black_list)
        extensions.append(test_monitor)
        extensions.append(test_ctc_monitor)

    # Disabled: creation of a 'best' model directory under experiment_path.
    # EarlyStopping is given '/dev/null' as its path argument instead.
    #if not os.path.exists(experiment_path):
    #    os.makedirs(experiment_path)
    #best_path = os.path.join(experiment_path, 'best/')
    #if not os.path.exists(best_path):
    #    os.mkdir(best_path)
    #best_path = os.path.join(best_path, 'model.bin')
    extensions.append(EarlyStopping(to_watch, patience, '/dev/null'))
    extensions.extend([ProgressBar(), Printing()])

    # ------------------------------------------------------------------------
    # Main Loop

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    print "Building time: %f" % (time.time() - t0)
    # Disabled: optional dump of predictions and targets on the dev stream.
    # if write_predictions:
    #     with open('predicted.txt', 'w') as f_pred:
    #         with open('targets.txt', 'w') as f_targets:
    #             evaluator = CTCEvaluator(
    #                 eol_symbol, x, input_mask, y_hat, phoneme_dict, black_list)
    #             evaluator.evaluate(dev_stream, file_pred=f_pred,
    #                                file_targets=f_targets)
    #     return
    main_loop.run()
Example #25
0
def evaluation(model_file_path,
               data_name='',
               modData="m3",
               gpuData=True,
               mTest=False):
    trainInd1 = model_file_path.find("_train_size_") + len("_train_size_")
    trainInd2 = model_file_path.find("_", trainInd1)
    train_size = float(model_file_path[trainInd1:trainInd2])
    transInd1 = model_file_path.find("_transitions_") + len("_transitions_")
    transInd2 = model_file_path.find("_", transInd1)
    transitions = int(model_file_path[transInd1:transInd2])
    transInd1 = model_file_path.find("_trial_") + len("_trial_")
    transInd2 = model_file_path.find("_", transInd1)
    trial = int(model_file_path[transInd1:transInd2])

    mPrefix = ""
    if "_m1_" in model_file_path and mTest:
        mPrefix = "_m1"
    elif modData == "m3":
        data_train = "data/" + data_name + "_m3_trial_" + str(
            trial) + "_train_size_" + str(train_size) + "_transitions_" + str(
                transitions)
        data_valid = "data/" + data_name + "_m3_trial_" + str(
            trial) + "_valid_size_" + str(train_size) + "_transitions_" + str(
                transitions)
        data_test = "data/" + data_name + "_m3_trial_" + str(
            trial) + "_test_size_" + str(train_size) + "_transitions_" + str(
                transitions)

    #ix_to_char, char_to_ix, vocab_size = get_metadata(data_train.replace("_train", ""))
    data_train = "data/" + data_name + mPrefix + "_trial_" + str(
        trial) + "_train_size_" + str(train_size) + "_transitions_" + str(
            transitions)
    data_valid = "data/" + data_name + mPrefix + "_trial_" + str(
        trial) + "_valid_size_" + str(train_size) + "_transitions_" + str(
            transitions)
    data_test = "data/" + data_name + mPrefix + "_trial_" + str(
        trial) + "_test_size_" + str(train_size) + "_transitions_" + str(
            transitions)

    print 'Loading model from {0}...'.format(model_file_path)
    main_loop = Load(model_file_path)
    #get validation cost
    print 'Model loaded. Building prediction function...'
    model = main_loop.model
    if gpuData:
        batch_index_To, batch_index_From = model.inputs
    else:
        y_mask, y_mask_o, y, x, x_mask, x_mask_o, y_mask_o_mask, x_mask_o_mask = model.inputs

    for var in model.variables:
        if var.name == 'linear_output':
            linear_output = var
        if var.name == 'y':
            y = var
        if var.name == 'y_mask':
            y_mask = var
        if var.name == 'y_mask_o':
            y_mask_o = var
        if var.name == 'y_mask_o_mask':
            y_mask_o_mask = var

    sharedMRRSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedTOTSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedSUMVARs = {
        'sharedMRRSUM': sharedMRRSUM,
        'sharedTOTSUM': sharedTOTSUM
    }

    y_mask_final = y_mask * y_mask_o * y_mask_o_mask
    constant1 = shared(np.float32(1.0))
    cost_int, ymasksum = RR_cost(y, linear_output, y_mask_final, constant1)

    #validation calculations
    fRR = theano.function(inputs=[
        theano.In(batch_index_From, borrow=True),
        theano.In(batch_index_To, borrow=True)
    ],
                          updates=[(sharedMRRSUM, sharedMRRSUM + cost_int),
                                   (sharedTOTSUM, sharedTOTSUM + ymasksum)])
    localShared = np.array(0.0, dtype=theano.config.floatX)

    return (evaluateREC(data_train, fRR, model, sharedSUMVARs, localShared),
            evaluateREC(data_valid, fRR, model, sharedSUMVARs, localShared),
            evaluateREC(data_test, fRR, model, sharedSUMVARs, localShared))
Example #26
0
def train_model(cost,
                unregularized_cost,
                updates,
                train_stream,
                valid_stream,
                args,
                gate_values=None):
    """Assemble the training algorithm and extensions, then run the main loop.

    `cost` is optionally replaced by a weight-noise version, optimised with
    GradientDescent using the step rule returned by `learning_algorithm(args)`,
    and monitored on the training data and on `valid_stream` together with
    `unregularized_cost`. `updates` are added to the algorithm and also used
    to reset the states they update. `gate_values` is accepted but not used.

    Raises Exception when `args.save_path` already exists, does not contain
    'test' and interactive mode is off. Returns nothing.
    """
    step_rule = learning_algorithm(args)
    graph = ComputationGraph(cost)

    # Regularization: apply weight noise when a positive level is requested.
    noise_level = args.weight_noise
    if noise_level > 0:
        noisy_graph = apply_noise(
            graph, VariableFilter(roles=[WEIGHT])(graph.variables),
            noise_level)
        cost = noisy_graph.outputs[0]
    # The cost carries this name whether or not noise was applied.
    cost.name = "cost_with_weight_noise"
    graph = ComputationGraph(cost)

    logger.info(graph.parameters)

    algorithm = GradientDescent(cost=cost,
                                step_rule=step_rule,
                                parameters=graph.parameters)
    # The extra updates carry the hidden state from batch to batch.
    algorithm.add_updates(updates)

    extensions = []

    # Either fine-tune from a dumped model or resume from it via Load.
    resuming = args.load_path is not None
    if resuming and args.fine_tuning:
        cost = fine_tuning(cost, args)
    elif resuming:
        extensions.append(Load(args.load_path))

    # Optional text generation after every batch.
    if args.generate:
        generator_extension = TextGenerationExtension(
            cost=cost,
            generation_length=args.generated_text_lenght,
            initial_text_length=args.initial_text_length,
            every_n_batches=1,
            ploting_path=os.path.join(args.save_path, 'prob_plot.png'),
            softmax_sampling=args.softmax_sampling,
            dataset=args.dataset,
            updates=updates,
            interactive_mode=args.interactive_mode)
        extensions.append(generator_extension)

    # Training and validation score monitoring.
    train_monitor = TrainingDataMonitoring(
        [cost], prefix='train', every_n_batches=args.monitoring_freq)
    valid_monitor = DataStreamMonitoring(
        [cost, unregularized_cost],
        valid_stream,
        args.mini_batch_size_valid,
        args.dataset,
        state_updates=updates,
        prefix='valid',
        before_first_epoch=(args.visualize is None),
        every_n_batches=args.monitoring_freq)
    extensions += [train_monitor, valid_monitor]

    # Prepare the save directory; an existing one is only tolerated when its
    # path contains 'test'.
    if not args.interactive_mode:
        save_dir_exists = os.path.exists(args.save_path)
        if save_dir_exists and 'test' not in args.save_path:
            raise Exception('Directory already exists')
        if save_dir_exists:
            print("Rewriting in " + args.save_path)
        else:
            os.makedirs(args.save_path)

    # Early stopping on the validation value of the unregularized cost.
    extensions.append(
        EarlyStopping('valid_' + unregularized_cost.name,
                      args.patience,
                      args.save_path,
                      every_n_batches=args.monitoring_freq))

    extensions.append(ProgressBar())
    extensions.append(Printing(every_n_batches=args.monitoring_freq))

    # Reset the carried states every batch for "sine", every 100 otherwise.
    reset_frequency = 1 if args.dataset == "sine" else 100
    state_variables = [state for state, _ in updates]
    extensions.append(
        ResetStates(state_variables, every_n_batches=reset_frequency))

    if args.interactive_mode:
        extensions.append(InteractiveMode())

    main_loop = MainLoop(model=Model(cost),
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
Example #27
0
def train_snli_model(new_training_job,
                     config,
                     save_path,
                     params,
                     fast_start,
                     fuel_server,
                     seed,
                     model='simple'):
    if config['exclude_top_k'] > config['num_input_words'] and config[
            'num_input_words'] > 0:
        raise Exception("Some words have neither word nor def embedding")
    c = config
    logger = configure_logger(name="snli_baseline_training",
                              log_file=os.path.join(save_path, "log.txt"))
    if not os.path.exists(save_path):
        logger.info("Start a new job")
        os.mkdir(save_path)
    else:
        logger.info("Continue an existing job")
    with open(os.path.join(save_path, "cmd.txt"), "w") as f:
        f.write(" ".join(sys.argv))

    # Make data paths nice
    for path in [
            'dict_path', 'embedding_def_path', 'embedding_path', 'vocab',
            'vocab_def', 'vocab_text'
    ]:
        if c.get(path, ''):
            if not os.path.isabs(c[path]):
                c[path] = os.path.join(fuel.config.data_path[0], c[path])

    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    main_loop_best_val_path = os.path.join(save_path, 'main_loop_best_val.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')

    # Save config to save_path
    json.dump(config, open(os.path.join(save_path, "config.json"), "w"))

    if model == 'simple':
        nli_model, data, used_dict, used_retrieval, _ = _initialize_simple_model_and_data(
            c)
    elif model == 'esim':
        nli_model, data, used_dict, used_retrieval, _ = _initialize_esim_model_and_data(
            c)
    else:
        raise NotImplementedError()

    # Compute cost
    s1, s2 = T.lmatrix('sentence1'), T.lmatrix('sentence2')

    if c['dict_path']:
        assert os.path.exists(c['dict_path'])
        s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix(
            'sentence2_def_map')
        def_mask = T.fmatrix("def_mask")
        defs = T.lmatrix("defs")
    else:
        s1_def_map, s2_def_map = None, None
        def_mask = None
        defs = None

    s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask')
    y = T.ivector('label')

    cg = {}
    for train_phase in [True, False]:
        # NOTE: Please don't change outputs of cg
        if train_phase:
            with batch_normalization(nli_model):
                pred = nli_model.apply(s1,
                                       s1_mask,
                                       s2,
                                       s2_mask,
                                       def_mask=def_mask,
                                       defs=defs,
                                       s1_def_map=s1_def_map,
                                       s2_def_map=s2_def_map,
                                       train_phase=train_phase)
        else:
            pred = nli_model.apply(s1,
                                   s1_mask,
                                   s2,
                                   s2_mask,
                                   def_mask=def_mask,
                                   defs=defs,
                                   s1_def_map=s1_def_map,
                                   s2_def_map=s2_def_map,
                                   train_phase=train_phase)

        cost = CategoricalCrossEntropy().apply(y.flatten(), pred)
        error_rate = MisclassificationRate().apply(y.flatten(), pred)
        cg[train_phase] = ComputationGraph([cost, error_rate])

    # Weight decay (TODO: Make it less bug prone)
    if model == 'simple':
        weights_to_decay = VariableFilter(
            bricks=[dense for dense, relu, bn in nli_model._mlp],
            roles=[WEIGHT])(cg[True].variables)
        weight_decay = np.float32(c['l2']) * sum(
            (w**2).sum() for w in weights_to_decay)
    elif model == 'esim':
        weight_decay = 0.0
    else:
        raise NotImplementedError()

    final_cost = cg[True].outputs[0] + weight_decay
    final_cost.name = 'final_cost'

    # Add updates for population parameters

    if c.get("bn", True):
        pop_updates = get_batch_normalization_updates(cg[True])
        extra_updates = [(p, m * 0.1 + p * (1 - 0.1)) for p, m in pop_updates]
    else:
        pop_updates = []
        extra_updates = []

    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            loaded_params = load_parameters(src)
            cg[True].set_parameter_values(loaded_params)
            for param, m in pop_updates:
                param.set_value(loaded_params[get_brick(
                    param).get_hierarchical_name(param)])

    if os.path.exists(os.path.join(save_path, "main_loop.tar")):
        logger.warning("Manually loading BN stats :(")
        with open(os.path.join(save_path, "main_loop.tar")) as src:
            loaded_params = load_parameters(src)

        for param, m in pop_updates:
            param.set_value(
                loaded_params[get_brick(param).get_hierarchical_name(param)])

    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', batch_size=4).get_epoch_iterator())
        s1.tag.test_value = test_value_data[0]
        s1_mask.tag.test_value = test_value_data[1]
        s2.tag.test_value = test_value_data[2]
        s2_mask.tag.test_value = test_value_data[3]
        y.tag.test_value = test_value_data[4]

    # Freeze embeddings
    if not c['train_emb']:
        frozen_params = [
            p for E in nli_model.get_embeddings_lookups() for p in E.parameters
        ]
        train_params = [p for p in cg[True].parameters]
        assert len(set(frozen_params) & set(train_params)) > 0
    else:
        frozen_params = []
    if not c.get('train_def_emb', 1):
        frozen_params_def = [
            p for E in nli_model.get_def_embeddings_lookups()
            for p in E.parameters
        ]
        train_params = [p for p in cg[True].parameters]
        assert len(set(frozen_params_def) & set(train_params)) > 0
        frozen_params += frozen_params_def
    train_params = [p for p in cg[True].parameters if p not in frozen_params]
    train_params_keys = [
        get_brick(p).get_hierarchical_name(p) for p in train_params
    ]

    # Optimizer
    algorithm = GradientDescent(cost=final_cost,
                                on_unused_sources='ignore',
                                parameters=train_params,
                                step_rule=Adam(learning_rate=c['lr']))
    algorithm.add_updates(extra_updates)
    m = Model(final_cost)

    parameters = m.get_parameter_dict()  # Blocks version mismatch
    logger.info("Trainable parameters" + "\n" +
                pprint.pformat([(key, parameters[key].get_value().shape)
                                for key in sorted(train_params_keys)],
                               width=120))
    logger.info("# of parameters {}".format(
        sum([
            np.prod(parameters[key].get_value().shape)
            for key in sorted(train_params_keys)
        ])))

    ### Monitored args ###
    train_monitored_vars = [final_cost] + cg[True].outputs
    monitored_vars = cg[False].outputs
    val_acc = monitored_vars[1]
    to_monitor_names = [
        'def_unk_ratio', 's1_merged_input_rootmean2', 's1_def_mean_rootmean2',
        's1_gate_rootmean2', 's1_compose_gate_rootmean2'
    ]
    for k in to_monitor_names:
        train_v, valid_v = VariableFilter(name=k)(
            cg[True]), VariableFilter(name=k)(cg[False])
        if len(train_v):
            logger.info("Adding {} tracking".format(k))
            train_monitored_vars.append(train_v[0])
            monitored_vars.append(valid_v[0])
        else:
            logger.warning("Didnt find {} in cg".format(k))

    if c['monitor_parameters']:
        for name in train_params_keys:
            param = parameters[name]
            num_elements = numpy.product(param.get_value().shape)
            norm = param.norm(2) / num_elements
            grad_norm = algorithm.gradients[param].norm(2) / num_elements
            step_norm = algorithm.steps[param].norm(2) / num_elements
            stats = tensor.stack(norm, grad_norm, step_norm,
                                 step_norm / grad_norm)
            stats.name = name + '_stats'
            train_monitored_vars.append(stats)

    regular_training_stream = data.get_stream('train',
                                              batch_size=c['batch_size'],
                                              seed=seed)

    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=regular_training_stream.sources,
            hwm=100,
            produces_examples=regular_training_stream.produces_examples)
    else:
        training_stream = regular_training_stream

    ### Build extensions ###

    extensions = [
        # Load(main_loop_path, load_iteration_state=True, load_log=True)
        #     .set_conditions(before_training=not new_training_job),
        StartFuelServer(regular_training_stream,
                        stream_path,
                        hwm=100,
                        script_path=os.path.join(
                            os.path.dirname(__file__),
                            "../bin/start_fuel_server.py"),
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq']),
        ProgressBar(),
        RetrievalPrintStats(retrieval=used_retrieval,
                            every_n_batches=c['mon_freq_valid'],
                            before_training=not fast_start),
        Timestamp(),
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq']),
    ]

    if c['layout'] == 'snli':
        validation = DataStreamMonitoring(monitored_vars,
                                          data.get_stream('valid',
                                                          batch_size=14,
                                                          seed=seed),
                                          before_training=not fast_start,
                                          on_resumption=True,
                                          after_training=True,
                                          every_n_batches=c['mon_freq_valid'],
                                          prefix='valid')
        extensions.append(validation)
    elif c['layout'] == 'mnli':
        validation = DataStreamMonitoring(monitored_vars,
                                          data.get_stream('valid_matched',
                                                          batch_size=14,
                                                          seed=seed),
                                          every_n_batches=c['mon_freq_valid'],
                                          on_resumption=True,
                                          after_training=True,
                                          prefix='valid_matched')
        validation_mismatched = DataStreamMonitoring(
            monitored_vars,
            data.get_stream('valid_mismatched', batch_size=14, seed=seed),
            every_n_batches=c['mon_freq_valid'],
            before_training=not fast_start,
            on_resumption=True,
            after_training=True,
            prefix='valid_mismatched')
        extensions.extend([validation, validation_mismatched])
    else:
        raise NotImplementedError()

    # Similarity trackers for embeddings
    if len(c.get('vocab_def', '')):
        retrieval_vocab = Vocabulary(c['vocab_def'])
    else:
        retrieval_vocab = data.vocab

    retrieval_all = Retrieval(vocab_text=retrieval_vocab,
                              dictionary=used_dict,
                              max_def_length=c['max_def_length'],
                              exclude_top_k=0,
                              max_def_per_word=c['max_def_per_word'])

    for name in [
            's1_word_embeddings', 's1_dict_word_embeddings',
            's1_translated_word_embeddings'
    ]:
        variables = VariableFilter(name=name)(cg[False])
        if len(variables):
            s1_emb = variables[0]
            logger.info("Adding similarity tracking for " + name)
            # A bit sloppy about downcast

            if "dict" in name:
                embedder = construct_dict_embedder(theano.function(
                    [s1, defs, def_mask, s1_def_map],
                    s1_emb,
                    allow_input_downcast=True),
                                                   vocab=data.vocab,
                                                   retrieval=retrieval_all)
                extensions.append(
                    SimilarityWordEmbeddingEval(
                        embedder=embedder,
                        prefix=name,
                        every_n_batches=c['mon_freq_valid'],
                        before_training=not fast_start))
            else:
                embedder = construct_embedder(theano.function(
                    [s1], s1_emb, allow_input_downcast=True),
                                              vocab=data.vocab)
                extensions.append(
                    SimilarityWordEmbeddingEval(
                        embedder=embedder,
                        prefix=name,
                        every_n_batches=c['mon_freq_valid'],
                        before_training=not fast_start))

    track_the_best = TrackTheBest(validation.record_name(val_acc),
                                  before_training=not fast_start,
                                  every_n_epochs=c['save_freq_epochs'],
                                  after_training=not fast_start,
                                  every_n_batches=c['mon_freq_valid'],
                                  choose_best=min)
    extensions.append(track_the_best)

    # Special care for serializing embeddings
    if len(c.get('embedding_path', '')) or len(c.get('embedding_def_path',
                                                     '')):
        extensions.insert(
            0,
            LoadNoUnpickling(main_loop_path,
                             load_iteration_state=True,
                             load_log=True).set_conditions(
                                 before_training=not new_training_job))
        extensions.append(
            Checkpoint(main_loop_path,
                       parameters=train_params + [p for p, m in pop_updates],
                       save_main_loop=False,
                       save_separately=['log', 'iteration_state'],
                       before_training=not fast_start,
                       every_n_epochs=c['save_freq_epochs'],
                       after_training=not fast_start).add_condition(
                           ['after_batch', 'after_epoch'],
                           OnLogRecord(track_the_best.notification_name),
                           (main_loop_best_val_path, )))
    else:
        extensions.insert(
            0,
            Load(main_loop_path, load_iteration_state=True,
                 load_log=True).set_conditions(
                     before_training=not new_training_job))
        extensions.append(
            Checkpoint(main_loop_path,
                       parameters=cg[True].parameters +
                       [p for p, m in pop_updates],
                       before_training=not fast_start,
                       every_n_epochs=c['save_freq_epochs'],
                       after_training=not fast_start).add_condition(
                           ['after_batch', 'after_epoch'],
                           OnLogRecord(track_the_best.notification_name),
                           (main_loop_best_val_path, )))

    extensions.extend([
        DumpCSVSummaries(save_path,
                         every_n_batches=c['mon_freq_valid'],
                         after_training=True),
        DumpTensorflowSummaries(save_path,
                                after_epoch=True,
                                every_n_batches=c['mon_freq_valid'],
                                after_training=True),
        Printing(every_n_batches=c['mon_freq_valid']),
        PrintMessage(msg="save_path={}".format(save_path),
                     every_n_batches=c['mon_freq']),
        FinishAfter(after_n_batches=c['n_batches']).add_condition(
            ['after_batch'],
            OnLogStatusExceed('iterations_done', c['n_batches']))
    ])

    logger.info(extensions)

    ### Run training ###

    if "VISDOM_SERVER" in os.environ:
        print("Running visdom server")
        ret = subprocess.Popen([
            os.path.join(os.path.dirname(__file__), "../visdom_plotter.py"),
            "--visdom-server={}".format(os.environ['VISDOM_SERVER']),
            "--folder={}".format(save_path)
        ])
        time.sleep(0.1)
        if ret.returncode is not None:
            raise Exception()
        atexit.register(lambda: os.kill(ret.pid, signal.SIGINT))

    model = Model(cost)
    for p, m in pop_updates:
        model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=model,
                         extensions=extensions)

    assert os.path.exists(save_path)
    main_loop.run()
Example #28
0
def main(save_to, num_epochs,
         weight_decay=0.0001, noise_pressure=0, subset=None, num_batches=None,
         batch_size=None, histogram=None, resume=False):
    """Train the network built by ``create_res_net`` on CIFAR-10.

    Two graphs are built from the same network: a plain "test" graph and a
    "train" graph applied under ``batch_normalization`` and
    ``training_noise``.  An L2 penalty on the non-mask weights is added to
    both costs; the training cost additionally gets a penalty proportional
    to the mean nit rate.  After the main loop finishes, its log is written
    to ``execution-log.json``.

    Parameters
    ----------
    save_to : str
        Path handed to ``EpochCheckpoint``.  With every ``'.%d'`` removed it
        is also used in the plot titles and as the path given to ``Load``
        when resuming.
    num_epochs : int
        Forwarded to ``FinishAfter(after_n_epochs=...)``.
    weight_decay : float
        Multiplier of the L2 norm of the non-mask weights.
    noise_pressure : float
        Initial value of the shared variable ``nit_penalty`` that multiplies
        the mean nit rate in the training cost.
    subset
        Not used by this function.
    num_batches : int or None
        Forwarded to ``FinishAfter(after_n_batches=...)``.
    batch_size : int
        Batch size of the training ``ShuffledScheme``.
        NOTE(review): the default ``None`` is passed through unchanged;
        callers presumably always supply a value -- confirm.
    histogram : str or None
        When truthy, an ``AttributionExtension`` is added and its result is
        saved with ``save_attributions(..., filename=histogram)``.
    resume : bool
        When truthy, a ``Load`` extension for the experiment name is added.
    """
    output_size = 10

    prior_noise_level = -10
    noise_step_rule = Scale(1e-6)
    noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX))
    convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True,
                             noise_rate=noise_rate,
                             prior_noise_level=prior_noise_level)

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Test graph: the network applied outside the batch-normalization and
    # training-noise contexts.
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
                 .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                       .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                      .copy(name='confusion'))
    # The confusion matrix is summed over batches when aggregated.
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Training graph: same network, applied with batch normalization and
    # training noise switched on.
    with batch_normalization(convnet):
        with training_noise(convnet):
            train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                  .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(
        y.flatten(), train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(
        y.flatten(), train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                                 train_error_rate, train_components])

    # Population statistics follow the minibatch statistics through an
    # exponential moving average with coefficient ``bn_alpha``.
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                     for p, m in population_updates]

    # Shared so that the penalty can be changed (annealed) during training.
    nit_penalty = theano.shared(
        numpy.asarray(noise_pressure, dtype=theano.config.floatX))
    nit_penalty.name = 'nit_penalty'

    # Compute noise rates for the training graph
    train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables)
    train_mean_log_sigma = tensor.concatenate(
        [n.flatten() for n in train_logsigma]).mean()
    train_mean_log_sigma.name = 'mean_log_sigma'
    train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables)
    train_nit_rate = tensor.concatenate(
        [n.flatten() for n in train_nits]).mean()
    train_nit_rate.name = 'nit_rate'
    train_nit_regularization = nit_penalty * train_nit_rate
    train_nit_regularization.name = 'nit_regularization'

    # Parameter groups used for regularization and for the step rule
    trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])(
        train_cg.parameters)
    mask_parameters = [p for p in trainable_parameters
                       if get_brick(p).name == 'mask']
    noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    nonmask_weights = [p for p in weights if get_brick(p).name != 'mask']
    l2_norm = sum([(W ** 2).sum() for W in nonmask_weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = weight_decay * l2_norm
    l2_regularization.name = 'l2_regularization'

    # Test version of the cost
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of the cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + l2_regularization + train_nit_regularization
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
            which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 128
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test,
        iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # ``noise_step_rule`` is restricted to the mask parameters and composed
    # with the momentum rule, giving those parameters their own scale.
    scale_mask = Restrict(noise_step_rule, mask_parameters)
    step_rule = CompositeRule([scale_mask, momentum])

    # Gradient descent with the composite step rule; the batch-normalization
    # population updates run alongside every step.
    algorithm = GradientDescent(
        cost=train_cost, parameters=trainable_parameters,
        step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    exp_name = save_to.replace('.%d', '')

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),     # Warm up with 0.01 learning rate
                      (50, 0.1),     # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001),
                  ]),
                  EpochSchedule(noise_step_rule.learning_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1),
                  ]),
                  EpochSchedule(noise_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1),
                  ]),
                  NoiseExtension(
                      noise_parameters=noise_parameters),
                  NoisyDataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      noise_parameters=noise_parameters,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate, train_nit_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       train_nit_regularization,
                       momentum.learning_rate,
                       train_mean_log_sigma,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                  Plot('Training performance for ' + exp_name,
                       channels=[
                           ['train_cost_with_regularization',
                            'train_cost_without_regularization',
                            'train_nit_regularization',
                            'train_l2_regularization'],
                           ['train_error_rate'],
                           ['train_total_gradient_norm'],
                           ['train_mean_log_sigma'],
                       ],
                       every_n_batches=17),
                  Plot('Test performance for ' + exp_name,
                       channels=[[
                           'train_error_rate',
                           'test_error_rate',
                       ]],
                       after_epoch=True),
                  EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            # ``train_cg`` is the graph ``train_components`` belongs to; no
            # plain ``cg`` exists in this function.
            parameters=train_cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(exp_name, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
Example #29
0
# Stream over the test set, visited sequentially in batches of ``bs``.
prediction_scheme = SequentialScheme(data_test.num_examples, batch_size=bs)
data_stream_test = DataStream.default_stream(
    data_test, iteration_scheme=prediction_scheme)

learning_rate = 0.0002
n_epochs = 100

# Step clipping followed by Adam, in that order.
clipped_adam = CompositeRule([StepClipping(10.), Adam(learning_rate)])
algorithm = GradientDescent(
    cost=cost,
    parameters=cg.parameters,
    step_rule=clipped_adam,
)

print('..loading...')
load = Load('/home/xuehongyang/checkpoints_open/snapshot_18')

# The predictor is enabled only for the ``before_training`` callback.
predictor = PredictDataStream(
    data_stream=data_stream_test,
    output_tensor=result,
    path='/home/xuehongyang/RESULT_MAIN',
    before_training=True,
    after_epoch=False,
    after_training=False,
)

prediction_model = Model(cost)
prediction_extensions = [
    Timing(),
    FinishAfter(after_n_epochs=1),
    load,
    predictor,
]
main_loop = MainLoop(
    model=prediction_model,
    data_stream=data_stream_train,
    algorithm=algorithm,
    extensions=prediction_extensions,
)

print('start prediction ...')
main_loop.run()
Example #30
0
def main(save_to, num_epochs,
         regularization=0.0003, subset=None, num_batches=None,
         histogram=None, resume=False):
    """Train the network built by ``create_lenet_5`` on MNIST.

    The cost is categorical cross-entropy plus ``regularization`` times the
    summed squares of all WEIGHT variables.  Training uses AdaDelta with
    batches of 500 examples.  Once the main loop returns, its log is written
    to ``execution-log.json``.

    Parameters
    ----------
    save_to : str
        Path given to ``Checkpoint`` and, when resuming, to ``Load``.
    num_epochs : int
        Forwarded to ``FinishAfter(after_n_epochs=...)``.
    regularization : float
        Multiplier of the L2 norm added to the cost.
    subset : int or None
        When truthy, train only on ``subset`` consecutive examples centred
        on index 30000.
    num_batches : int or None
        Forwarded to ``FinishAfter(after_n_batches=...)``.
    histogram : str or None
        When truthy, an ``AttributionExtension`` is added and its result is
        saved with ``save_attributions(..., filename=histogram)``.
    resume : bool
        When truthy, a ``Load`` extension for ``save_to`` is added.
    """
    batch_size = 500
    output_size = 10
    convnet = create_lenet_5()
    layers = convnet.layers

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Class probabilities and the quantities derived from them
    probs = convnet.apply(x)
    cross_entropy = CategoricalCrossEntropy().apply(y.flatten(), probs)
    cost = cross_entropy.copy(name='cost')
    per_component = ComponentwiseCrossEntropy().apply(y.flatten(), probs)
    components = per_component.copy(name='components')
    misclassification = MisclassificationRate().apply(y.flatten(), probs)
    error_rate = misclassification.copy(name='error_rate')
    confusion_matrix = ConfusionMatrix().apply(y.flatten(), probs)
    confusion = confusion_matrix.copy(name='confusion')
    # Confusion counts are summed over batches when aggregated.
    confusion.tag.aggregation_scheme = Sum(confusion)

    cg = ComputationGraph([cost, error_rate, components])

    # L2 penalty over every WEIGHT variable in the graph
    weight_vars = VariableFilter(roles=[WEIGHT])(cg.variables)
    l2_norm = sum((w ** 2).sum() for w in weight_vars)
    l2_norm.name = 'l2_norm'
    cost = cost + regularization * l2_norm
    cost.name = 'cost_with_regularization'

    # Training data: optionally a window of ``subset`` examples
    train_kwargs = {}
    if subset:
        first = 30000 - subset // 2
        train_kwargs['subset'] = slice(first, first + subset)
    train_set = MNIST(("train",), **train_kwargs)
    train_scheme = ShuffledScheme(train_set.num_examples, batch_size)
    train_stream = DataStream.default_stream(
        train_set, iteration_scheme=train_scheme)

    test_set = MNIST(("test",))
    test_scheme = ShuffledScheme(test_set.num_examples, batch_size)
    test_stream = DataStream.default_stream(
        test_set, iteration_scheme=test_scheme)

    # Gradient descent driven by the AdaDelta step rule
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=AdaDelta(decay_rate=0.99))

    # Extensions are created in the same order in which they are listed.
    timing = Timing()
    finish = FinishAfter(after_n_epochs=num_epochs,
                         after_n_batches=num_batches)
    test_monitor = DataStreamMonitoring(
        [cost, error_rate, confusion],
        test_stream,
        prefix="test")
    train_monitor = TrainingDataMonitoring(
        [cost, error_rate, l2_norm,
         aggregation.mean(algorithm.total_gradient_norm)],
        prefix="train",
        after_epoch=True)
    extensions = [timing, finish, test_monitor, train_monitor,
                  Checkpoint(save_to), ProgressBar(), Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=components,
            parameters=cg.parameters,
            components_size=output_size,
            after_batch=True)
        # Attribution goes ahead of every other extension.
        extensions[:0] = [attribution]

    if resume:
        extensions += [Load(save_to, True, True)]

    main_loop = MainLoop(
        algorithm,
        train_stream,
        model=Model(cost),
        extensions=extensions)
    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as log_file:
        json.dump(main_loop.log, log_file, cls=NumpyEncoder)
Example #31
0
def main(save_to,
         num_epochs,
         regularization=0.001,
         subset=None,
         num_batches=None,
         batch_size=None,
         histogram=None,
         resume=False):
    """Train the network built by ``create_all_conv_net`` on CIFAR-10.

    A "test" graph is built first; the "train" graph is derived from it by
    applying 0.5 dropout to the convolutional outputs whose names match
    ``^conv_[25]_apply_output$``.  An L2 penalty on the WEIGHT variables is
    added to both costs.  After the main loop finishes, its log is written
    to ``execution-log.json``.

    Parameters
    ----------
    save_to : str
        Path given to ``Checkpoint`` and, when resuming, to ``Load``; also
        used in the plot titles.
    num_epochs : int
        Forwarded to ``FinishAfter(after_n_epochs=...)``.
    regularization : float
        Multiplier of the L2 norm added to the costs.
    subset
        Not used by this function.
    num_batches : int or None
        Forwarded to ``FinishAfter(after_n_batches=...)``.
    batch_size : int
        Batch size of the training ``ShuffledScheme``.
        NOTE(review): the default ``None`` is passed through unchanged;
        callers presumably always supply a value -- confirm.
    histogram : str or None
        When truthy, an ``AttributionExtension`` is added and its result is
        saved with ``save_attributions(..., filename=histogram)``.
    resume : bool
        When truthy, a ``Load`` extension for ``save_to`` is added.
    """
    output_size = 10
    convnet = create_all_conv_net()

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Test graph: the network applied without dropout.
    probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(),
                                                 probs).copy(name='cost'))
    test_components = (ComponentwiseCrossEntropy().apply(
        y.flatten(), probs).copy(name='components'))
    test_error_rate = (MisclassificationRate().apply(
        y.flatten(), probs).copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(),
                                              probs).copy(name='confusion'))
    # The confusion matrix is summed over batches when aggregated.
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate, test_components])

    # Training graph: the test graph with 0.5 dropout on the selected
    # convolutional outputs.
    dropout_vars = VariableFilter(
        roles=[OUTPUT],
        bricks=[Convolutional],
        theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    train_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    train_cost, train_error_rate, train_components = train_cg.outputs

    # Apply regularization to the cost
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    l2_norm = sum([(W**2).sum() for W in weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = regularization * l2_norm
    l2_regularization.name = 'l2_regularization'
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + l2_regularization
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train", ))
    cifar10_train_stream = NormalizeBatchLevels(
        DataStream.default_stream(
            cifar10_train,
            iteration_scheme=ShuffledScheme(cifar10_train.num_examples,
                                            batch_size)),
        which_sources=('features', ))

    test_batch_size = 1000
    cifar10_test = CIFAR10(("test", ))
    cifar10_test_stream = NormalizeBatchLevels(
        DataStream.default_stream(
            cifar10_test,
            iteration_scheme=ShuffledScheme(cifar10_test.num_examples,
                                            test_batch_size)),
        which_sources=('features', ))

    # The momentum rule is used on its own; its learning rate is rescheduled
    # per epoch by the ``EpochSchedule`` extension below.
    momentum = Momentum(0.002, 0.9)
    step_rule = momentum

    algorithm = GradientDescent(cost=train_cost,
                                parameters=train_cg.parameters,
                                step_rule=step_rule)

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
        EpochSchedule(momentum.learning_rate, [(1, 0.005), (3, 0.01),
                                               (5, 0.02), (200, 0.002),
                                               (250, 0.0002), (300, 0.00002)]),
        DataStreamMonitoring([test_cost, test_error_rate, test_confusion],
                             cifar10_test_stream,
                             prefix="test"),
        TrainingDataMonitoring(
            [
                train_cost, train_error_rate,
                train_cost_without_regularization, l2_regularization,
                momentum.learning_rate,
                aggregation.mean(algorithm.total_gradient_norm)
            ],
            prefix="train",
            every_n_batches=10),
        Plot('Training performance for ' + save_to,
             channels=[
                 [
                     'train_cost_with_regularization',
                     'train_cost_without_regularization',
                     'train_l2_regularization'
                 ],
                 ['train_error_rate'],
                 ['train_total_gradient_norm'],
             ],
             every_n_batches=10),
        Plot('Test performance for ' + save_to,
             channels=[[
                 'train_error_rate',
                 'test_error_rate',
             ]],
             after_epoch=True),
        Checkpoint(save_to),
        ProgressBar(),
        Printing()
    ]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            # ``train_cg`` is the graph ``train_components`` belongs to; no
            # plain ``cg`` exists in this function.
            parameters=train_cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(save_to, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(algorithm,
                         cifar10_train_stream,
                         model=model,
                         extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)