Example #1
def test_track_the_best():
    main_loop = MockMainLoop()
    extension = TrackTheBest("cost")
    extension.main_loop = main_loop

    main_loop.status["epochs_done"] += 1
    main_loop.status["iterations_done"] += 10
    main_loop.log.current_row["cost"] = 5
    extension.dispatch("after_epoch")
    assert main_loop.status["best_cost"] == 5
    assert main_loop.log.current_row["cost_best_so_far"]

    main_loop.status["epochs_done"] += 1
    main_loop.status["iterations_done"] += 10
    main_loop.log.current_row["cost"] = 6
    extension.dispatch("after_epoch")
    assert main_loop.status["best_cost"] == 5
    assert main_loop.log.current_row.get("cost_best_so_far", None) is None

    main_loop.status["epochs_done"] += 1
    main_loop.status["iterations_done"] += 10
    main_loop.log.current_row["cost"] = 5
    extension.dispatch("after_epoch")
    assert main_loop.status["best_cost"] == 5
    assert main_loop.log.current_row.get("cost_best_so_far", None) is None

    main_loop.status["epochs_done"] += 1
    main_loop.status["iterations_done"] += 10
    main_loop.log.current_row["cost"] = 4
    extension.dispatch("after_epoch")
    assert main_loop.status["best_cost"] == 4
    assert main_loop.log.current_row["cost_best_so_far"]
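The test above only touches a tiny surface of the mock: a status dict shared with the log, a current_row mapping record names to values, and the dispatch method every blocks extension inherits. A minimal stand-in could look like this (a sketch; FakeLog and FakeMainLoop are hypothetical names, not the MockMainLoop from the blocks test suite):

class FakeLog(object):
    # One shared status dict plus the record row for the current iteration.
    def __init__(self):
        self.status = {'epochs_done': 0, 'iterations_done': 0}
        self.current_row = {}

class FakeMainLoop(object):
    # TrackTheBest reads main_loop.log.current_row and records the best
    # value seen so far under status['best_<record_name>'].
    def __init__(self):
        self.log = FakeLog()
        self.status = self.log.status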
Example #2
def test_track_the_best():
    main_loop = MockMainLoop()
    extension = TrackTheBest("cost")
    extension.main_loop = main_loop

    main_loop.status['iterations_done'] += 1
    main_loop.log.current_row['cost'] = 5
    extension.dispatch('after_batch')
    assert main_loop.status['best_cost'] == 5
    assert main_loop.log.current_row['cost_best_so_far']

    main_loop.status['iterations_done'] += 1
    main_loop.log.current_row['cost'] = 6
    extension.dispatch('after_batch')
    assert main_loop.status['best_cost'] == 5
    assert main_loop.log.current_row.get('cost_best_so_far', None) is None

    main_loop.status['iterations_done'] += 1
    main_loop.log.current_row['cost'] = 5
    extension.dispatch('after_batch')
    assert main_loop.status['best_cost'] == 5
    assert main_loop.log.current_row.get('cost_best_so_far', None) is None

    main_loop.status['iterations_done'] += 1
    main_loop.log.current_row['cost'] = 4
    extension.dispatch('after_batch')
    assert main_loop.status['best_cost'] == 4
    assert main_loop.log.current_row['cost_best_so_far']
Example #3
def test_save_the_best():
    with NamedTemporaryFile(dir=config.temp_dir) as dst,\
            NamedTemporaryFile(dir=config.temp_dir) as dst_best:
        track_cost = TrackTheBest("cost", after_epoch=False, after_batch=True)
        main_loop = MockMainLoop(
            extensions=[FinishAfter(after_n_epochs=1),
                        WriteCostExtension(),
                        track_cost,
                        Checkpoint(dst.name, after_batch=True,
                                   save_separately=['log'])
                        .add_condition(
                            ["after_batch"],
                            OnLogRecord(track_cost.notification_name),
                            (dst_best.name,))])
        main_loop.run()

        assert main_loop.log[4]['saved_to'] == (dst.name, dst_best.name)
        assert main_loop.log[5]['saved_to'] == (dst.name, dst_best.name)
        assert main_loop.log[6]['saved_to'] == (dst.name,)
        with open(dst_best.name, 'rb') as src:
            assert load(src).log.status['iterations_done'] == 5
        root, ext = os.path.splitext(dst_best.name)
        log_path = root + "_log" + ext
        with open(log_path, 'rb') as src:
            assert cPickle.load(src).status['iterations_done'] == 5
Example #4
def track_best(channel, save_path):
    tracker = TrackTheBest(channel, choose_best=min)
    checkpoint = saveload.Checkpoint(save_path,
                                     after_training=False,
                                     use_cpickle=True)
    checkpoint.add_condition(["after_epoch"],
                             predicate=predicates.OnLogRecord(
                                 '{0}_best_so_far'.format(channel)))
    return [tracker, checkpoint]
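The helper returns the tracker ahead of the checkpoint, and that order matters: TrackTheBest must write the '<channel>_best_so_far' record before OnLogRecord inspects it (compare the "after TrackTheBest!" comment in Example #13). A hedged usage sketch follows; `monitors`, `model`, `algorithm`, and `train_stream` are assumed to exist, and 'valid_cost' must be a record some monitoring extension writes to the log:

from blocks.main_loop import MainLoop

# `monitors` is assumed to contain a DataStreamMonitoring that writes the
# 'valid_cost' record before the tracker runs.
extensions = monitors + track_best('valid_cost', 'best_model.tar')
main_loop = MainLoop(model=model, data_stream=train_stream,
                     algorithm=algorithm, extensions=extensions)
main_loop.run()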
Example #5
    def track_best(self, channel, save_path=None, choose_best=min):
        tracker = TrackTheBest(channel, choose_best=choose_best)
        self.extensions.append(tracker)
        if save_path:
            checkpoint = saveload.Checkpoint(save_path,
                                             after_training=False,
                                             use_cpickle=True)
            checkpoint.add_condition(["after_epoch"],
                                     predicate=predicates.OnLogRecord(
                                         '{0}_best_so_far'.format(channel)))
            self.extensions.append(checkpoint)
Example #6
def track_best(channel, save_path, last_path, num_epochs, maxEpochs,
               maxIterations, epsilon, tempSharedData):
    tracker = TrackTheBest(channel)
    finishNoimprove = FinishIfNoImprovementEpsilonAfter(
        'valid_MRR_best_so_far', epochs=num_epochs, epsilon=epsilon)

    lastcheckpoint = myCheckpoint(last_path, tempSharedData)

    checkpoint = myCheckpoint(save_path,
                              tempSharedData,
                              after_training=False,
                              use_cpickle=True)
    checkpoint.add_condition(["after_epoch"],
                             predicate=predicates.OnLogRecord(
                                 '{0}_best_so_far'.format(channel)))
    finishAfter = FinishAfter(after_n_epochs=maxEpochs,
                              after_n_batches=maxIterations)

    return [tracker, finishNoimprove, lastcheckpoint, checkpoint, finishAfter]
Example #7
def test_save_the_best():
    skip_if_configuration_set('log_backend', 'sqlite',
                              "Known to be flaky with SQLite log backend.")
    with NamedTemporaryFile(dir=config.temp_dir) as dst,\
            NamedTemporaryFile(dir=config.temp_dir) as dst_best:
        track_cost = TrackTheBest("cost", after_epoch=False, after_batch=True)
        main_loop = MockMainLoop(extensions=[
            FinishAfter(after_n_epochs=1),
            WriteCostExtension(), track_cost,
            Checkpoint(dst.name, after_batch=True, save_separately=['log']).
            add_condition(["after_batch"],
                          OnLogRecord(track_cost.notification_name), (
                              dst_best.name, ))
        ])
        main_loop.run()

        assert main_loop.log[4]['saved_to'] == (dst.name, dst_best.name)
        assert main_loop.log[5]['saved_to'] == (dst.name, dst_best.name)
        assert main_loop.log[6]['saved_to'] == (dst.name, )
        with open(dst_best.name, 'rb') as src:
            assert load(src).log.status['iterations_done'] == 5
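Both save-the-best tests condition the checkpoint on track_cost.notification_name instead of hard-coding the record name. When no explicit notification_name is passed, TrackTheBest derives one from the record name, which is exactly what the asserts in Examples #1 and #2 rely on:

track_cost = TrackTheBest("cost")
# With the default notification name this equals "cost_best_so_far",
# the record asserted on in Examples #1 and #2.
assert track_cost.notification_name == "cost_best_so_far"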
Example #8
host_plot = 'http://tfjgeorge.com:5006'
cost.name = 'cost'
valid_cost.name = 'valid_cost'

extensions = [
    Timing(),
    TrainingDataMonitoring([cost], after_epoch=True, prefix='train'),
    DataStreamMonitoring(variables=[valid_cost], data_stream=valid_stream),
    Plot('%s %s' % (
        socket.gethostname(),
        datetime.datetime.now(),
    ),
         channels=[['train_cost', 'valid_cost']],
         after_epoch=True,
         server_url=host_plot),
    TrackTheBest('valid_cost'),
    Checkpoint('model', save_separately=["model", "log"]),
    FinishIfNoImprovementAfter('valid_cost_best_so_far', epochs=5),
    #FinishAfter(after_n_epochs=100),
    Printing()
]

from blocks.main_loop import MainLoop

main_loop = MainLoop(model=model,
                     data_stream=train_stream,
                     algorithm=algorithm,
                     extensions=extensions)

main_loop.run()
Example #9
def main():
    nclasses = 27

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--length", type=int, default=180)
    parser.add_argument("--num-epochs", type=int, default=100)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=1e-3)
    parser.add_argument("--epsilon", type=float, default=1e-5)
    parser.add_argument("--num-hidden", type=int, default=1000)
    parser.add_argument("--baseline", action="store_true")
    parser.add_argument("--initialization",
                        choices="identity glorot orthogonal uniform".split(),
                        default="identity")
    parser.add_argument("--initial-gamma", type=float, default=1e-1)
    parser.add_argument("--initial-beta", type=float, default=0)
    parser.add_argument("--cluster", action="store_true")
    parser.add_argument("--activation",
                        choices=list(activations.keys()),
                        default="tanh")
    parser.add_argument("--optimizer",
                        choices="sgdmomentum adam rmsprop",
                        default="rmsprop")
    parser.add_argument("--continue-from")
    parser.add_argument("--evaluate")
    parser.add_argument("--dump-hiddens")
    args = parser.parse_args()

    np.random.seed(args.seed)
    blocks.config.config.default_seed = args.seed

    if args.continue_from:
        from blocks.serialization import load
        main_loop = load(args.continue_from)
        main_loop.run()
        sys.exit(0)

    graphs, extensions, updates = construct_graphs(args, nclasses)

    ### optimization algorithm definition
    if args.optimizer == "adam":
        optimizer = Adam(learning_rate=args.learning_rate)
    elif args.optimizer == "rmsprop":
        optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9)
    elif args.optimizer == "sgdmomentum":
        optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99)
    step_rule = CompositeRule([
        StepClipping(1.),
        optimizer,
    ])
    algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                                parameters=graphs["training"].parameters,
                                step_rule=step_rule)
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]

    # step monitor
    step_channels = []
    step_channels.extend([
        algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
        for name, param in model.get_parameter_dict().items()
    ])
    step_channels.append(
        algorithm.total_step_norm.copy(name="total_step_norm"))
    step_channels.append(
        algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
    step_channels.extend(graphs["training"].outputs)
    logger.warning("constructing training data monitor")
    extensions.append(
        TrainingDataMonitoring(step_channels,
                               prefix="iteration",
                               after_batch=True))

    # parameter monitor
    extensions.append(
        DataStreamMonitoring([
            param.norm(2).copy(name="parameter.norm:%s" % name)
            for name, param in model.get_parameter_dict().items()
        ],
                             data_stream=None,
                             after_epoch=True))

    validation_interval = 500
    # performance monitor
    for situation in "training inference".split():
        if situation == "inference" and not args.evaluate:
            # save time when we don't need the inference graph
            continue

        for which_set in "train valid test".split():
            logger.warning("constructing %s %s monitor" %
                           (which_set, situation))
            channels = list(graphs[situation].outputs)
            extensions.append(
                DataStreamMonitoring(channels,
                                     prefix="%s_%s" % (which_set, situation),
                                     every_n_batches=validation_interval,
                                     data_stream=get_stream(
                                         which_set=which_set,
                                         batch_size=args.batch_size,
                                         num_examples=10000,
                                         length=args.length)))

    extensions.extend([
        TrackTheBest("valid_training_error_rate",
                     "best_valid_training_error_rate"),
        DumpBest("best_valid_training_error_rate", "best.zip"),
        FinishAfter(after_n_epochs=args.num_epochs),
        #FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50),
        Checkpoint("checkpoint.zip",
                   on_interrupt=False,
                   every_n_epochs=1,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True)
    ])

    if not args.cluster:
        extensions.append(ProgressBar())

    extensions.extend([
        Timing(),
        Printing(every_n_batches=validation_interval),
        PrintingTo("log"),
    ])
    main_loop = MainLoop(data_stream=get_stream(which_set="train",
                                                batch_size=args.batch_size,
                                                length=args.length,
                                                augment=True),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    if args.dump_hiddens:
        dump_hiddens(args, main_loop)
        return

    if args.evaluate:
        evaluate(args, main_loop)
        return

    main_loop.run()
Example #10
def train_language_model(new_training_job, config, save_path, params,
                         fast_start, fuel_server, seed):
    c = config
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    data, lm, retrieval = initialize_data_and_model(config)

    # the full main loop can be saved...
    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    # ...or only the state (log + params), which is useful to avoid
    # pickling the embeddings
    state_path = os.path.join(save_path, 'training_state.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')
    best_tar_path = os.path.join(save_path, "best_model.tar")

    words = tensor.ltensor3('words')
    words_mask = tensor.matrix('words_mask')
    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', batch_size=4,
                            max_length=5).get_epoch_iterator())
        words.tag.test_value = test_value_data[0]
        words_mask.tag.test_value = test_value_data[1]

    costs, updates = lm.apply(words, words_mask)
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            cg.set_parameter_values(load_parameters(src))

    length = rename(words.shape[1], 'length')
    perplexity, = VariableFilter(name='perplexity')(cg)
    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    monitored_vars = [length, cost] + perplexities
    if c['dict_path']:
        num_definitions, = VariableFilter(name='num_definitions')(cg)
        monitored_vars.extend([num_definitions])

    parameters = cg.get_parameter_dict()
    trained_parameters = parameters.values()
    saved_parameters = parameters.values()
    if c['embedding_path']:
        logger.debug("Exclude word embeddings from the trained parameters")
        trained_parameters = [
            p for p in trained_parameters
            if not p == lm.get_def_embeddings_params()
        ]
        saved_parameters = [
            p for p in saved_parameters
            if not p == lm.get_def_embeddings_params()
        ]

    if c['cache_size'] != 0:
        logger.debug("Enable fake recursivity for looking up embeddings")
        trained_parameters = [
            p for p in trained_parameters if not p == lm.get_cache_params()
        ]

    logger.info("Cost parameters" + "\n" + pprint.pformat([
        " ".join(
            (key, str(parameters[key].get_value().shape),
             'trained' if parameters[key] in trained_parameters else 'frozen'))
        for key in sorted(parameters.keys())
    ],
                                                          width=120))

    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum']))
    algorithm = GradientDescent(cost=cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    if c['cache_size'] != 0:
        algorithm.add_updates(updates)

    train_monitored_vars = list(monitored_vars)
    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)

    word_emb_RMS, = VariableFilter(name='word_emb_RMS')(cg)
    main_rnn_in_RMS, = VariableFilter(name='main_rnn_in_RMS')(cg)
    train_monitored_vars.extend([word_emb_RMS, main_rnn_in_RMS])

    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    # We use a completely random seed on purpose. With Fuel server
    # it's currently not possible to restore the state of the training
    # stream. That's why it's probably better to just have it stateless.
    stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None
    training_stream = data.get_stream('train',
                                      batch_size=c['batch_size'],
                                      max_length=c['max_length'],
                                      seed=stream_seed)
    valid_stream = data.get_stream('valid',
                                   batch_size=c['batch_size_valid'],
                                   max_length=c['max_length'],
                                   seed=stream_seed)
    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)

    validation = DataStreamMonitoring(monitored_vars,
                                      valid_stream,
                                      prefix="valid").set_conditions(
                                          before_first_epoch=not fast_start,
                                          on_resumption=True,
                                          every_n_batches=c['mon_freq_valid'])
    track_the_best = TrackTheBest(validation.record_name(perplexity),
                                  choose_best=min).set_conditions(
                                      on_resumption=True,
                                      after_epoch=True,
                                      every_n_batches=c['mon_freq_valid'])

    # don't save the entire main loop, to avoid pickling everything
    if c['fast_checkpoint']:
        load = (LoadNoUnpickling(state_path,
                                 load_iteration_state=True,
                                 load_log=True).set_conditions(
                                     before_training=not new_training_job))
        cp_args = {
            'save_main_loop': False,
            'save_separately': ['log', 'iteration_state'],
            'parameters': saved_parameters
        }

        checkpoint = Checkpoint(state_path,
                                before_training=not fast_start,
                                every_n_batches=c['save_freq_batches'],
                                after_training=not fast_start,
                                **cp_args)

        if c['checkpoint_every_n_batches']:
            intermediate_cp = IntermediateCheckpoint(
                state_path,
                every_n_batches=c['checkpoint_every_n_batches'],
                after_training=False,
                **cp_args)
    else:
        load = (Load(main_loop_path, load_iteration_state=True,
                     load_log=True).set_conditions(
                         before_training=not new_training_job))
        cp_args = {
            'save_separately': ['iteration_state'],
            'parameters': saved_parameters
        }

        checkpoint = Checkpoint(main_loop_path,
                                before_training=not fast_start,
                                every_n_batches=c['save_freq_batches'],
                                after_training=not fast_start,
                                **cp_args)

        if c['checkpoint_every_n_batches']:
            intermediate_cp = IntermediateCheckpoint(
                main_loop_path,
                every_n_batches=c['checkpoint_every_n_batches'],
                after_training=False,
                **cp_args)

    checkpoint = checkpoint.add_condition(
        ['after_batch', 'after_epoch'],
        OnLogRecord(track_the_best.notification_name), (best_tar_path, ))

    extensions = [
        load,
        StartFuelServer(original_training_stream,
                        stream_path,
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq_train'])
    ]

    if retrieval:
        extensions.append(
            RetrievalPrintStats(retrieval=retrieval,
                                every_n_batches=c['mon_freq_train'],
                                before_training=not fast_start))

    extensions.extend([
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']),
        validation, track_the_best, checkpoint
    ])
    if c['checkpoint_every_n_batches']:
        extensions.append(intermediate_cp)
    extensions.extend([
        DumpTensorflowSummaries(save_path,
                                every_n_batches=c['mon_freq_train'],
                                after_training=True),
        Printing(on_resumption=True, every_n_batches=c['mon_freq_train']),
        FinishIfNoImprovementAfter(track_the_best.notification_name,
                                   iterations=50 * c['mon_freq_valid'],
                                   every_n_batches=c['mon_freq_valid']),
        FinishAfter(after_n_batches=c['n_batches'])
    ])

    logger.info("monitored variables during training:" + "\n" +
                pprint.pformat(train_monitored_vars, width=120))
    logger.info("monitored variables during valid:" + "\n" +
                pprint.pformat(monitored_vars, width=120))

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
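Stripped of the fast/slow checkpointing variants, the core of the function above is the usual blocks trio: track a validation channel, emit a notification record on improvement, and condition a checkpoint on that record. A minimal sketch (channel and path names are illustrative):

track_the_best = TrackTheBest('valid_perplexity', choose_best=min)
checkpoint = Checkpoint('best_model.tar', after_training=False)
# Fires only when the notification record appears in the log row.
checkpoint = checkpoint.add_condition(
    ['after_epoch'], OnLogRecord(track_the_best.notification_name))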
Example #11
def build_and_run(save_to, modelconfig, experimentconfig):
    """Part of this is adapted from the Lasagne tutorial."""

    n, num_filters, image_size, num_blockstack = (
        modelconfig['depth'], modelconfig['num_filters'],
        modelconfig['image_size'], modelconfig['num_blockstack'])
    
    print("Amount of bottlenecks: %d" % n)

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('image_features')
    #target_value = T.ivector('targets')
    target_var = T.lmatrix('targets')
    target_vec = T.extra_ops.to_one_hot(target_var[:,0],2)
    #target_var = T.matrix('targets')
    # Create residual net model
    print("Building model...")
    network = build_cnn(input_var, image_size, n, num_blockstack, num_filters)
    get_info(network)
    prediction = lasagne.utils.as_theano_expression(lasagne.layers.get_output(network))
    test_prediction = lasagne.utils.as_theano_expression(lasagne.layers.get_output(network,deterministic=True))

    # Loss function -> the objective to minimize
    print("Instantiating loss function...")
 
    #loss = CategoricalCrossEntropy().apply(target_var.flatten(), prediction)
    #test_loss = CategoricalCrossEntropy().apply(target_var.flatten(), test_prediction)
 #   loss = lasagne.objectives.categorical_crossentropy(prediction, target_var.flatten()).mean()
  #  test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var.flatten()).mean()
    loss = lasagne.objectives.squared_error(prediction,target_vec).mean()
    test_loss = lasagne.objectives.squared_error(test_prediction,target_vec).mean()
  #  loss = tensor.nnet.binary_crossentropy(prediction, target_var).mean()
  #  test_loss = tensor.nnet.binary_crossentropy(test_prediction, target_var).mean()
    test_loss.name = "loss"

#    loss.name = 'x-ent_error'
#    loss.name = 'sqr_error'
    layers = lasagne.layers.get_all_layers(network)

    #l1 and l2 regularization
    #pondlayers = {x:0.000025 for i,x in enumerate(layers)}
    #l1_penality = lasagne.regularization.regularize_layer_params_weighted(pondlayers, lasagne.regularization.l2)
    #l2_penality = lasagne.regularization.regularize_layer_params(layers[len(layers)/4:], lasagne.regularization.l1) * 25e-6
    #reg_penalty = l1_penality + l2_penality
    #reg_penalty.name = 'reg_penalty'
    #loss = loss + reg_penalty
    loss.name = 'reg_loss'
    error_rate = MisclassificationRate().apply(target_var.flatten(), test_prediction).copy(
            name='error_rate')

    
    # Load the dataset
    print("Loading data...")
    istest = 'test' in experimentconfig.keys()
    if istest:
        print("Using test stream")
    train_stream, valid_stream, test_stream = get_stream(experimentconfig['batch_size'],image_size,test=istest)

    # Defining step rule and algorithm
    if experimentconfig.get('step_rule') is not None:
        step_rule = experimentconfig['step_rule'](
            learning_rate=experimentconfig['learning_rate'])
    else:
        step_rule = Scale(learning_rate=experimentconfig['learning_rate'])

    params = list(map(lasagne.utils.as_theano_expression,
                      lasagne.layers.get_all_params(network, trainable=True)))
    print("Initializing algorithm")
    algorithm = GradientDescent(
        cost=loss,
        gradients={var: T.grad(loss, var) for var in params},  # parameters=cg.parameters
        step_rule=step_rule)

    #algorithm.add_updates(extra_updates)


    grad_norm = aggregation.mean(algorithm.total_gradient_norm)
    grad_norm.name = "grad_norm"

    print("Initializing extensions...")
    plot = Plot(save_to,
                channels=[['train_loss', 'valid_loss'],
                          ['train_grad_norm'],
                          # ['train_grad_norm', 'train_reg_penalty'],
                          ['train_error_rate', 'valid_error_rate']],
                server_url='http://hades.calculquebec.ca:5042')

    checkpoint = Checkpoint('models/best_' + save_to + '.tar')
    # checkpoint.add_condition(['after_n_batches=25'], ...)
    checkpoint.add_condition(['after_epoch'],
                             predicate=OnLogRecord('valid_error_rate_best_so_far'))

    # Defining extensions
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=experimentconfig['num_epochs'],
                              after_n_batches=experimentconfig['num_batches']),
                  TrainingDataMonitoring([test_loss, error_rate, grad_norm],  # + reg_penalty
                                         prefix="train", after_epoch=True),
                  DataStreamMonitoring([test_loss, error_rate], valid_stream,
                                       prefix="valid", after_epoch=True),
                  plot,
                  # Checkpoint(save_to, after_n_epochs=5),
                  # ProgressBar(),
                  Printing(after_epoch=True),
                  TrackTheBest('valid_error_rate', min),  # keep the best
                  checkpoint,  # save the best
                  FinishIfNoImprovementAfter('valid_error_rate_best_so_far',
                                             epochs=5)]  # early stopping

    # model = Model(loss)
    # print("Model", model)

    main_loop = MainLoop(
        algorithm,
        train_stream,
        # model=model,
        extensions=extensions)
    print("Starting main loop...")

    main_loop.run()
Example #12
    [algorithm.total_step_norm, algorithm.total_gradient_norm],
    every_n_batches=n_batches,
    prefix="train")

valid_monitor = DataStreamMonitoring(variables,
                                     valid_stream,
                                     every_n_batches=n_batches,
                                     prefix="valid")

from utils import _is_nan

extensions = [
    train_monitor,
    valid_monitor,
    TrackTheBest('valid_nll',
                 every_n_batches=n_batches,
                 before_first_epoch=True),
    Timing(every_n_batches=n_batches),
    Printing(every_n_batches=n_batches),
    Write(generator,
          save_name=save_dir + "samples/" + exp_name + ".png").add_condition(
              ["after_batch"], predicate=OnLogRecord('valid_nll_best_so_far')),
    FinishAfter().add_condition(["after_batch"], _is_nan),
    #Checkpoint(save_dir + "pkl/" + exp_name + ".pkl",after_epoch = True),
    Checkpoint(save_dir + "pkl/best_" + exp_name + ".pkl").add_condition(
        ["after_batch"], predicate=OnLogRecord('valid_nll_best_so_far')),
    SaveComputationGraph(emit),
    Plot(save_dir + "progress/" + exp_name + ".png",
         [['train_nll', 'valid_nll']],
         every_n_batches=5 * n_batches,
         email=False),
Example #13
    def run_pretrain(model, hyper_params, cost, train_data, valid_data=None, extra_costs=None):
        """
        generic training method for neural networks;
        works with any network structure
        :return:
        """
        from fuel.streams import DataStream
        from fuel.schemes import SequentialScheme, ShuffledScheme
        from blocks.filter import VariableFilter
        from blocks.graph import ComputationGraph
        from blocks.roles import WEIGHT
        from blocks.algorithms import GradientDescent, Adam, RMSProp, Scale
        from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar
        from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring
        from blocks.extensions.predicates import OnLogRecord
        from blocks.monitoring import aggregation
        from blocks.main_loop import MainLoop
        from blocks.extensions.training import TrackTheBest
        from deepthought.extensions.parameters import BestParams    

        if extra_costs is None:
            extra_costs = []
        
        cg = ComputationGraph([cost])

        # TODO: more hyper-params for regularization
        # L1 regularization
        if hyper_params['l1wdecay'] > 0:
            weights = VariableFilter(roles=[WEIGHT])(cg.variables)
            cost = cost + hyper_params['l1wdecay'] * sum([abs(W).sum() for W in weights])

        cost.name = 'cost'

        # set up step_rule
        if hyper_params['step_rule'] == 'Adam':
            step_rule = Adam(learning_rate=hyper_params['learning_rate'])
        elif hyper_params['step_rule'] == 'RMSProp':
            step_rule = RMSProp(learning_rate=hyper_params['learning_rate']) #, decay_rate=0.9, max_scaling=1e5)
        else:
            step_rule = Scale(learning_rate=hyper_params['learning_rate'])
        
        algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=step_rule)

        if 'blocks_print_variable_names' in hyper_params and hyper_params['blocks_print_variable_names']:
            print 'cg.variables:', cg.variables

        train_monitoring_vars = [cost] + extra_costs + [aggregation.mean(algorithm.total_gradient_norm)]
        for var_name in hyper_params['blocks_extensions_train_monitoring_channels']:
            for v in cg.variables:
                if v.name == var_name:
                    print 'Monitoring variable:', v
                    train_monitoring_vars.append(v)

        # default extensions
        extensions = [Timing(),
                      FinishAfter(after_n_epochs=hyper_params['max_epochs']),
                      TrainingDataMonitoring(
                          train_monitoring_vars,
                          suffix="train",
                          after_epoch=True)
                      ]

        # additional stuff if validation set is used
        if valid_data is not None:
            valid_monitoring_vars = [cost] + extra_costs
            for var_name in hyper_params['blocks_extensions_valid_monitoring_channels']:
                for v in cg.variables:
                    if v.name == var_name:
                        print 'Monitoring variable:', v
                        valid_monitoring_vars.append(v)

            extensions.append(
                DataStreamMonitoring(
                    valid_monitoring_vars,
                    DataStream.default_stream(
                        valid_data,
                        iteration_scheme=SequentialScheme(
                            valid_data.num_examples, hyper_params['batch_size'])),
                    suffix="valid"))

            best_channel = 'cost_valid'
            print '#train:', train_data.num_examples, '#valid:', valid_data.num_examples
        else:
            best_channel = 'cost_train'
            print '#train:', train_data.num_examples

        # tracking of the best
        best_params = BestParams()
        best_params.add_condition(['after_epoch'],
                                  predicate=OnLogRecord(best_channel + '_best_so_far'))
        extensions.append(TrackTheBest(best_channel))
        extensions.append(best_params)  # after TrackTheBest!

        # printing and plotting
        if hyper_params['blocks_extensions_printing'] is True:
            extensions.append(Printing())  # optional
        if hyper_params['blocks_extensions_progressbar'] is True:
            extensions.append(ProgressBar())

        if hyper_params['blocks_extensions_bokeh'] is True:
            try:
                from blocks_extras.extensions.plot import Plot
                bokeh_available = True
            except ImportError:
                bokeh_available = False
            print 'bokeh available: ', bokeh_available

            if bokeh_available:
                extensions.append(Plot(
                    hyper_params['blocks_extensions_bokeh_plot_title'],
                    channels=hyper_params['blocks_extensions_bokeh_channels'],
                ))

        main_loop = MainLoop(
            algorithm,
            DataStream.default_stream(
                train_data,
                iteration_scheme=ShuffledScheme(
                    train_data.num_examples, hyper_params['batch_size'])),
            model=model,
            extensions=extensions)

        main_loop.run()

        return best_params.values, main_loop.status['best_' + best_channel]
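For reference, a sketch of a hyper_params dict covering every key the method above reads (values are illustrative, not recommendations; 'blocks_print_variable_names' and the bokeh title/channel keys are optional and only read when the corresponding feature is on):

hyper_params = {
    'l1wdecay': 0.0,          # > 0 enables the L1 penalty on WEIGHT variables
    'step_rule': 'Adam',      # 'Adam', 'RMSProp', or anything else for Scale
    'learning_rate': 1e-3,
    'max_epochs': 50,
    'batch_size': 128,
    'blocks_extensions_train_monitoring_channels': [],
    'blocks_extensions_valid_monitoring_channels': [],
    'blocks_extensions_printing': True,
    'blocks_extensions_progressbar': False,
    'blocks_extensions_bokeh': False,
}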
Example #14
def construct_main_loop(name, task_name, patch_shape, batch_size,
                        n_spatial_dims, n_patches, max_epochs, patience_epochs,
                        learning_rate, gradient_limiter, hyperparameters,
                        **kwargs):
    task = tasks.get_task(**hyperparameters)
    hyperparameters["n_channels"] = task.n_channels

    extensions = []

    # let theta noise decay as training progresses
    for key in "location_std scale_std".split():
        hyperparameters[key] = theano.shared(hyperparameters[key], name=key)
        extensions.append(
            util.ExponentialDecay(hyperparameters[key],
                                  hyperparameters["%s_decay" % key],
                                  after_batch=True))

    print "constructing graphs..."
    graphs, outputs, updates = construct_graphs(task=task, **hyperparameters)

    print "setting up main loop..."

    from blocks.model import Model
    model = Model(outputs["train"]["cost"])

    from blocks.algorithms import GradientDescent, CompositeRule, StepClipping, Adam, RMSProp
    from extensions import Compressor
    if gradient_limiter == "clip":
        limiter = StepClipping(1.)
    elif gradient_limiter == "compress":
        limiter = Compressor()
    else:
        raise ValueError()

    algorithm = GradientDescent(
        cost=outputs["train"]["cost"],
        parameters=graphs["train"].parameters,
        step_rule=CompositeRule([limiter,
                                 Adam(learning_rate=learning_rate)]))
    algorithm.add_updates(updates["train"])

    extensions.extend(
        construct_monitors(algorithm=algorithm,
                           task=task,
                           model=model,
                           graphs=graphs,
                           outputs=outputs,
                           updates=updates,
                           **hyperparameters))

    from blocks.extensions import FinishAfter, Printing, ProgressBar, Timing
    from blocks.extensions.stopping import FinishIfNoImprovementAfter
    from blocks.extensions.training import TrackTheBest
    from blocks.extensions.saveload import Checkpoint
    from dump import DumpBest, LightCheckpoint, PrintingTo, DumpGraph, DumpLog
    extensions.extend([
        TrackTheBest("valid_error_rate", "best_valid_error_rate"),
        FinishIfNoImprovementAfter("best_valid_error_rate",
                                   epochs=patience_epochs),
        FinishAfter(after_n_epochs=max_epochs),
        DumpBest("best_valid_error_rate", name + "_best.zip"),
        Checkpoint(hyperparameters["checkpoint_save_path"],
                   on_interrupt=False,
                   every_n_epochs=10,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True),
        ProgressBar(),
        Timing(),
        Printing(),
        PrintingTo(name + "_log"),
        DumpGraph(name + "_grad_graph")
    ])

    from blocks.main_loop import MainLoop
    main_loop = MainLoop(data_stream=task.get_stream("train"),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    from tabulate import tabulate
    print "parameter sizes:"
    print tabulate(
        (key, "x".join(map(str,
                           value.get_value().shape)), value.get_value().size)
        for key, value in main_loop.model.get_parameter_dict().items())

    return main_loop
Example #15
    def training(self,
                 fea2obj,
                 batch_size,
                 learning_rate=0.005,
                 steprule='adagrad',
                 wait_epochs=5,
                 kl_weight_init=None,
                 klw_ep=50,
                 klw_inc_rate=0,
                 num_epochs=None):
        networkfile = self._config['net']

        n_epochs = num_epochs or int(self._config['nepochs'])
        reg_weight = float(self._config['loss_weight'])
        reg_type = self._config['loss_reg']
        numtrain = int(
            self._config['num_train']) if 'num_train' in self._config else None
        train_stream, num_samples_train = get_comb_stream(
            fea2obj, 'train', batch_size, shuffle=True, num_examples=numtrain)
        dev_stream, num_samples_dev = get_comb_stream(fea2obj,
                                                      'dev',
                                                      batch_size=None,
                                                      shuffle=False)
        logger.info('sources: %s -- number of train/dev samples: %d/%d',
                    train_stream.sources, num_samples_train, num_samples_dev)

        t2idx = fea2obj['targets'].t2idx
        klw_init = kl_weight_init or float(
            self._config['kld_weight']) if 'kld_weight' in self._config else 1
        logger.info('kl_weight_init: %d', klw_init)
        kl_weight = shared_floatx(klw_init, 'kl_weight')
        entropy_weight = shared_floatx(1., 'entropy_weight')

        cost, p_at_1, _, KLD, logpy_xz, pat1_recog, misclassify_rate = build_model_new(
            fea2obj, len(t2idx), self._config, kl_weight, entropy_weight)

        cg = ComputationGraph(cost)

        weights = VariableFilter(roles=[WEIGHT])(cg.parameters)
        logger.info('Model weights are: %s', weights)
        if 'L2' in reg_type:
            cost += reg_weight * l2_norm(weights)
            logger.info('applying %s with weight: %f ', reg_type, reg_weight)

        dropout = -0.1
        if dropout > 0:
            cg = apply_dropout(cg, weights, dropout)
            cost = cg.outputs[0]

        cost.name = 'cost'
        logger.info('Our Algorithm is : %s, and learning_rate: %f', steprule,
                    learning_rate)
        if 'adagrad' in steprule:
            cnf_step_rule = AdaGrad(learning_rate)
        elif 'adadelta' in steprule:
            cnf_step_rule = AdaDelta(decay_rate=0.95)
        elif 'decay' in steprule:
            cnf_step_rule = RMSProp(learning_rate=learning_rate,
                                    decay_rate=0.90)
            cnf_step_rule = CompositeRule([cnf_step_rule, StepClipping(1)])
        elif 'momentum' in steprule:
            cnf_step_rule = Momentum(learning_rate=learning_rate, momentum=0.9)
        elif 'adam' in steprule:
            cnf_step_rule = Adam(learning_rate=learning_rate)
        else:
            logger.info('Unknown steprule param: %s', steprule)

        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=cnf_step_rule,
                                    on_unused_sources='warn')
        #algorithm.add_updates(updates)
        gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
        step_norm = aggregation.mean(algorithm.total_step_norm)
        monitored_vars = [
            cost, gradient_norm, step_norm, p_at_1, KLD, logpy_xz, kl_weight,
            pat1_recog
        ]
        train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                               after_batch=True,
                                               before_first_epoch=True,
                                               prefix='tra')

        dev_monitor = DataStreamMonitoring(variables=[
            cost, p_at_1, KLD, logpy_xz, pat1_recog, misclassify_rate
        ],
                                           after_epoch=True,
                                           before_first_epoch=True,
                                           data_stream=dev_stream,
                                           prefix="dev")

        extensions = [
            dev_monitor,
            train_monitor,
            Timing(),
            TrackTheBest('dev_cost'),
            FinishIfNoImprovementAfter('dev_cost_best_so_far',
                                       epochs=wait_epochs),
            Printing(after_batch=False),  #, ProgressBar()
            FinishAfter(after_n_epochs=n_epochs),
            saveload.Load(networkfile + '.toload.pkl'),
        ] + track_best('dev_cost', networkfile + '.best.pkl')

        #extensions.append(SharedVariableModifier(kl_weight,
        #                                          lambda n, klw: numpy.cast[theano.config.floatX] (klw_inc_rate + klw), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))
        #         extensions.append(SharedVariableModifier(entropy_weight,
        #                                                   lambda n, crw: numpy.cast[theano.config.floatX](crw - klw_inc_rate), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))

        logger.info('number of parameters in the model: %d',
                    tensor.sum([p.size for p in cg.parameters]).eval())
        logger.info('Lookup table sizes: %s',
                    [p.size.eval() for p in cg.parameters if 'lt' in p.name])

        main_loop = MainLoop(data_stream=train_stream,
                             algorithm=algorithm,
                             model=Model(cost),
                             extensions=extensions)
        main_loop.run()
Example #16
extensions = []

if args.load_experiment and (not worker or worker.is_main_worker):
    extensions += [Load(os.path.join(
        save_dir, "pkl", load_prefix + args.load_experiment + ".tar"))]

extensions += [
    Timing(every_n_batches=args.save_every),
    train_monitor]

if not worker or worker.is_main_worker:
    extensions += [
        valid_monitor,
        TrackTheBest(
            'valid_nll',
            every_n_batches=args.save_every,
            before_first_epoch=True),
        Plot(
            os.path.join(save_dir, "progress", exp_name + ".png"),
            plot_names,
            every_n_batches=args.save_every,
            email=False),
        Checkpoint(
            os.path.join(save_dir, "pkl", "best_" + exp_name + ".tar"),
            after_training=False,
            save_separately=['log'],
            use_cpickle=True,
            save_main_loop=False,
            before_first_epoch=True)
        .add_condition(
            ["after_batch", "before_training"],
Example #17
error_rate = error.copy(name='error_rate')
error_rate2 = error.copy(name='error_rate2')
cg = ComputationGraph([cost, error_rate])

########### GET THE DATA #####################
stream_train = ServerDataStream(('image_features', 'targets'),
                                False,
                                port=5652,
                                hwm=50)
stream_valid = ServerDataStream(('image_features', 'targets'),
                                False,
                                port=5653,
                                hwm=50)

########### DEFINE THE ALGORITHM #############
track_cost = TrackTheBest("cost", after_epoch=True, after_batch=False)
algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=Momentum(learning_rate=0.0001,
                                               momentum=0.9))
extensions = [
    Timing(),
    FinishAfter(after_n_epochs=num_epochs),
    TrainingDataMonitoring(
        [cost, error_rate,
         aggregation.mean(algorithm.total_gradient_norm)],
        prefix="train",
        after_epoch=True),
    DataStreamMonitoring([cost, error_rate, error_rate2],
                         stream_valid,
                         prefix="valid",
Example #18
                               theano.function([x, x_m], y_hat_softmax),
                               before_first_epoch=False,
                               every_n_epochs=5,
                               prefix='testPER')

checkpoint = Checkpoint(conf.path_to_model, after_training=False)
checkpoint.add_condition(
    ['after_epoch'],
    predicate=predicates.OnLogRecord('valid_log_p_best_so_far'))
extensions = [
    val_monitor,
    train_monitor,
    per_val_monitor,
    per_test_monitor,
    Timing(),
    FinishAfter(after_n_epochs=conf.max_epochs),
    checkpoint,
    Printing(),
    TrackTheBest(record_name='val_monitor',
                 notification_name='valid_log_p_best_so_far'),
    FinishIfNoImprovementAfter(notification_name='valid_log_p_best_so_far',
                               epochs=conf.epochs_early_stopping),
]
main_loop = MainLoop(
    algorithm=algorithm,
    data_stream=stream_train,
    model=model,
    extensions=extensions,
)

main_loop.run()
Example #19
def train_snli_model(new_training_job,
                     config,
                     save_path,
                     params,
                     fast_start,
                     fuel_server,
                     seed,
                     model='simple'):
    if config['exclude_top_k'] > config['num_input_words'] and config[
            'num_input_words'] > 0:
        raise Exception("Some words have neither word nor def embedding")
    c = config
    logger = configure_logger(name="snli_baseline_training",
                              log_file=os.path.join(save_path, "log.txt"))
    if not os.path.exists(save_path):
        logger.info("Start a new job")
        os.mkdir(save_path)
    else:
        logger.info("Continue an existing job")
    with open(os.path.join(save_path, "cmd.txt"), "w") as f:
        f.write(" ".join(sys.argv))

    # Make data paths nice
    for path in [
            'dict_path', 'embedding_def_path', 'embedding_path', 'vocab',
            'vocab_def', 'vocab_text'
    ]:
        if c.get(path, ''):
            if not os.path.isabs(c[path]):
                c[path] = os.path.join(fuel.config.data_path[0], c[path])

    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    main_loop_best_val_path = os.path.join(save_path, 'main_loop_best_val.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')

    # Save config to save_path
    json.dump(config, open(os.path.join(save_path, "config.json"), "w"))

    if model == 'simple':
        nli_model, data, used_dict, used_retrieval, _ = _initialize_simple_model_and_data(
            c)
    elif model == 'esim':
        nli_model, data, used_dict, used_retrieval, _ = _initialize_esim_model_and_data(
            c)
    else:
        raise NotImplementedError()

    # Compute cost
    s1, s2 = T.lmatrix('sentence1'), T.lmatrix('sentence2')

    if c['dict_path']:
        assert os.path.exists(c['dict_path'])
        s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix(
            'sentence2_def_map')
        def_mask = T.fmatrix("def_mask")
        defs = T.lmatrix("defs")
    else:
        s1_def_map, s2_def_map = None, None
        def_mask = None
        defs = None

    s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask')
    y = T.ivector('label')

    cg = {}
    for train_phase in [True, False]:
        # NOTE: Please don't change outputs of cg
        if train_phase:
            with batch_normalization(nli_model):
                pred = nli_model.apply(s1,
                                       s1_mask,
                                       s2,
                                       s2_mask,
                                       def_mask=def_mask,
                                       defs=defs,
                                       s1_def_map=s1_def_map,
                                       s2_def_map=s2_def_map,
                                       train_phase=train_phase)
        else:
            pred = nli_model.apply(s1,
                                   s1_mask,
                                   s2,
                                   s2_mask,
                                   def_mask=def_mask,
                                   defs=defs,
                                   s1_def_map=s1_def_map,
                                   s2_def_map=s2_def_map,
                                   train_phase=train_phase)

        cost = CategoricalCrossEntropy().apply(y.flatten(), pred)
        error_rate = MisclassificationRate().apply(y.flatten(), pred)
        cg[train_phase] = ComputationGraph([cost, error_rate])

    # Weight decay (TODO: Make it less bug prone)
    if model == 'simple':
        weights_to_decay = VariableFilter(
            bricks=[dense for dense, relu, bn in nli_model._mlp],
            roles=[WEIGHT])(cg[True].variables)
        weight_decay = np.float32(c['l2']) * sum(
            (w**2).sum() for w in weights_to_decay)
    elif model == 'esim':
        weight_decay = 0.0
    else:
        raise NotImplementedError()

    final_cost = cg[True].outputs[0] + weight_decay
    final_cost.name = 'final_cost'

    # Add updates for population parameters

    if c.get("bn", True):
        pop_updates = get_batch_normalization_updates(cg[True])
        extra_updates = [(p, m * 0.1 + p * (1 - 0.1)) for p, m in pop_updates]
    else:
        pop_updates = []
        extra_updates = []

    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            loaded_params = load_parameters(src)
            cg[True].set_parameter_values(loaded_params)
            for param, m in pop_updates:
                param.set_value(loaded_params[get_brick(
                    param).get_hierarchical_name(param)])

    if os.path.exists(os.path.join(save_path, "main_loop.tar")):
        logger.warning("Manually loading BN stats :(")
        with open(os.path.join(save_path, "main_loop.tar")) as src:
            loaded_params = load_parameters(src)

        for param, m in pop_updates:
            param.set_value(
                loaded_params[get_brick(param).get_hierarchical_name(param)])

    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', batch_size=4).get_epoch_iterator())
        s1.tag.test_value = test_value_data[0]
        s1_mask.tag.test_value = test_value_data[1]
        s2.tag.test_value = test_value_data[2]
        s2_mask.tag.test_value = test_value_data[3]
        y.tag.test_value = test_value_data[4]

    # Freeze embeddings
    if not c['train_emb']:
        frozen_params = [
            p for E in nli_model.get_embeddings_lookups() for p in E.parameters
        ]
        train_params = [p for p in cg[True].parameters]
        assert len(set(frozen_params) & set(train_params)) > 0
    else:
        frozen_params = []
    if not c.get('train_def_emb', 1):
        frozen_params_def = [
            p for E in nli_model.get_def_embeddings_lookups()
            for p in E.parameters
        ]
        train_params = [p for p in cg[True].parameters]
        assert len(set(frozen_params_def) & set(train_params)) > 0
        frozen_params += frozen_params_def
    train_params = [p for p in cg[True].parameters if p not in frozen_params]
    train_params_keys = [
        get_brick(p).get_hierarchical_name(p) for p in train_params
    ]

    # Optimizer
    algorithm = GradientDescent(cost=final_cost,
                                on_unused_sources='ignore',
                                parameters=train_params,
                                step_rule=Adam(learning_rate=c['lr']))
    algorithm.add_updates(extra_updates)
    m = Model(final_cost)

    parameters = m.get_parameter_dict()  # Blocks version mismatch
    logger.info("Trainable parameters" + "\n" +
                pprint.pformat([(key, parameters[key].get_value().shape)
                                for key in sorted(train_params_keys)],
                               width=120))
    logger.info("# of parameters {}".format(
        sum([
            np.prod(parameters[key].get_value().shape)
            for key in sorted(train_params_keys)
        ])))

    ### Monitored args ###
    train_monitored_vars = [final_cost] + cg[True].outputs
    monitored_vars = cg[False].outputs
    val_acc = monitored_vars[1]
    to_monitor_names = [
        'def_unk_ratio', 's1_merged_input_rootmean2', 's1_def_mean_rootmean2',
        's1_gate_rootmean2', 's1_compose_gate_rootmean2'
    ]
    for k in to_monitor_names:
        train_v, valid_v = VariableFilter(name=k)(
            cg[True]), VariableFilter(name=k)(cg[False])
        if len(train_v):
            logger.info("Adding {} tracking".format(k))
            train_monitored_vars.append(train_v[0])
            monitored_vars.append(valid_v[0])
        else:
            logger.warning("Didnt find {} in cg".format(k))

    if c['monitor_parameters']:
        for name in train_params_keys:
            param = parameters[name]
            num_elements = numpy.product(param.get_value().shape)
            norm = param.norm(2) / num_elements
            grad_norm = algorithm.gradients[param].norm(2) / num_elements
            step_norm = algorithm.steps[param].norm(2) / num_elements
            stats = tensor.stack(norm, grad_norm, step_norm,
                                 step_norm / grad_norm)
            stats.name = name + '_stats'
            train_monitored_vars.append(stats)

    regular_training_stream = data.get_stream('train',
                                              batch_size=c['batch_size'],
                                              seed=seed)

    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=regular_training_stream.sources,
            hwm=100,
            produces_examples=regular_training_stream.produces_examples)
    else:
        training_stream = regular_training_stream

    ### Build extensions ###

    extensions = [
        # Load(main_loop_path, load_iteration_state=True, load_log=True)
        #     .set_conditions(before_training=not new_training_job),
        StartFuelServer(regular_training_stream,
                        stream_path,
                        hwm=100,
                        script_path=os.path.join(
                            os.path.dirname(__file__),
                            "../bin/start_fuel_server.py"),
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq']),
        ProgressBar(),
        RetrievalPrintStats(retrieval=used_retrieval,
                            every_n_batches=c['mon_freq_valid'],
                            before_training=not fast_start),
        Timestamp(),
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq']),
    ]

    if c['layout'] == 'snli':
        validation = DataStreamMonitoring(monitored_vars,
                                          data.get_stream('valid',
                                                          batch_size=14,
                                                          seed=seed),
                                          before_training=not fast_start,
                                          on_resumption=True,
                                          after_training=True,
                                          every_n_batches=c['mon_freq_valid'],
                                          prefix='valid')
        extensions.append(validation)
    elif c['layout'] == 'mnli':
        validation = DataStreamMonitoring(monitored_vars,
                                          data.get_stream('valid_matched',
                                                          batch_size=14,
                                                          seed=seed),
                                          every_n_batches=c['mon_freq_valid'],
                                          on_resumption=True,
                                          after_training=True,
                                          prefix='valid_matched')
        validation_mismatched = DataStreamMonitoring(
            monitored_vars,
            data.get_stream('valid_mismatched', batch_size=14, seed=seed),
            every_n_batches=c['mon_freq_valid'],
            before_training=not fast_start,
            on_resumption=True,
            after_training=True,
            prefix='valid_mismatched')
        extensions.extend([validation, validation_mismatched])
    else:
        raise NotImplementedError()

    # Similarity trackers for embeddings
    if len(c.get('vocab_def', '')):
        retrieval_vocab = Vocabulary(c['vocab_def'])
    else:
        retrieval_vocab = data.vocab

    retrieval_all = Retrieval(vocab_text=retrieval_vocab,
                              dictionary=used_dict,
                              max_def_length=c['max_def_length'],
                              exclude_top_k=0,
                              max_def_per_word=c['max_def_per_word'])

    for name in [
            's1_word_embeddings', 's1_dict_word_embeddings',
            's1_translated_word_embeddings'
    ]:
        variables = VariableFilter(name=name)(cg[False])
        if len(variables):
            s1_emb = variables[0]
            logger.info("Adding similarity tracking for " + name)
            # A bit sloppy about downcast

            if "dict" in name:
                embedder = construct_dict_embedder(theano.function(
                    [s1, defs, def_mask, s1_def_map],
                    s1_emb,
                    allow_input_downcast=True),
                                                   vocab=data.vocab,
                                                   retrieval=retrieval_all)
                extensions.append(
                    SimilarityWordEmbeddingEval(
                        embedder=embedder,
                        prefix=name,
                        every_n_batches=c['mon_freq_valid'],
                        before_training=not fast_start))
            else:
                embedder = construct_embedder(theano.function(
                    [s1], s1_emb, allow_input_downcast=True),
                                              vocab=data.vocab)
                extensions.append(
                    SimilarityWordEmbeddingEval(
                        embedder=embedder,
                        prefix=name,
                        every_n_batches=c['mon_freq_valid'],
                        before_training=not fast_start))

    track_the_best = TrackTheBest(validation.record_name(val_acc),
                                  before_training=not fast_start,
                                  every_n_epochs=c['save_freq_epochs'],
                                  after_training=not fast_start,
                                  every_n_batches=c['mon_freq_valid'],
                                  choose_best=min)
    extensions.append(track_the_best)

    # Special care for serializing embeddings
    if len(c.get('embedding_path', '')) or len(c.get('embedding_def_path',
                                                     '')):
        extensions.insert(
            0,
            LoadNoUnpickling(main_loop_path,
                             load_iteration_state=True,
                             load_log=True).set_conditions(
                                 before_training=not new_training_job))
        extensions.append(
            Checkpoint(main_loop_path,
                       parameters=train_params + [p for p, m in pop_updates],
                       save_main_loop=False,
                       save_separately=['log', 'iteration_state'],
                       before_training=not fast_start,
                       every_n_epochs=c['save_freq_epochs'],
                       after_training=not fast_start).add_condition(
                           ['after_batch', 'after_epoch'],
                           OnLogRecord(track_the_best.notification_name),
                           (main_loop_best_val_path, )))
    else:
        extensions.insert(
            0,
            Load(main_loop_path, load_iteration_state=True,
                 load_log=True).set_conditions(
                     before_training=not new_training_job))
        extensions.append(
            Checkpoint(main_loop_path,
                       parameters=cg[True].parameters +
                       [p for p, m in pop_updates],
                       before_training=not fast_start,
                       every_n_epochs=c['save_freq_epochs'],
                       after_training=not fast_start).add_condition(
                           ['after_batch', 'after_epoch'],
                           OnLogRecord(track_the_best.notification_name),
                           (main_loop_best_val_path, )))

    extensions.extend([
        DumpCSVSummaries(save_path,
                         every_n_batches=c['mon_freq_valid'],
                         after_training=True),
        DumpTensorflowSummaries(save_path,
                                after_epoch=True,
                                every_n_batches=c['mon_freq_valid'],
                                after_training=True),
        Printing(every_n_batches=c['mon_freq_valid']),
        PrintMessage(msg="save_path={}".format(save_path),
                     every_n_batches=c['mon_freq']),
        FinishAfter(after_n_batches=c['n_batches']).add_condition(
            ['after_batch'],
            OnLogStatusExceed('iterations_done', c['n_batches']))
    ])

    logger.info(extensions)

    ### Run training ###

    if "VISDOM_SERVER" in os.environ:
        print("Running visdom server")
        ret = subprocess.Popen([
            os.path.join(os.path.dirname(__file__), "../visdom_plotter.py"),
            "--visdom-server={}".format(os.environ['VISDOM_SERVER']),
            "--folder={}".format(save_path)
        ])
        time.sleep(0.1)
        if ret.poll() is not None:
            raise Exception("visdom plotter process terminated prematurely")
        atexit.register(lambda: os.kill(ret.pid, signal.SIGINT))

    model = Model(cost)
    for p, m in pop_updates:
        model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=model,
                         extensions=extensions)

    assert os.path.exists(save_path)
    main_loop.run()
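
The TrackTheBest plus Checkpoint.add_condition(OnLogRecord(...)) combination above is the recurring best-model pattern in these examples. A minimal, self-contained sketch of just that wiring (the 'valid_cost' record name and the file paths are placeholders, not names taken from the example above):

# Best-model checkpointing sketch with stock Blocks extensions.
from blocks.extensions.predicates import OnLogRecord
from blocks.extensions.saveload import Checkpoint
from blocks.extensions.training import TrackTheBest

track_best = TrackTheBest('valid_cost', choose_best=min)
checkpoint = Checkpoint('model.tar', after_epoch=True).add_condition(
    ['after_epoch'],
    # Fires only when TrackTheBest has just logged a new best value, so
    # 'model_best.tar' always holds the best parameters seen so far.
    OnLogRecord(track_best.notification_name),
    ('model_best.tar',))
# TrackTheBest must precede the Checkpoint in the extension list so the
# notification is written before the condition is checked.
extensions = [track_best, checkpoint]
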
Example #20
0
def test_track_the_best():
    main_loop = MockMainLoop()
    extension = TrackTheBest("cost")
    extension.main_loop = main_loop

    main_loop.status['epochs_done'] += 1
    main_loop.status['iterations_done'] += 10
    main_loop.log.current_row['cost'] = 5
    extension.dispatch('after_epoch')
    assert main_loop.status['best_cost'] == 5
    assert main_loop.log.current_row['cost_best_so_far']

    main_loop.status['epochs_done'] += 1
    main_loop.status['iterations_done'] += 10
    main_loop.log.current_row['cost'] = 6
    extension.dispatch('after_epoch')
    assert main_loop.status['best_cost'] == 5
    assert main_loop.log.current_row.get('cost_best_so_far', None) is None

    main_loop.status['epochs_done'] += 1
    main_loop.status['iterations_done'] += 10
    main_loop.log.current_row['cost'] = 5
    extension.dispatch('after_epoch')
    assert main_loop.status['best_cost'] == 5
    assert main_loop.log.current_row.get('cost_best_so_far', None) is None

    main_loop.status['epochs_done'] += 1
    main_loop.status['iterations_done'] += 10
    main_loop.log.current_row['cost'] = 4
    extension.dispatch('after_epoch')
    assert main_loop.status['best_cost'] == 4
    assert main_loop.log.current_row['cost_best_so_far']
Example #21
0
                    after_epoch=True,
                    data_stream=get_stream(
                        which_set=which_set,
                        for_evaluation=True,
                        batch_size=args.batch_size,
                        drop_prob=args.drop_prob,
                        hidden_dim=args.num_hidden)))  #, num_examples=1000)))

    if not args.baseline:
        save_str += 'BN_'
    if args.zoneout:
        save_str += 'zoneout_' + str(args.drop_prob) + '-'
    if args.elephant:
        save_str += 'elephant_' + str(args.drop_prob) + '-'
    extensions.extend([
        TrackTheBest("valid_training_error_rate",
                     "best_valid_training_error_rate"),
        #DumpBest("best_valid_training_error_rate", "best.zip"),
        FinishAfter(after_n_epochs=args.num_epochs),
        #FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50),
        Checkpoint(save_str + "checkpoint.zip",
                   on_interrupt=False,
                   every_n_epochs=1,
                   use_cpickle=True),
        DumpLog(save_str + "log.pkl", after_epoch=True)
    ])

    if not args.cluster:
        extensions.append(ProgressBar())

    extensions.extend([
        Timing(),
Example #22
0
                                 Adam(lr)]))

train_monitor = TrainingDataMonitoring(variables=[cost],
                                       every_n_batches=n_batches,
                                       prefix="train")

valid_monitor = DataStreamMonitoring(
    [cost],
    valid_stream,
    after_epoch=True,
    #before_first_epoch = False,
    prefix="valid")

extensions = [
    Timing(every_n_batches=n_batches), train_monitor, valid_monitor,
    TrackTheBest('valid_nll', after_epoch=True),
    Plot(save_dir + experiment_name + ".png", [['train_nll', 'valid_nll']],
         every_n_batches=4 * n_batches,
         email=True),
    Checkpoint(save_dir + experiment_name + ".pkl",
               use_cpickle=True,
               every_n_batches=n_batches * 8,
               after_epoch=True),
    Checkpoint(save_dir + "best_" + experiment_name + ".pkl",
               after_epoch=True,
               use_cpickle=True).add_condition(
                   ['after_epoch'],
                   predicate=OnLogRecord('valid_nll_best_so_far')),
    Printing(every_n_batches=n_batches, after_epoch=True),
    FinishAfter(after_n_epochs=2),
    SaveComputationGraph(emit)
Example #23
0
    prefix="valid")

extensions = []

if args.load_experiment:
    extensions += [Load(os.path.join(
        save_dir, "pkl", "best_" + args.load_experiment + ".tar"))]

extensions += [
    Timing(every_n_batches=args.save_every),
    train_monitor]

extensions += [
    valid_monitor,
    TrackTheBest(
        'valid_' + cost_name,
        every_n_batches=args.save_every,
        before_first_epoch=True),
    Plot(
        os.path.join(save_dir, "progress", exp_name + ".png"),
        plot_names,
        every_n_batches=args.save_every,
        email=False),
    Checkpoint(
        os.path.join(save_dir, "pkl", "best_" + exp_name + ".tar"),
        after_training=False,
        save_separately=['log'],
        use_cpickle=True,
        save_main_loop=False,
        before_first_epoch=True)
    .add_condition(
        ["after_batch", "before_training"],
Example #24
0
def train_extractive_qa(new_training_job, config, save_path, params,
                        fast_start, fuel_server, seed):
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    root_path = os.path.join(save_path, 'training_state')
    extension = '.tar'
    tar_path = root_path + extension
    best_tar_path = root_path + '_best' + extension

    c = config
    data, qam = initialize_data_and_model(c)

    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', shuffle=True, batch_size=4,
                            max_length=5).get_epoch_iterator(as_dict=True))
        for var in qam.input_vars.values():
            var.tag.test_value = test_value_data[var.name]

    costs = qam.apply_with_default_vars()
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params, 'rb') as src:
            cg.set_parameter_values(load_parameters(src))

    length = rename(qam.contexts.shape[1], 'length')
    batch_size = rename(qam.contexts.shape[0], 'batch_size')
    predicted_begins, = VariableFilter(name='predicted_begins')(cg)
    predicted_ends, = VariableFilter(name='predicted_ends')(cg)
    exact_match, = VariableFilter(name='exact_match')(cg)
    exact_match_ratio = rename(exact_match.mean(), 'exact_match_ratio')
    context_unk_ratio, = VariableFilter(name='context_unk_ratio')(cg)
    monitored_vars = [
        length, batch_size, cost, exact_match_ratio, context_unk_ratio
    ]
    if c['dict_path']:
        def_unk_ratio, = VariableFilter(name='def_unk_ratio')(cg)
        num_definitions = rename(qam.input_vars['defs'].shape[0],
                                 'num_definitions')
        max_definition_length = rename(qam.input_vars['defs'].shape[1],
                                       'max_definition_length')
        monitored_vars.extend(
            [def_unk_ratio, num_definitions, max_definition_length])
        if c['def_word_gating'] == 'self_attention':
            def_gates = VariableFilter(name='def_gates')(cg)
            def_gates_min = tensor.minimum(*[x.min() for x in def_gates])
            def_gates_max = tensor.maximum(*[x.max() for x in def_gates])
            monitored_vars.extend([
                rename(def_gates_min, 'def_gates_min'),
                rename(def_gates_max, 'def_gates_max')
            ])
    text_match_ratio = TextMatchRatio(data_path=os.path.join(
        fuel.config.data_path[0], 'squad/dev-v1.1.json'),
                                      requires=[
                                          predicted_begins, predicted_ends,
                                          tensor.ltensor3('contexts_text'),
                                          tensor.lmatrix('q_ids')
                                      ],
                                      name='text_match_ratio')

    parameters = cg.get_parameter_dict()
    trained_parameters = parameters.values()
    if c['embedding_path']:
        logger.debug("Exclude  word embeddings from the trained parameters")
        trained_parameters = [
            p for p in trained_parameters if p is not qam.embeddings_var()
        ]
    if c['train_only_def_part']:
        def_reading_parameters = qam.def_reading_parameters()
        trained_parameters = [
            p for p in trained_parameters if p in def_reading_parameters
        ]

    logger.info("Cost parameters" + "\n" + pprint.pformat([
        " ".join(
            (key, str(parameters[key].get_value().shape),
             'trained' if parameters[key] in trained_parameters else 'frozen'))
        for key in sorted(parameters.keys())
    ],
                                                          width=120))

    # apply dropout to the training cost and to all the variables
    # that we monitor during training
    train_cost = cost
    train_monitored_vars = list(monitored_vars)
    if c['dropout']:
        regularized_cg = ComputationGraph([cost] + train_monitored_vars)
        # Dima: the dropout that I implemented first
        bidir_outputs, = VariableFilter(bricks=[Bidirectional],
                                        roles=[OUTPUT])(cg)
        readout_layers = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg)
        dropout_vars = [bidir_outputs] + readout_layers
        logger.debug("applying dropout to {}".format(", ".join(
            [v.name for v in dropout_vars])))
        regularized_cg = apply_dropout(regularized_cg, dropout_vars,
                                       c['dropout'])
        # a new dropout with exactly same mask at different steps
        emb_vars = VariableFilter(roles=[EMBEDDINGS])(regularized_cg)
        emb_dropout_mask = get_dropout_mask(emb_vars[0], c['emb_dropout'])
        if c['emb_dropout_type'] == 'same_mask':
            regularized_cg = apply_dropout2(regularized_cg,
                                            emb_vars,
                                            c['emb_dropout'],
                                            dropout_mask=emb_dropout_mask)
        elif c['emb_dropout_type'] == 'regular':
            regularized_cg = apply_dropout(regularized_cg, emb_vars,
                                           c['emb_dropout'])
        else:
            raise ValueError("unknown dropout type {}".format(
                c['emb_dropout_type']))
        train_cost = regularized_cg.outputs[0]
        train_monitored_vars = regularized_cg.outputs[1:]

    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum']))
    algorithm = GradientDescent(cost=train_cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)
    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    training_stream = data.get_stream('train',
                                      batch_size=c['batch_size'],
                                      shuffle=True,
                                      max_length=c['max_length'])
    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)

    extensions = [
        LoadNoUnpickling(tar_path, load_iteration_state=True,
                         load_log=True).set_conditions(
                             before_training=not new_training_job),
        StartFuelServer(original_training_stream,
                        os.path.join(save_path, 'stream.pkl'),
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq_train']),
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']),
    ]
    validation = DataStreamMonitoring(
        [text_match_ratio] + monitored_vars,
        data.get_stream('dev',
                        batch_size=c['batch_size_valid'],
                        raw_text=True,
                        q_ids=True),
        prefix="dev").set_conditions(before_training=not fast_start,
                                     after_epoch=True)
    dump_predictions = DumpPredictions(save_path,
                                       text_match_ratio,
                                       before_training=not fast_start,
                                       after_epoch=True)
    track_the_best_exact = TrackTheBest(
        validation.record_name(exact_match_ratio),
        choose_best=max).set_conditions(before_training=True, after_epoch=True)
    track_the_best_text = TrackTheBest(
        validation.record_name(text_match_ratio),
        choose_best=max).set_conditions(before_training=True, after_epoch=True)
    extensions.extend([
        validation, dump_predictions, track_the_best_exact, track_the_best_text
    ])
    # We often use pretrained word embeddings and we don't want
    # to load and save them every time. To avoid that, we use
    # save_main_loop=False, we only save the trained parameters,
    # and we save the log and the iteration state separately
    # in the tar file.
    extensions.extend([
        Checkpoint(tar_path,
                   parameters=trained_parameters,
                   save_main_loop=False,
                   save_separately=['log', 'iteration_state'],
                   before_training=not fast_start,
                   every_n_epochs=c['save_freq_epochs'],
                   every_n_batches=c['save_freq_batches'],
                   after_training=not fast_start).add_condition(
                       ['after_batch', 'after_epoch'],
                       OnLogRecord(track_the_best_text.notification_name),
                       (best_tar_path, )),
        DumpTensorflowSummaries(save_path,
                                after_epoch=True,
                                every_n_batches=c['mon_freq_train'],
                                after_training=True),
        RetrievalPrintStats(retrieval=data._retrieval,
                            every_n_batches=c['mon_freq_train'],
                            before_training=not fast_start),
        Printing(after_epoch=True, every_n_batches=c['mon_freq_train']),
        FinishAfter(after_n_batches=c['n_batches'],
                    after_n_epochs=c['n_epochs']),
        Annealing(c['annealing_learning_rate'],
                  after_n_epochs=c['annealing_start_epoch']),
        LoadNoUnpickling(best_tar_path,
                         after_n_epochs=c['annealing_start_epoch'])
    ])

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
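
The rename helper that this example leans on (it produces names like 'mean_cost' and 'num_definitions') is never shown in the snippet; presumably it is a one-liner along these lines, and the named helper in the next example looks like the same idea:

# Hypothetical reconstruction of the rename utility used above: set the
# Theano variable's name in place and return the variable for chaining.
def rename(variable, name):
    variable.name = name
    return variable
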
Example #25
0
def main(args):
    """Run experiment. """
    lr_tag = float_tag(args.learning_rate)

    x_dim, train_stream, valid_stream, test_stream = datasets.get_streams(
        args.data, args.batch_size)

    #------------------------------------------------------------
    # Setup model
    deterministic_act = Tanh
    deterministic_size = 1.

    if args.method == 'vae':
        sizes_tag = args.layer_spec.replace(",", "-")
        layer_sizes = [int(i) for i in args.layer_spec.split(",")]
        layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1]

        name = "%s-%s-%s-lr%s-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag, args.n_samples, sizes_tag)

        if args.activation == "tanh":
            hidden_act = Tanh()
        elif args.activation == "logistic":
            hidden_act = Logistic()
        elif args.activation == "relu":
            hidden_act = Rectifier()
        else:
            raise "Unknown hidden nonlinearity %s" % args.hidden_act

        model = VAE(x_dim=x_dim,
                    hidden_layers=layer_sizes,
                    hidden_act=hidden_act,
                    z_dim=z_dim,
                    batch_norm=args.batch_normalization)
        model.initialize()
    elif args.method == 'dvae':
        sizes_tag = args.layer_spec.replace(",", "-")
        layer_sizes = [int(i) for i in args.layer_spec.split(",")]
        layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1]

        name = "%s-%s-%s-lr%s-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag, args.n_samples, sizes_tag)

        if args.activation == "tanh":
            hidden_act = Tanh()
        elif args.activation == "logistic":
            hidden_act = Logistic()
        elif args.activation == "relu":
            hidden_act = Rectifier()
        else:
            raise "Unknown hidden nonlinearity %s" % args.hidden_act

        model = DVAE(x_dim=x_dim,
                     hidden_layers=layer_sizes,
                     hidden_act=hidden_act,
                     z_dim=z_dim,
                     batch_norm=args.batch_normalization)
        model.initialize()
    elif args.method == 'rws':
        sizes_tag = args.layer_spec.replace(",", "-")
        qbase = "" if not args.no_qbaseline else "noqb-"

        name = "%s-%s-%s-%slr%s-dl%d-spl%d-%s" % \
            (args.data, args.method, args.name, qbase, lr_tag, args.deterministic_layers, args.n_samples, sizes_tag)

        p_layers, q_layers = create_layers(args.layer_spec, x_dim,
                                           args.deterministic_layers,
                                           deterministic_act,
                                           deterministic_size)

        model = ReweightedWakeSleep(
            p_layers,
            q_layers,
            qbaseline=(not args.no_qbaseline),
        )
        model.initialize()
    elif args.method == 'bihm-rws':
        sizes_tag = args.layer_spec.replace(",", "-")
        name = "%s-%s-%s-lr%s-dl%d-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag, args.deterministic_layers, args.n_samples, sizes_tag)

        p_layers, q_layers = create_layers(args.layer_spec, x_dim,
                                           args.deterministic_layers,
                                           deterministic_act,
                                           deterministic_size)

        model = BiHM(
            p_layers,
            q_layers,
            l1reg=args.l1reg,
            l2reg=args.l2reg,
        )
        model.initialize()
    elif args.method == 'continue':
        import cPickle as pickle
        from os.path import basename, splitext

        with open(args.model_file, 'rb') as f:
            m = pickle.load(f)

        if isinstance(m, MainLoop):
            m = m.model

        model = m.get_top_bricks()[0]
        while len(model.parents) > 0:
            model = model.parents[0]

        assert isinstance(model, (BiHM, ReweightedWakeSleep, VAE))

        mname, _, _ = basename(args.model_file).rpartition("_model.pkl")
        name = "%s-cont-%s-lr%s-spl%s" % (mname, args.name, lr_tag,
                                          args.n_samples)
    else:
        raise ValueError("Unknown training method '%s'" % args.method)

    #------------------------------------------------------------

    x = tensor.matrix('features')

    #------------------------------------------------------------
    # Testset monitoring

    train_monitors = []
    valid_monitors = []
    test_monitors = []
    for s in [1, 10, 100, 1000]:
        log_p, log_ph = model.log_likelihood(x, s)
        log_p = -log_p.mean()
        log_ph = -log_ph.mean()
        log_p.name = "log_p_%d" % s
        log_ph.name = "log_ph_%d" % s

        #train_monitors += [log_p, log_ph]
        #valid_monitors += [log_p, log_ph]
        test_monitors += [log_p, log_ph]

    #------------------------------------------------------------
    # Z estimation
    #for s in [100000]:
    #    z2 = tensor.exp(model.estimate_log_z2(s)) / s
    #    z2.name = "z2_%d" % s
    #
    #    valid_monitors += [z2]
    #    test_monitors += [z2]

    #------------------------------------------------------------
    # Gradient and training monitoring

    if args.method in ['vae', 'dvae']:
        log_p_bound, gradients = model.get_gradients(x, args.n_samples)
        log_p_bound = -log_p_bound.mean()
        log_p_bound.name = "log_p_bound"
        cost = log_p_bound

        train_monitors += [
            log_p_bound,
            named(model.kl_term.mean(), 'kl_term'),
            named(model.recons_term.mean(), 'recons_term')
        ]
        valid_monitors += [
            log_p_bound,
            named(model.kl_term.mean(), 'kl_term'),
            named(model.recons_term.mean(), 'recons_term')
        ]
        test_monitors += [
            log_p_bound,
            named(model.kl_term.mean(), 'kl_term'),
            named(model.recons_term.mean(), 'recons_term')
        ]
    else:
        log_p, log_ph, gradients = model.get_gradients(x, args.n_samples)
        log_p = -log_p.mean()
        log_ph = -log_ph.mean()
        log_p.name = "log_p"
        log_ph.name = "log_ph"
        cost = log_ph

        train_monitors += [log_p, log_ph]
        valid_monitors += [log_p, log_ph]

    #------------------------------------------------------------
    # Detailed monitoring
    """
    n_layers = len(p_layers)

    log_px, w, log_p, log_q, samples = model.log_likelihood(x, n_samples)

    exp_samples = []
    for l in xrange(n_layers):
        e = (w.dimshuffle(0, 1, 'x')*samples[l]).sum(axis=1)
        e.name = "inference_h%d" % l
        e.tag.aggregation_scheme = aggregation.TakeLast(e)
        exp_samples.append(e)

    s1 = samples[1]
    sh1 = s1.shape
    s1_ = s1.reshape([sh1[0]*sh1[1], sh1[2]])
    s0, _ = model.p_layers[0].sample_expected(s1_)
    s0 = s0.reshape([sh1[0], sh1[1], s0.shape[1]])
    s0 = (w.dimshuffle(0, 1, 'x')*s0).sum(axis=1)
    s0.name = "inference_h0^"
    s0.tag.aggregation_scheme = aggregation.TakeLast(s0)
    exp_samples.append(s0)

    # Draw P-samples
    p_samples, _, _ = model.sample_p(100)
    #weights = model.importance_weights(samples)
    #weights = weights / weights.sum()

    for i, s in enumerate(p_samples):
        s.name = "psamples_h%d" % i
        s.tag.aggregation_scheme = aggregation.TakeLast(s)

    #
    samples = model.sample(100, oversample=100)

    for i, s in enumerate(samples):
        s.name = "samples_h%d" % i
        s.tag.aggregation_scheme = aggregation.TakeLast(s)
    """
    cg = ComputationGraph([cost])

    #------------------------------------------------------------

    if args.step_rule == "momentum":
        step_rule = Momentum(args.learning_rate, 0.95)
    elif args.step_rule == "rmsprop":
        step_rule = RMSProp(args.learning_rate)
    elif args.step_rule == "adam":
        step_rule = Adam(args.learning_rate)
    else:
        raise "Unknown step_rule %s" % args.step_rule

    #parameters = cg.parameters[:4] + cg.parameters[5:]
    parameters = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        parameters=parameters,
        gradients=gradients,
        step_rule=CompositeRule([
            #StepClipping(25),
            step_rule,
            #RemoveNotFinite(1.0),
        ]))

    #------------------------------------------------------------

    train_monitors += [
        aggregation.mean(algorithm.total_gradient_norm),
        aggregation.mean(algorithm.total_step_norm)
    ]

    #------------------------------------------------------------

    # Live plotting?
    plotting_extensions = []
    if args.live_plotting:
        plotting_extensions = [
            PlotManager(
                name,
                [
                    Plotter(channels=[[
                        "valid_%s" % cost.name, "valid_log_p"
                    ], ["train_total_gradient_norm", "train_total_step_norm"]],
                            titles=[
                                "validation cost",
                                "norm of training gradient and step"
                            ]),
                    DisplayImage(
                        [
                            WeightDisplay(model.p_layers[0].mlp.
                                          linear_transformations[0].W,
                                          n_weights=100,
                                          image_shape=(28, 28))
                        ]
                        #ImageDataStreamDisplay(test_stream, image_shape=(28,28))]
                    )
                ])
        ]

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            ProgressBar(),
            TrainingDataMonitoring(
                train_monitors, prefix="train", after_epoch=True),
            DataStreamMonitoring(
                valid_monitors, data_stream=valid_stream, prefix="valid"),
            DataStreamMonitoring(test_monitors,
                                 data_stream=test_stream,
                                 prefix="test",
                                 after_epoch=False,
                                 after_training=True,
                                 every_n_epochs=10),
            #SharedVariableModifier(
            #    algorithm.step_rule.components[0].learning_rate,
            #    half_lr_func,
            #    before_training=False,
            #    after_epoch=False,
            #    after_batch=False,
            #    every_n_epochs=half_lr),
            TrackTheBest('valid_%s' % cost.name),
            Checkpoint(name + ".pkl", save_separately=['log', 'model']),
            FinishIfNoImprovementAfter('valid_%s_best_so_far' % cost.name,
                                       epochs=args.patience),
            FinishAfter(after_n_epochs=args.max_epochs),
            Printing()
        ] + plotting_extensions)
    main_loop.run()
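
The early-stopping pair in the extension list above is worth isolating: TrackTheBest writes a '<record>_best_so_far' entry to the log whenever the tracked record improves, and FinishIfNoImprovementAfter stops training once that entry has been absent for the given patience. A minimal sketch (the record name and patience value are placeholders):

# Early-stopping sketch with stock Blocks extensions.
from blocks.extensions.stopping import FinishIfNoImprovementAfter
from blocks.extensions.training import TrackTheBest

track_best = TrackTheBest('valid_cost')  # logs 'valid_cost_best_so_far'
stop = FinishIfNoImprovementAfter('valid_cost_best_so_far', epochs=5)
# The tracker must come before the stopper so each epoch's notification
# is already in the log when the patience counter is updated.
extensions = [track_best, stop]
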
Example #26
0
def initialize_algorithm(config, save_path, bokeh_name, params, bokeh_server,
                         bokeh, use_load_ext, load_log, fast_start,
                         recognizer, data, model, cg, regularized_cg,
                         cost, train_cost, parameters,
                         max_norm_rules, observables,
                         batch_size, batch_cost, weights_entropy,
                         labels_mask, labels, gradients=None):
    primary_observables = observables
    secondary_observables = []
    validation_observables = []
    root_path, extension = os.path.splitext(save_path)
    train_conf = config['training']
    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'], train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    if 'adam' in rule_names:
        assert len(rule_names) == 1
        logger.info("Using Adam for training")
        core_rules.append(
            Adam(learning_rate=train_conf.get('scale', 0.002),
                 beta1=train_conf.get('beta1', 0.1),
                 beta2=train_conf.get('beta2', 0.001),
                 epsilon=train_conf.get('epsilon', 1e-8),
                 decay_factor=train_conf.get('decay_rate', (1 - 1e-8))))
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(
            BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')
        #theano_func_kwargs={'mode':NanGuardMode(nan_is_error=True)})

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in gradient_cg.scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if 'train_cost' not in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    for name, param in parameters.items():
        num_elements = numpy.prod(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost,
        algorithm.total_gradient_norm,
        algorithm.total_step_norm, clipping.threshold]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances')] + weights_entropy


    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name.startswith('weights_entropy'):
                chld_id = recognizer.child_id_from_postfix(var.name)
                result.append(rename(aggregation.mean(var, labels_mask[chld_id].sum()),
                                     'weights_entropy_per_label'+
                                     recognizer.children[chld_id].names_postfix))
            elif var.name.endswith('_nll'):
                chld_id = recognizer.child_id_from_postfix(var.name)
                result.append(rename(aggregation.mean(var.sum(),
                                                      labels_mask[chld_id].sum()),
                                     var.name+'_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']
    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(TrainingDataMonitoring(
        primary_observables, after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables),
        data.get_stream("valid", shuffle=False, **data_params_valid), prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)

    additional_patience_notifiers = []
    uas = DependencyErrorRate(recognizer.children[0], data,
                              **config['monitoring']['search'])
    las = AuxiliaryErrorRates(uas, name='LAS')
    lab = AuxiliaryErrorRates(uas, name='LAB')
    per_monitoring = DataStreamMonitoring(
        [uas, las, lab], data.get_one_stream("valid", data.langs[0], batches=False, shuffle=False, **data_params_valid)[0],
        prefix="valid").set_conditions(
                before_first_epoch=not fast_start,
                every_n_epochs=mon_conf['search_every_epochs'],
                every_n_batches=mon_conf['search_every_batches'],
                after_training=False)
    extensions.append(per_monitoring)
    track_the_best_uas = TrackTheBest(
        per_monitoring.record_name(uas)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_las = TrackTheBest(
        per_monitoring.record_name(las)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_lab = TrackTheBest(
        per_monitoring.record_name(lab)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_uas,
                   track_the_best_las,
                   track_the_best_lab,
                   ]
    per = uas
    track_the_best_per = track_the_best_uas
    additional_patience_notifiers = [track_the_best_lab,
                                     track_the_best_las]
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_cost]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500,
        num_stds=train_conf.get('clip_stds', 1.0)))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf['num_batches'],
                    after_n_epochs=train_conf['num_epochs']),
            # .add_condition(["after_batch"], _gradient_norm_is_none),
    ]
    main_postfix = recognizer.children[0].names_postfix
    channels = [
        # Plot 1: training and validation costs
        [average_monitoring.record_name(train_cost),
         validation.record_name(cost)],
        # Plot 2: gradient norm,
        [average_monitoring.record_name(algorithm.total_gradient_norm),
         average_monitoring.record_name(clipping.threshold)],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [average_monitoring._record_name('weights_entropy_per_label'+main_postfix),
         validation._record_name('weights_entropy_per_label'+main_postfix)],
        # Plot 5: training and validation monotonicity penalty
        [average_monitoring._record_name('weights_penalty_per_recording'+main_postfix),
         validation._record_name('weights_penalty_per_recording'+main_postfix)]]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name
                 else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),]
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_cost.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar()]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name] + [
                    t.notification_name for t in additional_patience_notifiers]
        extensions.append(Patience(**patience_conf))

    if train_conf.get('min_performance_stops'):
        extensions.append(EarlyTermination(
            param_name=track_the_best_per.best_name,
            min_performance_by_epoch=train_conf['min_performance_stops']))

    extensions.append(Printing(every_n_batches=1,
                               attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
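
An earlier example calls a parameter_stats helper that likely wraps the same per-parameter statistics loop shown inline above; a hypothetical reconstruction mirroring that code (value, gradient and step norms stacked per parameter, each normalized by the square root of the parameter count):

# Hypothetical parameter_stats helper mirroring the monitoring loops above.
import numpy
from theano import tensor

def parameter_stats(parameters, algorithm):
    observables = []
    for name, param in parameters.items():
        num_elements = numpy.prod(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack([norm, grad_norm, step_norm,
                              step_norm / grad_norm])
        stats.name = name + '_stats'
        observables.append(stats)
    return observables
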
Example #27
0
    variables=[cost],
    every_n_batches=n_batches,
    prefix="train")

valid_monitor = DataStreamMonitoring(
    [cost],
    valid_stream,
    after_epoch=True,
    #before_first_epoch=False,
    prefix="valid")

extensions = [
    Timing(every_n_batches=n_batches),
    train_monitor,
    valid_monitor,
    TrackTheBest('valid_sequence_log_likelihood', after_epoch=True),
    Plot(save_dir + experiment_name + ".png",
         [['train_sequence_log_likelihood',
           'valid_sequence_log_likelihood']],
         every_n_batches=4 * n_batches,
         email=False),
    Checkpoint(save_dir + experiment_name + ".pkl",
               use_cpickle=True,
               every_n_batches=n_batches * 8,
               after_epoch=True),
    Checkpoint(save_dir + "best_" + experiment_name + ".pkl",
               after_epoch=True,
               use_cpickle=True).add_condition(
                   ['after_epoch'],
                   predicate=OnLogRecord(
                       'valid_sequence_log_likelihood_best_so_far')),
    Printing(every_n_batches=n_batches, after_epoch=True),
Example #28
0
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh,
                   test_tag, use_load_ext, load_log, fast_start):
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result

    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(batch=True,
                                   prediction=prediction,
                                   prediction_mask=prediction_mask)
    labels, = VariableFilter(applications=[recognizer.cost], name='labels')(cg)
    labels_mask, = VariableFilter(applications=[recognizer.cost],
                                  name='labels_mask')(cg)

    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(rename(gain_matrix.min(), 'min_gain'))
        primary_observables.append(rename(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(recognizer.labels.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(applications=[r.generator.readout.readout],
                               name="output_0")(cost_cg)
    bottom_output = VariableFilter(
        # We need name_regex instead of name because LookupTable calls its output output_0
        applications=[r.bottom.apply],
        name_regex="output")(cost_cg)[-1]
    attended, = VariableFilter(applications=[r.generator.transition.apply],
                               name="attended")(cost_cg)
    attended_mask, = VariableFilter(applications=[
        r.generator.transition.apply
    ],
                                    name="attended_mask")(cost_cg)
    weights, = VariableFilter(applications=[r.generator.evaluate],
                              name="weights")(cost_cg)

    from blocks.roles import AUXILIARY
    l2_cost, = VariableFilter(roles=[AUXILIARY],
                              theano_name='l2_cost_aux')(cost_cg)
    cost_forward, = VariableFilter(roles=[AUXILIARY],
                                   theano_name='costs_forward_aux')(cost_cg)

    max_recording_length = rename(bottom_output.shape[0],
                                  "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = rename(attended_mask.shape[0],
                                      "max_attended_mask_length")
    max_attended_length = rename(attended.shape[0], "max_attended_length")
    max_num_phonemes = rename(labels.shape[0], "max_num_phonemes")
    min_energy = rename(energies.min(), "min_energy")
    max_energy = rename(energies.max(), "max_energy")
    mean_attended = rename(abs(attended).mean(), "mean_attended")
    mean_bottom_output = rename(
        abs(bottom_output).mean(), "mean_bottom_output")
    weights_penalty = rename(monotonicity_penalty(weights, labels_mask),
                             "weights_penalty")
    weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy")
    mask_density = rename(labels_mask.mean(), "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy, min_energy, max_energy,
        mean_attended, mean_bottom_output, batch_size, max_num_phonemes,
        mask_density
    ])
    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config.get('regularization', dict())
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [
            p for p in cg.parameters if p not in attention_params
        ]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost + reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (
            train_cost + reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2)

    train_cost = rename(train_cost, 'train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0)
                or (reg_config.get("decay", .0) > 0)):
            logger.error('using adaptive noise with alignment weight penalty '
                         'or weight decay is probably stupid')
        train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise(
            cg,
            cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=Model(
                regularized_cg.outputs[0]).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise'))
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0], 'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] + regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]
        ]  # model prior variance

    model = Model(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that does not have
        # shared variables added by adaptive weight noise
        with open(params, 'rb') as src:
            param_values = load_parameters(src)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, parameters[key].get_value().shape)
                                for key in sorted(parameters.keys())],
                               width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'],
                                   train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(
            AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', 0) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [
                v for v in maxnorm_subjects
                if not isinstance(get_brick(v), LookupTable)
            ]
        logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat(
            [name for name, p in parameters.items() if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([
            name for name, p in parameters.items() if p not in maxnorm_subjects
        ]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)
        ]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in gradient_cg.scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if 'train_cost' not in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold
    ]
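    # For every parameter, monitor size-normalized L2 norms of its value,
    # its gradient and its update step, plus the step-to-gradient ratio.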
    for name, param in parameters.items():
        num_elements = numpy.prod(param.get_value().shape)
        norm = param.norm(2) / num_elements**0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements**0.5
        stats = tensor.stack([norm, grad_norm, step_norm,
                              step_norm / grad_norm])
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm,
        clipping.threshold, max_recording_length, max_attended_length,
        max_attended_mask_length
    ]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty
    ]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(
                    rename(aggregation.mean(var, batch_size),
                           'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(
                    rename(aggregation.mean(var, labels_mask.sum()),
                           'weights_entropy_per_label'))
            else:
                result.append(var)
        return result
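    # Under this scheme, 'weights_penalty' (summed over the batch) is
    # reported as a mean per recording, while 'weights_entropy' is
    # normalized by the total number of labels in the batch.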

    mon_conf = config['monitoring']

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(
            Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(
        TrainingDataMonitoring(primary_observables + [l2_cost, cost_forward],
                               after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average",
        every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables +
                                   [l2_cost, cost_forward]),
        data.get_stream("valid", shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)
    per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search'])
    per_monitoring = DataStreamMonitoring(
        [per],
        data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(before_first_epoch=True,
                                                     after_epoch=True)
    extensions += [track_the_best_cost, track_the_best_per]
    extensions.append(
        AdaptiveClipping(algorithm.total_gradient_norm.name,
                         clipping,
                         train_conf['gradient_threshold'],
                         decay_rate=0.998,
                         burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf.get('num_batches'),
                    after_n_epochs=train_conf.get('num_epochs')).add_condition(
                        ["after_batch"], _gradient_norm_is_none),
    ]
    channels = [
        # Plot 1: training and validation costs
        [
            average_monitoring.record_name(train_cost),
            validation.record_name(cost)
        ],
        # Plot 2: gradient norm
        [
            average_monitoring.record_name(algorithm.total_gradient_norm),
            average_monitoring.record_name(clipping.threshold)
        ],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [
            average_monitoring._record_name('weights_entropy_per_label'),
            validation._record_name('weights_entropy_per_label')
        ],
        # Plot 5: training and validation monotonicity penalty
        [
            average_monitoring._record_name('weights_penalty_per_recording'),
            validation._record_name('weights_penalty_per_recording')
        ]
    ]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),
        ]
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start,
                   after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True).add_condition(
                       ['after_epoch'],
                       OnLogRecord(track_the_best_per.notification_name),
                       (root_path + "_best" + extension, )).add_condition(
                           ['after_epoch'],
                           OnLogRecord(track_the_best_cost.notification_name),
                           (root_path + "_best_ll" + extension, )),
        ProgressBar()
    ]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))
    if config['net']['criterion']['name'].startswith('mse'):
        extensions.append(
            LogInputsGains(labels, cg, recognizer.generator.readout.emitter,
                           data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # (setdefault would not do here: an existing empty list
            # must also be replaced)
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name
            ]
        extensions.append(Patience(**patience_conf))

    extensions.append(
        Printing(every_n_batches=1, attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
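
# A minimal sketch of how the tuple returned above might be driven
# (hypothetical caller; `initialize_all` is an assumed name, not part of
# this snippet):
#
#     model, algorithm, data, extensions = initialize_all(config, save_path)
#     main_loop = MainLoop(model=model, algorithm=algorithm,
#                          data_stream=data.get_stream("train"),
#                          extensions=extensions)
#     main_loop.run()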
Example #29
0
algorithm.add_updates(extra_updates)

train_monitor = TrainingDataMonitoring(variables=monitoring_variables +
                                       data_monitoring,
                                       every_n_batches=n_batches,
                                       prefix="train")

valid_monitor = DataStreamMonitoring(monitoring_variables,
                                     valid_stream,
                                     every_n_batches=n_batches,
                                     prefix="valid")

extensions = [
    ProgressBar(),
    Timing(every_n_batches=n_batches), train_monitor, valid_monitor,
    TrackTheBest('valid_nll', every_n_batches=n_batches),
    Plot(save_dir + "progress/" + experiment_name + ".png",
         [['train_nll', 'valid_nll'], ['valid_learning_rate']],
         every_n_batches=n_batches,
         email=False),
    Checkpoint(save_dir + "pkl/best_" + experiment_name + ".pkl",
               use_cpickle=True).add_condition(
                   ['after_batch'],
                   predicate=OnLogRecord('valid_nll_best_so_far')),
    Printing(every_n_batches=n_batches),
    Flush(every_n_batches=n_batches, before_first_epoch=True),
    LearningRateSchedule(lr,
                         'valid_nll',
                         path=save_dir + "pkl/best_" + experiment_name +
                         ".pkl",
                         states=states.values()),
]
Example #30
0
    def train(self,
              cost,
              y_hat,
              train_stream,
              accuracy=None,
              prediction_cost=None,
              regularization_cost=None,
              params_to_optimize=None,
              valid_stream=None,
              extra_extensions=None,
              model=None,
              vars_to_monitor_on_train=None,
              vars_to_monitor_on_valid=None,
              step_rule=None,
              additional_streams=None,
              save_on_best=None,
              use_own_validation=False,
              objects_to_dump=None):
        """
        Generic method for training models. It extends functionality already provided by Blocks.
        :param cost: Theano var with cost function
        :param y_hat: Theano var with predictions from the model
        :param train_stream: Fuel stream with training data
        :param accuracy: Theano var with accuracy
        :param prediction_cost:
        :param regularization_cost:
        :param params_to_optimize:
        :param valid_stream: Fuel stream with validation data
        :param extra_extensions:
        :param model:
        :param vars_to_monitor_on_train:
        :param vars_to_monitor_on_valid:
        :param step_rule:
        :param additional_streams:
        :param save_on_best:
        :param use_own_validation:
        :param objects_to_dump:
        :return:
        """

        if not vars_to_monitor_on_valid:
            vars_to_monitor_on_valid = [(cost, min)]
            if accuracy:
                vars_to_monitor_on_valid.append((accuracy, max))

        if not save_on_best:
            # use default metrics for saving the best model
            save_on_best = [(cost, min)]
            if accuracy:
                save_on_best.append((accuracy, max))

        # setup the training algorithm #######################################
        # step_rule = Scale(learning_rate=0.01)
        # step_rule = Adam()
        model_save_suffix = ""
        if self.args.append_metaparams:
            model_save_suffix = "." + get_current_metaparams_str(
                self.parser, self.args)

        # get a list of variables that will be monitored during training
        vars_to_monitor = [cost]
        if accuracy:
            vars_to_monitor.append(accuracy)
        if prediction_cost:
            vars_to_monitor.append(prediction_cost)
        if regularization_cost:
            vars_to_monitor.append(regularization_cost)

        theano_vars_to_monitor = [
            var for var, comparator in vars_to_monitor_on_valid
        ]

        if not params_to_optimize:
            # use all parameters of the model for optimization
            cg = ComputationGraph(cost)
            params_to_optimize = cg.parameters

        self.print_parameters_info(params_to_optimize)

        if not model:
            if accuracy:
                model = MultiOutputModel([cost, accuracy, y_hat] +
                                         theano_vars_to_monitor)
            else:
                model = MultiOutputModel([cost, y_hat] +
                                         theano_vars_to_monitor)

        if not step_rule:
            step_rule = AdaDelta()  # e.g. Momentum(learning_rate=0.02, momentum=0.9)

        step_rules = [
            StepClipping(self.args.gradient_clip), step_rule,
            RemoveNotFinite()
        ]

        # optionally add gradient noise
        if self.args.gradient_noise:
            step_rules = [
                GradientNoise(self.args.gradient_noise, self.args.gn_decay)
            ] + step_rules

        algorithm = GradientDescent(cost=cost,
                                    parameters=params_to_optimize,
                                    step_rule=CompositeRule(step_rules),
                                    on_unused_sources="warn")

        # this list collects all extensions executed periodically during
        # training
        extensions = []

        if self.args.epochs_max:
            # finish training after a fixed number of epochs
            extensions.append(FinishAfter(after_n_epochs=self.args.epochs_max))

        # training data monitoring
        def create_training_data_monitoring():
            if "every_n_epochs" in self.args.evaluate_every_n:
                return TrainingDataMonitoring(vars_to_monitor,
                                              prefix='train',
                                              after_epoch=True)
            else:
                return TrainingDataMonitoring(vars_to_monitor,
                                              prefix='train',
                                              after_epoch=True,
                                              **self.args.evaluate_every_n)

        # add extensions that monitors progress of training on train set
        extensions.extend([create_training_data_monitoring()])

        if not self.args.disable_progress_bar:
            extensions.append(ProgressBar())

        def add_data_stream_monitor(data_stream, prefix):
            if not use_own_validation:
                extensions.append(
                    DataStreamMonitoring(variables=theano_vars_to_monitor,
                                         data_stream=data_stream,
                                         prefix=prefix,
                                         before_epoch=False,
                                         **self.args.evaluate_every_n))

        # additional streams that should be monitored
        if additional_streams:
            for stream_name, stream in additional_streams:
                add_data_stream_monitor(stream, stream_name)

        # extra extensions need to be called before Printing extension
        if extra_extensions:
            extensions.extend(extra_extensions)

        if valid_stream:
            # add validation set monitoring
            add_data_stream_monitor(valid_stream, 'valid')

            # add best val monitoring
            for var, comparator in vars_to_monitor_on_valid:
                extensions.append(
                    TrackTheBest("valid_" + var.name,
                                 choose_best=comparator,
                                 **self.args.evaluate_every_n))

            if self.args.patience_metric == 'cost':
                patience_metric_name = cost.name
            elif self.args.patience_metric == 'accuracy':
                patience_metric_name = accuracy.name
            else:
                print "WARNING: Falling back to COST function for patience."
                patience_metric_name = cost.name

            extensions.append(
                # the "valid_<metric>_best_so_far" record is written to the
                # main-loop log by the TrackTheBest extension above
                FinishIfNoImprovementAfter(
                    "valid_" + patience_metric_name + "_best_so_far",
                    epochs=self.args.epochs_patience_valid))

            if not self.args.do_not_save:

                # use user provided metrics for saving
                valid_save_extensions = [
                    SaveTheBest("valid_" + metric.name,
                                self.args.save_path + ".best." +
                                metric.name + model_save_suffix,
                                choose_best=comparator,
                                **self.args.evaluate_every_n)
                    for metric, comparator in save_on_best
                ]
                extensions.extend(valid_save_extensions)

        extensions.extend([
            Timing(**self.args.evaluate_every_n),
            Printing(after_epoch=False, **self.args.evaluate_every_n),
        ])

        if not self.args.do_not_save or self.args.save_only_best:
            extensions.append(
                Checkpoint(self.args.save_path + model_save_suffix,
                           **self.args.save_every_n))

        extensions.append(FlushStreams(**self.args.evaluate_every_n))

        # main loop ##########################################################
        main_loop = MainLoop(data_stream=train_stream,
                             model=model,
                             algorithm=algorithm,
                             extensions=extensions)
        sys.setrecursionlimit(1000000)
        main_loop.run()
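
# A hypothetical invocation of the train() method above (the trainer object
# and the Theano/Fuel variables are assumptions, not part of this snippet):
#
#     trainer.train(cost, y_hat, train_stream,
#                   accuracy=accuracy,
#                   valid_stream=valid_stream,
#                   step_rule=Adam(learning_rate=0.001))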
Example #31
0
cost.name = "sequence_log_likelihood"

cg = ComputationGraph(cost)
model = Model(cost)

#################
# Algorithm
#################

algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=CompositeRule(
                                [StepClipping(10.0),
                                 Adam(lr)]))

train_monitor = TrainingDataMonitoring(variables=[cost],
                                       after_epoch=True,
                                       prefix="train")

extensions = [
    train_monitor,
    TrackTheBest('train_sequence_log_likelihood'),
    Printing(after_epoch=True)
]

main_loop = MainLoop(model=model,
                     data_stream=data_stream,
                     algorithm=algorithm,
                     extensions=extensions)

main_loop.run()
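
# After the run, the best tracked value can be read back from the log status
# under the Blocks convention "best_<record_name>" (a sketch, assuming the
# default notification name):
#
#     best_nll = main_loop.log.status['best_train_sequence_log_likelihood']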
Example #32
0
def build_and_run(label, config):
    ############## CREATE THE NETWORK ###############
    #Define the parameters
    (num_epochs, num_batches, num_channels, image_shape, filter_size,
     num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size,
     activation, mlp_activation) = (
         config['num_epochs'], config['num_batches'], config['num_channels'],
         config['image_shape'], config['filter_size'], config['num_filter'],
         config['pooling_sizes'], config['mlp_hiddens'],
         config['output_size'], config['batch_size'], config['activation'],
         config['mlp_activation'])
    #    print(num_epochs, num_channels, image_shape, filter_size, num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size, activation, mlp_activation)
    lambda_l1 = 0.000025
    lambda_l2 = 0.000025

    print("Building model")
    #Create the symbolic variables
    x = T.tensor4('image_features')
    y = T.lmatrix('targets')

    #Get the parameters
    conv_parameters = zip(filter_size, num_filter)

    #Create the convolutional layers
    conv_layers = list(
        interleave([(Convolutional(filter_size=filter_size,
                                   num_filters=num_filter,
                                   name='conv_{}'.format(i))
                     for i, (filter_size,
                             num_filter) in enumerate(conv_parameters)),
                    (activation),
                    (MaxPooling(size, name='pool_{}'.format(i))
                     for i, size in enumerate(pooling_sizes))]))
    #    (AveragePooling(size, name='pool_{}'.format(i)) for i, size in enumerate(pooling_sizes))]))

    #Create the sequence
    conv_sequence = ConvolutionalSequence(conv_layers,
                                          num_channels,
                                          image_size=image_shape,
                                          weights_init=Uniform(width=0.2),
                                          biases_init=Constant(0.))
    #Initialize the convnet
    conv_sequence.initialize()
    #Add the MLP
    top_mlp_dims = ([np.prod(conv_sequence.get_dim('output'))] +
                    mlp_hiddens + [output_size])
    out = Flattener().apply(conv_sequence.apply(x))
    mlp = MLP(mlp_activation,
              top_mlp_dims,
              weights_init=Uniform(0, 0.2),
              biases_init=Constant(0.))
    #Initialize the MLP
    mlp.initialize()
    #Get the output
    predict = mlp.apply(out)

    cost = CategoricalCrossEntropy().apply(y.flatten(),
                                           predict).copy(name='cost')
    error = MisclassificationRate().apply(y.flatten(), predict)

    #Little trick to plot the error rate in two different plots (we can't use
    #the same data twice in a plot, for an unknown reason)
    error_rate = error.copy(name='error_rate')
    error_rate2 = error.copy(name='error_rate2')

    ########### REGULARIZATION ##################
    cg = ComputationGraph([cost])
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    biases = VariableFilter(roles=[BIAS])(cg.variables)
    # l2_penalty_weights = T.sum([i * lambda_l2 / len(weights) * (W ** 2).sum()
    #                             for i, W in enumerate(weights)])  # gradually increase the penalty per layer
    l2_penalty = T.sum([lambda_l2 * (W ** 2).sum()
                        for W in weights + biases])  # uniform penalty on weights and biases
    # l2_penalty_bias = T.sum([lambda_l2 * (B ** 2).sum() for B in biases])
    # l2_penalty = l2_penalty_weights + l2_penalty_bias
    l2_penalty.name = 'l2_penalty'
    l1_penalty = T.sum([lambda_l1 * T.abs_(z).sum() for z in weights + biases])
    # l1_penalty_weights = T.sum([i * lambda_l1 / len(weights) * T.abs_(W).sum()
    #                             for i, W in enumerate(weights)])  # gradually increase the penalty per layer
    # l1_penalty_biases = T.sum([lambda_l1 * T.abs_(B).sum() for B in biases])
    # l1_penalty = l1_penalty_biases + l1_penalty_weights
    l1_penalty.name = 'l1_penalty'
    costreg = cost + l2_penalty + l1_penalty
    costreg.name = 'costreg'

    ########### DEFINE THE ALGORITHM #############
    #  algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Momentum())
    algorithm = GradientDescent(cost=costreg,
                                parameters=cg.parameters,
                                step_rule=Adam())

    ########### GET THE DATA #####################
    istest = 'test' in config
    train_stream, valid_stream, test_stream = get_stream(batch_size,
                                                         image_shape,
                                                         test=istest)

    ########### INITIALIZING EXTENSIONS ##########
    checkpoint = Checkpoint('models/best_' + label + '.tar')
    checkpoint.add_condition(
        ['after_epoch'], predicate=OnLogRecord('valid_error_rate_best_so_far'))
    #Adding a live plot with the bokeh server
    plot = Plot(
        label,
        channels=[
            ['train_error_rate', 'valid_error_rate'],
            ['valid_cost', 'valid_error_rate2'],
            # ['train_costreg','train_grad_norm']], #
            [
                'train_costreg', 'train_total_gradient_norm',
                'train_l2_penalty', 'train_l1_penalty'
            ]
        ],
        server_url="http://hades.calculquebec.ca:5042")

    grad_norm = aggregation.mean(algorithm.total_gradient_norm)
    grad_norm.name = 'grad_norm'

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
        DataStreamMonitoring([cost, error_rate, error_rate2],
                             valid_stream,
                             prefix="valid"),
        TrainingDataMonitoring([
            costreg, error_rate, error_rate2, grad_norm, l2_penalty, l1_penalty
        ],
                               prefix="train",
                               after_epoch=True),
        plot,
        ProgressBar(),
        Printing(),
        TrackTheBest('valid_error_rate', choose_best=min),  # keep the best
        checkpoint,  # save the best
        FinishIfNoImprovementAfter('valid_error_rate_best_so_far',
                                   epochs=4)  # early stopping
    ]
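    # How the early-stopping wiring above fits together: TrackTheBest writes
    # 'valid_error_rate_best_so_far' to the log whenever the validation error
    # improves, the Checkpoint condition saves the model on that record, and
    # FinishIfNoImprovementAfter stops the run once four epochs pass without
    # a new best.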
    model = Model(cost)
    main_loop = MainLoop(algorithm,
                         data_stream=train_stream,
                         model=model,
                         extensions=extensions)
    main_loop.run()