예제 #1
0
def main(config, tr_stream, dev_stream):
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']

    # Construct model
    logger.info('Building RNN encoder-decoder')

    encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['src_dgru_nhids'],
                                   config['enc_nhids'], config['src_dgru_depth'], config['bidir_encoder_depth'])

    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2, config['transition_depth'],
                      config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx, target_bos_idx)

    representation = encoder.apply(source_char_seq, source_sample_matrix, source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq, target_sample_matrix,
                        target_resample_matrix, target_char_aux, target_char_mask,
                        target_word_mask, target_prev_char_seq, target_prev_char_aux)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.decimator.bidir_w.prototype.recurrent.weights_init = Orthogonal()
    for layer_n in range(config['src_dgru_depth']):
        encoder.decimator.dgru.transitions[layer_n].weights_init = Orthogonal()
    for layer_n in range(config['bidir_encoder_depth']):
        encoder.children[1 + layer_n].prototype.recurrent.weights_init = Orthogonal()
    if config['trg_igru_depth'] == 1:
        decoder.interpolator.igru.weights_init = Orthogonal()
    else:
        for layer_n in range(config['trg_igru_depth']):
            decoder.interpolator.igru.transitions[layer_n].weights_init = Orthogonal()
    for layer_n in range(config['trg_dgru_depth']):
        decoder.interpolator.feedback_brick.dgru.transitions[layer_n].weights_init = Orthogonal()
    for layer_n in range(config['transition_depth']):
        decoder.transition.transitions[layer_n].weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)
    # Set up training algorithm
    logger.info("Initializing training algorithm")

    # You could use 1e-4 in Adam, however manually decay will be faster.
    # We decay it to 5e-4 when trained for about 30K
    # then decay it to 2e-4 when trained for about 90K
    # finally set it to 1e-4 when trained for about 180K
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 Adam(learning_rate=1e-3)]))

    # Set extensions
    logger.info("Initializing extensions")
    # Extensions
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    train_monitor = CostCurve([cost, gradient_norm, step_norm], config=config, after_batch=True,
                              before_first_epoch=True, prefix='tra')
    extensions = [
        train_monitor, Timing(),
        Printing(every_n_batches=config['print_freq']),
        FinishAfter(after_n_batches=config['finish_after']),
        CheckpointNMT(saveto=config['saveto'], dump_freq=config['dump_freq'], every_n_batches=config['save_freq'], )]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1:
        logger.info("Building sampling model")
        generated = decoder.generate(representation, source_word_mask)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[config['transition_depth']]))  # generated[transition_depth] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'], transition_depth=config['transition_depth'],
                    every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size']))


    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )

    # Train!
    main_loop.run()
        step_compute]),
        parameters=cg.parameters, cost=cost)
    extension_list = []
    extension_list.append(
        SharedVariableModifier(step_compute.learning_rate,
            extensions.decay_learning_rate,
            after_batch=False,
            every_n_batches=batches_per_epoch, ))
    extension_list.append(FinishAfter(after_n_epochs=100001))

    ## logging of test set performance
    extension_list.append(extensions.LogLikelihood(dpm, test_stream, scl,
        every_n_batches=args.ext_every_n*batches_per_epoch, before_training=False))

    ## set up logging
    extension_list.extend([Timing(), Printing()])
    model_dir = util.create_log_dir(args, dpm.name + '_' + args.dataset)
    model_save_name = os.path.join(model_dir, 'model.pkl')
    extension_list.append(
        Checkpoint(model_save_name, every_n_batches=args.ext_every_n*batches_per_epoch, save_separately=['log']))
    # generate plots
    extension_list.append(extensions.PlotMonitors(model_dir,
        every_n_batches=args.ext_every_n*batches_per_epoch, before_training=args.plot_before_training))
    test_batch = next(test_stream.get_epoch_iterator())[0]
    extension_list.append(extensions.PlotSamples(dpm, algorithm, test_batch, model_dir,
        every_n_batches=args.ext_every_n*batches_per_epoch, before_training=args.plot_before_training))
    internal_state = dpm.internal_state(features)
    train_batch = next(train_stream.get_epoch_iterator())[0]
    # extension_list.append(
    #     extensions.PlotInternalState(dpm, blocks_model, internal_state, features, train_batch, model_dir,
    #         every_n_batches=args.ext_every_n*batches_per_epoch, before_training=args.plot_before_training))
예제 #3
0
predict1 = NDimensionalSoftmax(name='Soft1').apply(out_soft1)
cost = CategoricalCrossEntropy(name='Cross1').apply(y.flatten(), predict1).copy(name='cost')
error = MisclassificationRate().apply(y.flatten(), predict1)

#Little trick to plot the error rate in two different plots (We can't use two time the same data in the plot for a unknow reason)
error_rate = error.copy(name='error_rate')
error_rate2 = error.copy(name='error_rate2')
cg = ComputationGraph([cost, error_rate])

########### GET THE DATA #####################
stream_train = ServerDataStream(('image_features','targets'), False, port=5512, hwm=40)
stream_valid = ServerDataStream(('image_features','targets'), False, port=5513, hwm=40)

########### DEFINE THE ALGORITHM #############
algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Momentum(learning_rate=0.01, momentum=0.9))
extensions = [Timing(),
              FinishAfter(after_n_epochs=num_epochs),
              TrainingDataMonitoring([cost, error_rate,
                aggregation.mean(algorithm.total_gradient_norm)],
                prefix="train",
                after_epoch=True),
              DataStreamMonitoring([cost, error_rate, error_rate2], stream_valid, prefix="valid", after_epoch=True),
              Checkpoint("catsVsDogs_fire2_new.pkl", after_epoch=True, save_separately=['log']),
              ProgressBar(),
              Printing()]

#Adding a live plot with the bokeh server
extensions.append(Plot(
    'CatsVsDogs_Fire2',
    channels=[['train_error_rate', 'valid_error_rate'],
              ['train_cost', 'valid_cost'],
예제 #4
0
파일: train.py 프로젝트: csmfindling/kaggle
    from blocks.extensions import Printing, Timing, FinishAfter
    from blocks.extensions.training import TrackTheBest
    from blocks.extensions.monitoring import TrainingDataMonitoring, DataStreamMonitoring
    from blocks.extensions.stopping import FinishIfNoImprovementAfter
    from blocks.extensions.saveload import Checkpoint
    from blocks_extras.extensions.plot import Plot
    import socket
    import datetime
    import time

    host_plot = 'http://tfjgeorge.com:5006'
    cost.name = 'cost'

    extensions = [
        Timing(),
        TrainingDataMonitoring([cost], after_epoch=True, prefix='train'),
        DataStreamMonitoring(variables=[cost], data_stream=valid_stream, prefix="valid"),
        Plot('%s %s' % (socket.gethostname(), datetime.datetime.now(), ),channels=[['train_cost', 'valid_cost']], after_epoch=True, server_url=host_plot),
        TrackTheBest('valid_cost'),
        Checkpoint('train', save_separately=["model","log"]),
        FinishIfNoImprovementAfter('valid_cost_best_so_far', epochs=5),
        #FinishAfter(after_n_epochs=100),
        Printing()
    ]


    # In[7]:

    from blocks.main_loop import MainLoop
예제 #5
0
                                     valid_stream,
                                     every_n_batches=15 * n_batches,
                                     prefix="valid")


def _is_nan(log):
    try:
        result = math.isnan(log.current_row['train_total_gradient_norm'])
        return result
    except:
        return False


extensions = extensions = [
    train_monitor, valid_monitor,
    Timing(every_n_batches=n_batches),
    Printing(every_n_batches=n_batches),
    WriteConditional(emit,
                     every_n_batches=n_batches,
                     save_name=save_dir + "samples/" + exp_name + ".png"),
    FinishAfter().add_condition(["after_batch"], _is_nan),
    ProgressBar(),
    Checkpoint(save_dir + "pkl/" + exp_name + ".pkl", after_epoch=True),
    SaveComputationGraph(emit),
    Plot(save_dir + exp_name + ".png", [['train_nll', 'valid_nll']],
         every_n_batches=5 * n_batches,
         email=False)
]

main_loop = MainLoop(model=model,
                     data_stream=data_stream,
예제 #6
0
def train_rnnrbm(train,
                 rnnrbm,
                 epochs=1000,
                 test=None,
                 bokeh=True,
                 load_path=None):
    cdk = theano.shared(10)
    lr = theano.shared(float32(0.004))

    cost, v_sample = rnnrbm.cost(examples=x, mask=x_mask, k=cdk)

    error_rate = MismulitclassificationRate().apply(x, v_sample[-1], x_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(x, v_sample[-1], x_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'rbm_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])
    step_rule = CompositeRule([
        RemoveNotFinite(),
        StepClipping(30.0),
        Adam(learning_rate=lr),
        StepClipping(6.0),
        RemoveNotFinite()
    ])  # Scale(0.01)
    gradients = dict(
        equizip(cg.parameters,
                T.grad(cost, cg.parameters, consider_constant=[v_sample])))
    algorithm = GradientDescent(step_rule=step_rule,
                                gradients=gradients,
                                cost=cost,
                                params=cg.parameters)
    algorithm.add_updates(cg.updates)
    extensions = [
        SharedVariableModifier(parameter=cdk,
                               function=lambda n, v: rnnrbm_cdk[n]
                               if rnnrbm_cdk.get(n) else v),
        SharedVariableModifier(parameter=lr,
                               function=lambda n, v: float32(0.78 * v)
                               if n % (200 * 5) == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [
                cost,
                error_rate,
                mistake_rate,
            ],  # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)],  #+ params,
            prefix="train",
            after_epoch=False,
            every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()
    ]
    if test is not None:
        extensions.append(
            DataStreamMonitoring([cost, error_rate, mistake_rate],
                                 data_stream=test,
                                 updates=cg.updates,
                                 prefix="test",
                                 after_epoch=False,
                                 every_n_batches=40))
    if bokeh:
        extensions.append(
            Plot(
                'Training RNN-RBM',
                channels=[
                    [
                        'train_error on note as a whole',
                        'train_single error within note',
                        'test_error on note as a whole',
                        'test_single error within note'
                    ],
                    ['train_final_cost'],
                    # ['train_total_gradient_norm'],
                ]))

    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions)
    return main_loop
예제 #7
0
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh,
                   test_tag, use_load_ext, load_log, fast_start):
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result

    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(batch=True,
                                   prediction=prediction,
                                   prediction_mask=prediction_mask)
    labels, = VariableFilter(applications=[recognizer.cost], name='labels')(cg)
    labels_mask, = VariableFilter(applications=[recognizer.cost],
                                  name='labels_mask')(cg)

    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(rename(gain_matrix.min(), 'min_gain'))
        primary_observables.append(rename(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(recognizer.labels.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(applications=[r.generator.readout.readout],
                               name="output_0")(cost_cg)
    bottom_output = VariableFilter(
        # We need name_regex instead of name because LookupTable calls itsoutput output_0
        applications=[r.bottom.apply],
        name_regex="output")(cost_cg)[-1]
    attended, = VariableFilter(applications=[r.generator.transition.apply],
                               name="attended")(cost_cg)
    attended_mask, = VariableFilter(applications=[
        r.generator.transition.apply
    ],
                                    name="attended_mask")(cost_cg)
    weights, = VariableFilter(applications=[r.generator.evaluate],
                              name="weights")(cost_cg)
    max_recording_length = rename(bottom_output.shape[0],
                                  "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = rename(attended_mask.shape[0],
                                      "max_attended_mask_length")
    max_attended_length = rename(attended.shape[0], "max_attended_length")
    max_num_phonemes = rename(labels.shape[0], "max_num_phonemes")
    min_energy = rename(energies.min(), "min_energy")
    max_energy = rename(energies.max(), "max_energy")
    mean_attended = rename(abs(attended).mean(), "mean_attended")
    mean_bottom_output = rename(
        abs(bottom_output).mean(), "mean_bottom_output")
    weights_penalty = rename(monotonicity_penalty(weights, labels_mask),
                             "weights_penalty")
    weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy")
    mask_density = rename(labels_mask.mean(), "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy, min_energy, max_energy,
        mean_attended, mean_bottom_output, batch_size, max_num_phonemes,
        mask_density
    ])
    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config.get('regularization', dict())
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [
            p for p in cg.parameters if p not in attention_params
        ]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost + reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (
            train_cost + reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2)

    train_cost = rename(train_cost, 'train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0)
                or (reg_config.get("decay", .0) > 0)):
            logger.error('using  adaptive noise with alignment weight panalty '
                         'or weight decay is probably stupid')
        train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise(
            cg,
            cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=Model(
                regularized_cg.outputs[0]).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise'))
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0], 'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] + regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]
        ]  # model prior variance

    model = Model(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that dies not have
        # shapred variables added by adaptive weight noise
        with open(params, 'r') as src:
            param_values = load_parameters(src)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, parameters[key].get_value().shape)
                                for key in sorted(parameters.keys())],
                               width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'],
                                   train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(
            AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [
                v for v in maxnorm_subjects
                if not isinstance(get_brick(v), LookupTable)
            ]
        logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat(
            [name for name, p in parameters.items() if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([
            name for name, p in parameters.items() if not p in maxnorm_subjects
        ]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)
        ]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if not 'train_cost' in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold
    ]
    for name, param in parameters.items():
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements**0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements**0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm,
        clipping.threshold, max_recording_length, max_attended_length,
        max_attended_mask_length
    ]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty
    ]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(
                    rename(aggregation.mean(var, batch_size),
                           'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(
                    rename(aggregation.mean(var, labels_mask.sum()),
                           'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(
            Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(
        TrainingDataMonitoring(primary_observables, after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average",
        every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables),
        data.get_stream("valid", shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)
    per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search'])
    per_monitoring = DataStreamMonitoring(
        [per],
        data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(before_first_epoch=True,
                                                     after_epoch=True)
    extensions += [track_the_best_cost, track_the_best_per]
    extensions.append(
        AdaptiveClipping(algorithm.total_gradient_norm.name,
                         clipping,
                         train_conf['gradient_threshold'],
                         decay_rate=0.998,
                         burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf.get('num_batches'),
                    after_n_epochs=train_conf.get('num_epochs')).add_condition(
                        ["after_batch"], _gradient_norm_is_none),
    ]
    channels = [
        # Plot 1: training and validation costs
        [
            average_monitoring.record_name(train_cost),
            validation.record_name(cost)
        ],
        # Plot 2: gradient norm,
        [
            average_monitoring.record_name(algorithm.total_gradient_norm),
            average_monitoring.record_name(clipping.threshold)
        ],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [
            average_monitoring._record_name('weights_entropy_per_label'),
            validation._record_name('weights_entropy_per_label')
        ],
        # Plot 5: training and validation monotonicity penalty
        [
            average_monitoring._record_name('weights_penalty_per_recording'),
            validation._record_name('weights_penalty_per_recording')
        ]
    ]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),
        ]
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start,
                   after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True).add_condition(
                       ['after_epoch'],
                       OnLogRecord(track_the_best_per.notification_name),
                       (root_path + "_best" + extension, )).add_condition(
                           ['after_epoch'],
                           OnLogRecord(track_the_best_cost.notification_name),
                           (root_path + "_best_ll" + extension, )),
        ProgressBar()
    ]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))
    if config['net']['criterion']['name'].startswith('mse'):
        extensions.append(
            LogInputsGains(labels, cg, recognizer.generator.readout.emitter,
                           data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name
            ]
        extensions.append(Patience(**patience_conf))

    extensions.append(
        Printing(every_n_batches=1, attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
def main(dataset_name='sklearn', num_epochs=100):
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')

    logistic_regressor = LogisticRegressor(input_dim=2)
    probs = logistic_regressor.get_probs(features=x)
    params = logistic_regressor.get_params()
    cost = logistic_regressor.get_cost(probs=probs, targets=y).mean()
    cost.name = 'cost'
    misclassification = logistic_regressor.get_misclassification(
        probs=probs, targets=y).mean()
    misclassification.name = 'misclassification'

    train_dataset, test_dataset = build_2d_datasets(dataset_name=dataset_name)

    algorithm = GradientDescent(cost=cost,
                                params=params,
                                step_rule=Scale(learning_rate=0.1))

    train_data_stream = ForceFloatX(
        data_stream=DataStream(dataset=train_dataset,
                               iteration_scheme=SequentialScheme(
                                   examples=train_dataset.num_examples,
                                   batch_size=40,
                               )))
    test_data_stream = ForceFloatX(
        data_stream=DataStream(dataset=test_dataset,
                               iteration_scheme=SequentialScheme(
                                   examples=test_dataset.num_examples,
                                   batch_size=400,
                               )))

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(
        DataStreamMonitoring([cost, misclassification],
                             test_data_stream,
                             prefix='test'))
    extensions.append(
        TrainingDataMonitoring([cost, misclassification],
                               prefix='train',
                               after_epoch=True))

    scoring_function = theano.function([x], probs)

    plotters = []
    plotters.append(
        Plotter(channels=[[
            'test_cost', 'test_misclassification', 'train_cost',
            'train_misclassification'
        ]],
                titles=['Costs']))
    score_train_stream = Mapping(data_stream=copy.deepcopy(train_data_stream),
                                 mapping=TupleMapping(scoring_function),
                                 add_sources=('scores', ))
    score_test_stream = Mapping(data_stream=copy.deepcopy(test_data_stream),
                                mapping=TupleMapping(scoring_function),
                                add_sources=('scores', ))
    plotters.append(
        Display2DData(data_streams=[
            score_train_stream,
            copy.deepcopy(train_data_stream), score_test_stream,
            copy.deepcopy(test_data_stream)
        ],
                      radius=0.01))

    extensions.append(
        PlotManager('2D examples',
                    plotters=plotters,
                    after_epoch=False,
                    every_n_epochs=50,
                    after_training=True))
    extensions.append(Printing())
    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
예제 #9
0
    def train(self):
        """ Setup and train the model """
        to_train = ComputationGraph([self.tagger.total_cost]).parameters
        logger.info('Found the following parameters: %s' % str(to_train))

        step_rule = Adam(learning_rate=self.p.lr)
        training_algorithm = GradientDescent(
            cost=self.tagger.total_cost, parameters=to_train, step_rule=step_rule,
            on_unused_sources='warn',
            theano_func_kwargs={'on_unused_input': 'warn'}
        )

        # TRACKED GRAPH NODES
        train_params = {
            'Train_Denoising_Cost': self.tagger.corr.denoising_cost,
        }
        if self.p.class_cost_x > 0:
            train_params['Train_Classification_Cost'] = self.tagger.corr.class_cost
            train_params['Train_Classification_Error'] = self.tagger.clean.class_error

        valid_params = {
            'Validation_Denoising_Cost': self.tagger.corr.denoising_cost,
        }
        if self.p.class_cost_x > 0:
            valid_params['Validation_Classification_Cost'] = self.tagger.corr.class_cost
            valid_params['Validation_Classification_Error'] = self.tagger.clean.class_error

        test_params = {
            'Test_AMI_Score': self.tagger.clean.ami_score,
            'Test_Denoising_Cost': self.tagger.corr.denoising_cost,
        }
        if self.p.class_cost_x > 0:
            test_params['Test_Classification_Cost'] = self.tagger.corr.class_cost
            test_params['Test_Classification_Error'] = self.tagger.clean.class_error

        short_prints = {
            "train": train_params,
            "valid": valid_params,
            "test": test_params,
        }

        main_loop = MainLoop(
            training_algorithm,
            # Datastream used for training
            self.streams['train'],
            model=Model(self.tagger.total_cost),
            extensions=[
                FinishAfter(after_n_epochs=self.p.num_epochs),
                SaveParams(self.p.get('save_freq', 0), self.tagger, self.save_dir, before_epoch=True),
                DataStreamMonitoring(
                    valid_params.values(),
                    self.streams['valid'],
                    prefix="valid"
                ),
                FinalTestMonitoring(
                    test_params.values(),
                    self.streams['train'],
                    {'valid': self.streams['valid'], 'test': self.streams['test']},
                    after_training=True
                ),
                TrainingDataMonitoring(
                    train_params.values(),
                    prefix="train", after_epoch=True
                ),
                SaveExpParams(self.p, self.save_dir, before_training=True),
                Timing(after_epoch=True),
                ShortPrinting(short_prints, after_epoch=True),
            ])
        logger.info('Running the main loop')
        main_loop.run()
예제 #10
0
def main(port_data):
    mlp_hiddens = [500]
    filter_sizes = [(3, 3), (3, 3)]
    feature_maps = [20, 20]
    pooling_sizes = [(3, 3), (2, 2)]
    save_to = "DvC.pkl"
    image_size = (128, 128)
    output_size = 2
    learningRate = 0.1
    num_epochs = 300
    num_batches = None
    if socket.gethostname() == 'tim-X550JX':
        host_plot = 'http://*****:*****@ %s' %
             ('CNN ', datetime.datetime.now(), socket.gethostname()),
             channels=[['train_error_rate', 'valid_error_rate'],
                       ['train_total_gradient_norm']],
             after_epoch=True,
             server_url=host_plot))

    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         stream_data_train,
                         model=model,
                         extensions=extensions)

    main_loop.run()
예제 #11
0
def main(name, dataset, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim, oldmodel, live_plotting):

    image_size, channels, data_train, data_valid, data_test = datasets.get_data(
        dataset)

    train_stream = Flatten(
        DataStream.default_stream(data_train,
                                  iteration_scheme=SequentialScheme(
                                      data_train.num_examples, batch_size)))
    valid_stream = Flatten(
        DataStream.default_stream(data_valid,
                                  iteration_scheme=SequentialScheme(
                                      data_valid.num_examples, batch_size)))
    test_stream = Flatten(
        DataStream.default_stream(data_test,
                                  iteration_scheme=SequentialScheme(
                                      data_test.num_examples, batch_size)))

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = channels * img_height * img_width

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # Configure attention mechanism
    if attention != "":
        read_N, write_N = attention.split(',')

        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * channels * read_N**2

        reader = AttentionReader(x_dim=x_dim,
                                 dec_dim=dec_dim,
                                 channels=channels,
                                 width=img_width,
                                 height=img_height,
                                 N=read_N,
                                 **inits)
        writer = AttentionWriter(input_dim=dec_dim,
                                 output_dim=x_dim,
                                 channels=channels,
                                 width=img_width,
                                 height=img_height,
                                 N=write_N,
                                 **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

        attention_tag = "full"

    #----------------------------------------------------------------------

    if name is None:
        name = dataset

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation. E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = name + "-" + time.strftime("%Y%m%d-%H%M%S")
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print("               dataset: %s" % dataset)
    print("          subdirectory: %s" % subdir)
    print("         learning rate: %g" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print("            batch size: %d" % batch_size)
    print("                epochs: %d" % epochs)
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc",
                      **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec",
                      **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(n_iter,
                     reader=reader,
                     encoder_mlp=encoder_mlp,
                     encoder_rnn=encoder_rnn,
                     sampler=q_sampler,
                     decoder_mlp=decoder_mlp,
                     decoder_rnn=decoder_rnn,
                     writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    x_recons, kl_terms = draw.reconstruct(x)

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        parameters=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    plotting_extensions = []
    if live_plotting:
        plotting_extensions = [Plot(name, channels=plot_channels)]

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(
                train_monitors, prefix="train", after_epoch=True),
            #            DataStreamMonitoring(
            #                monitors,
            #                valid_stream,
            ##                updates=scan_updates,
            #                prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
                #                updates=scan_updates,
                prefix="test"),
            #Checkpoint(name, before_training=False, after_epoch=True, save_separately=['log', 'model']),
            Checkpoint("{}/{}".format(subdir, name),
                       save_main_loop=False,
                       before_training=True,
                       after_epoch=True,
                       save_separately=['log', 'model']),
            SampleCheckpoint(image_size=image_size[0],
                             channels=channels,
                             save_subdir=subdir,
                             before_training=True,
                             after_epoch=True),
            ProgressBar(),
            Printing()
        ] + plotting_extensions)

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_parameter_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
예제 #12
0
def run(batch_size, classifier, oldmodel, monitor_every, checkpoint_every, final_epoch,
        dataset, color_convert, image_size, net_depth, allowed, stretch):

    streams = create_custom_streams(filename=dataset,
                                    training_batch_size=batch_size,
                                    monitoring_batch_size=batch_size,
                                    include_targets=True,
                                    color_convert=color_convert,
                                    allowed=allowed,
                                    stretch=stretch)

    main_loop_stream = streams[0]
    train_monitor_stream = streams[1]
    valid_monitor_stream = streams[2]

    cg, bn_dropout_cg = create_training_computation_graphs(image_size, net_depth)

    model = Model(bn_dropout_cg.outputs[0])

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    pop_updates = get_batch_normalization_updates(bn_dropout_cg)
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_dropout_cg.outputs[0],
                                parameters=bn_dropout_cg.parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    cost = bn_dropout_cg.outputs[0]
    cost.name = 'cost'
    train_monitoring = DataStreamMonitoring(
        [cost], train_monitor_stream, prefix="train",
        before_first_epoch=False, after_epoch=False, after_training=True,
        updates=extra_updates)

    cost, accuracy = cg.outputs
    cost.name = 'cost'
    accuracy.name = 'accuracy'
    monitored_quantities = [cost, accuracy]
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities, valid_monitor_stream, prefix="valid",
        before_first_epoch=True, after_epoch=False,
        every_n_epochs=monitor_every)

    # Prepare checkpoint
    checkpoint = Checkpoint(classifier, every_n_epochs=checkpoint_every,
                            use_cpickle=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=final_epoch), train_monitoring,
                  valid_monitoring, checkpoint, Printing(), ProgressBar()]
    main_loop = MainLoop(model=model, data_stream=main_loop_stream,
                         algorithm=algorithm, extensions=extensions)

    if oldmodel is not None:
        print("Initializing parameters with old model {}".format(oldmodel))
        with open(oldmodel, 'rb') as src:
            saved_model = load(src)
            main_loop.model.set_parameter_values(
                saved_model.model.get_parameter_values())
            del saved_model

    main_loop.run()
예제 #13
0
def build_and_run(label, config):
    ############## CREATE THE NETWORK ###############
    #Define the parameters
    num_epochs, num_batches, num_channels, image_shape, filter_size, num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size, activation, mlp_activation = config[
        'num_epochs'], config['num_batches'], config['num_channels'], config[
            'image_shape'], config['filter_size'], config[
                'num_filter'], config['pooling_sizes'], config[
                    'mlp_hiddens'], config['output_size'], config[
                        'batch_size'], config['activation'], config[
                            'mlp_activation']
    #    print(num_epochs, num_channels, image_shape, filter_size, num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size, activation, mlp_activation)
    lambda_l1 = 0.000025
    lambda_l2 = 0.000025

    print("Building model")
    #Create the symbolics variable
    x = T.tensor4('image_features')
    y = T.lmatrix('targets')

    #Get the parameters
    conv_parameters = zip(filter_size, num_filter)

    #Create the convolutions layers
    conv_layers = list(
        interleave([(Convolutional(filter_size=filter_size,
                                   num_filters=num_filter,
                                   name='conv_{}'.format(i))
                     for i, (filter_size,
                             num_filter) in enumerate(conv_parameters)),
                    (activation),
                    (MaxPooling(size, name='pool_{}'.format(i))
                     for i, size in enumerate(pooling_sizes))]))
    #    (AveragePooling(size, name='pool_{}'.format(i)) for i, size in enumerate(pooling_sizes))]))

    #Create the sequence
    conv_sequence = ConvolutionalSequence(conv_layers,
                                          num_channels,
                                          image_size=image_shape,
                                          weights_init=Uniform(width=0.2),
                                          biases_init=Constant(0.))
    #Initialize the convnet
    conv_sequence.initialize()
    #Add the MLP
    top_mlp_dims = [np.prod(conv_sequence.get_dim('output'))
                    ] + mlp_hiddens + [output_size]
    out = Flattener().apply(conv_sequence.apply(x))
    mlp = MLP(mlp_activation,
              top_mlp_dims,
              weights_init=Uniform(0, 0.2),
              biases_init=Constant(0.))
    #Initialisze the MLP
    mlp.initialize()
    #Get the output
    predict = mlp.apply(out)

    cost = CategoricalCrossEntropy().apply(y.flatten(),
                                           predict).copy(name='cost')
    error = MisclassificationRate().apply(y.flatten(), predict)

    #Little trick to plot the error rate in two different plots (We can't use two time the same data in the plot for a unknow reason)
    error_rate = error.copy(name='error_rate')
    error_rate2 = error.copy(name='error_rate2')

    ########### REGULARIZATION ##################
    cg = ComputationGraph([cost])
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    biases = VariableFilter(roles=[BIAS])(cg.variables)
    # # l2_penalty_weights = T.sum([i*lambda_l2/len(weights) * (W ** 2).sum() for i,W in enumerate(weights)]) # Gradually increase penalty for layer
    l2_penalty = T.sum([
        lambda_l2 * (W**2).sum() for i, W in enumerate(weights + biases)
    ])  # Gradually increase penalty for layer
    # # #l2_penalty_bias = T.sum([lambda_l2*(B **2).sum() for B in biases])
    # # #l2_penalty = l2_penalty_weights + l2_penalty_bias
    l2_penalty.name = 'l2_penalty'
    l1_penalty = T.sum([lambda_l1 * T.abs_(z).sum() for z in weights + biases])
    #  l1_penalty_weights = T.sum([i*lambda_l1/len(weights) * T.abs_(W).sum() for i,W in enumerate(weights)]) # Gradually increase penalty for layer
    #  l1_penalty_biases = T.sum([lambda_l1 * T.abs_(B).sum() for B in biases])
    #  l1_penalty = l1_penalty_biases + l1_penalty_weights
    l1_penalty.name = 'l1_penalty'
    costreg = cost + l2_penalty + l1_penalty
    costreg.name = 'costreg'

    ########### DEFINE THE ALGORITHM #############
    #  algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Momentum())
    algorithm = GradientDescent(cost=costreg,
                                parameters=cg.parameters,
                                step_rule=Adam())

    ########### GET THE DATA #####################
    istest = 'test' in config.keys()
    train_stream, valid_stream, test_stream = get_stream(batch_size,
                                                         image_shape,
                                                         test=istest)

    ########### INITIALIZING EXTENSIONS ##########
    checkpoint = Checkpoint('models/best_' + label + '.tar')
    checkpoint.add_condition(
        ['after_epoch'], predicate=OnLogRecord('valid_error_rate_best_so_far'))
    #Adding a live plot with the bokeh server
    plot = Plot(
        label,
        channels=[
            ['train_error_rate', 'valid_error_rate'],
            ['valid_cost', 'valid_error_rate2'],
            # ['train_costreg','train_grad_norm']], #
            [
                'train_costreg', 'train_total_gradient_norm',
                'train_l2_penalty', 'train_l1_penalty'
            ]
        ],
        server_url="http://hades.calculquebec.ca:5042")

    grad_norm = aggregation.mean(algorithm.total_gradient_norm)
    grad_norm.name = 'grad_norm'

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
        DataStreamMonitoring([cost, error_rate, error_rate2],
                             valid_stream,
                             prefix="valid"),
        TrainingDataMonitoring([
            costreg, error_rate, error_rate2, grad_norm, l2_penalty, l1_penalty
        ],
                               prefix="train",
                               after_epoch=True),
        plot,
        ProgressBar(),
        Printing(),
        TrackTheBest('valid_error_rate', min),  #Keep best
        checkpoint,  #Save best
        FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=4)
    ]  # Early-stopping
    model = Model(cost)
    main_loop = MainLoop(algorithm,
                         data_stream=train_stream,
                         model=model,
                         extensions=extensions)
    main_loop.run()
예제 #14
0
def construct_main_loop(name, task_name, patch_shape, batch_size,
                        n_spatial_dims, n_patches, n_epochs, learning_rate,
                        hyperparameters, **kwargs):
    name = "%s_%s" % (name, task_name)
    hyperparameters["name"] = name

    task = get_task(**hyperparameters)
    hyperparameters["n_channels"] = task.n_channels

    theano.config.compute_test_value = "warn"

    x, x_shape, y = task.get_variables()

    ram = construct_model(task=task, **hyperparameters)
    ram.initialize()

    states = []
    states.append(ram.compute_initial_state(x, x_shape, as_dict=True))
    n_steps = n_patches - 1
    for i in xrange(n_steps):
        states.append(ram.apply(x, x_shape, as_dict=True, **states[-1]))

    emitter = task.get_emitter(input_dim=ram.get_dim("states"),
                               **hyperparameters)
    emitter.initialize()
    cost = emitter.cost(states[-1]["states"], y, n_patches)
    cost.name = "cost"

    print "setting up main loop..."
    graph = ComputationGraph(cost)
    uselessflunky = Model(cost)
    algorithm = GradientDescent(cost=cost,
                                parameters=graph.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(1.),
                                    Adam(learning_rate=learning_rate)
                                ]))
    monitors = construct_monitors(x=x,
                                  x_shape=x_shape,
                                  y=y,
                                  cost=cost,
                                  algorithm=algorithm,
                                  task=task,
                                  model=uselessflunky,
                                  ram=ram,
                                  graph=graph,
                                  **hyperparameters)
    main_loop = MainLoop(
        data_stream=task.get_stream("train"),
        algorithm=algorithm,
        extensions=(
            monitors + [
                FinishAfter(after_n_epochs=n_epochs),
                DumpMinimum(name + '_best', channel_name='valid_error_rate'),
                Dump(name + '_dump', every_n_epochs=10),
                #Checkpoint(name+'_checkpoint.pkl', every_n_epochs=10, on_interrupt=False),
                ProgressBar(),
                Timing(),
                Printing(),
                PrintingTo(name + "_log")
            ]),
        model=uselessflunky)
    return main_loop
예제 #15
0
def main(save_to,
         num_epochs,
         feature_maps=None,
         mlp_hiddens=None,
         conv_sizes=None,
         pool_sizes=None,
         batch_size=500,
         num_batches=None):
    if feature_maps is None:
        feature_maps = [20, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations,
                    1,
                    image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info(
        "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        logging.info("Layer {} ({}) dim: {} {} {}".format(
            i, layer.__class__.__name__, *layer.get_dim('output')))

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(),
                                           probs).copy(name='cost')
    error_rate = MisclassificationRate().apply(y.flatten(),
                                               probs).copy(name='error_rate')

    cg = ComputationGraph([cost, error_rate])

    mnist_train = MNIST(("train", ))
    mnist_train_stream = DataStream.default_stream(
        mnist_train,
        iteration_scheme=ShuffledScheme(mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test", ))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size))

    # Train with simple SGD
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
        DataStreamMonitoring([cost, error_rate],
                             mnist_test_stream,
                             prefix="test"),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        ProgressBar(),
        Printing()
    ]

    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         mnist_train_stream,
                         model=model,
                         extensions=extensions)

    main_loop.run()
def main(config, tr_stream, dev_stream, use_bokeh=False):

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    initial_context = tensor.matrix('initial_context')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])

    decoder = InitialContextDecoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, config['context_dim'])
    
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask, initial_context)

    cost.name = 'decoder_cost'

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_parameters().values()
        enc_params += Selector(encoder.fwd_fork).get_parameters().values()
        enc_params += Selector(encoder.back_fork).get_parameters().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_parameters().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_parameters().values()
        dec_params += Selector(decoder.transition.initial_transformer).get_parameters().values()
        cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff'])

    # TODO: weight noise for recurrent params isn't currently implemented -- see config['weight_noise_rec']
    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions

    # TODO: add checking for existing model and loading
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]

    # Create the theano variables that we need for the sampling graph
    sampling_input = tensor.lmatrix('input')
    sampling_context = tensor.matrix('context_input')
    
    # WORKING: change this part to account for the new initial context for decoder
    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        
        generated = decoder.generate(sampling_input, sampling_representation, sampling_context)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    # TODO: currently commented because we need to modify the sampler to use the contexts
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab=source_vocab,
                    trg_vocab=target_vocab,
                    src_vocab_size=config['src_vocab_size'],
                   ))


    # TODO: add sampling_context to BleuValidator and Sampler
    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input, sampling_context, samples=samples, config=config,
                          model=search_model, data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'], channels=[['decoder_cost', 'validation_set_bleu_score']],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # if there is dropout or random noise, we need to use the output of the modified graph
    if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0:
        algorithm = GradientDescent(
            cost=cg.outputs[0], parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()])
        )
    else:
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()])
        )

    # enrich the logged information
    extensions.append(
        Timing(every_n_batches=100)
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )

    # Train!
    main_loop.run()
def main(num_epochs=100):
    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    num_filters = 64
    layers = [ConvolutionalLayer(filter_size=(8, 8), num_filters=num_filters,
                                 num_channels=3, step=(1, 1),
                                 border_mode='valid'),
              MaxPooling(pooling_size=(2, 2)),
              ConvolutionalLayer(filter_size=(5, 5), num_filters=num_filters,
                                 num_channels=num_filters, step=(1, 1),
                                 border_mode='valid')]
    convnet = ConvolutionalNetwork(layers=layers)
    output_shape = convnet.get_output_shape((32, 32))
    convnet.set_input_shape((32, 32))
    fc_net = NeuralSoftmax(input_dim=numpy.prod(output_shape) * num_filters,
                           n_classes=10, n_hidden=[500])

    convnet_features = convnet.apply(x)
    probs = fc_net.get_probs(features=convnet_features.flatten(2))
    params = convnet.get_params() + fc_net.get_params()
    weights = convnet.get_weights()
    cost = fc_net.get_cost(probs=probs, targets=y).mean()
    cost.name = 'cost'
    misclassification = fc_net.get_misclassification(
        probs=probs, targets=y
    ).mean()
    misclassification.name = 'misclassification'

    train_dataset = CIFAR10('train', flatten=False)
    test_dataset = CIFAR10('test', flatten=False)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=Momentum(learning_rate=.01,
                           momentum=0.1))

    train_data_stream = Mapping(
        data_stream=ForceFloatX(
            data_stream=DataStream(
                dataset=train_dataset,
                iteration_scheme=ShuffledScheme(
                    examples=range(50000),
                    batch_size=100,
                )
            )
        ),
        mapping=Rescale(scale=1/127.5, shift=-127.5)
    )
    valid_data_stream = Mapping(
        data_stream=ForceFloatX(
            data_stream=DataStream(
                dataset=train_dataset,
                iteration_scheme=SequentialScheme(
                    examples=range(40000, 50000),
                    batch_size=1000,
                )
            )
        ),
        mapping=Rescale(scale=1/127.5, shift=-127.5)
    )
    test_data_stream = Mapping(
        data_stream=ForceFloatX(
            data_stream=DataStream(
                dataset=test_dataset,
                iteration_scheme=SequentialScheme(
                    examples=test_dataset.num_examples,
                    batch_size=100,
                )
            )
        ),
        mapping=Rescale(scale=1/127.5, shift=-127.5)
    )

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        test_data_stream,
        prefix='test'))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        valid_data_stream,
        prefix='valid'))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True))

    plotters = []
    plotters.append(Plotter(
        channels=[['test_cost', 'test_misclassification',
                   'train_cost', 'train_misclassification']],
        titles=['Costs']))
    display_train = ImageDataStreamDisplay(
        data_stream=copy.deepcopy(train_data_stream),
        image_shape=(3, 32, 32),
        axes=('c', 0, 1),
        shift=0,
        rescale=1.,
    )
    weight_display = WeightDisplay(
        weights=weights,
        transpose=(0, 1, 2, 3),
        image_shape=(3, 8, 8),
        axes=('c', 0, 1),
        shift=-0.5,
        rescale=2.,
    )

    # Feature maps
    one_example_train_data_stream = Mapping(
        data_stream=ForceFloatX(
            data_stream=DataStream(
                dataset=train_dataset,
                iteration_scheme=ShuffledScheme(
                    examples=train_dataset.num_examples,
                    batch_size=1,
                )
            )
        ),
        mapping=Rescale(scale=1/127.5, shift=-127.5)
    )
    displayable_convnet_features = convnet_features.dimshuffle((1, 0, 2, 3))
    convnet_features_normalizer = abs(
        displayable_convnet_features
    ).max(axis=(1, 2, 3))
    displayable_convnet_features = displayable_convnet_features \
        / convnet_features_normalizer.dimshuffle((0, 'x', 'x', 'x'))
    get_displayable_convnet_features = theano.function(
        [x], displayable_convnet_features
    )
    display_feature_maps_data_stream = Mapping(
        data_stream=one_example_train_data_stream,
        mapping=TupleMapping(get_displayable_convnet_features,
                             same_len_out=True)
    )
    display_feature_maps = ImageDataStreamDisplay(
        data_stream=display_feature_maps_data_stream,
        image_shape=(1, ) + output_shape,
        axes=('c', 0, 1),
        shift=0,
        rescale=1.,
    )

    # Saliency map
    displayable_saliency_map = tensor.grad(cost, x)
    saliency_map_normalizer = abs(
        displayable_saliency_map
    ).max(axis=(1, 2, 3))
    displayable_saliency_map = displayable_saliency_map \
        / saliency_map_normalizer.dimshuffle((0, 'x', 'x', 'x'))
    get_displayable_saliency_map = theano.function(
        [x, y], displayable_saliency_map
    )

    display_saliency_map_data_stream = Mapping(
        data_stream=copy.deepcopy(train_data_stream),
        mapping=TupleMapping(get_displayable_saliency_map,
                             same_len_out=True,
                             same_len_in=True)
    )
    display_saliency_map = ImageDataStreamDisplay(
        data_stream=display_saliency_map_data_stream,
        image_shape=(3, 32, 32),
        axes=('c', 0, 1),
        shift=0,
        rescale=1.,
    )

    # Deconvolution
    x_repeated = x.repeat(num_filters, axis=0)
    convnet_features_repeated = convnet.apply(x_repeated)
    convnet_features_selected = convnet_features_repeated \
        * tensor.eye(num_filters).repeat(
            x.shape[0], axis=0
        ).dimshuffle((0, 1, 'x', 'x'))
    displayable_deconvolution = tensor.grad(misclassification, x_repeated,
                                            known_grads={
                                                convnet_features_selected:
                                                    convnet_features_selected
                                            })
    deconvolution_normalizer = abs(
        displayable_deconvolution
    ).max(axis=(1, 2, 3))
    displayable_deconvolution = displayable_deconvolution \
        / deconvolution_normalizer.dimshuffle((0, 'x', 'x', 'x'))
    get_displayable_deconvolution = theano.function(
        [x], displayable_deconvolution
    )
    display_deconvolution_data_stream = Mapping(
        data_stream=one_example_train_data_stream,
        mapping=TupleMapping(get_displayable_deconvolution,
                             same_len_out=True)
    )
    display_deconvolution = ImageDataStreamDisplay(
        data_stream=display_deconvolution_data_stream,
        image_shape=(3, 32, 32),
        axes=('c', 0, 1),
        shift=0,
        rescale=1.,
    )

    images_displayer = DisplayImage(
        image_getters=[display_train, weight_display, display_feature_maps,
                       display_saliency_map, display_deconvolution],
        titles=['Training examples', 'Convolutional weights', 'Feature maps',
                'Sensitivity', 'Deconvolution']
    )
    plotters.append(images_displayer)

    extensions.append(PlotManager('CIFAR-10 convnet examples',
                                  plotters=plotters,
                                  after_epoch=False,
                                  every_n_epochs=10,
                                  after_training=True))
    extensions.append(Printing())
    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
예제 #18
0
def main(name, epochs, batch_size, learning_rate, attention, n_iter, enc_dim,
         dec_dim, z_dim):

    x_dim = 28 * 28
    img_height, img_width = (28, 28)

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    if attention != "":
        read_N, write_N = attention.split(',')

        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * read_N**2

        reader = AttentionReader(x_dim=x_dim,
                                 dec_dim=dec_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=read_N,
                                 **inits)
        writer = AttentionWriter(input_dim=dec_dim,
                                 output_dim=x_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=write_N,
                                 **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

        attention_tag = "full"

    #----------------------------------------------------------------------

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation. E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)
    name = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (name, attention_tag, n_iter,
                                               enc_dim, dec_dim, z_dim, lr_str)

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc",
                      **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec",
                      **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(n_iter,
                     reader=reader,
                     encoder_mlp=encoder_mlp,
                     encoder_rnn=encoder_rnn,
                     sampler=q_sampler,
                     decoder_mlp=decoder_mlp,
                     decoder_rnn=decoder_rnn,
                     writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    #x_recons = 1. + x
    x_recons, kl_terms = draw.reconstruct(x)
    #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100)
    #x_recons = x_recons[-1,:,:]

    #samples = draw.sample(100)
    #x_recons = samples[-1, :, :]
    #x_recons = samples[-1, :, :]

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    #algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_valid = BinarizedMNIST("valid", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])

    train_stream = DataStream(mnist_train,
                              iteration_scheme=SequentialScheme(
                                  mnist_train.num_examples, batch_size))
    valid_stream = DataStream(mnist_valid,
                              iteration_scheme=SequentialScheme(
                                  mnist_valid.num_examples, batch_size))
    test_stream = DataStream(mnist_test,
                             iteration_scheme=SequentialScheme(
                                 mnist_test.num_examples, batch_size))

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_every_epoch=True),
            #            DataStreamMonitoring(
            #                monitors,
            #                valid_stream,
            ##                updates=scan_updates,
            #                prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
                #                updates=scan_updates,
                prefix="test"),
            Checkpoint(name + ".pkl", save_separately=['log', 'model']),
            #Dump(name),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()
예제 #19
0
    def training(self,
                 fea2obj,
                 batch_size,
                 learning_rate=0.005,
                 steprule='adagrad',
                 wait_epochs=5,
                 kl_weight_init=None,
                 klw_ep=50,
                 klw_inc_rate=0,
                 num_epochs=None):
        networkfile = self._config['net']

        n_epochs = num_epochs or int(self._config['nepochs'])
        reg_weight = float(self._config['loss_weight'])
        reg_type = self._config['loss_reg']
        numtrain = int(
            self._config['num_train']) if 'num_train' in self._config else None
        train_stream, num_samples_train = get_comb_stream(
            fea2obj, 'train', batch_size, shuffle=True, num_examples=numtrain)
        dev_stream, num_samples_dev = get_comb_stream(fea2obj,
                                                      'dev',
                                                      batch_size=None,
                                                      shuffle=False)
        logger.info('sources: %s -- number of train/dev samples: %d/%d',
                    train_stream.sources, num_samples_train, num_samples_dev)

        t2idx = fea2obj['targets'].t2idx
        klw_init = kl_weight_init or float(
            self._config['kld_weight']) if 'kld_weight' in self._config else 1
        logger.info('kl_weight_init: %d', klw_init)
        kl_weight = shared_floatx(klw_init, 'kl_weight')
        entropy_weight = shared_floatx(1., 'entropy_weight')

        cost, p_at_1, _, KLD, logpy_xz, pat1_recog, misclassify_rate = build_model_new(
            fea2obj, len(t2idx), self._config, kl_weight, entropy_weight)

        cg = ComputationGraph(cost)

        weights = VariableFilter(roles=[WEIGHT])(cg.parameters)
        logger.info('Model weights are: %s', weights)
        if 'L2' in reg_type:
            cost += reg_weight * l2_norm(weights)
            logger.info('applying %s with weight: %f ', reg_type, reg_weight)

        dropout = -0.1
        if dropout > 0:
            cg = apply_dropout(cg, weights, dropout)
            cost = cg.outputs[0]

        cost.name = 'cost'
        logger.info('Our Algorithm is : %s, and learning_rate: %f', steprule,
                    learning_rate)
        if 'adagrad' in steprule:
            cnf_step_rule = AdaGrad(learning_rate)
        elif 'adadelta' in steprule:
            cnf_step_rule = AdaDelta(decay_rate=0.95)
        elif 'decay' in steprule:
            cnf_step_rule = RMSProp(learning_rate=learning_rate,
                                    decay_rate=0.90)
            cnf_step_rule = CompositeRule([cnf_step_rule, StepClipping(1)])
        elif 'momentum' in steprule:
            cnf_step_rule = Momentum(learning_rate=learning_rate, momentum=0.9)
        elif 'adam' in steprule:
            cnf_step_rule = Adam(learning_rate=learning_rate)
        else:
            logger.info('The steprule param is wrong! which is: %s', steprule)

        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=cnf_step_rule,
                                    on_unused_sources='warn')
        #algorithm.add_updates(updates)
        gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
        step_norm = aggregation.mean(algorithm.total_step_norm)
        monitored_vars = [
            cost, gradient_norm, step_norm, p_at_1, KLD, logpy_xz, kl_weight,
            pat1_recog
        ]
        train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                               after_batch=True,
                                               before_first_epoch=True,
                                               prefix='tra')

        dev_monitor = DataStreamMonitoring(variables=[
            cost, p_at_1, KLD, logpy_xz, pat1_recog, misclassify_rate
        ],
                                           after_epoch=True,
                                           before_first_epoch=True,
                                           data_stream=dev_stream,
                                           prefix="dev")

        extensions = [
            dev_monitor,
            train_monitor,
            Timing(),
            TrackTheBest('dev_cost'),
            FinishIfNoImprovementAfter('dev_cost_best_so_far',
                                       epochs=wait_epochs),
            Printing(after_batch=False),  #, ProgressBar()
            FinishAfter(after_n_epochs=n_epochs),
            saveload.Load(networkfile + '.toload.pkl'),
        ] + track_best('dev_cost', networkfile + '.best.pkl')

        #extensions.append(SharedVariableModifier(kl_weight,
        #                                          lambda n, klw: numpy.cast[theano.config.floatX] (klw_inc_rate + klw), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))
        #         extensions.append(SharedVariableModifier(entropy_weight,
        #                                                   lambda n, crw: numpy.cast[theano.config.floatX](crw - klw_inc_rate), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))

        logger.info('number of parameters in the model: %d',
                    tensor.sum([p.size for p in cg.parameters]).eval())
        logger.info('Lookup table sizes: %s',
                    [p.size.eval() for p in cg.parameters if 'lt' in p.name])

        main_loop = MainLoop(data_stream=train_stream,
                             algorithm=algorithm,
                             model=Model(cost),
                             extensions=extensions)
        main_loop.run()
예제 #20
0
def main():
    x = T.tensor3('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')

    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    #embedding_size = 50
    #glove_version = "vectors.6B.50d.txt"

    o = x.sum(axis=1) + m.mean() * 0

    score_layer = Linear(
            input_dim = 300,
            output_dim = 1,
            weights_init = IsotropicGaussian(std=0.02),
            biases_init = Constant(0.),
            name="linear2")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    # =================
    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
            cost = cg.outputs[0],
            params=params,
            step_rule = CompositeRule([
                StepClipping(threshold=4),
                AdaM(),
                ])

            )

    # ========
    print "setting up data"
    ports = {
            'gpu0_train' : 5557,
            'gpu0_test' : 5558,
            'gpu1_train' : 5559,
            'gpu1_test' : 5560,
            }

    #batch_size = 16
    batch_size = 16
    def start_server(port, which_set):
        fuel.server.logger.setLevel('WARN')
        dataset = IMDBText(which_set, sorted=True)

        n_train = dataset.num_examples
        #scheme = ShuffledScheme(examples=n_train, batch_size=batch_size)
        scheme = BatchwiseShuffledScheme(examples=n_train, batch_size=batch_size)

        stream = DataStream(
                dataset=dataset,
                iteration_scheme=scheme)
        print "loading glove"
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(
                data_stream=glove,
                mask_sources=('features',)
                )

        fuel.server.start_server(padded, port=port, hwm=20)

    train_port = ports[theano.config.device + '_train']
    train_p = Process(target=start_server, args=(train_port, 'train'))
    train_p.start()

    test_port = ports[theano.config.device + '_test']
    test_p = Process(target=start_server, args=(test_port, 'test'))
    test_p.start()

    train_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=train_port)
    test_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=test_port)

    print "setting up model"

    n_examples = 25000
    #======
    model = Model(cost)
    extensions = []
    extensions.append(EpochProgress(batch_per_epoch=n_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [
            cost,
            misclassification,
            ],
        prefix='train',
        after_epoch=True
        ))

    #extensions.append(DataStreamMonitoring(
        #[cost, misclassification],
        #data_stream=test_stream,
        #prefix='test',
        #after_epoch=True
        #))
    extensions.append(Timing())
    extensions.append(Printing())

    extensions.append(Plot(
        theano.config.device+"_result",
        channels=[['train_cost']],
        after_epoch=True
        ))


    main_loop = MainLoop(
            model=model,
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=extensions)
    main_loop.run()
예제 #21
0
    def run_pretrain(model, hyper_params, cost, train_data, valid_data=None, extra_costs=None):
        """
        generic training method for neural networks;
        works with any network structure
        :return:
        """
        from fuel.streams import DataStream
        from fuel.schemes import SequentialScheme, ShuffledScheme
        from blocks.filter import VariableFilter
        from blocks.graph import ComputationGraph
        from blocks.roles import WEIGHT
        from blocks.algorithms import GradientDescent, Adam, RMSProp, Scale
        from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar
        from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring
        from blocks.extensions.predicates import OnLogRecord
        from blocks.monitoring import aggregation
        from blocks.main_loop import MainLoop
        from blocks.extensions.training import TrackTheBest
        from deepthought.extensions.parameters import BestParams    

        if extra_costs is None:
            extra_costs = []
        
        cg = ComputationGraph([cost])

        # TODO: more hyper-params for regularization
        # L1 regularization
        if hyper_params['l1wdecay'] > 0:
            weights = VariableFilter(roles=[WEIGHT])(cg.variables)
            cost = cost + hyper_params['l1wdecay'] * sum([abs(W).sum() for W in weights])

        cost.name = 'cost'

        # set up step_rule
        if hyper_params['step_rule'] == 'Adam':
            step_rule = Adam(learning_rate=hyper_params['learning_rate'])
        elif hyper_params['step_rule'] == 'RMSProp':
            step_rule = RMSProp(learning_rate=hyper_params['learning_rate']) #, decay_rate=0.9, max_scaling=1e5)
        else:
            step_rule = Scale(learning_rate=hyper_params['learning_rate'])
        
        algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=step_rule)

        if 'blocks_print_variable_names' in hyper_params and hyper_params['blocks_print_variable_names']:
            print 'cg.variables:', cg.variables

        train_monitoring_vars = [cost] + extra_costs + [aggregation.mean(algorithm.total_gradient_norm)]
        for var_name in hyper_params['blocks_extensions_train_monitoring_channels']:
            for v in cg.variables:
                if v.name == var_name:
                    print 'Monitoring variable:', v
                    train_monitoring_vars.append(v)

        # default extensions
        extensions = [Timing(),
                      FinishAfter(after_n_epochs=hyper_params['max_epochs']),
                      TrainingDataMonitoring(
                          train_monitoring_vars,
                          suffix="train",
                          after_epoch=True)
                      ]

        # additional stuff if validation set is used
        if valid_data is not None:
            valid_monitoring_vars = [cost] + extra_costs
            for var_name in hyper_params['blocks_extensions_valid_monitoring_channels']:
                for v in cg.variables:
                    if v.name == var_name:
                        print 'Monitoring variable:', v
                        valid_monitoring_vars.append(v)

            extensions.append(
                DataStreamMonitoring(
                    valid_monitoring_vars,
                    DataStream.default_stream(
                        valid_data,
                        iteration_scheme=SequentialScheme(
                            valid_data.num_examples, hyper_params['batch_size'])),
                    suffix="valid"))

            best_channel = 'cost_valid'
            print '#train:', train_data.num_examples, '#valid:', valid_data.num_examples
        else:
            best_channel = 'cost_train'
            print '#train:', train_data.num_examples

        # tracking of the best
        best_params = BestParams()
        best_params.add_condition(['after_epoch'],
                                  predicate=OnLogRecord(best_channel + '_best_so_far'))
        extensions.append(TrackTheBest(best_channel))
        extensions.append(best_params)  # after TrackTheBest!

        # printing and plotting
        if hyper_params['blocks_extensions_printing'] is True:
            extensions.append(Printing())  # optional
        if hyper_params['blocks_extensions_progressbar'] is True:
            extensions.append(ProgressBar())

        if hyper_params['blocks_extensions_bokeh'] is True:
            try:
                from blocks_extras.extensions.plot import Plot
                bokeh_available = True
            except:
                bokeh_available = False
            print 'bokeh available: ', bokeh_available

            if bokeh_available:
                extensions.append(Plot(
                    hyper_params['blocks_extensions_bokeh_plot_title'],
                    channels=hyper_params['blocks_extensions_bokeh_channels'],
                ))

        main_loop = MainLoop(
            algorithm,
            DataStream.default_stream(
                train_data,
                iteration_scheme=ShuffledScheme(
                    train_data.num_examples, hyper_params['batch_size'])),
            model=model,
            extensions=extensions)

        main_loop.run()

        return best_params.values, main_loop.status['best_' + best_channel]
예제 #22
0
def pretrain_rnn(train, rnnrbm, test=None, epochs=1000, bokeh=True):
    lr = theano.shared(float32(0.1))

    probs, _, _, _ = rnnrbm.rnn_pretrain_pred(x, x_mask)
    cost = NegativeLogLikelihood().apply(y, probs, y_mask)

    error_rate = MismulitclassificationRate().apply(y, probs, y_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(y, probs, y_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'final_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])
    step_rule = CompositeRule([
        RemoveNotFinite(),
        StepClipping(30.0),
        Adam(learning_rate=lr),
        StepClipping(6.0),
        RemoveNotFinite()
    ])
    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost,
                                params=cg.parameters)
    extensions = [
        SharedVariableModifier(parameter=lr,
                               function=lambda n, v: float32(0.7 * v)
                               if n % 700 == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [
                cost,
                error_rate,
                mistake_rate,
            ],  # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)],  #+ params,
            prefix="train",
            after_epoch=False,
            every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()
    ]
    if test is not None:
        extensions.append(
            DataStreamMonitoring([cost, error_rate, mistake_rate],
                                 data_stream=test,
                                 updates=cg.updates,
                                 prefix="test",
                                 after_epoch=False,
                                 every_n_batches=40))

    if bokeh:
        extensions.append(
            Plot(
                'Pretrain RNN',
                channels=[
                    [
                        'train_error on note as a whole',
                        'train_single error within note',
                        'test_error on note as a whole',
                        'test_single error within note'
                    ],
                    ['train_rbm_cost'],
                    # ['train_total_gradient_norm'],
                ]))

    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions)
    return main_loop
예제 #23
0
 def add_timing(self, **kwargs):
     self.extensions.append(Timing(**kwargs))
예제 #24
0
파일: train.py 프로젝트: yangyuwu/scribe
    from blocks_extras.extensions.synchronization import (
        Synchronize, SynchronizeWorker)
    from platoon.param_sync import ASGD

    sync_rule = ASGD()
    worker = SynchronizeWorker(
        sync_rule, control_port=args.platoon_port, socket_timeout=2000)

extensions = []

if args.load_experiment and (not worker or worker.is_main_worker):
    extensions += [Load(os.path.join(
        save_dir, "pkl", "best_" + args.load_experiment + ".tar"))]

extensions += [
    Timing(every_n_batches=args.save_every),
    train_monitor]

if not worker or worker.is_main_worker:
    extensions += [
        valid_monitor,
        TrackTheBest(
            'valid_nll',
            every_n_batches=args.save_every,
            before_first_epoch=True),
        Plot(
            os.path.join(save_dir, "progress", exp_name + ".png"),
            [['train_nll', 'valid_nll']],
            every_n_batches=args.save_every,
            email=False),
        Checkpoint(
예제 #25
0
def main(name, epochs, batch_size, learning_rate):
    if name is None:
        name = "att-rw"

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate)
    print()

    #------------------------------------------------------------------------

    img_height, img_width = 28, 28

    read_N = 12
    write_N = 14

    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }

    x_dim = img_height * img_width

    reader = ZoomableAttentionWindow(img_height, img_width, read_N)
    writer = ZoomableAttentionWindow(img_height, img_width, write_N)

    # Parameterize the attention reader and writer
    mlpr = MLP(activations=[Tanh(), Identity()],
               dims=[x_dim, 50, 5],
               name="RMLP",
               **inits)
    mlpw = MLP(activations=[Tanh(), Identity()],
               dims=[x_dim, 50, 5],
               name="WMLP",
               **inits)

    # MLP between the reader and writer
    mlp = MLP(activations=[Tanh(), Identity()],
              dims=[read_N**2, 300, write_N**2],
              name="MLP",
              **inits)

    for brick in [mlpr, mlpw, mlp]:
        brick.allocate()
        brick.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    hr = mlpr.apply(x)
    hw = mlpw.apply(x)

    center_y, center_x, delta, sigma, gamma = reader.nn2att(hr)
    r = reader.read(x, center_y, center_x, delta, sigma)

    h = mlp.apply(r)

    center_y, center_x, delta, sigma, gamma = writer.nn2att(hw)
    c = writer.write(h, center_y, center_x, delta, sigma) / gamma
    x_recons = T.nnet.sigmoid(c)

    cost = BinaryCrossEntropy().apply(x, x_recons)
    cost.name = "cost"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            RemoveNotFinite(),
            Adam(learning_rate),
            StepClipping(3.),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    #for v in [center_y, center_x, log_delta, log_sigma, log_gamma]:
    #    v_mean = v.mean()
    #    v_mean.name = v.name
    #    monitors += [v_mean]
    #    monitors += [aggregation.mean(v)]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["cost"],
    ]

    #------------------------------------------------------------

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])
    #mnist_train = MNIST("train", binary=True, sources=['features'])
    #mnist_test = MNIST("test", binary=True, sources=['features'])

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=ForceFloatX(
            DataStream(mnist_train,
                       iteration_scheme=SequentialScheme(
                           mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(
                    DataStream(mnist_test,
                               iteration_scheme=SequentialScheme(
                                   mnist_test.num_examples, batch_size))),
                prefix="test"),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            #Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()
예제 #26
0
def main(config,
         tr_stream,
         dev_stream,
         source_vocab,
         target_vocab,
         use_bokeh=False):

    # add the tags from this function to the IMT datastream
    # prediction function signature
    # [target_suffix, source_mask, source, target_prefix_mask, target_prefix, target_suffix_mask]
    prediction_function = get_prediction_function(exp_config=config)

    tr_stream = Mapping(
        tr_stream,
        CallPredictionFunctionOnStream(prediction_function,
                                       [1, 0, 5, 4, 7, 6]),
        #tr_stream = Mapping(tr_stream, CallFunctionOnStream(prediction_function, [6, 1, 0, 5, 4, 7]),
        add_sources=('predictions', 'orig_readouts', 'prediction_tags'))

    # now datastream has 11 things
    import ipdb
    ipdb.set_trace()

    # WORKING: call prediction function twice to get new readouts on predictions instead of reference suffs
    # the only difference is the index of the suffix
    tr_stream = Mapping(tr_stream,
                        CallPredictionFunctionOnStream(prediction_function,
                                                       [1, 0, 5, 4, 7, 8]),
                        add_sources=('dummy_predictions', 'readouts',
                                     'dummy_prediction_tags'))

    import ipdb
    ipdb.set_trace()

    # Create the prediction confidence model
    # the first draft of this model uses the readout output (before the post-merge step) as the per-timestep state vector

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')

    # Note that the _names_ are changed from normal NMT
    # for IMT training, we use only the suffix as the reference
    target_sentence = tensor.lmatrix('target_suffix')
    target_sentence_mask = tensor.matrix('target_suffix_mask')

    target_prefix = tensor.lmatrix('target_prefix')
    target_prefix_mask = tensor.matrix('target_prefix_mask')

    # symbolic variable which tags each timestep as GOOD/BAD
    # Note: later this might be tags for a hypothesis i.e. from TER(p), right now the timesteps are actually determined by the reference
    # By zipping the confidence model output with the reference, we get the model's confidence that this reference word
    # will be predicted correctly
    prediction_tags = tensor.matrix('prediction_tags')
    readouts = tensor.tensor3('readouts')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])

    decoder = NMTPrefixDecoder(config['trg_vocab_size'],
                               config['dec_embed'],
                               config['dec_nhids'],
                               config['enc_nhids'] * 2,
                               loss_function='cross_entropy')

    # rename to match baseline NMT systems
    decoder.name = 'decoder'

    cost = decoder.confidence_cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask,
        target_prefix, target_prefix_mask, readouts, prediction_tags)

    # WORKING: add l2 regularization

    logger.info('Creating computational graph')
    # working: implement cost for confidence model
    cg = ComputationGraph(cost)

    # INITIALIZATION
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    #cost_cg = ComputationGraph(cost)
    if config['l2_reg']:
        l2_reg_alpha = config['l2_reg_alpha']
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())
        # do we need to name the cost variable again?
        cost.name = 'cost'
        cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables if x.name in set([
                'confidence_model1_apply_output',
                'confidence_model2_apply_output',
                'confidence_model3_apply_output'
            ])
        ]
        # if x.name == 'maxout_apply_output']
        # if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # WORKING: implement confidence -- remove all params except output model
    cost_model = Model(cost)

    model_params = cost_model.get_parameter_dict()
    trainable_params = cg.parameters
    import ipdb
    ipdb.set_trace()
    print('trainable params')
    #params_to_remove = [model_params[k] for k in model_params.keys() if 'confidence' not in k]
    #for p in params_to_remove:
    #    trainable_params.remove(p)

    # target_embeddings = model.get_parameter_dict()['/target_recurrent_lm_with_alignments/target_embeddings.W']
    # trainable_params.remove(source_embeddings)
    # trainable_params.remove(target_embeddings)
    # END WORKING: implement confidence -- remove all params except output model

    # TODO: fixed dropout mask for recurrent params?
    # Print shapes
    # shapes = [param.get_value().shape for param in cg.parameters]
    # logger.info("Parameter shapes: ")
    # for shape, count in Counter(shapes).most_common():
    #     logger.info('    {:15}: {}'.format(shape, count))
    # logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    # enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
    #                            Selector(decoder).get_parameters())
    # logger.info("Parameter names: ")
    # for name, value in enc_dec_param_dict.items():
    #     logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    # logger.info("Total number of parameters: {}"
    #             .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        # TrainingDataMonitoring(trainable_params, after_batch=True),
        # Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # WORKING: confidence prediction
    #monitor everything that could possibly be relevant

    # Set up the sampling graph for validation during training
    # Theano variables for the sampling graph
    # Note this also loads the model parameters
    sampling_vars = load_params_and_get_beam_search(config,
                                                    encoder=encoder,
                                                    decoder=decoder)
    beam_search, search_model, samples, sampling_input, sampling_prefix = sampling_vars

    #if config['hook_samples'] >= 1:
    #    logger.info("Building sampler")
    #    extensions.append(
    #        Sampler(model=search_model, data_stream=tr_stream,
    #                hook_samples=config['hook_samples'],
    #                every_n_batches=config['sampling_freq'],
    #                src_vocab=source_vocab,
    #                trg_vocab=target_vocab,
    #                src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    #if config['bleu_script'] is not None:
    #    logger.info("Building bleu validator")
    #    extensions.append(
    #        BleuValidator(sampling_input, sampling_prefix, samples=samples, config=config,
    #                      model=search_model, data_stream=dev_stream,
    #                      src_vocab=source_vocab,
    #                      trg_vocab=target_vocab,
    #                      normalize=config['normalized_bleu'],
    #                      every_n_batches=config['bleu_val_freq']))

    # TODO: add first-word accuracy validation
    # TODO: add IMT meteor early stopping
    #if config.get('imt_f1_validation', None) is not None:
    #    logger.info("Building imt F1 validator")
    #    extensions.append(
    #        IMT_F1_Validator(sampling_input, sampling_prefix,
    #                         samples=samples,
    #                         config=config,
    #                         model=search_model, data_stream=dev_stream,
    #                         src_vocab=source_vocab,
    #                         trg_vocab=target_vocab,
    #                         normalize=config['normalized_bleu'],
    #                         every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # TODO: hacking here: get the predictions of the confidence model using the `readouts` source of the data_stream

    # Note that the parameters of this model must be pretrained, otherwise this doesn't make sense
    # confidence_predictions = decoder.get_confidence(readouts)
    # confidence_prediction_model = Model(confidence_predictions)
    #
    # confidence_param_values = LoadNMT.load_parameter_values(config['confidence_saved_parameters'], brick_delimiter=None)
    # LoadNMT.set_model_parameters(confidence_prediction_model, confidence_param_values)
    #
    # confidence_prediction_func = confidence_prediction_model.get_theano_function()

    # import ipdb; ipdb.set_trace()

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            # Plot(config['model_save_directory'], channels=[['decoder_confidence_cost_cost']],
            Plot(config['model_save_directory'],
                 channels=[['cost']],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")

    # WORKING: implement confidence model
    # if there is dropout or random noise, we need to use the output of the modified graph
    algorithm = GradientDescent(
        cost=cg.outputs[0],
        parameters=trainable_params,
        step_rule=CompositeRule([
            StepClipping(config['step_clipping']),
            eval(config['step_rule'])()
        ]),
        # eval(config['step_rule'])(), RemoveNotFinite()]),
        # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]),
        on_unused_sources='warn')
    #if config['dropout'] < 1.0:
    #   algorithm = GradientDescent(
    #       cost=cg.outputs[0], parameters=trainable_params,
    #       step_rule=CompositeRule([StepClipping(config['step_clipping']),
    #                         eval(config['step_rule'])(), RemoveNotFinite()]),
    #       # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]),
    #       on_unused_sources='warn'
    #   )
    #else:
    #   algorithm = GradientDescent(
    #       cost=cost, parameters=cg.parameters,
    #       step_rule=CompositeRule([StepClipping(config['step_clipping']),
    #                                eval(config['step_rule'])()]),
    #       on_unused_sources='warn'
    #   )
    # END WORKING: implement confidence model

    import ipdb
    ipdb.set_trace()

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # WORKING: debugging confidence
    # get theano function from model
    # WORKING: implement word-level confidence cost
    #   @application(inputs=['representation', 'source_sentence_mask',
    #                                'target_sentence_mask', 'target_sentence', 'target_prefix_mask', 'target_prefix'],
    #                                                 outputs=['cost'])
    #       def confidence_cost(self, representation, source_sentence_mask,
    #                            target_sentence, target_sentence_mask, target_prefix, target_prefix_mask):

    logger.info('Creating theano variables')

    # WORKING: 26.9.16 -- get confidence outputs directly from (source, prefix, suffix) inputs
    # This is equivalent to forced alignment --> confidence scores
    # Note: but this section should probably be in "evaluate" mode, not here in "train"

    # source_sentence = tensor.lmatrix('source')
    # source_sentence_mask = tensor.matrix('source_mask')

    # Note that the _names_ are changed from normal NMT
    # for IMT training, we use only the suffix as the reference
    #target_sentence = tensor.lmatrix('target_suffix')
    #target_sentence_mask = tensor.matrix('target_suffix_mask')
    # TODO: change names back to *_suffix, there is currently a theano function name error
    # TODO: in the GradientDescent Algorithm

    #target_prefix = tensor.lmatrix('target_prefix')
    #target_prefix_mask = tensor.matrix('target_prefix_mask')

    # confidence_output = decoder.confidence_cost(
    #     encoder.apply(source_sentence, source_sentence_mask),
    #     source_sentence_mask, target_sentence, target_sentence_mask,
    #     target_prefix, target_prefix_mask)

    # confidence_model = Model(confidence_output)

    # t_cost_func = confidence_model.get_theano_function()
    # inputs
    # [source_mask, source, target_prefix_mask, target_prefix, target_suffix_mask, target_suffix]

    #import ipdb;ipdb.set_trace()

    # get the right args from the datastream
    # TODO: just print source, prefix, suffix, prediction, correct to new files -- this makes sure everything is aligned
    # OUTPUT_DIR = '/media/1tb_drive/imt_models/word_prediction_accuracy_experiments/en-de/exp_1'
    # for the_file in os.listdir(OUTPUT_DIR):
    #     file_path = os.path.join(OUTPUT_DIR, the_file)
    #     try:
    #         if os.path.isfile(file_path):
    #             os.unlink(file_path)
    #     except Exception as e:
    #         print(e)
    #
    # def write_file_truncate_mask(filename, data, mask, mode='a'):
    #     ''' data is list of list '''
    #
    #     assert len(data) == len(mask)
    #     with codecs.open(filename, mode, encoding='utf8') as out:
    #         for l, m in zip(data, mask):
    #             output = u' '.join(l[:int(m.sum())]) + u'\n'
    #             out.write(output)
    #     logger.info('Wrote file: {}'.format(filename))
    #
    #
    # target_ivocab = {k:v.decode('utf8') for v,k in target_vocab.items()}
    # source_ivocab = {k:v.decode('utf8') for v,k in source_vocab.items()}
    # import ipdb; ipdb.set_trace()
    # tag_ivocab = {1: 'True', 0: 'False'}
    #
    # test_iter = tr_stream.get_epoch_iterator()
    # it = 0
    # for t_source, t_source_mask, t_target, t_target_mask, t_target_prefix, t_target_prefix_mask, t_target_suffix, t_target_suffix_mask in test_iter:
    #     if it <= 1000:
    #         it += 1
    #         t_cost = t_cost_func(t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask, t_target_suffix)
    #         readouts = t_cost[0]
    #         preds = readouts.argmax(axis=2)
    #         correct = preds.T == t_target_suffix
    #
    #
    #         source_output = os.path.join(OUTPUT_DIR,'sources.en')
    #         prefix_output = os.path.join(OUTPUT_DIR,'prefixes.de')
    #         suffix_output = os.path.join(OUTPUT_DIR,'suffixes.de')
    #         prediction_output = os.path.join(OUTPUT_DIR,'predictions.de')
    #         correct_output = os.path.join(OUTPUT_DIR,'prefix_word_prediction_acc.out')
    #
    #         source_text = [[source_ivocab[w] for w in s] for s in t_source]
    #         prefix_text = [[target_ivocab[w] for w in s] for s in t_target_prefix]
    #         suffix_text = [[target_ivocab[w] for w in s] for s in t_target_suffix]
    #         pred_text = [[target_ivocab[w] for w in s] for s in preds.T]
    #         correct_text = [[tag_ivocab[w] for w in s] for s in correct]
    #
    #
    #         for triple in zip([source_output, prefix_output, suffix_output, prediction_output, correct_output],
    #                           [source_text, prefix_text, suffix_text, pred_text, correct_text],
    #                           [t_source_mask, t_target_prefix_mask, t_target_suffix_mask, t_target_suffix_mask, t_target_suffix_mask]):
    #             write_file_truncate_mask(*triple)
    #     else:
    #         break
    #
    # import ipdb; ipdb.set_trace()

    #t_cost = t_cost_func(t_source, t_target_prefix)
    #t_cost = t_cost_func(t_target_suffix, t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask)
    #t_cost = t_cost_func(t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask, t_target_suffix)

    #    return confidence_cost, flat_y, confidence_logits, readouts

    #predictions = t_cost[0].argmax(axis=2)

    # TODO: next step -- print gradients and weights during training find out where nan is coming from
    # TODO: look at the gradient of this function with respect to parameters? -- see here: http://deeplearning.net/software/theano/tutorial/gradients.html

    # TODO: function which adds right/wrong tags for model predictions to the datastream. In this case we can learn a simple linear model as a baseline
    # TODO: print predictions for each batch for each timestep to file -- _dont shuffle_ so that we get the right order

    # import ipdb;ipdb.set_trace()

    # from blocks reverse_words example
    # observables = [
    #     cost, min_energy, max_energy, mean_activation,
    #     batch_size, max_length, cost_per_character,
    #     algorithm.total_step_norm, algorithm.total_gradient_norm]
    # for name, parameter in trainable_params.items():
    #     observables.append(parameter.norm(2).copy(name + "_norm"))
    #     observables.append(algorithm.gradients[parameter].norm(2).copy(
    #         name + "_grad_norm"))

    for i, (k, v) in enumerate(algorithm.updates):
        v.name = k.name + '_{}'.format(i)

    aux_vars = [v for v in cg.auxiliary_variables[-3:]]
    # import ipdb; ipdb.set_trace()

    extensions.extend([
        TrainingDataMonitoring([cost], after_batch=True),
        # TrainingDataMonitoring([v for k,v in algorithm.updates[:2]], after_batch=True),
        # TrainingDataMonitoring(aux_vars, after_batch=True),
        # TrainingDataMonitoring(trainable_params, after_batch=True),
        Printing(after_batch=True)
    ])

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)
    import ipdb
    ipdb.set_trace()

    # Train!
    main_loop.run()
예제 #27
0
def train_extractive_qa(new_training_job, config, save_path, params,
                        fast_start, fuel_server, seed):
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    root_path = os.path.join(save_path, 'training_state')
    extension = '.tar'
    tar_path = root_path + extension
    best_tar_path = root_path + '_best' + extension

    c = config
    data, qam = initialize_data_and_model(c)

    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', shuffle=True, batch_size=4,
                            max_length=5).get_epoch_iterator(as_dict=True))
        for var in qam.input_vars.values():
            var.tag.test_value = test_value_data[var.name]

    costs = qam.apply_with_default_vars()
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            cg.set_parameter_values(load_parameters(src))

    length = rename(qam.contexts.shape[1], 'length')
    batch_size = rename(qam.contexts.shape[0], 'batch_size')
    predicted_begins, = VariableFilter(name='predicted_begins')(cg)
    predicted_ends, = VariableFilter(name='predicted_ends')(cg)
    exact_match, = VariableFilter(name='exact_match')(cg)
    exact_match_ratio = rename(exact_match.mean(), 'exact_match_ratio')
    context_unk_ratio, = VariableFilter(name='context_unk_ratio')(cg)
    monitored_vars = [
        length, batch_size, cost, exact_match_ratio, context_unk_ratio
    ]
    if c['dict_path']:
        def_unk_ratio, = VariableFilter(name='def_unk_ratio')(cg)
        num_definitions = rename(qam.input_vars['defs'].shape[0],
                                 'num_definitions')
        max_definition_length = rename(qam.input_vars['defs'].shape[1],
                                       'max_definition_length')
        monitored_vars.extend(
            [def_unk_ratio, num_definitions, max_definition_length])
        if c['def_word_gating'] == 'self_attention':
            def_gates = VariableFilter(name='def_gates')(cg)
            def_gates_min = tensor.minimum(*[x.min() for x in def_gates])
            def_gates_max = tensor.maximum(*[x.max() for x in def_gates])
            monitored_vars.extend([
                rename(def_gates_min, 'def_gates_min'),
                rename(def_gates_max, 'def_gates_max')
            ])
    text_match_ratio = TextMatchRatio(data_path=os.path.join(
        fuel.config.data_path[0], 'squad/dev-v1.1.json'),
                                      requires=[
                                          predicted_begins, predicted_ends,
                                          tensor.ltensor3('contexts_text'),
                                          tensor.lmatrix('q_ids')
                                      ],
                                      name='text_match_ratio')

    parameters = cg.get_parameter_dict()
    trained_parameters = parameters.values()
    if c['embedding_path']:
        logger.debug("Exclude  word embeddings from the trained parameters")
        trained_parameters = [
            p for p in trained_parameters if not p == qam.embeddings_var()
        ]
    if c['train_only_def_part']:
        def_reading_parameters = qam.def_reading_parameters()
        trained_parameters = [
            p for p in trained_parameters if p in def_reading_parameters
        ]

    logger.info("Cost parameters" + "\n" + pprint.pformat([
        " ".join(
            (key, str(parameters[key].get_value().shape),
             'trained' if parameters[key] in trained_parameters else 'frozen'))
        for key in sorted(parameters.keys())
    ],
                                                          width=120))

    # apply dropout to the training cost and to all the variables
    # that we monitor during training
    train_cost = cost
    train_monitored_vars = list(monitored_vars)
    if c['dropout']:
        regularized_cg = ComputationGraph([cost] + train_monitored_vars)
        # Dima: the dropout that I implemented first
        bidir_outputs, = VariableFilter(bricks=[Bidirectional],
                                        roles=[OUTPUT])(cg)
        readout_layers = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg)
        dropout_vars = [bidir_outputs] + readout_layers
        logger.debug("applying dropout to {}".format(", ".join(
            [v.name for v in dropout_vars])))
        regularized_cg = apply_dropout(regularized_cg, dropout_vars,
                                       c['dropout'])
        # a new dropout with exactly same mask at different steps
        emb_vars = VariableFilter(roles=[EMBEDDINGS])(regularized_cg)
        emb_dropout_mask = get_dropout_mask(emb_vars[0], c['emb_dropout'])
        if c['emb_dropout_type'] == 'same_mask':
            regularized_cg = apply_dropout2(regularized_cg,
                                            emb_vars,
                                            c['emb_dropout'],
                                            dropout_mask=emb_dropout_mask)
        elif c['emb_dropout_type'] == 'regular':
            regularized_cg = apply_dropout(regularized_cg, emb_vars,
                                           c['emb_dropout'])
        else:
            raise ValueError("unknown dropout type {}".format(
                c['emb_dropout_type']))
        train_cost = regularized_cg.outputs[0]
        train_monitored_vars = regularized_cg.outputs[1:]

    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum']))
    algorithm = GradientDescent(cost=train_cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)
    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    training_stream = data.get_stream('train',
                                      batch_size=c['batch_size'],
                                      shuffle=True,
                                      max_length=c['max_length'])
    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)

    extensions = [
        LoadNoUnpickling(tar_path, load_iteration_state=True,
                         load_log=True).set_conditions(
                             before_training=not new_training_job),
        StartFuelServer(original_training_stream,
                        os.path.join(save_path, 'stream.pkl'),
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq_train']),
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']),
    ]
    validation = DataStreamMonitoring(
        [text_match_ratio] + monitored_vars,
        data.get_stream('dev',
                        batch_size=c['batch_size_valid'],
                        raw_text=True,
                        q_ids=True),
        prefix="dev").set_conditions(before_training=not fast_start,
                                     after_epoch=True)
    dump_predictions = DumpPredictions(save_path,
                                       text_match_ratio,
                                       before_training=not fast_start,
                                       after_epoch=True)
    track_the_best_exact = TrackTheBest(
        validation.record_name(exact_match_ratio),
        choose_best=max).set_conditions(before_training=True, after_epoch=True)
    track_the_best_text = TrackTheBest(
        validation.record_name(text_match_ratio),
        choose_best=max).set_conditions(before_training=True, after_epoch=True)
    extensions.extend([
        validation, dump_predictions, track_the_best_exact, track_the_best_text
    ])
    # We often use pretrained word embeddings and we don't want
    # to load and save them every time. To avoid that, we use
    # save_main_loop=False, we only save the trained parameters,
    # and we save the log and the iterations state separately
    # in the tar file.
    extensions.extend([
        Checkpoint(tar_path,
                   parameters=trained_parameters,
                   save_main_loop=False,
                   save_separately=['log', 'iteration_state'],
                   before_training=not fast_start,
                   every_n_epochs=c['save_freq_epochs'],
                   every_n_batches=c['save_freq_batches'],
                   after_training=not fast_start).add_condition(
                       ['after_batch', 'after_epoch'],
                       OnLogRecord(track_the_best_text.notification_name),
                       (best_tar_path, )),
        DumpTensorflowSummaries(save_path,
                                after_epoch=True,
                                every_n_batches=c['mon_freq_train'],
                                after_training=True),
        RetrievalPrintStats(retrieval=data._retrieval,
                            every_n_batches=c['mon_freq_train'],
                            before_training=not fast_start),
        Printing(after_epoch=True, every_n_batches=c['mon_freq_train']),
        FinishAfter(after_n_batches=c['n_batches'],
                    after_n_epochs=c['n_epochs']),
        Annealing(c['annealing_learning_rate'],
                  after_n_epochs=c['annealing_start_epoch']),
        LoadNoUnpickling(best_tar_path,
                         after_n_epochs=c['annealing_start_epoch'])
    ])

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
예제 #28
0
파일: __init__.py 프로젝트: rizar/tle-mnist
def main(save_to, cost_name, learning_rate, momentum, num_epochs):
    mlp = MLP([None], [784, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    scores = mlp.apply(x)

    batch_size = y.shape[0]
    indices = tensor.arange(y.shape[0])
    target_scores = tensor.set_subtensor(
        tensor.zeros((batch_size, 10))[indices, y.flatten()], 1)
    score_diff = scores - target_scores

    # Logistic Regression
    if cost_name == 'lr':
        cost = Softmax().categorical_cross_entropy(y.flatten(), scores).mean()
    # MSE
    elif cost_name == 'mse':
        cost = (score_diff**2).mean()
    # Perceptron
    elif cost_name == 'perceptron':
        cost = (scores.max(axis=1) - scores[indices, y.flatten()]).mean()
    # TLE
    elif cost_name == 'minmin':
        cost = abs(score_diff[indices, y.flatten()]).mean()
        cost += abs(score_diff[indices, scores.argmax(axis=1)]).mean()
    # TLEcut
    elif cost_name == 'minmin_cut':
        # Score of the groundtruth should be greater or equal than its target score
        cost = tensor.maximum(0, -score_diff[indices, y.flatten()]).mean()
        # Score of the prediction should be less or equal than its actual score
        cost += tensor.maximum(0, score_diff[indices,
                                             scores.argmax(axis=1)]).mean()
    # TLE2
    elif cost_name == 'minmin2':
        cost = ((score_diff[tensor.arange(y.shape[0]), y.flatten()])**2).mean()
        cost += ((score_diff[tensor.arange(y.shape[0]),
                             scores.argmax(axis=1)])**2).mean()
    # Direct loss minimization
    elif cost_name == 'direct':
        epsilon = 0.1
        cost = (-scores[indices,
                        (scores + epsilon * target_scores).argmax(axis=1)] +
                scores[indices, scores.argmax(axis=1)]).mean()
        cost /= epsilon
    elif cost_name == 'svm':
        cost = (scores[indices, (scores - 1 * target_scores).argmax(axis=1)] -
                scores[indices, y.flatten()]).mean()
    else:
        raise ValueError("Unknown cost " + cost)

    error_rate = MisclassificationRate().apply(y.flatten(), scores)
    error_rate.name = 'error_rate'

    cg = ComputationGraph([cost])
    cost.name = 'cost'

    mnist_train = MNIST(("train", ))
    mnist_test = MNIST(("test", ))

    if learning_rate == None:
        learning_rate = 0.0001
    if momentum == None:
        momentum = 0.0
    rule = Momentum(learning_rate=learning_rate, momentum=momentum)
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=rule)
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate],
                             Flatten(DataStream.default_stream(
                                 mnist_test,
                                 iteration_scheme=SequentialScheme(
                                     mnist_test.num_examples, 500)),
                                     which_sources=('features', )),
                             prefix="test"),
        # CallbackExtension(
        #    lambda: rule.learning_rate.set_value(rule.learning_rate.get_value() * 0.9),
        #    after_epoch=True),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm), rule.learning_rate
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        Printing()
    ]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(
            Plot('MNIST example',
                 channels=[['test_cost', 'test_error_rate'],
                           ['train_total_gradient_norm']]))

    main_loop = MainLoop(algorithm,
                         Flatten(DataStream.default_stream(
                             mnist_train,
                             iteration_scheme=SequentialScheme(
                                 mnist_train.num_examples, 50)),
                                 which_sources=('features', )),
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()

    df = pandas.DataFrame.from_dict(main_loop.log, orient='index')
    res = {
        'cost': cost_name,
        'learning_rate': learning_rate,
        'momentum': momentum,
        'train_cost': df.train_cost.iloc[-1],
        'test_cost': df.test_cost.iloc[-1],
        'best_test_cost': df.test_cost.min(),
        'train_error': df.train_error_rate.iloc[-1],
        'test_error': df.test_error_rate.iloc[-1],
        'best_test_error': df.test_error_rate.min()
    }
    res = {
        k: float(v) if isinstance(v, numpy.ndarray) else v
        for k, v in res.items()
    }
    json.dump(res, sys.stdout)
    sys.stdout.flush()
예제 #29
0
def main(mode, save_path, num_batches, data_path=None):
    # Experiment configuration
    dimension = 100
    readout_dimension = len(char2code)

    # Build bricks
    encoder = Bidirectional(SimpleRecurrent(dim=dimension, activation=Tanh()),
                            weights_init=Orthogonal())
    fork = Fork(
        [name for name in encoder.prototype.apply.sequences if name != 'mask'],
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0))
    fork.input_dim = dimension
    fork.output_dims = {name: dimension for name in fork.input_names}
    lookup = LookupTable(readout_dimension,
                         dimension,
                         weights_init=IsotropicGaussian(0.1))
    transition = SimpleRecurrent(activation=Tanh(),
                                 dim=dimension,
                                 name="transition")
    attention = SequenceContentAttention(state_names=transition.apply.states,
                                         sequence_dim=2 * dimension,
                                         match_dim=dimension,
                                         name="attention")
    readout = LinearReadout(readout_dim=readout_dimension,
                            source_names=["states"],
                            emitter=SoftmaxEmitter(name="emitter"),
                            feedbacker=LookupFeedback(readout_dimension,
                                                      dimension),
                            name="readout")
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  attention=attention,
                                  weights_init=IsotropicGaussian(0.1),
                                  biases_init=Constant(0),
                                  name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code,
                               level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = DataStreamMapping(
            mapping=_transpose,
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets", ),
                        data_stream=DataStreamFilter(
                            predicate=_filter_long,
                            data_stream=dataset.get_default_stream())))))

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = generator.cost(
            targets,
            targets_mask,
            attended=encoder.apply(**dict_union(fork.apply(
                lookup.lookup(chars), return_dict=True),
                                                mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat([(key, value.get_value().shape)
                                    for key, value in params.items()],
                                   width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        (energies, ) = VariableFilter(application=readout.readout,
                                      name="output")(cg.variables)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations, ) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(
            abs(activations).mean(), "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(cost=cost,
                                    step_rule=CompositeRule(
                                        [StepClipping(10.0),
                                         Scale(0.01)]))

        # More variables for debugging
        observables = [
            cost, min_energy, max_energy, mean_activation, batch_size,
            max_length, cost_per_character, algorithm.total_step_norm,
            algorithm.total_gradient_norm
        ]
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(
                named_copy(algorithm.gradients[param].norm(2),
                           name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(observables,
                                                    prefix="average",
                                                    every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches).add_condition(
                    "after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                SerializeMainLoop(save_path,
                                  every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)
            ])
        main_loop.run()
    elif mode == "test":
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0],
            batch_size=chars.shape[1],
            attended=encoder.apply(**dict_union(
                fork.apply(lookup.lookup(chars), return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        model = Model(generated)
        model.set_param_values(load_parameter_values(save_path))
        sample_function = model.get_theano_function()
        logging.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))
            encoded_input = [
                char2code.get(char, char2code["<UNK>"])
                for char in line.lower().strip()
            ]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size,
                             axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
예제 #30
0
def test_timing():
    class FakeMainLoop():

        def __init__(self):
            self.log = TrainingLog()

    # Build the main loop
    now = 0
    timing = Timing(lambda: now)
    ml = FakeMainLoop()
    timing.main_loop = ml

    # Start training
    now += 1
    timing.before_training()

    # Start epoch 1
    now += 2
    timing.before_epoch()
    assert ml.log[0].initialization_took == 2
    ml.log.status._epoch_started = True

    # Batch 1
    timing.before_batch(None)
    now += 7
    ml.log.status.iterations_done += 1
    timing.after_batch(None)
    assert ml.log[1].iteration_took == 7

    # Batch 2
    timing.before_batch(None)
    now += 8
    ml.log.status.iterations_done += 1
    timing.after_batch(None)
    assert ml.log[2].iteration_took == 8

    # Epoch 1 is done
    ml.log.status.epochs_done += 1
    timing.after_epoch()
    assert ml.log[2].epoch_took == 15
    assert ml.log[2].total_took == 17

    # Finish training
    now += 1
    timing.after_training()
    assert ml.log[2].final_total_took == 18

    # Resume training
    now = 0
    timing.on_resumption()

    # Start epoch 2
    timing.before_epoch()
    assert ml.log[2].initialization_took is None

    # Batch 3
    timing.before_batch(None)
    now += 6
    ml.log.status.iterations_done += 1
    timing.after_batch(None)
    assert ml.log[3].iteration_took == 6
    assert ml.log[3].total_took == 24

    # Finish training before the end of the current epoch
    timing.after_training()

    # Resume training
    now = 2
    timing.on_resumption()

    # Batch 4
    timing.before_batch(None)
    now += 2
    ml.log.status.iterations_done += 1
    timing.after_batch(None)
    assert ml.log[4].iteration_took == 2
    assert ml.log[4].total_took == 26

    # Epoch 2 is done
    ml.log.status.epochs_done += 1
    timing.after_epoch()
    assert ml.log[4].epoch_took == 8

    # Start epoch 3
    timing.before_epoch()

    # Batch 5
    timing.before_batch(None)
    now += 5
    ml.log.status.iterations_done += 1
    timing.after_batch(None)
    assert ml.log[5].iteration_took == 5
    assert ml.log[5].total_took == 31

    # Epoch 3 is done
    ml.log.status.epochs_done += 1
    timing.after_epoch()
    assert ml.log[5].epoch_took == 5
예제 #31
0
def main(save_to, num_epochs,
         regularization=0.0001, subset=None, num_batches=None,
         batch_size=None, histogram=None, resume=False):
    output_size = 10
    convnet = create_res_net()

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
            .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                  .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                  .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)

    # train_cost, train_error_rate, train_components = train_cg.outputs

    with batch_normalization(convnet):
        train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(y.flatten(),
                train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(y.flatten(),
                train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                train_error_rate, train_components])
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                for p, m in population_updates]

    # Apply regularization to the cost
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    l2_norm = sum([(W ** 2).sum() for W in weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = regularization * l2_norm
    l2_regularization.name = 'l2_regularization'
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + regularization * l2_norm
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
        which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 500
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test,
        iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=train_cg.parameters,
        step_rule=momentum)
    algorithm.add_updates(extra_updates)

    #,
    #    theano_func_kwargs={
    #        'mode': NanGuardMode(
    #            nan_is_error=True, inf_is_error=True, big_is_error=True)})

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),   # Warm up with 0.01 learning rate
                      (1, 0.1),    # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  DataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       momentum.learning_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                      # after_epoch=True),
                  Plot('Training performance for ' + save_to,
                      channels=[
                          ['train_cost_with_regularization',
                           'train_cost_without_regularization',
                           'train_l2_regularization'],
                          ['train_error_rate'],
                          ['train_total_gradient_norm'],
                      ],
                      every_n_batches=17),
                  Plot('Test performance for ' + save_to,
                      channels=[[
                          'train_error_rate',
                          'test_error_rate',
                          ]],
                      after_epoch=True),
                  Checkpoint(save_to, use_cpickle=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            parameters=cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(save_to, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)