Example #1
def test_model_handles_brickless_parameteres():
    x = tensor.matrix('x')
    v = shared_floatx(numpy.zeros((10, 10)), name='V')
    add_role(v, PARAMETER)
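    # V carries the PARAMETER role but belongs to no brick; Model should
    # still pick it up and list it under its own name.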
    y = x.dot(v)
    model = Model(y)
    assert list(model.get_parameter_dict().items()) == [('V', v)]
Example #2
def evaluate(model, load_path):
    with open(load_path + '/trained_params_best.npz') as f:
        loaded = np.load(f)
        blocks_model = Model(model.cost)
        params_dicts = blocks_model.get_parameter_dict()
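        # get_parameter_dict() maps hierarchical parameter names
        # (e.g. '/f_6_.W') to the underlying shared variables.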
        params_names = params_dicts.keys()
        for param_name in params_names:
            param = params_dicts[param_name]
            # '/f_6_.W' --> 'f_6_.W'
            slash_index = param_name.find('/')
            param_name = param_name[slash_index + 1:]
            assert param.get_value().shape == loaded[param_name].shape
            param.set_value(loaded[param_name])

    train_data_stream, valid_data_stream = get_cmv_v2_streams(100)
    # T x B x F
    data = train_data_stream.get_epoch_iterator().next()
    cg = ComputationGraph(model.cost)
    f = theano.function(cg.inputs, [model.location, model.scale],
                        on_unused_input='ignore',
                        allow_input_downcast=True)
    res = f(data[1], data[0])
    for i in range(10):
        visualize_attention(data[0][:, i, :],
                            res[0][:, i, :], res[1][:, i, :], prefix=str(i))
Example #3
def test_sampling():

    # Create Theano variables
    sampling_input = theano.tensor.lmatrix("input")

    # Construct model
    encoder = BidirectionalEncoder(vocab_size=10, embedding_dim=5, state_dim=8)
    decoder = Decoder(vocab_size=12, embedding_dim=6, state_dim=8, representation_dim=16, theano_seed=1234)
    sampling_representation = encoder.apply(sampling_input, theano.tensor.ones(sampling_input.shape))
    generateds = decoder.generate(sampling_input, sampling_representation)
    model = Model(generateds[1])

    # Initialize model
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(0.01)
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Compile a function for the generated outputs
    sampling_fn = model.get_theano_function()

    # Create literal variables
    numpy.random.seed(1234)
    x = numpy.random.randint(0, 10, size=(1, 2))

    # Call function and check result
    generated_step = sampling_fn(x)
    assert len(generated_step[0].flatten()) == 4
Example #4
    def create_act_table(self, save_to, act_table):
        batch_size = 500
        image_size = (28, 28)
        output_size = 10
        convnet = create_lenet_5()
        layers = convnet.layers

        x = tensor.tensor4('features')
        y = tensor.lmatrix('targets')

        # Normalize input and apply the convnet
        probs = convnet.apply(x)
        cg = ComputationGraph([probs])

        def full_brick_name(brick):
            return '/'.join([''] + [b.name for b in brick.get_unique_path()])

        # Find layer outputs to probe
        outmap = OrderedDict((full_brick_name(get_brick(out)), out)
                for out in VariableFilter(
                    roles=[OUTPUT], bricks=[Convolutional, Linear])(
                        cg.variables))
        # Collect the bias parameters of the probed layers
        biases = VariableFilter(roles=[BIAS])(cg.parameters)

        # Generate parallel array, in the same order, for outputs
        outs = [outmap[full_brick_name(get_brick(b))] for b in biases]

        # Build the error-rate and max-activation-table outputs
        error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                      .copy(name='error_rate'))
        max_activation_table = (MaxActivationTable().apply(
                outs).copy(name='max_activation_table'))
        max_activation_table.tag.aggregation_scheme = (
                Concatenate(max_activation_table))

        model = Model([
            error_rate,
            max_activation_table])

        # Load it with trained parameters
        params = load_parameters(open(save_to, 'rb'))
        model.set_parameter_values(params)

        mnist_test_stream = DataStream.default_stream(
            self.mnist_test,
            iteration_scheme=SequentialScheme(
                self.mnist_test.num_examples, batch_size))

        evaluator = DatasetEvaluator([
            error_rate,
            max_activation_table
            ])
        results = evaluator.evaluate(mnist_test_stream)
        table = results['max_activation_table']
        pickle.dump(table, open(act_table, 'wb'))
        return table
Example #5
def evaluate(model, load_path):
    with open(load_path + '/trained_params_best.npz') as f:
        loaded = np.load(f)
        blocks_model = Model(model)
        params_dicts = blocks_model.get_parameter_dict()
        params_names = params_dicts.keys()
        for param_name in params_names:
            param = params_dicts[param_name]
            assert param.get_value().shape == loaded[param_name].shape
            param.set_value(loaded[param_name])
Example #6
def main():

    import configurations
    from stream import DStream
    logger = logging.getLogger(__name__)
    cfig = getattr(configurations, 'get_config_penn')()

    rnnlm = Rnnlm(cfig['vocabsize'], cfig['nemb'], cfig['nhids'])
    rnnlm.weights_init = IsotropicGaussian(0.1)
    rnnlm.biases_init = Constant(0.)
    rnnlm.push_initialization_config()
    rnnlm.generator.transition.weights_init = Orthogonal()

    sentence = tensor.lmatrix('sentence')
    sentence_mask = tensor.matrix('sentence_mask')
    batch_cost = rnnlm.cost(sentence, sentence_mask).sum()
    batch_size = sentence.shape[1].copy(name='batch_size')
    cost = aggregation.mean(batch_cost, batch_size)
    cost.name = "sequence_log_likelihood"
    logger.info("Cost graph is built")

    model = Model(cost)
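    # Model extends ComputationGraph with brick and parameter bookkeeping,
    # which is what makes the named-parameter listing below possible.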
    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, value.get_value().shape) for key, value
                        in parameters.items()],
                    width=120))

    for brick in model.get_top_bricks():
        brick.initialize()
    cg = ComputationGraph(cost)
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [cost, gradient_norm, step_norm]

    train_monitor = TrainingDataMonitoring(variables=monitored_vars, after_batch=True,
                                           before_first_epoch=True, prefix='tra')

    extensions = [train_monitor, Timing(), Printing(after_batch=True),
                  FinishAfter(after_n_epochs=1000),
                  Printing(every_n_batches=1)]

    train_stream = DStream(datatype='train', config=cfig)
    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
Example #7
def fine_tuning(cost, args):
    param_values = load_parameter_values(args.fine_tuning)

    param_values[
        "/output_layer.W"] = np.concatenate((
            param_values["/output_layer.W"],
            0.1 * np.random.randn(args.state_dim, 40).astype(np.float32)))

    model = Model(cost)
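    # set_parameter_values() matches the loaded arrays to the model's shared
    # variables by their hierarchical names.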
    model.set_parameter_values(param_values)

    return cost
Example #8
    def testing(self, fea2obj):
        config = self._config
        dsdir = config['dsdir']
        devfile = dsdir + '/dev.txt'
        testfile = dsdir + '/test.txt'
        networkfile = config['net']
        batch_size = 10000  # int(config['batchsize'])
        devMentions = load_ent_ds(devfile)
        tstMentions = load_ent_ds(testfile)
        logger.info('#dev: %d #test: %d', len(devMentions), len(tstMentions))

        main_loop = load(networkfile + '.best.pkl')
        logger.info('Model loaded. Building prediction function...')
        old_model = main_loop.model
        logger.info(old_model.inputs)
        sources = [inp.name for inp in old_model.inputs]
#         fea2obj = build_input_objs(sources, config)
        t2idx = fea2obj['targets'].t2idx
        deterministic = str_to_bool(config['use_mean_pred']) if 'use_mean_pred' in config else True
        kl_weight = shared_floatx(0.001, 'kl_weight')
        entropy_weight = shared_floatx(0.001, 'entropy_weight')


        cost, _, y_hat, _, _,_,_ = build_model_new(fea2obj, len(t2idx), self._config, kl_weight, entropy_weight, deterministic=deterministic, test=True)
        model = Model(cost)
        model.set_parameter_values(old_model.get_parameter_values())

        theinputs = []
        for fe in fea2obj.keys():
            if 'targets' in fe:
                continue
            for inp in model.inputs:
                if inp.name == fe:
                    theinputs.append(inp)

#         theinputs = [inp for inp in model.inputs if inp.name != 'targets']
        print "theinputs: ", theinputs
        predict = theano.function(theinputs, y_hat)

        test_stream, num_samples_test = get_comb_stream(fea2obj, 'test', batch_size, shuffle=False)
        dev_stream, num_samples_dev = get_comb_stream(fea2obj, 'dev', batch_size, shuffle=False)
        logger.info('sources: %s -- number of test/dev samples: %d/%d', test_stream.sources, num_samples_test, num_samples_dev)
        idx2type = {idx:t for t,idx in t2idx.iteritems()}

        logger.info('Starting to apply on dev inputs...')
        self.applypredict(theinputs, predict, dev_stream, devMentions, num_samples_dev, batch_size, os.path.join(config['exp_dir'], config['matrixdev']), idx2type)
        logger.info('...apply on dev data finished')

        logger.info('Starting to apply on test inputs...')
        self.applypredict(theinputs, predict, test_stream, tstMentions, num_samples_test, batch_size, os.path.join(config['exp_dir'], config['matrixtest']), idx2type)
        logger.info('...apply on test data finished')
Example #9
    def __init__(self, model_name, model, stream, **kwargs):
        super(RunOnTest, self).__init__(**kwargs)

        self.model_name = model_name

        cg = Model(model.predict(**stream.inputs()))
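        # Model exposes the graph inputs and, below, a compiled Theano
        # function for running predictions on the test stream.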

        self.inputs = cg.inputs
        self.outputs = model.predict.outputs

        req_vars_test = model.predict.inputs + ['trip_id']
        self.test_stream = stream.test(req_vars_test)

        self.function = cg.get_theano_function()
Example #10
def evaluate(model, load_path, configs):
    with open(load_path + "trained_params_best.npz") as f:
        loaded = np.load(f)
        blocks_model = Model(model.cost)
        params_dicts = blocks_model.get_parameter_dict()
        params_names = params_dicts.keys()
        for param_name in params_names:
            param = params_dicts[param_name]
            # '/f_6_.W' --> 'f_6_.W'
            slash_index = param_name.find("/")
            param_name = param_name[slash_index + 1 :]
            assert param.get_value().shape == loaded[param_name].shape
            param.set_value(loaded[param_name])

        inps = ComputationGraph(model.error_rate).inputs
        eval_function = theano.function(inps, [model.error_rate, model.probabilities])
        _, vds = configs["get_streams"](100)
        data = vds.get_epoch_iterator().next()
        print "Valid_ER: " + str(eval_function(data[0], data[2], data[1])[0])
        return eval_function
Example #11
def train_model(cost, train_stream, valid_stream, valid_freq, valid_rare,
                load_location=None, save_location=None):
    cost.name = 'nll'
    perplexity = 2 ** (cost / tensor.log(2))
    perplexity.name = 'ppl'

    # Define the model
    model = Model(cost)

    # Load the parameters from a dumped model
    if load_location is not None:
        logger.info('Loading parameters...')
        model.set_param_values(load_parameter_values(load_location))

    cg = ComputationGraph(cost)
    algorithm = GradientDescent(cost=cost, step_rule=Scale(learning_rate=0.01),
                                params=cg.parameters)
    main_loop = MainLoop(
        model=model,
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            DataStreamMonitoring([cost, perplexity], valid_stream,
                                 prefix='valid_all', every_n_batches=5000),
            # Overfitting of rare words occurs between 3000 and 4000 iterations
            DataStreamMonitoring([cost, perplexity], valid_rare,
                                 prefix='valid_rare', every_n_batches=500),
            DataStreamMonitoring([cost, perplexity], valid_freq,
                                 prefix='valid_frequent',
                                 every_n_batches=5000),
            Printing(every_n_batches=500)
        ]
    )
    main_loop.run()

    # Save the main loop
    if save_location is not None:
        logger.info('Saving the main loop...')
        dump_manager = MainLoopDumpManager(save_location)
        dump_manager.dump(main_loop)
        logger.info('Saved')
Example #12
def train_model(cost, error_rate, train_stream,
                load_location=None, save_location=None):

    cost.name = "Cross_entropy"
    error_rate.name = 'Error_rate'

    # Define the model
    model = Model(cost)

    # Load the parameters from a dumped model
    if load_location is not None:
        logger.info('Loading parameters...')
        model.set_param_values(load_parameter_values(load_location))

    cg = ComputationGraph(cost)
    step_rule = Momentum(learning_rate=0.1, momentum=0.9)
    algorithm = GradientDescent(cost=cost, step_rule=step_rule,
                                params=cg.parameters)
    main_loop = MainLoop(
        model=model,
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            # DataStreamMonitoring([cost], test_stream, prefix='test',
            #                      after_epoch=False, every_n_epochs=10),
            DataStreamMonitoring([cost], train_stream, prefix='train',
                                 after_epoch=True),
            Printing(after_epoch=True)
        ]
    )
    main_loop.run()

    # Save the main loop
    if save_location is not None:
        logger.info('Saving the main loop...')
        dump_manager = MainLoopDumpManager(save_location)
        dump_manager.dump(main_loop)
        logger.info('Saved')
Example #13
def test_model():
    x = tensor.matrix('x')
    mlp1 = MLP([Tanh(), Tanh()], [10, 20, 30], name="mlp1")
    mlp2 = MLP([Tanh()], [30, 40], name="mlp2")
    h1 = mlp1.apply(x)
    h2 = mlp2.apply(h1)

    model = Model(h2)
    assert model.get_top_bricks() == [mlp1, mlp2]
    # The order of parameters returned is deterministic but
    # not sensible.
    assert list(model.get_parameter_dict().items()) == [
        ('/mlp2/linear_0.b', mlp2.linear_transformations[0].b),
        ('/mlp1/linear_1.b', mlp1.linear_transformations[1].b),
        ('/mlp1/linear_0.b', mlp1.linear_transformations[0].b),
        ('/mlp1/linear_0.W', mlp1.linear_transformations[0].W),
        ('/mlp1/linear_1.W', mlp1.linear_transformations[1].W),
        ('/mlp2/linear_0.W', mlp2.linear_transformations[0].W)]

    # Test getting and setting parameter values
    mlp3 = MLP([Tanh()], [10, 10])
    mlp3.allocate()
    model3 = Model(mlp3.apply(x))
    parameter_values = {
        '/mlp/linear_0.W': 2 * numpy.ones((10, 10),
                                          dtype=theano.config.floatX),
        '/mlp/linear_0.b': 3 * numpy.ones(10, dtype=theano.config.floatX)}
    model3.set_parameter_values(parameter_values)
    assert numpy.all(
        mlp3.linear_transformations[0].parameters[0].get_value() == 2)
    assert numpy.all(
        mlp3.linear_transformations[0].parameters[1].get_value() == 3)
    got_parameter_values = model3.get_parameter_values()
    assert len(got_parameter_values) == len(parameter_values)
    for name, value in parameter_values.items():
        assert_allclose(value, got_parameter_values[name])

    # Test exception is raised if parameter shapes don't match
    def helper():
        parameter_values = {
            '/mlp/linear_0.W': 2 * numpy.ones((11, 11),
                                              dtype=theano.config.floatX),
            '/mlp/linear_0.b': 3 * numpy.ones(11, dtype=theano.config.floatX)}
        model3.set_parameter_values(parameter_values)
    assert_raises(ValueError, helper)

    # Test name conflict handling
    mlp4 = MLP([Tanh()], [10, 10])

    def helper():
        Model(mlp4.apply(mlp3.apply(x)))
    assert_raises(ValueError, helper)
Example #14
def evaluate(ladder, load_path):
    with open(load_path + '/trained_params_best.npz') as f:
        loaded = np.load(f)
        model = Model(ladder.costs.total)
        params_dicts = model.get_parameter_dict()
        params_names = params_dicts.keys()
        for param_name in params_names:
            param = params_dicts[param_name]
            # '/f_6_.W' --> 'f_6_.W'
            slash_index = param_name.find('/')
            param_name = param_name[slash_index + 1:]
            assert param.get_value().shape == loaded[param_name].shape
            param.set_value(loaded[param_name])

    _, test_data_stream = get_mixed_streams(10000)
    test_data = test_data_stream.get_epoch_iterator().next()
    test_data_input = test_data[10]
    test_data_target = test_data[0]
    print 'Compiling ...'
    cg = ComputationGraph([ladder.costs.total])
    eval_ = theano.function(cg.inputs, ladder.error)
    print 'Test_set_Error: ' + str(eval_(test_data_input, test_data_target))
    import ipdb
    ipdb.set_trace()
Example #15
train_stream = get_stream('train', batch_size, h_dim, False)
data = train_stream.get_epoch_iterator(as_dict=True).next()
cg = ComputationGraph(cost)
f = theano.function(cg.inputs, cost)
print f(data['y'], data['x'], data['is_for_test'], data['drops'])

if not os.path.exists(save_path):
    os.makedirs(save_path)
log_path = save_path + '/log.txt'
fh = logging.FileHandler(filename=log_path)
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

print 'Building training process...'
model = Model(cost)
params = ComputationGraph(cost).parameters
print_num_params(params)
clipping = StepClipping(threshold=np.cast[floatX](1.0))
# Momentum(learning_rate=args.learning_rate, momentum=0.9)
rm_non_finite = RemoveNotFinite()
rms_prop = RMSProp(learning_rate=1e-3, decay_rate=0.5)
step_rule = CompositeRule([clipping, rms_prop, rm_non_finite])
algorithm = GradientDescent(
    cost=cost,
    parameters=params,
    step_rule=step_rule)

# train_stream, valid_stream = get_seq_mnist_streams(
#    h_dim, batch_size, update_prob)
train_stream = get_stream('train', batch_size, h_dim, False)
Example #16
def main(config, tr_stream, dev_stream, use_bokeh=False, the_task=None, the_track=None, use_embeddings=False, lang='german'):

    config['the_task'] = the_task
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        # end_embed is dimension of word embedding matrix in encoder; enc_nhids number of hidden units in encoder GRU
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, config['use_attention'], cost_type=config['error_fct'])
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)
    testVar = decoder.getTestVar(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)
   
    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    my_rng = numpy.random.RandomState(config['rng_value']) 
    if config['identity_init']:
      encoder.weights_init = decoder.weights_init = Identity()
    else:
      encoder.weights_init = decoder.weights_init = IsotropicGaussian(
          config['weight_scale'])
      encoder.rng = decoder.rng = my_rng
    
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    encoder.bidir.prototype.rng = my_rng
    decoder.transition.weights_init = Orthogonal()
    decoder.transition.rng = my_rng
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff'], seed=my_rng)

    cost = cg.outputs[0]

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))
    


    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))


    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)    

    # Set extensions
    logger.info("Initializing extensions")
    # this is ugly code, kept because I am not sure if the order of the extensions is important
    if 'track2' in config['saveto']:  # fewer epochs for track 2, because of more data
        if config['early_stopping']:
            extensions = [
                FinishAfter(after_n_epochs=config['finish_after'] / 2),
                # FinishAfter(after_n_batches=config['finish_after']),
                TrainingDataMonitoring([cost], after_batch=True),
                Printing(after_batch=True),
                CheckpointNMT(config['saveto'],
                              every_n_batches=config['save_freq'])
            ]
        else:
            extensions = [
                FinishAfter(after_n_epochs=config['finish_after'] / 2),
                # FinishAfter(after_n_batches=config['finish_after']),
                TrainingDataMonitoring([cost], after_batch=True),
                Printing(after_batch=True),
                CheckpointNMT(config['saveto'],
                              every_n_batches=config['save_freq'])
            ]
    else:
        if config['early_stopping']:
            extensions = [
                FinishAfter(after_n_epochs=config['finish_after']),
                # FinishAfter(after_n_batches=config['finish_after']),
                TrainingDataMonitoring([cost], after_batch=True),
                Printing(after_batch=True),
                CheckpointNMT(config['saveto'],
                              every_n_batches=config['save_freq'])
            ]
        else:
            extensions = [
                FinishAfter(after_n_epochs=config['finish_after']),
                # FinishAfter(after_n_batches=config['finish_after']),
                TrainingDataMonitoring([cost], after_batch=True),
                Printing(after_batch=True),
                CheckpointNMT(config['saveto'],
                              every_n_batches=config['save_freq'])
            ]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    
    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    #every_n_batches=1,
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=8))
                    #src_vocab_size=config['src_vocab_size']))
    
    # Add early stopping based on bleu
    if config['val_set'] is not None:
        logger.info("Building accuracy validator")
        extensions.append(
            AccuracyValidator(sampling_input, samples=samples, config=config,
                          model=search_model, data_stream=dev_stream,
                          after_training=True,
                          #after_epoch=True))
                          every_n_epochs=5))
    else:
        logger.info("No validation set given for this language")
    
    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))
        
    # Load pretrained embeddings if necessary; after the other parameters; ORDER MATTERS
    if use_embeddings:
        extensions.append(LoadEmbeddings(config['embeddings'][0] + lang + config['embeddings'][1]))
       
    
    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()])
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )
    
    # Train!
    main_loop.run()
Example #17
def main(argv):
    name = argv[1]
    files = map(lambda p: join(folder, p), listdir(folder))

    file = next(filter(lambda n: name in n, files))
    print(file)

    p = load_parameter_values(file)

    net = net_dvc((128,128))

    x = tensor.tensor4('image_features')
    y_hat = net.apply(x)

    g = Model(y_hat)
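    # The float32 cast below makes the loaded arrays match the dtype of the
    # model's parameters (theano floatX, typically float32).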

    for k,v in p.items():
        p[k] = v.astype('float32')

    g.set_parameter_values(p)

    a,t,v = get_dvc((128,128),trainning=False, shortcut=False)
    run = function([x], y_hat)

    def run_test(data):
        res = []
        for i in  data.get_epoch_iterator():
            res.extend(run(i[0]))
        return res

    def max_index(l):
        if l[0] > l[1]:
            return 0
        else:
            return 1

    def write_kaggle(f, l):
        f.write("id,label\n")
        for i,e in enumerate(l,start=1):
            f.write(str(i)+","+str(e)+"\n")

    def kaggle(file, data):
        write_kaggle(file,map(max_index, run_test(data)))

    def accuracy(data):
        res = []
        true = []
        for i in data.get_epoch_iterator():
            res.extend(run(i[0]))
            true.extend(i[1])
        res = map(max_index, res)

        total = 0
        equal = 0
        for r,t in zip(res,true):
            total += 1
            equal += 1 if r == t else 0

        return equal/total

    print("Training accuracy: ", accuracy(a))
    print("Test accuracy: ", accuracy(v))
    kaggle_file = join(result_folder, name+".kaggle")
    print(kaggle_file)
    with open(kaggle_file, 'w') as f:
        kaggle(f, t)
Example #18
algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.1))

train_set = H5PYDataset('mushrooms.hdf5', which_sets=('train', ))
train_stream = DataStream.default_stream(train_set,
                                         iteration_scheme=SequentialScheme(
                                             train_set.num_examples,
                                             batch_size=128))

test_set = H5PYDataset('mushrooms.hdf5', which_sets=('test', ))
test_stream = DataStream.default_stream(test_set,
                                        iteration_scheme=SequentialScheme(
                                            test_set.num_examples,
                                            batch_size=128))

main = MainLoop(model=Model(cost),
                data_stream=train_stream,
                algorithm=algorithm,
                extensions=[
                    FinishAfter(after_n_epochs=10),
                    Printing(),
                    TrainingDataMonitoring([cost, error_rate],
                                           after_batch=True,
                                           prefix='train'),
                    DataStreamMonitoring([cost, error_rate],
                                         after_batch=True,
                                         data_stream=test_stream,
                                         prefix='test'),
                    Plot('Train',
                         channels=[['train_cost', 'test_cost'],
                                   ['train_error_rate', 'test_error_rate']])
                ])

main.run()
Example #19
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'],name='word_encoder')
    decoder = Decoder(vocab_size=config['trg_vocab_size'],
                      embedding_dim=config['dec_embed'],
                      state_dim=config['dec_nhids'],
                      representation_dim=config['enc_nhids'] * 2,
                      match_function=config['match_function'],
                      use_doubly_stochastic=config['use_doubly_stochastic'],
                      lambda_ds=config['lambda_ds'],
                      use_local_attention=config['use_local_attention'],
                      window_size=config['window_size'],
                      use_step_decay_cost=config['use_step_decay_cost'],
                      use_concentration_cost=config['use_concentration_cost'],
                      lambda_ct=config['lambda_ct'],
                      use_stablilizer=config['use_stablilizer'],
                      lambda_st=config['lambda_st'])
    # here attended dim (representation_dim) of decoder is 2*enc_nhinds
    # because the context given by the encoder is a bidirectional context

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        context_sentences=[];
        context_sentence_masks=[];
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_'+str(i)));
            context_sentence_masks.append(tensor.matrix('context_'+str(i)+'_mask'));
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')
        dev_source = tensor.lmatrix('dev_source')
        dev_target=tensor.lmatrix('dev_target')

        # Get training and development set streams
        tr_stream = get_tr_stream_withContext(**config)
        dev_stream = get_dev_stream_with_grdTruth(**config)

        # Get cost of the model
        sentence_representations_list=encoder.apply(source_sentence, source_sentence_mask);
        sentence_representations_list=sentence_representations_list.dimshuffle(['x',0,1,2]);
        sentence_masks_list=source_sentence_mask.T.dimshuffle(['x',0,1]);
        for i in range(config['ctx_num']):
            tmp_rep=encoder.apply(context_sentences[i],context_sentence_masks[i]);
            tmp_rep=tmp_rep.dimshuffle(['x',0,1,2]);
            sentence_representations_list=tensor.concatenate([sentence_representations_list,tmp_rep],axis=0);
            sentence_masks_list=tensor.concatenate([sentence_masks_list,context_sentence_masks[i].T.dimshuffle(['x',0,1])],axis=0);


        cost = decoder.cost(sentence_representations_list,
                            sentence_masks_list,
                            target_sentence,
                            target_sentence_mask)

        logger.info('Creating computational graph')
        perplexity = tensor.exp(cost)
        perplexity.name = 'perplexity'
        costs_computer = function(context_sentences+context_sentence_masks+[target_sentence,
                                   target_sentence_mask,
                                   source_sentence,
                                   source_sentence_mask], (perplexity))
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init =decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init =decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(
                cg, enc_params+dec_params, config['weight_noise_ff'])


        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))


        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([perplexity], after_batch=True),
            CheckpointNMT(config['saveto'],
                          config['model_name'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        model_name=config['model_name'],
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if False:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq'],
                              n_best=3,
                              track_n_models=6))

        logger.info("Building perplexity validator")
        extensions.append(
                pplValidation(dev_source,dev_target, config=config,
                        model=costs_computer, data_stream=dev_stream,
                        model_name=config['model_name'],
                        every_n_batches=config['sampling_freq']))


        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        initial_learning_rate = config['initial_learning_rate']
        log_path = os.path.join(config['saveto'], 'log')
        if config['reload'] and os.path.exists(log_path):
            with open(log_path, 'rb') as source:
                log = cPickle.load(source)
                last = max(log.keys()) - 1
                if 'learning_rate' in log[last]:
                    initial_learning_rate = log[last]['learning_rate']

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([Scale(initial_learning_rate),
                                     StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()]))

        _learning_rate = algorithm.step_rule.components[0].learning_rate
        if config['learning_rate_decay']:
            extensions.append(
                LearningRateHalver(record_name='validation_cost',
                                   comparator=lambda x, y: x > y,
                                   learning_rate=_learning_rate,
                                   patience_default=3))
        else:
            extensions.append(OldModelRemover(saveto=config['saveto']))

        if config['learning_rate_grow']:
            extensions.append(
                LearningRateDoubler(record_name='validation_cost',
                                    comparator=lambda x, y: x < y,
                                    learning_rate=_learning_rate,
                                    patience_default=3))

        extensions.append(
            SimplePrinting(config['model_name'], after_batch=True))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions
        )

        # Train!
        main_loop.run()

    elif mode == 'ppl':
        # Create Theano variables
        logger.info('Creating theano variables')
        context_sentences=[];
        context_sentence_masks=[];
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_'+str(i)));
            context_sentence_masks.append(tensor.matrix('context_'+str(i)+'_mask'));
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')

        # Get training and development set streams
        #tr_stream = get_tr_stream_withContext(**config)
        dev_stream = get_dev_stream_withContext_grdTruth(**config)

        # Get cost of the model
        sentence_representations_list=encoder.apply(source_sentence, source_sentence_mask);
        sentence_representations_list=sentence_representations_list.dimshuffle(['x',0,1,2]);
        sentence_masks_list=source_sentence_mask.T.dimshuffle(['x',0,1]);
        for i in range(config['ctx_num']):
            tmp_rep=encoder.apply(context_sentences[i],context_sentence_masks[i]);
            tmp_rep=tmp_rep.dimshuffle(['x',0,1,2]);
            sentence_representations_list=tensor.concatenate([sentence_representations_list,tmp_rep],axis=0);
            sentence_masks_list=tensor.concatenate([sentence_masks_list,context_sentence_masks[i].T.dimshuffle(['x',0,1])],axis=0);


        cost = decoder.cost(sentence_representations_list,
                            sentence_masks_list,
                            target_sentence,
                            target_sentence_mask)

        logger.info('Creating computational graph')
        costs_computer = function(context_sentences+context_sentence_masks+[target_sentence,
                                   target_sentence_mask,
                                   source_sentence,
                                   source_sentence_mask], (cost))


        logger.info("Loading the model..")
        model = Model(cost)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load']);
        loader.set_model_parameters(model, loader.load_parameters_default())
        logger.info("Started Validation: ")

        ts = dev_stream.get_epoch_iterator()
        total_cost = 0.0
        total_tokens=0.0
        #pbar = ProgressBar(max_value=len(ts)).start()#modified
        pbar = ProgressBar(max_value=10000).start();
        for i, (ctx_0,ctx_0_mask,ctx_1,ctx_1_mask,ctx_2,ctx_2_mask,src, src_mask, trg, trg_mask) in enumerate(ts):
            costs  = costs_computer(*[ctx_0,ctx_1,ctx_2,ctx_0_mask,ctx_1_mask,ctx_2_mask,trg, trg_mask,src, src_mask])
            cost = costs.sum()
            total_cost+=cost
            total_tokens+=trg_mask.sum()
            pbar.update(i + 1)
        total_cost/=total_tokens;
        pbar.finish()
        #dev_stream.reset()

        # run afterprocess
        # self.ap.main()
        total_cost=2**total_cost;
        print("Average validation cost: " + str(total_cost));
    elif mode == 'translate':

        logger.info('Creating theano variables')
        context_sentences=[];
        context_sentence_masks=[];
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_'+str(i)));
            context_sentence_masks.append(tensor.matrix('context_'+str(i)+'_mask'));
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')

        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1
        trg_vocab = _ensure_special_tokens(
            cPickle.load(open(config['trg_vocab'], 'rb')), bos_idx=0,
            eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}
        config['batch_size'] = 1

        sentence_representations_list=encoder.apply(source_sentence, source_sentence_mask);
        sentence_representations_list=sentence_representations_list.dimshuffle(['x',0,1,2]);
        sentence_masks_list=source_sentence_mask.T.dimshuffle(['x',0,1]);
        for i in range(config['ctx_num']):
            tmp_rep=encoder.apply(context_sentences[i],context_sentence_masks[i]);
            tmp_rep=tmp_rep.dimshuffle(['x',0,1,2]);
            sentence_representations_list=tensor.concatenate([sentence_representations_list,tmp_rep],axis=0);
            sentence_masks_list=tensor.concatenate([sentence_masks_list,context_sentence_masks[i].T.dimshuffle(['x',0,1])],axis=0);
        generated = decoder.generate(sentence_representations_list,sentence_masks_list)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load']);
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started translation: ")
        test_stream = get_dev_stream_withContext(**config)
        ts = test_stream.get_epoch_iterator()
        rts = open(config['val_set_source']).readlines()
        ftrans_original = open(config['val_output_orig'], 'w')
        saved_weights = []
        total_cost = 0.0

        pbar = ProgressBar(max_value=len(rts)).start()
        for i, (line, line_raw) in enumerate(zip(ts, rts)):
            trans_in = line_raw[3].split()
            seqs=[];
            input_=[];
            input_mask=[];
            for j in range(config['ctx_num']+1):
                seqs.append(sutils._oov_to_unk(
                    line[2*j][0], config['src_vocab_size'], unk_idx))
                input_mask.append(numpy.tile(line[2*j+1][0],(config['beam_size'], 1)))
                input_.append(numpy.tile(seqs[j], (config['beam_size'], 1)))
            #v=costs_computer(input_[0]);
            # draw sample, checking to ensure we don't get an empty string back
            trans, costs, attendeds, weights = \
                beam_search.search(
                    input_values={source_sentence: input_[3],source_sentence_mask:input_mask[3],
                                  context_sentences[0]: input_[0],context_sentence_masks[0]:input_mask[0],
                                  context_sentences[1]: input_[1],context_sentence_masks[1]:input_mask[1],
                                  context_sentences[2]: input_[2],context_sentence_masks[2]:input_mask[2]},
                    max_length=3*len(seqs[2]), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            b = numpy.argsort(costs)[0]
            #best=numpy.argsort(costs)[0:config['beam_size']];
            #for b in best:
            try:
                total_cost += costs[b]
                trans_out = trans[b]
                totalLen=4*len(line[0][0]);
                #weight = weights[b][:, :totalLen]
                weight=weights
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i+1))
                trans_out = '<UNK>'
            saved_weights.append(weight)
            print(' '.join(trans_out), file=ftrans_original)
            pbar.update(i + 1)

        pbar.finish()
        logger.info("Total cost of the test: {}".format(total_cost))
        cPickle.dump(saved_weights, open(config['attention_weights'], 'wb'))
        ftrans_original.close()
        ap = afterprocesser(config)
        ap.main()
Example #20
def get_confidence_function(exp_config):

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target_suffix')
    target_sentence_mask = tensor.matrix('target_suffix_mask')
    target_prefix = tensor.lmatrix('target_prefix')
    target_prefix_mask = tensor.matrix('target_prefix_mask')

    logger.info('Creating computational graph')

    # build the model
    encoder = BidirectionalEncoder(exp_config['src_vocab_size'],
                                   exp_config['enc_embed'],
                                   exp_config['enc_nhids'])

    # Note: the 'min_risk' kwarg tells the decoder which sequence_generator and cost_function to use
    decoder = NMTPrefixDecoder(exp_config['trg_vocab_size'],
                               exp_config['dec_embed'],
                               exp_config['dec_nhids'],
                               exp_config['enc_nhids'] * 2,
                               loss_function='cross_entropy')

    # rename to match baseline NMT systems
    decoder.name = 'decoder'

    predictions, merged_states = decoder.prediction_tags(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask,
        target_prefix, target_prefix_mask)

    # WORKING: also get the softmax prediction feature
    # WORKING: add features for source len, prefix len, position in suffix (position in suffix only makes sense if we're training on predictions)
    p_shape = predictions.shape
    predictions = predictions.reshape([p_shape[0] * p_shape[1], p_shape[2]])
    prediction_softmax = tensor.nnet.nnet.softmax(
        predictions.reshape([p_shape[0] * p_shape[1],
                             p_shape[2]])).reshape(p_shape)
    prediction_feature = prediction_softmax.max(axis=-1)[:, :, None]
    all_features = tensor.concatenate([merged_states, prediction_feature],
                                      axis=-1)

    confidence_output = decoder.sequence_generator.confidence_predictions(
        all_features)

    logger.info('Creating computational graph')
    confidence_model = Model(confidence_output)

    # Note that the parameters of this model must be pretrained, otherwise this doesn't make sense
    param_values = LoadNMT.load_parameter_values(
        exp_config['saved_parameters'], brick_delimiter=None)
    LoadNMT.set_model_parameters(confidence_model, param_values)

    confidence_param_values = LoadNMT.load_parameter_values(
        exp_config['confidence_saved_parameters'], brick_delimiter=None)
    LoadNMT.set_model_parameters(confidence_model, confidence_param_values)

    confidence_function = confidence_model.get_theano_function()

    return confidence_function
Example #21
def main(config,
         tr_stream,
         dev_stream,
         use_bokeh=False,
         src_vocab=None,
         trg_vocab=None):

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)
    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING

    # TODO: allow user to remove some params from the graph, for example if embeddings should be kept static
    if config.get('l2_regularization', False) is True:
        l2_reg_alpha = config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)

        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

        # why do we need to name the cost variable? Where did the original name come from?
        cost.name = 'decoder_cost_cost'

    cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # allow user to externally initialize some params
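    # get_parameter_dict() returns an OrderedDict keyed by hierarchical names
    # such as '/bidirectionalencoder/embeddings.W'.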
    model_params = training_model.get_parameter_dict()
    if config.get('external_embeddings', None) is not None:
        for key in config['external_embeddings']:
            path_to_params = config['external_embeddings'][key]
            logger.info(
                'Replacing {} parameters with external params at: {}'.format(
                    key, path_to_params))
            external_params = numpy.load(path_to_params)
            len_external_idx = external_params.shape[0]
            print(external_params.shape)
            # Working: look in the dictionary and overwrite the correct rows
            existing_params = model_params[key].get_value()
            if key == '/bidirectionalencoder/embeddings.W':
                vocab = src_vocab
            elif key == '/decoder/sequencegenerator/readout/lookupfeedbackwmt15/lookuptable.W':
                vocab = trg_vocab
            else:
                raise KeyError(
                    'Unknown embedding parameter key: {}'.format(key))
            for k, i in vocab.items():
                if i < len_external_idx:
                    existing_params[i] = external_params[i]

            # model_params_shape = model_params[key].get_value().shape
            # assert model_params[key].get_value().shape == external_params.shape, ("Parameter dims must not change,"
            #                                                                       "shapes {} and {} do not match".
            #                                                                       format(model_params_shape,
            #                                                                              external_params.shape))
            model_params[key].set_value(existing_params)

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = []

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
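        # an all-ones source mask: at sampling time there is no padding to mask out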
        # note that generated contains several different outputs
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    # Note: this is broken for unicode chars
    #if config['hook_samples'] >= 1:
    #    logger.info("Building sampler")
    #    extensions.append(
    #        Sampler(model=search_model, data_stream=tr_stream,
    #                hook_samples=config['hook_samples'],
    #                every_n_batches=config['sampling_freq'],
    #                src_vocab_size=config['src_vocab_size']))

    # WORKING: remove these validators in favor of Async
    # TODO: implement burn-in in the validation extension (don't fire until we're past the burn-in iteration)
    # Add early stopping based on bleu
    # if config.get('bleu_script', None) is not None:
    #     logger.info("Building bleu validator")
    #     extensions.append(
    #         BleuValidator(sampling_input, samples=samples, config=config,
    #                       model=search_model, data_stream=dev_stream,
    #                       normalize=config['normalized_bleu'],
    #                       every_n_batches=config['bleu_val_freq']))

    # Add early stopping based on Meteor
    # if config.get('meteor_directory', None) is not None:
    #     logger.info("Building meteor validator")
    #     extensions.append(
    #         MeteorValidator(sampling_input, samples=samples, config=config,
    #                       model=search_model, data_stream=dev_stream,
    #                       normalize=config['normalized_bleu'],
    #                       every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # if there is dropout or random noise, we need to use the output of the modified graph
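    # NOTE: eval(config['step_rule'])() looks up the step-rule class named by the
    # config string (e.g. 'Adam' or 'AdaDelta') and instantiates it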
    if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0:
        algorithm = GradientDescent(cost=cg.outputs[0],
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))
    else:
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))

    # enrich the logged information
    extensions.extend([
        Timing(every_n_batches=100),
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ])

    # External non-blocking validation
    extensions.append(
        RunExternalValidation(config=config,
                              every_n_batches=config['bleu_val_freq']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[['decoder_cost_cost'],
                           ['validation_set_bleu_score'],
                           ['validation_set_meteor_score']],
                 every_n_batches=1))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Example #22
0
    cost_monitor.name = "cost_monitor"
elif conf.task == 'framewise':
    cost_train = categorical_crossentropy_batch().apply(y_hat_softmax, y, x_m)
    cost_train.name = 'cost'
    cost_monitor = cost_train
else:
    raise ValueError(conf.task)

recognizer.initialize()
cg = ComputationGraph([cost_train, y_hat, x_m, y, y_m])

weights = VariableFilter(roles=[WEIGHT])(cg.variables)
cg = apply_noise(cg, weights, conf.weight_noise)

#************* training algorithm *************
model = Model(cost_train)
if conf.step_rule == 'AdaDelta':
    step_rule = AdaDelta()
elif conf.step_rule == 'Momentum':
    step_rule = Momentum(learning_rate=conf.learning_rate,
                         momentum=conf.momentum)
else:
    raise ValueError('step_rule not known: {}'.format(conf.step_rule))

step_rule = CompositeRule([step_rule, StepClipping(conf.step_clipping)])
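# the step computed by the base rule is then clipped to conf.step_clipping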
algorithm = GradientDescent(cost=cost_train,
                            parameters=cg.parameters,
                            step_rule=step_rule)

#***************** main loop ****************
train_monitor = TrainingDataMonitoring([cost_monitor], prefix="train")
Example #23
0
def main(n_hiddens=1500,
         n_out=200,
         noise_std=0.5,
         learning_rate=2e-1,
         momentum=0.9,
         gamma=2.0):

    ######################
    # Model
    ######################
    import theano
    from theano.sandbox.rng_mrg import MRG_RandomStreams
    import numpy
    import theano.tensor as tensor
    from lvq.lvq import AaronLVQ, SupervisedNG, initialize_prototypes

    from blocks.bricks import Linear, LinearMaxout
    from blocks.bricks import Rectifier, Softmax, Logistic
    from blocks.bricks.cost import MisclassificationRate
    from blocks.config import config
    from blocks.initialization import Uniform, Constant
    from blocks.model import Model

    from lvq.batch_norm import MLP
    #from blocks.bricks import MLP

    seed = config.default_seed
    rng = MRG_RandomStreams(seed)

    x = tensor.tensor4('features')
    flat_x = tensor.flatten(x, outdim=2)
    flat_x_noise = flat_x + rng.normal(size=flat_x.shape, std=noise_std)
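    # additive Gaussian input noise (std=noise_std) is used only for the training
    # pass below; the inference pass uses the clean flat_x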
    y = tensor.imatrix('targets')
    flat_y = tensor.flatten(y, outdim=1)

    act = Rectifier()
    mlp = MLP(dims=[784, n_hiddens, n_hiddens, n_out],
              activations=[act, act, None])
    mlp.weights_init = Uniform(0.0, 0.001)
    mlp.biases_init = Constant(0.0)
    mlp.initialize()

    train_out = mlp.apply(flat_x_noise)
    test_out = mlp.inference(flat_x)

    lamb = theano.shared(numpy.float32(10.0))
    lvq = SupervisedNG(10,
                       n_out,
                       nonlin=True,
                       gamma=gamma,
                       lamb=lamb,
                       name='lvq')

    test_loss, test_misclass = lvq.apply(test_out, flat_y, 'test')
    loss, misclass = lvq.apply(train_out, flat_y, 'train')

    model = Model(loss)

    ######################
    # Data
    ######################

    #from mnist import MNIST
    from fuel.datasets import MNIST
    from fuel.streams import DataStream
    from fuel.schemes import ShuffledScheme, SequentialScheme
    from fuel.transformers import ScaleAndShift, ForceFloatX

    mnist_train = MNIST([
        'train'
    ])  #MNIST('train', drop_input=False, sources=('features', 'targets'))
    mnist_test = MNIST([
        'test'
    ])  # MNIST('test', drop_input=False, sources=('features', 'targets'))

    batch_size = 100  #Batch size for training
    batch_size_mon = 2000  # Batch size for monitoring and batch normalization
    n_batches = int(
        numpy.ceil(float(mnist_train.num_examples) / batch_size_mon))
    num_protos = 10

    ind = range(mnist_train.num_examples)
    train_ind = ind[:50000]
    val_ind = ind[50000:]

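    # scale pixel values from [0, 255] into [0, 1] and cast features to floatX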
    def preprocessing(data_stream):
        return ForceFloatX(ScaleAndShift(data_stream,
                                         1 / 255.0,
                                         0.0,
                                         which_sources=('features', )),
                           which_sources=('features', ))

    train_stream_mon = preprocessing(
        DataStream(mnist_train,
                   iteration_scheme=ShuffledScheme(train_ind, batch_size_mon)))
    train_stream_bn = preprocessing(
        DataStream(mnist_train,
                   iteration_scheme=ShuffledScheme(train_ind, batch_size_mon)))
    train_stream = preprocessing(
        DataStream(mnist_train,
                   iteration_scheme=ShuffledScheme(train_ind, batch_size)))
    valid_stream = preprocessing(
        DataStream(mnist_train,
                   iteration_scheme=ShuffledScheme(val_ind, batch_size)))
    test_stream = preprocessing(
        DataStream(mnist_test,
                   iteration_scheme=ShuffledScheme(mnist_test.num_examples,
                                                   batch_size_mon)))

    initialize_prototypes(lvq, x, train_out, train_stream_mon)

    ######################
    # Training
    ######################

    from blocks.main_loop import MainLoop
    from blocks.extensions import FinishAfter, Timing, Printing
    from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring
    from blocks.algorithms import GradientDescent, Momentum, RMSProp, Adam
    from blocks.extensions.saveload import Checkpoint
    from lvq.extensions import EarlyStopping, LRDecay, MomentumSwitchOff, NCScheduler

    from lvq.batch_norm import BatchNormExtension
    lr = theano.shared(numpy.float32(1e-3))
    step_rule = Momentum(
        learning_rate, 0.9
    )  #Adam(learning_rate=lr) #RMSProp(learning_rate=1e-5, max_scaling=1e4)
    num_epochs = 100

    earlystop = EarlyStopping('valid_lvq_apply_misclass', 100,
                              './exp/alvq.pkl')

    main_loop = MainLoop(
        model=model,
        data_stream=train_stream,
        algorithm=GradientDescent(cost=model.outputs[0],
                                  parameters=model.parameters,
                                  step_rule=step_rule),
        extensions=[
            FinishAfter(after_n_epochs=num_epochs),
            BatchNormExtension(model, train_stream_bn, n_batches),
            LRDecay(step_rule.learning_rate, [20, 40, 60, 80]),
            MomentumSwitchOff(step_rule.momentum, num_epochs - 20),
            NCScheduler(lamb, 30., 0, num_epochs),
            DataStreamMonitoring(variables=[test_loss, test_misclass],
                                 data_stream=train_stream_mon,
                                 prefix='train'),
            DataStreamMonitoring(variables=[test_loss, test_misclass],
                                 data_stream=valid_stream,
                                 prefix='valid'),
            DataStreamMonitoring(variables=[test_loss, test_misclass],
                                 data_stream=test_stream,
                                 prefix='test'),
            Timing(), earlystop,
            Printing(after_epoch=True)
        ])
    main_loop.run()

    return main_loop.status.get('best_test_lvq_apply_misclass', None)
Example #24
0
def train_snli_model(new_training_job,
                     config,
                     save_path,
                     params,
                     fast_start,
                     fuel_server,
                     seed,
                     model='simple'):
    if config['exclude_top_k'] > config['num_input_words'] and config[
            'num_input_words'] > 0:
        raise Exception("Some words have neither word nor def embedding")
    c = config
    logger = configure_logger(name="snli_baseline_training",
                              log_file=os.path.join(save_path, "log.txt"))
    if not os.path.exists(save_path):
        logger.info("Start a new job")
        os.mkdir(save_path)
    else:
        logger.info("Continue an existing job")
    with open(os.path.join(save_path, "cmd.txt"), "w") as f:
        f.write(" ".join(sys.argv))

    # Make data paths nice
    for path in [
            'dict_path', 'embedding_def_path', 'embedding_path', 'vocab',
            'vocab_def', 'vocab_text'
    ]:
        if c.get(path, ''):
            if not os.path.isabs(c[path]):
                c[path] = os.path.join(fuel.config.data_path[0], c[path])

    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    main_loop_best_val_path = os.path.join(save_path, 'main_loop_best_val.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')

    # Save config to save_path
    json.dump(config, open(os.path.join(save_path, "config.json"), "w"))

    if model == 'simple':
        nli_model, data, used_dict, used_retrieval, _ = _initialize_simple_model_and_data(
            c)
    elif model == 'esim':
        nli_model, data, used_dict, used_retrieval, _ = _initialize_esim_model_and_data(
            c)
    else:
        raise NotImplementedError()

    # Compute cost
    s1, s2 = T.lmatrix('sentence1'), T.lmatrix('sentence2')

    if c['dict_path']:
        assert os.path.exists(c['dict_path'])
        s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix(
            'sentence2_def_map')
        def_mask = T.fmatrix("def_mask")
        defs = T.lmatrix("defs")
    else:
        s1_def_map, s2_def_map = None, None
        def_mask = None
        defs = None

    s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask')
    y = T.ivector('label')

    cg = {}
    for train_phase in [True, False]:
        # NOTE: Please don't change outputs of cg
        if train_phase:
            with batch_normalization(nli_model):
                pred = nli_model.apply(s1,
                                       s1_mask,
                                       s2,
                                       s2_mask,
                                       def_mask=def_mask,
                                       defs=defs,
                                       s1_def_map=s1_def_map,
                                       s2_def_map=s2_def_map,
                                       train_phase=train_phase)
        else:
            pred = nli_model.apply(s1,
                                   s1_mask,
                                   s2,
                                   s2_mask,
                                   def_mask=def_mask,
                                   defs=defs,
                                   s1_def_map=s1_def_map,
                                   s2_def_map=s2_def_map,
                                   train_phase=train_phase)

        cost = CategoricalCrossEntropy().apply(y.flatten(), pred)
        error_rate = MisclassificationRate().apply(y.flatten(), pred)
        cg[train_phase] = ComputationGraph([cost, error_rate])

    # Weight decay (TODO: Make it less bug prone)
    if model == 'simple':
        weights_to_decay = VariableFilter(
            bricks=[dense for dense, relu, bn in nli_model._mlp],
            roles=[WEIGHT])(cg[True].variables)
        weight_decay = np.float32(c['l2']) * sum(
            (w**2).sum() for w in weights_to_decay)
    elif model == 'esim':
        weight_decay = 0.0
    else:
        raise NotImplementedError()

    final_cost = cg[True].outputs[0] + weight_decay
    final_cost.name = 'final_cost'

    # Add updates for population parameters
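    # population statistics are tracked with an exponential moving average:
    # population <- 0.1 * batch_statistic + 0.9 * population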

    if c.get("bn", True):
        pop_updates = get_batch_normalization_updates(cg[True])
        extra_updates = [(p, m * 0.1 + p * (1 - 0.1)) for p, m in pop_updates]
    else:
        pop_updates = []
        extra_updates = []

    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params, 'rb') as src:
            loaded_params = load_parameters(src)
            cg[True].set_parameter_values(loaded_params)
            for param, m in pop_updates:
                param.set_value(loaded_params[get_brick(
                    param).get_hierarchical_name(param)])

    if os.path.exists(os.path.join(save_path, "main_loop.tar")):
        logger.warning("Manually loading BN stats :(")
        with open(os.path.join(save_path, "main_loop.tar"), 'rb') as src:
            loaded_params = load_parameters(src)

        for param, m in pop_updates:
            param.set_value(
                loaded_params[get_brick(param).get_hierarchical_name(param)])

    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', batch_size=4).get_epoch_iterator())
        s1.tag.test_value = test_value_data[0]
        s1_mask.tag.test_value = test_value_data[1]
        s2.tag.test_value = test_value_data[2]
        s2_mask.tag.test_value = test_value_data[3]
        y.tag.test_value = test_value_data[4]

    # Freeze embeddings
    if not c['train_emb']:
        frozen_params = [
            p for E in nli_model.get_embeddings_lookups() for p in E.parameters
        ]
        train_params = [p for p in cg[True].parameters]
        assert len(set(frozen_params) & set(train_params)) > 0
    else:
        frozen_params = []
    if not c.get('train_def_emb', 1):
        frozen_params_def = [
            p for E in nli_model.get_def_embeddings_lookups()
            for p in E.parameters
        ]
        train_params = [p for p in cg[True].parameters]
        assert len(set(frozen_params_def) & set(train_params)) > 0
        frozen_params += frozen_params_def
    train_params = [p for p in cg[True].parameters if p not in frozen_params]
    train_params_keys = [
        get_brick(p).get_hierarchical_name(p) for p in train_params
    ]

    # Optimizer
    algorithm = GradientDescent(cost=final_cost,
                                on_unused_sources='ignore',
                                parameters=train_params,
                                step_rule=Adam(learning_rate=c['lr']))
    algorithm.add_updates(extra_updates)
    m = Model(final_cost)

    parameters = m.get_parameter_dict()  # Blocks version mismatch
    logger.info("Trainable parameters" + "\n" +
                pprint.pformat([(key, parameters[key].get_value().shape)
                                for key in sorted(train_params_keys)],
                               width=120))
    logger.info("# of parameters {}".format(
        sum([
            np.prod(parameters[key].get_value().shape)
            for key in sorted(train_params_keys)
        ])))

    ### Monitored args ###
    train_monitored_vars = [final_cost] + cg[True].outputs
    monitored_vars = cg[False].outputs
    val_acc = monitored_vars[1]
    to_monitor_names = [
        'def_unk_ratio', 's1_merged_input_rootmean2', 's1_def_mean_rootmean2',
        's1_gate_rootmean2', 's1_compose_gate_rootmean2'
    ]
    for k in to_monitor_names:
        train_v, valid_v = VariableFilter(name=k)(
            cg[True]), VariableFilter(name=k)(cg[False])
        if len(train_v):
            logger.info("Adding {} tracking".format(k))
            train_monitored_vars.append(train_v[0])
            monitored_vars.append(valid_v[0])
        else:
            logger.warning("Didnt find {} in cg".format(k))

    if c['monitor_parameters']:
        for name in train_params_keys:
            param = parameters[name]
            num_elements = np.prod(param.get_value().shape)
            norm = param.norm(2) / num_elements
            grad_norm = algorithm.gradients[param].norm(2) / num_elements
            step_norm = algorithm.steps[param].norm(2) / num_elements
            stats = tensor.stack(norm, grad_norm, step_norm,
                                 step_norm / grad_norm)
            stats.name = name + '_stats'
            train_monitored_vars.append(stats)

    regular_training_stream = data.get_stream('train',
                                              batch_size=c['batch_size'],
                                              seed=seed)

    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=regular_training_stream.sources,
            hwm=100,
            produces_examples=regular_training_stream.produces_examples)
    else:
        training_stream = regular_training_stream

    ### Build extensions ###

    extensions = [
        # Load(main_loop_path, load_iteration_state=True, load_log=True)
        #     .set_conditions(before_training=not new_training_job),
        StartFuelServer(regular_training_stream,
                        stream_path,
                        hwm=100,
                        script_path=os.path.join(
                            os.path.dirname(__file__),
                            "../bin/start_fuel_server.py"),
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq']),
        ProgressBar(),
        RetrievalPrintStats(retrieval=used_retrieval,
                            every_n_batches=c['mon_freq_valid'],
                            before_training=not fast_start),
        Timestamp(),
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq']),
    ]

    if c['layout'] == 'snli':
        validation = DataStreamMonitoring(monitored_vars,
                                          data.get_stream('valid',
                                                          batch_size=14,
                                                          seed=seed),
                                          before_training=not fast_start,
                                          on_resumption=True,
                                          after_training=True,
                                          every_n_batches=c['mon_freq_valid'],
                                          prefix='valid')
        extensions.append(validation)
    elif c['layout'] == 'mnli':
        validation = DataStreamMonitoring(monitored_vars,
                                          data.get_stream('valid_matched',
                                                          batch_size=14,
                                                          seed=seed),
                                          every_n_batches=c['mon_freq_valid'],
                                          on_resumption=True,
                                          after_training=True,
                                          prefix='valid_matched')
        validation_mismatched = DataStreamMonitoring(
            monitored_vars,
            data.get_stream('valid_mismatched', batch_size=14, seed=seed),
            every_n_batches=c['mon_freq_valid'],
            before_training=not fast_start,
            on_resumption=True,
            after_training=True,
            prefix='valid_mismatched')
        extensions.extend([validation, validation_mismatched])
    else:
        raise NotImplementedError()

    # Similarity trackers for embeddings
    if len(c.get('vocab_def', '')):
        retrieval_vocab = Vocabulary(c['vocab_def'])
    else:
        retrieval_vocab = data.vocab

    retrieval_all = Retrieval(vocab_text=retrieval_vocab,
                              dictionary=used_dict,
                              max_def_length=c['max_def_length'],
                              exclude_top_k=0,
                              max_def_per_word=c['max_def_per_word'])

    for name in [
            's1_word_embeddings', 's1_dict_word_embeddings',
            's1_translated_word_embeddings'
    ]:
        variables = VariableFilter(name=name)(cg[False])
        if len(variables):
            s1_emb = variables[0]
            logger.info("Adding similarity tracking for " + name)
            # A bit sloppy about downcast

            if "dict" in name:
                embedder = construct_dict_embedder(theano.function(
                    [s1, defs, def_mask, s1_def_map],
                    s1_emb,
                    allow_input_downcast=True),
                                                   vocab=data.vocab,
                                                   retrieval=retrieval_all)
                extensions.append(
                    SimilarityWordEmbeddingEval(
                        embedder=embedder,
                        prefix=name,
                        every_n_batches=c['mon_freq_valid'],
                        before_training=not fast_start))
            else:
                embedder = construct_embedder(theano.function(
                    [s1], s1_emb, allow_input_downcast=True),
                                              vocab=data.vocab)
                extensions.append(
                    SimilarityWordEmbeddingEval(
                        embedder=embedder,
                        prefix=name,
                        every_n_batches=c['mon_freq_valid'],
                        before_training=not fast_start))

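    # NOTE: val_acc is cg[False].outputs[1], i.e. the validation misclassification
    # rate, so lower is better and choose_best=min is correct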
    track_the_best = TrackTheBest(validation.record_name(val_acc),
                                  before_training=not fast_start,
                                  every_n_epochs=c['save_freq_epochs'],
                                  after_training=not fast_start,
                                  every_n_batches=c['mon_freq_valid'],
                                  choose_best=min)
    extensions.append(track_the_best)

    # Special care for serializing embeddings
    if len(c.get('embedding_path', '')) or len(c.get('embedding_def_path',
                                                     '')):
        extensions.insert(
            0,
            LoadNoUnpickling(main_loop_path,
                             load_iteration_state=True,
                             load_log=True).set_conditions(
                                 before_training=not new_training_job))
        extensions.append(
            Checkpoint(main_loop_path,
                       parameters=train_params + [p for p, m in pop_updates],
                       save_main_loop=False,
                       save_separately=['log', 'iteration_state'],
                       before_training=not fast_start,
                       every_n_epochs=c['save_freq_epochs'],
                       after_training=not fast_start).add_condition(
                           ['after_batch', 'after_epoch'],
                           OnLogRecord(track_the_best.notification_name),
                           (main_loop_best_val_path, )))
    else:
        extensions.insert(
            0,
            Load(main_loop_path, load_iteration_state=True,
                 load_log=True).set_conditions(
                     before_training=not new_training_job))
        extensions.append(
            Checkpoint(main_loop_path,
                       parameters=cg[True].parameters +
                       [p for p, m in pop_updates],
                       before_training=not fast_start,
                       every_n_epochs=c['save_freq_epochs'],
                       after_training=not fast_start).add_condition(
                           ['after_batch', 'after_epoch'],
                           OnLogRecord(track_the_best.notification_name),
                           (main_loop_best_val_path, )))

    extensions.extend([
        DumpCSVSummaries(save_path,
                         every_n_batches=c['mon_freq_valid'],
                         after_training=True),
        DumpTensorflowSummaries(save_path,
                                after_epoch=True,
                                every_n_batches=c['mon_freq_valid'],
                                after_training=True),
        Printing(every_n_batches=c['mon_freq_valid']),
        PrintMessage(msg="save_path={}".format(save_path),
                     every_n_batches=c['mon_freq']),
        FinishAfter(after_n_batches=c['n_batches']).add_condition(
            ['after_batch'],
            OnLogStatusExceed('iterations_done', c['n_batches']))
    ])

    logger.info(extensions)

    ### Run training ###

    if "VISDOM_SERVER" in os.environ:
        print("Running visdom server")
        ret = subprocess.Popen([
            os.path.join(os.path.dirname(__file__), "../visdom_plotter.py"),
            "--visdom-server={}".format(os.environ['VISDOM_SERVER']),
            "--folder={}".format(save_path)
        ])
        time.sleep(0.1)
        if ret.returncode is not None:
            raise Exception("visdom plotter process terminated unexpectedly")
        atexit.register(lambda: os.kill(ret.pid, signal.SIGINT))

    model = Model(cost)
    for p, m in pop_updates:
        model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=model,
                         extensions=extensions)

    assert os.path.exists(save_path)
    main_loop.run()
Example #25
0
def evaluate(c, tar_path, *args, **kwargs):
    """
    Performs rudimentary evaluation of SNLI/MNLI run

    * Runs on valid and test given network
    * Saves all predictions
    * Saves embedding matrix
    * Saves results.json and predictions.csv
    """

    # Load and configure
    model = kwargs['model']
    assert c.endswith("json")
    c = json.load(open(c))

    # Very ugly absolute path fix
    ABS_PATHS = [
        "data/", "/mnt/users/jastrzebski/local/dict_based_learning/data/",
        "/data/cf9ffb48-61bd-40dc-a011-b2e7e5acfd72/"
    ]
    from six import string_types
    for abs_path in ABS_PATHS:
        for k in c:
            if isinstance(c[k], string_types):
                if c[k].startswith(abs_path):
                    c[k] = c[k][len(abs_path):]

    # Make data paths nice
    for path in [
            'dict_path', 'embedding_def_path', 'embedding_path', 'vocab',
            'vocab_def', 'vocab_text'
    ]:
        if c.get(path, ''):
            if not os.path.isabs(c[path]):
                c[path] = os.path.join(fuel.config.data_path[0], c[path])

    logging.info("Updating config with " + str(kwargs))
    c.update(**kwargs)

    # NOTE: this ensures we don't miss crucial definitions for definition-heavy
    # words; usually a good idea
    c['max_def_per_word'] = c['max_def_per_word'] * 2

    assert tar_path.endswith("tar")
    dest_path = os.path.dirname(tar_path)
    prefix = os.path.splitext(os.path.basename(tar_path))[0]

    s1_decoded, s2_decoded = T.lmatrix('sentence1'), T.lmatrix('sentence2')

    if c['dict_path']:
        s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix(
            'sentence2_def_map')
        def_mask = T.fmatrix("def_mask")
        defs = T.lmatrix("defs")
    else:
        s1_def_map, s2_def_map = None, None
        def_mask = None
        defs = None

    s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask')

    if model == 'simple':
        model, data, used_dict, used_retrieval, used_vocab = _initialize_simple_model_and_data(
            c)
    elif model == 'esim':
        model, data, used_dict, used_retrieval, used_vocab = _initialize_esim_model_and_data(
            c)
    else:
        raise NotImplementedError()

    pred = model.apply(s1_decoded,
                       s1_mask,
                       s2_decoded,
                       s2_mask,
                       def_mask=def_mask,
                       defs=defs,
                       s1_def_map=s1_def_map,
                       s2_def_map=s2_def_map,
                       train_phase=False)
    cg = ComputationGraph([pred])
    if c.get("bn", True):
        bn_params = [
            p for p in VariableFilter(bricks=[BatchNormalization])(cg)
            if hasattr(p, "set_value")
        ]
    else:
        bn_params = []

    # Load model
    model = Model(cg.outputs)
    parameters = model.get_parameter_dict()  # Blocks version mismatch
    logging.info(
        "Trainable parameters" + "\n" +
        pprint.pformat([(key, parameters[key].get_value().shape)
                        for key in sorted([
                            get_brick(param).get_hierarchical_name(param)
                            for param in cg.parameters
                        ])],
                       width=120))
    logging.info("# of parameters {}".format(
        sum([
            np.prod(parameters[key].get_value().shape) for key in sorted([
                get_brick(param).get_hierarchical_name(param)
                for param in cg.parameters
            ])
        ])))
    with open(tar_path, 'rb') as src:
        params = load_parameters(src)

        loaded_params_set = set(params.keys())
        model_params_set = set([
            get_brick(param).get_hierarchical_name(param)
            for param in cg.parameters
        ])

        logging.info("Loaded extra parameters")
        logging.info(loaded_params_set - model_params_set)
        logging.info("Missing parameters")
        logging.info(model_params_set - loaded_params_set)
    model.set_parameter_values(params)

    if c.get("bn", True):
        logging.info("Loading " + str([
            get_brick(param).get_hierarchical_name(param)
            for param in bn_params
        ]))
        for param in bn_params:
            param.set_value(
                params[get_brick(param).get_hierarchical_name(param)])
        for p in bn_params:
            model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p

    # Read logs
    logs = pd.read_csv(os.path.join(dest_path, "logs.csv"))
    best_val_acc = logs['valid_misclassificationrate_apply_error_rate'].min()
    logging.info("Best measured valid acc: " + str(best_val_acc))

    # NOTE(kudkudak): We need this to have comparable mean rank and embedding scores
    reference_vocab = Vocabulary(
        os.path.join(fuel.config.data_path[0], c['data_path'], 'vocab.txt'))
    vocab_all = Vocabulary(
        os.path.join(
            fuel.config.data_path[0], c['data_path'],
            'vocab_all.txt'))  # Can include OOV words, which is interesting
    retrieval_all = Retrieval(vocab_text=used_vocab,
                              dictionary=used_dict,
                              max_def_length=c['max_def_length'],
                              exclude_top_k=0,
                              max_def_per_word=c['max_def_per_word'])
    # logging.info("Calculating dict and word embeddings for vocab.txt and vocab_all.txt")
    # for name in ['s1_word_embeddings', 's1_dict_word_embeddings']:
    #     variables = VariableFilter(name=name)(cg)
    #     if len(variables):
    #         s1_emb = variables[0]
    #         # A bit sloppy about downcast
    #
    #         if "dict" in name:
    #             embedder = construct_dict_embedder(
    #                 theano.function([s1_decoded, defs, def_mask, s1_def_map], s1_emb, allow_input_downcast=True),
    #                 vocab=data.vocab, retrieval=retrieval_all)
    #         else:
    #             embedder = construct_embedder(theano.function([s1_decoded], s1_emb, allow_input_downcast=True),
    #                 vocab=data.vocab)
    #
    #         for v_name, v in [("vocab_all", vocab_all), ("vocab", reference_vocab)]:
    #             logging.info("Calculating {} embeddings for {}".format(name, v_name))

    # Predict
    predict_fnc = theano.function(cg.inputs, pred)
    results = {}
    batch_size = 14
    for subset in ['valid', 'test']:
        logging.info("Predicting on " + subset)
        stream = data.get_stream(subset, batch_size=batch_size, seed=778)
        it = stream.get_epoch_iterator()
        rows = []
        for ex in tqdm.tqdm(it, total=10000 / batch_size):
            ex = dict(zip(stream.sources, ex))
            inp = [ex[v.name] for v in cg.inputs]
            prob = predict_fnc(*inp)
            label_pred = np.argmax(prob, axis=1)

            for id in range(len(prob)):
                s1_decoded = used_vocab.decode(ex['sentence1'][id]).split()
                s2_decoded = used_vocab.decode(ex['sentence2'][id]).split()

                assert used_vocab == data.vocab

                s1_decoded = [
                    '*' + w + '*'
                    if used_vocab.word_to_id(w) > c['num_input_words'] else w
                    for w in s1_decoded
                ]
                s2_decoded = [
                    '*' + w + '*'
                    if used_vocab.word_to_id(w) > c['num_input_words'] else w
                    for w in s2_decoded
                ]

                # Different difficulty metrics

                # text_unk_percentage
                s1_no_pad = [w for w in ex['sentence1'][id] if w != 0]
                s2_no_pad = [w for w in ex['sentence2'][id] if w != 0]

                s1_unk_percentage = sum([
                    1. for w in s1_no_pad if w == used_vocab.unk
                ]) / len(s1_no_pad)
                s2_unk_percentage = sum([
                    1. for w in s2_no_pad if w == used_vocab.unk
                ]) / len(s2_no_pad)

                # mean freq word
                s1_mean_freq = np.mean([
                    0 if w == data.vocab.unk else used_vocab._id_to_freq[w]
                    for w in s1_no_pad
                ])
                s2_mean_freq = np.mean([
                    0 if w == data.vocab.unk else used_vocab._id_to_freq[w]
                    for w in s2_no_pad
                ])

                # mean rank word (UNK is max rank)
                # NOTE(kudkudak): Will break if we reindex unk between vocabs :P
                s1_mean_rank = np.mean([
                    reference_vocab.size() if reference_vocab.word_to_id(
                        used_vocab.id_to_word(w)) == reference_vocab.unk else
                    reference_vocab.word_to_id(used_vocab.id_to_word(w))
                    for w in s1_no_pad
                ])

                s2_mean_rank = np.mean([
                    reference_vocab.size() if reference_vocab.word_to_id(
                        used_vocab.id_to_word(w)) == reference_vocab.unk else
                    reference_vocab.word_to_id(used_vocab.id_to_word(w))
                    for w in s2_no_pad
                ])

                rows.append({
                    "pred": label_pred[id],
                    "true_label": ex['label'][id],
                    "s1": ' '.join(s1_decoded),
                    "s2": ' '.join(s2_decoded),
                    "s1_unk_percentage": s1_unk_percentage,
                    "s2_unk_percentage": s2_unk_percentage,
                    "s1_mean_freq": s1_mean_freq,
                    "s2_mean_freq": s2_mean_freq,
                    "s1_mean_rank": s1_mean_rank,
                    "s2_mean_rank": s2_mean_rank,
                    "p_0": prob[id, 0],
                    "p_1": prob[id, 1],
                    "p_2": prob[id, 2]
                })

        preds = pd.DataFrame(rows, columns=rows[0].keys())
        preds.to_csv(
            os.path.join(dest_path,
                         prefix + '_predictions_{}.csv'.format(subset)))
        results[subset] = {}
        results[subset]['misclassification'] = 1 - np.mean(
            preds.pred == preds.true_label)

        if subset == "valid" and np.abs(
            (1 - np.mean(preds.pred == preds.true_label)) -
                best_val_acc) > 0.001:
            logging.error("!!!")
            logging.error(
                "Found different best_val_acc. Probably due to changed specification of the model class."
            )
            logging.error("Discrepancy {}".format(
                (1 - np.mean(preds.pred == preds.true_label)) - best_val_acc))
            logging.error("!!!")

        logging.info(results)

    json.dump(results,
              open(os.path.join(dest_path, prefix + '_results.json'), "w"))
Example #26
0
def main(save_to, cost_name, learning_rate, momentum, num_epochs):
    mlp = MLP([None], [784, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    scores = mlp.apply(x)

    batch_size = y.shape[0]
    indices = tensor.arange(y.shape[0])
    target_scores = tensor.set_subtensor(
        tensor.zeros((batch_size, 10))[indices, y.flatten()], 1)
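    # target_scores is a one-hot encoding of y: a (batch_size, 10) matrix of zeros
    # with a 1 at each row's true class index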
    score_diff = scores - target_scores

    # Logistic Regression
    if cost_name == 'lr':
        cost = Softmax().categorical_cross_entropy(y.flatten(), scores).mean()
    # MSE
    elif cost_name == 'mse':
        cost = (score_diff**2).mean()
    # Perceptron
    elif cost_name == 'perceptron':
        cost = (scores.max(axis=1) - scores[indices, y.flatten()]).mean()
    # TLE
    elif cost_name == 'minmin':
        cost = abs(score_diff[indices, y.flatten()]).mean()
        cost += abs(score_diff[indices, scores.argmax(axis=1)]).mean()
    # TLEcut
    elif cost_name == 'minmin_cut':
        # Score of the groundtruth should be greater or equal than its target score
        cost = tensor.maximum(0, -score_diff[indices, y.flatten()]).mean()
        # Score of the prediction should be less or equal than its actual score
        cost += tensor.maximum(0, score_diff[indices,
                                             scores.argmax(axis=1)]).mean()
    # TLE2
    elif cost_name == 'minmin2':
        cost = ((score_diff[tensor.arange(y.shape[0]), y.flatten()])**2).mean()
        cost += ((score_diff[tensor.arange(y.shape[0]),
                             scores.argmax(axis=1)])**2).mean()
    # Direct loss minimization
    elif cost_name == 'direct':
        epsilon = 0.1
        cost = (-scores[indices,
                        (scores + epsilon * target_scores).argmax(axis=1)] +
                scores[indices, scores.argmax(axis=1)]).mean()
        cost /= epsilon
    elif cost_name == 'svm':
        cost = (scores[indices, (scores - 1 * target_scores).argmax(axis=1)] -
                scores[indices, y.flatten()]).mean()
    else:
        raise ValueError("Unknown cost " + cost)

    error_rate = MisclassificationRate().apply(y.flatten(), scores)
    error_rate.name = 'error_rate'

    cg = ComputationGraph([cost])
    cost.name = 'cost'

    mnist_train = MNIST(("train", ))
    mnist_test = MNIST(("test", ))

    if learning_rate is None:
        learning_rate = 0.0001
    if momentum is None:
        momentum = 0.0
    rule = Momentum(learning_rate=learning_rate, momentum=momentum)
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=rule)
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate],
                             Flatten(DataStream.default_stream(
                                 mnist_test,
                                 iteration_scheme=SequentialScheme(
                                     mnist_test.num_examples, 500)),
                                     which_sources=('features', )),
                             prefix="test"),
        # CallbackExtension(
        #    lambda: rule.learning_rate.set_value(rule.learning_rate.get_value() * 0.9),
        #    after_epoch=True),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm), rule.learning_rate
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        Printing()
    ]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(
            Plot('MNIST example',
                 channels=[['test_cost', 'test_error_rate'],
                           ['train_total_gradient_norm']]))

    main_loop = MainLoop(algorithm,
                         Flatten(DataStream.default_stream(
                             mnist_train,
                             iteration_scheme=SequentialScheme(
                                 mnist_train.num_examples, 50)),
                                 which_sources=('features', )),
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()

    df = pandas.DataFrame.from_dict(main_loop.log, orient='index')
    res = {
        'cost': cost_name,
        'learning_rate': learning_rate,
        'momentum': momentum,
        'train_cost': df.train_cost.iloc[-1],
        'test_cost': df.test_cost.iloc[-1],
        'best_test_cost': df.test_cost.min(),
        'train_error': df.train_error_rate.iloc[-1],
        'test_error': df.test_error_rate.iloc[-1],
        'best_test_error': df.test_error_rate.min()
    }
    res = {
        k: float(v) if isinstance(v, numpy.ndarray) else v
        for k, v in res.items()
    }
    json.dump(res, sys.stdout)
    sys.stdout.flush()
Example #27
0
def main(config, tr_stream, dev_stream):
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']

    # Construct model
    logger.info('Building RNN encoder-decoder')

    encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['src_dgru_nhids'],
                                   config['enc_nhids'], config['src_dgru_depth'], config['bidir_encoder_depth'])

    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2, config['transition_depth'],
                      config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx, target_bos_idx)

    representation = encoder.apply(source_char_seq, source_sample_matrix, source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq, target_sample_matrix,
                        target_resample_matrix, target_char_aux, target_char_mask,
                        target_word_mask, target_prev_char_seq, target_prev_char_aux)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.decimator.bidir_w.prototype.recurrent.weights_init = Orthogonal()
    for layer_n in range(config['src_dgru_depth']):
        encoder.decimator.dgru.transitions[layer_n].weights_init = Orthogonal()
    for layer_n in range(config['bidir_encoder_depth']):
        encoder.children[1 + layer_n].prototype.recurrent.weights_init = Orthogonal()
    if config['trg_igru_depth'] == 1:
        decoder.interpolator.igru.weights_init = Orthogonal()
    else:
        for layer_n in range(config['trg_igru_depth']):
            decoder.interpolator.igru.transitions[layer_n].weights_init = Orthogonal()
    for layer_n in range(config['trg_dgru_depth']):
        decoder.interpolator.feedback_brick.dgru.transitions[layer_n].weights_init = Orthogonal()
    for layer_n in range(config['transition_depth']):
        decoder.transition.transitions[layer_n].weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)
    # Set up training algorithm
    logger.info("Initializing training algorithm")

    # You could use 1e-4 in Adam, but manual decay converges faster:
    # decay to 5e-4 after roughly 30K updates, to 2e-4 after roughly 90K,
    # and finally to 1e-4 after roughly 180K (see the sketch below)
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 Adam(learning_rate=1e-3)]))
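
    # A possible (untested) sketch of the manual decay described above, assuming
    # Blocks' SharedVariableModifier extension and that the Adam step rule keeps
    # its learning rate in a shared variable:
    #
    #   from blocks.extensions.training import SharedVariableModifier
    #   adam = Adam(learning_rate=1e-3)
    #   algorithm = GradientDescent(
    #       cost=cost, parameters=cg.parameters,
    #       step_rule=CompositeRule([StepClipping(config['step_clipping']), adam]))
    #
    #   def decay_schedule(iterations_done, current_value):
    #       if iterations_done > 180000:
    #           return numpy.float32(1e-4)
    #       elif iterations_done > 90000:
    #           return numpy.float32(2e-4)
    #       elif iterations_done > 30000:
    #           return numpy.float32(5e-4)
    #       return current_value
    #
    #   # once the `extensions` list below has been created:
    #   extensions.append(SharedVariableModifier(adam.learning_rate, decay_schedule))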

    # Set extensions
    logger.info("Initializing extensions")
    # Extensions
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    train_monitor = CostCurve([cost, gradient_norm, step_norm], config=config, after_batch=True,
                              before_first_epoch=True, prefix='tra')
    extensions = [
        train_monitor, Timing(),
        Printing(every_n_batches=config['print_freq']),
        FinishAfter(after_n_batches=config['finish_after']),
        CheckpointNMT(saveto=config['saveto'], dump_freq=config['dump_freq'], every_n_batches=config['save_freq'], )]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1:
        logger.info("Building sampling model")
        generated = decoder.generate(representation, source_word_mask)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[config['transition_depth']]))  # generated[transition_depth] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'], transition_depth=config['transition_depth'],
                    every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size']))


    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )

    # Train!
    main_loop.run()
Example #28
0
def main(config,
         tr_stream,
         dev_stream,
         source_vocab,
         target_vocab,
         use_bokeh=False):

    # add the tags from this function to the IMT datastream
    # prediction function signature
    # [target_suffix, source_mask, source, target_prefix_mask, target_prefix, target_suffix_mask]
    prediction_function = get_prediction_function(exp_config=config)

    tr_stream = Mapping(
        tr_stream,
        CallPredictionFunctionOnStream(prediction_function,
                                       [1, 0, 5, 4, 7, 6]),
        #tr_stream = Mapping(tr_stream, CallFunctionOnStream(prediction_function, [6, 1, 0, 5, 4, 7]),
        add_sources=('predictions', 'orig_readouts', 'prediction_tags'))

    # now datastream has 11 things
    import ipdb
    ipdb.set_trace()

    # WORKING: call prediction function twice to get new readouts on predictions instead of reference suffixes
    # the only difference is the index of the suffix
    tr_stream = Mapping(tr_stream,
                        CallPredictionFunctionOnStream(prediction_function,
                                                       [1, 0, 5, 4, 7, 8]),
                        add_sources=('dummy_predictions', 'readouts',
                                     'dummy_prediction_tags'))

    import ipdb
    ipdb.set_trace()

    # Create the prediction confidence model
    # the first draft of this model uses the readout output (before the post-merge step) as the per-timestep state vector

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')

    # Note that the _names_ are changed from normal NMT
    # for IMT training, we use only the suffix as the reference
    target_sentence = tensor.lmatrix('target_suffix')
    target_sentence_mask = tensor.matrix('target_suffix_mask')

    target_prefix = tensor.lmatrix('target_prefix')
    target_prefix_mask = tensor.matrix('target_prefix_mask')

    # symbolic variable which tags each timestep as GOOD/BAD
    # Note: later this might be tags for a hypothesis i.e. from TER(p), right now the timesteps are actually determined by the reference
    # By zipping the confidence model output with the reference, we get the model's confidence that this reference word
    # will be predicted correctly
    prediction_tags = tensor.matrix('prediction_tags')
    readouts = tensor.tensor3('readouts')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])

    decoder = NMTPrefixDecoder(config['trg_vocab_size'],
                               config['dec_embed'],
                               config['dec_nhids'],
                               config['enc_nhids'] * 2,
                               loss_function='cross_entropy')

    # rename to match baseline NMT systems
    decoder.name = 'decoder'

    cost = decoder.confidence_cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask,
        target_prefix, target_prefix_mask, readouts, prediction_tags)

    # WORKING: add l2 regularization

    logger.info('Creating computational graph')
    # working: implement cost for confidence model
    cg = ComputationGraph(cost)

    # INITIALIZATION
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    #cost_cg = ComputationGraph(cost)
    if config['l2_reg']:
        l2_reg_alpha = config['l2_reg_alpha']
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())
        # adding the regularization terms created a new unnamed variable, so name it again
        cost.name = 'cost'
        cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of the maxout layer (as in GroundHog)
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables if x.name in set([
                'confidence_model1_apply_output',
                'confidence_model2_apply_output',
                'confidence_model3_apply_output'
            ])
        ]
        # if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # WORKING: implement confidence -- remove all params except output model
    cost_model = Model(cost)

    model_params = cost_model.get_parameter_dict()
    trainable_params = cg.parameters
    import ipdb
    ipdb.set_trace()
    print('trainable params')
    #params_to_remove = [model_params[k] for k in model_params.keys() if 'confidence' not in k]
    #for p in params_to_remove:
    #    trainable_params.remove(p)

    # target_embeddings = model.get_parameter_dict()['/target_recurrent_lm_with_alignments/target_embeddings.W']
    # trainable_params.remove(source_embeddings)
    # trainable_params.remove(target_embeddings)
    # END WORKING: implement confidence -- remove all params except output model
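    # A minimal sketch of the disabled idea above (kept commented out): restrict training to the confidence output
    # layers only. The 'confidence' substring match is an assumption about this model's parameter naming.
    # for name, param in model_params.items():
    #     if 'confidence' not in name and param in trainable_params:
    #         trainable_params.remove(param)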

    # TODO: fixed dropout mask for recurrent params?
    # Print shapes
    # shapes = [param.get_value().shape for param in cg.parameters]
    # logger.info("Parameter shapes: ")
    # for shape, count in Counter(shapes).most_common():
    #     logger.info('    {:15}: {}'.format(shape, count))
    # logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    # enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
    #                            Selector(decoder).get_parameters())
    # logger.info("Parameter names: ")
    # for name, value in enc_dec_param_dict.items():
    #     logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    # logger.info("Total number of parameters: {}"
    #             .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        # TrainingDataMonitoring(trainable_params, after_batch=True),
        # Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # WORKING: confidence prediction
    # monitor everything that could possibly be relevant

    # Set up the sampling graph for validation during training
    # Theano variables for the sampling graph
    # Note this also loads the model parameters
    sampling_vars = load_params_and_get_beam_search(config,
                                                    encoder=encoder,
                                                    decoder=decoder)
    beam_search, search_model, samples, sampling_input, sampling_prefix = sampling_vars

    #if config['hook_samples'] >= 1:
    #    logger.info("Building sampler")
    #    extensions.append(
    #        Sampler(model=search_model, data_stream=tr_stream,
    #                hook_samples=config['hook_samples'],
    #                every_n_batches=config['sampling_freq'],
    #                src_vocab=source_vocab,
    #                trg_vocab=target_vocab,
    #                src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    #if config['bleu_script'] is not None:
    #    logger.info("Building bleu validator")
    #    extensions.append(
    #        BleuValidator(sampling_input, sampling_prefix, samples=samples, config=config,
    #                      model=search_model, data_stream=dev_stream,
    #                      src_vocab=source_vocab,
    #                      trg_vocab=target_vocab,
    #                      normalize=config['normalized_bleu'],
    #                      every_n_batches=config['bleu_val_freq']))

    # TODO: add first-word accuracy validation
    # TODO: add IMT meteor early stopping
    #if config.get('imt_f1_validation', None) is not None:
    #    logger.info("Building imt F1 validator")
    #    extensions.append(
    #        IMT_F1_Validator(sampling_input, sampling_prefix,
    #                         samples=samples,
    #                         config=config,
    #                         model=search_model, data_stream=dev_stream,
    #                         src_vocab=source_vocab,
    #                         trg_vocab=target_vocab,
    #                         normalize=config['normalized_bleu'],
    #                         every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # TODO: hacking here: get the predictions of the confidence model using the `readouts` source of the data_stream

    # Note that the parameters of this model must be pretrained, otherwise this doesn't make sense
    # confidence_predictions = decoder.get_confidence(readouts)
    # confidence_prediction_model = Model(confidence_predictions)
    #
    # confidence_param_values = LoadNMT.load_parameter_values(config['confidence_saved_parameters'], brick_delimiter=None)
    # LoadNMT.set_model_parameters(confidence_prediction_model, confidence_param_values)
    #
    # confidence_prediction_func = confidence_prediction_model.get_theano_function()

    # import ipdb; ipdb.set_trace()

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            # Plot(config['model_save_directory'], channels=[['decoder_confidence_cost_cost']],
            Plot(config['model_save_directory'],
                 channels=[['cost']],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")

    # WORKING: implement confidence model
    # if there is dropout or random noise, we need to use the output of the modified graph
    algorithm = GradientDescent(
        cost=cg.outputs[0],
        parameters=trainable_params,
        step_rule=CompositeRule([
            StepClipping(config['step_clipping']),
            eval(config['step_rule'])()
        ]),
        # eval(config['step_rule'])(), RemoveNotFinite()]),
        # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]),
        on_unused_sources='warn')
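    # on_unused_sources='warn' lets the stream carry sources the cost graph never consumes
    # (e.g. the 'dummy_predictions' source added above) without raising an error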
    #if config['dropout'] < 1.0:
    #   algorithm = GradientDescent(
    #       cost=cg.outputs[0], parameters=trainable_params,
    #       step_rule=CompositeRule([StepClipping(config['step_clipping']),
    #                         eval(config['step_rule'])(), RemoveNotFinite()]),
    #       # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]),
    #       on_unused_sources='warn'
    #   )
    #else:
    #   algorithm = GradientDescent(
    #       cost=cost, parameters=cg.parameters,
    #       step_rule=CompositeRule([StepClipping(config['step_clipping']),
    #                                eval(config['step_rule'])()]),
    #       on_unused_sources='warn'
    #   )
    # END WORKING: implement confidence model

    import ipdb
    ipdb.set_trace()

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # WORKING: debugging confidence
    # get theano function from model
    # WORKING: implement word-level confidence cost
    # @application(inputs=['representation', 'source_sentence_mask',
    #                      'target_sentence_mask', 'target_sentence', 'target_prefix_mask', 'target_prefix'],
    #              outputs=['cost'])
    # def confidence_cost(self, representation, source_sentence_mask,
    #                     target_sentence, target_sentence_mask, target_prefix, target_prefix_mask):

    logger.info('Creating theano variables')

    # WORKING: 26.9.16 -- get confidence outputs directly from (source, prefix, suffix) inputs
    # This is equivalent to forced alignment --> confidence scores
    # Note: but this section should probably be in "evaluate" mode, not here in "train"

    # source_sentence = tensor.lmatrix('source')
    # source_sentence_mask = tensor.matrix('source_mask')

    # Note that the _names_ are changed from normal NMT
    # for IMT training, we use only the suffix as the reference
    #target_sentence = tensor.lmatrix('target_suffix')
    #target_sentence_mask = tensor.matrix('target_suffix_mask')
    # TODO: change names back to *_suffix, there is currently a theano function name error
    # TODO: in the GradientDescent Algorithm

    #target_prefix = tensor.lmatrix('target_prefix')
    #target_prefix_mask = tensor.matrix('target_prefix_mask')

    # confidence_output = decoder.confidence_cost(
    #     encoder.apply(source_sentence, source_sentence_mask),
    #     source_sentence_mask, target_sentence, target_sentence_mask,
    #     target_prefix, target_prefix_mask)

    # confidence_model = Model(confidence_output)

    # t_cost_func = confidence_model.get_theano_function()
    # inputs
    # [source_mask, source, target_prefix_mask, target_prefix, target_suffix_mask, target_suffix]

    #import ipdb;ipdb.set_trace()

    # get the right args from the datastream
    # TODO: just print source, prefix, suffix, prediction, correct to new files -- this makes sure everything is aligned
    # OUTPUT_DIR = '/media/1tb_drive/imt_models/word_prediction_accuracy_experiments/en-de/exp_1'
    # for the_file in os.listdir(OUTPUT_DIR):
    #     file_path = os.path.join(OUTPUT_DIR, the_file)
    #     try:
    #         if os.path.isfile(file_path):
    #             os.unlink(file_path)
    #     except Exception as e:
    #         print(e)
    #
    # def write_file_truncate_mask(filename, data, mask, mode='a'):
    #     ''' data is list of list '''
    #
    #     assert len(data) == len(mask)
    #     with codecs.open(filename, mode, encoding='utf8') as out:
    #         for l, m in zip(data, mask):
    #             output = u' '.join(l[:int(m.sum())]) + u'\n'
    #             out.write(output)
    #     logger.info('Wrote file: {}'.format(filename))
    #
    #
    # target_ivocab = {k:v.decode('utf8') for v,k in target_vocab.items()}
    # source_ivocab = {k:v.decode('utf8') for v,k in source_vocab.items()}
    # import ipdb; ipdb.set_trace()
    # tag_ivocab = {1: 'True', 0: 'False'}
    #
    # test_iter = tr_stream.get_epoch_iterator()
    # it = 0
    # for t_source, t_source_mask, t_target, t_target_mask, t_target_prefix, t_target_prefix_mask, t_target_suffix, t_target_suffix_mask in test_iter:
    #     if it <= 1000:
    #         it += 1
    #         t_cost = t_cost_func(t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask, t_target_suffix)
    #         readouts = t_cost[0]
    #         preds = readouts.argmax(axis=2)
    #         correct = preds.T == t_target_suffix
    #
    #
    #         source_output = os.path.join(OUTPUT_DIR,'sources.en')
    #         prefix_output = os.path.join(OUTPUT_DIR,'prefixes.de')
    #         suffix_output = os.path.join(OUTPUT_DIR,'suffixes.de')
    #         prediction_output = os.path.join(OUTPUT_DIR,'predictions.de')
    #         correct_output = os.path.join(OUTPUT_DIR,'prefix_word_prediction_acc.out')
    #
    #         source_text = [[source_ivocab[w] for w in s] for s in t_source]
    #         prefix_text = [[target_ivocab[w] for w in s] for s in t_target_prefix]
    #         suffix_text = [[target_ivocab[w] for w in s] for s in t_target_suffix]
    #         pred_text = [[target_ivocab[w] for w in s] for s in preds.T]
    #         correct_text = [[tag_ivocab[w] for w in s] for s in correct]
    #
    #
    #         for triple in zip([source_output, prefix_output, suffix_output, prediction_output, correct_output],
    #                           [source_text, prefix_text, suffix_text, pred_text, correct_text],
    #                           [t_source_mask, t_target_prefix_mask, t_target_suffix_mask, t_target_suffix_mask, t_target_suffix_mask]):
    #             write_file_truncate_mask(*triple)
    #     else:
    #         break
    #
    # import ipdb; ipdb.set_trace()

    #t_cost = t_cost_func(t_source, t_target_prefix)
    #t_cost = t_cost_func(t_target_suffix, t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask)
    #t_cost = t_cost_func(t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask, t_target_suffix)

    #    return confidence_cost, flat_y, confidence_logits, readouts

    #predictions = t_cost[0].argmax(axis=2)

    # TODO: next step -- print gradients and weights during training find out where nan is coming from
    # TODO: look at the gradient of this function with respect to parameters? -- see here: http://deeplearning.net/software/theano/tutorial/gradients.html

    # TODO: function which adds right/wrong tags for model predictions to the datastream. In this case we can learn a simple linear model as a baseline
    # TODO: print predictions for each batch for each timestep to file -- _dont shuffle_ so that we get the right order

    # import ipdb;ipdb.set_trace()

    # from blocks reverse_words example
    # observables = [
    #     cost, min_energy, max_energy, mean_activation,
    #     batch_size, max_length, cost_per_character,
    #     algorithm.total_step_norm, algorithm.total_gradient_norm]
    # for name, parameter in trainable_params.items():
    #     observables.append(parameter.norm(2).copy(name + "_norm"))
    #     observables.append(algorithm.gradients[parameter].norm(2).copy(
    #         name + "_grad_norm"))

    for i, (k, v) in enumerate(algorithm.updates):
        v.name = k.name + '_{}'.format(i)
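    # give every update variable a unique name so monitoring extensions can report them without name collisions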

    aux_vars = [v for v in cg.auxiliary_variables[-3:]]
    # import ipdb; ipdb.set_trace()

    extensions.extend([
        TrainingDataMonitoring([cost], after_batch=True),
        # TrainingDataMonitoring([v for k,v in algorithm.updates[:2]], after_batch=True),
        # TrainingDataMonitoring(aux_vars, after_batch=True),
        # TrainingDataMonitoring(trainable_params, after_batch=True),
        Printing(after_batch=True)
    ])

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)
    import ipdb
    ipdb.set_trace()

    # Train!
    main_loop.run()
def main():
    nclasses = 27

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--length", type=int, default=180)
    parser.add_argument("--num-epochs", type=int, default=100)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=1e-3)
    parser.add_argument("--epsilon", type=float, default=1e-5)
    parser.add_argument("--num-hidden", type=int, default=1000)
    parser.add_argument("--baseline", action="store_true")
    parser.add_argument("--initialization", choices="identity glorot orthogonal uniform".split(), default="identity")
    parser.add_argument("--initial-gamma", type=float, default=1e-1)
    parser.add_argument("--initial-beta", type=float, default=0)
    parser.add_argument("--cluster", action="store_true")
    parser.add_argument("--activation", choices=list(activations.keys()), default="tanh")
    parser.add_argument("--optimizer", choices="sgdmomentum adam rmsprop", default="rmsprop")
    parser.add_argument("--continue-from")
    parser.add_argument("--evaluate")
    parser.add_argument("--dump-hiddens")
    args = parser.parse_args()

    np.random.seed(args.seed)
    blocks.config.config.default_seed = args.seed

    if args.continue_from:
        from blocks.serialization import load

        main_loop = load(args.continue_from)
        main_loop.run()
        sys.exit(0)

    graphs, extensions, updates = construct_graphs(args, nclasses)

    ### optimization algorithm definition
    if args.optimizer == "adam":
        optimizer = Adam(learning_rate=args.learning_rate)
    elif args.optimizer == "rmsprop":
        optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9)
    elif args.optimizer == "sgdmomentum":
        optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99)
    step_rule = CompositeRule([StepClipping(1.0), optimizer])
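    # clip the step to an L2 norm of at most 1.0 before handing it to the chosen optimizer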
    algorithm = GradientDescent(
        cost=graphs["training"].outputs[0], parameters=graphs["training"].parameters, step_rule=step_rule
    )
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]

    # step monitor
    step_channels = []
    step_channels.extend(
        [
            algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
            for name, param in model.get_parameter_dict().items()
        ]
    )
    step_channels.append(algorithm.total_step_norm.copy(name="total_step_norm"))
    step_channels.append(algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
    step_channels.extend(graphs["training"].outputs)
    logger.warning("constructing training data monitor")
    extensions.append(TrainingDataMonitoring(step_channels, prefix="iteration", after_batch=True))

    # parameter monitor
    extensions.append(
        DataStreamMonitoring(
            [param.norm(2).copy(name="parameter.norm:%s" % name) for name, param in model.get_parameter_dict().items()],
            data_stream=None,
            after_epoch=True,
        )
    )

    validation_interval = 500
    # performance monitor
    for situation in "training inference".split():
        if situation == "inference" and not args.evaluate:
            # save time when we don't need the inference graph
            continue

        for which_set in "train valid test".split():
            logger.warning("constructing %s %s monitor" % (which_set, situation))
            channels = list(graphs[situation].outputs)
            extensions.append(
                DataStreamMonitoring(
                    channels,
                    prefix="%s_%s" % (which_set, situation),
                    every_n_batches=validation_interval,
                    data_stream=get_stream(
                        which_set=which_set, batch_size=args.batch_size, num_examples=10000, length=args.length
                    ),
                )
            )

    extensions.extend(
        [
            TrackTheBest("valid_training_error_rate", "best_valid_training_error_rate"),
            DumpBest("best_valid_training_error_rate", "best.zip"),
            FinishAfter(after_n_epochs=args.num_epochs),
            # FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50),
            Checkpoint("checkpoint.zip", on_interrupt=False, every_n_epochs=1, use_cpickle=True),
            DumpLog("log.pkl", after_epoch=True),
        ]
    )

    if not args.cluster:
        extensions.append(ProgressBar())

    extensions.extend([Timing(), Printing(every_n_batches=validation_interval), PrintingTo("log")])
    main_loop = MainLoop(
        data_stream=get_stream(which_set="train", batch_size=args.batch_size, length=args.length, augment=True),
        algorithm=algorithm,
        extensions=extensions,
        model=model,
    )

    if args.dump_hiddens:
        dump_hiddens(args, main_loop)
        return

    if args.evaluate:
        evaluate(args, main_loop)
        return

    main_loop.run()
Example #30
0
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Get training and development set streams
        tr_stream = get_tr_stream(**config)
        dev_stream = get_dev_stream(**config)

        # Get cost of the model
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(sampling_input,
                                         sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(bricks=[decoder.sequence_generator],
                                        name="outputs")(ComputationGraph(
                                            generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model,
                        data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input,
                              samples=samples,
                              config=config,
                              model=search_model,
                              data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En',
                     channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model,
                             algorithm=algorithm,
                             data_stream=tr_stream,
                             extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')

        # Get test set stream
        test_stream = get_dev_stream(config['test_set'], config['src_vocab'],
                                     config['src_vocab_size'],
                                     config['unk_id'])
        ftrans = open(config['test_set'] + '.trans.out', 'w')

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        trg_vocab = _ensure_special_tokens(pickle.load(
            open(config['trg_vocab'])),
                                           bos_idx=0,
                                           eos_idx=trg_eos_idx,
                                           unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        for i, line in enumerate(test_stream.get_epoch_iterator()):

            seq = sutils._oov_to_unk(line[0], config['src_vocab_size'],
                                     unk_idx)
            input_ = numpy.tile(seq, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = \
                beam_search.search(
                    input_values={sampling_input: input_},
                    max_length=3*len(seq), eol_symbol=src_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
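            # (dividing each hypothesis cost by its length counters beam search's bias toward short outputs)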
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            best = numpy.argsort(costs)[0]
            try:
                total_cost += costs[best]
                trans_out = trans[best]

                # convert idx to words
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)

            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i + 1))
                trans_out = '<UNK>'

            print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info("Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
        ftrans.close()
def train(step_rule, label_dim, state_dim, epochs, seed, dropout, test_cost,
          experiment_path, features, weight_noise, to_watch, patience,
          batch_size, batch_norm, **kwargs):

    print '.. TIMIT experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    # ------------------------------------------------------------------------
    # Streams

    rng = np.random.RandomState(seed)
    stream_args = dict(rng=rng, batch_size=batch_size)

    print '.. initializing iterators'
    train_dataset = Timit('train', features=features)
    train_stream = construct_stream(train_dataset, **stream_args)
    dev_dataset = Timit('dev', features=features)
    dev_stream = construct_stream(dev_dataset, **stream_args)
    test_dataset = Timit('test', features=features)
    test_stream = construct_stream(test_dataset, **stream_args)
    update_stream = construct_stream(train_dataset,
                                     n_batches=100,
                                     **stream_args)

    phone_dict = train_dataset.get_phoneme_dict()
    phoneme_dict = {
        k: phone_to_phoneme_dict[v] if v in phone_to_phoneme_dict else v
        for k, v in phone_dict.iteritems()
    }
    ind_to_phoneme = {v: k for k, v in phoneme_dict.iteritems()}
    eol_symbol = ind_to_phoneme['<STOP>']

    # ------------------------------------------------------------------------
    # Graph

    print '.. building model'
    x = T.tensor3('features')
    y = T.matrix('phonemes')
    input_mask = T.matrix('features_mask')
    output_mask = T.matrix('phonemes_mask')

    theano.config.compute_test_value = 'off'
    x.tag.test_value = np.random.randn(100, 24, 123).astype(floatX)
    y.tag.test_value = np.ones((30, 24), dtype=floatX)
    input_mask.tag.test_value = np.ones((100, 24), dtype=floatX)
    output_mask.tag.test_value = np.ones((30, 24), dtype=floatX)

    seq_len = 100
    input_dim = 123
    activation = Tanh()
    recurrent_init = IdentityInit(0.99)

    rec1 = TimLSTM(not batch_norm,
                   input_dim,
                   state_dim,
                   activation,
                   name='LSTM')
    rec1.initialize()
    l1 = Linear(state_dim,
                label_dim + 1,
                name='out_linear',
                weights_init=Orthogonal(),
                biases_init=Constant(0.0))
    l1.initialize()
    o1 = rec1.apply(x)
    y_hat_o = l1.apply(o1)

    shape = y_hat_o.shape
    y_hat = Softmax().apply(y_hat_o.reshape((-1, shape[-1]))).reshape(shape)
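    # softmax over the flattened (time * batch, labels) matrix, then reshape back to (time, batch, labels)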

    y_mask = output_mask
    y_hat_mask = input_mask

    # ------------------------------------------------------------------------
    # Costs and Algorithm

    ctc_cost = T.sum(
        ctc.cpu_ctc_th(y_hat_o, T.sum(y_hat_mask, axis=0), y + T.ones_like(y),
                       T.sum(y_mask, axis=0)))
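    # targets are shifted by +1 (y + ones_like(y)), presumably because the CTC implementation reserves
    # label index 0 for the blank symbol -- matching the label_dim + 1 units of the output layer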
    batch_cost = ctc_cost.copy(name='batch_cost')

    bs = y.shape[1]
    cost_train = aggregation.mean(batch_cost, bs).copy("sequence_cost")
    cost_per_character = aggregation.mean(
        batch_cost, output_mask.sum()).copy("character_cost")
    cg_train = ComputationGraph(cost_train)

    model = Model(cost_train)
    train_cost_per_character = aggregation.mean(
        cost_train, output_mask.sum()).copy("train_character_cost")

    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters,
                                on_unused_sources='warn')

    # ------------------------------------------------------------------------
    # Monitoring and extensions

    parameters = model.get_parameter_dict()
    observed_vars = [
        cost_train, train_cost_per_character,
        aggregation.mean(algorithm.total_gradient_norm)
    ]
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name + "_grad_norm"))
    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

    dev_monitor = DataStreamMonitoring(
        variables=[cost_train, cost_per_character],
        data_stream=dev_stream,
        prefix="dev")
    train_ctc_monitor = CTCMonitoring(x,
                                      input_mask,
                                      y_hat,
                                      eol_symbol,
                                      train_stream,
                                      prefix='train',
                                      every_n_epochs=1,
                                      before_training=True,
                                      phoneme_dict=phoneme_dict,
                                      black_list=black_list,
                                      train=True)
    dev_ctc_monitor = CTCMonitoring(x,
                                    input_mask,
                                    y_hat,
                                    eol_symbol,
                                    dev_stream,
                                    prefix='dev',
                                    every_n_epochs=1,
                                    phoneme_dict=phoneme_dict,
                                    black_list=black_list)

    extensions = []
    if 'load_path' in kwargs:
        extensions.append(Load(kwargs['load_path']))

    extensions.extend([
        FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor,
        train_ctc_monitor, dev_ctc_monitor
    ])

    if test_cost:
        test_monitor = DataStreamMonitoring(
            variables=[cost_train, cost_per_character],
            data_stream=test_stream,
            prefix="test")
        test_ctc_monitor = CTCMonitoring(x,
                                         input_mask,
                                         y_hat,
                                         eol_symbol,
                                         test_stream,
                                         prefix='test',
                                         every_n_epochs=1,
                                         phoneme_dict=phoneme_dict,
                                         black_list=black_list)
        extensions.append(test_monitor)
        extensions.append(test_ctc_monitor)

    #if not os.path.exists(experiment_path):
    #    os.makedirs(experiment_path)
    #best_path = os.path.join(experiment_path, 'best/')
    #if not os.path.exists(best_path):
    #    os.mkdir(best_path)
    #best_path = os.path.join(best_path, 'model.bin')
    extensions.append(EarlyStopping(to_watch, patience, '/dev/null'))
    extensions.extend([ProgressBar(), Printing()])

    # ------------------------------------------------------------------------
    # Main Loop

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    print "Building time: %f" % (time.time() - t0)
    # if write_predictions:
    #     with open('predicted.txt', 'w') as f_pred:
    #         with open('targets.txt', 'w') as f_targets:
    #             evaluator = CTCEvaluator(
    #                 eol_symbol, x, input_mask, y_hat, phoneme_dict, black_list)
    #             evaluator.evaluate(dev_stream, file_pred=f_pred,
    #                                file_targets=f_targets)
    #     return
    main_loop.run()
Example #32
0
# note: this example starts mid-file; 'cost', 'cg' and 'step_rules' are assumed to be defined in the omitted code above
algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules))

from blocks.extensions import Timing, FinishAfter, Printing, ProgressBar
from blocks.extensions.monitoring import TrainingDataMonitoring
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme
from blocks.main_loop import MainLoop
from blocks.extensions.saveload import Checkpoint

from blocks.model import Model

main_loop = MainLoop(algorithm=algorithm,
                     data_stream=DataStream.default_stream(
                         dataset=train_dataset,
                         iteration_scheme=SequentialScheme(
                             train_dataset.num_examples, batch_size=10)),
                     model=Model(y_est),
                     extensions=[
                         Timing(),
                         FinishAfter(after_n_epochs=200),
                         TrainingDataMonitoring(variables=[cost],
                                                prefix="train",
                                                after_epoch=True),
                         Printing(),
                         ProgressBar(),
                         Checkpoint(path="./checkpoint.zip")
                     ])

main_loop.run()
Example #33
0
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    topical_transformer = topicalq_transformer(
        config['source_topic_vocab_size'], config['topical_embedding_dim'],
        config['enc_nhids'], config['topical_word_num'], config['batch_size'])
    decoder = Decoder(vocab_size=config['trg_vocab_size'],
                      topicWord_size=config['trg_topic_vocab_size'],
                      embedding_dim=config['dec_embed'],
                      topical_dim=config['topical_embedding_dim'],
                      state_dim=config['dec_nhids'],
                      representation_dim=config['enc_nhids'] * 2,
                      match_function=config['match_function'],
                      use_doubly_stochastic=config['use_doubly_stochastic'],
                      lambda_ds=config['lambda_ds'],
                      use_local_attention=config['use_local_attention'],
                      window_size=config['window_size'],
                      use_step_decay_cost=config['use_step_decay_cost'],
                      use_concentration_cost=config['use_concentration_cost'],
                      lambda_ct=config['lambda_ct'],
                      use_stablilizer=config['use_stablilizer'],
                      lambda_st=config['lambda_st'])
    # here attended dim (representation_dim) of decoder is 2*enc_nhinds
    # because the context given by the encoder is a bidirectional context

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        target_topic_sentence = tensor.lmatrix('target_topic')
        target_topic_binary_sentence = tensor.lmatrix('target_binary_topic')
        #target_topic_sentence_mask=tensor.lmatrix('target_topic_mask');
        sampling_input = tensor.lmatrix('input')
        source_topical_word = tensor.lmatrix('source_topical')
        source_topical_mask = tensor.matrix('source_topical_mask')

        topic_embedding = topical_transformer.apply(source_topical_word)

        # Get training and development set streams
        tr_stream = get_tr_stream_with_topic_target(**config)
        #dev_stream = get_dev_tr_stream_with_topic_target(**config)

        # Get cost of the model
        representations = encoder.apply(source_sentence, source_sentence_mask)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representations[0, :,
                                            (representations.shape[2] / 2):]
        cost = decoder.cost(representations, source_sentence_mask,
                            tw_representation, source_topical_mask,
                            target_sentence, target_sentence_mask,
                            target_topic_sentence,
                            target_topic_binary_sentence, topic_embedding,
                            content_embedding)

        logger.info('Creating computational graph')
        perplexity = tensor.exp(cost)
        perplexity.name = 'perplexity'

        cg = ComputationGraph(cost)
        costs_computer = function([
            target_sentence, target_sentence_mask, source_sentence,
            source_sentence_mask, source_topical_word, target_topic_sentence,
            target_topic_binary_sentence
        ], (perplexity),
                                  on_unused_input='ignore')

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        topical_transformer.weights_init = IsotropicGaussian(
            config['weight_scale'])
        topical_transformer.biases_init = Constant(0)
        topical_transformer.push_allocation_config()
        #don't know whether the initialize is for
        topical_transformer.look_up.weights_init = Orthogonal()
        topical_transformer.transformer.weights_init = Orthogonal()
        topical_transformer.initialize()
        word_topical_embedding = cPickle.load(
            open(config['topical_embeddings'], 'rb'))
        np_word_topical_embedding = numpy.array(word_topical_embedding,
                                                dtype='float32')
        topical_transformer.look_up.W.set_value(np_word_topical_embedding)
        topical_transformer.look_up.W.tag.role = []
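        # clearing the roles is presumably intended to exclude these pretrained embeddings
        # from role-based variable filters elsewhere in the code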

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([perplexity], after_batch=True),
            CheckpointNMT(config['saveto'],
                          config['model_name'],
                          every_n_batches=config['save_freq'])
        ]

        # # Set up beam search and sampling computation graphs if necessary
        # if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        #     logger.info("Building sampling model")
        #     sampling_representation = encoder.apply(
        #         sampling_input, tensor.ones(sampling_input.shape))
        #     generated = decoder.generate(
        #         sampling_input, sampling_representation)
        #     search_model = Model(generated)
        #     _, samples = VariableFilter(
        #         bricks=[decoder.sequence_generator], name="outputs")(
        #             ComputationGraph(generated[1]))
        #
        # # Add sampling
        # if config['hook_samples'] >= 1:
        #     logger.info("Building sampler")
        #     extensions.append(
        #         Sampler(model=search_model, data_stream=tr_stream,
        #                 model_name=config['model_name'],
        #                 hook_samples=config['hook_samples'],
        #                 every_n_batches=config['sampling_freq'],
        #                 src_vocab_size=config['src_vocab_size']))
        #
        # # Add early stopping based on bleu
        # if False:
        #     logger.info("Building bleu validator")
        #     extensions.append(
        #         BleuValidator(sampling_input, samples=samples, config=config,
        #                       model=search_model, data_stream=dev_stream,
        #                       normalize=config['normalized_bleu'],
        #                       every_n_batches=config['bleu_val_freq'],
        #                       n_best=3,
        #                       track_n_models=6))
        #
        # logger.info("Building perplexity validator")
        # extensions.append(
        #         pplValidation( config=config,
        #                 model=costs_computer, data_stream=dev_stream,
        #                 model_name=config['model_name'],
        #                 every_n_batches=config['sampling_freq']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En',
                     channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        initial_learning_rate = config['initial_learning_rate']
        log_path = os.path.join(config['saveto'], 'log')
        if config['reload'] and os.path.exists(log_path):
            with open(log_path, 'rb') as source:
                log = cPickle.load(source)
                last = max(log.keys()) - 1
                if 'learning_rate' in log[last]:
                    initial_learning_rate = log[last]['learning_rate']

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        Scale(initial_learning_rate),
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]),
                                    on_unused_sources='ignore')

        _learning_rate = algorithm.step_rule.components[0].learning_rate
        if config['learning_rate_decay']:
            extensions.append(
                LearningRateHalver(record_name='validation_cost',
                                   comparator=lambda x, y: x > y,
                                   learning_rate=_learning_rate,
                                   patience_default=3))
        else:
            extensions.append(OldModelRemover(saveto=config['saveto']))

        if config['learning_rate_grow']:
            extensions.append(
                LearningRateDoubler(record_name='validation_cost',
                                    comparator=lambda x, y: x < y,
                                    learning_rate=_learning_rate,
                                    patience_default=3))

        extensions.append(
            SimplePrinting(config['model_name'], after_batch=True))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model,
                             algorithm=algorithm,
                             data_stream=tr_stream,
                             extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':

        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')
        source_topical_word = tensor.lmatrix('source_topical')
        tw_vocab_overlap = tensor.lmatrix('tw_vocab_overlap')
        tw_vocab_overlap_matrix = cPickle.load(
            open(config['tw_vocab_overlap'], 'rb'))
        tw_vocab_overlap_matrix = numpy.array(tw_vocab_overlap_matrix,
                                              dtype='int32')
        #tw_vocab_overlap=shared(tw_vocab_overlap_matrix);

        topic_embedding = topical_transformer.apply(source_topical_word)

        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1
        trg_vocab = _ensure_special_tokens(cPickle.load(
            open(config['trg_vocab'], 'rb')),
                                           bos_idx=0,
                                           eos_idx=trg_eos_idx,
                                           unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        topic_embedding = topical_transformer.apply(source_topical_word)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = sampling_representation[0, :, (
            sampling_representation.shape[2] / 2):]
        generated = decoder.generate(sampling_input,
                                     sampling_representation,
                                     tw_representation,
                                     topical_embedding=topic_embedding,
                                     content_embedding=content_embedding)

        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started translation: ")
        test_stream = get_dev_stream_with_topicalq(**config)
        ts = test_stream.get_epoch_iterator()
        rts = open(config['val_set_source']).readlines()
        ftrans_original = open(config['val_output_orig'], 'w')
        saved_weights = []
        total_cost = 0.0

        pbar = ProgressBar(max_value=len(rts)).start()
        for i, (line, line_raw) in enumerate(zip(ts, rts)):
            trans_in = line_raw.split()
            seq = sutils._oov_to_unk(line[0], config['src_vocab_size'],
                                     unk_idx)
            seq1 = line[1]
            input_topical = numpy.tile(seq1, (config['beam_size'], 1))
            input_ = numpy.tile(seq, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs, attendeds, weights = \
                beam_search.search(
                    input_values={sampling_input: input_,source_topical_word:input_topical,tw_vocab_overlap:tw_vocab_overlap_matrix},
                    tw_vocab_overlap=tw_vocab_overlap_matrix,
                    max_length=3*len(seq), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            best = numpy.argsort(costs)[0]
            try:
                total_cost += costs[best]
                trans_out = trans[best]
                weight = weights[best][:, :len(trans_in)]
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i + 1))
                trans_out = '<UNK>'
                # assumed fallback: store an empty attention matrix so saved_weights stays aligned with the output lines
                weight = numpy.zeros((0, len(trans_in)), dtype='float32')

            saved_weights.append(weight)
            print(' '.join(trans_out), file=ftrans_original)
            pbar.update(i + 1)

        pbar.finish()
        logger.info("Total cost of the test: {}".format(total_cost))
        cPickle.dump(saved_weights, open(config['attention_weights'], 'wb'))
        ftrans_original.close()
        # ap = afterprocesser(config)
        # ap.main()

    elif mode == 'score':
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        target_topic_sentence = tensor.lmatrix('target_topic')
        target_topic_binary_sentence = tensor.lmatrix('target_binary_topic')
        source_topical_word = tensor.lmatrix('source_topical')

        topic_embedding = topical_transformer.apply(source_topical_word)
        # Get cost of the model
        representations = encoder.apply(source_sentence, source_sentence_mask)
        costs = decoder.cost(representations, source_sentence_mask,
                             target_sentence, target_sentence_mask,
                             target_topic_sentence,
                             target_topic_binary_sentence, topic_embedding)

        config['batch_size'] = 1
        config['sort_k_batches'] = 1
        # Get test set stream
        test_stream = get_tr_stream_with_topic_target(**config)

        logger.info("Building sampling model")

        logger.info("Loading the model..")
        model = Model(costs)
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        costs_computer = function([
            target_sentence, target_sentence_mask, source_sentence,
            source_sentence_mask, source_topical_word, target_topic_sentence,
            target_topic_binary_sentence
        ], costs, on_unused_input='ignore')

        iterator = test_stream.get_epoch_iterator()

        scores = []
        att_weights = []
        for i, (src, src_mask, trg, trg_mask, te, te_mask, tt, tt_mask, tb,
                tb_mask) in enumerate(iterator):
            costs = costs_computer(*[trg, trg_mask, src, src_mask, te, tt, tb])
            cost = costs.sum()
            print(i, cost)
            scores.append(cost)

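        # 10007 is presumably the hard-coded number of sentences in this test
        # set, so this prints the average per-sentence cost.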
        print(sum(scores) / 10007)
Example #34
0
    cg = ComputationGraph(cost)
    monitored = set([cost] + VariableFilter(roles=[roles.COST])(cg.variables))

    valid_monitored = monitored
    if hasattr(model, 'valid_cost'):
        valid_cost = model.valid_cost(**inputs)
        valid_cg = ComputationGraph(valid_cost)
        valid_monitored = set([valid_cost] + VariableFilter(
            roles=[roles.COST])(valid_cg.variables))

    if hasattr(config, 'dropout') and config.dropout < 1.0:
        cg = apply_dropout(cg, config.dropout_inputs(cg), config.dropout)
    if hasattr(config, 'noise') and config.noise > 0.0:
        cg = apply_noise(cg, config.noise_inputs(cg), config.noise)
    cost = cg.outputs[0]
    cg = Model(cost)

    logger.info('# Parameter shapes:')
    parameters_size = 0
    for value in cg.parameters:
        logger.info('    %20s %s' % (value.get_value().shape, value.name))
        parameters_size += reduce(operator.mul, value.get_value().shape, 1)
    logger.info('Total number of parameters: %d in %d matrices' %
                (parameters_size, len(cg.parameters)))

    if hasattr(config, 'step_rule'):
        step_rule = config.step_rule
    else:
        step_rule = AdaDelta()

    logger.info("Fuel seed: %d" % fuel.config.default_seed)
Example #35
0
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code,
                               level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream,
                              reverse_words,
                              add_sources=("targets", ))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(chars, chars_mask, targets,
                                   targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat([(key, value.get_value().shape)
                                    for key, value in parameters.items()],
                                   width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule(
                                        [StepClipping(10.0),
                                         Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies, ) = VariableFilter(applications=[generator.readout.readout],
                                      name_regex="output")(cg.variables)
        (activations, ) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(
            abs(activations).mean(), "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation, batch_size,
            max_length, cost_per_character, algorithm.total_step_norm,
            algorithm.total_gradient_norm
        ]
        for name, parameter in parameters.items():
            observables.append(named_copy(parameter.norm(2), name + "_norm"))
            observables.append(
                named_copy(algorithm.gradients[parameter].norm(2),
                           name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(observables,
                                                    prefix="average",
                                                    every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path,
                           every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)
            ])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(bricks=[reverser.generator],
                                          name="outputs")(ComputationGraph(
                                              generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
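                # A minimal sketch of the cached alternative (hypothetical name
                # `cached_beam_search`, built once before the input loop):
                #     samples, = VariableFilter(bricks=[reverser.generator],
                #                               name="outputs")(
                #                                   ComputationGraph(generated[1]))
                #     cached_beam_search = BeamSearch(samples)
                # and then reuse `cached_beam_search.search(...)` inside `generate`.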
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search({chars: input_},
                                                    char2code['</S>'],
                                                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n"
                       if mode == "sample" else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [
                char2code.get(char, char2code["<UNK>"])
                for char in line.lower().strip()
            ]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size,
                             axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
Example #36
0
                                on_unused_sources='warn')

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()


training_cost = create_model(train_encoder, train_decoder)

# Set up training model
logger.info("Building model")
train_model = Model(training_cost)

dev_stream = get_dev_stream(**exp_config)

main(train_model,
     training_cost,
     exp_config,
     masked_stream,
     dev_stream=dev_stream,
     use_bokeh=True)
def train(step_rule, state_dim, epochs, seed, experiment_path, initialization,
          to_watch, patience, static_mask, batch_size, rnn_type, num_layers,
          augment, seq_len, drop_prob, drop_prob_states, drop_prob_cells,
          drop_prob_igates, ogates_zoneout, stoch_depth, share_mask,
          gaussian_drop, weight_noise, norm_cost_coeff, penalty, input_drop,
          **kwargs):

    print '.. cPTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    def numpy_rng(random_seed=None):
        if random_seed is None:
            random_seed = 1223
        return numpy.random.RandomState(random_seed)

    ###########################################
    #
    # MAKE DATA STREAMS
    #
    ###########################################
    rng = np.random.RandomState(seed)

    if share_mask:
        drop_prob_cells = drop_prob
        # we don't actually want to use these masks, so this is set to None for debugging
        drop_prob_states = None

    print '.. initializing iterators'

    if static_mask:
        train_stream = get_static_mask_ptb_stream('train',
                                                  batch_size,
                                                  seq_len,
                                                  drop_prob_states,
                                                  drop_prob_cells,
                                                  drop_prob_igates,
                                                  state_dim,
                                                  False,
                                                  augment=augment)
        train_stream_evaluation = get_static_mask_ptb_stream('train',
                                                             batch_size,
                                                             seq_len,
                                                             drop_prob_states,
                                                             drop_prob_cells,
                                                             drop_prob_igates,
                                                             state_dim,
                                                             True,
                                                             augment=augment)
        dev_stream = get_static_mask_ptb_stream('valid',
                                                batch_size,
                                                seq_len,
                                                drop_prob_states,
                                                drop_prob_cells,
                                                drop_prob_igates,
                                                state_dim,
                                                True,
                                                augment=augment)
    else:
        train_stream = get_ptb_stream('train',
                                      batch_size,
                                      seq_len,
                                      drop_prob_states,
                                      drop_prob_cells,
                                      drop_prob_igates,
                                      state_dim,
                                      False,
                                      augment=augment)
        train_stream_evaluation = get_ptb_stream('train',
                                                 batch_size,
                                                 seq_len,
                                                 drop_prob_states,
                                                 drop_prob_cells,
                                                 drop_prob_igates,
                                                 state_dim,
                                                 True,
                                                 augment=augment)
        dev_stream = get_ptb_stream('valid',
                                    batch_size,
                                    seq_len,
                                    drop_prob_states,
                                    drop_prob_cells,
                                    drop_prob_igates,
                                    state_dim,
                                    True,
                                    augment=augment)

    data = train_stream.get_epoch_iterator(as_dict=True).next()
    #import ipdb; ipdb.set_trace()

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################

    print '.. building model'

    x = T.tensor3('features', dtype=floatX)
    x, y = x[:-1], x[1:]
    drops_states = T.tensor3('drops_states')
    drops_cells = T.tensor3('drops_cells')
    drops_igates = T.tensor3('drops_igates')

    x.tag.test_value = data['features']
    #y.tag.test_value = data['outputs']
    drops_states.tag.test_value = data['drops_states']
    drops_cells.tag.test_value = data['drops_cells']
    drops_igates.tag.test_value = data['drops_igates']

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=.2)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    if rnn_type.lower() == 'lstm':
        in_to_hid = Linear(50,
                           state_dim * 4,
                           name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = ZoneoutLSTM(dim=state_dim,
                                      weights_init=weights_init,
                                      activation=Tanh(),
                                      model_type=6,
                                      name='rnn',
                                      ogates_zoneout=ogates_zoneout)
    elif rnn_type.lower() == 'gru':
        in_to_hid = Linear(50,
                           state_dim * 3,
                           name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = ZoneoutGRU(dim=state_dim,
                                     weights_init=weights_init,
                                     activation=Tanh(),
                                     name='rnn')
    elif rnn_type.lower() == 'srnn':
        in_to_hid = Linear(50,
                           state_dim,
                           name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = ZoneoutSimpleRecurrent(dim=state_dim,
                                                 weights_init=weights_init,
                                                 activation=Rectifier(),
                                                 name='rnn')
    else:
        raise NotImplementedError

    hid_to_out = Linear(state_dim,
                        50,
                        name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    in_to_hid.initialize()
    recurrent_layer.initialize()
    hid_to_out.initialize()

    h = in_to_hid.apply(x)

    if rnn_type.lower() == 'lstm':
        yh = recurrent_layer.apply(h, drops_states, drops_cells,
                                   drops_igates)[0]
    else:
        yh = recurrent_layer.apply(h, drops_states, drops_cells, drops_igates)

    y_hat_pre_softmax = hid_to_out.apply(yh)
    shape_ = y_hat_pre_softmax.shape

    # y_hat = Softmax().apply(
    #     y_hat_pre_softmax.reshape((-1, shape_[-1])))# .reshape(shape_)

    ###########################################
    #
    # SET UP COSTS, MONITORS, and REGULARIZATION
    #
    ###########################################

    # cost = CategoricalCrossEntropy().apply(y.flatten().astype('int64'), y_hat)

    def crossentropy_lastaxes(yhat, y):
        # for sequence of distributions/targets
        return -(y * T.log(yhat)).sum(axis=yhat.ndim - 1)

    def softmax_lastaxis(x):
        # for sequence of distributions
        return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape)

    yhat = softmax_lastaxis(y_hat_pre_softmax)
    cross_entropies = crossentropy_lastaxes(yhat, y)
    cross_entropy = cross_entropies.mean().copy(name="cross_entropy")
    cost = cross_entropy.copy(name="cost")

    batch_cost = cost.copy(name='batch_cost')
    nll_cost = cost.copy(name='nll_cost')
    bpc = (nll_cost / np.log(2.0)).copy(name='bpr')
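    # bits-per-character: divide the nat-based negative log-likelihood by ln(2).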

    #nll_cost = aggregation.mean(batch_cost, batch_size).copy(name='nll_cost')

    cost_monitor = aggregation.mean(
        batch_cost, batch_size).copy(name='sequence_cost_monitor')
    cost_per_character = aggregation.mean(
        batch_cost, (seq_len - 1) * batch_size).copy(name='character_cost')
    cost_train = cost.copy(name='train_batch_cost')
    cost_train_monitor = cost_monitor.copy('train_batch_cost_monitor')
    cg_train = ComputationGraph([cost_train, cost_train_monitor])

    ##################
    # NORM STABILIZER
    ##################
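    # The norm stabilizer below penalizes the squared difference between the L2
    # norms of consecutive hidden states (or memory cells), discouraging the
    # hidden-state magnitude from drifting over time.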

    norm_cost = 0.

    def _magnitude(x, axis=-1):
        return T.sqrt(
            T.maximum(T.sqr(x).sum(axis=axis),
                      numpy.finfo(x.dtype).tiny))

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(
                T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
            ## debugging nans stuff
            #gr = T.grad(norm_cost, cg_train.parameters, disconnected_inputs='ignore')
            #grf = theano.function([x, input_mask], gr)
            #grz = grf(x.tag.test_value, input_mask.tag.test_value)
            #params = cg_train.parameters
            #mynanz = [(pp, np.sum(gg)) for pp,gg in zip(params, grz) if np.isnan(np.sum(gg))]
            #for mm in mynanz: print mm
            ##import ipdb; ipdb.set_trace()
    elif penalty == 'hids':
        assert 'rnn_apply_states' in [
            o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)
        ]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            if output.name == 'rnn_apply_states':
                norms = _magnitude(output)
                norm_cost += T.mean(
                    T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))

    norm_cost.name = 'norm_cost'

    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy(
        'cost_train')  #should this be cost_train.outputs[0]?

    cg_train = ComputationGraph([cost_train,
                                 cost_train_monitor])  #, norm_cost])

    ##################
    # WEIGHT NOISE
    ##################
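    # apply_noise perturbs every WEIGHT-role variable in the training graph with
    # zero-mean Gaussian noise scaled by `weight_noise` (a form of regularization).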

    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')
        cost_train_monitor = cg_train.outputs[1].copy(
            'train_batch_cost_monitor')

    # if 'l2regularization' in kwargs:
    #     weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
    #     cost_train += kwargs['l2regularization'] * sum([
    #         (weight ** 2).sum() for weight in weights])
    #     cost_train.name = 'cost_train'
    #     cg_train = ComputationGraph(cost_train)

    model = Model(cost_train)
    train_cost_per_character = aggregation.mean(
        cost_train_monitor,
        (seq_len - 1) * batch_size).copy(name='train_character_cost')

    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)

    observed_vars = [
        cost_train, cost_train_monitor, train_cost_per_character,
        aggregation.mean(algorithm.total_gradient_norm)
    ]
    # parameters = model.get_parameter_dict()
    # for name, param in parameters.iteritems():
    #     observed_vars.append(param.norm(2).copy(name=name + "_norm"))
    #     observed_vars.append(
    #         algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm"))
    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

    dev_monitor = DataStreamMonitoring(variables=[nll_cost, bpc],
                                       data_stream=dev_stream,
                                       prefix="dev")

    extensions = []
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(
        SaveParams('dev_nll_cost', model, experiment_path, every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    ###########################################
    #
    # MAIN LOOOOOOOOOOOP
    #
    ###########################################

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)
    # if write_predictions:
    #     with open('predicted.txt', 'w') as f_pred:
    #         with open('targets.txt', 'w') as f_targets:
    #             evaluator = CTCEvaluator(
    #                 eol_symbol, x, input_mask, y_hat, phoneme_dict, black_list)
    #             evaluator.evaluate(dev_stream, file_pred=f_pred,
    #                                file_targets=f_targets)
    #     return
    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
Example #38
0
def main(model, cost, config, tr_stream, dev_stream, use_bokeh=False):

    # Set the parameters from a trained models (.npz file)
    logger.info("Loading parameters from model: {}".format(
        exp_config['saved_parameters']))
    # Note: brick_delimiter='-' is here for legacy reasons, because Blocks changed the serialization API
    param_values = LoadNMT.load_parameter_values(
        exp_config['saved_parameters'],
        brick_delimiter=exp_config.get('brick_delimiter', None))
    LoadNMT.set_model_parameters(model, param_values)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    if config.get('l2_regularization', False) is True:
        l2_reg_alpha = config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)

        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

        # why do we need to rename the cost variable? Where did the original name come from?
        cost.name = 'decoder_cost_cost'
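        # (Presumably: adding the L2 term creates a new, unnamed Theano variable,
        # while the monitoring/plot channels below expect the decoder's original
        # auto-generated application-output name, hence the rename.)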

    cg = ComputationGraph(cost)

    # apply dropout for regularization
    # Note dropout variables are hard-coded here
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        # TODO: mv the actual config file once we switch to .yaml for min-risk
        # shutil.copy(config['config_file'], config['saveto'])
        # shutil.copy(config['config_file'], config['saveto'])

        with codecs.open(os.path.join(config['saveto'], 'config.yaml'),
                         'w',
                         encoding='utf8') as yaml_out:
            yaml_out.write(yaml.dump(config))

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    # TODO: change the if here
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = train_encoder.apply(
            theano_sampling_input, tensor.ones(theano_sampling_input.shape))
        # TODO: the generated output actually contains several more values, ipdb to see what they are
        generated = train_decoder.generate(theano_sampling_input,
                                           sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[train_decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling -- TODO: sampling is broken for min-risk
    #if config['hook_samples'] >= 1:
    #    logger.info("Building sampler")
    #    extensions.append(
    #        Sampler(model=search_model, data_stream=tr_stream,
    #                hook_samples=config['hook_samples'],
    #                every_n_batches=config['sampling_freq'],
    #                src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    if config.get('bleu_script', None) is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(theano_sampling_input,
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Add early stopping based on bleu
    if config.get('meteor_directory', None) is not None:
        logger.info("Building meteor validator")
        extensions.append(
            MeteorValidator(theano_sampling_input,
                            samples=samples,
                            config=config,
                            model=search_model,
                            data_stream=dev_stream,
                            normalize=config['normalized_bleu'],
                            every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[['decoder_cost_cost'],
                           ['validation_set_meteor_score']],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")

    # TODO: reenable dropout and add L2 regularization as in the main config

    # if there is l2_regularization, dropout or random noise, we need to use the output of the modified graph
    if config['dropout'] < 1.0:
        algorithm = GradientDescent(cost=cg.outputs[0],
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]),
                                    on_unused_sources='warn')
    else:
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]),
                                    on_unused_sources='warn')

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()
                                ], ),
                                on_unused_sources='warn')

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Example #39
0
def mainPredict(config, data_to_predict_stream, use_ensemble, lang=None, et_version=False, use_bokeh=False, the_track=None):
    # Create Theano variables
    assert the_track is not None
    
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, cost_type=config['error_fct'])
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))
    
    
    # Set extensions
    logger.info("Initializing (empty) extensions")
    extensions = [
    ]

    logger.info("Building sampling model")
    sampling_representation = encoder.apply(
        sampling_input, tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)

    _, samples = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[1]))  # generated[1] is next_outputs
    
    # Reload the model (as this is prediction, it is 100% necessary):
    if config['reload']:
        extensions.append(LoadOnlyBestModel(config['saveto'])) # without early stopping use LoadOnlyModel here!
        #extensions.append(LoadOnlyModel(config['saveto'])) # without early stopping use LoadOnlyModel here!
    else:
        raise Exception('No model available for prediction! (Check config[\'reload\'] variable)')

    
    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()])
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=search_model,
        algorithm=algorithm,
        #algorithm=None,
        data_stream=data_to_predict_stream,
        extensions=extensions
    )

    predictByHand(main_loop, decoder, data_to_predict_stream, use_ensemble, lang, et_version, config, the_track=the_track)
Example #40
0
def main(save_to):
    batch_size = 365
    feature_maps = [6, 16]
    mlp_hiddens = [120, 84]
    conv_sizes = [5, 5]
    pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # The above are from LeCun's paper. The blocks example had:
    #    feature_maps = [20, 50]
    #    mlp_hiddens = [500]

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 1, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='valid',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    x = tensor.tensor4('features')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cg = ComputationGraph([probs])
    outs = VariableFilter(
            roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables)

    # Create an interior activation model
    model = Model([probs] + outs)

    # Load it with trained parameters
    params = load_parameters(open(save_to, 'rb'))
    model.set_parameter_values(params)

    algorithm = MaximumActivationSearch(outputs=outs)

    # Use the mnist test set, unshuffled
    mnist_test = MNIST(("test",), sources=['features'])
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=SequentialScheme(
            mnist_test.num_examples, batch_size))

    extensions = [Timing(),
                  FinishAfter(after_n_epochs=1),
                  DataStreamMonitoring(
                      [],
                      mnist_test_stream,
                      prefix="test"),
                  Checkpoint("maxact.tar"),
                  ProgressBar(),
                  Printing()]

    main_loop = MainLoop(
        algorithm,
        mnist_test_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    examples = mnist_test.get_example_stream()
    example = examples.get_data(0)[0]
    layers = convnet.layers
    for output, record in algorithm.maximum_activations.items():
        layer = get_brick(output)
        activations, indices, snapshots = (
                r.get_value() if r else None for r in record[1:])
        filmstrip = Filmstrip(
            example.shape[-2:], (indices.shape[1], indices.shape[0]),
            background='blue')
        if layer in layers:
            fieldmap = layerarray_fieldmap(layers[0:layers.index(layer) + 1])
            for unit in range(indices.shape[1]):
                for index in range(100):
                    mask = make_mask(example.shape[-2:], fieldmap, numpy.clip(
                        snapshots[index, unit, :, :], 0, numpy.inf))
                    imagenum = indices[index, unit, 0]
                    filmstrip.set_image((unit, index),
                            examples.get_data(imagenum)[0], mask)
        else:
            for unit in range(indices.shape[1]):
                for index in range(100):
                    imagenum = indices[index, unit]
                    filmstrip.set_image((unit, index),
                            examples.get_data(imagenum)[0])
        filmstrip.save(layer.name + '_maxact.jpg')
Example #41
0
    with open('dataset/vocab.pkl') as f:
        vocabs = pkl.load(f)
        word_vocab, rel_vocab = vocabs['word'], vocabs['rel']

    with open('dataset/trainXY.json') as f:
        train = json.load(f)
        train = wrap_stream(train)

    with open('dataset/testXY.json') as f:
        test = json.load(f)
        test = wrap_stream(test)

    model = LSTMModel(len(vocabs['word']), n_mem, len(vocabs['rel']))
    cg = ComputationGraph(model.cost)

    bricks_model = Model(model.cost)
    for brick in bricks_model.get_top_bricks():
        brick.initialize()
    model.lookup.W.set_value(vocabs['word'].get_embeddings().astype(theano.config.floatX))

    if dropout:
        pass
        # logger.info('Applying dropout of {}'.format(dropout))
        # lstm_dropout = [v for v in cg.intermediary_variables if v.name in {'W_cell_to_in', 'W_cell_to_out'}]
        # cg = apply_dropout(cg, lstm_dropout, drop_prob=dropout)

    # summary of what's going on
    parameters = bricks_model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, value.get_value().shape, value.get_value().mean()) for key, value
        sys.exit(0)

Example #42
0
    graphs, extensions, updates = construct_graphs(args, nclasses, sequence_length)

    ### optimization algorithm definition
    step_rule = CompositeRule([
        StepClipping(1.),
        #Momentum(learning_rate=args.learning_rate, momentum=0.9),
        RMSProp(learning_rate=args.learning_rate, decay_rate=0.5),
    ])

    algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                                parameters=graphs["training"].parameters,
                                step_rule=step_rule)
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]


    # step monitor (after epoch to limit the log size)
    step_channels = []
    step_channels.extend([
        algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
        for name, param in model.get_parameter_dict().items()])
    step_channels.append(algorithm.total_step_norm.copy(name="total_step_norm"))
    step_channels.append(algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
    step_channels.extend(graphs["training"].outputs)
    logger.warning("constructing training data monitor")
    extensions.append(TrainingDataMonitoring(
        step_channels, prefix="iteration", after_batch=False))
Example #43
0
def main(mode, save_to, num_epochs, load_params=None,
         feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, stride=None, repeat_times=None,
         batch_size=None, num_batches=None, algo=None,
         test_set=None, valid_examples=None,
         dropout=None, max_norm=None, weight_decay=None,
         batch_norm=None):
    if feature_maps is None:
        feature_maps = [20, 50, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2, 2]
    if repeat_times is None:
        repeat_times = [1, 1, 1]
    if batch_size is None:
        batch_size = 500
    if valid_examples is None:
        valid_examples = 2500
    if stride is None:
        stride = 1
    if test_set is None:
        test_set = 'test'
    if algo is None:
        algo = 'rmsprop'
    if batch_norm is None:
        batch_norm = False

    image_size = (128, 128)
    output_size = 2

    if (len(feature_maps) != len(conv_sizes) or
        len(feature_maps) != len(pool_sizes) or
        len(feature_maps) != len(repeat_times)):
        raise ValueError("OMG, inconsistent arguments")

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 3, image_size,
                    stride=stride,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    repeat_times=repeat_times,
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    batch_norm=batch_norm,
                    weights_init=Glorot(),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))


    single_x = tensor.tensor3('image_features')
    x = tensor.tensor4('image_features')
    single_y = tensor.lvector('targets')
    y = tensor.lmatrix('targets')

    # Training
    with batch_normalization(convnet):
        probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))

    cg = ComputationGraph([cost, error_rate])
    extra_updates = []

    if batch_norm: # batch norm:
        logger.debug("Apply batch norm")
        pop_updates = get_batch_normalization_updates(cg)
        # p stands for population mean
        # m stands for minibatch
        alpha = 0.005
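        # Each population statistic p is nudged towards the current minibatch
        # statistic m by a factor alpha, i.e. an exponential moving average.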
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in pop_updates]
        population_statistics = [p for p, m in extra_updates]
    if dropout:
        relu_outputs = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg)
        cg = apply_dropout(cg, relu_outputs, dropout)
    cost, error_rate = cg.outputs
    if weight_decay:
        logger.debug("Apply weight decay {}".format(weight_decay))
        cost += weight_decay * l2_norm(cg.parameters)
        cost.name = 'cost'

    # Validation
    valid_probs = convnet.apply_5windows(single_x)
    valid_cost = (CategoricalCrossEntropy().apply(single_y, valid_probs)
            .copy(name='cost'))
    valid_error_rate = (MisclassificationRate().apply(
        single_y, valid_probs).copy(name='error_rate'))

    model = Model([cost, error_rate])
    if load_params:
        logger.info("Loaded params from {}".format(load_params))
        with open(load_params, 'r') as src:
            model.set_parameter_values(load_parameters(src))

    # Training stream with random cropping
    train = DogsVsCats(("train",), subset=slice(None, 25000 - valid_examples, None))
    train_str =  DataStream(
        train, iteration_scheme=ShuffledScheme(train.num_examples, batch_size))
    train_str = add_transformers(train_str, random_crop=True)

    # Validation stream without cropping
    valid = DogsVsCats(("train",), subset=slice(25000 - valid_examples, None, None))
    valid_str = DataStream(
        valid, iteration_scheme=SequentialExampleScheme(valid.num_examples))
    valid_str = add_transformers(valid_str)

    if mode == 'train':
        directory, _ = os.path.split(sys.argv[0])
        env = dict(os.environ)
        env['THEANO_FLAGS'] = 'floatX=float32'
        port = numpy.random.randint(1025, 10000)
        server = subprocess.Popen(
            [directory + '/server.py',
             str(25000 - valid_examples), str(batch_size), str(port)],
            env=env, stderr=subprocess.STDOUT)
        train_str = ServerDataStream(
            ('image_features', 'targets'), produces_examples=False,
            port=port)

        save_to_base, save_to_extension = os.path.splitext(save_to)

        # Train with simple SGD
        if algo == 'rmsprop':
            step_rule = RMSProp(decay_rate=0.999, learning_rate=0.0003)
        elif algo == 'adam':
            step_rule = Adam()
        else:
            assert False
        if max_norm:
            conv_params = VariableFilter(bricks=[Convolutional], roles=[WEIGHT])(cg)
            linear_params = VariableFilter(bricks=[Linear], roles=[WEIGHT])(cg)
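            # Restrict limits the clipping to the listed parameters: Linear weights
            # are clipped per output column (axis=0) and convolutional kernels per
            # filter (axes 1, 2, 3), each to an L2 norm of at most max_norm.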
            step_rule = CompositeRule(
                [step_rule,
                 Restrict(VariableClipping(max_norm, axis=0), linear_params),
                 Restrict(VariableClipping(max_norm, axis=(1, 2, 3)), conv_params)])

        algorithm = GradientDescent(
            cost=cost, parameters=model.parameters,
            step_rule=step_rule)
        algorithm.add_updates(extra_updates)
        # `Timing` extension reports time for reading data, aggregating a batch
        # and monitoring;
        # `ProgressBar` displays a nice progress bar during training.
        extensions = [Timing(every_n_batches=100),
                    FinishAfter(after_n_epochs=num_epochs,
                                after_n_batches=num_batches),
                    DataStreamMonitoring(
                        [valid_cost, valid_error_rate],
                        valid_str,
                        prefix="valid"),
                    TrainingDataMonitoring(
                        [cost, error_rate,
                        aggregation.mean(algorithm.total_gradient_norm)],
                        prefix="train",
                        after_epoch=True),
                    TrackTheBest("valid_error_rate"),
                    Checkpoint(save_to, save_separately=['log'],
                               parameters=cg.parameters +
                               (population_statistics if batch_norm else []),
                               before_training=True, after_epoch=True)
                        .add_condition(
                            ['after_epoch'],
                            OnLogRecord("valid_error_rate_best_so_far"),
                            (save_to_base + '_best' + save_to_extension,)),
                    Printing(every_n_batches=100)]

        model = Model(cost)

        main_loop = MainLoop(
            algorithm,
            train_str,
            model=model,
            extensions=extensions)
        try:
            main_loop.run()
        finally:
            server.terminate()
    elif mode == 'test':
        classify = theano.function([single_x], valid_probs.argmax())
        test = DogsVsCats((test_set,))
        test_str = DataStream(
            test, iteration_scheme=SequentialExampleScheme(test.num_examples))
        test_str = add_transformers(test_str)
        correct = 0
        with open(save_to, 'w') as dst:
            print("id", "label", sep=',', file=dst)
            for index, example in enumerate(test_str.get_epoch_iterator()):
                image = example[0]
                prediction = classify(image)
                print(index + 1, prediction, sep=',', file=dst)
                if len(example) > 1 and prediction == example[1]:
                    correct += 1
        print(correct / float(test.num_examples))
    else:
        assert False
Example #44
0
def evaluate(model, load_path, configs):
    print "FIX THIS : NOT BEST"
    with open(load_path + 'trained_params.npz') as f:
        loaded = np.load(f)
        blocks_model = Model(model.cost)
        params_dicts = blocks_model.get_parameter_dict()
        params_names = params_dicts.keys()
        for param_name in params_names:
            param = params_dicts[param_name]
            # '/f_6_.W' --> 'f_6_.W'
            slash_index = param_name.find('/')
            param_name = param_name[slash_index + 1:]
            # if param_name in ['initial_location', 'initial_scale', 'initial_alpha']:
            #     param_name = 'lstmattention.' + param_name
            if param.get_value().shape == loaded[param_name].shape:
                param.set_value(loaded[param_name])
            else:
                print param_name

        inps = ComputationGraph(model.error_rate).inputs
        eval_function = theano.function(
            inps, [model.error_rate, model.probabilities])
        # tds, vds = configs['get_streams'](100)

        # it = tds.get_epoch_iterator()
        # data = it.next()
        # print eval_function(data[0], data[1])
        return eval_function

        train_probs = []
        valid_probs = []
        train_unites = []
        valid_unites = []
        train_labels = []
        valid_labels = []

        it = tds.get_epoch_iterator()
        for batch in range(6):
            print batch
            data = it.next()
            train_probs.append(eval_function(data[0], data[1])[1])
            train_unites.append(data[2])
            train_labels.append(data[1])

        it = vds.get_epoch_iterator()
        for batch in range(2):
            print batch
            data = it.next()
            valid_probs.append(eval_function(data[0], data[1])[1])
            valid_unites.append(data[2])
            valid_labels.append(data[1])

        train_probs = np.vstack(train_probs)
        valid_probs = np.vstack(valid_probs)
        train_labels = np.hstack(train_labels)
        valid_labels = np.hstack(valid_labels)
        train_unites = np.hstack(train_unites)
        valid_unites = np.hstack(valid_unites)

        # For training
        map_vid_to_onehot = {}
        for j in list(set(train_unites)):
            map_vid_to_onehot[j] = []

        for i in train_unites:
            for j in list(set(train_unites)):
                if i == j:
                    map_vid_to_onehot[j].append(1)
                else:
                    map_vid_to_onehot[j].append(0)

        map_vid_to_class = {}
        for j in list(set(train_unites)):
            onehot = np.array(map_vid_to_onehot[j])[:, np.newaxis]
            masked = onehot * train_probs
            map_vid_to_class[j] = np.argmax(np.sum(masked, axis=0))

        predicted_labels = []
        for i in train_unites:
            predicted_labels.append(map_vid_to_class[i])

        incorrect = 0
        for label, predicted_label in zip(train_labels, predicted_labels):
            if label != predicted_label:
                incorrect = incorrect + 1

        print float(incorrect) / train_unites.shape[0]

        # For validation
        map_vid_to_onehot = {}
        for j in list(set(valid_unites)):
            map_vid_to_onehot[j] = []

        for i in valid_unites:
            for j in list(set(valid_unites)):
                if i == j:
                    map_vid_to_onehot[j].append(1)
                else:
                    map_vid_to_onehot[j].append(0)

        map_vid_to_class = {}
        for j in list(set(valid_unites)):
            onehot = np.array(map_vid_to_onehot[j])[:, np.newaxis]
            masked = onehot * valid_probs
            map_vid_to_class[j] = np.argmax(np.sum(masked, axis=0))

        predicted_labels = []
        for i in valid_unites:
            predicted_labels.append(map_vid_to_class[i])

        incorrect = 0
        for label, predicted_label in zip(valid_labels, predicted_labels):
            if label != predicted_label:
                incorrect = incorrect + 1

        print float(incorrect) / valid_unites.shape[0]

        return eval_function
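# The video-level voting above can be sketched more compactly with plain numpy
# (a minimal standalone illustration of the same idea: sum the frame
# probabilities that share a video id, then take the class with the largest
# summed probability as that video's prediction):
import numpy as np

def aggregate_by_video(probs, video_ids):
    """probs: (n_frames, n_classes) array; video_ids: (n_frames,) int array."""
    predictions = {}
    for vid in set(video_ids.tolist()):
        mask = (video_ids == vid)[:, np.newaxis]
        predictions[vid] = int(np.argmax((mask * probs).sum(axis=0)))
    return predictions

# e.g. aggregate_by_video(np.eye(3), np.array([7, 7, 9])) == {7: 0, 9: 2}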
Example #45
0
def run_visualizations(cost, updates,
                       train_stream, valid_stream,
                       args,
                       hidden_states=None, gate_values=None):

    # Load the parameters from a dumped model
    assert args.load_path is not None
    model = Model(cost)
    model.set_parameter_values(load_parameter_values(args.load_path))

    # Run a visualization
    if args.visualize == "generate":
        visualize_generate(cost,
                           hidden_states, updates,
                           train_stream, valid_stream,
                           args)

    elif args.visualize == "gates" and (gate_values is not None):
        if args.rnn_type == "lstm":
            visualize_gates_lstm(gate_values, hidden_states, updates,
                                 train_stream, valid_stream,
                                 args)
        elif args.rnn_type == "soft":
            visualize_gates_soft(gate_values, hidden_states, updates,
                                 train_stream, valid_stream,
                                 args)
        else:
            assert False

    elif args.visualize == "states":
        visualize_states(hidden_states, updates,
                         train_stream, valid_stream,
                         args)

    elif args.visualize == "gradients":
        visualize_gradients(hidden_states, updates,
                            train_stream, valid_stream,
                            args)

    elif args.visualize == "jacobian":
        visualize_jacobian(hidden_states, updates,
                           train_stream, valid_stream,
                           args)

    elif args.visualize == "presoft":
        visualize_presoft(cost,
                          hidden_states, updates,
                          train_stream, valid_stream,
                          args)

    elif args.visualize == "matrices":
        visualize_matrices(args)

    elif args.visualize == "trained_singular_values":
        visualize_singular_values(args)

    elif args.visualize == "gradients_flow_pie":
        visualize_gradients_flow_pie(hidden_states, updates,
                                     args)

    else:
        assert False
Example #46
0
    def __init__(self, save_to):
        batch_size = 500
        image_size = (28, 28)
        output_size = 10
        convnet = create_lenet_5()
        layers = convnet.layers

        logging.info("Input dim: {} {} {}".format(
            *convnet.children[0].get_dim('input_')))
        for i, layer in enumerate(convnet.layers):
            if isinstance(layer, Activation):
                logging.info("Layer {} ({})".format(
                    i, layer.__class__.__name__))
            else:
                logging.info("Layer {} ({}) dim: {} {} {}".format(
                    i, layer.__class__.__name__, *layer.get_dim('output')))

        mnist_test = MNIST(("test",), sources=['features', 'targets'])
        basis = create_fair_basis(mnist_test, 10, 10)

        x = tensor.tensor4('features')
        y = tensor.lmatrix('targets')

        # Normalize input and apply the convnet
        probs = convnet.apply(x)
        cg = ComputationGraph([probs])

        def full_brick_name(brick):
            return '/'.join([''] + [b.name for b in brick.get_unique_path()])

        # Find layer outputs to probe
        outs = OrderedDict((full_brick_name(get_brick(out)), out)
                for out in VariableFilter(
                    roles=[OUTPUT], bricks=[Convolutional, Linear])(
                        cg.variables))

        # Compute the error-rate and confusion-matrix metrics
        error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                      .copy(name='error_rate'))
        confusion = (ConfusionMatrix().apply(y.flatten(), probs)
                      .copy(name='confusion'))
        confusion.tag.aggregation_scheme = Sum(confusion)
        confusion_image = (ConfusionImage().apply(y.flatten(), probs, x)
                      .copy(name='confusion_image'))
        confusion_image.tag.aggregation_scheme = Sum(confusion_image)

        model = Model(
                [error_rate, confusion, confusion_image] + list(outs.values()))

        # Load it with trained parameters
        params = load_parameters(open(save_to, 'rb'))
        model.set_parameter_values(params)

        mnist_test = MNIST(("test",))
        mnist_test_stream = DataStream.default_stream(
            mnist_test,
            iteration_scheme=SequentialScheme(
                mnist_test.num_examples, batch_size))

        self.model = model
        self.mnist_test_stream = mnist_test_stream
        self.evaluator = DatasetEvaluator(
                [error_rate, confusion, confusion_image])
        self.base_results = self.evaluator.evaluate(mnist_test_stream)

        # TODO: allow target layer to be parameterized
        self.target_layer = '/lenet/mlp/linear_0'
        self.next_layer_param = '/lenet/mlp/linear_1.W'
        self.base_sample = extract_sample(
                outs[self.target_layer], mnist_test_stream)
        self.base_param_value = (
            model.get_parameter_dict()[
                self.next_layer_param].get_value().copy())
Example #47
0
import sys
from fuel.datasets.mnist import MNIST
from fuel.streams import DataStream

from fuel.schemes import ShuffledScheme
import theano
import logging
import numpy as np


logging.basicConfig()

m = VAModel()

# load parameters
model = Model(m.variational_cost)
print "loading params"
params = load_parameter_values(sys.argv[1])
model.set_param_values(params)

test_dataset = MNIST("test", sources=["features"])
test_scheme = ShuffledScheme(test_dataset.num_examples, 128)
test_stream = DataStream(test_dataset, iteration_scheme=test_scheme)

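# _func_sample decodes samples directly from the latent variable Z (presumably
# so new images can be generated by feeding random latent codes).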
_func_sample = theano.function([m.Z], m.sampled)
# _func_noisy = theano.function([m.X], m.noisy)
# _func_produced = theano.function([m.X], m.produced)

# batch = test_stream.get_epoch_iterator().next()[0]
# out_noise = _func_noisy(batch)
# out_produced = _func_produced(batch)
Example #48
0
def evaluate(model, load_path, plot):
    with open(load_path + 'trained_params_best.npz') as f:
        loaded = np.load(f)
        blocks_model = Model(model.cost)
        params_dicts = blocks_model.get_parameter_dict()
        params_names = params_dicts.keys()
        for param_name in params_names:
            param = params_dicts[param_name]
            # '/f_6_.W' --> 'f_6_.W'
            slash_index = param_name.find('/')
            param_name = param_name[slash_index + 1:]
            assert param.get_value().shape == loaded[param_name].shape
            param.set_value(loaded[param_name])

    if plot:
        train_data_stream, valid_data_stream = get_streams(20)
        # T x B x F
        data = train_data_stream.get_epoch_iterator().next()
        cg = ComputationGraph(model.cost)
        f = theano.function(cg.inputs, [model.location, model.scale],
                            on_unused_input='ignore',
                            allow_input_downcast=True)
        res = f(data[1], data[0])
        for i in range(10):
            visualize_attention(data[0][:, i, :],
                                res[0][:, i, :], res[1][:, i, :],
                                image_shape=(512, 512), prefix=str(i))

        plot_curves(path=load_path,
                    to_be_plotted=['train_categoricalcrossentropy_apply_cost',
                                   'valid_categoricalcrossentropy_apply_cost'],
                    yaxis='Cross Entropy',
                    titles=['train', 'valid'],
                    main_title='CE')

        plot_curves(path=load_path,
                    to_be_plotted=['train_learning_rate',
                                   'train_learning_rate'],
                    yaxis='lr',
                    titles=['train', 'train'],
                    main_title='lr')

        plot_curves(path=load_path,
                    to_be_plotted=['train_total_gradient_norm',
                                   'valid_total_gradient_norm'],
                    yaxis='GradientNorm',
                    titles=['train', 'valid'],
                    main_title='GradientNorm')

        for grad in ['_total_gradient_norm',
                     '_/lstmattention.W_patch_grad_norm',
                     '_/lstmattention.W_state_grad_norm',
                     '_/lstmattention.initial_cells_grad_norm',
                     '_/lstmattention.initial_location_grad_norm',
                     '_/lstmattention/lstmattention_mlp/linear_0.W_grad_norm',
                     '_/lstmattention/lstmattention_mlp/linear_1.W_grad_norm',
                     '_/mlp/linear_0.W_grad_norm',
                     '_/mlp/linear_1.W_grad_norm']:
            plot_curves(path=load_path,
                        to_be_plotted=['train' + grad,
                                       'valid' + grad],
                        yaxis='GradientNorm',
                        titles=['train',
                                'valid'],
                        main_title=grad.replace(
                            "_", "").replace("/", "").replace(".", ""))

        plot_curves(path=load_path,
                    to_be_plotted=[
                        'train_misclassificationrate_apply_error_rate',
                        'valid_misclassificationrate_apply_error_rate'],
                    yaxis='Error rate',
                    titles=['train', 'valid'],
                    main_title='Error')
        print 'plot printed'
Example #49
0
def generate_embeddings(config,
                        tar_path,
                        part,
                        dest_path,
                        format_,
                        average=False,
                        encoder_embeddings=None,
                        **kwargs):
    """
    generate embeddings for all the definitions, average them and serialize OR
    if encoder_embeddings, serialize the model's encoder embeddings

    config: name of the config of the model
    tar_path: tar path of the model parameters
    part: part of the dataset (should be either 'train', 'valid', 'test' or 'all')
    dest_path: directory where the serialized embeddings will be written
    format_: either 'dict' or 'glove'
    encoder_embeddings: None, 'only', 'mixed', 'if_missing'
      - None: don't include encoder embeddings
      - 'only': don't read any data, just serialize the encoder embeddings
      - 'mixed': add the encoder embeddings to the list of definition embeddings
      - 'if_missing': add the encoder embeddings when there is no corresponding def
    average: if true, multi-prototype embeddings will be averaged
    """
    if not os.path.exists(dest_path):
        os.makedirs(dest_path)

    c = config
    data, model = initialize_data_and_model(c, train_phase=False)
    words = T.ltensor3('words')
    words_mask = T.matrix('words_mask')
    keys = T.lmatrix('keys')
    n_identical_keys = T.lvector('n_identical_keys')
    sym_args = [words, words_mask]

    if format_ not in ['dict', 'glove']:
        raise ValueError("format should be either: dict, glove")

    if not c['encoder'] and encoder_embeddings != 'only':
        raise ValueError('Error: this model does not have an encoder.')

    if use_keys(c):
        sym_args.append(keys)
    if use_n_identical_keys(c):
        sym_args.append(n_identical_keys)

    costs = model.apply(*sym_args, train_phase=False)

    cg = Model(costs)

    with open(tar_path) as src:
        cg.set_parameter_values(load_parameters(src))

    if encoder_embeddings:
        if encoder_embeddings == 'only' and not c['encoder']:
            embeddings_array = model.get_def_embeddings_params('key').eval()
        else:
            embeddings_array = model.get_def_embeddings_params('main').eval()
        entries = model.get_embeddings_entries()
        enc_embeddings = {
            e: np.asarray(a)
            for e, a in zip(entries, embeddings_array)
        }
        if encoder_embeddings == 'only':
            serialize_embeddings(enc_embeddings, format_, dest_path,
                                 "encoder_embeddings")
            return 0

    embeddings_var, = VariableFilter(name='embeddings')(cg)
    compute = dict({"embeddings": embeddings_var})
    if c['proximity_coef'] != 0:
        prox_var, = VariableFilter(name='proximity_term')(cg)
        compute["proximity_term"] = prox_var
    print "sym args", sym_args
    predict_f = theano.function(sym_args, compute)
    batch_size = 256  # size of test_unseen
    stream = data.get_stream(part,
                             batch_size=batch_size,
                             max_length=c['max_length'],
                             remove_keys=False,
                             remove_n_identical_keys=False)
    raw_data = []  # list of dicts containing the inputs and computed outputs
    i = 0
    vocab = model._vocab
    print "start computing"
    embeddings = defaultdict(list)
    for input_data in stream.get_epoch_iterator(as_dict=True):
        if i % 10 == 0:
            print "iteration:", i
        words = input_data['words']
        words_mask = input_data['words_mask']
        keys = input_data['keys']
        n_identical_keys = input_data['n_identical_keys']
        args = [words, words_mask]
        if use_keys(c):
            args.append(keys)
        if use_n_identical_keys(c):
            args.append(n_identical_keys)

        to_save = predict_f(*args)
        for k, h in zip(keys, to_save['embeddings']):
            key = vec2str(k)
            if encoder_embeddings == 'if_missing':
                try:
                    del enc_embeddings[key]
                except KeyError:
                    pass
            embeddings[key].append(h)
        i += 1

    if encoder_embeddings in ['mixed', 'if_missing']:
        for k, e in enc_embeddings.iteritems():
            embeddings[k].append(e)

    if encoder_embeddings == 'mixed':
        prefix_fname = 'mix_e_'
    elif encoder_embeddings == 'if_missing':
        prefix_fname = 'if_mis_e_'
    else:
        prefix_fname = ''

    # combine:
    if average:
        mean_embeddings = {}
        for k in embeddings.keys():
            mean_embeddings[k] = np.mean(np.asarray(embeddings[k]), axis=0)
        serialize_embeddings(mean_embeddings, format_, dest_path,
                             prefix_fname + "mean_embeddings")
    else:
        serialize_embeddings(embeddings, format_, dest_path,
                             prefix_fname + "embeddings")
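# A minimal usage sketch for generate_embeddings (the config name, tar path and
# destination directory below are hypothetical placeholders):
#
#   generate_embeddings('my_config', 'results/model.tar', 'all',
#                       'results/embeddings', 'glove',
#                       average=True, encoder_embeddings='if_missing')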
Example #50
0
def main(config):
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')

    src_vocab = _ensure_special_tokens(pickle.load(
        open(config['src_vocab'], 'rb')),
                                       bos_idx=0,
                                       eos_idx=config['src_vocab_size'] - 1,
                                       unk_idx=config['unk_id'])

    trg_vocab = _ensure_special_tokens(pickle.load(
        open(config['trg_vocab'], 'rb')),
                                       bos_idx=0,
                                       eos_idx=config['trg_vocab_size'] - 1,
                                       unk_idx=config['unk_id'])

    target_bos_idx = trg_vocab[config['bos_token']]
    target_space_idx = trg_vocab[' ']

    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'],
                                   config['src_dgru_nhids'],
                                   config['enc_nhids'],
                                   config['src_dgru_depth'],
                                   config['bidir_encoder_depth'])

    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['transition_depth'], config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx,
                      target_bos_idx)

    representation = encoder.apply(source_char_seq, source_sample_matrix,
                                   source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq,
                        target_sample_matrix, target_resample_matrix,
                        target_char_aux, target_char_mask, target_word_mask,
                        target_prev_char_seq, target_prev_char_aux)

    # Set up model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    # Reload model if necessary
    extensions = [LoadNMT(config['saveto'])]

    # Initialize main loop
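    # The main loop gets no algorithm or data stream: it exists only so that
    # the LoadNMT extension can be run below to restore the trained parameters.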
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=None,
                         data_stream=None,
                         extensions=extensions)

    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')

    char_embedding = encoder.decimator.apply(source_char_seq.T,
                                             source_sample_matrix,
                                             source_char_aux.T)
    embedding(Model(char_embedding), src_vocab)
Example #51
0
def train(step_rule, label_dim, state_dim, epochs,
          seed, dropout, test_cost, experiment_path, features, weight_noise,
          to_watch, patience, batch_size, batch_norm, **kwargs):

    print '.. TIMIT experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()


    # ------------------------------------------------------------------------
    # Streams

    rng = np.random.RandomState(seed)
    stream_args = dict(rng=rng, batch_size=batch_size)

    print '.. initializing iterators'
    train_dataset = Timit('train', features=features)
    train_stream = construct_stream(train_dataset, **stream_args)
    dev_dataset = Timit('dev', features=features)
    dev_stream = construct_stream(dev_dataset, **stream_args)
    test_dataset = Timit('test', features=features)
    test_stream = construct_stream(test_dataset, **stream_args)
    update_stream = construct_stream(train_dataset, n_batches=100,
                                     **stream_args)

    phone_dict = train_dataset.get_phoneme_dict()
    phoneme_dict = {k: phone_to_phoneme_dict[v]
                    if v in phone_to_phoneme_dict else v
                    for k, v in phone_dict.iteritems()}
    ind_to_phoneme = {v: k for k, v in phoneme_dict.iteritems()}
    eol_symbol = ind_to_phoneme['<STOP>']
 
   
    # ------------------------------------------------------------------------
    # Graph

    print '.. building model'
    x = T.tensor3('features')
    y = T.matrix('phonemes')
    input_mask = T.matrix('features_mask')
    output_mask = T.matrix('phonemes_mask')

    theano.config.compute_test_value = 'off'
    x.tag.test_value = np.random.randn(100, 24, 123).astype(floatX)
    y.tag.test_value = np.ones((30, 24), dtype=floatX)
    input_mask.tag.test_value = np.ones((100, 24), dtype=floatX)
    output_mask.tag.test_value = np.ones((30, 24), dtype=floatX)

    seq_len = 100 
    input_dim = 123 
    activation = Tanh()
    recurrent_init = IdentityInit(0.99) 

    if batch_norm:
        rec1 = LSTMBatchNorm(name='rec1',
                             dim=state_dim,
                             activation=activation,
                             weights_init=NormalizedInitialization())
        #rec1 = SimpleRecurrentBatchNorm(name='rec1',
        #                                dim=state_dim,
        #                                activation=activation,
        #                                seq_len=seq_len,
        #                                weights_init=recurrent_init)
        #rec2 = SimpleRecurrentBatchNorm(name='rec2',
        #                                dim=state_dim,
        #                                activation=activation,
        #                                seq_len=seq_len,
        #                                weights_init=recurrent_init)
        #rec3 = SimpleRecurrentBatchNorm(name='rec3',
        #                                dim=state_dim,
        #                                activation=activation,
        #                                seq_len=seq_len,
        #                                weights_init=recurrent_init)
    else:
        rec1 = LSTM(name='rec1', dim=state_dim, activation=activation,
                    weights_init=NormalizedInitialization())
        #rec1 = SimpleRecurrent(name='rec1', dim=state_dim, activation=activation,
        #                       weights_init=recurrent_init)
        #rec2 = SimpleRecurrent(name='rec2', dim=state_dim, activation=activation,
        #                       weights_init=recurrent_init)
        #rec3 = SimpleRecurrent(name='rec3', dim=state_dim, activation=activation,
        #                       weights_init=recurrent_init)
    
    rec1.initialize()
    #rec2.initialize()
    #rec3.initialize()
    
    s1 = MyRecurrent(rec1, [input_dim, state_dim, label_dim + 1],
                     activations=[Identity(), Identity()], name='s1')
    #s2 = MyRecurrent(rec2, [state_dim, state_dim, state_dim],
    #                 activations=[Identity(), Identity()], name='s2')
    #s3 = MyRecurrent(rec3, [state_dim, state_dim, label_dim + 1],
    #                 activations=[Identity(), Identity()], name='s3')

    s1.initialize()
    #s2.initialize()
    #s3.initialize()

    o1 = s1.apply(x, input_mask)
    #o2 = s2.apply(o1)
    #y_hat_o = s3.apply(o2)
    y_hat_o = o1
    
    shape = y_hat_o.shape
    y_hat = Softmax().apply(y_hat_o.reshape((-1, shape[-1]))).reshape(shape)

    y_mask = output_mask
    y_hat_mask = input_mask


    # ------------------------------------------------------------------------
    # Costs and Algorithm

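    # The targets are shifted by one (y + ones_like(y)), presumably because the
    # warp-ctc bindings reserve label 0 for the CTC blank symbol; the
    # unnormalised activations y_hat_o are passed, as warp-ctc applies its own
    # softmax internally.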
    ctc_cost = T.sum(ctc.cpu_ctc_th(
         y_hat_o, T.sum(y_hat_mask, axis=0),
         y + T.ones_like(y), T.sum(y_mask, axis=0)))
    batch_cost = ctc_cost.copy(name='batch_cost')

    bs = y.shape[1]
    cost_train = aggregation.mean(batch_cost, bs).copy("sequence_cost")
    cost_per_character = aggregation.mean(batch_cost,
                                          output_mask.sum()).copy(
                                                  "character_cost")
    cg_train = ComputationGraph(cost_train)

    model = Model(cost_train)
    train_cost_per_character = aggregation.mean(cost_train,
                                                output_mask.sum()).copy(
                                                        "train_character_cost")

    algorithm = GradientDescent(step_rule=step_rule, cost=cost_train,
                                parameters=cg_train.parameters,
                                on_unused_sources='warn')



    # ------------------------------------------------------------------------
    # Monitoring and extensions

    parameters = model.get_parameter_dict()
    observed_vars = [cost_train, train_cost_per_character,
                     aggregation.mean(algorithm.total_gradient_norm)]
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name + "_norm"))
        observed_vars.append(algorithm.gradients[param].norm(2).copy(name + "_grad_norm"))
    train_monitor = TrainingDataMonitoring(
        variables=observed_vars,
        prefix="train", after_epoch=True)

    dev_monitor = DataStreamMonitoring(
        variables=[cost_train, cost_per_character],
        data_stream=dev_stream, prefix="dev"
    )
    train_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol, train_stream,
                                      prefix='train', every_n_epochs=1,
                                      before_training=True,
                                      phoneme_dict=phoneme_dict,
                                      black_list=black_list, train=True)
    dev_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol, dev_stream,
                                    prefix='dev', every_n_epochs=1,
                                    phoneme_dict=phoneme_dict,
                                    black_list=black_list)

    extensions = []
    if 'load_path' in kwargs:
        extensions.append(Load(kwargs['load_path']))

    extensions.extend([FinishAfter(after_n_epochs=epochs),
                       train_monitor,
                       dev_monitor,
                       train_ctc_monitor,
                       dev_ctc_monitor])

    if test_cost:
        test_monitor = DataStreamMonitoring(
            variables=[cost_train, cost_per_character],
            data_stream=test_stream,
            prefix="test"
        )
        test_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol, test_stream,
                                         prefix='test', every_n_epochs=1,
                                         phoneme_dict=phoneme_dict,
                                         black_list=black_list)
        extensions.append(test_monitor)
        extensions.append(test_ctc_monitor)

    #if not os.path.exists(experiment_path):
    #    os.makedirs(experiment_path)
    #best_path = os.path.join(experiment_path, 'best/')
    #if not os.path.exists(best_path):
    #    os.mkdir(best_path)
    #best_path = os.path.join(best_path, 'model.bin')
    extensions.append(EarlyStopping(to_watch, patience, '/dev/null'))
    extensions.extend([ProgressBar(), Printing()])


    # ------------------------------------------------------------------------
    # Main Loop

    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)

    print "Building time: %f" % (time.time() - t0)
   # if write_predictions:
   #     with open('predicted.txt', 'w') as f_pred:
   #         with open('targets.txt', 'w') as f_targets:
   #             evaluator = CTCEvaluator(
   #                 eol_symbol, x, input_mask, y_hat, phoneme_dict, black_list)
   #             evaluator.evaluate(dev_stream, file_pred=f_pred,
   #                                file_targets=f_targets)
   #     return
    main_loop.run()
Example #52
0
                                             iteration_scheme=SequentialScheme(
                                                 data_test.num_examples,
                                                 batch_size=bs))

learning_rate = 0.0002
n_epochs = 100
algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=CompositeRule([
                                StepClipping(10.),
                                Adam(learning_rate),
                            ]))

print('..loading...')
load = Load('/home/xuehongyang/checkpoints_open/snapshot_18')
predictor = PredictDataStream(data_stream=data_stream_test,
                              output_tensor=result,
                              path='/home/xuehongyang/RESULT_MAIN',
                              before_training=True,
                              after_epoch=False,
                              after_training=False)
main_loop = MainLoop(
    model=Model(cost),
    data_stream=data_stream_train,
    algorithm=algorithm,
    extensions=[Timing(),
                FinishAfter(after_n_epochs=1), load, predictor])

print('start prediction ...')
main_loop.run()
Example #53
0
def main(name, epochs, batch_size, learning_rate,
         dim, mix_dim, old_model_name, max_length, bokeh, GRU, dropout,
         depth, max_grad, step_method, epsilon, sample, skip, uniform, top):

    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """ Convert a positive float into a short tag-usable string
             E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2
        """
        return '0' if x <= 0 else '%s%d' % (("%e"%x)[0], -np.floor(np.log10(x)))

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (datasource, depth, dim, mix_dim,
                                           int(dropout*10),
                                           shnum(learning_rate), batch_size,
                                           shnum(epsilon))
    if max_length != 600:
        jobname += '-L%d'%max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g'%max_grad
    if step_method != 'adam':
        jobname += step_method
    if skip:
        jobname += 'D'
        assert depth > 1
    if top:
        jobname += 'T'
        assert depth > 1
    if uniform > 0.:
        jobname += 'u%d'%int(uniform*100)

    if debug:
        jobname += ".debug"

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)
    if old_model_name:
        print("starting from model %s"%old_model_name)

    #----------------------------------------------------------------------
    transitions = [GatedRecurrent(dim=dim) if GRU else LSTM(dim=dim)
                   for _ in range(depth)]
    if depth > 1:
        transition = RecurrentStack(transitions, name="transition",
                                    fast=True, skip_connections=skip or top)
        if skip:
            source_names=['states'] + ['states_%d'%d for d in range(1,depth)]
        else:
            source_names=['states_%d'%(depth-1)]
    else:
        transition = transitions[0]
        transition.name = "transition"
        source_names=['states']

    emitter = SketchEmitter(mix_dim=mix_dim,
                            epsilon=epsilon,
                            name="emitter")
    readout = Readout(
        readout_dim=emitter.get_dim('inputs'),
        source_names=source_names,
        emitter=emitter,
        name="readout")
    normal_inputs = [name for name in transition.apply.sequences
                     if 'mask' not in name]
    fork = Fork(normal_inputs, prototype=Linear(use_bias=True))
    generator = SequenceGenerator(readout=readout, transition=transition,
                                  fork=fork)

    # Initialization settings
    if uniform > 0.:
        generator.weights_init = Uniform(width=uniform*2.)
    else:
        generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps, batch_size, 3]
    x = T.tensor3('features', dtype=floatX)
    if debug:
        x.tag.test_value = np.ones((max_length,batch_size,3)).astype(floatX)
    x = x[:max_length,:,:]  # has to be after setting test_value
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_params()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, value.get_value().shape) for key, value
                     in params.items()],
                    width=120))
    model_size = 0
    for v in params.itervalues():
        s = v.get_value().shape
        model_size += s[0] * (s[1] if len(s) > 1 else 1)
    logger.info("Total number of parameters %d"%model_size)

    #------------------------------------------------------------
    extensions = []
    if old_model_name == 'continue':
        extensions.append(LoadFromDump(jobname))
    elif old_model_name:
        # or you can just load the weights without state using:
        old_params = LoadFromDump(old_model_name).manager.load_parameters()
        model.set_param_values(old_params)
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path=old_model_name).do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
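        # Dropout is applied to the recurrent state outputs of every transition;
        # the first output of the transformed graph (opt_cost) is what the
        # optimiser minimises, while the clean `cost` is still monitored.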
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=transitions,
                                        name_regex='states')(cg.variables)
        print('# dropout %d' % len(dropout_target))
        cg = apply_dropout(cg, dropout_target, dropout)
        opt_cost = cg.outputs[0]
    else:
        opt_cost = cost

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate)
    else:
        raise Exception('Unknown step method %s' % step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(
        cost=opt_cost, params=cg.parameters,
        step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies,) = VariableFilter(
        applications=[generator.readout.readout],
        name_regex="output")(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    observables += [min_energy, max_energy]

    # (activations,) = VariableFilter(
    #     applications=[generator.transition.apply],
    #     name=generator.transition.apply.states[0])(cg.variables)
    # mean_activation = named_copy(abs(activations).mean(),
    #                              "mean_activation")
    # observables.append(mean_activation)

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(named_copy(
            param.norm(2), name + "_norm"))
        observables.append(named_copy(
            algorithm.gradients[param].norm(2), name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource+'.hdf5')

    train_ds = H5PYDataset(datasource_fname, #max_length=max_length,
                             which_set='train', sources=('features',),
                             load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(datasource_fname, #max_length=max_length,
                            which_set='test', sources=('features',),
                            load_in_memory=True)
    test_stream  = DataStream(test_ds,
                              iteration_scheme=SequentialScheme(
                                  test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' %
              (label, batch_count, examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [Timing(every_n_batches=10),
                   TrainingDataMonitoring(
                       observables, prefix="train",
                       every_n_batches=10),
                   DataStreamMonitoring(
                       [cost],  # without dropout
                       test_stream,
                       prefix="test",
                       on_resumption=True,
                       after_epoch=False,  # by default this is True
                       every_n_batches=100),
                   # all monitored data is ready so print it...
                   # (next steps may take more time and we want to see the
                   # results as soon as possible so print as soon as you can)
                   Printing(every_n_batches=10),
                   # perform multiple dumps at different intervals
                   # so if one of them breaks (has nan) we can hopefully
                   # find a model from a few batches ago in the other
                   Dump(jobname, every_n_batches=11),
                   Dump(jobname+'.test', every_n_batches=100),
                   Sample(generator, steps=max_length,
                          path=jobname+'.test',
                          every_n_batches=100),
                   ProgressBar(),
                   FinishAfter(after_n_epochs=epochs)
                    # This shows a way to handle NaN emerging during
                    # training: simply finish it.
                    .add_condition("after_batch", _is_nan),
                   ]

    if bokeh:
        from blocks.extensions.plot import Plot
        extensions.append(Plot(
            'sketch',
            channels=[
                ['cost'],]))

    # Construct the main loop and start training!
    main_loop = MainLoop(
        model=model,
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions
        )

    main_loop.run()
Example #54
0
def setup_model(configs):
    tensor5 = theano.tensor.TensorType(config.floatX, (False,) * 5)
    # shape: T x B x C x X x Y
    input_ = tensor5('features')
    tensor3 = theano.tensor.TensorType(config.floatX, (False,) * 3)
    locs = tensor3('locs')
    # shape: B x Classes
    target = T.ivector('targets')

    model = LSTMAttention(
        configs,
        weights_init=Glorot(),
        biases_init=Constant(0))
    model.initialize()

    (h, c, location, scale, alpha, patch, down_sampled_input,
        conved_part_1, conved_part_2, pre_lstm) = model.apply(input_, locs)

    model.location = location
    model.scale = scale
    model.alpha = alpha
    model.patch = patch

    classifier = MLP(
        [Rectifier(), Softmax()],
        configs['classifier_dims'],
        weights_init=Glorot(),
        biases_init=Constant(0))
    classifier.initialize()

    probabilities = classifier.apply(h[-1])
    cost = CategoricalCrossEntropy().apply(target, probabilities)
    cost.name = 'CE'
    error_rate = MisclassificationRate().apply(target, probabilities)
    error_rate.name = 'ER'
    model.cost = cost
    model.error_rate = error_rate
    model.probabilities = probabilities

    if configs['load_pretrained']:
        blocks_model = Model(model.cost)
        all_params = blocks_model.parameters
        with open('VGG_CNN_params.npz') as f:
            loaded = np.load(f)
            all_conv_params = loaded.keys()
            for param in all_params:
                if param.name in loaded.keys():
                    assert param.get_value().shape == loaded[param.name].shape
                    param.set_value(loaded[param.name])
                    all_conv_params.pop(all_conv_params.index(param.name))
        print "the following parameters did not match: " + str(all_conv_params)

    if configs['test_model']:
        print "TESTING THE MODEL: CHECK THE INPUT SIZE!"
        cg = ComputationGraph(model.cost)
        f = theano.function(cg.inputs, [model.cost],
                            on_unused_input='ignore',
                            allow_input_downcast=True)
        data = configs['get_streams'](configs[
            'batch_size'])[0].get_epoch_iterator().next()
        f(data[1], data[0], data[2])

        print "Test passed! ;)"

    model.monitorings = [cost, error_rate]

    return model
Example #55
0
def main(save_to, num_epochs, feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, batch_size=500,
         num_batches=None):
    if feature_maps is None:
        feature_maps = [20, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 1, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))
    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))

    cg = ComputationGraph([cost, error_rate])

    mnist_train = MNIST(("train",))
    mnist_train_stream = DataStream.default_stream(
        mnist_train, iteration_scheme=ShuffledScheme(
            mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test",))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=ShuffledScheme(
            mnist_test.num_examples, batch_size))

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=0.1))
    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  DataStreamMonitoring(
                      [cost, error_rate],
                      mnist_test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  ProgressBar(),
                  Printing()]

    model = Model(cost)

    main_loop = MainLoop(
        algorithm,
        mnist_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()
Example #56
0
def train(model, configs):
    get_streams = configs['get_streams']
    save_path = configs['save_path']
    num_epochs = configs['num_epochs']
    batch_size = configs['batch_size']
    lrs = configs['lrs']
    until_which_epoch = configs['until_which_epoch']
    grad_clipping = configs['grad_clipping']
    monitorings = model.monitorings

    # Training
    if configs['weight_noise'] > 0:
        cg = ComputationGraph(model.cost)
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cg = apply_noise(cg, weights, configs['weight_noise'])
        model.cost = cg.outputs[0].copy(name='CE')

    if configs['l2_reg'] > 0:
        cg = ComputationGraph(model.cost)
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        new_cost = model.cost + configs['l2_reg'] * sum([
            (weight ** 2).sum() for weight in weights])
        model.cost = new_cost.copy(name='CE')

    blocks_model = Model(model.cost)
    all_params = blocks_model.parameters
    print "Number of found parameters:" + str(len(all_params))
    print all_params

    default_lr = np.float32(configs['lrs'][0])
    lr_var = theano.shared(default_lr, name="learning_rate")

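    # CompositeRule applies its rules in order: the gradient is first clipped
    # to the given norm threshold, then passed to Adam, whose learning rate is
    # the shared variable lr_var so that LRDecay can anneal it during training.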
    clipping = StepClipping(threshold=np.cast[floatX](grad_clipping))
    # sgd_momentum = Momentum(
    #     learning_rate=0.0001,
    #     momentum=0.95)
    # step_rule = CompositeRule([clipping, sgd_momentum])
    adam = Adam(learning_rate=lr_var)
    step_rule = CompositeRule([clipping, adam])
    training_algorithm = GradientDescent(
        cost=model.cost, parameters=all_params,
        step_rule=step_rule)

    monitored_variables = [
        lr_var,
        aggregation.mean(training_algorithm.total_gradient_norm)] + monitorings

    for param in all_params:
        name = param.tag.annotations[0].name + "." + param.name
        to_monitor = training_algorithm.gradients[param].norm(2)
        to_monitor.name = name + "_grad_norm"
        monitored_variables.append(to_monitor)
        to_monitor = param.norm(2)
        to_monitor.name = name + "_norm"
        monitored_variables.append(to_monitor)

    train_data_stream, valid_data_stream = get_streams(batch_size)

    train_monitoring = TrainingDataMonitoring(
        variables=monitored_variables,
        prefix="train",
        after_epoch=True)

    valid_monitoring = DataStreamMonitoring(
        variables=monitored_variables,
        data_stream=valid_data_stream,
        prefix="valid",
        after_epoch=True)

    main_loop = MainLoop(
        algorithm=training_algorithm,
        data_stream=train_data_stream,
        model=blocks_model,
        extensions=[
            train_monitoring,
            valid_monitoring,
            FinishAfter(after_n_epochs=num_epochs),
            SaveParams('valid_CE',
                       blocks_model, save_path,
                       after_epoch=True),
            SaveLog(after_epoch=True),
            ProgressBar(),
            LRDecay(lr_var, lrs, until_which_epoch,
                    after_epoch=True),
            Printing(after_epoch=True)])
    main_loop.run()
Example #57
0
# states = {}
states = [state for state in generator.transition.apply.outputs if state != "step"]

# ipdb.set_trace()

states = {name: shared_floatx_zeros((batch_size, hidden_size_recurrent)) for name in states}

cost_matrix = generator.cost_matrix(x, attended=context, **states)

cost = cost_matrix.mean() + 0.0 * start_flag
cost.name = "nll"

cg = ComputationGraph(cost)

model = Model(cost)

transition_matrix = VariableFilter(theano_name_regex="state_to_state")(cg.parameters)
for matr in transition_matrix:
    matr.set_value(0.98 * numpy.eye(hidden_size_recurrent, dtype=floatX))

from play.utils import regex_final_value

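# Each shared state variable carries the final recurrent state of one batch over
# to the next (truncated BPTT); the switch below presumably resets it to zero
# whenever start_flag marks the beginning of a new sequence.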
extra_updates = []
for name, var in states.items():
    update = tensor.switch(
        start_flag, 0.0 * var, VariableFilter(theano_name_regex=regex_final_value(name))(cg.auxiliary_variables)[0]
    )
    extra_updates.append((var, update))

Example #58
0
def main(argv):
    name = argv[1]
    files = map(lambda p: join(folder, p), listdir(folder))

    file = next(filter(lambda n: name in n, files))
    print(file)

    p = load_parameter_values(file)

    net = net_dvc((128, 128))

    x = tensor.tensor4('image_features')
    y_hat = net.apply(x)

    g = Model(y_hat)

    for k, v in p.items():
        p[k] = v.astype('float32')

    g.set_parameter_values(p)

    a, t, v = get_dvc((128, 128), trainning=False, shortcut=False)
    run = function([x], y_hat)

    def run_test(data):
        res = []
        for i in data.get_epoch_iterator():
            res.extend(run(i[0]))
        return res

    def max_index(l):
        if l[0] > l[1]:
            return 0
        else:
            return 1

    def write_kaggle(f, l):
        f.write("id,label\n")
        for i, e in enumerate(l, start=1):
            f.write(str(i) + "," + str(e) + "\n")

    def kaggle(file, data):
        write_kaggle(file, map(max_index, run_test(data)))

    def accuracy(data):
        res = []
        true = []
        for i in data.get_epoch_iterator():
            res.extend(run(i[0]))
            true.extend(i[1])
        res = map(max_index, res)

        total = 0
        equal = 0
        for r, t in zip(res, true):
            total += 1
            equal += 1 if r == t else 0

        return equal / total

    print("Training accuracy: ", accuracy(a))
    print("Test accuracy: ", accuracy(v))
    kaggle_file = join(result_folder, name + ".kaggle")
    print(kaggle_file)
    with open(kaggle_file, 'w') as f:
        kaggle(f, t)
Example #59
0
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = chars.shape[1].copy(name="batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in parameters.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            applications=[generator.readout.readout],
            name_regex="output")(cg.variables)
        (activations,) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = chars.shape[0].copy(name="max_length")
        cost_per_character = aggregation.mean(
            batch_cost, batch_size * max_length).copy(
                name="character_log_likelihood")
        min_energy = energies.min().copy(name="min_energy")
        max_energy = energies.max().copy(name="max_energy")
        mean_activation = abs(activations).mean().copy(
                name="mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, parameter in parameters.items():
            observables.append(parameter.norm(2).copy(name + "_norm"))
            observables.append(algorithm.gradients[parameter].norm(2).copy(
                name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    applications=[reverser.generator.generate], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile the beam search functions
                # every time the user presses Enter. Do not create
                # a new `BeamSearch` object on every call if
                # speed matters to you.
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs
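        # A minimal sketch (not part of the original) of avoiding the
        # recompilation noted inside `generate`: build the beam search once,
        # outside the helper, and reuse it on every call, e.g.
        #     samples, = VariableFilter(
        #         applications=[reverser.generator.generate], name="outputs")(
        #             ComputationGraph(generated[1]))
        #     beam_search = BeamSearch(samples)
        # and then call `beam_search.search(...)` from within `generate`.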

        while True:
            try:
                line = input("Enter a sentence\n")
                message = ("Enter the number of samples\n" if mode == "sample"
                        else "Enter the beam size\n")
                batch_size = int(input(message))
            except EOFError:
                break
            except Exception:
                traceback.print_exc()
                continue

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
Example #60
0
def main(name, epochs, batch_size, learning_rate, dim, mix_dim, old_model_name,
         max_length, bokeh, GRU, dropout, depth, max_grad, step_method,
         epsilon, sample):

    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """Convert a float into a short tag-usable string.

        Non-positive values map to '0'.
        E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2
        """
        return '0' if x <= 0 else '%s%d' % (
            ("%e" % x)[0], -np.floor(np.log10(x)))
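    # Rough hand-computed sanity checks for the encoding above (not from the
    # original): shnum(0.1) -> '11', shnum(0.01) -> '12', shnum(3.0) -> '30',
    # i.e. the first mantissa digit followed by the negated decimal exponent.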

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (
        datasource, depth, dim, mix_dim, int(
            dropout * 10), shnum(learning_rate), batch_size, shnum(epsilon))
    if max_length != 600:
        jobname += '-L%d' % max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g' % max_grad
    if step_method != 'adam':
        jobname += step_method

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)

    #----------------------------------------------------------------------
    if depth > 1:
        transition = LSTMstack(dim=dim,
                               depth=depth,
                               name="transition",
                               lstm_name="transition")
        assert not GRU
    elif GRU:
        transition = GatedRecurrent(dim=dim, name="transition")
    else:
        transition = LSTM(dim=dim, name="transition")

    emitter = SketchEmitter(mix_dim=mix_dim, epsilon=epsilon, name="emitter")
    readout = Readout(readout_dim=emitter.get_dim('inputs'),
                      source_names=['states'],
                      emitter=emitter,
                      name="readout")
    normal_inputs = [
        name for name in transition.apply.sequences if 'mask' not in name
    ]
    fork = Fork(normal_inputs, prototype=Linear(use_bias=True))
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  fork=fork)

    # Initialization settings
    generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps,batch_size, 3]
    x = T.tensor3('features', dtype=floatX)[:max_length, :, :]
    x.tag.test_value = np.ones((max_length, batch_size, 3)).astype(np.float32)
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_params()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, value.get_value().shape)
                                for key, value in params.items()],
                               width=120))
    model_size = 0
    for v in params.values():
        s = v.get_value().shape
        model_size += s[0] * (s[1] if len(s) > 1 else 1)
    logger.info("Total number of parameters %d" % model_size)
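    # A minimal alternative sketch (not from the original): the same count via
    # numpy, robust to parameters of any rank:
    #     model_size = sum(int(np.prod(v.get_value().shape))
    #                      for v in params.values())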

    #------------------------------------------------------------
    extensions = []
    if old_model_name == 'continue':
        extensions.append(LoadFromDump(jobname))
    elif old_model_name:
        # or you can just load the weights without state using:
        old_params = LoadFromDump(old_model_name).manager.load_parameters()
        model.set_param_values(old_params)
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path='.').do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=[transition],
                                        name_regex='states')(cg.variables)
        cg = apply_dropout(cg, dropout_target, dropout)
        cost = cg.outputs[0]

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate=0.1)
    else:
        raise Exception('Unknown step method %s' % step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies, ) = VariableFilter(applications=[generator.readout.readout],
                                  name_regex="output")(cg.variables)
    (activations, ) = VariableFilter(
        applications=[generator.transition.apply],
        name=generator.transition.apply.states[0])(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_activation = named_copy(abs(activations).mean(), "mean_activation")
    observables += [min_energy, max_energy, mean_activation]

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(named_copy(param.norm(2), name + "_norm"))
        observables.append(
            named_copy(algorithm.gradients[param].norm(2),
                       name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource + '.hdf5')

    train_ds = H5PYDataset(
        datasource_fname,  # max_length=max_length,
        which_set='train',
        sources=('features', ),
        load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(
        datasource_fname,  # max_length=max_length,
        which_set='test',
        sources=('features', ),
        load_in_memory=True)
    test_stream = DataStream(test_ds,
                             iteration_scheme=SequentialScheme(
                                 test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' %
              (label, batch_count, examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [
        Timing(every_n_batches=10),
        TrainingDataMonitoring(observables, prefix="train",
                               every_n_batches=10),
        DataStreamMonitoring(
            [cost],
            test_stream,
            prefix="test",
            on_resumption=True,
            after_epoch=False,  # by default this is True
            every_n_batches=100),
        # All monitored data is ready, so print it.
        # (The following extensions may take more time, and we want to
        # see the results as soon as possible.)
        Printing(every_n_batches=10),
        # Perform multiple dumps at different intervals, so that if one of
        # them breaks (contains NaN) we can hopefully recover a model from
        # a few batches earlier in the other.
        Dump(jobname, every_n_batches=11),
        Dump(jobname + '.test', every_n_batches=100),
        Sample(generator,
               steps=max_length,
               path=jobname + '.test',
               every_n_batches=100),
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs)
        # This shows a way to handle NaN emerging during
        # training: simply finish it.
        .add_condition(["after_batch"], _is_nan),
    ]
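    # `_is_nan` is defined elsewhere in this example; a minimal hypothetical
    # sketch of such a predicate, assuming the gradient norm is monitored in
    # the log's current row, could be:
    #     def _is_nan(log):
    #         return math.isnan(
    #             log.current_row.get('total_gradient_norm', 0.))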

    if bokeh:
        extensions.append(Plot('sketch', channels=[
            ['cost'],
        ]))

    # Construct the main loop and start training!
    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()