Beispiel #1
0
 def _create_model(with_dropout):
     """Build a ``Model`` from the GAN losses, optionally with dropout.

     When ``with_dropout`` is true, INPUT-role variables of
     ``gan.discriminator.children[1:]`` are dropped with probability 0.5,
     then INPUT-role variables of ``gan.discriminator`` itself with
     probability 0.2.  ``gan``, ``x`` and ``z`` are free names resolved
     outside this function.
     """
     graph = ComputationGraph(gan.compute_losses(x, z))
     if not with_dropout:
         return Model(graph.outputs)

     # Each step filters the graph produced by the previous step.
     dropout_plan = (
         (gan.discriminator.children[1:], 0.5),
         ([gan.discriminator], 0.2),
     )
     for bricks, drop_prob in dropout_plan:
         targets = VariableFilter(bricks=bricks,
                                  roles=[INPUT])(graph.variables)
         graph = apply_dropout(graph, targets, drop_prob)
     return Model(graph.outputs)
Beispiel #2
0
 def _create_model(with_dropout):
     """Return a ``Model`` over the GAN losses, with optional dropout.

     With ``with_dropout`` set, dropout is applied in two passes: first
     (probability 0.5) to INPUT-role variables of every discriminator
     child except the first, then (probability 0.2) to INPUT-role
     variables of the discriminator brick itself.
     """
     cg = ComputationGraph(gan.compute_losses(x, z))
     if with_dropout:
         discriminator = gan.discriminator

         child_filter = VariableFilter(bricks=discriminator.children[1:],
                                       roles=[INPUT])
         cg = apply_dropout(cg, child_filter(cg.variables), 0.5)

         # The second filter runs on the graph returned by the first pass.
         top_filter = VariableFilter(bricks=[discriminator], roles=[INPUT])
         cg = apply_dropout(cg, top_filter(cg.variables), 0.2)
     return Model(cg.outputs)
Beispiel #3
0
 def _create_model(with_dropout):
     """Build a ``Model`` from the ALI losses, optionally with dropout.

     When ``with_dropout`` is true, INPUT-role variables of the first
     layer of the x- and z-discriminators are dropped with probability
     0.2; afterwards INPUT-role variables of ``x_discriminator.layers[2::3]``,
     ``z_discriminator.layers[2::2]`` and ``joint_discriminator.layers[::2]``
     are dropped with probability 0.5.  ``ali``, ``x`` and ``z`` are free
     names resolved outside this function.
     """
     graph = ComputationGraph(ali.compute_losses(x, z))
     if with_dropout:
         disc = ali.discriminator

         first_layers = [disc.x_discriminator.layers[0],
                         disc.z_discriminator.layers[0]]
         first_inputs = VariableFilter(bricks=first_layers,
                                       roles=[INPUT])(graph.variables)
         graph = apply_dropout(graph, first_inputs, 0.2)

         later_layers = (disc.x_discriminator.layers[2::3] +
                         disc.z_discriminator.layers[2::2] +
                         disc.joint_discriminator.layers[::2])
         later_inputs = VariableFilter(bricks=later_layers,
                                       roles=[INPUT])(graph.variables)
         graph = apply_dropout(graph, later_inputs, 0.5)
     return Model(graph.outputs)
Beispiel #4
0
 def train_base_model(self, train_data, test_data, input_dim):
     """Train the base model with input dropout and return its MLP.

     Parameters
     ----------
     train_data
         Data stream handed to ``MainLoop`` for training.
     test_data
         Data stream monitored under the ``"test"`` prefix.
     input_dim
         Forwarded unchanged to ``self.create_base_model``.

     Returns
     -------
     The ``mlp`` object returned by ``self.create_base_model``.
     """
     x = T.matrix('features')
     y = T.matrix('targets')
     mlp, cost, mis_cost = self.create_base_model(x, y, input_dim)

     cg = ComputationGraph([cost])
     inputs = VariableFilter(roles=[INPUT])(cg.variables)
     cg = apply_dropout(cg, inputs, 0.2)
     # apply_dropout returns a new graph and leaves ``cost`` untouched, so
     # the training cost must be read back from the dropout graph;
     # optimising ``cost`` directly would silently train without dropout.
     [cost_dropout] = cg.outputs

     algorithm = GradientDescent(cost=cost_dropout,
                                 parameters=cg.parameters,
                                 step_rule=Adam(learning_rate=0.001))

     # Monitoring uses ``mis_cost`` from the original (dropout-free) graph.
     monitor = DataStreamMonitoring(variables=[mis_cost],
                                    data_stream=test_data,
                                    prefix="test")
     plot_ext = Plot('F1-measure',
                     channels=[['test_MisclassificationRate']],
                     after_batch=True)
     main_loop = MainLoop(data_stream=train_data,
                          algorithm=algorithm,
                          extensions=[
                              monitor,
                              FinishAfter(after_n_epochs=50),
                              Printing(), plot_ext
                          ])
     main_loop.run()
     return mlp
def build_model(images, labels):
    """Build the VGG + top-direction-block cost with dropout.

    Returns
    -------
    tuple
        ``(cost_dropout, parameters)``: the first output of the dropout
        graph and the list of parameters selected from the feedforward
        sequence.
    """
    feature_net = VGG(layer='conv4_4')
    feature_net.push_initialization_config()
    feature_net.initialize()

    direction_head = top_direction_block()
    direction_head.push_initialization_config()
    direction_head.initialize()

    # Chain the two bricks into one feedforward pipeline.
    pipeline = FeedforwardSequence([feature_net.apply, direction_head.apply])
    pipeline.push_initialization_config()
    pipeline.initialize()

    prediction = pipeline.apply(images)
    cost_brick = StructuredCost()
    # Predictions are clipped to [1e-5, 1 - 1e-5] before the cost is applied.
    clipped = theano.tensor.clip(prediction, 1e-5, 1 - 1e-5)
    loss = cost_brick.apply(labels, clipped)

    graph = ComputationGraph(loss)
    # Dropout (p=0.5) goes on the first OUTPUT-role variable only.
    first_output = VariableFilter(roles=[OUTPUT])(graph.variables)[0]
    dropout_graph = apply_dropout(graph, [first_output], .5)

    # Learned parameters, in the order the selector reports them.
    learned = list(Selector([pipeline]).get_parameters().values())

    return dropout_graph.outputs[0], learned
Beispiel #6
0
def build_mlp(features_cat, features_int, labels):
    """Build a gated MLP with a MAPE cost and a dropout variant of it.

    Two sub-networks (``mlp_categorical`` on ``features_cat`` and
    ``mlp_interval`` on ``features_int``) are multiplied elementwise and
    fed to a final MLP whose output is scored with ``MAPECost``.

    Returns
    -------
    tuple
        ``(cost_dropout1, parameters, cost)``: the dropout graph's first
        output, that graph's parameters, and the dropout-free cost.
    """
    mlp_int = MLP(activations=[Rectifier(), Rectifier()],
                  dims=[19, 50, 50],
                  weights_init=IsotropicGaussian(),
                  biases_init=Constant(0),
                  name='mlp_interval')
    mlp_int.initialize()
    mlp_cat = MLP(activations=[Logistic()],
                  dims=[320, 50],
                  weights_init=IsotropicGaussian(),
                  biases_init=Constant(0),
                  name='mlp_categorical')
    mlp_cat.initialize()

    mlp = MLP(activations=[Rectifier(), None],
              dims=[50, 50, 1],
              weights_init=IsotropicGaussian(),
              biases_init=Constant(0))
    mlp.initialize()

    gated = mlp_cat.apply(features_cat) * mlp_int.apply(features_int)
    prediction = mlp.apply(gated)
    cost = MAPECost().apply(prediction, labels)

    cg = ComputationGraph(cost)
    # Parenthesised form prints the same on Python 2 and parses on Python 3.
    print(cg.variables)

    # Filter once; dropout (p=0.2) targets the 2nd and 4th OUTPUT variables.
    outputs = VariableFilter(roles=[OUTPUT])(cg.variables)
    cg_dropout1 = apply_dropout(cg, [outputs[1], outputs[3]], .2)
    cost_dropout1 = cg_dropout1.outputs[0]

    return cost_dropout1, cg_dropout1.parameters, cost
Beispiel #7
0
 def apply_dropout(self, dropout, variables=None):
     """Apply dropout to ``self.cg`` and refresh ``self._cost``.

     Parameters
     ----------
     dropout
         Drop probability; nothing happens when it is falsy or not
         greater than zero.
     variables
         Variables to drop.  When ``None``, the variables of ``self.cg``
         whose Theano name matches ``'linear.*input_'`` are used.
     """
     if dropout and dropout > 0:
         # Identity check: ``== None`` would defer to the argument's own
         # ``__eq__``, which is not a reliable "was it omitted" test.
         if variables is None:
             var_filter = VariableFilter(theano_name_regex='linear.*input_')
             variables = var_filter(self.cg.variables)
         self.cg = apply_dropout(self.cg, variables, dropout)
         self._cost = self.cg.outputs[0]
Beispiel #8
0
def build_mlp(features_int, features_cat, labels, labels_mean):
    """Build an MLP on concatenated features with a dropout MAPE cost.

    ``features_int`` and ``features_cat`` are concatenated along axis 1,
    passed through the MLP and scored with ``MAPECost`` against ``labels``
    and ``labels_mean``.

    Returns
    -------
    tuple
        ``(cost_dropout1, parameters, cost)``: the dropout graph's first
        output, that graph's parameters, and the dropout-free cost.
    """
    network_input = tensor.concatenate([features_int, features_cat], axis=1)

    # NOTE(review): four activations are paired with four dims (three
    # transitions) -- confirm this matches what MLP expects.
    activations = [Rectifier(), Rectifier(), Rectifier(), None]
    mlp = MLP(activations=activations,
              dims=[337, 800, 1200, 1],
              weights_init=IsotropicGaussian(),
              biases_init=Constant(1))
    mlp.initialize()

    prediction = mlp.apply(network_input)
    cost = MAPECost().apply(prediction, labels, labels_mean)

    cg = ComputationGraph(cost)
    # Dropout (p=0.2) on the OUTPUT-role variables at positions 1, 3 and 5.
    output_vars = VariableFilter(roles=[OUTPUT])(cg.variables)
    dropped = [output_vars[position] for position in (1, 3, 5)]
    cg_dropout1 = apply_dropout(cg, dropped, .2)

    return cg_dropout1.outputs[0], cg_dropout1.parameters, cost
Beispiel #9
0
def build_mlp(features_car_cat, features_car_int, features_nocar_cat,
              features_nocar_int, features_cp, features_hascar, means, labels):
    """Build an MLP on ``features_hascar`` plus looked-up mean features.

    The network input concatenates ``features_hascar`` with rows of
    ``means['cp']`` and ``means['dep']`` indexed by columns 0 and 1 of
    ``features_cp``.  The remaining feature arguments are accepted but not
    used here.

    Returns
    -------
    tuple
        ``(prediction, cost_dropout1, parameters, cost)``: the MLP output,
        the dropout graph's first output, that graph's parameters, and the
        dropout-free MAPE cost.
    """
    features = tensor.concatenate([
        features_hascar,
        means['cp'][features_cp[:, 0]],
        means['dep'][features_cp[:, 1]],
    ], axis=1)

    mlp = MLP(activations=[Rectifier(), Rectifier(), None],
              dims=[5, 50, 50, 1],
              weights_init=IsotropicGaussian(.1),
              biases_init=Constant(0),
              name='mlp')
    mlp.initialize()

    prediction = mlp.apply(features)

    cost = MAPECost().apply(labels, prediction)

    cg = ComputationGraph(cost)
    input_var = VariableFilter(roles=[INPUT])(cg.variables)
    # Parenthesised form prints the same on Python 2 and parses on Python 3.
    print(input_var)

    # Dropout (p=0.4) on the INPUT-role variables at positions 3 and 5.
    cg_dropout1 = apply_dropout(cg, [input_var[3], input_var[5]], .4)
    cost_dropout1 = cg_dropout1.outputs[0]

    return prediction, cost_dropout1, cg_dropout1.parameters, cost
Beispiel #10
0
def build_mlp(features_car_cat, features_car_int, features_nocar_cat,
              features_nocar_int, features_cp, features_hascar, means, labels):
    """Scale the ``build_mlp_onlyloc`` prediction by a learned CRM factor.

    The prediction returned by ``build_mlp_onlyloc`` is multiplied by a
    single-layer linear MLP applied to the first column of
    ``features_nocar_int``.

    Returns
    -------
    tuple
        ``(prediction, cost_dropout, parameters, cost)``: the scaled
        prediction, the dropout graph's first output, that graph's
        parameters, and the dropout-free MAPE cost.
    """
    # Only the prediction of the location-only model is reused.
    prediction, _, _, _ = build_mlp_onlyloc(
        features_car_cat, features_car_int,
        features_nocar_cat, features_nocar_int, features_cp, features_hascar,
        means, labels)

    mlp_crm = MLP(activations=[None],
                  dims=[1, 1],
                  weights_init=IsotropicGaussian(.1),
                  biases_init=Constant(0),
                  name='mlp_crm')
    mlp_crm.initialize()
    # First column of features_nocar_int, kept two-dimensional.
    crm = features_nocar_int[:, 0][:, None]

    prediction = prediction * mlp_crm.apply(crm)

    cost = MAPECost().apply(labels, prediction)

    cg = ComputationGraph(cost)
    input_var = VariableFilter(roles=[INPUT])(cg.variables)
    # Parenthesised form prints the same on Python 2 and parses on Python 3.
    print(input_var)

    # Dropout (p=0.4) on the INPUT-role variables at positions 7 and 5.
    cg_dropout = apply_dropout(cg, [input_var[7], input_var[5]], .4)
    cost_dropout = cg_dropout.outputs[0]

    return prediction, cost_dropout, cg_dropout.parameters, cost
Beispiel #11
0
 def _apply_dropout(self, outputs, *args, **kwargs):
     """Return ``outputs`` recomputed with dropout on both embedding tables.

     Dropout is applied to ``self.word_embed.W`` and
     ``self.hashtag_embed.W`` with probability
     ``self.config.dropout_prob`` and a fixed seed of 123.  Extra
     positional and keyword arguments are accepted and ignored.
     """
     embedding_weights = [self.word_embed.W, self.hashtag_embed.W]
     dropped_graph = apply_dropout(ComputationGraph(outputs),
                                   embedding_weights,
                                   drop_prob=self.config.dropout_prob,
                                   seed=123)
     return dropped_graph.outputs
Beispiel #12
0
def build_mlp(features_car_cat, features_car_int, features_nocar_cat,
              features_nocar_int, features_cp, features_hascar, means, labels):
    """Combine car, no-car and location models, scaled by a CRM factor.

    The prediction is the no-car MLP output, plus the car MLP output gated
    by ``features_hascar``, plus the ``build_mlp_onlyloc`` prediction, all
    multiplied by a single-layer linear MLP applied to the first column of
    ``features_nocar_int``.

    Returns
    -------
    tuple
        ``(prediction, cost_dropout1, parameters, cost)``: the combined
        prediction, the dropout graph's first output, that graph's
        parameters, and the dropout-free MAPE cost.
    """
    mlp_car = MLP(activations=[Rectifier(), Rectifier(), None],
                  dims=[8 + 185, 200, 200, 1],
                  weights_init=IsotropicGaussian(.1),
                  biases_init=Constant(0),
                  name='mlp_interval_car')
    mlp_car.initialize()
    mlp_nocar = MLP(activations=[Rectifier(), Rectifier(), None],
                    dims=[5 + 135, 200, 200, 1],
                    weights_init=IsotropicGaussian(.1),
                    biases_init=Constant(0),
                    name='mlp_interval_nocar')
    mlp_nocar.initialize()

    feature_car = tensor.concatenate((features_car_cat, features_car_int),
                                     axis=1)
    feature_nocar = tensor.concatenate(
        (features_nocar_cat, features_nocar_int), axis=1)
    prediction = mlp_nocar.apply(feature_nocar)
    # Gate the car model's output with the has-car feature.
    prediction += tensor.addbroadcast(features_hascar,
                                      1) * mlp_car.apply(feature_car)

    # Only the prediction of the location-only model is reused.
    prediction_loc, _, _, _ = build_mlp_onlyloc(
        features_car_cat, features_car_int,
        features_nocar_cat, features_nocar_int,
        features_cp, features_hascar,
        means, labels)
    prediction += prediction_loc

    # Multiply by a learned linear function of the CRM column.
    mlp_crm = MLP(activations=[None],
                  dims=[1, 1],
                  weights_init=IsotropicGaussian(.1),
                  biases_init=Constant(0),
                  name='mlp_crm')
    mlp_crm.initialize()
    # First column of features_nocar_int, kept two-dimensional.
    crm = features_nocar_int[:, 0][:, None]
    prediction = prediction * mlp_crm.apply(crm)

    cost = MAPECost().apply(labels, prediction)

    cg = ComputationGraph(cost)
    input_var = VariableFilter(roles=[INPUT])(cg.variables)
    # Parenthesised form prints the same on Python 2 and parses on Python 3.
    print(input_var)

    # Dropout (p=0.4) on the INPUT-role variables at positions 6 and 7.
    cg_dropout1 = apply_dropout(cg, [input_var[6], input_var[7]], .4)
    cost_dropout1 = cg_dropout1.outputs[0]

    return prediction, cost_dropout1, cg_dropout1.parameters, cost
Beispiel #13
0
def test_apply_dropout_custom_divisor():
    """Check ``apply_dropout`` output when ``custom_divisor`` is given.

    With drop probability 0.8, seed 2 and divisor 2.5, the dropped
    variable must equal its value times a binomial keep-mask (p=0.2) drawn
    from the same seed, divided by 2.5.
    """
    minuend = tensor.vector()
    subtrahend = tensor.vector()
    graph = ComputationGraph([minuend - subtrahend])
    dropped_graph = apply_dropout(graph, [subtrahend], 0.8, seed=2,
                                  custom_divisor=2.5)

    minuend_val = numpy.array([9., 8., 9.], dtype=theano.config.floatX)
    subtrahend_val = numpy.array([4., 5., 6.], dtype=theano.config.floatX)

    actual = dropped_graph.outputs[0].eval({minuend: minuend_val,
                                            subtrahend: subtrahend_val})
    keep_mask = MRG_RandomStreams(2).binomial((3,), p=0.2).eval()
    assert_allclose(actual,
                    minuend_val - (subtrahend_val * keep_mask / 2.5))
Beispiel #14
0
def test_apply_dropout():
    """Check ``apply_dropout`` with its default rescaling.

    With drop probability 0.4 and seed 1, the result must equal the
    undropped product times a binomial keep-mask (p=0.6) drawn from the
    same seed, divided by 0.6.
    """
    left = tensor.vector()
    right = tensor.vector()
    graph = ComputationGraph([left * right])
    dropped_graph = apply_dropout(graph, [left], 0.4, seed=1)

    left_val = numpy.array([5., 6., 7.], dtype=theano.config.floatX)
    right_val = numpy.array([1., 2., 3.], dtype=theano.config.floatX)

    actual = dropped_graph.outputs[0].eval({left: left_val,
                                            right: right_val})
    keep_mask = MRG_RandomStreams(1).binomial((3,), p=0.6).eval()
    assert_allclose(actual, left_val * right_val * keep_mask / 0.6)
Beispiel #15
0
def test_apply_dropout():
    """Dropout on ``x`` (p=0.4, seed=1) matches a hand-built expectation.

    The expectation multiplies ``x * y`` by a binomial mask with p=0.6
    from an ``MRG_RandomStreams`` seeded identically, then divides by 0.6.
    """
    x_var = tensor.vector()
    y_var = tensor.vector()
    product_graph = ComputationGraph([x_var * y_var])
    dropout_graph = apply_dropout(product_graph, [x_var], 0.4, seed=1)

    float_type = theano.config.floatX
    x_data = numpy.array([5., 6., 7.], dtype=float_type)
    y_data = numpy.array([1., 2., 3.], dtype=float_type)

    observed = dropout_graph.outputs[0].eval({x_var: x_data, y_var: y_data})
    mask = MRG_RandomStreams(1).binomial((3,), p=0.6).eval()
    expected = x_data * y_data * mask / 0.6
    assert_allclose(observed, expected)
Beispiel #16
0
def test_apply_dropout_custom_divisor():
    """Dropout on ``y`` with ``custom_divisor=2.5`` matches expectation.

    Uses drop probability 0.8 and seed 2; the expectation scales ``y`` by
    a binomial mask with p=0.2 from an identically seeded stream and
    divides by 2.5 before subtracting from ``x``.
    """
    x_var = tensor.vector()
    y_var = tensor.vector()
    difference_graph = ComputationGraph([x_var - y_var])
    dropout_graph = apply_dropout(difference_graph, [y_var], 0.8, seed=2,
                                  custom_divisor=2.5)

    float_type = theano.config.floatX
    x_data = numpy.array([9., 8., 9.], dtype=float_type)
    y_data = numpy.array([4., 5., 6.], dtype=float_type)

    observed = dropout_graph.outputs[0].eval({x_var: x_data, y_var: y_data})
    mask = MRG_RandomStreams(2).binomial((3,), p=0.2).eval()
    expected = x_data - (y_data * mask / 2.5)
    assert_allclose(observed, expected)
Beispiel #17
0
def dropout(cg):
    """Apply dropout (p=0.5) to every INPUT-role variable of a graph.

    Parameters
    ----------
    cg : ComputationGraph
        The graph whose INPUT-role variables are selected.

    Returns
    -------
    dropout_cg : ComputationGraph
        The graph returned by ``apply_dropout`` for those variables.
    """
    input_filter = VariableFilter(roles=[INPUT])
    return apply_dropout(cg, input_filter(cg.variables), 0.5)
Beispiel #18
0
def create_training_computation_graphs():
    """Build the plain graph and its batch-norm + dropout counterpart.

    Returns
    -------
    tuple
        ``(cg, bn_dropout_cg)``: the graph over ``[cost, accuracy]`` and
        the graph obtained by applying batch normalization followed by
        dropout (p=0.5) on selected OUTPUT-role variables.
    """
    features = tensor.tensor4('features')
    targets = tensor.imatrix('targets')

    convnet, mlp = create_model_bricks()
    flattened = convnet.apply(features).flatten(ndim=2)
    predictions = mlp.apply(flattened)
    cost = BinaryCrossEntropy().apply(targets, predictions)
    # Fraction of entries where thresholded target and prediction agree.
    accuracy = 1 - tensor.neq(targets > 0.5, predictions > 0.5).mean()
    cg = ComputationGraph([cost, accuracy])

    # Batch normalization first, then dropout on the OUTPUT-role variables
    # of three convnet layers and one brick taken from the MLP.
    bn_cg = apply_batch_normalization(cg)
    conv_bricks = [convnet.layers[index] for index in (5, 11, 17)]
    mlp_brick = mlp.application_methods[1].brick
    drop_filter = VariableFilter(roles=[OUTPUT],
                                 bricks=conv_bricks + [mlp_brick])
    bn_dropout_cg = apply_dropout(bn_cg, drop_filter(bn_cg.variables), 0.5)

    return cg, bn_dropout_cg
Beispiel #19
0
    def __init__(self, config, vocab_size):
        """Build the attention-based answer tagging graph and its bricks.

        Parameters
        ----------
        config
            Object providing ``embed_size``, ``question_lstm_size``,
            ``question_skip_connections``, ``ctx_lstm_size``,
            ``ctx_skip_connections``, ``attention_mlp_hidden``,
            ``attention_mlp_activations``, ``w_noise``, ``dropout``,
            ``weights_init`` and ``biases_init``.
        vocab_size
            Number of rows of the shared embedding lookup table.

        Sets ``self.predictions``, ``self.sgd_cost``, ``self.monitor_vars``
        and ``self.monitor_vars_valid``.
        """
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        # Swap the two axes of every input matrix.
        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding (concatenate layer1)
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1)
        cembed = embed.apply(context)
        clstms, chidden_list = make_bidir_lstm_stack(
            cembed, config.embed_size,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            # The last two hidden states come from the context stack, so
            # their width is ctx_lstm_size[-1]; the previous code used
            # question_lstm_size[-1], which only matched when the two
            # configurations happened to be equal.
            # NOTE(review): assumes make_bidir_lstm_stack sizes its layers
            # from the size list it is given -- confirm.
            cenc_dim = 2 * config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP
        attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] +
                            [Identity()],
                            name='attention_mlp')
        attention_qlinear = Linear(input_dim=qenc_dim,
                                   output_dim=config.attention_mlp_hidden[0],
                                   name='attq')
        attention_clinear = Linear(input_dim=cenc_dim,
                                   output_dim=config.attention_mlp_hidden[0],
                                   use_bias=False,
                                   name='attc')
        bricks += [attention_mlp, attention_qlinear, attention_clinear]
        # The context encoding is flattened to 2-D for the linear brick and
        # reshaped back; the question projection is broadcast over axis 0.
        layer1 = Tanh().apply(
            attention_clinear.apply(
                cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2]
                              ))).reshape((cenc.shape[0], cenc.shape[1],
                                           config.attention_mlp_hidden[0])) +
            attention_qlinear.apply(qenc)[None, :, :])
        layer1.name = 'layer1'
        att_weights = attention_mlp.apply(
            layer1.reshape(
                (layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
        att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
        att_weights = tensor.nnet.sigmoid(att_weights.T).T
        att_weights.name = 'att_weights'

        # Target is 1 wherever a context token equals any answer token.
        att_target = tensor.eq(
            tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)),
            tensor.tile(context[:, None, :],
                        (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1)
        # Masked binary cross-entropy, normalised by the mask total.
        cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) *
                context_mask).sum() / context_mask.sum()
        self.predictions = tensor.gt(att_weights, 0.1) * context

        # Apply weight noise and dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
    def __init__(self, config, vocab_size):
        """Build the pointer-network answer extraction graph and its bricks.

        Parameters
        ----------
        config
            Object providing ``embed_size``, ``question_lstm_size``,
            ``question_skip_connections``, ``ctx_lstm_size``,
            ``ctx_skip_connections``, ``decoder_data_dim``,
            ``decoder_lstm_output_dim``, ``w_noise``, ``dropout``,
            ``weights_init`` and ``biases_init``.
        vocab_size
            Number of rows of the shared embedding lookup table.

        Sets ``self.theano_params``, ``self.generations``,
        ``self.sgd_cost``, ``self.monitor_vars`` and
        ``self.monitor_vars_valid``.
        """
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')
        ans_indices = tensor.imatrix('ans_indices')  # n_steps * n_samples
        ans_indices_mask = tensor.imatrix('ans_indices_mask')

        context_bag = tensor.eq(context[:, :, None],
                                tensor.arange(vocab_size)).sum(axis=1).clip(
                                    0, 1)

        bricks = []

        # Swap the two axes of every input matrix.
        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)
        ans_indices = ans_indices.dimshuffle(1, 0)
        ans_indices_mask = ans_indices_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding (concatenate layer1)
        qembed = embed.apply(question)

        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1).  The question
        # encoding is repeated along axis 0 and appended to every context
        # embedding before it enters the context encoder.
        cembed = embed.apply(context)
        cqembed = tensor.concatenate(
            [
                cembed,
                tensor.extra_ops.repeat(
                    qenc[None, :, :], cembed.shape[0], axis=0)
            ],
            axis=2
        )
        clstms, chidden_list = make_bidir_lstm_stack(
            cqembed, config.embed_size + qenc_dim,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            # The last two hidden states come from the context stack, so
            # their width is ctx_lstm_size[-1] (previously taken from
            # question_lstm_size[-1]).
            # NOTE(review): assumes make_bidir_lstm_stack sizes its layers
            # from the size list it is given -- confirm.
            cenc_dim = 2 * config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Pointer-network decoder LSTM and attention parameters
        params = init_params(data_dim=config.decoder_data_dim,
                             lstm_dim=config.decoder_lstm_output_dim)
        tparams = init_tparams(params)

        # Tag the raw Theano parameters with roles so that role-based
        # filters (e.g. the WEIGHT filter below) can select them.
        add_role(tparams['lstm_de_W'], WEIGHT)
        add_role(tparams['lstm_de_U'], WEIGHT)
        add_role(tparams['lstm_de_b'], BIAS)
        add_role(tparams['ptr_v'], WEIGHT)
        add_role(tparams['ptr_W1'], WEIGHT)
        add_role(tparams['ptr_W2'], WEIGHT)
        # Materialise as a list so the attribute is a real sequence on
        # Python 3 as well (``values()`` is a view there).
        self.theano_params = list(tparams.values())

        n_steps = ans_indices.shape[0]
        n_samples = ans_indices.shape[1]
        preds, generations = ptr_network(
            tparams, cqembed, context_mask.astype(theano.config.floatX),
            ans_indices, ans_indices_mask.astype(theano.config.floatX),
            config.decoder_lstm_output_dim, cenc)

        self.generations = generations

        # Index grids used to pick, for every (step, sample) pair, the
        # predicted probability of the target index.
        idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'),
                                 tensor.ones((n_samples, ), dtype='int64'))
        idx_samples = tensor.outer(tensor.ones((n_steps, ), dtype='int64'),
                                   tensor.arange(n_samples, dtype='int64'))
        probs = preds[idx_steps, ans_indices, idx_samples]
        # Small offset keeps the logarithm finite for zero probabilities.
        off = 1e-8
        if probs.dtype == 'float16':
            off = 1e-6
        probs += off
        # Masked negative log-likelihood, averaged per sample then overall.
        cost = -tensor.log(probs)
        cost *= ans_indices_mask
        cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0)
        cost = cost.mean()

        # Apply weight noise and dropout
        cg = ComputationGraph([cost])

        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Beispiel #21
0
def main(name, epochs, batch_size, learning_rate,
         dim, mix_dim, old_model_name, max_length, bokeh, GRU, dropout,
         depth, max_grad, step_method, epsilon, sample, skip, uniform, top):

    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """Encode a positive float as a short string usable in a job tag.

        The result is the first character of the number's ``%e`` rendering
        followed by the negated floor of its base-10 logarithm; values
        less than or equal to zero give ``'0'``.
        E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2
        """
        if x <= 0:
            return '0'
        leading_digit = ("%e" % x)[0]
        negated_exponent = -np.floor(np.log10(x))
        return '%s%d' % (leading_digit, negated_exponent)

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (datasource, depth, dim, mix_dim,
                                           int(dropout*10),
                                           shnum(learning_rate), batch_size,
                                           shnum(epsilon))
    if max_length != 600:
        jobname += '-L%d'%max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g'%max_grad
    if step_method != 'adam':
        jobname += step_method
    if skip:
        jobname += 'D'
        assert depth > 1
    if top:
        jobname += 'T'
        assert depth > 1
    if uniform > 0.:
        jobname += 'u%d'%int(uniform*100)

    if debug:
        jobname += ".debug"

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)
    if old_model_name:
        print("starting from model %s"%old_model_name)

    #----------------------------------------------------------------------
    transitions = [GatedRecurrent(dim=dim) if GRU else LSTM(dim=dim)
                   for _ in range(depth)]
    if depth > 1:
        transition = RecurrentStack(transitions, name="transition",
                                    fast=True, skip_connections=skip or top)
        if skip:
            source_names=['states'] + ['states_%d'%d for d in range(1,depth)]
        else:
            source_names=['states_%d'%(depth-1)]
    else:
        transition = transitions[0]
        transition.name = "transition"
        source_names=['states']

    emitter = SketchEmitter(mix_dim=mix_dim,
                            epsilon=epsilon,
                            name="emitter")
    readout = Readout(
        readout_dim=emitter.get_dim('inputs'),
        source_names=source_names,
        emitter=emitter,
        name="readout")
    normal_inputs = [name for name in transition.apply.sequences
                     if 'mask' not in name]
    fork = Fork(normal_inputs, prototype=Linear(use_bias=True))
    generator = SequenceGenerator(readout=readout, transition=transition,
                                  fork=fork)

    # Initialization settings
    if uniform > 0.:
        generator.weights_init = Uniform(width=uniform*2.)
    else:
        generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps, batch_size, 3]
    x = T.tensor3('features', dtype=floatX)
    if debug:
        x.tag.test_value = np.ones((max_length,batch_size,3)).astype(floatX)
    x = x[:max_length,:,:]  # has to be after setting test_value
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_params()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, value.get_value().shape) for key, value
                     in params.items()],
                    width=120))
    model_size = 0
    for v in params.itervalues():
        s = v.get_value().shape
        model_size += s[0] * (s[1] if len(s) > 1 else 1)
    logger.info("Total number of parameters %d"%model_size)

    #------------------------------------------------------------
    extensions = []
    if old_model_name == 'continue':
        extensions.append(LoadFromDump(jobname))
    elif old_model_name:
        # or you can just load the weights without state using:
        old_params = LoadFromDump(old_model_name).manager.load_parameters()
        model.set_param_values(old_params)
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path=old_model_name).do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=transitions,
                                        name_regex='states')(cg.variables)
        print('# dropout %d' % len(dropout_target))
        cg = apply_dropout(cg, dropout_target, dropout)
        opt_cost = cg.outputs[0]
    else:
        opt_cost = cost

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate)
    else:
        raise Exception('Unknown sttep method %s'%step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(
        cost=opt_cost, params=cg.parameters,
        step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies,) = VariableFilter(
        applications=[generator.readout.readout],
        name_regex="output")(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    observables += [min_energy, max_energy]

    # (activations,) = VariableFilter(
    #     applications=[generator.transition.apply],
    #     name=generator.transition.apply.states[0])(cg.variables)
    # mean_activation = named_copy(abs(activations).mean(),
    #                              "mean_activation")
    # observables.append(mean_activation)

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(named_copy(
            param.norm(2), name + "_norm"))
        observables.append(named_copy(
            algorithm.gradients[param].norm(2), name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource+'.hdf5')

    train_ds = H5PYDataset(datasource_fname, #max_length=max_length,
                             which_set='train', sources=('features',),
                             load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(datasource_fname, #max_length=max_length,
                            which_set='test', sources=('features',),
                            load_in_memory=True)
    test_stream  = DataStream(test_ds,
                              iteration_scheme=SequentialScheme(
                                  test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        """Print how many batches and examples one epoch of `ds` yields.

        A full epoch of `ds` is iterated with batches requested as dicts;
        the number of examples in a batch is taken to be the size of
        axis 1 of its 'features' entry.
        """
        per_batch = [batch['features'].shape[1]
                     for batch in ds.get_epoch_iterator(as_dict=True)]
        print('%s #batch %d #examples %d' %
              (label, len(per_batch), sum(per_batch)))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [Timing(every_n_batches=10),
                   TrainingDataMonitoring(
                       observables, prefix="train",
                       every_n_batches=10),
                   DataStreamMonitoring(
                       [cost],  # without dropout
                       test_stream,
                       prefix="test",
                       on_resumption=True,
                       after_epoch=False,  # by default this is True
                       every_n_batches=100),
                   # all monitored data is ready so print it...
                   # (next steps may take more time and we want to see the
                   # results as soon as possible so print as soon as you can)
                   Printing(every_n_batches=10),
                   # perform multiple dumps at different intervals
                   # so if one of them breaks (has nan) we can hopefully
                   # find a model from few batches ago in the other
                   Dump(jobname, every_n_batches=11),
                   Dump(jobname+'.test', every_n_batches=100),
                   Sample(generator, steps=max_length,
                          path=jobname+'.test',
                          every_n_batches=100),
                   ProgressBar(),
                   FinishAfter(after_n_epochs=epochs)
                    # This shows a way to handle NaN emerging during
                    # training: simply finish it.
                    .add_condition("after_batch", _is_nan),
                   ]

    if bokeh:
        from blocks.extensions.plot import Plot
        extensions.append(Plot(
            'sketch',
            channels=[
                ['cost'],]))

    # Construct the main loop and start training!
    main_loop = MainLoop(
        model=model,
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions
        )

    main_loop.run()
    ## initialize the model
    dpm = model.DiffusionModel(spatial_width, n_colors, uniform_noise=uniform_noise, **model_args)
    dpm.initialize()

    ## set up optimization
    features = T.matrix('features', dtype=theano.config.floatX)
    cost = dpm.cost(features)
    blocks_model = blocks.model.Model(cost)
    cg_nodropout = ComputationGraph(cost)
    if args.dropout_rate > 0:
        # DEBUG this triggers an error on my machine
        # apply dropout to all the input variables
        inputs = VariableFilter(roles=[INPUT])(cg_nodropout.variables)
        # dropconnect
        # inputs = VariableFilter(roles=[PARAMETER])(cg_nodropout.variables)
        cg = apply_dropout(cg_nodropout, inputs, args.dropout_rate)
    else:
        cg = cg_nodropout
    step_compute = RMSProp(learning_rate=args.lr, max_scaling=1e10)
    algorithm = GradientDescent(step_rule=CompositeRule([RemoveNotFinite(),
        step_compute]),
        parameters=cg.parameters, cost=cost)
    extension_list = []
    extension_list.append(
        SharedVariableModifier(step_compute.learning_rate,
            extensions.decay_learning_rate,
            after_batch=False,
            every_n_batches=batches_per_epoch, ))
    extension_list.append(FinishAfter(after_n_epochs=100001))

    ## logging of test set performance
predict = top_mlp.apply(conv_out)

# ---------------------------------------------------------------
# Building computational graph
# ---------------------------------------------------------------

cost = CategoricalCrossEntropy().apply(y.flatten(), predict).copy(name='cost')
error = MisclassificationRate().apply(y.flatten(), predict)
error_rate = error.copy(name='error_rate')
error_rate2 = error.copy(name='error_rate2')
cg = ComputationGraph([cost, error_rate])
inputs = VariableFilter(roles=[INPUT])(cg.variables)
# Dropout targets are picked by their position in the filtered INPUT list.
# NOTE(review): these indices are hard-coded and mix negative and positive
# positions -- confirm they still select the intended inputs.
linear_inputs_index = [-10,-8,6]
linear_inputs = list(itemgetter(*linear_inputs_index)(inputs))
# apply_dropout returns a NEW graph; `cost` itself is left without dropout.
cg_dropout = apply_dropout(cg,linear_inputs, 0.5)

# ---------------------------------------------------------------
# Set ports listeners for Fuel data servers
# ---------------------------------------------------------------
data_valid_stream = ServerDataStream(('image_features','targets'), False, port=3040)
data_train_stream = ServerDataStream(('image_features','targets'), False, port=3041)


# ---------------------------------------------------------------
# Training settings
# ---------------------------------------------------------------
# Optimise the first output of the dropout graph (the dropout version of
# `cost`).  Passing the original `cost` here would train without dropout,
# because the dropout replacements only exist in `cg_dropout`.
# (Alternative step rule: Adam().)
algorithm = GradientDescent(cost=cg_dropout.outputs[0], parameters=cg_dropout.parameters, step_rule=Scale(learning_rate=learning_rate))

save_to = 'Glorot__overfeat_4conv_1full_bn.pkl'
def run_training(config, tr_stream, dev_stream=None, use_bokeh=True):
    """Build the baseline model, attach monitoring and run training.

    Parameters
    ----------
    config : object
        Must expose a ``params`` mapping.  The keys read here are
        ``data_dtype``, ``mask_dtype``, ``dropout``, ``finish_after``,
        ``step_clipping`` and ``step_rule``; the whole mapping is also
        handed to ``BaselineModel``.
    tr_stream : object
        Data stream passed to the main loop as training data.
    dev_stream : object, optional
        Accepted but not used by this function.
    use_bokeh : bool
        Plot the cost when the optional ``blocks_extras`` plotting
        extension can be imported.
    """

    # Monitoring extensions
    try:
        from blocks_extras.extensions.plot import Plot
        BOKEH_AVAILABLE = True
    except ImportError:
        BOKEH_AVAILABLE = False
    print('Bokeh avalablity: ' + str(BOKEH_AVAILABLE))

    logger = logging.getLogger(__name__)

    # Create Theano variables
    logger.info('Creating theano variables')
    x = T.tensor3('features', dtype=config.params['data_dtype'])
    x_mask = T.tensor3('features_mask', dtype=config.params['mask_dtype'])
    y = T.matrix('targets', dtype=config.params['data_dtype'])
    y_mask = T.matrix('targets_mask', dtype=config.params['mask_dtype'])

    # Construct model
    logger.info('Building baseline model')
    baseline_model = BaselineModel(config.params)
    baseline_model.initialize()

    cost = baseline_model.cost(subword_id_input_=x, subword_id_input_mask_=x_mask,
                               subword_id_target_=y, subword_id_target_mask_=y_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Objective handed to the optimiser; stays the plain cost unless dropout
    # is applied below.
    train_cost = cost

    # apply dropout for regularization
    if config.params['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        # Use a dedicated loop variable: reusing `x` would shadow (and on
        # Python 2 overwrite) the Theano input variable defined above.
        dropout_inputs = [var for var in cg.intermediary_variables
                          if var.name == 'maxout_apply_output']
        # Debug output describing the graph before dropout is applied.
        print(cg.intermediary_variables)
        print(cg.variables)
        print(cg.inputs)
        print(cg.parameters)

        print(dropout_inputs)
        cg = apply_dropout(cg, dropout_inputs, config.params['dropout'])
        # apply_dropout returns a new graph and leaves `cost` untouched, so
        # the dropout objective has to be taken from the new graph.
        train_cost = cg.outputs[0]

    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config.params['finish_after']),
        # The cost without dropout is what gets monitored and printed.
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True)
        #CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])]
    ]

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('Baseline model', channels=[['baselinemodel_cost_cost']],
                 after_batch=True))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # NOTE(review): eval() executes the configured 'step_rule' string as
    # Python code; the configuration must come from a trusted source.
    step_rule = CompositeRule([StepClipping(config.params['step_clipping']),
                               eval(config.params['step_rule'])()])
    algorithm = GradientDescent(
        cost=train_cost, parameters=cg.parameters,
        step_rule=step_rule
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    # NOTE(review): `model` receives the BaselineModel object itself rather
    # than a Model built from the cost -- confirm this is what MainLoop expects.
    main_loop = MainLoop(
        model=baseline_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )

    # Train
    main_loop.run()

    print('DONE TRAINING')
Beispiel #25
0
def main(job_id, params, config_file='params.ec'):
    """Train a dropout-regularised MLP classifier and return a Spearmint loss.

    Parameters
    ----------
    job_id : object
        Not used by this function.
    params : dict or None
        When truthy, ``params['base_lr'][0]``, ``params['dropout_ratio'][0]``,
        ``params['hidden_units'][0]`` and ``params['weight_decay'][0]``
        override the values read from the configuration file.
    config_file : str
        File name below ``./configs/``; options are read from its
        ``hyperparams`` section.  ``data_file`` is mandatory, every other
        option falls back to a built-in default.

    Returns
    -------
    float
        ``ve + abs(te - ve)`` where ``ve`` is the 'validation_error' and
        ``te`` the 'error' entry of the last epoch row of the training log.
    """
    config = ConfigParser.ConfigParser()
    # Close the configuration file deterministically instead of leaking it.
    with open('./configs/{}'.format(config_file)) as config_fp:
        config.readfp(config_fp)

    def hyperparam(option, default):
        """Return option `option` of [hyperparams], or `default` if absent."""
        # ConfigParser.get() takes no positional default (its third
        # positional argument is `raw`), so the fallback is explicit here.
        try:
            return config.get('hyperparams', option)
        except ConfigParser.NoOptionError:
            return default

    pr = pprint.PrettyPrinter(indent=4)
    pr.pprint(config)

    net_name = hyperparam('net_name', 'adni')
    struct_name = net_name.split('_')[0]

    max_epoch = int(hyperparam('max_iter', 100))
    base_lr = float(hyperparam('base_lr', 0.01))
    train_batch = int(hyperparam('train_batch', 256))
    valid_batch = int(hyperparam('valid_batch', 512))
    # Falls back to the validation batch size, which is what was used for
    # the test stream when no dedicated option exists.
    test_batch = int(hyperparam('test_batch', valid_batch))

    W_sd = float(hyperparam('W_sd', 0.01))
    W_mu = float(hyperparam('W_mu', 0.0))
    b_sd = float(hyperparam('b_sd', 0.01))
    b_mu = float(hyperparam('b_mu', 0.0))

    hidden_units = int(hyperparam('hidden_units', 32))
    input_dropout_ratio = float(hyperparam('input_dropout_ratio', 0.2))
    dropout_ratio = float(hyperparam('dropout_ratio', 0.2))
    weight_decay = float(hyperparam('weight_decay', 0.001))
    max_norm = float(hyperparam('max_norm', 100.0))
    solver = hyperparam('solver_type', 'rmsprop')
    # Mandatory option: a missing value raises NoOptionError.
    data_file = config.get('hyperparams', 'data_file')
    side = hyperparam('side', 'b')

    input_dim = input_dims[struct_name]

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        # Cast like the config-file values so layer sizes are plain ints.
        hidden_units = int(params['hidden_units'][0])
        weight_decay = float(params['weight_decay'][0])

    # Any solver name containing 'adagrad' selects AdaGrad, otherwise RMSProp.
    optimizer = AdaGrad if 'adagrad' in solver else RMSProp
    solver_type = CompositeRule([optimizer(learning_rate=base_lr),
                                 VariableClipping(threshold=max_norm)])

    if 'b' in side:
        # Both sides: load every source and concatenate the two feature sets.
        dataset_kwargs = {}
        x_l = tensor.matrix('l_features')
        x_r = tensor.matrix('r_features')
        x = tensor.concatenate([x_l, x_r], axis=1)
    else:
        feature_source = '{}_features'.format(side)
        dataset_kwargs = {'sources': [feature_source, 'targets']}
        x = tensor.matrix(feature_source)

    train = H5PYDataset(data_file, which_set='train', **dataset_kwargs)
    valid = H5PYDataset(data_file, which_set='valid', **dataset_kwargs)
    test = H5PYDataset(data_file, which_set='test', **dataset_kwargs)

    y = tensor.lmatrix('targets')

    # Define a feed-forward net with an input, two hidden layers, and a softmax output:
    mlp = MLP(activations=[
        Rectifier(name='h1'),
        Rectifier(name='h2'),
        Softmax(name='output'),
    ],
              dims=[
                  input_dim[side],
                  hidden_units,
                  hidden_units,
                  2],
              weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
              biases_init=IsotropicGaussian(b_sd, b_mu))

    # Don't forget to initialize params:
    mlp.initialize()

    # y_hat is the output of the neural net with x as its inputs
    y_hat = mlp.apply(x)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs: the first 'linear_*' input gets the input
    # dropout ratio, all remaining ones the hidden dropout ratio.
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [var for var in inputs if var.name.startswith('linear_')]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], input_dropout_ratio)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:], dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Learning Algorithm (notice: we use the dropout cost for learning):
    algo = GradientDescent(
        step_rule=solver_type,
        params=dropout_graph.parameters,
        cost=dropout_cost)

    # algo.step_rule.learning_rate.name = 'learning_rate'

    def shuffled_stream(dataset, batch_size):
        """Return a flattened, shuffled default stream over `dataset`."""
        return Flatten(
            DataStream.default_stream(
                dataset=dataset,
                iteration_scheme=ShuffledScheme(
                    dataset.num_examples,
                    batch_size=batch_size)))

    # Data stream used for training model:
    training_stream = shuffled_stream(train, train_batch)

    training_monitor = TrainingDataMonitoring([dropout_cost,
                                               aggregation.mean(error),
                                               aggregation.mean(algo.total_gradient_norm)],
                                              after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = shuffled_stream(valid, valid_batch)

    validation_monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=validation_stream,
        prefix='validation',
        after_epoch=True)

    test_stream = shuffled_stream(test, test_batch)

    test_monitor = DataStreamMonitoring(
        variables=[error],
        data_stream=test_stream,
        prefix='test',
        after_training=True)

    plotting = Plot('{}_{}'.format(net_name, side),
                    channels=[
                        ['dropout_entropy'],
                        ['error', 'validation_error'],
                    ],
                    after_batch=False)

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}/{}/{}'.format(struct_name, side, stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # Home-brewed class for early stopping when we detect we have started to overfit:
    # And by that I mean if the means of the val error and training error over the
    # previous 'epochs' is greater than the 'threshold', we are overfitting.
    early_stopper = FinishIfOverfitting(error_name='error',
                                        validation_name='validation_error',
                                        threshold=0.05,
                                        epochs=5,
                                        burn_in=100)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(
        data_stream=training_stream,
        model=model,
        algorithm=algo,
        extensions=[
            validation_monitor,
            training_monitor,
            plotting,
            FinishAfter(after_n_epochs=max_epoch),
            early_stopper,
            Printing(),
            ProgressBar(),
            checkpoint,
            test_monitor,
        ])
    main_loop.run()

    # 'error' is the channel logged by the training monitor, and
    # 'validation_error' the one logged by the validation monitor.
    last_row = main_loop.log.last_epoch_row
    valid_err = float(last_row['validation_error'])
    train_err = float(last_row['error'])
    # Penalise the gap between the two errors on top of the validation error.
    spearmint_loss = valid_err + abs(train_err - valid_err)
    # Single-argument call form prints identically on Python 2 and 3.
    print('Spearmint Loss: {}'.format(spearmint_loss))
    return spearmint_loss
def main(mode, config, use_bokeh=False):
    """Train an RNN encoder-decoder, or translate a test set with it.

    Parameters
    ----------
    mode : str
        ``"train"`` builds the cost graph, applies the configured
        regularisation and runs the training main loop.  ``'translate'``
        loads saved parameters and writes one beam-search translation per
        input line to ``config['test_set'] + '.trans.out'``.  For any other
        value only the encoder and decoder bricks are constructed.
    config : dict
        Experiment configuration; the keys used are the ones indexed in the
        body below.  In training mode it is also expanded as keyword
        arguments to ``get_tr_stream`` and ``get_dev_stream``.
    use_bokeh : bool
        Add a cost plot when ``BOKEH_AVAILABLE`` is also true.
    """

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Get training and development set streams
        tr_stream = get_tr_stream(**config)
        dev_stream = get_dev_stream(**config)

        # Get cost of the model
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [var for var in cg.intermediary_variables
                              if var.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            # Use get_parameters() throughout (the same accessor the
            # parameter-name listing below relies on) and materialise the
            # values as lists so they can be concatenated on Python 2 and 3.
            noisy_bricks = [
                encoder.lookup,
                encoder.fwd_fork,
                encoder.back_fork,
                decoder.sequence_generator.readout,
                decoder.sequence_generator.fork,
                decoder.state_init,
            ]
            noisy_params = []
            for brick in noisy_bricks:
                noisy_params += list(
                    Selector(brick).get_parameters().values())
            cg = apply_noise(cg, noisy_params, config['weight_noise_ff'])

        # apply_dropout / apply_noise return new graphs and leave `cost`
        # untouched.  The optimiser therefore has to use the output of the
        # final graph, otherwise neither regularisation affects training.
        # Without any regularisation this is `cost` itself.
        train_cost = cg.outputs[0]

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            # The cost without regularisation is what gets monitored.
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        # NOTE(review): eval() executes the configured 'step_rule' string as
        # Python code; the configuration must come from a trusted source.
        algorithm = GradientDescent(
            cost=train_cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()])
        )

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions
        )

        # Train!
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')

        # Get test set stream
        test_stream = get_dev_stream(
            config['test_set'], config['src_vocab'],
            config['src_vocab_size'], config['unk_id'])

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        # NOTE(review): unpickling can execute arbitrary code; only load
        # vocabulary files from a trusted source.
        with open(config['trg_vocab'], 'rb') as vocab_file:
            trg_vocab = _ensure_special_tokens(
                pickle.load(vocab_file), bos_idx=0,
                eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        # The context manager closes the output file even when translation
        # fails part-way through.
        with open(config['test_set'] + '.trans.out', 'w') as ftrans:
            for i, line in enumerate(test_stream.get_epoch_iterator()):

                seq = sutils._oov_to_unk(
                    line[0], config['src_vocab_size'], unk_idx)
                input_ = numpy.tile(seq, (config['beam_size'], 1))

                # draw sample, checking to ensure we don't get an empty
                # string back
                # NOTE(review): the end-of-line symbol passed to the search
                # is the SOURCE eos index -- confirm whether trg_eos_idx is
                # intended when the two vocabulary sizes differ.
                trans, costs = \
                    beam_search.search(
                        input_values={sampling_input: input_},
                        max_length=3*len(seq), eol_symbol=src_eos_idx,
                        ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if config['normalized_bleu']:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                best = numpy.argsort(costs)[0]
                try:
                    total_cost += costs[best]
                    trans_out = trans[best]

                    # convert idx to words
                    trans_out = sutils._idx_to_word(trans_out, trg_ivocab)

                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(i+1))
                    trans_out = '<UNK>'

                print(trans_out, file=ftrans)

                if i != 0 and i % 100 == 0:
                    logger.info(
                        "Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
Beispiel #27
0
    train_stream = stream.train(req_vars)
    valid_stream = stream.valid(req_vars)

    cost = model.cost(**inputs)
    cg = ComputationGraph(cost)
    monitored = set([cost] + VariableFilter(roles=[roles.COST])(cg.variables))

    valid_monitored = monitored
    if hasattr(model, 'valid_cost'):
        valid_cost = model.valid_cost(**inputs)
        valid_cg = ComputationGraph(valid_cost)
        valid_monitored = set([valid_cost] + VariableFilter(roles=[roles.COST])(valid_cg.variables))

    if hasattr(config, 'dropout') and config.dropout < 1.0:
        cg = apply_dropout(cg, config.dropout_inputs(cg), config.dropout)
    if hasattr(config, 'noise') and config.noise > 0.0:
        cg = apply_noise(cg, config.noise_inputs(cg), config.noise)
    cost = cg.outputs[0]
    cg = Model(cost)

    logger.info('# Parameter shapes:')
    parameters_size = 0
    for value in cg.parameters:
        logger.info('    %20s %s' % (value.get_value().shape, value.name))
        parameters_size += reduce(operator.mul, value.get_value().shape, 1)
    logger.info('Total number of parameters: %d in %d matrices' % (parameters_size, len(cg.parameters)))

    if hasattr(config, 'step_rule'):
        step_rule = config.step_rule
    else:
Beispiel #28
0
cost_graph = ComputationGraph([cost])

# This returns a list of weight vectors for each layer
W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

# Add some regularization to this model:
cost += weight_decay * l2_norm(W)
cost.name = 'entropy'

# computational graph with l2 reg
cost_graph = ComputationGraph([cost])

# Apply dropout to inputs:
inputs = VariableFilter([INPUT])(cost_graph.variables)
dropout_inputs = [input for input in inputs if input.name.startswith('linear_')]
dropout_graph = apply_dropout(cost_graph, dropout_inputs, dropout_ratio)
dropout_cost = dropout_graph.outputs[0]
dropout_cost.name = 'dropout_entropy'

# Learning Algorithm:
algo = GradientDescent(
    step_rule=solver_type,
    params=dropout_graph.parameters,
    cost=dropout_cost)

# Data stream used for training model:
training_stream = Flatten(
    DataStream.default_stream(
        dataset=train,
        iteration_scheme=ShuffledScheme(
            train.num_examples,
def main(save_to, num_epochs):
    """Train a dropout-regularised MLP on MNIST and checkpoint to `save_to`.

    The network has layer sizes 784-100-100-10 with two Tanh layers and a
    Softmax output.  Training optimises the dropout version of the cost
    for `num_epochs` epochs, while the test monitor evaluates the cost and
    error rate without dropout.
    """
    mlp = MLP(activations=[Tanh(), Tanh(), Softmax()],
              dims=[784, 100, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    flat_targets = y.flatten()
    probs = mlp.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(flat_targets, probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost, error_rate])
    cost.name = 'final_cost'

    # Dropout of 0.5 on the inputs of every linear layer but the first,
    # then 0.1 on the raw features themselves.
    hidden_inputs = VariableFilter(
        roles=[INPUT],
        bricks=mlp.linear_transformations[1:])(cg.variables)
    dropout_graph = apply_dropout(cg, hidden_inputs, 0.5)
    dropout_graph = apply_dropout(dropout_graph, [x], 0.1)
    dropout_cost, dropout_error_rate = dropout_graph.outputs

    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))

    algorithm = GradientDescent(
        cost=dropout_cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=0.1))

    # Test set in batches of 500, monitored without dropout.
    test_stream = Flatten(
        DataStream.default_stream(
            mnist_test,
            iteration_scheme=SequentialScheme(
                mnist_test.num_examples, 500)),
        which_sources=('features',))
    test_monitor = DataStreamMonitoring(
        [cost, error_rate], test_stream, prefix="test")

    # Dropout quantities and the gradient norm on the training batches.
    train_monitor = TrainingDataMonitoring(
        [dropout_cost, dropout_error_rate,
         aggregation.mean(algorithm.total_gradient_norm)],
        prefix="train",
        after_epoch=True)

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        test_monitor,
        train_monitor,
        Checkpoint(save_to),
        Printing(),
    ]

    if BLOCKS_EXTRAS_AVAILABLE:
        plot_channels = [
            ['test_final_cost',
             'test_misclassificationrate_apply_error_rate'],
            ['train_total_gradient_norm'],
        ]
        extensions.append(Plot('MNIST example', channels=plot_channels))

    # Training set in sequential batches of 50.
    train_stream = Flatten(
        DataStream.default_stream(
            mnist_train,
            iteration_scheme=SequentialScheme(
                mnist_train.num_examples, 50)),
        which_sources=('features',))

    main_loop = MainLoop(
        algorithm,
        train_stream,
        model=Model(dropout_cost),
        extensions=extensions)

    main_loop.run()
Beispiel #30
0
def main_run(_config, _log):
    """Train a GRU sentiment classifier on GloVe-embedded IMDB text.

    The function builds the Theano/Blocks graph (linear GloVe projection
    with a rectifier, a ``GatedRecurrentFull`` recurrent layer and a
    sigmoid score layer), applies dropout to the projected embeddings and to
    the last recurrent state, starts one Fuel data-server subprocess for the
    train set and one for the test set, and finally runs the Blocks main
    loop.

    Parameters
    ----------
    _config : dict
        Experiment configuration.  The entries read here are
        ``rnn_input_dim``, ``rnn_dim`` and ``wstd``.
    _log : logger
        Logger used to report the configuration.

    Raises
    ------
    KeyError
        If ``theano.config.device`` has no entry in the port table below.

    Notes
    -----
    The two server processes are stored in the module-level globals
    ``train_p`` and ``test_p``; this function starts them but never stops
    them.
    """
    from collections import namedtuple

    # Expose the configuration entries as attributes (c.wstd, c.rnn_dim, ...).
    c = namedtuple("Config", _config.keys())(*_config.values())

    _log.info("Running with" + str(_config))

    import theano
    from theano import tensor as T
    import numpy as np

    from dataset import IMDBText, GloveTransformer

    from blocks.initialization import Uniform, Constant, IsotropicGaussian, NdarrayInitialization, Identity, Orthogonal
    from blocks.bricks.recurrent import LSTM, SimpleRecurrent, GatedRecurrent
    from blocks.bricks.parallel import Fork

    from blocks.bricks import Linear, Sigmoid, Tanh, Rectifier
    from blocks import bricks

    from blocks.extensions import Printing, Timing
    from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring

    from blocks.extensions.plot import Plot
    from plot import PlotHistogram

    from blocks.algorithms import GradientDescent, Adam, Scale, StepClipping, CompositeRule, AdaDelta
    from blocks.graph import ComputationGraph, apply_dropout
    from blocks.main_loop import MainLoop
    from blocks.model import Model

    from cuboid.algorithms import AdaM, NAG
    from cuboid.extensions import EpochProgress

    from fuel.streams import DataStream, ServerDataStream
    from fuel.transformers import Padding

    from fuel.schemes import ShuffledScheme
    from Conv1D import Conv1D, MaxPooling1D
    from schemes import BatchwiseShuffledScheme
    from bricks import WeightedSigmoid, GatedRecurrentFull

    from multiprocessing import Process
    # ``fuel.server`` is accessed below, so import the submodule explicitly
    # instead of relying on another import having loaded it.
    import fuel.server
    import logging
    from initialization import SumInitialization

    from transformers import DropSources

    global train_p
    global test_p

    # ------------------------------------------------------------------
    # Model
    # ------------------------------------------------------------------
    # No mask input is used: the servers below drop the "features_mask"
    # source before sending batches.
    x = T.tensor3("features")
    y = T.imatrix("targets")

    dropout_variables = []
    embedding_size = 300
    glove_version = "glove.6B.300d.txt"

    # Project the GloVe vectors to the recurrent input size.
    gloveMapping = Linear(
        input_dim=embedding_size,
        output_dim=c.rnn_input_dim,
        weights_init=Orthogonal(),
        biases_init=Constant(0.0),
        name="gloveMapping",
    )
    gloveMapping.initialize()
    o = gloveMapping.apply(x)
    o = Rectifier(name="gloveRec").apply(o)
    dropout_variables.append(o)

    # Auxiliary ("deeply supervised") head on the summed embeddings.  Its
    # output is currently not part of the cost, but the bricks are still
    # created and initialized.
    summed_mapped_glove = o.sum(axis=1)  # take out the sequence
    glove_out = Linear(
        input_dim=c.rnn_input_dim,
        # A layer dimension must be an integer (the original passed 1.0).
        output_dim=1,
        weights_init=IsotropicGaussian(c.wstd),
        biases_init=Constant(0.0),
        name="mapping_to_output",
    )
    glove_out.initialize()
    deeply_sup_0 = glove_out.apply(summed_mapped_glove)
    deeply_sup_probs = Sigmoid(name="deeply_sup_softmax").apply(deeply_sup_0)

    input_dim = c.rnn_input_dim
    hidden_dim = c.rnn_dim

    gru = GatedRecurrentFull(
        hidden_dim=hidden_dim,
        activation=Tanh(),
        gate_activation=Sigmoid(),
        state_to_state_init=SumInitialization([Identity(1.0), IsotropicGaussian(c.wstd)]),
        state_to_reset_init=IsotropicGaussian(c.wstd),
        state_to_update_init=IsotropicGaussian(c.wstd),
        input_to_state_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            biases_init=Constant(0.0),
        ),
        input_to_update_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            biases_init=Constant(-1.0),
        ),
        input_to_reset_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            biases_init=Constant(-2.0),
        ),
    )
    gru.initialize()
    # Swap the first two axes so that the sequence axis comes first.
    rnn_in = o.dimshuffle(1, 0, 2)
    rnn_out = gru.apply(rnn_in)
    state_to_state = gru.rnn.state_to_state
    state_to_state.name = "state_to_state"
    # Keep only the last step of the recurrent output.
    o = rnn_out[-1]
    dropout_variables.append(o)

    score_layer = Linear(
        input_dim=hidden_dim,
        output_dim=1,
        weights_init=IsotropicGaussian(std=c.wstd),
        biases_init=Constant(0.0),
        name="linear2",
    )
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    # Binary cross-entropy and the fraction of examples on the wrong side
    # of the 0.5 threshold.
    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = "cost"
    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = "misclassification"

    # ------------------------------------------------------------------
    # Training graph and algorithm
    # ------------------------------------------------------------------
    cg = ComputationGraph([cost])
    cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5)
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule(
            [
                StepClipping(threshold=4),
                Adam(learning_rate=0.002, beta1=0.1, beta2=0.001),
            ]
        ),
    )

    # ------------------------------------------------------------------
    # Data servers
    # ------------------------------------------------------------------
    print("setting up data")
    # Port per (device, split); devices missing from this table raise
    # KeyError further down.
    ports = {
        "gpu0_train": 5557,
        "gpu0_test": 5558,
        "cuda0_train": 5557,
        "cuda0_test": 5558,
        "opencl0:0_train": 5557,
        "opencl0:0_test": 5558,
        "gpu1_train": 5559,
        "gpu1_test": 5560,
    }

    batch_size = 40

    def start_server(port, which_set):
        """Serve padded, GloVe-transformed batches of ``which_set`` on ``port``."""
        fuel.server.logger.setLevel("WARN")
        dataset = IMDBText(which_set, sorted=True)

        n_train = dataset.num_examples
        scheme = BatchwiseShuffledScheme(examples=n_train, batch_size=batch_size)

        stream = DataStream(dataset=dataset, iteration_scheme=scheme)
        print("loading glove")
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(
            data_stream=glove,
            mask_sources=("features",),
        )

        # The model takes no mask input, so the mask source is removed.
        padded = DropSources(padded, ["features_mask"])

        fuel.server.start_server(padded, port=port, hwm=20)

    train_port = ports[theano.config.device + "_train"]
    train_p = Process(target=start_server, args=(train_port, "train"))
    train_p.start()

    test_port = ports[theano.config.device + "_test"]
    test_p = Process(target=start_server, args=(test_port, "test"))
    test_p.start()

    train_stream = ServerDataStream(("features", "targets"), port=train_port)
    test_stream = ServerDataStream(("features", "targets"), port=test_port)

    # ------------------------------------------------------------------
    # Main loop
    # ------------------------------------------------------------------
    print("setting up model")

    n_examples = 25000
    # Report the same number that is handed to EpochProgress (the original
    # printed n_examples // (batch_size + 1), which disagreed with it).
    batches_per_epoch = n_examples // batch_size + 1
    print("Batches per epoch %d" % batches_per_epoch)
    monitor_rate = 50

    model = Model(cg.outputs[0])
    extensions = []
    extensions.append(EpochProgress(batch_per_epoch=batches_per_epoch))
    extensions.append(TrainingDataMonitoring([cost, misclassification], prefix="train", every_n_batches=monitor_rate))

    extensions.append(
        DataStreamMonitoring(
            [cost, misclassification],
            data_stream=test_stream,
            prefix="test",
            after_epoch=True,
            before_first_epoch=False,
        )
    )

    extensions.append(Timing())
    extensions.append(Printing())

    extensions.append(
        Plot(
            theano.config.device + "_result",
            channels=[["train_cost"], ["train_misclassification"]],
            every_n_batches=monitor_rate,
        )
    )

    main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions)
    main_loop.run()
Beispiel #31
0
def _activation_from_name(activation_str):
    """Return the ``apply`` method of the activation brick named by ``activation_str``.

    The lookup is case-insensitive.  Raises ``Exception`` for unknown names.
    """
    # TO DO : leaky relu
    key = activation_str.lower()
    if key == 'rectifier':
        return Rectifier().apply
    if key == 'tanh':
        return Tanh().apply
    if key in ('sigmoid', 'logistic'):
        return Logistic().apply
    if key in ('id', 'identity'):
        return Identity().apply
    # The original passed the name as a second Exception argument, so the
    # "%s" placeholder was never filled in.
    raise Exception("unknown activation function : %s" % activation_str)


def build_submodel(input_shape,
                   output_dim,
                   L_dim_conv_layers,
                   L_filter_size,
                   L_pool_size,
                   L_activation_conv,
                   L_dim_full_layers,
                   L_activation_full,
                   L_exo_dropout_conv_layers,
                   L_exo_dropout_full_layers,
                   L_endo_dropout_conv_layers,
                   L_endo_dropout_full_layers,
                   L_border_mode=None,
                   L_filter_step=None,
                   L_pool_step=None):
    """Build a convolutional + fully-connected classifier with shrunken layers.

    "Exo" dropout rates shrink the layers themselves: every layer is built
    with ``size - int(size * rate)`` units/filters.  "Endo" dropout rates
    are applied with ``apply_dropout`` to the inputs of the layers named
    ``layer_<index>``.

    Parameters
    ----------
    input_shape : sequence of length 3
        ``input_shape[0]`` is the number of channels and ``input_shape[1:]``
        the image size.
    output_dim : int
        Number of outputs of the final linear layer.
    L_dim_conv_layers, L_filter_size, L_pool_size, L_activation_conv : list
        Per-convolutional-layer number of filters, filter size, pooling size
        (``None`` means no pooling) and activation name.
    L_dim_full_layers, L_activation_full : list
        Per-fully-connected-layer size and activation name.
    L_exo_dropout_conv_layers : list
        One rate per convolutional layer; the first one must be 0.0.
    L_exo_dropout_full_layers : list
        One rate per fully-connected layer plus one leading entry.
    L_endo_dropout_conv_layers, L_endo_dropout_full_layers : list
        Rates for ``apply_dropout``; the fully-connected list has one entry
        more than ``L_dim_full_layers`` (the extra one is for the output layer).
    L_border_mode, L_filter_step, L_pool_step : list, optional
        Per-convolutional-layer border mode (default ``"valid"``), filter
        step and pooling step.

    Returns
    -------
    tuple
        ``(cg, error_rate, cost, D_params, D_kind)`` where ``cg`` is the
        computation graph of the cost with the endo dropout applied,
        ``error_rate`` and ``cost`` are the variables without dropout, and
        ``D_params``/``D_kind`` are whatever ``build_params`` returns.
    """
    # TO DO : target size and name of the features

    x = T.tensor4('features')
    y = T.imatrix('targets')

    assert len(input_shape) == 3, "input_shape must be a 3d tensor"

    num_channels = input_shape[0]
    image_size = tuple(input_shape[1:])
    print(image_size)
    print(num_channels)
    # ``output_dim`` is reused below as the running feature size, so keep
    # the requested number of outputs under its own name.
    num_outputs = output_dim

    # CONVOLUTION
    output_conv = x
    output_dim = num_channels*np.prod(image_size)
    conv_layers = []
    n_conv = len(L_dim_conv_layers)
    assert n_conv == len(L_filter_size)
    if L_filter_step is None:
        L_filter_step = [None] * n_conv
    assert n_conv == len(L_pool_size)
    if L_pool_step is None:
        L_pool_step = [None] * n_conv
    assert n_conv == len(L_pool_step)
    assert n_conv == len(L_activation_conv)
    if L_border_mode is None:
        L_border_mode = ["valid"] * n_conv
    assert n_conv == len(L_border_mode)
    assert n_conv == len(L_endo_dropout_conv_layers)
    assert n_conv == len(L_exo_dropout_conv_layers)

    # regarding the batch dropout : the dropout is applied on the filter
    # which is equivalent to the output dimension
    # you have to look at the dropout_rate of the next layer
    # that is why we need to have the first dropout value of L_exo_dropout_full_layers

    # the first value has to be 0.0 in this context, and we'll
    # assume that it is, but let's have an assert.  Without any
    # convolutional layer there is no first value to check.
    if n_conv:
        assert L_exo_dropout_conv_layers[0] == 0.0, "L_exo_dropout_conv_layers[0] has to be 0.0 in this context. There are ways to make it work, of course, but we don't support this with this scripts."

    # Shift the rates by one layer: each layer is shrunk by the rate of
    # the layer that follows it.
    L_exo_dropout_conv_layers = list(L_exo_dropout_conv_layers[1:]) + [L_exo_dropout_full_layers[0]]

    if n_conv:
        for index, (num_filters, filter_size, filter_step,
                    pool_size, pool_step, activation_str, border_mode,
                    dropout) in enumerate(zip(L_dim_conv_layers,
                                              L_filter_size,
                                              L_filter_step,
                                              L_pool_size,
                                              L_pool_step,
                                              L_activation_conv,
                                              L_border_mode,
                                              L_exo_dropout_conv_layers)):

            # convert filter_size, filter_step and pool_size to tuples;
            # (0, 0) is the marker for "no pooling"
            filter_size = tuple(filter_size)
            filter_step = (1, 1) if filter_step is None else tuple(filter_step)
            pool_size = (0, 0) if pool_size is None else tuple(pool_size)

            activation = _activation_from_name(activation_str)

            assert 0.0 <= dropout and dropout < 1.0
            num_filters = num_filters - int(num_filters*dropout)

            print("border_mode : %s" % border_mode)

            # filter_step
            # http://blocks.readthedocs.org/en/latest/api/bricks.html#module-blocks.bricks.conv

            kwargs = {}
            if filter_step != (1, 1):
                # there's a bit of a mix of names because `Convolutional` takes
                # a "step" argument, but `ConvolutionActivation` takes "conv_step" argument
                kwargs['conv_step'] = filter_step

            if pool_size[0] == 0 and pool_size[1] == 0:
                layer_conv = ConvolutionalActivation(activation=activation,
                                                     filter_size=filter_size,
                                                     num_filters=num_filters,
                                                     border_mode=border_mode,
                                                     name="layer_%d" % index,
                                                     **kwargs)
            else:
                if pool_step is not None:
                    kwargs['pooling_step'] = tuple(pool_step)

                layer_conv = ConvolutionalLayer(activation=activation,
                                                filter_size=filter_size,
                                                num_filters=num_filters,
                                                border_mode=border_mode,
                                                pooling_size=pool_size,
                                                name="layer_%d" % index,
                                                **kwargs)

            conv_layers.append(layer_conv)

        convnet = ConvolutionalSequence(conv_layers, num_channels=num_channels,
                                        image_size=image_size,
                                        weights_init=Uniform(width=0.1),
                                        biases_init=Constant(0.0),
                                        name="conv_section")
        convnet.push_allocation_config()
        convnet.initialize()
        output_dim = np.prod(convnet.get_dim('output'))
        output_conv = convnet.apply(output_conv)

    output_conv = Flattener().apply(output_conv)

    # FULLY CONNECTED
    output_mlp = output_conv
    full_layers = []
    n_full = len(L_dim_full_layers)
    assert n_full == len(L_activation_full)
    assert n_full + 1 == len(L_endo_dropout_full_layers)
    assert n_full + 1 == len(L_exo_dropout_full_layers)

    # regarding the batch dropout : the dropout is applied on the filter
    # which is equivalent to the output dimension
    # you have to look at the dropout_rate of the next layer
    # that is why we throw away the first value of L_exo_dropout_full_layers
    L_exo_dropout_full_layers = L_exo_dropout_full_layers[1:]
    pre_dim = output_dim
    print("When constructing the model, the output_dim of the conv section is %d." % output_dim)
    if n_full:
        for offset, (dim, activation_str, dropout) in enumerate(zip(L_dim_full_layers,
                                                                    L_activation_full,
                                                                    L_exo_dropout_full_layers)):
            # Layer indices continue after the convolutional layers.
            index = n_conv + offset

            activation = _activation_from_name(activation_str)

            assert 0.0 <= dropout and dropout < 1.0
            dim = dim - int(dim*dropout)
            print("When constructing the fully-connected section, we apply dropout %f to add an MLP going from pre_dim %d to dim %d." % (dropout, pre_dim, dim))

            layer_full = MLP(activations=[activation], dims=[pre_dim, dim],
                             weights_init=Uniform(width=0.1),
                             biases_init=Constant(0.0),
                             name="layer_%d" % index)
            layer_full.initialize()
            full_layers.append(layer_full)
            pre_dim = dim

        for layer in full_layers:
            output_mlp = layer.apply(output_mlp)

        # Size of the last (shrunken) fully-connected layer.
        output_dim = pre_dim

    # COST FUNCTION
    output_layer = Linear(output_dim, num_outputs,
                          weights_init=Uniform(width=0.1),
                          biases_init=Constant(0.0),
                          name="layer_"+str(n_conv + n_full))
    output_layer.initialize()
    full_layers.append(output_layer)
    logits = output_layer.apply(output_mlp)
    # SOFTMAX and log likelihood.  A single softmax feeds both the cost and
    # the error rate (the original applied two identical Softmax bricks to
    # the same logits).
    # be careful. one version expects the output of a softmax; the other expects just the
    # output of the network
    y_hat = Softmax().apply(logits)
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    cost.name = "cost"

    # Misclassification
    error_rate_brick = MisclassificationRate()
    error_rate = error_rate_brick.apply(y.flatten(), y_hat)
    error_rate.name = "error_rate"

    # put names
    D_params, D_kind = build_params(x, T.matrix(), conv_layers, full_layers)

    cg = ComputationGraph(cost)

    # DROPOUT
    L_endo_dropout = list(L_endo_dropout_conv_layers) + list(L_endo_dropout_full_layers)

    for (index, drop_rate) in enumerate(L_endo_dropout):
        # Every dropout is applied to the graph returned by the previous
        # one (the original always restarted from the dropout-free graph,
        # so only the last dropout was kept).  The inputs are looked up
        # again on the current graph and matched by name.
        inputs = VariableFilter(roles=[INPUT])(cg.variables)
        for input_ in inputs:
            m = re.match(r"layer_(\d+)_apply.*", input_.name or "")
            if m and index == int(m.group(1)):
                if drop_rate < 0.0001:
                    print("Skipped applying dropout on %s because the dropout rate was under 0.0001." % input_.name)
                else:
                    cg = apply_dropout(cg, [input_], drop_rate)
                    print("Applied dropout %f on %s." % (drop_rate, input_.name))
                # Only the first matching input of a layer is considered.
                break

    return (cg, error_rate, cost, D_params, D_kind)
    def __init__(self, ref_data, output_dim):
        """Build an averaged ensemble of ``nparts`` sub-models.

        Every sub-model sees a random subset of the columns of ``ref_data``
        (each column kept with probability ``part_r_proba``), encodes the
        selected reference rows, decodes them again for a reconstruction
        penalty, generates intermediate weights from the encoding and maps
        ``x`` through them to ``output_dim`` outputs.  The sub-model outputs
        are averaged before the softmax cost.

        Sets ``cost``, ``cost_reg``, ``error_rate``, ``error_rate_reg``,
        ``pred`` and ``confidence`` on the instance.
        """
        shared_ref = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name='ref_data')

        # Symbolic inputs of the model
        j = tensor.lvector('j')
        x = tensor.fmatrix('x')
        y = tensor.ivector('y')

        part_outputs = []
        penalties = []
        # Variables receiving the r / s / i dropout respectively
        r_dropout_vars = []
        s_dropout_vars = []
        i_dropout_vars = []

        for part in range(nparts):
            # Draw the column subset used by this part
            column_mask = numpy.random.binomial(1, part_r_proba, size=(ref_data.shape[1],))
            input_dim = int(column_mask.sum())

            mask_sh = theano.shared(column_mask)
            r = shared_ref[j, :][:, mask_sh.nonzero()[0]]

            encoder = MLP(activations=activation_functions_0,
                          dims=[input_dim] + hidden_dims_0, name='enc%d'%part)
            decoder = MLP(activations=[None],
                          dims=[hidden_dims_0[-1], input_dim], name='dec%d'%part)
            weight_gen = MLP(activations=activation_functions_1,
                             dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter],
                             name='inter_gen_%d'%part)
            head = MLP(activations=activation_functions_2 + [None],
                       dims=[n_inter] + hidden_dims_2 + [output_dim],
                       name='end_mlp_%d'%part)

            encod = encoder.apply(r)
            rprime = decoder.apply(encod)
            inter_weights = weight_gen.apply(encod)

            ibias = Bias(n_inter, name='inter_bias_%d'%part)
            projected = tensor.dot(x, inter_weights)
            inter = inter_act_fun.apply(ibias.apply(projected))

            out = head.apply(inter)

            # Mean reconstruction distance, kept as a one-element vector so
            # the penalties can be concatenated later
            recon_dist = tensor.sqrt(((rprime - r)**2).sum(axis=1))
            penalties.append(recon_dist.mean()[None])

            part_outputs.append(out)

            r_dropout_vars.append(r)
            tanh_outputs = VariableFilter(bricks=[Tanh], name='output')(
                ComputationGraph([inter_weights]))
            s_dropout_vars.extend(tanh_outputs)
            i_dropout_vars.append(inter)

            # Initialize parameters
            for brick in (encoder, decoder, weight_gen, head, ibias):
                brick.weights_init = IsotropicGaussian(0.01)
                brick.biases_init = Constant(0.001)
                brick.initialize()

        # Average the sub-model outputs along a new trailing axis
        stacked = tensor.concatenate([out[:, :, None] for out in part_outputs], axis=2)
        final = stacked.mean(axis=2)

        cost = Softmax().categorical_cross_entropy(y, final)
        confidence = Softmax().apply(final)

        pred = final.argmax(axis=1)
        error_rate = tensor.neq(y, pred).mean()

        # Regularized versions of the cost and the error rate
        cg = ComputationGraph([cost, error_rate])

        if w_noise_std != 0:
            # noise on the weight variables
            cg = apply_noise(cg, VariableFilter(roles=[WEIGHT])(cg), w_noise_std)

        if s_dropout != 0:
            cg = apply_dropout(cg, s_dropout_vars, s_dropout)
        if r_dropout != 0:
            cg = apply_dropout(cg, r_dropout_vars, r_dropout)
        if i_dropout != 0:
            cg = apply_dropout(cg, i_dropout_vars, i_dropout)

        cost_reg, error_rate_reg = cg.outputs

        # Add the summed reconstruction penalties of all parts
        total_penalty = tensor.concatenate(penalties, axis=0).sum()
        cost_reg = cost_reg + reconstruction_penalty * total_penalty

        self.cost = cost
        self.cost_reg = cost_reg
        self.error_rate = error_rate
        self.error_rate_reg = error_rate_reg
        self.pred = pred
        self.confidence = confidence
    def __init__(self, ref_data, output_dim):
        """Build the model graph and its regularized training cost.

        The selected rows ``r`` of ``ref_data`` are encoded by ``mlp0`` and
        decoded again by ``mlp0vs`` (for the reconstruction penalty);
        ``mlp1`` turns the encoding into intermediate weights through which
        ``x`` is projected, and ``mlp2`` maps the result to ``output_dim``
        outputs.

        Parameters
        ----------
        ref_data : numpy array
            Reference data; ``ref_data.shape[1]`` is the encoder input size.
        output_dim : int
            Number of outputs of the final MLP.

        Sets ``cost``, ``cost_reg``, ``error_rate``, ``error_rate_reg``,
        ``pred`` and ``confidence`` on the instance.
        """
        input_dim = ref_data.shape[1]

        ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name="ref_data")

        # Construct the model
        j = tensor.lvector("j")
        r = ref_data_sh[j, :]
        x = tensor.fmatrix("x")
        y = tensor.ivector("y")

        # input_dim must be nr
        mlp0 = MLP(activations=activation_functions_0, dims=[input_dim] + hidden_dims_0, name="e0")
        mlp0vs = MLP(activations=[None], dims=[hidden_dims_0[-1], input_dim], name="de0")
        mlp1 = MLP(
            activations=activation_functions_1, dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter], name="inter_gen"
        )
        mlp2 = MLP(
            activations=activation_functions_2 + [None], dims=[n_inter] + hidden_dims_2 + [output_dim], name="end_mlp"
        )

        encod = mlp0.apply(r)
        rprime = mlp0vs.apply(encod)
        inter_weights = mlp1.apply(encod)

        ibias = Bias(n_inter)
        ibias.biases_init = Constant(0)
        ibias.initialize()
        inter = inter_act_fun.apply(ibias.apply(tensor.dot(x, inter_weights)))

        final = mlp2.apply(inter)

        cost = Softmax().categorical_cross_entropy(y, final)
        confidence = Softmax().apply(final)

        pred = final.argmax(axis=1)
        error_rate = tensor.neq(y, pred).mean()

        # Initialize parameters
        for brick in [mlp0, mlp0vs, mlp1, mlp2]:
            brick.weights_init = IsotropicGaussian(0.01)
            brick.biases_init = Constant(0.001)
            brick.initialize()

        # apply regularization
        cg = ComputationGraph([cost, error_rate])

        # Tanh outputs inside the weight generator, excluding its final
        # output.  Computed unconditionally because the a_dropout branch
        # needs it too: the original only bound it when s_dropout != 0 and
        # failed with an unbound local when s_dropout == 0 and a_dropout != 0.
        s_dropout_vars = list(
            set(VariableFilter(bricks=[Tanh], name="output")(ComputationGraph([inter_weights])))
            - set([inter_weights])
        )

        # NOTE(review): r, inter and the filtered variables all come from
        # the graph as built above; confirm that apply_dropout still finds
        # them once an earlier replacement has produced a new cg.

        if r_dropout != 0:
            # - dropout on input vector r : r_dropout
            cg = apply_dropout(cg, [r], r_dropout)

        if s_dropout != 0:
            # - dropout on intermediate layers of first mlp : s_dropout
            cg = apply_dropout(cg, s_dropout_vars, s_dropout)

        if i_dropout != 0:
            # - dropout on input to second mlp : i_dropout
            cg = apply_dropout(cg, [inter], i_dropout)

        if a_dropout != 0:
            # - dropout on hidden layers of second mlp : a_dropout
            a_dropout_vars = list(
                set(VariableFilter(bricks=[Tanh], name="output")(ComputationGraph([final])))
                - set([inter_weights])
                - set(s_dropout_vars)
            )
            cg = apply_dropout(cg, a_dropout_vars, a_dropout)

        if w_noise_std != 0:
            # - apply noise on weight variables
            weight_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, weight_vars, w_noise_std)

        [cost_reg, error_rate_reg] = cg.outputs

        # add reconstruction penalty for AE part
        penalty_val = tensor.sqrt(((r - rprime) ** 2).sum(axis=1)).mean()
        cost_reg = cost_reg + reconstruction_penalty * penalty_val

        self.cost = cost
        self.cost_reg = cost_reg
        self.error_rate = error_rate
        self.error_rate_reg = error_rate_reg
        self.pred = pred
        self.confidence = confidence
Beispiel #34
0
def construct_model(input_dim, out_dim):
    """Build the MLP + LSTM classifier and its regularized cost.

    Parameters
    ----------
    input_dim : int
        Number of features per row of ``r``; the MLP input size is
        ``input_dim + 1`` because one value of ``x`` is appended.
    out_dim : int
        Number of outputs of the final linear layer.

    Returns
    -------
    tuple
        ``(cost_reg, error_rate_reg, cost, error_rate)``: the cost and the
        error rate with weight noise and dropout applied, followed by the
        same two quantities without regularization.
    """
    # Construct the model
    r = tensor.fmatrix('r')
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    nx = x.shape[0]

    # r is nj x nr
    # x is nx x nj
    # y is nx

    # r_rep is nx x nj x nr
    r_rep = r[None, :, :].repeat(axis=0, repeats=nx)
    # x3 is nx x nj x 1
    x3 = x[:, :, None]

    # concat is nx x nj x (nr + 1)
    concat = tensor.concatenate([r_rep, x3], axis=2)

    # Change concat from Batch x Time x Features to T X B x F
    mlp_input = concat.dimshuffle(1, 0, 2)

    if use_ensembling:
        # Split time dimension into batches of size num_feats
        # Join that dimension with the B dimension
        # Floor division keeps the shape integral whatever the division
        # semantics of the running Python version are.
        ens_shape = (num_feats,
                     mlp_input.shape[0] // num_feats,
                     mlp_input.shape[1])
        mlp_input = mlp_input.reshape(ens_shape + (input_dim+1,))
        mlp_input = mlp_input.reshape((ens_shape[0], ens_shape[1] * ens_shape[2], input_dim+1))

    mlp = MLP(dims=[input_dim+1] + mlp_hidden_dims,
              activations=[activation_function for _ in mlp_hidden_dims],
              name='mlp')

    lstm_bot_linear = Linear(input_dim=mlp_hidden_dims[-1], output_dim=4 * lstm_hidden_dim,
                             name="lstm_input_linear")
    lstm = LSTM(dim=lstm_hidden_dim, activation=activation_function,
                name="hidden_recurrent")
    lstm_top_linear = Linear(input_dim=lstm_hidden_dim, output_dim=out_dim,
                             name="out_linear")

    rnn_input = mlp.apply(mlp_input)

    pre_rnn = lstm_bot_linear.apply(rnn_input)
    states = lstm.apply(pre_rnn)[0]
    activations = lstm_top_linear.apply(states)

    if use_ensembling:
        activations = activations.reshape(ens_shape + (out_dim,))
        # Unsplit batches (ensembling)
        activations = tensor.mean(activations, axis=1)

    # Mean over time
    activations = tensor.mean(activations, axis=0)

    cost = Softmax().categorical_cross_entropy(y, activations)

    pred = activations.argmax(axis=1)
    error_rate = tensor.neq(y, pred).mean()

    # Initialize parameters
    for brick in (mlp, lstm_bot_linear, lstm, lstm_top_linear):
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.)
        brick.initialize()

    # Regularization.  apply_dropout and apply_noise return a new graph, so
    # their result has to be kept: the original discarded it, which made
    # cost_reg and error_rate_reg identical to cost and error_rate.
    cg = ComputationGraph([cost, error_rate])
    if dropout != 0:
        # Dropout comes first, while cg is still the graph that rnn_input
        # was built in.
        cg = apply_dropout(cg, [rnn_input], dropout)
    if noise_std != 0:
        # The weights are looked up on the current graph.
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, noise_std)
    [cost_reg, error_rate_reg] = cg.outputs

    return cost_reg, error_rate_reg, cost, error_rate
Beispiel #35
0
    def start(self):
        """Build the feed-forward classifier, then train and monitor it.

        The network stacks one ``Linear`` brick plus one activation per
        consecutive pair of ``DIMS`` (four rectifier layers followed by a
        softmax layer).  Dropout is applied to the training cost, the
        predicted and target indices are mapped through the ``mps`` lookup
        table to compute an error rate, and the Blocks main loop is run on
        the training set with monitoring on the training and validation
        streams.

        Sets ``self.x``, ``self.y_hat_prob`` and ``self.y_hat39``.
        """
        x = T.matrix('features', config.floatX)
        y = T.imatrix('targets')

        self.x = x

        DIMS = [108*5, 1000, 1000, 1000, 1000, 1943]
        NUMS = [1, 1, 1, 1, 1, 1]
        FUNCS = [
            Rectifier,
            Rectifier,
            Rectifier,
            Rectifier,
            Softmax,
        ]

        def lllistool(i, inp, func):
            """Apply layer ``i`` (linear map followed by ``func``) to ``inp``."""
            l = Linear(input_dim=DIMS[i], output_dim=DIMS[i+1] * NUMS[i+1],
                       weights_init=IsotropicGaussian(std=DIMS[i]**(-0.5)),
                       biases_init=IsotropicGaussian(std=DIMS[i]**(-0.5)),
                       name='Lin{}'.format(i))
            l.initialize()
            # Name the brick instance.  The original assigned ``func.name``,
            # which sets an attribute on the brick class and does not name
            # the instance created below.
            brick_name = 'Fun{}'.format(i)
            if func is SimpleRecurrent:
                gong = func(dim=DIMS[i+1], activation=Rectifier(),
                            weights_init=IsotropicGaussian(std=(DIMS[i]+DIMS[i+1])**(-0.5)),
                            name=brick_name)
            else:
                gong = func(name=brick_name)
            return gong.apply(l.apply(inp))

        oup = x
        for i in range(len(DIMS)-1):
            oup = lllistool(i, oup, FUNCS[i])
        y_hat = oup

        self.y_hat_prob = y_hat

        cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat).astype(config.floatX)

        cg = ComputationGraph(cost)
        ips = VariableFilter(roles=[INPUT])(cg.variables)
        # NOTE(review): the dropout targets are picked by position in the
        # filtered list, and the second call reuses variables collected
        # before the first replacement -- confirm both select and affect
        # the intended layers.
        cg = apply_dropout(cg, ips[0:2:1], 0.2)
        cg = apply_dropout(cg, ips[2:-2:1], 0.5)
        cost = cg.outputs[0]

        cost.name = 'cost'

        # Lookup table applied to both the target and the predicted indices.
        mps = theano.shared(np.array([ph2id(state239(t)) for t in range(1943)]))
        z_hat = T.argmax(y_hat, axis=1)

        y39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[y.flatten()])
        y_hat39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[z_hat])

        self.y_hat39 = y_hat39

        # Fraction of mismatches.  Taking the mean avoids dividing one
        # integer tensor by another, which is a floor division under
        # Python 2 and would truncate the rate.
        lost01 = T.neq(y_hat39, y39).mean().astype(config.floatX)
        lost01.name = '0/1 loss'
        # Same quantity under a second name; it is monitored on the
        # validation stream below.
        lost23 = T.neq(y_hat39, y39).mean().astype(config.floatX)
        lost23.name = '2/3 loss'

        Ws = VariableFilter(roles=[WEIGHT])(cg.variables)
        norms = sum(w.norm(2) for w in Ws)
        norms.name = 'norms'
        path = pjoin(PATH['fuel'], pfx+'_train.hdf5')
        # Only the first 100000 training examples are loaded.
        data = H5PYDataset(path, which_set='train', load_in_memory=True, subset=slice(0, 100000))
        data_v = H5PYDataset(pjoin(PATH['fuel'], pfx+'_validate.hdf5'), which_set='validate', load_in_memory=True)
        num = data.num_examples
        data_stream = DataStream(data, iteration_scheme=ShuffledScheme(
                        num, batch_size=128))
        data_stream_v = DataStream(data_v, iteration_scheme=SequentialScheme(
                        data_v.num_examples, batch_size=128))
        algo = GradientDescent(cost=cost, params=cg.parameters, step_rule=CompositeRule([Momentum(0.002, 0.9)]))
        monitor = DataStreamMonitoring( variables=[cost, lost01, norms],
                data_stream=data_stream)
        monitor_v = DataStreamMonitoring( variables=[lost23],
                data_stream=data_stream_v)
        plt = Plot('AlpAlpAlp', channels=[['0/1 loss', '2/3 loss']], after_epoch=True)
        main_loop = MainLoop(data_stream = data_stream,
                algorithm=algo,
                extensions=[monitor, monitor_v, FinishAfter(after_n_epochs=2000), Printing(), plt])

        main_loop.run()
Beispiel #36
0
def train(args, trial=11, no_valid=False):
    """Build the data streams, the Theano graph and the main loop, then train.

    Besides ``args``, this function reads many settings from names defined
    outside of it (for example ``modData``, ``modDataValid``, ``hidden_size``,
    ``num_layers``, ``rnn_type``, ``boosting``, ``truncate_gradient``,
    ``dropout``, ``onlyPlots`` and the various path prefixes).

    Parameters
    ----------
    args : object
        Options object. The attributes read here are ``data_name``,
        ``train_size``, ``transitions``, ``rnn_type``, ``hidden_size``,
        ``num_layers``, ``dropout``, ``no_valid``, ``truncate_gradient`` and
        ``boosting``; they are only used to build file and experiment names.
    trial : int
        Trial number embedded in the data file names and experiment name.
    no_valid : bool
        Not read by this function (the experiment name uses
        ``args.no_valid`` instead).

    Returns
    -------
    None
        The function ends by running the main loop.
    """
    # Creating unique strings to save for experiments.
    data_valid = ("data/" + args.data_name + "_trial_" + str(trial) +
                  "_valid_size_" + str(args.train_size) +
                  "_transitions_" + str(args.transitions))
    data_test = data_valid.replace("_valid_size", "_test_size")
    # If we want validation set to match modData of test set
    if modDataValid == 1:
        data_valid = data_valid.replace("_trial_", "_" + modData + "_trial_")
        data_test = data_test.replace("_trial_", "_" + modData + "_trial_")

    # By default, it is m0
    data_train = ("data/" + args.data_name + "_trial_" + str(trial) +
                  "_train_size_" + str(args.train_size) +
                  "_transitions_" + str(args.transitions))

    subStr = ("rnn_type_" + args.rnn_type + "_trial_" + str(trial) +
              "_hiddenSize_" + str(args.hidden_size) +
              "_numLayers_" + str(args.num_layers) +
              "_dropout_" + str(args.dropout) +
              "_train_size_" + str(args.train_size) +
              "_transitions_" + str(args.transitions) +
              "_novalid_" + str(args.no_valid))

    if modData == "m1":
        data_train = data_train.replace("_trial_", "_m1_trial_")
        subStr = subStr.replace("_trial_", "_m1_trial_")
    elif modData == "m3":
        data_train = data_train.replace("_trial_", "_m3_trial_")
        subStr = subStr.replace("_trial_", "_m3_trial_")

        # For m3 the validation and test names are rebuilt from scratch,
        # overriding whatever the modDataValid branch above produced.
        data_valid = ("data/" + args.data_name + "_m3_trial_" + str(trial) +
                      "_valid_size_" + str(args.train_size) +
                      "_transitions_" + str(args.transitions))
        data_test = ("data/" + args.data_name + "_m3_trial_" + str(trial) +
                     "_test_size_" + str(args.train_size) +
                     "_transitions_" + str(args.transitions))

    print("on test: " + subStr)
    # Perform folder prefixing
    prefix_path = (models_folder + args.data_name + "/" + subStr +
                   "_tgrad_" + str(args.truncate_gradient) +
                   "_boost_" + bStr(args.boosting))

    # NOTE(review): ``prefix_path`` is computed above but the three paths
    # below use ``prefix``, which is not assigned in this function. Confirm
    # that a module-level ``prefix`` exists and that this is intended.
    load_path2 = prefix + load_path
    save_path2 = prefix + save_path
    last_path2 = prefix + last_path

    plots_output2 = (plots_output + args.data_name + "/" + subStr +
                     "_tgrad_" + str(args.truncate_gradient) +
                     "_boost_" + bStr(args.boosting))

    # obtain vocabulary size
    ix_to_char, char_to_ix, vocab_size = get_metadata(
        data_test.replace("_test", ""))
    print("vocab_size: " + str(vocab_size))

    # Get train, valid, test streams
    sharedDataTrain, train_stream = get_stream_inGPU(data_train,
                                                     sharedName='sharedData')
    train_streamCopy = copy.deepcopy(train_stream)
    sharedDataValid, dev_stream = get_stream_inGPU(data_valid,
                                                   sharedName='sharedData')
    valid_streamCopy = copy.deepcopy(dev_stream)
    sharedDataTest, test_stream = get_stream_inGPU(data_test,
                                                   sharedName='sharedData')
    test_streamCopy = copy.deepcopy(test_stream)

    # Create dummy sums
    sharedMRRSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedTOTSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedSUMVARs = {
        'sharedMRRSUM': sharedMRRSUM,
        'sharedTOTSUM': sharedTOTSUM
    }

    # Initialize batches
    batch_index_From = T.scalar('int_stream_From', dtype='int32')
    batch_index_To = T.scalar('int_stream_To', dtype='int32')

    # Index theano variables: every input is a column slice
    # [batch_index_From:batch_index_To] of the training shared data.
    x = sharedDataTrain['x'][:, batch_index_From:batch_index_To]
    x.name = 'x'

    x_mask = sharedDataTrain['x_mask'][:, batch_index_From:batch_index_To]
    x_mask.name = 'x_mask'

    x_mask_o = sharedDataTrain['x_mask_o'][:, batch_index_From:batch_index_To]
    x_mask_o.name = 'x_mask_o'

    x_mask_o_mask = sharedDataTrain[
        'x_mask_o_mask'][:, batch_index_From:batch_index_To]
    x_mask_o_mask.name = 'x_mask_o_mask'

    y = sharedDataTrain['y'][:, batch_index_From:batch_index_To]
    y.name = 'y'

    y_mask = sharedDataTrain['y_mask'][:, batch_index_From:batch_index_To]
    y_mask.name = 'y_mask'

    y_mask_o = sharedDataTrain['y_mask_o'][:, batch_index_From:batch_index_To]
    y_mask_o.name = 'y_mask_o'

    y_mask_o_mask = sharedDataTrain[
        'y_mask_o_mask'][:, batch_index_From:batch_index_To]
    y_mask_o_mask.name = 'y_mask_o_mask'

    lens = sharedDataTrain['lens'][:, batch_index_From:batch_index_To]
    lens.name = 'lens'

    # Generate temp shared vars: six floatX and three uint8 placeholders,
    # each a distinct shared variable holding a 2x1 array of zeros.
    def _placeholder(dtype):
        return shared(np.array([[0], [0]], dtype=dtype))

    tempSharedData = {}
    tempSharedData[theano.config.floatX] = [
        _placeholder(theano.config.floatX) for _ in range(6)
    ]
    tempSharedData['uint8'] = [_placeholder('uint8') for _ in range(3)]

    # Final mask is due to the generated mask and the input mask
    x_mask_final = x_mask * x_mask_o * x_mask_o_mask
    y_mask_final = y_mask * y_mask_o * y_mask_o_mask

    # Build neural network
    linear_output, cost = nn_fprop(
        x,
        x_mask_final,
        y,
        y_mask_final,
        lens,
        vocab_size,
        hidden_size,
        num_layers,
        rnn_type,
        boosting=boosting,
        scan_kwargs={'truncate_gradient': truncate_gradient})

    # Keep a constant in gpu memory
    constant1 = shared(np.float32(1.0))
    cost_int, ymasksum = RR_cost(y, linear_output, y_mask_final, constant1)

    # Validation calculations: the compiled function has no outputs; each
    # call adds cost_int and ymasksum to the two running shared sums.
    fRR = function(inputs=[
        theano.In(batch_index_From, borrow=True),
        theano.In(batch_index_To, borrow=True)
    ],
                   updates=[(sharedMRRSUM, sharedMRRSUM + cost_int),
                            (sharedTOTSUM, sharedTOTSUM + ymasksum)])

    # COST
    cg = ComputationGraph(cost)

    if dropout > 0:
        # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015)
        inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(
            cg.variables)
        cg = apply_dropout(cg, inputs, dropout)
        # Train on the output of the graph that contains the dropout nodes.
        cost = cg.outputs[0]

    # Learning algorithm
    step_rules = [
        RMSProp(learning_rate=rmsPropLearnRate, decay_rate=decay_rate),
        StepClipping(step_clipping)
    ]
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule(step_rules))

    # Extensions

    # This is for tracking our best result
    trackbest = track_best('valid_MRR', save_path2, last_path2, num_epochs,
                           nepochs, maxIterations, epsilon, tempSharedData)

    if onlyPlots:
        prefixes = ["train_cross", "valid_cross", "test_cross"]
        gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
        step_norm = aggregation.mean(algorithm.total_step_norm)
        monitored_vars = [cost, gradient_norm, step_norm]
        #this is faster
        train_monitor = myTrainingDataMonitoring(
            variables=monitored_vars,
            prefix=prefixes[0],
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        #train_monitor = DataStreamMonitoringPlot(variables=[cost],
        #                    data_stream=train_streamCopy, prefix=prefixes[0], sharedDataTrain=sharedDataTrain, sharedDataActualTest=sharedDataTrain, after_batch=True, saveEveryXIteration = saveEveryXIteration)
        valid_monitor = DataStreamMonitoringPlot(
            variables=[cost],
            data_stream=valid_streamCopy,
            prefix=prefixes[1],
            sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataValid,
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        test_monitor = DataStreamMonitoringPlot(
            variables=[cost],
            data_stream=test_streamCopy,
            prefix=prefixes[2],
            sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataTest,
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        # In plotting mode the element at index 1 of the track_best result
        # is left out of the extension list.
        trackbest = [trackbest[0], trackbest[2], trackbest[3], trackbest[4]]
        plot = Plot('Live Plotting',
                    saveFolder=plots_output2,
                    channels=[
                        'train_cross_cost', 'valid_cross_cost',
                        'test_cross_cost'
                    ],
                    numProcesses=numProcesses,
                    saveEveryXIteration=saveEveryXIteration,
                    after_batch=True)
        extensions = [
            train_monitor,
            valid_monitor,
            test_monitor,
            plot,
            Printing(),
            ProgressBar(),
        ] + trackbest
    else:
        dev_monitor = myDataStreamMonitoring(after_epoch=True,
                                             before_epoch=False,
                                             data_stream=dev_stream,
                                             prefix="valid",
                                             fRR=fRR,
                                             sharedVars=sharedSUMVARs,
                                             sharedDataTrain=sharedDataTrain,
                                             sharedDataValid=sharedDataValid)
        extensions = [
            dev_monitor,
            Printing(),
            ProgressBar(),
        ] + trackbest

    # A decay factor of 0 or 1 means "no decay": only otherwise is the
    # RMSProp learning rate multiplied by the factor after every epoch.
    if learning_rate_decay not in (0, 1):
        extensions.append(
            SharedVariableModifier(step_rules[0].learning_rate,
                                   lambda n, lr: np.cast[theano.config.floatX]
                                   (learning_rate_decay * lr),
                                   after_epoch=True,
                                   after_batch=False))

    # Parenthesised so the line is valid on both Python 2 and Python 3 and
    # matches the other print calls in this function.
    print('number of parameters in the model: ' + str(
        T.sum([p.size for p in cg.parameters]).eval()))
    # Finally build the main loop and train the model
    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
Beispiel #37
0
# DATA: one stream per split of the same HDF5 file, identical batch size.
train_stream, dev_stream = (
    get_stream(hdf5_file, split, batch_size) for split in ('train', 'dev'))


# MODEL: uint8 input and target matrices fed to the network builder.
x, y = (
    tensor.matrix(name, dtype='uint8') for name in ('features', 'targets'))
y_hat, cost = nn_fprop(x, y, vocab_size, hidden_size, num_layers, model)

# COST
cg = ComputationGraph(cost)

if dropout > 0:
    # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015):
    # variables are selected by Theano name, then the graph is rebuilt and
    # the cost is re-read from the transformed graph.
    inputs = VariableFilter(
        theano_name_regex=r'.*apply_input.*',
    )(cg.variables)
    cg = apply_dropout(cg, inputs, dropout)
    cost = cg.outputs[0]

# Learning algorithm: RMSProp followed by step clipping.
step_rules = [RMSProp(learning_rate=learning_rate, decay_rate=decay_rate)]
step_rules.append(StepClipping(step_clipping))
algorithm = GradientDescent(
    cost=cost,
    parameters=cg.parameters,
    step_rule=CompositeRule(step_rules),
)

# Extensions: batch-averaged gradient and step norms, monitored with the cost.
gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
step_norm = aggregation.mean(algorithm.total_step_norm)
monitored_vars = [cost, gradient_norm, step_norm]

dev_monitor = DataStreamMonitoring(
    variables=[cost],
    data_stream=dev_stream,
    prefix="dev",
    before_first_epoch=True,
    after_epoch=True,
)
Beispiel #38
0
def main():
    """Build three chained ``PoemBlock`` models and train them jointly.

    The configuration comes from ``configurations.get_config_cs2en()``. The
    three blocks are chained: each later block receives the hidden state
    (and representations) returned by the earlier ones, and the training
    cost is the sum of the three block costs.

    If ``config["dropout"] < 1.0`` dropout is applied to the variables named
    ``maxout_apply_output`` and the optimiser is trained on the output of
    that transformed graph.
    """
    # set para
    config = getattr(configurations, "get_config_cs2en")()
    logger.info("Model options:\n{}".format(pprint.pformat(config)))
    tr_stream = get_tr_stream(**config)

    # Create Theano variables
    logger.info("Creating theano variables")

    source_sentence0 = tensor.lmatrix("source0")
    source_sentence_mask0 = tensor.matrix("source0_mask")
    target_sentence0 = tensor.lmatrix("target0")
    target_sentence_mask0 = tensor.matrix("target0_mask")

    source_sentence1 = tensor.lmatrix("source1")
    source_sentence_mask1 = tensor.matrix("source1_mask")
    target_sentence1 = tensor.lmatrix("target1")
    target_sentence_mask1 = tensor.matrix("target1_mask")

    source_sentence2 = tensor.lmatrix("source2")
    source_sentence_mask2 = tensor.matrix("source2_mask")
    target_sentence2 = tensor.lmatrix("target2")
    target_sentence_mask2 = tensor.matrix("target2_mask")

    sampling_input0 = tensor.lmatrix("input0")
    sampling_input1 = tensor.lmatrix("input1")
    sampling_input2 = tensor.lmatrix("input2")

    sampling_hstates0 = tensor.fmatrix("hstates0")
    sampling_hstates1 = tensor.fmatrix("hstates1")
    sampling_hstates2 = tensor.fmatrix("hstates2")

    sampling_lastrep0 = tensor.tensor3("lastrep0")
    sampling_lastrep1 = tensor.tensor3("lastrep1")

    # Zero-initialised shared vector handed to the first block.
    hstates = theano.shared(
        value=numpy.zeros((config["enc_nhids"]), dtype=theano.config.floatX),
        name="hstates")

    # Get vocab (looked up here but not used further in this function)
    sources = get_attr_rec(tr_stream, "data_stream")
    src_vocab = sources.data_streams[0].dataset.dictionary
    trg_vocab = sources.data_streams[1].dataset.dictionary

    # Construct model
    logger.info("Building PoemModel")

    block0 = PoemBlock(config=config, blockid="block0", name="poemblock0")
    block1 = PoemBlock(config=config, blockid="block1", name="poemblock1")
    block2 = PoemBlock(config=config, blockid="block2", name="poemblock2")

    cost0, hsta0, rep0 = block0.cost(
        source_sentence0,
        source_sentence_mask0,
        source_sentence_mask1,
        source_sentence_mask0,
        target_sentence0,
        target_sentence_mask0,
        hstates,
        lastrep0=None,
        lastrep1=None,
    )

    cost1, hsta1, rep1 = block1.cost(
        source_sentence1,
        source_sentence_mask0,
        source_sentence_mask1,
        source_sentence_mask1,
        target_sentence1,
        target_sentence_mask1,
        hsta0,
        lastrep0=rep0,
        lastrep1=None,
    )

    cost2, hsta2, rep2 = block2.cost(
        source_sentence2,
        source_sentence_mask0,
        source_sentence_mask1,
        source_sentence_mask2,
        target_sentence2,
        target_sentence_mask2,
        hsta1,
        lastrep0=rep0,
        lastrep1=rep1,
    )

    cost = cost0 + cost1 + cost2
    cost.name = "total_cost"

    logger.info("Creating computational graph")

    cg = ComputationGraph(cost)

    # Initialize model: the same initialisation sequence for every block.
    logger.info("Initializing model")
    for block in (block0, block1, block2):
        block.set_initw(IsotropicGaussian(config["weight_scale"]))
        block.set_initb(Constant(0))
        block.push_initialization_config()
        block.set_specialinit(Orthogonal(), Orthogonal())
        block.initialize()

    # apply dropout for regularization
    if config["dropout"] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info("Applying dropout")
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == "maxout_apply_output"
        ]
        cg = apply_dropout(cg, dropout_inputs, config["dropout"])

    # Print shapes

    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info("    {:15}: {}".format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names (only block0's parameters are listed)

    param_dict = Selector(block0).get_parameters()
    logger.info("Parameter names: ")
    for name, value in param_dict.items():
        logger.info("    {:15}: {}".format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(len(param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config["finish_after"]),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config["saveto"], every_n_batches=config["save_freq"]),
    ]

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # Train on the output of ``cg`` so that the dropout transformation above
    # actually takes effect; passing the original ``cost`` would silently
    # discard it. Without dropout ``cg.outputs[0]`` is ``cost`` itself.
    # NOTE(review): the step rule class is resolved with eval() on a config
    # string; make sure the configuration is trusted.
    algorithm = GradientDescent(
        cost=cg.outputs[0],
        parameters=cg.parameters,
        step_rule=CompositeRule([
            StepClipping(config["step_clipping"]),
            eval(config["step_rule"])(),
        ]),
    )

    # Reload model if necessary
    if config["reload"]:
        extensions.append(LoadNMT(config["saveto"]))

    # Add sampling

    if config["hook_samples"] >= 1:
        logger.info("Building sampler")

        generated0 = block0.mygenerate(sampling_input0, sampling_hstates0)
        search_model0 = Model(generated0)

        generated1 = block1.mygenerate(
            sampling_input1, sampling_hstates1, sampling_lastrep0)
        search_model1 = Model(generated1)

        generated2 = block2.mygenerate(
            sampling_input2, sampling_hstates2,
            sampling_lastrep0, sampling_lastrep1)
        search_model2 = Model(generated2)

        extensions.append(
            Sampler(
                config=config,
                model0=search_model0,
                model1=search_model1,
                model2=search_model2,
                data_stream=tr_stream,
                hook_samples=config["hook_samples"],
                every_n_batches=config["sampling_freq"],
                src_vocab_size=config["src_vocab_size"],
            )
        )

        logger.info("End of building sampler")

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions,
    )

    # Train!
    main_loop.run()
Beispiel #39
0
    # Training and validation streams restricted to the requested variables.
    train_stream = stream.train(req_vars)
    valid_stream = stream.valid(req_vars)

    # Training cost, plus every variable in its graph tagged with the COST
    # role, form the set of monitored quantities.
    cost = model.cost(**inputs)
    cg = ComputationGraph(cost)
    monitored = set([cost] + VariableFilter(roles=[roles.COST])(cg.variables))

    # Validation monitors the same set unless the model defines a separate
    # ``valid_cost``, in which case the set is rebuilt from that graph.
    valid_monitored = monitored
    if hasattr(model, 'valid_cost'):
        valid_cost = model.valid_cost(**inputs)
        valid_cg = ComputationGraph(valid_cost)
        valid_monitored = set([valid_cost] + VariableFilter(
            roles=[roles.COST])(valid_cg.variables))

    # Optional graph transformations; note that ``monitored`` was collected
    # above, before these are applied.
    if hasattr(config, 'dropout') and config.dropout < 1.0:
        cg = apply_dropout(cg, config.dropout_inputs(cg), config.dropout)
    if hasattr(config, 'noise') and config.noise > 0.0:
        cg = apply_noise(cg, config.noise_inputs(cg), config.noise)
    # Re-read the cost from the (possibly transformed) graph; from here on
    # ``cg`` is rebound to a Model wrapping that cost.
    cost = cg.outputs[0]
    cg = Model(cost)

    # Log each parameter's shape and name, and the total element count
    # (product of the shape dimensions, summed over all parameters).
    logger.info('# Parameter shapes:')
    parameters_size = 0
    for value in cg.parameters:
        logger.info('    %20s %s' % (value.get_value().shape, value.name))
        parameters_size += reduce(operator.mul, value.get_value().shape, 1)
    logger.info('Total number of parameters: %d in %d matrices' %
                (parameters_size, len(cg.parameters)))

    # Use the step rule from the configuration when one is provided.
    if hasattr(config, 'step_rule'):
        step_rule = config.step_rule
Beispiel #40
0
def main(exp_config, source_vocab, target_vocab, dev_stream, use_bokeh=True):
    """Load a saved model, set up validators and continue training it.

    Parameters
    ----------
    exp_config : dict
        Experiment configuration. Keys read with ``[]`` are required
        (``saved_parameters``, ``dropout``, ``saveto``, ``config_file``,
        ``finish_after``, ``save_freq``, ``hook_samples``, ``reload``,
        ``step_clipping``, ``step_rule`` and, depending on the enabled
        options, ``l2_regularization_alpha``, ``normalized_bleu``,
        ``bleu_val_freq``, ``model_save_directory``); keys read with
        ``.get`` are optional.
    source_vocab, target_vocab
        Passed to ``setup_model_and_stream`` and to the validators.
    dev_stream
        Data stream handed to the BLEU and IMT F1 validators.
    use_bokeh : bool
        When true and ``BOKEH_AVAILABLE`` is set, a ``Plot`` extension is
        added.

    Returns
    -------
    None
        The function ends by running the main loop.
    """
    (train_encoder, train_decoder, theano_sampling_source_input,
     theano_sampling_context_input, generated,
     masked_stream) = setup_model_and_stream(
         exp_config, source_vocab, target_vocab)
    cost = create_model(train_encoder, train_decoder,
                        exp_config.get('imt_smoothing_constant', 0.005))

    # Set up training model
    logger.info("Building model")
    train_model = Model(cost)

    # Set the parameters from a trained models (.npz file)
    logger.info("Loading parameters from model: {}".format(
        exp_config['saved_parameters']))
    # Note the brick delimeter='-' is here for legacy reasons because blocks changed the serialization API
    param_values = LoadNMT.load_parameter_values(
        exp_config['saved_parameters'],
        brick_delimiter=exp_config.get('brick_delimiter', None))
    LoadNMT.set_model_parameters(train_model, param_values)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    if exp_config.get('l2_regularization', False) is True:
        l2_reg_alpha = exp_config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)

        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

        # why do we need to rename the cost variable? Where did the original name come from?
        cost.name = 'decoder_cost_cost'

    # Rebuild the graph so that it includes the regularisation terms, if any.
    cg = ComputationGraph(cost)

    # apply dropout for regularization
    # Note dropout variables are hard-coded here
    if exp_config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, exp_config['dropout'])

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(exp_config['saveto']):
        os.makedirs(exp_config['saveto'])
        # TODO: mv the actual config file once we switch to .yaml for min-risk
        shutil.copy(exp_config['config_file'], exp_config['saveto'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=exp_config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(exp_config['saveto'],
                      every_n_batches=exp_config['save_freq'])
    ]

    use_bleu = exp_config.get('bleu_script', None) is not None
    use_imt_f1 = exp_config.get('imt_f1_validation', False) is not False

    # Set up beam search and sampling computation graphs if necessary.
    # Both validators below need ``search_model`` and ``samples``, so they
    # are also built when only the IMT F1 validator is enabled.
    if exp_config['hook_samples'] >= 1 or use_bleu or use_imt_f1:
        logger.info("Building sampling model")
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[train_decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # TODO: sampling is broken for min-risk, so no Sampler extension is added

    # Add early stopping based on bleu
    # TODO: use multimodal meteor and BLEU validator
    # TODO: add 'validator' key to IMT config
    if use_bleu:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(theano_sampling_source_input,
                          theano_sampling_context_input,
                          samples=samples,
                          config=exp_config,
                          model=search_model,
                          data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=exp_config['normalized_bleu'],
                          every_n_batches=exp_config['bleu_val_freq']))

    if use_imt_f1:
        logger.info("Building imt F1 validator")
        extensions.append(
            IMT_F1_Validator(theano_sampling_source_input,
                             theano_sampling_context_input,
                             samples=samples,
                             config=exp_config,
                             model=search_model,
                             data_stream=dev_stream,
                             src_vocab=source_vocab,
                             trg_vocab=target_vocab,
                             normalize=exp_config['normalized_bleu'],
                             every_n_batches=exp_config['bleu_val_freq']))

    # Reload model if necessary
    if exp_config['reload']:
        extensions.append(LoadNMT(exp_config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(exp_config['model_save_directory'],
                 channels=[[
                     'decoder_cost_cost', 'validation_set_imt_f1_score',
                     'validation_set_bleu_score', 'validation_set_meteor_score'
                 ]],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")

    # if there is l2_regularization, dropout or random noise, we need to use
    # the output of the modified graph. ``cg`` was rebuilt from the final
    # ``cost`` above, so without dropout ``cg.outputs[0]`` is ``cost`` itself.
    algorithm_kwargs = {'on_unused_sources': 'warn'}
    # WORKING: try to catch and fix nan
    # NOTE(review): as before, nan_guard is only honoured when dropout is
    # enabled; confirm whether it should apply unconditionally.
    if exp_config['dropout'] < 1.0 and exp_config.get('nan_guard', False):
        from theano.compile.nanguardmode import NanGuardMode
        algorithm_kwargs['theano_func_kwargs'] = {
            'mode': NanGuardMode(nan_is_error=True, inf_is_error=True)
        }

    # NOTE(review): the step rule class is resolved with eval() on a config
    # string; make sure the configuration is trusted.
    algorithm = GradientDescent(
        cost=cg.outputs[0],
        parameters=cg.parameters,
        step_rule=CompositeRule([
            StepClipping(exp_config['step_clipping']),
            eval(exp_config['step_rule'])()
        ]),
        **algorithm_kwargs)

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=train_model,
                         algorithm=algorithm,
                         data_stream=masked_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Beispiel #41
0
def train(config, save_path, bokeh_name,
          params, bokeh_server, test_tag, use_load_ext,
          load_log, fast_start, validation_epochs, validation_batches,
          per_epochs, per_batches):
    """Build a `SpeechRecognizer` from `config` and run its training loop.

    Parameters
    ----------
    config : dict
        Experiment configuration. The sections read here are ``'data'``,
        ``'net'``, ``'initialization'``, ``'regularization'`` and
        ``'training'``.
    save_path : str
        Checkpoint path. The best-PER and best-likelihood checkpoints are
        written next to it with ``_best`` / ``_best_ll`` inserted before
        the extension.
    bokeh_name : str
        Name of the live plot; the base name of `save_path` is used when
        it is empty.
    params : str
        Path to previously saved parameters, or a false value to start
        from the freshly initialized model.
    bokeh_server : str
        Passed to `Plot` as ``server_url``.
    test_tag : bool
        When true, the first training batch is attached to the input
        variables as Theano test values.
    use_load_ext : bool
        When true (and `params` is given), resume through the `Load`
        extension.
    load_log : bool
        When true (and `params` is given), add the `LoadLog` extension.
    fast_start : bool
        When true, skip validation and checkpointing before the first
        epoch.
    validation_epochs, validation_batches
        How often the validation likelihood is monitored.
    per_epochs, per_batches
        How often the phoneme error rate is monitored.
    """
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])

    # Build the main brick and initialize all parameters.
    recognizer = SpeechRecognizer(
        data.recordings_source, data.labels_source,
        data.eos_label,
        data.num_features, data.num_labels,
        name="recognizer",
        data_prepend_eos=data.prepend_eos,
        character_map=data.character_map,
        **config["net"])
    # Paths with more '/' separators sort first. The key is written
    # without tuple-parameter unpacking, which is a SyntaxError on
    # Python 3.
    for brick_path, attribute_dict in sorted(
            config['initialization'].items(),
            key=lambda item: -item[0].count('/')):
        for attribute, value in attribute_dict.items():
            brick, = Selector(recognizer).select(brick_path).bricks
            setattr(brick, attribute, value)
            brick.push_initialization_config()
    recognizer.initialize()

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        # Recursively collect every `*_init` attribute of a brick and of
        # its children into a nested dictionary.
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    if params:
        logger.info("Load parameters from " + params)
        recognizer.load_params(params)

    if test_tag:
        tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__
        __stream = data.get_stream("train")
        __data = next(__stream.get_epoch_iterator(as_dict=True))
        recognizer.recordings.tag.test_value = __data[data.recordings_source]
        recognizer.recordings_mask.tag.test_value = __data[data.recordings_source + '_mask']
        recognizer.labels.tag.test_value = __data[data.labels_source]
        recognizer.labels_mask.tag.test_value = __data[data.labels_source + '_mask']
        theano.config.compute_test_value = 'warn'

    batch_cost = recognizer.get_cost_graph().sum()
    batch_size = named_copy(recognizer.recordings.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_log_likelihood"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(
        applications=[r.generator.readout.readout], name="output_0")(
                cost_cg)
    bottom_output, = VariableFilter(
        applications=[r.bottom.apply], name="output")(
                cost_cg)
    attended, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended")(
                cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended_mask")(
                cost_cg)
    weights, = VariableFilter(
        applications=[r.generator.evaluate], name="weights")(
                cost_cg)
    max_recording_length = named_copy(r.recordings.shape[0],
                                      "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = named_copy(attended_mask.shape[0],
                                          "max_attended_mask_length")
    max_attended_length = named_copy(attended.shape[0],
                                     "max_attended_length")
    max_num_phonemes = named_copy(r.labels.shape[0],
                                  "max_num_phonemes")
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_attended = named_copy(abs(attended).mean(),
                               "mean_attended")
    mean_bottom_output = named_copy(abs(bottom_output).mean(),
                                    "mean_bottom_output")
    weights_penalty = named_copy(monotonicity_penalty(weights, r.labels_mask),
                                 "weights_penalty")
    weights_entropy = named_copy(entropy(weights, r.labels_mask),
                                 "weights_entropy")
    mask_density = named_copy(r.labels_mask.mean(),
                              "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy,
        min_energy, max_energy,
        mean_attended, mean_bottom_output,
        batch_size, max_num_phonemes,
        mask_density])

    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters if p not in attention_params]
        # Noise is stacked on top of the (possibly dropout-regularized)
        # graph; starting again from `cg` would throw the dropout away
        # whenever both options are enabled.
        regularized_cg = apply_noise(
            regularized_cg, noise_subjects, reg_config['noise'])
    regularized_cost = regularized_cg.outputs[0]
    regularized_weights_penalty = regularized_cg.outputs[1]

    # The model object extracts all the parameters from the computation
    # graph and gives them hierarchical names. This helps to notice when,
    # because of some bug, a parameter is not in the computation graph.
    model = SpeechModel(regularized_cost)
    # Kept under its own name: `params` is the path argument and is still
    # needed below for the `Load` / `LoadLog` extensions.
    model_params = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, model_params[key].get_value().shape) for key
                        in sorted(model_params.keys())],
                    width=120))

    # Define the training algorithm.
    train_conf = config['training']
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'], train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False):
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in model_params.items()
                                        if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in model_params.items()
                                        if p not in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                        maxnorm_subjects)]
    algorithm = GradientDescent(
        cost=regularized_cost +
            reg_config.get("penalty_coof", .0) * regularized_weights_penalty / batch_size +
            reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2,
        # A real list, not a dictionary view, so it works on Python 3 too.
        parameters=list(model_params.values()),
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)]))

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    # Work on a copy so the graph's own `outputs` list is not extended.
    observables = list(regularized_cg.outputs)
    observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    for name, param in model_params.items():
        num_elements = numpy.prod(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        observables.append(stats)

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(named_copy(aggregation.mean(var, batch_size),
                                            'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(named_copy(aggregation.mean(
                    var, recognizer.labels_mask.sum()), 'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
        ]
    extensions.append(TrainingDataMonitoring(
        [observables[0], algorithm.total_gradient_norm,
            algorithm.total_step_norm, clipping.threshold,
            max_recording_length,
            max_attended_length, max_attended_mask_length], after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes([cost, weights_entropy, weights_penalty]),
        data.get_stream("valid"), prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=validation_epochs,
            every_n_batches=validation_batches,
            after_training=False)
    extensions.append(validation)
    recognizer.init_beam_search(10)
    per = PhonemeErrorRate(recognizer, data.get_dataset("valid"))
    per_monitoring = DataStreamMonitoring(
        [per], data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=per_epochs,
            every_n_batches=per_batches,
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_likelihood = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_likelihood, track_the_best_per]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf['num_batches'],
                    after_n_epochs=train_conf['num_epochs'])
        .add_condition(["after_batch"], _gradient_norm_is_none),
        # Live plotting: requires launching `bokeh-server`
        # and allows to see what happens online.
        Plot(bokeh_name
             if bokeh_name
             else os.path.basename(save_path),
             [# Plot 1: training and validation costs
             [average_monitoring.record_name(regularized_cost),
             validation.record_name(cost)],
             # Plot 2: gradient norm,
             [average_monitoring.record_name(algorithm.total_gradient_norm),
             average_monitoring.record_name(clipping.threshold)],
             # Plot 3: phoneme error rate
             [per_monitoring.record_name(per)],
             # Plot 4: training and validation mean weight entropy
             [average_monitoring._record_name('weights_entropy_per_label'),
             validation._record_name('weights_entropy_per_label')],
             # Plot 5: training and validation monotonicity penalty
             [average_monitoring._record_name('weights_penalty_per_recording'),
             validation._record_name('weights_penalty_per_recording')]],
             every_n_batches=10,
             server_url=bokeh_server),
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_likelihood.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar(),
        Printing(every_n_batches=1,
                    attribute_filter=PrintingFilterList()
                    )]

    # Save the config into the status
    log = TrainingLog()
    log.status['_config'] = repr(config)
    main_loop = MainLoop(
        model=model, log=log, algorithm=algorithm,
        data_stream=data.get_stream("train"),
        extensions=extensions)
    main_loop.run()
Beispiel #42
0
#       COST AND ERROR MEASURE
# Mean categorical cross-entropy between `label` and the network `output`.
cost = Softmax().categorical_cross_entropy(label, output).mean()
cost.name = 'cost'

# Fraction of rows whose arg-max over axis 1 differs from `label`.
predicted_label = tensor.argmax(output, axis=1)
error_rate = tensor.neq(predicted_label, label).mean()
error_rate.name = 'error_rate'


#       REGULARIZATION
# Both monitored quantities go through the same graph so that the
# regularized copies below stay paired with each other.
cg = ComputationGraph([cost, error_rate])
if weight_noise > 0:
    noise_vars = VariableFilter(roles=[WEIGHT])(cg)
    cg = apply_noise(cg, noise_vars, weight_noise)
if dropout > 0:
    # Outputs of every linear transformation of `fc` except the last one.
    fc_hidden_outputs = VariableFilter(
        name='output', bricks=fc.linear_transformations[:-1])(cg)
    dropout_targets = [eeg1, eeg2, data1, data2] + fc_hidden_outputs
    cg = apply_dropout(cg, dropout_targets, dropout)
# for vfilter, p in dropout_locs:
#     cg = apply_dropout(cg, vfilter(cg), p)
cost_reg, error_rate_reg = cg.outputs


#       INITIALIZATION
# Every brick gets the same weight/bias schemes before being initialized.
bricks_to_initialize = [
    conv_eeg, maxpool_eeg, conv_eeg2, maxpool_eeg2,
    conv, maxpool, conv2, maxpool2, fc,
]
for brick in bricks_to_initialize:
    brick.weights_init = weights_init
    brick.biases_init = biases_init
    brick.initialize()


# ==========================================================================================
#                                     THE INFRASTRUCTURE
# ==========================================================================================
Beispiel #43
0
def main(mode, save_to, num_epochs, load_params=None,
         feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, stride=None, repeat_times=None,
         batch_size=None, num_batches=None, algo=None,
         test_set=None, valid_examples=None,
         dropout=None, max_norm=None, weight_decay=None,
         batch_norm=None):
    """Train or evaluate a LeNet-style convnet on the DogsVsCats dataset.

    Parameters
    ----------
    mode : str
        ``'train'`` runs the training loop; ``'test'`` writes predictions.
    save_to : str
        Checkpoint path in train mode, output CSV path in test mode.
    num_epochs, num_batches
        Stopping criteria handed to `FinishAfter`.
    load_params : str, optional
        Path of a saved parameter archive to load into the model.
    feature_maps, conv_sizes, pool_sizes, repeat_times : list, optional
        Per-convolutional-layer settings; all four must have equal length.
    mlp_hiddens : list, optional
        Hidden layer sizes of the top MLP.
    stride : int, optional
        Stride handed to `LeNet`.
    batch_size : int, optional
        Mini-batch size (default 500).
    algo : str, optional
        ``'rmsprop'`` (default) or ``'adam'``.
    test_set : str, optional
        Name of the split used in test mode (default ``'test'``).
    valid_examples : int, optional
        Number of trailing training examples held out for validation.
    dropout : float, optional
        Dropout probability applied to the outputs of `Rectifier` bricks.
    max_norm : float, optional
        Threshold for `VariableClipping` on linear and convolutional weights.
    weight_decay : float, optional
        Coefficient of the L2 penalty added to the cost.
    batch_norm : bool, optional
        Whether to use batch normalization.

    Raises
    ------
    ValueError
        If the per-layer lists disagree in length, or if `algo` / `mode`
        is not one of the supported values.
    """
    if feature_maps is None:
        feature_maps = [20, 50, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2, 2]
    if repeat_times is None:
        repeat_times = [1, 1, 1]
    if batch_size is None:
        batch_size = 500
    if valid_examples is None:
        valid_examples = 2500
    if stride is None:
        stride = 1
    if test_set is None:
        test_set = 'test'
    if algo is None:
        algo = 'rmsprop'
    if batch_norm is None:
        batch_norm = False

    image_size = (128, 128)
    output_size = 2

    if (len(feature_maps) != len(conv_sizes) or
        len(feature_maps) != len(pool_sizes) or
        len(feature_maps) != len(repeat_times)):
        raise ValueError("OMG, inconsistent arguments")

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    # `zip` is materialized: on Python 3 it is a one-shot iterator, which
    # cannot be indexed or traversed more than once.
    convnet = LeNet(conv_activations, 3, image_size,
                    stride=stride,
                    filter_sizes=list(zip(conv_sizes, conv_sizes)),
                    feature_maps=feature_maps,
                    pooling_sizes=list(zip(pool_sizes, pool_sizes)),
                    repeat_times=repeat_times,
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    batch_norm=batch_norm,
                    weights_init=Glorot(),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))


    single_x = tensor.tensor3('image_features')
    x = tensor.tensor4('image_features')
    single_y = tensor.lvector('targets')
    y = tensor.lmatrix('targets')

    # Training
    with batch_normalization(convnet):
        probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))

    cg = ComputationGraph([cost, error_rate])
    extra_updates = []

    if batch_norm: # batch norm:
        logger.debug("Apply batch norm")
        pop_updates = get_batch_normalization_updates(cg)
        # p stands for population mean
        # m stands for minibatch
        alpha = 0.005
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in pop_updates]
        population_statistics = [p for p, m in extra_updates]
    if dropout:
        relu_outputs = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg)
        cg = apply_dropout(cg, relu_outputs, dropout)
    cost, error_rate = cg.outputs
    if weight_decay:
        logger.debug("Apply weight decay {}".format(weight_decay))
        cost += weight_decay * l2_norm(cg.parameters)
        cost.name = 'cost'

    # Validation
    valid_probs = convnet.apply_5windows(single_x)
    valid_cost = (CategoricalCrossEntropy().apply(single_y, valid_probs)
            .copy(name='cost'))
    valid_error_rate = (MisclassificationRate().apply(
        single_y, valid_probs).copy(name='error_rate'))

    model = Model([cost, error_rate])
    if load_params:
        logger.info("Loaded params from {}".format(load_params))
        # The archive is binary data, so it must not be opened in text mode.
        with open(load_params, 'rb') as src:
            model.set_parameter_values(load_parameters(src))

    # Training stream with random cropping
    # NOTE(review): in train mode this stream is replaced by the
    # `ServerDataStream` below, and test mode never reads it.
    train = DogsVsCats(("train",), subset=slice(None, 25000 - valid_examples, None))
    train_str =  DataStream(
        train, iteration_scheme=ShuffledScheme(train.num_examples, batch_size))
    train_str = add_transformers(train_str, random_crop=True)

    # Validation stream without cropping
    valid = DogsVsCats(("train",), subset=slice(25000 - valid_examples, None, None))
    valid_str = DataStream(
        valid, iteration_scheme=SequentialExampleScheme(valid.num_examples))
    valid_str = add_transformers(valid_str)

    if mode == 'train':
        # Resolve the directory of the running script absolutely; with a
        # bare `script.py` the old `directory + '/server.py'` pointed at
        # the filesystem root.
        directory = os.path.dirname(os.path.abspath(sys.argv[0]))
        env = dict(os.environ)
        env['THEANO_FLAGS'] = 'floatX=float32'
        port = numpy.random.randint(1025, 10000)
        server = subprocess.Popen(
            [os.path.join(directory, 'server.py'),
             str(25000 - valid_examples), str(batch_size), str(port)],
            env=env, stderr=subprocess.STDOUT)
        # From here on the data server is alive: whatever fails below, it
        # has to be terminated, not only when `main_loop.run()` returns.
        try:
            train_str = ServerDataStream(
                ('image_features', 'targets'), produces_examples=False,
                port=port)

            save_to_base, save_to_extension = os.path.splitext(save_to)

            # Pick the step rule
            if algo == 'rmsprop':
                step_rule = RMSProp(decay_rate=0.999, learning_rate=0.0003)
            elif algo == 'adam':
                step_rule = Adam()
            else:
                # An explicit raise is not stripped under `python -O`.
                raise ValueError(
                    "unknown algo {!r}, expected 'rmsprop' or 'adam'"
                    .format(algo))
            if max_norm:
                conv_params = VariableFilter(bricks=[Convolutional], roles=[WEIGHT])(cg)
                linear_params = VariableFilter(bricks=[Linear], roles=[WEIGHT])(cg)
                step_rule = CompositeRule(
                    [step_rule,
                     Restrict(VariableClipping(max_norm, axis=0), linear_params),
                     Restrict(VariableClipping(max_norm, axis=(1, 2, 3)), conv_params)])

            algorithm = GradientDescent(
                cost=cost, parameters=model.parameters,
                step_rule=step_rule)
            algorithm.add_updates(extra_updates)
            # `Timing` extension reports time for reading data, aggregating
            # a batch and monitoring.
            extensions = [Timing(every_n_batches=100),
                        FinishAfter(after_n_epochs=num_epochs,
                                    after_n_batches=num_batches),
                        DataStreamMonitoring(
                            [valid_cost, valid_error_rate],
                            valid_str,
                            prefix="valid"),
                        TrainingDataMonitoring(
                            [cost, error_rate,
                            aggregation.mean(algorithm.total_gradient_norm)],
                            prefix="train",
                            after_epoch=True),
                        TrackTheBest("valid_error_rate"),
                        Checkpoint(save_to, save_separately=['log'],
                                   parameters=cg.parameters +
                                   (population_statistics if batch_norm else []),
                                   before_training=True, after_epoch=True)
                            .add_condition(
                                ['after_epoch'],
                                OnLogRecord("valid_error_rate_best_so_far"),
                                (save_to_base + '_best' + save_to_extension,)),
                        Printing(every_n_batches=100)]

            model = Model(cost)

            main_loop = MainLoop(
                algorithm,
                train_str,
                model=model,
                extensions=extensions)
            main_loop.run()
        finally:
            server.terminate()
    elif mode == 'test':
        classify = theano.function([single_x], valid_probs.argmax())
        test = DogsVsCats((test_set,))
        test_str = DataStream(
            test, iteration_scheme=SequentialExampleScheme(test.num_examples))
        test_str = add_transformers(test_str)
        correct = 0
        with open(save_to, 'w') as dst:
            print("id", "label", sep=',', file=dst)
            for index, example in enumerate(test_str.get_epoch_iterator()):
                image = example[0]
                # Run the network once per image and reuse the result for
                # both the CSV row and the accuracy count.
                prediction = classify(image)
                print(index + 1, prediction, sep=',', file=dst)
                if len(example) > 1 and prediction == example[1]:
                    correct += 1
        print(correct / float(test.num_examples))
    else:
        raise ValueError(
            "unknown mode {!r}, expected 'train' or 'test'".format(mode))
    def __init__(self, ref_data, output_dim):
        """Build the autoencoder stack and the classifier graph on top of it.

        `ref_data` is stored as a float32 shared variable and encoded by a
        stack of single-layer MLP autoencoders. Rows of the final encoding
        selected by the index vector ``j`` are mapped by a first MLP to a
        weight matrix that projects the input ``x``; a second MLP produces
        the `output_dim` class scores.

        The layer sizes, activations, noise, dropout and penalty settings
        are read from names defined outside this method (``ae_dims``,
        ``hidden_dims``, ``r_dropout``, ``s_l1pen``, ...).

        Parameters
        ----------
        ref_data : array-like
            Two-dimensional reference data; ``ref_data.shape[1]`` is used
            as the input dimension of the first autoencoder.
        output_dim : int
            Number of outputs of the final MLP.

        Sets ``ae_costs``, ``cost``, ``cost_reg``, ``ber``, ``ber_reg``,
        ``pred`` and ``confidence`` on the instance.
        """
        input_dim = ref_data.shape[1]
        ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name='ref_data')

        rng = RandomStreams()

        # Stack of autoencoders: each layer encodes the clean output of
        # the previous one and is trained to reconstruct its own input
        # from a noise-corrupted encoding.
        ae_bricks = []
        ae_input = ref_data_sh
        ae_costs = []
        for i, (idim, odim) in enumerate(zip([input_dim] + ae_dims[:-1], ae_dims)):
            ae_mlp = MLP(activations=[ae_activations[i]],
                         dims=[idim, odim],
                         name='enc%i'%i)
            enc = ae_mlp.apply(ae_input)
            enc_n = ae_mlp.apply(ae_input + rng.normal(size=ae_input.shape, std=ae_f_noise_std))
            ae_mlp_dec = MLP(activations=[ae_activations[i]],
                             dims=[odim, idim],
                             name='dec%i'%i)
            dec = ae_mlp_dec.apply(enc_n)

            # Mean L2 reconstruction error plus an L1 penalty on the
            # clean encoding.
            cost = tensor.sqrt(((ae_input - dec) ** 2).sum(axis=1)).mean() + \
                        ae_l1_pen * abs(enc).sum(axis=1).mean()
            ae_costs.append(cost)

            ae_input = enc
            ae_bricks = ae_bricks + [ae_mlp, ae_mlp_dec]

        self.ae_costs = ae_costs

        ref_data_enc = ae_input

        # Construct the model
        j = tensor.lvector('j')
        r = ref_data_enc[j, :]
        x = tensor.fmatrix('x')
        y = tensor.ivector('y')

        # input_dim must be nr
        mlp = MLP(activations=activation_functions,
                  dims=[ae_dims[-1]] + hidden_dims + [n_inter], name='inter_gen')
        mlp2 = MLP(activations=activation_functions_2 + [None],
                   dims=[n_inter] + hidden_dims_2 + [output_dim],
                   name='end_mlp')

        inter_weights = mlp.apply(r)

        # Identity test: `== None` may be evaluated element-wise when
        # `inter_bias` is an array or a symbolic variable.
        if inter_bias is None:
            ibias = Bias(n_inter)
            ibias.biases_init = Constant(0)
            ibias.initialize()
            inter = ibias.apply(tensor.dot(x, inter_weights))
        else:
            inter = tensor.dot(x, inter_weights) - inter_bias
        inter = inter_act_fun.apply(inter)

        final = mlp2.apply(inter)

        cost = Softmax().categorical_cross_entropy(y, final)
        confidence = Softmax().apply(final)

        pred = final.argmax(axis=1)
        # error_rate = tensor.neq(y, pred).mean()
        ber = balanced_error_rate.ber(y, pred)

        # Initialize parameters
        for brick in ae_bricks + [mlp, mlp2]:
            brick.weights_init = IsotropicGaussian(0.01)
            brick.biases_init = Constant(0.001)
            brick.initialize()

        # apply regularization
        cg = ComputationGraph([cost, ber])

        if r_dropout != 0:
            # - dropout on input vector r : r_dropout
            cg = apply_dropout(cg, [r], r_dropout)

        if x_dropout != 0:
            cg = apply_dropout(cg, [x], x_dropout)

        # Defined unconditionally: the `a_dropout` branch below subtracts
        # these variables even when `s_dropout` itself is disabled.
        s_dropout_vars = []
        if s_dropout != 0:
            # - dropout on intermediate layers of first mlp : s_dropout
            s_dropout_vars = list(set(VariableFilter(bricks=[Tanh], name='output')
                                                     (ComputationGraph([inter_weights])))
                                 - set([inter_weights]))
            cg = apply_dropout(cg, s_dropout_vars, s_dropout)

        if i_dropout != 0:
            # - dropout on input to second mlp : i_dropout
            cg = apply_dropout(cg, [inter], i_dropout)

        if a_dropout != 0:
            # - dropout on hidden layers of second mlp : a_dropout
            a_dropout_vars = list(set(VariableFilter(bricks=[Tanh], name='output')
                                                     (ComputationGraph([final])))
                                 - set([inter_weights]) - set(s_dropout_vars))
            cg = apply_dropout(cg, a_dropout_vars, a_dropout)

        if r_noise_std != 0:
            cg = apply_noise(cg, [r], r_noise_std)

        if w_noise_std != 0:
            # - apply noise on weight variables
            weight_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, weight_vars, w_noise_std)

        [cost_reg, ber_reg] = cg.outputs

        # Optional L1 penalties, added to the regularized cost only.
        if s_l1pen != 0:
            s_weights = VariableFilter(bricks=mlp.linear_transformations, roles=[WEIGHT])(cg)
            cost_reg = cost_reg + s_l1pen * sum(abs(w).sum() for w in s_weights)
        if i_l1pen != 0:
            cost_reg = cost_reg + i_l1pen * abs(inter).sum()
        if a_l1pen != 0:
            a_weights = VariableFilter(bricks=mlp2.linear_transformations, roles=[WEIGHT])(cg)
            cost_reg = cost_reg + a_l1pen * sum(abs(w).sum() for w in a_weights)


        self.cost = cost
        self.cost_reg = cost_reg
        self.ber = ber
        self.ber_reg = ber_reg
        self.pred = pred
        self.confidence = confidence
    def __init__(self):
        """Assemble the LSTM regression graph and its training cost.

        Zero entries of the input are replaced by the average of their
        two neighbours along the first axis, a per-product embedding is
        added to the LSTM input, and the last hidden state is mapped to
        the prediction. The cost is the mean absolute relative error.

        Sets ``cost``, ``sgd_cost``, ``monitor_vars`` and ``pred`` on the
        instance, plus ``cost_reg`` when the regularized cost is a
        different variable from ``cost``.
        """
        # Symbolic inputs; the first two axes of the input are swapped.
        series = tensor.tensor3('input').dimshuffle(1, 0, 2)
        target_matrix = tensor.matrix('target')
        target = target_matrix.reshape((target_matrix.shape[0],))
        product = tensor.lvector('product')

        # Constants used to rescale the network output.
        train_input_mean = 1470614.1
        train_input_std = 3256577.0

        # Zeros are treated as missing and filled with the mean of the
        # next and previous entries along axis 0 (zero-padded at the ends).
        missing = tensor.eq(series, 0)
        shifted_back = tensor.concatenate(
            (series[1:, :, :],
             tensor.zeros((1, series.shape[1], series.shape[2]))),
            axis=0)
        shifted_forward = tensor.concatenate(
            (tensor.zeros((1, series.shape[1], series.shape[2])),
             series[:-1, :, :]),
            axis=0)
        series = tensor.switch(
            missing, (shifted_back + shifted_forward) / 2, series)

        # Per-product embedding, sized to match the LSTM input.
        lookup = LookupTable(length=352, dim=4 * hidden_dim)
        product_embed = lookup.apply(product)

        # The missing-value indicator is appended as extra features.
        lstm_features = tensor.concatenate((series, missing), axis=2)
        linear = Linear(input_dim=input_dim + 1, output_dim=4 * hidden_dim,
                        name="lstm_in")
        lstm_input = linear.apply(lstm_features) + product_embed[None, :, :]

        lstm = LSTM(dim=hidden_dim, activation=activation_function,
                    name="lstm")
        hidden, cells = lstm.apply(lstm_input)

        linear2 = Linear(input_dim=hidden_dim, output_dim=out_dim,
                         name="ouput_linear")

        # Only the last hidden state is used; the output is rescaled.
        scaled_output = linear2.apply(hidden[-1])
        pred = scaled_output * train_input_std + train_input_mean
        pred = pred.reshape((product.shape[0],))

        # Mean absolute relative error.
        cost = tensor.mean(abs((pred - target) / target))

        # Initialize all bricks
        weights_scheme = IsotropicGaussian(0.1)
        biases_scheme = Constant(0.)
        for brick in (linear, linear2, lstm, lookup):
            brick.weights_init = weights_scheme
            brick.biases_init = biases_scheme
            brick.initialize()

        # Apply noise and dropout
        cg = ComputationGraph([cost])
        if w_noise_std > 0:
            weight_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, weight_vars, w_noise_std)
        if i_dropout > 0:
            cg = apply_dropout(cg, [hidden], i_dropout)
        cost_reg, = cg.outputs
        # NOTE(review): this addition presumably yields a fresh variable,
        # which would make the identity test below always take the
        # regularized path; confirm before relying on either branch.
        cost_reg += 1e-20

        self.cost = cost
        if cost_reg is cost:
            self.sgd_cost = cost
            self.monitor_vars = [[cost]]
        else:
            self.cost_reg = cost_reg
            cost_reg.name = 'cost_reg'
            self.sgd_cost = cost_reg
            self.monitor_vars = [[cost, cost_reg]]
        cost.name = 'cost'

        self.pred = pred
        pred.name = 'pred'
Beispiel #46
0
    def set_up_predictor(self, nmt_model_path):
        """Build the NMT graphs and the search algorithm used for decoding.

        The construction follows ``blocks.machine_translation.main``: an
        encoder-decoder is created and initialized, the regularization
        selected in ``self.config`` is applied to the cost graph, a
        sampling graph is derived from the same bricks and, when
        ``config['reload']`` is set, weights are loaded through ``LoadNMT``.

        Sets ``src_vocab_size``, ``trgt_vocab_size``, ``source_sentence``,
        ``samples``, ``model``, ``normalize``, ``verbose``, ``best_models``,
        ``val_bleu_curve`` and ``search_algorithm`` on ``self``.

        Args:
            nmt_model_path: Forwarded to ``LoadNMT`` as its first argument
                when ``config['reload']`` is set.
        """
        self.src_vocab_size = self.config['src_vocab_size']
        self.trgt_vocab_size = self.config['trg_vocab_size']

        # Create Theano variables
        logging.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Construct model
        logging.info('Building RNN encoder-decoder')
        encoder = BidirectionalEncoder(self.config['src_vocab_size'],
                                       self.config['enc_embed'],
                                       self.config['enc_nhids'])
        decoder = Decoder(self.config['trg_vocab_size'],
                          self.config['dec_embed'],
                          self.config['dec_nhids'],
                          self.config['enc_nhids'] * 2)
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logging.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model (TODO: really necessary?)
        logging.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            self.config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        # Recurrent transitions get their own scheme; this must happen after
        # the push above so it is not overwritten by the Gaussian default.
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # Apply dropout for regularization (TODO: remove?)
        if self.config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logging.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, self.config['dropout'])

        # Apply weight noise for regularization (TODO: remove?)
        if self.config['weight_noise_ff'] > 0.0:
            logging.info('Applying weight noise to ff layers')
            # Use ``get_parameters`` like the rest of this method and turn
            # each ``values()`` result into a list so that the collections
            # can be concatenated regardless of the Python version.
            enc_params = list(
                Selector(encoder.lookup).get_parameters().values())
            enc_params += list(
                Selector(encoder.fwd_fork).get_parameters().values())
            enc_params += list(
                Selector(encoder.back_fork).get_parameters().values())
            dec_params = list(Selector(
                decoder.sequence_generator.readout).get_parameters().values())
            dec_params += list(Selector(
                decoder.sequence_generator.fork).get_parameters().values())
            dec_params += list(
                Selector(decoder.state_init).get_parameters().values())
            cg = apply_noise(cg,
                             enc_params + dec_params,
                             self.config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logging.debug("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logging.debug('    {:15}: {}'.format(shape, count))
        logging.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logging.debug("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logging.debug('    {:15}: {}'.format(value.get_value().shape,
                                                 name))
        logging.info("Total number of parameters: {}"
                     .format(len(enc_dec_param_dict)))

        # Set up training model
        logging.info("Building model")

        # Set extensions
        logging.info("Initializing extensions")

        # Set up beam search and sampling computation graphs if necessary
        logging.info("Building sampling model")
        # The sampling input has no padding, hence the all-ones mask.
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

        # Follows blocks.machine_translation.BleuValidator.__init__
        self.source_sentence = sampling_input
        self.samples = samples
        self.model = search_model
        self.normalize = True
        self.verbose = self.config.get('val_set_out', None)

        # Reload model if necessary
        if self.config['reload']:
            loader = LoadNMT(nmt_model_path,
                             self.config['saveto'],
                             search_model)
            loader.load_weights()

        self.best_models = []
        self.val_bleu_curve = []
        self.search_algorithm = MyopticSearch(samples=samples)
        self.search_algorithm.compile()
Beispiel #47
0
def main(num_epochs,
         feature_maps=None,
         mlp_hiddens=None,
         conv_sizes=None,
         pool_sizes=None,
         batch_size=500,
         num_batches=None):
    """Train a LeNet convnet on a small slice of the Dogs-vs-Cats data.

    Builds the network, derives a regularized training cost, wraps the
    dataset in a preprocessing pipeline and runs a Blocks ``MainLoop`` with
    plain SGD.

    Args:
        num_epochs: NOTE(review): overwritten by a hard-coded value below,
            so the caller's value is currently ignored.
        feature_maps: Feature maps per convolutional layer
            (default ``[20, 50]``).
        mlp_hiddens: Hidden sizes of the top MLP (default ``[500]``).
        conv_sizes: Square filter size per convolutional layer
            (default ``[5, 5]``).
        pool_sizes: Square pooling size per convolutional layer
            (default ``[2, 2]``).
        batch_size: NOTE(review): overwritten by a hard-coded value below.
        num_batches: NOTE(review): overwritten by a hard-coded value below.
    """
    ############# Architecture #############
    if feature_maps is None:
        feature_maps = [20, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2]
    image_size = (32, 32)
    # NOTE(review): the next assignments shadow the function arguments of
    # the same name. They are kept to preserve the current behavior; drop
    # them once callers are expected to control these values.
    batch_size = 50
    output_size = 2
    learningRate = 0.1
    num_epochs = 10
    num_batches = None
    delta = 0.01
    drop_prob = 0.5
    weight_noise = 0.75

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations,
                    3,
                    image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))

    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info(
        "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')

    probs = (convnet.apply(x)).copy(name='probs')

    # Computation graph of the raw predictions; dropout and noise are
    # applied to it before the cost is built.
    cg_probs = ComputationGraph([probs])
    inputs = VariableFilter(roles=[INPUT])(cg_probs.variables)
    weights = VariableFilter(roles=[FILTER, WEIGHT])(cg_probs.variables)

    ############# Regularization #############
    # Weight penalty: mean squared value of every filter/weight, scaled by
    # ``delta``. It is added to the cost further down.
    logger.info('Applying regularization')
    regularization = delta * sum([(W**2).mean() for W in weights])
    probs.name = "reg_probs"

    ############# Gaussian Noise #############

    logger.info('Applying Gaussian noise')
    # NOTE(review): the noised graph is not used by anything below, so the
    # weight noise currently has no effect on training. Left as is because
    # wiring it in changes which graph dropout has to be applied to.
    cg_train = apply_noise(cg_probs, weights, weight_noise)

    ############# Dropout #############

    logger.info('Applying dropout')
    cg_probs = apply_dropout(cg_probs, inputs, drop_prob)

    ############# Batch normalization #############

    # Predictions after dropout. The penalty belongs to the cost: adding it
    # to the probabilities (as was done before) feeds shifted, unnormalized
    # values into the cross-entropy instead of penalizing the weights.
    probs = cg_probs.outputs[0]
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs) +
            regularization).copy(name='cost')
    error_rate = (MisclassificationRate().apply(y.flatten(),
                                                probs).copy(name='error_rate'))
    cg = ComputationGraph([probs, cost, error_rate])
    cg = apply_batch_normalization(cg)

    ########### Loading images #####################

    from fuel.datasets.dogs_vs_cats import DogsVsCats
    from fuel.streams import DataStream, ServerDataStream
    from fuel.schemes import ShuffledScheme
    from fuel.transformers.image import RandomFixedSizeCrop, MinimumImageDimensions, Random2DRotation
    from fuel.transformers import Flatten, Cast, ScaleAndShift

    def create_data(data):
        """Wrap ``data`` in the shuffle/resize/rotate/scale/cast pipeline."""
        stream = DataStream(data,
                            iteration_scheme=ShuffledScheme(
                                data.num_examples, batch_size))
        stream_downscale = MinimumImageDimensions(
            stream, image_size, which_sources=('image_features', ))
        stream_rotate = Random2DRotation(stream_downscale,
                                         which_sources=('image_features', ))
        stream_max = ScikitResize(stream_rotate,
                                  image_size,
                                  which_sources=('image_features', ))
        # Multiply by 1/255 with a zero shift.
        stream_scale = ScaleAndShift(stream_max,
                                     1. / 255,
                                     0,
                                     which_sources=('image_features', ))
        stream_cast = Cast(stream_scale,
                           dtype='float32',
                           which_sources=('image_features', ))

        return stream_cast

    # Both streams are slices of the 'train' split.
    stream_data_train = create_data(
        DogsVsCats(('train', ), subset=slice(0, 20)))
    stream_data_test = create_data(
        DogsVsCats(('train', ), subset=slice(20, 30)))

    # Train with simple SGD
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=learningRate))

    # `Timing` extension reports time for reading data, aggregating a batch and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
        DataStreamMonitoring([cost, error_rate],
                             stream_data_test,
                             prefix="valid"),
        TrainingDataMonitoring(
            [cost, error_rate,
             aggregation.mean(algorithm.total_gradient_norm)],
            prefix="train",
            after_epoch=True),
        ProgressBar(),
        Printing(),
    ]

    logger.info("Building the model")
    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         stream_data_train,
                         model=model,
                         extensions=extensions)

    main_loop.run()
Beispiel #48
0
def main(config, tr_stream, dev_stream, use_bokeh=False, the_task=None, the_track=None):
    """Build and train an RNN encoder-decoder with a Blocks main loop.

    Args:
        config: Dictionary of model and training settings; ``the_task`` is
            stored into it under ``'the_task'``.
        tr_stream: Training data stream, also used by the sampler.
        dev_stream: Validation data stream for the accuracy validator.
        use_bokeh: Not used by this function.
        the_task: Stored in ``config['the_task']``.
        the_track: Not used by this function.
    """
    config['the_task'] = the_task
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        # end_embed is dimension of word embedding matrix in encoder; enc_nhids number of hidden units in encoder GRU
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, config['use_attention'], cost_type=config['error_fct'])
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)
    # NOTE(review): ``testVar`` is not read anywhere below; the call is kept
    # in case ``getTestVar`` has side effects on the decoder.
    testVar = decoder.getTestVar(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    my_rng = numpy.random.RandomState(config['rng_value'])
    if config['identity_init']:
        encoder.weights_init = decoder.weights_init = Identity()
    else:
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.rng = decoder.rng = my_rng

    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    # Recurrent transitions are overridden after the push so that they keep
    # the orthogonal scheme and the shared random state.
    encoder.bidir.prototype.weights_init = Orthogonal()
    encoder.bidir.prototype.rng = my_rng
    decoder.transition.weights_init = Orthogonal()
    decoder.transition.rng = my_rng
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        # ``get_parameters`` matches the accessor used for the parameter
        # listing below; ``list`` makes the values concatenable on any
        # Python version.
        enc_params = list(
            Selector(encoder.lookup).get_parameters().values())
        enc_params += list(
            Selector(encoder.fwd_fork).get_parameters().values())
        enc_params += list(
            Selector(encoder.back_fork).get_parameters().values())
        dec_params = list(Selector(
            decoder.sequence_generator.readout).get_parameters().values())
        dec_params += list(Selector(
            decoder.sequence_generator.fork).get_parameters().values())
        dec_params += list(
            Selector(decoder.state_init).get_parameters().values())
        cg = apply_noise(cg, enc_params + dec_params,
                         config['weight_noise_ff'], seed=my_rng)

    # Train on the (possibly regularized) cost.
    cost = cg.outputs[0]

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    # Track 2 has more data and therefore trains for half as many epochs.
    # The former ``early_stopping`` branches built identical lists, so one
    # list covers all cases; ``//`` keeps the epoch count an integer.
    n_epochs = config['finish_after']
    if 'track2' in config['saveto']:
        n_epochs = n_epochs // 2
    extensions = [
        FinishAfter(after_n_epochs=n_epochs),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]

    want_sampler = config['hook_samples'] >= 1
    want_validator = config['val_set'] is not None

    # Set up the sampling computation graph. Both the sampler and the
    # validator consume it, so it is built when either one is requested.
    if want_sampler or want_validator:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    if want_sampler:
        logger.info("Building sampler")
        # NOTE(review): ``src_vocab_size`` is hard-coded to 8 rather than
        # taken from ``config['src_vocab_size']`` - confirm this is intended.
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=8))

    # Add accuracy-based validation
    if want_validator:
        logger.info("Building accuracy validator")
        extensions.append(
            AccuracyValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              after_training=True,
                              every_n_epochs=5))
    else:
        logger.info("No validation set given for this language")

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # SECURITY NOTE: ``config['step_rule']`` is evaluated as Python code to
    # look up the step-rule class; only use configurations from trusted
    # sources.
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()])
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )

    # Train!
    main_loop.run()
    def __init__(self, ref_data, output_dim):
        """Build the classifier graph and its regularized counterpart.

        An encoder MLP maps rows of ``ref_data`` to a code, from which a
        second MLP generates the weights of an intermediate projection of
        ``x``; a final MLP turns that projection into class scores. A linear
        decoder reconstructs the reference rows from the code and
        contributes a reconstruction penalty to the regularized cost.

        Hyper-parameters (``hidden_dims_*``, ``activation_functions_*``,
        ``n_inter``, ``inter_act_fun``, the ``*_dropout`` rates,
        ``w_noise_std`` and ``reconstruction_penalty``) are read from the
        enclosing scope.

        Sets ``cost``, ``cost_reg``, ``error_rate``, ``error_rate_reg``,
        ``pred`` and ``confidence`` on ``self``.

        Args:
            ref_data: 2-D array-like of reference data; its second dimension
                is the encoder input size. Stored as a float32 shared
                variable.
            output_dim: Output size of the final MLP.
        """
        input_dim = ref_data.shape[1]

        ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32),
                                    name='ref_data')

        # Construct the model
        j = tensor.lvector('j')
        r = ref_data_sh[j, :]  # rows of the reference data selected by ``j``
        x = tensor.fmatrix('x')
        y = tensor.ivector('y')

        # input_dim must be nr
        mlp0 = MLP(activations=activation_functions_0,
                   dims=[input_dim] + hidden_dims_0,
                   name='e0')
        mlp0vs = MLP(activations=[None],
                     dims=[hidden_dims_0[-1], input_dim],
                     name='de0')
        mlp1 = MLP(activations=activation_functions_1,
                   dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter],
                   name='inter_gen')
        mlp2 = MLP(activations=activation_functions_2 + [None],
                   dims=[n_inter] + hidden_dims_2 + [output_dim],
                   name='end_mlp')

        encod = mlp0.apply(r)
        rprime = mlp0vs.apply(encod)  # reconstruction of ``r``
        inter_weights = mlp1.apply(encod)

        ibias = Bias(n_inter)
        ibias.biases_init = Constant(0)
        ibias.initialize()
        inter = inter_act_fun.apply(ibias.apply(tensor.dot(x, inter_weights)))

        final = mlp2.apply(inter)

        cost = Softmax().categorical_cross_entropy(y, final)
        confidence = Softmax().apply(final)

        pred = final.argmax(axis=1)
        error_rate = tensor.neq(y, pred).mean()

        # Initialize parameters
        for brick in [mlp0, mlp0vs, mlp1, mlp2]:
            brick.weights_init = IsotropicGaussian(0.01)
            brick.biases_init = Constant(0.001)
            brick.initialize()

        # apply regularization
        cg = ComputationGraph([cost, error_rate])

        if r_dropout != 0:
            # - dropout on input vector r : r_dropout
            cg = apply_dropout(cg, [r], r_dropout)

        # Tanh outputs that feed ``inter_weights`` (``inter_weights`` itself
        # excluded). Computed unconditionally because the ``a_dropout``
        # branch below needs them to exclude these variables even when
        # ``s_dropout`` is disabled.
        s_dropout_vars = list(
            set(
                VariableFilter(bricks=[Tanh], name='output')
                (ComputationGraph([inter_weights]))) -
            set([inter_weights]))

        if s_dropout != 0:
            # - dropout on intermediate layers of first mlp : s_dropout
            cg = apply_dropout(cg, s_dropout_vars, s_dropout)

        if i_dropout != 0:
            # - dropout on input to second mlp : i_dropout
            cg = apply_dropout(cg, [inter], i_dropout)

        if a_dropout != 0:
            # - dropout on hidden layers of second mlp : a_dropout
            a_dropout_vars = list(
                set(
                    VariableFilter(bricks=[Tanh], name='output')
                    (ComputationGraph([final]))) - set([inter_weights]) -
                set(s_dropout_vars))
            cg = apply_dropout(cg, a_dropout_vars, a_dropout)

        if w_noise_std != 0:
            # - apply noise on weight variables
            weight_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, weight_vars, w_noise_std)

        [cost_reg, error_rate_reg] = cg.outputs

        # add reconstruction penalty for AE part: mean Euclidean distance
        # between the selected reference rows and their reconstruction
        penalty_val = tensor.sqrt(((r - rprime)**2).sum(axis=1)).mean()
        cost_reg = cost_reg + reconstruction_penalty * penalty_val

        self.cost = cost
        self.cost_reg = cost_reg
        self.error_rate = error_rate
        self.error_rate_reg = error_rate_reg
        self.pred = pred
        self.confidence = confidence
Beispiel #50
0
# Training script fragment: builds the data streams, the model cost, the
# optimizer and the monitoring extensions at module level.
# ``hdf5_file``, ``batch_size``, ``vocab_size``, ``hidden_size``,
# ``num_layers``, ``model``, ``dropout``, ``learning_rate``, ``decay_rate``
# and ``step_clipping`` are expected to be defined before this point.
train_stream = get_stream(hdf5_file, 'train', batch_size)
dev_stream = get_stream(hdf5_file, 'dev', batch_size)


# MODEL
# Symbolic inputs are uint8 matrices named after the stream sources.
x = tensor.matrix('features', dtype='uint8')
y = tensor.matrix('targets', dtype='uint8')
# nn_fprop returns the prediction, the training cost and a third value
# bound to ``cells``.
y_hat, cost, cells = nn_fprop(x, y, vocab_size, hidden_size, num_layers, model)

# COST
cg = ComputationGraph(cost)

if dropout > 0:
    # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015)
    inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(cg.variables)
    cg = apply_dropout(cg, inputs, dropout)
    # From here on, train on the cost of the graph with dropout applied.
    cost = cg.outputs[0]

# Learning algorithm
# RMSProp update followed by step clipping, combined in that order.
step_rules = [RMSProp(learning_rate=learning_rate, decay_rate=decay_rate),
              StepClipping(step_clipping)]
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules))

# Extensions
# Mean gradient and step norms are monitored alongside the cost.
gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
step_norm = aggregation.mean(algorithm.total_step_norm)
monitored_vars = [cost, gradient_norm, step_norm]

# Evaluate the cost on the dev stream before the first epoch and after
# every epoch; records are prefixed with "dev".
dev_monitor = DataStreamMonitoring(variables=[cost], after_epoch=True,
                                   before_first_epoch=True, data_stream=dev_stream, prefix="dev")
Beispiel #51
0
    def set_up(self, config=None, make_prunable=False):
        """Loads and initializes all the theano variables for the
        training model and the decoding model.

        Sets ``sampling_input``, ``cost``, ``cg``, ``training_model``,
        ``search_model``, ``samples``, ``encoder`` and ``decoder`` on
        ``self`` (and ``config`` when one is passed in).

        Args:
            config (dict): NMT configuration. If given it replaces
                ``self.config``; otherwise ``self.config`` is used.
            make_prunable (bool): Forwarded to ``Decoder``. It is not
                passed to ``NoLookupDecoder``.
        """
        if config:
            self.config = config
        else:
            config = self.config
        # Create Theano variables
        logging.debug('Creating theano variables')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence_mask = tensor.matrix('target_mask')

        # Construct model (fs439: Add NoLookup options)
        # NOTE: ``logging.fatal`` only logs; construction continues after
        # the unsupported-configuration messages below.
        if config['dec_layers'] != 1:
            logging.fatal("Only dec_layers=1 supported.")
        logging.debug('Building RNN encoder-decoder')
        if config['src_sparse_feat_map']:
            if config['enc_layers'] != 1:
                logging.fatal("Only enc_layers=1 supported for sparse "
                              "source features.")
            source_sentence = tensor.tensor3('source')
            self.sampling_input = tensor.tensor3('input')
            encoder = NoLookupEncoder(config['enc_embed'], config['enc_nhids'])
        else:
            source_sentence = tensor.lmatrix('source')
            self.sampling_input = tensor.lmatrix('input')
            if config['enc_layers'] > 1 and not config['enc_share_weights']:
                encoder = DeepBidirectionalEncoder(
                    config['src_vocab_size'], config['enc_embed'],
                    config['enc_layers'], config['enc_skip_connections'],
                    config['enc_nhids'])
            else:
                encoder = BidirectionalEncoder(config['src_vocab_size'],
                                               config['enc_embed'],
                                               config['enc_layers'],
                                               config['enc_skip_connections'],
                                               config['enc_nhids'])
        if config['trg_sparse_feat_map']:
            target_sentence = tensor.tensor3('target')
            decoder = NoLookupDecoder(
                config['trg_vocab_size'], config['dec_embed'],
                config['dec_nhids'], config['att_nhids'],
                config['maxout_nhids'], config['enc_nhids'] * 2,
                config['attention'], config['dec_attention_sources'],
                config['dec_readout_sources'], config['memory'],
                config['memory_size'], config['seq_len'], config['dec_init'])
        else:
            target_sentence = tensor.lmatrix('target')
            decoder = Decoder(config['trg_vocab_size'],
                              config['dec_embed'],
                              config['dec_nhids'],
                              config['att_nhids'],
                              config['maxout_nhids'],
                              config['enc_nhids'] * 2,
                              config['attention'],
                              config['dec_attention_sources'],
                              config['dec_readout_sources'],
                              config['memory'],
                              config['memory_size'],
                              config['seq_len'],
                              config['dec_init'],
                              make_prunable=make_prunable)
        if config['annotations'] != 'direct':
            # ``annotations`` is a comma-separated list of strategy names.
            annotators = []
            add_direct = False
            for name in config['annotations'].split(','):
                if name == 'direct':
                    add_direct = True
                elif name == 'hierarchical':
                    annotators.append(HierarchicalAnnotator(encoder))
                else:
                    logging.fatal("Annotation strategy %s unknown" % name)
            encoder = EncoderWithAnnotators(encoder, annotators, add_direct)
        annotations, annotations_mask = encoder.apply(source_sentence,
                                                      source_sentence_mask)
        self.cost = decoder.cost(annotations, annotations_mask,
                                 target_sentence, target_sentence_mask)

        logging.info('Creating computational graph')
        self.cg = ComputationGraph(self.cost)

        # Initialize model
        logging.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        try:
            encoder.bidir.prototype.weights_init = Orthogonal()
        except AttributeError:
            pass  # Its fine, no bidirectional encoder
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logging.info('Applying dropout')
            dropout_inputs = [
                x for x in self.cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            self.cg = apply_dropout(self.cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logging.info('Applying weight noise to ff layers')
            # Start from an empty list so that encoders without a lookup
            # table do not hit an unbound ``enc_params``; ``list`` makes the
            # ``values()`` results concatenable on any Python version.
            enc_params = []
            if encoder.lookup:
                enc_params += list(
                    Selector(encoder.lookup).get_parameters().values())
            enc_params += list(
                Selector(encoder.fwd_fork).get_parameters().values())
            enc_params += list(
                Selector(encoder.back_fork).get_parameters().values())
            dec_params = list(Selector(
                decoder.sequence_generator.readout).get_parameters().values())
            dec_params += list(Selector(
                decoder.sequence_generator.fork).get_parameters().values())
            self.cg = apply_noise(self.cg, enc_params + dec_params,
                                  config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in self.cg.parameters]
        logging.debug("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logging.debug('    {:15}: {}'.format(shape, count))
        logging.debug("Total number of CG parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logging.debug("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logging.debug('    {:15}: {}'.format(value.get_value().shape,
                                                 name))
        logging.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logging.info("Building model")
        self.training_model = Model(self.cost)

        logging.info("Building sampling model")
        # The last two dimensions are used so the same code serves both the
        # 2-D (word id) and 3-D (sparse feature) sampling inputs.
        src_shape = (self.sampling_input.shape[-2],
                     self.sampling_input.shape[-1])  # batch_size x sen_length
        sampling_representation, _ = encoder.apply(self.sampling_input,
                                                   tensor.ones(src_shape))
        generated = decoder.generate(src_shape, sampling_representation)
        self.search_model = Model(generated)
        generated_outputs = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        self.samples = generated_outputs[1]
        self.encoder = encoder
        self.decoder = decoder
Beispiel #52
0
    def __init__(self, config, vocab_size):
        """Build the bilinear-attention answer-extraction graph.

        Declares the symbolic inputs, encodes the question with a
        bidirectional LSTM stack, encodes the context with a second stack
        that is fed the context embedding concatenated with the question
        encoding, and combines two bilinear attentions ("start" and "end")
        into per-position weights.  The training cost is the masked squared
        error between the normalised weights and a normalised indicator of
        the context positions whose token occurs in the answer.

        Parameters
        ----------
        config
            Model configuration.  Attributes read here: ``embed_size``,
            ``question_lstm_size``, ``question_skip_connections``,
            ``ctx_lstm_size``, ``ctx_skip_connections``, ``w_noise``,
            ``dropout``, ``weights_init`` and ``biases_init``.
        vocab_size
            Number of entries of the shared embedding lookup table.

        Notes
        -----
        Sets ``self.predictions``, ``self.sgd_cost``, ``self.monitor_vars``,
        ``self.monitor_vars_valid`` and ``self.analyse_vars``.
        """
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        # Swap the two axes of every input.
        # NOTE(review): presumably (batch, time) -> (time, batch); confirm
        # against the data stream.
        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        # answer_mask is declared for the input interface but is not used
        # anywhere else in this graph.
        answer_mask = answer_mask.dimshuffle(1, 0)

        # Embed questions and context.  The lookup table is kept out of
        # `bricks` because it gets its own initialisation scheme.
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding: last step (index -1 on the first
        # axis) of the hidden states, concatenated along the feature axis.
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding.  Every context position sees the
        # question encoding next to its own embedding.
        cembed = embed.apply(context)
        cqembed = tensor.concatenate([
            cembed,
            tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0)
        ],
                                     axis=2)
        clstms, chidden_list = make_bidir_lstm_stack(
            cqembed, config.embed_size + qenc_dim,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            # Fixed: the width of the context encoding comes from the
            # context LSTM sizes, not from the question LSTM sizes.
            cenc_dim = 2 * config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism Bilinear
        flat_cenc = cenc.reshape(
            (cenc.shape[0] * cenc.shape[1], cenc.shape[2]))

        def bilinear_attention(name):
            # Project the context states to question width, take the dot
            # product with the question encoding and softmax over the first
            # axis of the context.
            projection = Linear(input_dim=cenc_dim,
                                output_dim=qenc_dim,
                                name=name)
            # Fixed: the projection outputs `qenc_dim` features, so that is
            # the last axis to restore (it used to be cenc.shape[2], which
            # only works when both encodings have the same width).
            projected = projection.apply(flat_cenc).reshape(
                (cenc.shape[0], cenc.shape[1], qenc_dim))
            scores = (qenc[None, :, :] * projected).sum(axis=2)
            return projection, tensor.nnet.softmax(scores.T).T

        attention_clinear_1, att_start = bilinear_attention('attc_1')
        bricks += [attention_clinear_1]
        attention_clinear_2, att_end = bilinear_attention('attc_2')
        bricks += [attention_clinear_2]

        # Square index grids over the context positions:
        # col_idx[i, j] == j and row_idx[i, j] == i.
        positions = theano.tensor.arange(context.shape[0])
        col_idx = tensor.tile(positions[None, :], (context.shape[0], 1))
        row_idx = tensor.tile(positions[:, None], (1, context.shape[0]))

        # Row i sums the start weights of positions j <= i ...
        att_start = tensor.dot(tensor.le(col_idx, row_idx), att_start)
        # ... and the end weights of positions j >= i.
        att_end = tensor.dot(tensor.ge(col_idx, row_idx), att_end)

        # add attention from left and right
        att_weights = att_start * att_end

        # 1 where the context token equals any answer token, else 0.
        att_target = tensor.eq(
            tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)),
            tensor.tile(context[:, None, :],
                        (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1)

        # Context tokens whose (un-normalised) weight exceeds 0.25; all
        # other positions become 0.
        self.predictions = tensor.gt(att_weights, 0.25) * context

        # Normalise both along the first axis; the epsilon avoids a
        # division by zero.
        att_target = att_target / (att_target.sum(axis=0) + 0.00001)
        att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001)

        #cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum()
        cost = (((att_weights - att_target)**2) *
                context_mask).sum() / context_mask.sum()

        # Apply weight noise and dropout to obtain the regularised cost.
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'
        att_start.name = 'att_start'
        att_end.name = 'att_end'
        att_weights.name = 'att_weights'
        att_target.name = 'att_target'
        self.predictions.name = 'pred'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        self.analyse_vars = [
            cost, self.predictions, att_start, att_end, att_weights, att_target
        ]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
def main(feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=None, num_batches=None):
    # Entry point of a training script.  Each argument left as None falls back
    # to the hard-coded default assigned below.
    if feature_maps is None:
        feature_maps = [32, 48, 64, 80, 128, 128]
    if mlp_hiddens is None:
        mlp_hiddens = [1000]
    if conv_sizes is None:
        conv_sizes = [7, 5, 5, 5, 5, 4]
    if pool_sizes is None:
        pool_sizes = [3, 2, 2, 2, 2, 1]
    if batch_size is None:
        batch_size = 64
    conv_steps = [2, 1, 1, 1, 1, 1]  # same as stride
    image_size = (256, 256)
    output_size = 2
    learningRate = 0.001
    drop_prob = 0.5
    weight_noise = 0.75
    num_epochs = 250
    # NOTE(review): this unconditionally overwrites the `num_batches`
    # argument, so a value passed by the caller is ignored -- confirm intent.
    num_batches = None
    # NOTE(review): from the next line down to `except ImportError` this copy
    # of the file looks truncated/redacted (a string literal is fused with the
    # tail of a later call and the matching `try:` is missing).  Restore the
    # section from the original source before running.
    host_plot = "http://*****:*****@ %s" % (graph_name, datetime.datetime.now(), socket.gethostname()),
                    channels=[["train_error_rate", "valid_error_rate"], ["train_total_gradient_norm"]],
                    after_epoch=True,
                    server_url=host_plot,
                )
            )
            PLOT_AVAILABLE = True
        except ImportError:
            PLOT_AVAILABLE = False
        extensions.append(Checkpoint(save_to, after_epoch=True, after_training=True, save_separately=["log"]))

    logger.info("Building the model")

    # Wrap the cost in a Blocks Model and hand everything to the main loop.
    model = Model(cost)

    ########### Loading images #####################
    main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions)

    main_loop.run()
Beispiel #54
0
def main(config,
         tr_stream,
         dev_stream,
         source_vocab,
         target_vocab,
         use_bokeh=False):
    """Build and train an encoder-decoder with an initial-context decoder.

    Parameters
    ----------
    config : dict
        Experiment configuration.  Keys read with ``[]`` (and therefore
        required): ``src_vocab_size``, ``enc_embed``, ``enc_nhids``,
        ``trg_vocab_size``, ``dec_embed``, ``dec_nhids``, ``context_dim``,
        ``weight_scale``, ``dropout``, ``weight_noise_ff``, ``saveto``,
        ``finish_after``, ``save_freq``, ``hook_samples``, ``reload``,
        ``step_clipping`` and ``step_rule``.  Keys read only in some
        branches: ``l2_regularization_alpha``, ``config_file``,
        ``sampling_freq``, ``normalized_bleu``, ``bleu_val_freq`` and
        ``model_save_directory``.  Optional keys read with ``get``:
        ``target_transition``, ``l2_regularization``, ``bleu_script`` and
        ``meteor_directory``.
    tr_stream
        Training data stream handed to the main loop and the sampler.
    dev_stream
        Validation data stream handed to the BLEU / Meteor validators.
    source_vocab, target_vocab
        Vocabularies forwarded to the sampler and the validators.
    use_bokeh : bool
        Add a live plot extension when Bokeh is available.

    The function returns nothing; it runs the main loop until it finishes.
    """

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    initial_context = tensor.matrix('initial_context')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])

    # let user specify the target transition class name in config,
    # eval it and pass to decoder
    # SECURITY(review): `eval` executes whatever string the config holds.
    # Only use trusted configuration files, or replace this with an explicit
    # name -> class mapping.
    target_transition_name = config.get(
        'target_transition', 'GRUInitialStateWithInitialStateSumContext')
    target_transition = eval(target_transition_name)

    logger.info('Using target transition: {}'.format(target_transition_name))
    decoder = InitialContextDecoder(config['trg_vocab_size'],
                                    config['dec_embed'], config['dec_nhids'],
                                    config['enc_nhids'] * 2,
                                    config['context_dim'], target_transition)

    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask, initial_context)

    cost.name = 'decoder_cost'

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    # The recurrent transitions override the pushed Gaussian scheme.
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    # TODO: validate performance with/without regularization
    if config.get('l2_regularization', False) is True:
        l2_reg_alpha = config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)

        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

        # `cost + ...` creates a new variable, so the name assigned above
        # does not carry over and has to be set again.
        cost.name = 'decoder_cost_cost'

    # Rebuild the graph so that it contains the (possibly regularised) cost.
    cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions

    # TODO: add checking for existing model and loading
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Create the theano variables that we need for the sampling graph
    sampling_input = tensor.lmatrix('input')
    sampling_context = tensor.matrix('context_input')

    use_sampler = config['hook_samples'] >= 1
    use_bleu = config.get('bleu_script', None) is not None
    use_meteor = config.get('meteor_directory', None) is not None

    # Set up beam search and sampling computation graphs if necessary.
    # Fixed: the Meteor validator needs `search_model` and `samples` as well;
    # previously a Meteor-only configuration raised a NameError below.
    if use_sampler or use_bleu or use_meteor:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))

        generated = decoder.generate(sampling_input, sampling_representation,
                                     sampling_context)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    if use_sampler:
        logger.info("Building sampler")
        extensions.append(
            Sampler(
                model=search_model,
                data_stream=tr_stream,
                hook_samples=config['hook_samples'],
                every_n_batches=config['sampling_freq'],
                src_vocab=source_vocab,
                trg_vocab=target_vocab,
                src_vocab_size=config['src_vocab_size'],
            ))

    # Add early stopping based on bleu
    if use_bleu:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input,
                          sampling_context,
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Add early stopping based on Meteor
    # (it reuses the BLEU normalisation and frequency settings)
    if use_meteor:
        logger.info("Building meteor validator")
        extensions.append(
            MeteorValidator(sampling_input,
                            sampling_context,
                            samples=samples,
                            config=config,
                            model=search_model,
                            data_stream=dev_stream,
                            src_vocab=source_vocab,
                            trg_vocab=target_vocab,
                            normalize=config['normalized_bleu'],
                            every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[[
                     'decoder_cost', 'validation_set_bleu_score',
                     'validation_set_meteor_score'
                 ]],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # if there is dropout or random noise, we need to use the output of the modified graph
    if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0:
        training_cost = cg.outputs[0]
    else:
        training_cost = cost
    # SECURITY(review): `eval` on the configured step rule name -- trusted
    # configuration only.
    algorithm = GradientDescent(cost=training_cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()
                                ]))

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Beispiel #55
0
def main(config, tr_stream, dev_stream):
    """Train the encoder-decoder translation model described by ``config``.

    Builds the bidirectional encoder and the decoder, optionally applies
    dropout and weight noise to the cost graph, logs the parameter shapes
    and names, sets up the gradient descent algorithm (a subtensor-aware
    variant when the module-level ``args.subtensor_fix`` is set), builds the
    sampling graph and the extensions, and finally runs the main loop.

    ``tr_stream`` feeds training and the sampler, ``dev_stream`` feeds the
    BLEU validator.  Nothing is returned.
    """

    # Symbolic inputs
    src = tensor.lmatrix('source')
    src_mask = tensor.matrix('source_mask')
    trg = tensor.lmatrix('target')
    trg_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Bricks and training cost
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'],
        config['dec_nhids'], config['enc_nhids'] * 2)
    encoded_source = encoder.apply(src, src_mask)
    cost = decoder.cost(encoded_source, src_mask, trg, trg_mask)

    # Initialisation: one shared Gaussian scheme and zero biases, pushed
    # down to the children, with orthogonal recurrent weights on top.
    gaussian_init = IsotropicGaussian(config['weight_scale'])
    zero_init = Constant(0)
    for top_brick in (encoder, decoder):
        top_brick.weights_init = gaussian_init
    for top_brick in (encoder, decoder):
        top_brick.biases_init = zero_init
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    cg = ComputationGraph(cost)

    # Regularisation 1: dropout on every maxout output (as in ghog)
    if config['dropout'] < 1.0:
        maxout_outputs = []
        for var in cg.intermediary_variables:
            if var.name == 'maxout_apply_output':
                maxout_outputs.append(var)
        cg = apply_dropout(cg, maxout_outputs, config['dropout'])

    # Regularisation 2: noise on the feed-forward weights
    if config['weight_noise_ff'] > 0.0:
        def params_of(brick):
            return Selector(brick).get_params().values()

        enc_params = (params_of(encoder.lookup) +
                      params_of(encoder.fwd_fork) +
                      params_of(encoder.back_fork))
        dec_params = (params_of(decoder.sequence_generator.readout) +
                      params_of(decoder.sequence_generator.fork) +
                      params_of(decoder.transition.initial_transformer))
        cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff'])

    # From here on, train on the output of the (possibly modified) graph.
    cost = cg.outputs[0]

    # Log how many parameters have each shape
    shapes = [p.get_value().shape for p in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Log every parameter by name
    enc_dec_param_dict = merge(Selector(encoder).get_params(),
                               Selector(decoder).get_params())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.iteritems():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(len(enc_dec_param_dict)))

    # Training algorithm
    if args.subtensor_fix:
        assert config['step_rule'] == 'AdaDelta'
        from subtensor_gradient import GradientDescent_SubtensorFix, AdaDelta_SubtensorFix, subtensor_params
        lookups = subtensor_params(cg, [encoder.lookup, decoder.sequence_generator.readout.feedback_brick.lookup])
        step_rules = [StepClipping(config['step_clipping']),
                      RemoveNotFinite(0.9),
                      AdaDelta_SubtensorFix(subtensor_params=lookups)]
        algorithm = GradientDescent_SubtensorFix(
            subtensor_params=lookups,
            cost=cost, params=cg.parameters,
            step_rule=CompositeRule(step_rules))
    else:
        step_rules = [StepClipping(config['step_clipping']),
                      RemoveNotFinite(0.9),
                      eval(config['step_rule'])()]
        algorithm = GradientDescent(
            cost=cost, params=cg.parameters,
            step_rule=CompositeRule(step_rules))

    # Beam search and sampling graphs
    all_ones_mask = tensor.ones(sampling_input.shape)
    sampling_representation = encoder.apply(sampling_input, all_ones_mask)
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)
    output_filter = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")
    # generated[1] is the next_outputs; exactly one match is expected
    samples, = output_filter(ComputationGraph(generated[1]))

    # Training model
    training_model = Model(cost)

    # Extensions
    sampler = Sampler(
        model=search_model, config=config, data_stream=tr_stream,
        src_eos_idx=config['src_eos_idx'],
        trg_eos_idx=config['trg_eos_idx'],
        every_n_batches=config['sampling_freq'])
    bleu_validator = BleuValidator(
        sampling_input, samples=samples, config=config,
        model=search_model, data_stream=dev_stream,
        src_eos_idx=config['src_eos_idx'],
        trg_eos_idx=config['trg_eos_idx'],
        every_n_batches=config['bleu_val_freq'])
    extensions = [
        sampler,
        bleu_validator,
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        Dump(config['saveto'], every_n_batches=config['save_freq']),
    ]

    # Resume from a previous dump when requested
    if config['reload']:
        extensions.append(LoadFromDumpWMT15(config['saveto']))

    # Main loop
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )

    # Train!
    main_loop.run()
    def __init__(self, config, vocab_size, id_to_vocab, logger):
        """Build the attentive-reader graph with an MLP attention.

        Question and context are embedded with one shared lookup table and
        encoded by separate bidirectional LSTM stacks.  An MLP scores every
        context position against the question encoding, the softmax of the
        scores weights the context states into ``attended``, and an output
        MLP maps ``[attended, qenc]`` to one score per entity.  Entities
        that are not among the candidates are pushed to -1000 before the
        softmax cross-entropy and the argmax.

        ``config`` supplies the sizes, activations, regularisation
        strengths and initialisation schemes.  ``id_to_vocab`` and
        ``logger`` are accepted but not used in this constructor.

        Sets ``self.sgd_cost``, ``self.monitor_vars`` and
        ``self.monitor_vars_valid``.
        """
        # Symbolic inputs
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.ivector('answer')
        candidates = tensor.imatrix('candidates')
        candidates_mask = tensor.imatrix('candidates_mask')

        # Swap the two axes of the sequence inputs and their masks
        q_tokens = question.dimshuffle(1, 0)
        q_mask = question_mask.dimshuffle(1, 0)
        c_tokens = context.dimshuffle(1, 0)
        c_mask = context_mask.dimshuffle(1, 0)

        # Shared embedding for questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
        bricks = [embed]

        qembed = embed.apply(q_tokens)
        cembed = embed.apply(c_tokens)

        float_x = theano.config.floatX
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size, q_mask.astype(float_x),
            config.question_lstm_size, config.question_skip_connections, 'q')
        clstms, chidden_list = make_bidir_lstm_stack(
            cembed, config.embed_size, c_mask.astype(float_x),
            config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
        bricks = bricks + qlstms + clstms

        # Question encoding: last step of the hidden states, concatenated
        # over all layers or only over the last two entries of the list.
        if config.question_skip_connections:
            qenc_dim = 2*sum(config.question_lstm_size)
            q_states = qhidden_list
        else:
            qenc_dim = 2*config.question_lstm_size[-1]
            q_states = qhidden_list[-2:]
        qenc = tensor.concatenate([h[-1,:,:] for h in q_states], axis=1)
        qenc.name = 'qenc'

        # Context encoding: the full sequences, concatenated on the
        # feature axis.
        if config.ctx_skip_connections:
            cenc_dim = 2*sum(config.ctx_lstm_size)
            c_states = chidden_list
        else:
            cenc_dim = 2*config.ctx_lstm_size[-1]
            c_states = chidden_list[-2:]
        cenc = tensor.concatenate(c_states, axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP
        att_hidden_dim = config.attention_mlp_hidden[0]
        attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] + [Identity()],
                            name='attention_mlp')
        attention_qlinear = Linear(input_dim=qenc_dim, output_dim=att_hidden_dim, name='attq')
        attention_clinear = Linear(input_dim=cenc_dim, output_dim=att_hidden_dim,
                                   use_bias=False, name='attc')
        bricks += [attention_mlp, attention_qlinear, attention_clinear]

        # First layer: context projection plus the question projection
        # broadcast over the first axis.
        flat_cenc = cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2]))
        ctx_part = attention_clinear.apply(flat_cenc).reshape(
            (cenc.shape[0], cenc.shape[1], att_hidden_dim))
        question_part = attention_qlinear.apply(qenc)[None, :, :]
        layer1 = Tanh().apply(ctx_part + question_part)
        layer1.name = 'layer1'

        flat_layer1 = layer1.reshape((layer1.shape[0]*layer1.shape[1], layer1.shape[2]))
        att_weights = attention_mlp.apply(flat_layer1)
        att_weights.name = 'att_weights_0'
        att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
        att_weights.name = 'att_weights'

        # Attention-weighted sum of the context states
        att_probs = tensor.nnet.softmax(att_weights.T).T
        attended = tensor.sum(cenc * att_probs[:, :, None], axis=0)
        attended.name = 'attended'

        # Output MLP over the attended context and the question encoding
        out_mlp = MLP(dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden + [config.n_entities],
                      activations=config.out_mlp_activations + [Identity()],
                      name='out_mlp')
        bricks += [out_mlp]
        probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
        probs.name = 'probs'

        # Non-zero for every entity id that occurs among the unmasked
        # candidates; masked candidate slots are replaced by -1, which
        # matches no entity id.
        entity_ids = tensor.arange(config.n_entities, dtype='int32')[None, None, :]
        visible_candidates = tensor.switch(
            candidates_mask, candidates, -tensor.ones_like(candidates))
        is_candidate = tensor.eq(entity_ids, visible_candidates[:, :, None]).sum(axis=1)
        probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs))

        # Prediction, cost and error rate
        pred = probs.argmax(axis=1)
        cost = Softmax().categorical_cross_entropy(answer, probs).mean()
        error_rate = tensor.neq(answer, pred).mean()

        # Weight noise and dropout give the regularised training outputs
        cg = ComputationGraph([cost, error_rate])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        cost_reg, error_rate_reg = cg.outputs

        # Regularised and plain variables share their names
        for variable in (cost_reg, cost):
            variable.name = 'cost'
        for variable in (error_rate_reg, error_rate):
            variable.name = 'error_rate'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg], [error_rate_reg]]
        self.monitor_vars_valid = [[cost], [error_rate]]

        # Initialize bricks
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
def train(save_to, num_epochs, feature_maps=None, mlp_hiddens=None,
          conv_sizes=None, pool_sizes=None, batch_size=500):
    """Train a ConvMLP classifier on CIFAR10 and dump the trained network.

    Builds a three-layer convolutional network topped by an MLP, applies
    dropout to a subset of the rectifier outputs, trains with AdaDelta
    while monitoring cost and error rate, and finally serializes the
    network to ``convmlp_cifar10.zip`` in the current directory.

    Parameters
    ----------
    save_to : str
        Destination handed to the ``Checkpoint`` extension.
    num_epochs : int
        Number of epochs after which ``FinishAfter`` stops training.
    feature_maps : list of int, optional
        Feature maps per convolutional layer; must contain exactly three
        entries. Defaults to ``[20, 30, 30]``.
    mlp_hiddens : list of int, optional
        Hidden layer sizes of the top MLP. Defaults to ``[500]``.
    conv_sizes : list of int, optional
        Side length of the (square) filters of each convolutional layer.
        Defaults to ``5`` for every layer.
    pool_sizes : list of int, optional
        Side length of the (square) pooling window of each layer.
        Defaults to ``2`` for every layer.
    batch_size : int, optional
        Batch size used for both the training and the test stream.

    Raises
    ------
    ValueError
        If ``feature_maps`` does not have one entry per convolutional
        layer.

    """
    # Kept as a function-local import, as in the original code.
    from blocks.roles import OUTPUT

    # Initialize the training set
    train = CIFAR10(("train",))
    train_stream = DataStream.default_stream(
        train, iteration_scheme=ShuffledScheme(
            train.num_examples, batch_size))

    test = CIFAR10(("test",))
    test_stream = DataStream.default_stream(
        test,
        iteration_scheme=ShuffledScheme(
            test.num_examples, batch_size))

    # ConvMLP Parameters
    image_size = (32, 32)
    num_channels = 3
    num_conv = 3  # Number of Convolutional Layers
    if feature_maps is None:
        feature_maps = [20, 30, 30]
    # Validate caller-supplied values too: previously this check sat inside
    # the ``is None`` branch and therefore only ever saw the default.
    if len(feature_maps) != num_conv:
        raise ValueError('Must specify more feature maps')
    if conv_sizes is None:
        conv_sizes = [5] * num_conv
    if pool_sizes is None:
        pool_sizes = [2] * num_conv
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    output_size = 10

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    # ``zip`` is materialized so the sizes are real lists (a bare ``zip`` is
    # a one-shot iterator on Python 3).
    convnet = ConvMLP(conv_activations, num_channels, image_size,
                      filter_sizes=list(zip(conv_sizes, conv_sizes)),
                      feature_maps=feature_maps,
                      pooling_sizes=list(zip(pool_sizes, pool_sizes)),
                      top_mlp_activations=mlp_activations,
                      top_mlp_dims=mlp_hiddens + [output_size],
                      border_mode='full',
                      weights_init=Uniform(width=.2),
                      biases_init=Constant(0))

    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    for i in range(num_conv):
        convnet.layers[i].weights_init = Uniform(width=.2)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        logging.info("Layer {} dim: {} {} {}".format(
            i, *layer.get_dim('output')))

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Apply the convnet to the raw input (no normalization is done here)
    probs = convnet.apply(x)
    cost = named_copy(CategoricalCrossEntropy().apply(y.flatten(),
                      probs), 'cost')
    error_rate = named_copy(MisclassificationRate().apply(y.flatten(), probs),
                            'error_rate')

    cg = ComputationGraph([cost, error_rate])

    # Apply dropout to outputs of rectifiers. Unnamed variables are skipped
    # instead of crashing on ``None.startswith``.
    outputs = VariableFilter(roles=[OUTPUT])(cg.variables)
    rectifier_outputs = [v for v in outputs
                         if v.name is not None
                         and v.name.startswith('rectifier')]
    # The last two matches are excluded from dropout (the original comment
    # described this as "only first two layers").
    rectifier_outputs = rectifier_outputs[0:-2]
    cg = apply_dropout(cg, rectifier_outputs, 0.5)

    # ``apply_dropout`` returns a *new* graph; the original ``cost`` and
    # ``error_rate`` are untouched. Training must use the outputs of the new
    # graph, otherwise dropout has no effect at all.
    train_cost, train_error_rate = cg.outputs
    train_cost.name = 'cost'
    train_error_rate.name = 'error_rate'

    # Train with AdaDelta on the dropout-regularized cost
    algorithm = GradientDescent(
        cost=train_cost, parameters=cg.parameters,
        step_rule=AdaDelta())

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    # Test monitoring uses the dropout-free graph, training monitoring the
    # graph that is actually being optimized.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  DataStreamMonitoring(
                      [cost, error_rate],
                      test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  ProgressBar(),
                  Printing()]

    # Same objective as the algorithm so model and optimizer agree.
    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()
    classifier_fn = 'convmlp_cifar10.zip'
    # The serialized network is binary data, so open the file in binary mode.
    with open(classifier_fn, 'wb') as f:
        dump(convnet, f)