def maxout_vae_mnist_test(path_vae_mnist):

    # load vae model on mnist
    vae_mnist = load(path_vae_mnist)
    maxout = Maxout()
    x = T.matrix('features')
    y = T.imatrix('targets')
    batch_size = 128
    z, _ = vae_mnist.sampler.sample(vae_mnist.encoder_mlp.apply(x))
    predict = maxout.apply(z)

    cost = Softmax().categorical_cross_entropy(y.flatten(), predict)
    y_hat = Softmax().apply(predict)
    cost.name = 'cost'
    cg = ComputationGraph(cost)

    for i, t in enumerate(cg.parameters):
        t.name = t.name + str(i) + "maxout"

    error_brick = MisclassificationRate()
    error_rate = error_brick.apply(y.flatten(), y_hat)

    # training
    step_rule = RMSProp(0.01, 0.9)
    #step_rule = Momentum(0.2, 0.9)
    train_set = MNIST('train')
    test_set = MNIST("test")

    data_stream_train = Flatten(DataStream.default_stream(
            train_set, iteration_scheme=SequentialScheme(train_set.num_examples, batch_size)))

    data_stream_test = Flatten(DataStream.default_stream(
            test_set, iteration_scheme=SequentialScheme(test_set.num_examples, batch_size)))

    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=step_rule)

    monitor_train = TrainingDataMonitoring(
        variables=[cost], prefix="train")
    monitor_valid = DataStreamMonitoring(
        variables=[cost, error_rate], data_stream=data_stream_test, prefix="test")


    extensions = [  monitor_train,
                    monitor_valid,
                    FinishAfter(after_n_epochs=50),
                    Printing(every_n_epochs=1)
                  ]

    main_loop = MainLoop(data_stream=data_stream_train,
                        algorithm=algorithm, model = Model(cost),
                        extensions=extensions)
    main_loop.run()

    # save here
    from blocks.serialization import dump
    with closing(open('../data_mnist/maxout', 'w')) as f:
        dump(maxout, f)
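A minimal sketch, assuming the dump above succeeded, of reloading the serialized brick in a later session (blocks.serialization.load is the counterpart of dump used here):

from contextlib import closing
from blocks.serialization import load
with closing(open('../data_mnist/maxout')) as f:
    maxout_reloaded = load(f)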
Example #2
 def _get_full_cost(self, input_vec, *args, **kwargs):
     preds = self._get_pred_dist(input_vec)
     cost = Softmax().categorical_cross_entropy(self.hashtag, preds).mean()
     max_index = preds.argmax(axis=1)
     cost.name = 'full_cost'
     ranks = tensor.argsort(preds, axis=1)[:, ::-1]
     top1_accuracy = tensor.eq(self.hashtag, ranks[:, 0]).mean()
     top10_accuracy = tensor.sum(tensor.eq(ranks[:, 0:self.rank],
                                           self.hashtag[:, None]),
                                 axis=1).mean()
     top1_accuracy.name = "top1_accuracy"
     top10_accuracy.name = "top10_accuracy"
     cost_drop, top1_accuracy_drop, top10_accuracy_drop = self._apply_dropout(
         [cost, top1_accuracy, top10_accuracy])
     cost_drop.name = cost.name
     top1_accuracy_drop.name = top1_accuracy.name
     top10_accuracy_drop.name = top10_accuracy.name
     self.full_monitor_train_vars = [[cost_drop], [top1_accuracy_drop],
                                     [top10_accuracy_drop]]
     self.full_cost = cost_drop
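The argsort-based ranking above is easiest to verify on a toy array; a small numpy sketch of the same top-k logic (illustrative values only):

import numpy as np
preds = np.array([[0.1, 0.7, 0.2],
                  [0.5, 0.3, 0.2]])
targets = np.array([1, 2])
ranks = np.argsort(preds, axis=1)[:, ::-1]                    # labels sorted by score, best first
top1 = (ranks[:, 0] == targets).mean()                        # 0.5: only the first row is correct
top2 = (ranks[:, :2] == targets[:, None]).sum(axis=1).mean()  # 0.5: row 1's label is not in its top 2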
Example #3
 def _get_train_cost(self, input_vec, *args, **kwargs):
     preds = self._get_pred_dist(input_vec)
     cost = Softmax().categorical_cross_entropy(self.hashtag, preds).mean()
     #Apply regularization
     cost = self._apply_reg(cost)
     cost.name = 'cost'
     ranks = tensor.argsort(preds, axis=1)[:, ::-1]
     top1_accuracy = tensor.eq(self.hashtag, ranks[:, 0]).mean()
     top10_accuracy = tensor.sum(tensor.eq(ranks[:, 0:self.rank],
                                           self.hashtag[:, None]),
                                 axis=1).mean()
     top1_accuracy.name = "top1_accuracy"
     top10_accuracy.name = "top10_accuracy"
     #Apply dropout
     cost_drop, top1_accuracy_drop, top10_accuracy_drop = self._apply_dropout(
         [cost, top1_accuracy, top10_accuracy])
     cost_drop.name = cost.name
     top1_accuracy_drop.name = top1_accuracy.name
     top10_accuracy_drop.name = top10_accuracy.name
     self.monitor_train_vars = [[cost_drop], [top1_accuracy_drop],
                                [top10_accuracy_drop]]
     self.cg_generator = cost_drop
Example #4
data2 = data2[:, :, d1:d1+data1.shape[2], :]
# max pool
data2 = maxpool2.apply(data2)
# activation
data2 = Tanh(name='act_data2').apply(data2)


# fully connected layers
fc = MLP(dims=[25*50, 100, 100, num_output_classes],
         activations=[Rectifier(name='r1'), Rectifier(name='r2'), Identity()])
output = fc.apply(data2.reshape((data2.shape[0], 25*50)))


#       COST AND ERROR MEASURE
cost = Softmax().categorical_cross_entropy(label, output).mean()
cost.name = 'cost'

error_rate = tensor.neq(tensor.argmax(output, axis=1), label).mean()
error_rate.name = 'error_rate'


#       REGULARIZATION
cg = ComputationGraph([cost, error_rate])
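# apply_noise / apply_dropout below return new graphs in which the listed variables are
# replaced by noisy or dropped-out versions; the regularized cost and error rate are then
# read back from cg.outputs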
if weight_noise > 0:
    noise_vars = VariableFilter(roles=[WEIGHT])(cg)
    cg = apply_noise(cg, noise_vars, weight_noise)
if dropout > 0:
    cg = apply_dropout(
        cg,
        [eeg1, eeg2, data1, data2] +
        VariableFilter(name='output', bricks=fc.linear_transformations[:-1])(cg),
        dropout)
# for vfilter, p in dropout_locs:
#     cg = apply_dropout(cg, vfilter(cg), p)
[cost_reg, error_rate_reg] = cg.outputs
def build_model_soft(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence(
                    [lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]

    # Build the MLP
    dims = [2 * state_dim]
    activations = []
    for i in range(args.mlp_layers):
        activations.append(Rectifier())
        dims.append(state_dim)

    # Activation of the last layer of the MLP
    if args.mlp_activation == "logistic":
        activations.append(Logistic())
    elif args.mlp_activation == "rectifier":
        activations.append(Rectifier())
    elif args.mlp_activation == "hard_logistic":
        activations.append(HardLogistic())
    else:
        assert False

    # Output of MLP has dimension 1
    dims.append(1)

    for i in range(layers - 1):
        mlp = MLP(activations=activations, dims=dims,
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            SoftGatedRecurrent(dim=state_dim,
                               mlp=mlp,
                               activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(
        input_dim=layers * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have:
    # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...]

    # Extract gate_values
    gate_values = h[2::2]
    new_h = [h[0]]
    new_h.extend(h[1::2])
    h = new_h

    # Now we have:
    # h = [state, state_1, state_2, ...]
    # gate_values = [gate_value_1, gate_value_2, gate_value_3]

    for i, gate_value in enumerate(gate_values):
        gate_value.name = "gate_value_" + str(i)

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
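    # dividing by log(2) converts the cross-entropy from nats to bits per symbol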
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
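The interleaved bookkeeping above (gate values alternating with states after the first entry) comes down to plain list slicing; a minimal check, assuming two gated layers stacked on the first SimpleRecurrent:

h = ['state', 'state_1', 'gate_value_1', 'state_2', 'gate_value_2']
assert h[2::2] == ['gate_value_1', 'gate_value_2']
assert [h[0]] + h[1::2] == ['state', 'state_1', 'state_2']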
Example #6
def main(save_to, cost_name, learning_rate, momentum, num_epochs):
    mlp = MLP([None], [784, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    scores = mlp.apply(x)

    batch_size = y.shape[0]
    indices = tensor.arange(y.shape[0])
    target_scores = tensor.set_subtensor(
        tensor.zeros((batch_size, 10))[indices, y.flatten()], 1)
    score_diff = scores - target_scores

    # Logistic Regression
    if cost_name == 'lr':
        cost = Softmax().categorical_cross_entropy(y.flatten(), scores).mean()
    # MSE
    elif cost_name == 'mse':
        cost = (score_diff**2).mean()
    # Perceptron
    elif cost_name == 'perceptron':
        cost = (scores.max(axis=1) - scores[indices, y.flatten()]).mean()
    # TLE
    elif cost_name == 'minmin':
        cost = abs(score_diff[indices, y.flatten()]).mean()
        cost += abs(score_diff[indices, scores.argmax(axis=1)]).mean()
    # TLEcut
    elif cost_name == 'minmin_cut':
        # Score of the groundtruth should be greater or equal than its target score
        cost = tensor.maximum(0, -score_diff[indices, y.flatten()]).mean()
        # Score of the prediction should be less or equal than its actual score
        cost += tensor.maximum(0, score_diff[indices,
                                             scores.argmax(axis=1)]).mean()
    # TLE2
    elif cost_name == 'minmin2':
        cost = ((score_diff[tensor.arange(y.shape[0]), y.flatten()])**2).mean()
        cost += ((score_diff[tensor.arange(y.shape[0]),
                             scores.argmax(axis=1)])**2).mean()
    # Direct loss minimization
    elif cost_name == 'direct':
        epsilon = 0.1
        cost = (-scores[indices,
                        (scores + epsilon * target_scores).argmax(axis=1)] +
                scores[indices, scores.argmax(axis=1)]).mean()
        cost /= epsilon
    elif cost_name == 'svm':
        cost = (scores[indices, (scores - 1 * target_scores).argmax(axis=1)] -
                scores[indices, y.flatten()]).mean()
    else:
        raise ValueError("Unknown cost " + cost)

    error_rate = MisclassificationRate().apply(y.flatten(), scores)
    error_rate.name = 'error_rate'

    cg = ComputationGraph([cost])
    cost.name = 'cost'

    mnist_train = MNIST(("train", ))
    mnist_test = MNIST(("test", ))

    if learning_rate is None:
        learning_rate = 0.0001
    if momentum is None:
        momentum = 0.0
    rule = Momentum(learning_rate=learning_rate, momentum=momentum)
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=rule)
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate],
                             Flatten(DataStream.default_stream(
                                 mnist_test,
                                 iteration_scheme=SequentialScheme(
                                     mnist_test.num_examples, 500)),
                                     which_sources=('features', )),
                             prefix="test"),
        # CallbackExtension(
        #    lambda: rule.learning_rate.set_value(rule.learning_rate.get_value() * 0.9),
        #    after_epoch=True),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm), rule.learning_rate
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        Printing()
    ]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(
            Plot('MNIST example',
                 channels=[['test_cost', 'test_error_rate'],
                           ['train_total_gradient_norm']]))

    main_loop = MainLoop(algorithm,
                         Flatten(DataStream.default_stream(
                             mnist_train,
                             iteration_scheme=SequentialScheme(
                                 mnist_train.num_examples, 50)),
                                 which_sources=('features', )),
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()

    df = pandas.DataFrame.from_dict(main_loop.log, orient='index')
    res = {
        'cost': cost_name,
        'learning_rate': learning_rate,
        'momentum': momentum,
        'train_cost': df.train_cost.iloc[-1],
        'test_cost': df.test_cost.iloc[-1],
        'best_test_cost': df.test_cost.min(),
        'train_error': df.train_error_rate.iloc[-1],
        'test_error': df.test_error_rate.iloc[-1],
        'best_test_error': df.test_error_rate.min()
    }
    res = {
        k: float(v) if isinstance(v, numpy.ndarray) else v
        for k, v in res.items()
    }
    json.dump(res, sys.stdout)
    sys.stdout.flush()
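The set_subtensor construction of target_scores above is simply a one-hot encoding of the labels; a numpy equivalent with illustrative values:

import numpy as np
y = np.array([[2], [0]])                        # two examples with labels 2 and 0
target_scores = np.zeros((y.shape[0], 10))
target_scores[np.arange(y.shape[0]), y.flatten()] = 1
# target_scores[0, 2] == 1 and target_scores[1, 0] == 1; every other entry stays 0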
def maxout_mnist_test():
    # TODO: if this setup works, refactor it into a class
    x = T.tensor4('features')
    y = T.imatrix('targets')
    batch_size = 128
    # maxout convolutional layers
    # layer0
    filter_size = (8, 8)
    activation = Maxout_(num_pieces=2).apply
    pooling_size = 4
    pooling_step = 2
    pad = 0
    image_size = (28, 28)
    num_channels = 1
    num_filters = 48
    layer0 = ConvolutionalLayer(activation, filter_size, num_filters,
                                pooling_size=(pooling_size, pooling_size),
                                pooling_step=(pooling_step, pooling_step),
                                pad=pad,
                                image_size=image_size,
                                num_channels=num_channels,
                                weights_init=Uniform(width=0.01),
                                biases_init=Uniform(width=0.01),
                                name="layer_0")
    layer0.initialize()

    num_filters = 48
    filter_size = (8,8)
    pooling_size = 4
    pooling_step = 2
    pad = 3  
    image_size = (layer0.get_dim('output')[1],
                  layer0.get_dim('output')[2])
    num_channels = layer0.get_dim('output')[0]
    layer1 = ConvolutionalLayer(activation, filter_size, num_filters,
                                pooling_size=(pooling_size, pooling_size),
                                pooling_step=(pooling_step, pooling_step),
                                pad=pad,
                                image_size=image_size,
                                num_channels=num_channels,
                                weights_init=Uniform(width=0.01),
                                biases_init=Uniform(width=0.01),
                                name="layer_1")
    layer1.initialize()

    num_filters = 24
    filter_size = (5, 5)
    pooling_size = 2
    pooling_step = 2
    pad = 3
    activation = Maxout_(num_pieces=4).apply
    image_size = (layer1.get_dim('output')[1],
                  layer1.get_dim('output')[2])
    num_channels = layer1.get_dim('output')[0]
    layer2 = ConvolutionalLayer(activation, filter_size, num_filters,
                                pooling_size=(pooling_size, pooling_size),
                                pooling_step=(pooling_step, pooling_step),
                                pad=pad,
                                image_size=image_size,
                                num_channels=num_channels,
                                weights_init=Uniform(width=0.01),
                                biases_init=Uniform(width=0.01),
                                name="layer_2")
    layer2.initialize()

    conv_layers = [layer0, layer1, layer2]
    output_conv = x
    for layer in conv_layers:
        output_conv = layer.apply(output_conv)
    output_conv = Flattener().apply(output_conv)

    mlp_layer = Linear(54, 10, 
                        weights_init=Uniform(width=0.01),
                        biases_init=Uniform(width=0.01), name="layer_5")
    mlp_layer.initialize()

    output_mlp = mlp_layer.apply(output_conv)

    params, names = build_params(conv_layers, [mlp_layer])

    cost = Softmax().categorical_cross_entropy(y.flatten(), output_mlp)
    cost.name = 'cost'
    cg_ = ComputationGraph(cost)
    weights = VariableFilter(roles=[WEIGHT])(cg_.variables)
    cost = cost + 0.001 * sum((p ** 2).sum() for p in weights)
    cg = ComputationGraph(cost)
    error_rate = errors(output_mlp, y)
    error_rate.name = 'error'

    # training
    step_rule = RMSProp(0.01, 0.9)
    #step_rule = Momentum(0.2, 0.9)
    train_set = MNIST('train')
    test_set = MNIST("test")

    data_stream = DataStream.default_stream(
            train_set, iteration_scheme=SequentialScheme(train_set.num_examples, batch_size))

    data_stream_monitoring = DataStream.default_stream(
            train_set, iteration_scheme=SequentialScheme(train_set.num_examples, batch_size))

    data_stream_test = DataStream.default_stream(
            test_set, iteration_scheme=SequentialScheme(test_set.num_examples, batch_size))

    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=step_rule)

    monitor_train = DataStreamMonitoring(
        variables=[cost, error_rate], data_stream=data_stream_monitoring, prefix="train")
    monitor_valid = DataStreamMonitoring(
        variables=[cost, error_rate], data_stream=data_stream_test, prefix="test")


    extensions = [  monitor_train,
                    monitor_valid,
                    FinishAfter(after_n_epochs=50),
                    Printing(every_n_epochs=1)
                  ]

    main_loop = MainLoop(data_stream=data_stream,
                        algorithm=algorithm, model = Model(cost),
                        extensions=extensions)
    main_loop.run()

    from blocks.serialization import dump
    with closing(open('../data_mnist/maxout', 'w')) as f:
        dump(main_loop.model, f)
Example #8
def build_model_hard(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names,
                input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]
    for i in range(layers - 1):
        mlp = MLP(activations=[Logistic()],
                  dims=[2 * state_dim, 1],
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            HardGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size,
                          name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
                                       name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have correctly:
    # h = [state_1, state_2, state_3 ...]

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(), presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
features = Flattener().apply(convnet.apply(x))

mlp = MLP(
        activations=[Rectifier(), None],
        dims=[output_dim, 100, 10],
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0)
        )
mlp.initialize()

y_hat = mlp.apply(features)


# numerically stable softmax
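# (the brick computes the softmax and the negative log-likelihood together from the
#  pre-softmax activations via the log-sum-exp trick, so probabilities are never
#  exponentiated explicitly)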
cost = Softmax().categorical_cross_entropy(y.flatten(), y_hat)
cost.name = 'nll'
error_rate = MisclassificationRate().apply(y.flatten(), y_hat)
#cost = MisclassificationRate().apply(y, y_hat)
#cost.name = 'error_rate'

cg = ComputationGraph(cost)

#pdb.set_trace()
weights = VariableFilter(roles=[FILTER, WEIGHT])(cg.variables)
l2_regularization = 0.005 * sum((W**2).sum() for W in weights)

cost_l2 = cost + l2_regularization
cost_l2.name = 'cost_with_regularization'

# Print sizes to check
print("Representation sizes:")
def test_communication(path_vae_mnist,
                       path_maxout_mnist):
                       
    # load models
    vae_mnist = load(path_vae_mnist)
    # get params : to be remove from the computation graph

    # write an object maxout
    classifier = Maxout()
    # get params : to be removed from the computation graph

    # vae whose prior is a zero mean unit variance normal distribution
    activation = Rectifier()
    full_weights_init = Orthogonal()
    weights_init = full_weights_init

    # SVHN in grayscale
    layers = [32*32, 200, 200, 200, 50]
    encoder_layers = layers[:-1]
    encoder_mlp = MLP([activation] * (len(encoder_layers)-1),
              encoder_layers,
              name="MLP_SVHN_encode", biases_init=Constant(0.), weights_init=weights_init)

    enc_dim = encoder_layers[-1]
    z_dim = layers[-1]
    sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, biases_init=Constant(0.), weights_init=full_weights_init)
    decoder_layers = layers[:]  ## includes z_dim as first layer
    decoder_layers.reverse()
    decoder_mlp = MLP([activation] * (len(decoder_layers)-2) + [Rectifier()],
              decoder_layers,
              name="MLP_SVHN_decode", biases_init=Constant(0.), weights_init=weights_init)

    
    vae_svhn = VAEModel(encoder_mlp, sampler, decoder_mlp)
    vae_svhn.initialize()

    # do the connection
    
    x = T.tensor4('x') # SVHN samples preprocessed with local contrast normalization
    x_ = (T.sum(x, axis=1)).flatten(ndim=2)
    y = T.imatrix('y')
    batch_size = 512

    svhn_z, _ = vae_svhn.sampler.sample(vae_svhn.encoder_mlp.apply(x_))
    mnist_decode = vae_mnist.decoder_mlp.apply(svhn_z)
    # reshape
    shape = mnist_decode.shape
    mnist_decode = mnist_decode.reshape((shape[0], 1, 28, 28))
    prediction = classifier.apply(mnist_decode)
    y_hat = Softmax().apply(prediction)

    x_recons, kl_terms = vae_svhn.reconstruct(x_)
    recons_term = BinaryCrossEntropy().apply(x_, T.clip(x_recons, 1e-4, 1 - 1e-4))
    recons_term.name = "recons_term"

    cost_A = recons_term + kl_terms.mean()
    cost_A.name = "cost_A"

    cost_B = Softmax().categorical_cross_entropy(y.flatten(), prediction)
    cost_B.name = 'cost_B'

    cost = cost_B
    cost.name = "cost"
    cg = ComputationGraph(cost) # probably discard some of the parameters
    parameters = cg.parameters
    params = []
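    # keep only parameters whose names do not contain "mnist", so the pretrained MNIST VAE stays fixed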
    for t in parameters:
        if not re.match(".*mnist", t.name):
            params.append(t)

    """
    f = theano.function([x], cost_A)
    value_x = np.random.ranf((1, 3, 32, 32)).astype("float32")
    print f(value_x)
    
    return
    """
    error_brick = MisclassificationRate()
    error_rate = error_brick.apply(y.flatten(), y_hat)
    error_rate.name = "error_rate"
    
    # training here
    step_rule = RMSProp(0.001, 0.99)

    dataset_hdf5_file = "/Tmp/ducoffem/SVHN/"
    train_set = H5PYDataset(os.path.join(dataset_hdf5_file, "all.h5"), which_set='train')
    test_set = H5PYDataset(os.path.join(dataset_hdf5_file, "all.h5"), which_set='valid')
    
    data_stream = DataStream.default_stream(
        train_set, iteration_scheme=SequentialScheme(train_set.num_examples, batch_size))
        
    data_stream_test = DataStream.default_stream(
        test_set, iteration_scheme=SequentialScheme(2000, batch_size))


    algorithm = GradientDescent(cost=cost, params=params,
                                step_rule=step_rule)

    monitor_train = TrainingDataMonitoring(
        variables=[cost], prefix="train", every_n_batches=10)
    monitor_valid = DataStreamMonitoring(
        variables=[cost, error_rate], data_stream=data_stream_test, prefix="valid", every_n_batches=10)

    # drawing_samples = ImagesSamplesSave("../data_svhn", vae, (3, 32, 32), every_n_epochs=1)
    extensions = [  monitor_train,
                    monitor_valid,
                    FinishAfter(after_n_batches=10000),
                    Printing(every_n_batches=10)
                  ]

    main_loop = MainLoop(data_stream=data_stream,
                        algorithm=algorithm, model = Model(cost),
                        extensions=extensions)
    main_loop.run()
Example #11
    def __init__(self):
        inp = tensor.lmatrix('bytes')

        # Make state vars
        state_vars = {}
        for i, d in enumerate(hidden_dims):
            state_vars['states%d'%i] = theano.shared(numpy.zeros((num_seqs, d))
                                                        .astype(theano.config.floatX),
                                                     name='states%d'%i)
            state_vars['cells%d'%i] = theano.shared(numpy.zeros((num_seqs, d))
                                                        .astype(theano.config.floatX),
                                                    name='cells%d'%i)
        # Construct brick
        cchlstm = CCHLSTM(io_dim=io_dim,
                          hidden_dims=hidden_dims,
                          cond_cert=cond_cert,
                          activation=activation_function)

        # Random pass
        passdict = {}
        for i, p in enumerate(block_prob):
            passdict['pass%d'%i] = rng.binomial(size=(inp.shape[1], inp.shape[0]), p=1-p)

        # Apply it
        outs = cchlstm.apply(inputs=inp.dimshuffle(1, 0),
                             **dict(state_vars.items() + passdict.items()))
        states = []
        active_prop = []
        for i in range(len(hidden_dims)):
            states.append((state_vars['states%d'%i], outs[3*i+1][-1, :, :]))
            states.append((state_vars['cells%d'%i], outs[3*i+2][-1, :, :]))
            active_prop.append(outs[3*i+3].mean())
            active_prop[-1].name = 'active_prop_%d'%i

        out = outs[0].dimshuffle(1, 0, 2)

        # Do prediction and calculate cost
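        # next-byte prediction: the output at position t is scored against the input byte
        # at position t+1, hence the inp[:, 1:] / out[:, :-1, :] alignment below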
        pred = out.argmax(axis=2)

        cost = Softmax().categorical_cross_entropy(inp[:, 1:].flatten(),
                                                   out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1),
                                                                           io_dim)))
        error_rate = tensor.neq(inp[:, 1:].flatten(), pred[:, :-1].flatten()).mean()

        # Initialize all bricks
        for brick in [cchlstm]:
            brick.weights_init = IsotropicGaussian(0.1)
            brick.biases_init = Constant(0.)
            brick.initialize()

        # Apply noise and dropoutvars
        cg = ComputationGraph([cost, error_rate])
        if w_noise_std > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, w_noise_std)
        [cost_reg, error_rate_reg] = cg.outputs

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost, cost_reg],
                             [error_rate, error_rate_reg],
                             active_prop]

        cost.name = 'cost'
        cost_reg.name = 'cost_reg'
        error_rate.name = 'error_rate'
        error_rate_reg.name = 'error_rate_reg'

        self.out = out
        self.pred = pred

        self.states = states
Example #12
def main():
    # # # # # # # # # # # 
    # Model Building   #
    # # # # # # # # # # # 
    
    # ConvOp requires input be a 4D tensor
    x = tensor.tensor4("features")

    y = tensor.ivector("targets")

    # Convolutional Layers
    # ====================
    
    # "Improving neural networks by preventing co-adaptation of feature detectors"
    # conv_layers = [
    #     # ConvolutionalLayer(activation, filter_size, num_filters, pooling_size, name)
    #       ConvolutionalLayer(Rectifier().apply, (5,5), 64, (2,2), border_mode='full', name='l1')
    #     , ConvolutionalLayer(Rectifier().apply, (5,5), 64, (2,2), border_mode='full', name='l2')
    #     , ConvolutionalLayer(Rectifier().apply, (5,5), 64, (2,2), border_mode='full', name='l3')
    #     ]

    # "VGGNet"
    conv_layers = [
          ConvolutionalActivation(Rectifier().apply, (3,3), 64, border_mode='full', name='l1')
        , ConvolutionalLayer(Rectifier().apply, (3,3), 64, (2,2), border_mode='full', name='l2')
        , ConvolutionalActivation(Rectifier().apply, (3,3), 128, border_mode='full', name='l3')
        , ConvolutionalLayer(Rectifier().apply, (3,3), 128, (2,2), border_mode='full', name='l4')
        , ConvolutionalActivation(Rectifier().apply, (3,3), 256, border_mode='full', name='l5')
        , ConvolutionalLayer(Rectifier().apply, (3,3), 256, (2,2), border_mode='full', name='l6')
        ]

    # Bake my own
    # conv_layers = [
    #     # ConvolutionalLayer(activation, filter_size, num_filters, pooling_size, name)
    #       ConvolutionalLayer(Rectifier().apply, (5,5), 64, (2,2), border_mode='full', name='l1')
    #     , ConvolutionalLayer(Rectifier().apply, (3,3), 128, (2,2), border_mode='full', name='l2')
    #     , ConvolutionalActivation(Rectifier().apply, (3,3), 256, border_mode='full', name='l3')
    #     , ConvolutionalLayer(Rectifier().apply, (3,3), 256, (2,2), border_mode='full', name='l4')
    #     ]

    
    convnet = ConvolutionalSequence(
        conv_layers, num_channels=3, image_size=(32,32),
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0)
        )
    convnet.initialize()

    output_dim = np.prod(convnet.get_dim('output'))
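    # output_dim is channels * height * width of the last convolutional feature map,
    # i.e. the flattened size fed to the MLP below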

    # Fully Connected Layers
    # ======================
    conv_features = convnet.apply(x)
    features = Flattener().apply(conv_features)

    mlp = MLP(  activations=[Rectifier()]*2+[None]
              , dims=[output_dim, 256, 256, 10]
              , weights_init=IsotropicGaussian(0.01)
              , biases_init=Constant(0)
        )
    mlp.initialize()

    y_hat = mlp.apply(features)
    # print y_hat.shape.eval({x: np.zeros((1, 3, 32, 32), dtype=theano.config.floatX)})

    # Numerically Stable Softmax
    cost = Softmax().categorical_cross_entropy(y, y_hat)
    error_rate = MisclassificationRate().apply(y, y_hat)

    cg = ComputationGraph(cost)

    weights = VariableFilter(roles=[FILTER, WEIGHT])(cg.variables)
    l2_regularization = 0.005 * sum((W**2).sum() for W in weights)

    cost = cost + l2_regularization
    cost.name = 'cost_with_regularization'

    # Print sizes to check
    print("Representation sizes:")
    for layer in convnet.layers:
        print(layer.get_dim('input_'))

    # # # # # # # # # # # 
    # Model Training   #
    # # # # # # # # # # # 

    # Figure out data source
    train = CIFAR10("train")
    test = CIFAR10("test")

    # Load Data Using Fuel
    train_stream = DataStream.default_stream(
          dataset=train
        , iteration_scheme=SequentialScheme(train.num_examples, batch_size=128))
    test_stream = DataStream.default_stream(
          dataset=test
        , iteration_scheme=SequentialScheme(test.num_examples, batch_size=1024))

    # Train
    algorithm = GradientDescent(
          cost=cost
        , params=cg.parameters
        , step_rule=Adam(learning_rate=0.0005)
        )


    main_loop = MainLoop(
          model=Model(cost)
        , data_stream=train_stream
        , algorithm=algorithm
        , extensions=[
              TrainingDataMonitoring(
                  [cost, error_rate]
                , prefix='train'
                , after_epoch=True)
            , DataStreamMonitoring(
                  [cost, error_rate]
                , test_stream,
                  prefix='test')
            , ExperimentSaver(dest_directory='...', src_directory='.')
            , Printing()
            , ProgressBar()
            ]
        )
    main_loop.run()
Example #13
def main(save_to, cost_name, learning_rate, momentum, num_epochs):
    mlp = MLP([None], [784, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    scores = mlp.apply(x)

    batch_size = y.shape[0]
    indices = tensor.arange(y.shape[0])
    target_scores = tensor.set_subtensor(
        tensor.zeros((batch_size, 10))[indices, y.flatten()],
        1)
    score_diff = scores - target_scores

    # Logistic Regression
    if cost_name == 'lr':
        cost = Softmax().categorical_cross_entropy(y.flatten(), scores).mean()
    # MSE
    elif cost_name == 'mse':
        cost = (score_diff ** 2).mean()
    # Perceptron
    elif cost_name == 'perceptron':
        cost = (scores.max(axis=1) - scores[indices, y.flatten()]).mean()
    # TLE
    elif cost_name == 'minmin':
        cost = abs(score_diff[indices, y.flatten()]).mean()
        cost += abs(score_diff[indices, scores.argmax(axis=1)]).mean()
    # TLEcut
    elif cost_name == 'minmin_cut':
        # Score of the groundtruth should be greater or equal than its target score
        cost = tensor.maximum(0, -score_diff[indices, y.flatten()]).mean()
        # Score of the prediction should be less or equal than its actual score
        cost += tensor.maximum(0, score_diff[indices, scores.argmax(axis=1)]).mean()
    # TLE2
    elif cost_name == 'minmin2':
        cost = ((score_diff[tensor.arange(y.shape[0]), y.flatten()]) ** 2).mean()
        cost += ((score_diff[tensor.arange(y.shape[0]), scores.argmax(axis=1)]) ** 2).mean()
    # Direct loss minimization
    elif cost_name == 'direct':
        epsilon = 0.1
        cost = (- scores[indices, (scores + epsilon * target_scores).argmax(axis=1)]
                + scores[indices, scores.argmax(axis=1)]).mean()
        cost /= epsilon
    elif cost_name == 'svm':
        cost = (scores[indices, (scores - 1 * target_scores).argmax(axis=1)]
                - scores[indices, y.flatten()]).mean()
    else:
        raise ValueError("Unknown cost " + cost)

    error_rate = MisclassificationRate().apply(y.flatten(), scores)
    error_rate.name = 'error_rate'

    cg = ComputationGraph([cost])
    cost.name = 'cost'

    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))

    if learning_rate is None:
        learning_rate = 0.0001
    if momentum is None:
        momentum = 0.0
    rule = Momentum(learning_rate=learning_rate,
                    momentum=momentum)
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=rule)
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  DataStreamMonitoring(
                      [cost, error_rate],
                      Flatten(
                          DataStream.default_stream(
                              mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 500)),
                          which_sources=('features',)),
                      prefix="test"),
                  # CallbackExtension(
                  #    lambda: rule.learning_rate.set_value(rule.learning_rate.get_value() * 0.9),
                  #    after_epoch=True),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm),
                       rule.learning_rate],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  Printing()]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(Plot(
            'MNIST example',
            channels=[
                ['test_cost',
                 'test_error_rate'],
                ['train_total_gradient_norm']]))

    main_loop = MainLoop(
        algorithm,
        Flatten(
            DataStream.default_stream(
                mnist_train,
                iteration_scheme=SequentialScheme(
                    mnist_train.num_examples, 50)),
            which_sources=('features',)),
        model=Model(cost),
        extensions=extensions)

    main_loop.run()

    df = pandas.DataFrame.from_dict(main_loop.log, orient='index')
    res = {'cost' : cost_name,
           'learning_rate' : learning_rate,
           'momentum' : momentum,
           'train_cost' : df.train_cost.iloc[-1],
           'test_cost' : df.test_cost.iloc[-1],
           'best_test_cost' : df.test_cost.min(),
           'train_error' : df.train_error_rate.iloc[-1],
           'test_error' : df.test_error_rate.iloc[-1],
           'best_test_error' : df.test_error_rate.min()}
    res = {k: float(v) if isinstance(v, numpy.ndarray) else v for k, v in res.items()}
    json.dump(res, sys.stdout)
    sys.stdout.flush()
def build_model_vanilla(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence(
                    [lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers *
        state_dim + (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = state if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    if layers > 1:
        # Save all the last states
        for d in range(layers):
            last_states[d] = h[d][-1, :, :]
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
print(output_dim)

# Fully connected layers
features = Flattener().apply(convnet.apply(x))

mlp = MLP(activations=[Rectifier(), None],
          dims=[output_dim, 100, 10],
          weights_init=IsotropicGaussian(0.01),
          biases_init=Constant(0))
mlp.initialize()

y_hat = mlp.apply(features)

# numerically stable softmax
cost = Softmax().categorical_cross_entropy(y.flatten(), y_hat)
cost.name = 'nll'
error_rate = MisclassificationRate().apply(y.flatten(), y_hat)
#cost = MisclassificationRate().apply(y, y_hat)
#cost.name = 'error_rate'

cg = ComputationGraph(cost)

#pdb.set_trace()
weights = VariableFilter(roles=[FILTER, WEIGHT])(cg.variables)
l2_regularization = 0.005 * sum((W**2).sum() for W in weights)

cost_l2 = cost + l2_regularization
cost_l2.name = 'cost_with_regularization'

# Print sizes to check
print("Representation sizes:")
def training_model_mnist(learning_rate, momentum, iteration, batch_size, epoch_end, iter_batch):

    x = T.tensor4('features')
    y = T.imatrix('targets')

    classifier = build_model_mnist()

    predict = classifier.apply(x)
    y_hat = Softmax().apply(predict)

    cost = Softmax().categorical_cross_entropy(y.flatten(), predict)
    cost.name = "cost"
    cg = ComputationGraph(cost)
    error_brick = MisclassificationRate()
    error_rate = error_brick.apply(y.flatten(), y_hat)
    error_rate.name = "error"


    train_set = MNIST(('train', ))
    test_set = MNIST(("test",))

    if iteration =="slice":
        data_stream = DataStream.default_stream(
                train_set, iteration_scheme=SequentialScheme_slice(train_set.num_examples,
                                                            batch_size))
        data_stream_test = DataStream.default_stream(
                test_set, iteration_scheme=SequentialScheme_slice(test_set.num_examples,
                                                            batch_size))
    else:
        data_stream = DataStream.default_stream(
                train_set, iteration_scheme=SequentialScheme(train_set.num_examples,
                                                            batch_size))

        data_stream_test = DataStream.default_stream(
                test_set, iteration_scheme=SequentialScheme(test_set.num_examples,
                                                            batch_size))

    step_rule = Momentum(learning_rate=learning_rate,
                         momentum=momentum)

    start = time.clock()
    time_spent = shared_floatx(np.float32(0.), name="time_spent")
    time_extension = Time_reference(start, time_spent, every_n_batches=1)

    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=step_rule)

    monitor_train = TrainingDataMonitoring(
        variables=[cost], prefix="train", every_n_epochs=iter_batch)
    monitor_valid = DataStreamMonitoring(
        variables=[cost, error_rate, time_spent], data_stream=data_stream_test, prefix="valid",
        every_n_epochs=iter_batch)

    # add a monitor variable about the time
    extensions = [  monitor_train,
                    monitor_valid,
                    FinishAfter(after_n_epochs=epoch_end),
                    Printing(every_n_epochs=iter_batch),
                    time_extension
                  ]

    main_loop = MainLoop(data_stream=data_stream,
                        algorithm=algorithm, model = Model(cost),
                        extensions=extensions)
    main_loop.run()
Example #17
def build_model_lstm(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    virtual_dim = 4 * state_dim
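    # an LSTM consumes 4 * state_dim inputs (one slice each for the input, forget and
    # output gates plus the cell candidate)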

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    # Make sure time_length is what we need
    fork = Fork(output_names=output_names,
                input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [
        LSTM(dim=state_dim, activation=Tanh()) for _ in range(layers)
    ]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(input_dim=skip_connections * layers * state_dim +
                          (1 - skip_connections) * state_dim,
                          output_dim=vocab_size,
                          name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    init_cells = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
                                       name='state0_%d' % d)
        init_cells[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
                                      name='cell0_%d' % d)
        kwargs['states' + suffix] = init_states[d]
        kwargs['cells' + suffix] = init_cells[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #        cell_1, in_1, forget_1, out_1 ...]

    last_states = {}
    last_cells = {}
    for d in range(layers):
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))
        updates.append((init_cells[d], last_cells[d]))

    # h = [state, cell, in, forget, out, state_1,
    #        cell_1, in_1, forget_1, out_1 ...]

    # Extract the values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {
        "in_gates": in_gates,
        "forget_gates": forget_gates,
        "out_gates": out_gates
    }

    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = [state] if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if layers > 1:
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state"

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(), presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    # Dont initialize as Orthogonal if we are about to load new parameters
    if args.load_path is not None:
        rnn.weights_init = initialization.Constant(0)
    else:
        rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
Example #18
    def __init__(self, config):
        inp = tensor.imatrix('bytes')

        embed = theano.shared(config.embedding_matrix.astype(theano.config.floatX),
                              name='embedding_matrix')
        in_repr = embed[inp.flatten(), :].reshape((inp.shape[0], inp.shape[1], config.repr_dim))
        in_repr.name = 'in_repr'

        bricks = []
        states = []

        # Construct predictive GRU hierarchy
        hidden = []
        costs = []
        next_target = in_repr.dimshuffle(1, 0, 2)
        for i, (hdim, cf, q) in enumerate(zip(config.hidden_dims,
                                                   config.cost_factors,
                                                   config.hidden_q)):
            init_state = theano.shared(numpy.zeros((config.num_seqs, hdim)).astype(theano.config.floatX),
                                       name='st0_%d'%i)

            linear = Linear(input_dim=config.repr_dim, output_dim=3*hdim,
                            name="lstm_in_%d"%i)
            lstm = GatedRecurrent(dim=hdim, activation=config.activation_function,
                        name="lstm_rec_%d"%i)
            linear2 = Linear(input_dim=hdim, output_dim=config.repr_dim, name='lstm_out_%d'%i)
            tanh = Tanh('lstm_out_tanh_%d'%i)
            bricks += [linear, lstm, linear2, tanh]
            if i > 0:
                linear1 = Linear(input_dim=config.hidden_dims[i-1], output_dim=3*hdim,
                                 name='lstm_in2_%d'%i)
                bricks += [linear1]

            next_target = tensor.cast(next_target, dtype=theano.config.floatX)
            inter = linear.apply(theano.gradient.disconnected_grad(next_target))
            if i > 0:
                inter += linear1.apply(theano.gradient.disconnected_grad(hidden[-1][:-1,:,:]))
            new_hidden = lstm.apply(inputs=inter[:,:,:hdim],
                                    gate_inputs=inter[:,:,hdim:],
                                    states=init_state)
            states.append((init_state, new_hidden[-1, :, :]))

            hidden += [tensor.concatenate([init_state[None,:,:], new_hidden],axis=0)]
            pred = tanh.apply(linear2.apply(hidden[-1][:-1,:,:]))
            costs += [numpy.float32(cf) * (-next_target * pred).sum(axis=2).mean()]
            costs += [numpy.float32(cf) * q * abs(pred).sum(axis=2).mean()]
            diff = next_target - pred
            next_target = tensor.ge(diff, 0.5) - tensor.le(diff, -0.5)
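            # The residual fed to the next layer is quantized to {-1, 0, +1}:
            # +1 where the prediction undershoots the target by 0.5 or more,
            # -1 where it overshoots by 0.5 or more, and 0 otherwise.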


        # Construct output from hidden states
        hidden = [s.dimshuffle(1, 0, 2) for s in hidden]

        out_parts = []
        out_dims = config.out_hidden + [config.io_dim]
        for i, (dim, state) in enumerate(zip(config.hidden_dims, hidden)):
            pred_linear = Linear(input_dim=dim, output_dim=out_dims[0],
                                name='pred_linear_%d'%i)
            bricks.append(pred_linear)
            lin = theano.gradient.disconnected_grad(state)
            out_parts.append(pred_linear.apply(lin))

        # Do prediction and calculate cost
        out = sum(out_parts)

        if len(out_dims) > 1:
            out = config.out_hidden_act[0](name='out_act0').apply(out)
            mlp = MLP(dims=out_dims,
                      activations=[x(name='out_act%d'%i) for i, x in enumerate(config.out_hidden_act[1:])]
                                 +[Identity()],
                      name='out_mlp')
            bricks.append(mlp)
            out = mlp.apply(out.reshape((inp.shape[0]*(inp.shape[1]+1),-1))
                           ).reshape((inp.shape[0],inp.shape[1]+1,-1))

        pred = out.argmax(axis=2)

        cost = Softmax().categorical_cross_entropy(inp.flatten(),
                                                   out[:,:-1,:].reshape((inp.shape[0]*inp.shape[1],
                                                                config.io_dim))).mean()
        error_rate = tensor.neq(inp.flatten(), pred[:,:-1].flatten()).mean()
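        # out has inp.shape[1]+1 timesteps because the prepended initial state
        # yields one extra prediction; out[:, t] is computed from the state
        # before byte t, so out[:, :-1, :] lines up one-to-one with inp above.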

        sgd_cost = cost + sum(costs)
            
        # Initialize all bricks
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()

        # apply noise
        cg = ComputationGraph([sgd_cost, cost, error_rate]+costs)
        if config.weight_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.weight_noise)
        sgd_cost = cg.outputs[0]
        cost = cg.outputs[1]
        error_rate = cg.outputs[2]
        costs = cg.outputs[3:]


        # put things into self that are useful for training or extensions
        self.sgd_cost = sgd_cost

        sgd_cost.name = 'sgd_cost'
        for i in range(len(costs)):
            costs[i].name = 'pred_cost_%d'%i
        cost.name = 'cost'
        error_rate.name = 'error_rate'
        self.monitor_vars = [costs, [cost],
                             [error_rate]]

        self.out = out[:,1:,:]
        self.pred = pred[:,1:]

        self.states = states
Example #19
    def __init__(self, config):
        inp = tensor.imatrix('bytes')

        in_onehot = tensor.eq(tensor.arange(config.io_dim, dtype='int32').reshape((1, 1, config.io_dim)),
                              inp[:, :, None]).astype(theano.config.floatX)
        in_onehot.name = 'in_onehot'

        hidden_dim = sum(p['dim'] for p in config.layers)
        recvalues = tensor.concatenate([in_onehot.dimshuffle(1, 0, 2),
                            tensor.zeros((inp.shape[1], inp.shape[0], hidden_dim))],
                        axis=2)
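        # recvalues is time-major: its first io_dim features hold the one-hot
        # input and the remaining hidden_dim features are zero-initialised
        # slots, one per layer; each layer writes its output into its own slice
        # via set_subtensor below, so deeper layers see the input plus all
        # lower-layer states.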
  
        # Construct hidden states
        indim = config.io_dim
        bricks = []
        states = []
        for i in xrange(1, len(config.layers)+1):
            p = config.layers[i-1]

            init_state = theano.shared(numpy.zeros((config.num_seqs, p['dim'])).astype(theano.config.floatX),
                                       name='st0_%d'%i)
            init_cell = theano.shared(numpy.zeros((config.num_seqs, p['dim'])).astype(theano.config.floatX),
                                       name='cell0_%d'%i)

            linear = Linear(input_dim=indim, output_dim=4*p['dim'],
                            name="lstm_in_%d"%i)
            bricks.append(linear)
            inter = linear.apply(recvalues[:, :, :indim])

            lstm = RstLSTM(dim=p['dim'], activation=config.activation_function,
                        name="lstm_rec_%d"%i)
            bricks.append(lstm)

            run_mask = None
            if 'run_on' in p:
                run_mask = compare_matrix(inp.T, p['run_on'])

            rst_in_mask = None
            if 'reset_before' in p:
                rst_in_mask = compare_matrix(inp.T, p['reset_before'])

            rst_out_mask = None
            if 'reset_after' in p:
                rst_out_mask = compare_matrix(inp.T, p['reset_after'])
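            # (Assumption: compare_matrix is a project-specific helper that
            # returns a binary (time, batch) mask marking positions whose input
            # byte belongs to the given set, so the layer only runs on, or
            # resets around, those characters.)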

            new_hidden, new_cells, rec_out = \
                        lstm.apply_cond(inputs=inter,
                                        states=init_state, cells=init_cell,
                                        run_mask=run_mask,
                                        rst_in_mask=rst_in_mask, rst_out_mask=rst_out_mask)
            states.append((init_state, new_hidden[-1, :, :]))
            states.append((init_cell, new_cells[-1, :, :]))

            indim2 = indim + p['dim']
            recvalues = tensor.set_subtensor(recvalues[:, :, indim:indim2],
                                             rec_out)
            indim = indim2


        print "**** recvalues", recvalues.dtype
        for i, (u, v) in enumerate(states):
            print "****     state", i, u.dtype, v.dtype

        recvalues = recvalues.dimshuffle(1, 0, 2)

        # Construct output from hidden states
        top_linear = Linear(input_dim=indim, output_dim=config.io_dim,
                            name="top_linear")
        bricks.append(top_linear)
        out = top_linear.apply(recvalues)
        out.name = 'out'

        # Do prediction and calculate cost
        pred = out.argmax(axis=2).astype('int32')

        print "****         inp", inp.dtype
        print "****         out", out.dtype
        print "****         pred", pred.dtype
        cost = Softmax().categorical_cross_entropy(inp[:, 1:].flatten(),
                                                   out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1),
                                                                           config.io_dim))).mean()
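        # Standard next-character objective: the output at position t is scored
        # against the byte at position t+1, hence inp[:, 1:] versus out[:, :-1, :].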
        cost.name = 'cost'
        error_rate = tensor.neq(inp[:, 1:].flatten(), pred[:, :-1].flatten()).astype(theano.config.floatX).mean()
        print "****         cost", cost.dtype
        print "****         error_rate", error_rate.dtype

        # Initialize all bricks
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()

        # Apply noise and dropout
        cg = ComputationGraph([cost, error_rate])
        if config.w_noise_std > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise_std)
        if config.i_dropout > 0:
            cg = apply_dropout(cg, hidden[1:], config.i_dropout)
        [cost_reg, error_rate_reg] = cg.outputs
        print "****         cost_reg", cost_reg.dtype
        print "****         error_rate_reg", error_rate_reg.dtype

        # add l1 regularization
        if config.l1_reg > 0:
            l1pen = sum(abs(st).mean() for st in hidden[1:])
            cost_reg = cost_reg + config.l1_reg * l1pen
        if config.l1_reg_weight > 0:
            l1pen_w = sum(abs(w).mean() for w in VariableFilter(roles=[WEIGHT])(cg))
            cost_reg = cost_reg + config.l1_reg_weight * l1pen_w

        cost_reg += 1e-10           # so that it is not the same Theano variable as cost
        error_rate_reg += 1e-10

        # put things into self that are useful for training or extensions
        self.sgd_cost = cost_reg

        cost.name = 'cost'
        cost_reg.name = 'cost_reg'
        error_rate.name = 'error_rate'
        error_rate_reg.name = 'error_rate_reg'
        self.monitor_vars = [[cost],
                             [cost_reg],
                             [error_rate_reg]]

        self.out = out
        self.pred = pred

        self.states = states