Example #1
    def _build(self):
        self.layers.append(
            SparseLayer(input_dim=self.input_dim,
                        output_dim=FLAGS.hidden1,
                        features_nonzero=self.features_nonzero,
                        act=tf.nn.relu,
                        dropout=self.dropout,
                        logging=self.logging))
        self.hidden1 = self.layers[-1](self.inputs)

        self.layers.append(
            DenseLayer(input_dim=FLAGS.hidden1,
                       output_dim=FLAGS.hidden2,
                       act=lambda x: x,
                       dropout=self.dropout,
                       logging=self.logging))
        self.embeddings = self.layers[-1](self.hidden1)

        self.layers.append(
            DenseLayer(input_dim=FLAGS.hidden2,
                       output_dim=self.output_dim,
                       act=lambda x: x,
                       dropout=self.dropout,
                       logging=self.logging))
        self.outputs = self.layers[-1](self.embeddings)
Example #2
    def define_network(self, image):
        with tf.name_scope("Block1"):
            conv1_1 = ConvLayer(image, 3, 64, name="conv1_1")
            conv1_2 = ConvLayer(conv1_1, 64, 64, name="conv1_2")
            pool1 = MaxPoolLayer(conv1_2, name='pool1')

        with tf.name_scope("Block2"):
            conv2_1 = ConvLayer(pool1, 64, 128, name="conv2_1")
            conv2_2 = ConvLayer(conv2_1, 128, 128, name="conv2_2")
            pool2 = MaxPoolLayer(conv2_2, name='pool2')

        with tf.name_scope("Block3"):
            conv3_1 = ConvLayer(pool2, 128, 256, name="conv3_1")
            conv3_2 = ConvLayer(conv3_1, 256, 256, name="conv3_2")
            conv3_3 = ConvLayer(conv3_2, 256, 256, name="conv3_3")
            conv3_4 = ConvLayer(conv3_3, 256, 256, name="conv3_4")
            pool3 = MaxPoolLayer(conv3_4, name='pool3')

        with tf.name_scope("Block4"):
            conv4_1 = ConvLayer(pool3, 256, 512, name="conv4_1")
            conv4_2 = ConvLayer(conv4_1, 512, 512, name="conv4_2")
            conv4_3 = ConvLayer(conv4_2, 512, 512, name="conv4_3")
            conv4_4 = ConvLayer(conv4_3, 512, 512, name="conv4_4")
            pool4 = MaxPoolLayer(conv4_4, name='pool4')

        with tf.name_scope("DenseBlock"):
            fc6 = DenseLayer(pool4, 1024, name='fc6')
            drop_6 = DropoutLayer(fc6, dropout_rate=self.p)
            fc7 = DenseLayer(drop_6, 1024, name='fc7')
            drop_7 = DropoutLayer(fc7, dropout_rate=self.p)
        return drop_7
Example #3
 def _build_decoder(self):
     """ Builds the decoder's list"""
     decoder = [
         DenseLayer(num_units=128),
         LeakyReLU(),
         DenseLayer(num_units=self.input_dim)
     ]
     return decoder
Example #4
 def _build_encoder(self):
     """ Buils the encoder's list """
     encoder = [
         DenseLayer(num_units=128, input_shape=self.input_dim),
         LeakyReLU(),
         DenseLayer(num_units=self.latent_factors)
     ]
     return encoder
Example #5
    def __init__(self):
        super().__init__()
        # First layer: a fully connected layer with shape = 784 x 20
        self.l1 = DenseLayer(28 * 28, 20, w_std=0.01)
        # Activation of the first layer: Sigmoid
        self.sig1 = SigmoidLayer()

        # Second layer: a fully connected layer with shape = 20 x 1
        self.l2 = DenseLayer(20, 1, w_std=0.01)
        # Activation of the second layer: Sigmoid
        self.sig2 = SigmoidLayer()
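For reference, a minimal NumPy sketch of what this 784 -> 20 -> 1 sigmoid stack computes in a forward pass. The DenseLayer and SigmoidLayer classes themselves are not shown above, so the stand-ins below are assumptions about their behaviour (a linear map with w_std-scaled weights, followed by an element-wise sigmoid):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Hypothetical stand-ins for the layers defined in __init__ above.
rng = np.random.default_rng(0)
w1, b1 = rng.normal(0.0, 0.01, size=(28 * 28, 20)), np.zeros(20)  # l1, w_std=0.01
w2, b2 = rng.normal(0.0, 0.01, size=(20, 1)), np.zeros(1)         # l2, w_std=0.01

x = rng.normal(size=(4, 28 * 28))   # a batch of 4 flattened 28x28 images
h = sigmoid(x @ w1 + b1)            # first dense layer + sig1 -> shape (4, 20)
y = sigmoid(h @ w2 + b2)            # second dense layer + sig2 -> shape (4, 1)
print(y.shape)                      # (4, 1)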
Example #6
    def adddiscriminator(self, num_1, num_2):

        input_layer = self.feature_layer
        name = "cate_1"
        new_layer1 = DenseLayer(input_layer, name=name, num_units=num_1)
        #self.all_layers += (new_layer1,)
        self.trainable_layers += (new_layer1,)
        name = "cate_2"
        new_layer2 = DenseLayer(new_layer1, name=name, num_units=num_2)
        #self.all_layers += (new_layer2,)
        self.trainable_layers += (new_layer2,)

        category = Softmax(new_layer2)
        self.category_layer = category
Example #7
    def load_pathnet(filename):
        log = None
        with open(filename, 'rb') as f:
            log = pickle.load(f)

        layers = []
        for layer_log in log['layer_logs']:
            if layer_log['layer_type'] == 'dense':
                layers.append(DenseLayer.build_from_log(layer_log))
            if layer_log['layer_type'] == 'conv':
                layers.append(ConvLayer.build_from_log(layer_log))

        Layer.initialize_whole_network(layers, log['in_shape'])
        for layer, layer_log in zip(layers, log['layer_logs']):
            layer.load_layer_log(layer_log)

        pathnet = PathNet(input_shape=log['in_shape'],
                          width=log['width'],
                          depth=log['depth'])
        pathnet._layers = layers
        pathnet.training_counter = log['training_counter']
        pathnet.max_modules_pr_layer = log['max_modules_pr_layer']
        pathnet.min_modules_pr_layer = log['min_modules_pr_layer']

        tasks = []
        for task_log in log['task_logs']:
            task = TaskContainer.build_from_log(task_log)
            pathnet.path2model(pathnet.random_path(), task)
            task.layer.set_weights(task_log['layer_weights'])
            tasks.append(task)

        pathnet._tasks = tasks

        return pathnet
Example #8
    def _build(self):
        for i in range(1, len(self.num_hidden)):
            # set last layer activation as linear function otherwise use self.act
            if i == len(self.num_hidden) - 1:
                act = (lambda x: x)
            else:
                act = self.act

            #######################################################
            # TODO: Add a DenseLayer Object as a layer            #
            # Use DenseLayer class to define a new layer          #
            # Please set all its constructor arguments properly   #
            # These arguments include:                            #
            #   1. Input and output dimensions                    #
            #   2. Weight and Bias Initializer (stddev if needed) #
            #   3. Activation function                            #
            #######################################################

            layer = DenseLayer(self.num_hidden[i-1], self.num_hidden[i], act,
                               self.weight_initializer, self.bias_initializer, stddev=self.stddev)

            ########################################################
            #                   END OF YOUR CODE                   #
            ########################################################

            # add layer to layers list
            self.layers.append(layer)
Example #9
def get_layers_dict(json_list):
    layers_list = []
    for layer_info in json_list:
        #(layer_name,layer),(layer_activation_name,layer_activation) = Layer(layer_info)
        if 'dense' in layer_info['layer_type']:
            layer = DenseLayer(layer_info)
        elif 'conv2d' in layer_info['layer_type']:
            layer = Conv2dLayer(layer_info)
        elif 'flatten' in layer_info['layer_type']:
            layer = FlattenLayer(layer_info)
        layer_tup, activation_tup = layer.get_torch_layer()
        layers_list.append(layer_tup)
        if activation_tup[1] is not None:
            layers_list.append(activation_tup)
    #ret = dict([(item[0], item[1]) for item in layers_list])
    ret = collections.OrderedDict(layers_list)
    return ret
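Since the function returns a collections.OrderedDict of (name, module) pairs, the result can be fed directly to torch.nn.Sequential. A minimal sketch of that pattern with plain torch modules (the wrapper classes and the exact keys expected in json_list are not shown above):

import collections
import torch.nn as nn

layer_tuples = [("flatten1", nn.Flatten()),
                ("dense1", nn.Linear(28 * 28, 128)),
                ("relu1", nn.ReLU()),
                ("dense2", nn.Linear(128, 10))]
# nn.Sequential accepts an OrderedDict of named modules, which is what
# get_layers_dict builds from its layer tuples.
model = nn.Sequential(collections.OrderedDict(layer_tuples))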
Example #10
    def __init__(self, config, weight_init):
        super(GCEncoder, self).__init__()
        self.num_relations = config.num_relations
        self.num_users = config.num_users
        self.accum = config.accum

        self.rgc_layer = RGCLayer(config, weight_init)
        self.dense_layer = DenseLayer(config, weight_init)
Example #11
    def build_model(self):
        layers = []
        input_shape = np.array(
            [self.batch_size, self.x_dim, self.x_dim, self.c_dim])
        # layer_1: input_layer ==> [n, 28, 28, 1]
        x = InputLayer(input_shape)
        layers.append(x)
        # layer_2: conv_layer [n, 28, 28, 1] ==> [n, 28, 28, 20]
        x = ConvLayer(x,
                      output_nums=20,
                      kernel=5,
                      strides=1,
                      padding='SAME',
                      name='conv1')
        layers.append(x)
        # layer_4: maxpool_layer [n, 28, 28, 20] ==> [n, 14, 14, 20]
        x = MaxPoolLayer(x, kernel=2, strides=2, padding='SAME', name='pool1')
        layers.append(x)
        # layer_5: conv_layer [n, 14, 14, 20] ==> [n, 14, 14, 50]
        x = ConvLayer(x,
                      output_nums=50,
                      kernel=5,
                      strides=1,
                      padding='SAME',
                      name='conv2')
        layers.append(x)
        # layer_7: maxpool_layer [n, 14, 14, 50] ==> [n, 7, 7, 50]
        x = MaxPoolLayer(x, kernel=2, strides=2, padding='SAME', name='pool2')
        layers.append(x)
        # layer_8: flatten_layer [n, 7, 7, 50] ==> [n, 7*7*50]
        x = FlattenLayer(x, name='flatten')
        layers.append(x)
        # layer_9: fullconnected_layer [n, 2450] ==> [n, 500]
        x = DenseLayer(x, output_nums=500, name='dense1')
        layers.append(x)
        # layer_10: relu_layer [n, 500] ==> [n, 500]
        x = ReLULayer(x, name='relu1')
        layers.append(x)
        # layer_11: fullconnected_layer [n, 500] ==> [n, 10]
        x = DenseLayer(x, output_nums=10, name='dense2')
        layers.append(x)
        # layer_12: softmax_layer [n, 10] ==> [n, 10]
        x = SoftMaxLayer(x, name='softmax')
        layers.append(x)

        self.layers = layers
Example #12
    def get_network(self):
        self._read_config()

        input_layer = None
        layers = []

        prev_layer = None
        for data in self._layers:
            if data["type"] == "input":
                input_size = self._input_size * self._input_size
                output_size = int(data["output_size"])
                layer = InputLayer(input_size, output_size)
            elif data["type"] == "dense":
                if "output_size" in data:
                    output_size = int(data["output_size"])
                else:
                    output_size = self._output_size
                activation_function_str = data["af"]
                activation_function = self._lookup_activation_function(
                    activation_function_str)
                activation_function_d = self._lookup_activation_function_d(
                    activation_function_str)
                learning_rate = float(data["la"])
                layer = DenseLayer(prev_layer.get_output_shape(), output_size,
                                   activation_function, activation_function_d,
                                   learning_rate)
            elif data["type"] == "convolution":
                if prev_layer is None:
                    input_shape = (self._input_size, self._input_size, 1)
                else:
                    input_shape = prev_layer.get_output_shape()
                kernel_n = int(data["kernel_n"])
                kernel_m = int(data["kernel_m"])
                channels_out = int(data["channels"])
                output_shape = (kernel_n, kernel_m, channels_out)
                v_stride = int(data["stride_n"])
                h_stride = int(data["stride_m"])
                padding = int(data["padding"])
                la = float(data["la"])
                layer = ConvolutionLayer(input_shape, output_shape, h_stride,
                                         v_stride, padding, la)
            if input_layer is None:
                input_layer = layer
            else:
                layers.append(layer)
            prev_layer = layer

        network = Network(input_layer, layers)
        return network
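For illustration, a hypothetical value of self._layers that this loop would accept. It uses only the keys the code above reads; the activation-function names and the file syntax parsed by _read_config are assumptions:

example_layers = [
    {"type": "input", "output_size": "784"},
    {"type": "convolution", "kernel_n": "3", "kernel_m": "3", "channels": "8",
     "stride_n": "1", "stride_m": "1", "padding": "1", "la": "0.01"},
    {"type": "dense", "output_size": "64", "af": "relu", "la": "0.01"},
    {"type": "dense", "af": "relu", "la": "0.01"},  # no output_size: falls back to self._output_size
]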
Example #13
    def binary_mnist():
        config = [{'out': 20, 'activation': 'relu'}]
        input_shape = [28, 28, 1]
        output_size = 2
        depth = 3
        width = 10
        max_modules_pr_layer = 3
        learning_rate = 0.0001
        optimizer_type = SGD
        loss = 'binary_crossentropy'

        layers = []
        for l in range(depth):
            if len(layers) == 0:
                layers.append(DenseLayer(width, 'L0', config, flatten=True))
            else:
                layers.append(DenseLayer(width, 'L' + str(l), config))

        Layer.initialize_whole_network(layers, input_shape)

        task = TaskContainer(input_shape,
                             output_size,
                             name='unique_binary_mnist',
                             optimizer=optimizer_type,
                             loss=loss,
                             lr=learning_rate)

        pathnet = PathNet(input_shape=input_shape, width=width, depth=depth)
        pathnet._layers = layers
        pathnet._tasks = [task]
        pathnet.max_modules_pr_layer = max_modules_pr_layer

        for layer in pathnet._layers:
            layer.save_initialized_weights()

        return pathnet, task
Example #14
    def __init__(self, config, weight_init):
        super(OurGCEncoder, self).__init__()
        in_dim = 64
        self.num_relations = config.num_relations
        self.num_users = config.num_users
        self.num_items = config.num_items
        self.accum = config.accum
        self.drop_prob = config.drop_prob
        self.encoder_user = nn.Linear(3, in_dim)
        self.encoder_item = nn.Linear(3, in_dim)

        # self.rgc_layer = RGCLayer(config, weight_init)
        self.gnn = GNNLayer(config, weight_init, config.model, config.use_uv,
                            self.num_users, self.num_relations)
        self.dense_layer = DenseLayer(config, weight_init)
        self.edge_obj_cache = {}
Example #15
    def _build(self):
        for i in range(1, len(self.num_hidden)):
            # set last layer activation as linear function otherwise use self.act
            if i == len(self.num_hidden) - 1:
                act = (lambda x: x)
            else:
                act = self.act

            layer = DenseLayer(input_dim=self.num_hidden[i - 1],
                               output_dim=self.num_hidden[i],
                               act=act,
                               weight_initializer=self.weight_initializer,
                               bias_initializer=self.bias_initializer,
                               stddev=self.stddev)

            # add layer to layers list
            self.layers.append(layer)
Example #16
    def _build(self):
        self.hidden1 = SparseLayer(input_dim=self.input_dim,
                                   output_dim=FLAGS.hidden1,
                                   features_nonzero=self.features_nonzero,
                                   act=tf.nn.relu,
                                   dropout=self.dropout,
                                   logging=self.logging)(self.inputs)

        self.embeddings = DenseLayer(input_dim=FLAGS.hidden1,
                                     output_dim=FLAGS.hidden2,
                                     act=lambda x: x,
                                     dropout=self.dropout,
                                     logging=self.logging)(self.hidden1)

        self.z_mean = self.embeddings

        self.reconstruction_adjacency = InnerProductDecoder(
            input_dim=FLAGS.hidden2, act=lambda x: x,
            logging=self.logging)(self.embeddings)

        self.reconstructions = tf.reshape(self.reconstruction_adjacency, [-1])
Example #17
 def __init__(self, layers):
     self._network = []
     for layer in layers:
         layer_type = layer.pop("type")
         if layer_type == "data":
             # this is a data layer
             new_layer = DataLayer(**layer)
         elif layer_type == "conv":
             new_layer = ConvLayer(**layer)
         elif layer_type == "pool":
             new_layer = PoolLayer(**layer)
         elif layer_type == "dense":
             new_layer = DenseLayer(**layer)
         elif layer_type == "relu":
             new_layer = ReLULayer()
         elif layer_type == "loss":
             new_layer = LossLayer(**layer)
         else:
             raise NotImplementedError(
                 "Layer type: {0} not found".format(layer_type))
         self._network.append(new_layer)
     self.initialize()
Example #18
    def addDenseLayer(self, use_batch_norm=False, **kwargs):
        """
        Add dense layer.
        If batch norm flag is True, the dense layer
        will be followed by a batch-normalization layer.
        """

        input_layer = self.input_layer if not self.all_layers \
            else self.all_layers[-1]

        self.n_dense_layers += 1
        name = "dense%i" % self.n_dense_layers

        new_layer = DenseLayer(input_layer, name=name, **kwargs)

        self.all_layers += (new_layer, )
        self.trainable_layers += (new_layer, )

        if use_batch_norm:
            self.n_bn_layers += 1
            name = "bn%i" % self.n_bn_layers
            self.all_layers += (BatchNorm(new_layer, name=name), )
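Usage would presumably look like the calls below. The keyword arguments are passed straight through to DenseLayer, so the accepted names (num_units here, as in Example #6) depend on that constructor:

# Hypothetical calls on a model that already has an input layer set up;
# num_units is assumed to be a DenseLayer constructor argument.
model.addDenseLayer(num_units=256)                        # adds "dense1"
model.addDenseLayer(num_units=128, use_batch_norm=True)   # adds "dense2" followed by "bn1"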
Example #19
  def __init__(self,
               n_points_sample,
               L, 
               max_num_neighs = 4,
               descripDim = [2, 4, 8, 16, 32],
               fittingDim = [16, 8, 4, 2, 1],
               av = [0.0, 0.0],
               std = [1.0, 1.0],
               name='deepMDsimpleEnergy',
               **kwargs):

    super(DeepMDClassification, self).__init__(name=name, **kwargs)

    self.L = L
    # this should be done on the fly, for now we will keep it here
    self.n_points_sample = n_points_sample
    # maximum number of neighbors
    self.max_num_neighs = max_num_neighs
    # we normalize the inputs (should help for the training)
    self.av = av
    self.std = std
    self.descripDim = descripDim
    self.fittingDim = fittingDim
    self.descriptorDim = descripDim[-1]
    # we may need to use the tanh here
    self.layerPyramid   = PyramidLayer(descripDim, 
                                       actfn = tf.nn.relu,
                                       initializer = tf.initializers.GlorotUniform())
    self.layerPyramidInv  = PyramidLayer(descripDim, 
                                       actfn = tf.nn.relu,
                                       initializer = tf.initializers.GlorotUniform())
    
    # we may need to use the relu especially here
    self.fittingNetwork = PyramidLayer(fittingDim, 
                                       actfn = tf.nn.relu)
    self.linfitNet      = DenseLayer(2)    
Example #20
# tag::test_setup[]
import load_mnist
import network
from layers import DenseLayer, ActivationLayer

training_data, test_data = load_mnist.load_data()  # <1>

net = network.SequentialNetwork()  # <2>

net.add(DenseLayer(784, 392))  # <3>
net.add(ActivationLayer(392))
net.add(DenseLayer(392, 196))
net.add(ActivationLayer(196))
net.add(DenseLayer(196, 10))
net.add(ActivationLayer(10))  # <4>

# <1> First, load training and test data.
# <2> Next, initialize a sequential neural network.
# <3> You can then add dense and activation layers one by one.
# <4> The final layer has size 10, the number of classes to predict.
# end::test_setup[]

# tag::test_run[]
net.train(training_data,
          epochs=10,
          mini_batch_size=10,
          learning_rate=3.0,
          test_data=test_data)  # <1>

# <1> You can now easily train the model by specifying train and test data, the number of epochs, the mini-batch size and the learning rate.
# end::test_run[]
Example #21
print("prepocessed_images.shape:", prepocessed_images.shape)
prepocessed_images = np.transpose(prepocessed_images, (0,3,1,2))
print("prepocessed_images.shape after transpose:", prepocessed_images.shape)

# Train-test split 90%-10%
X_train, X_test, y_train, y_test = train_test_split(prepocessed_images, class_label, test_size=0.1)

cnn = MyCNN(
    ConvLayer(filter_size=3, num_filter=3, num_channel=3),
    DetectorLayer(),
    PoolLayer(filter_size=3, stride_size=4, mode="Max"),
    ConvLayer(filter_size=3, num_filter=3, num_channel=3),
    DetectorLayer(),
    PoolLayer(filter_size=3, stride_size=1, mode="Max"),
    FlattenLayer(),
    DenseLayer(n_units=100, activation='relu'),
    DenseLayer(n_units=10, activation='relu'),
    DenseLayer(n_units=1, activation='sigmoid'),
)

cnn.fit(
    features=X_train,
    target=y_train,
    batch_size=5,
    epochs=5,
    learning_rate=0.1
)

model_name = 'pretrained_model'
cnn.save_model(model_name)
Example #22
def get_dense_layer():
    layer = DenseLayer(2, 1)
    layer.w = np.asarray([[1.], [2.]])
    layer.b = np.asarray([2.])
    layer._input_data = np.asarray([[-1, 2]])
    return layer
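Assuming the layer computes x @ w + b, which is what the fixture's fields suggest, the expected forward output for this fixture can be checked by hand:

import numpy as np

w = np.asarray([[1.], [2.]])
b = np.asarray([2.])
x = np.asarray([[-1, 2]])
print(x @ w + b)   # [[5.]]  i.e. (-1)*1 + 2*2 + 2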
Example #23
def build_tower(config, seqs, lengths, labels, initializers={}):
    embedding = DropEmbeddingLayer(config.vocab_size,
                                   config.embed_size,
                                   output_keep_prob=config.keep_prob,
                                   kernel_initializer=initializers.get("embedding_init"),
                                   trainable=False)
    if config.use_elmo:
        elmo = ELMoLayer(vocab_size=config.vocab_size,
                         embed_size=300,
                         hidden_size=1024,
                         cell_type='lstm',
                         num_layers=2,
                         l2_weight=0.1)

    rnn = CudnnRNNLayer(num_units=config.hidden_size,
                        num_layers=config.num_layers,
                        direction="bidirectional",
                        kernel_keep_prob=config.rnn_kernel_keep_prob,
                        output_keep_prob=config.keep_prob,
                        cell='lstm',
                        name='rnn')

    poolers = [MultiHeadAttentivePooling(atn_units=config.atn_units,
                                         num_heads=1,
                                         atn_kernel_keep_prob=config.keep_prob,
                                         atn_weight_keep_prob=config.keep_prob,
                                         name="pooler_%d" % i) for i in range(config.num_aspects)]

    highway = HighwayLayer(num_units=config.hidden_size * 2,
                           num_layers=2,
                           output_keep_prob=config.keep_prob,
                           name='highway')

    dense = DenseLayer(num_units=4, name="dense")

    def aspect_logits(seqs, lengths, training=False):
        embed_seqs = embedding(seqs, training=training)
        if config.use_elmo:
            elmo_seqs = elmo(seqs, lengths)
            embed_seqs = tf.concat([elmo_seqs, embed_seqs], axis=-1)
        rnn_feat_seqs, _ = rnn(embed_seqs, lengths, training=training)
        feat_list = []
        for pooler in poolers:
            feat = pooler(rnn_feat_seqs, lengths, training=training)
            feat = tf.nn.dropout(feat, keep_prob=config.keep_prob)
            feat_list.append(feat)
        feats = tf.concat(feat_list, axis=1)
        feats = highway(feats, training=training)
        logits = dense(feats)
        return logits

    smooth_labels = get_smooth_label(tf.one_hot(labels, depth=4, dtype=tf.float32))
    train_logits = aspect_logits(seqs, lengths, training=True)
    train_loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=train_logits, labels=smooth_labels))
    train_loss += get_f1_loss(smooth_labels, tf.nn.softmax(train_logits))

    if config.use_elmo:
        train_loss += elmo.reg()

    vs = tf.get_variable_scope()
    avger, avg_getter = avg_getter_factory()
    vs.set_custom_getter(avg_getter)
    vs.reuse_variables()
    embedding.build()
    rnn.set_avger(avger)
    for pooler in poolers:
        pooler.build([None, None, config.hidden_size * 2])
    dense.build([None, config.hidden_size * 2])

    eval_logits = aspect_logits(seqs, lengths, training=False)
    eval_oh_preds = tf.one_hot(tf.argmax(eval_logits, axis=-1),
                               depth=4,
                               on_value=True, off_value=False, dtype=tf.bool)
    if config.use_elmo:
        return train_loss, eval_oh_preds, elmo.saver
    else:
        return train_loss, eval_oh_preds, None
Example #24
(x_train, y_train), (x_test, y_test) = mnist.load_data()

x_train = x_train.reshape(x_train.shape[0], 1, 28 * 28)
x_train = x_train.astype('float32')
x_train /= 255

y_train = np_utils.to_categorical(y_train)

x_test = x_test.reshape(x_test.shape[0], 1, 28 * 28)
x_test = x_test.astype('float32')
x_test /= 255
y_test = np_utils.to_categorical(y_test)

# Network
model = NeuralNetwork()
model.add(DenseLayer(28 * 28, 100))
model.add(ActivationLayer(tanh, dtanh))
model.add(DenseLayer(100, 50))
model.add(ActivationLayer(tanh, dtanh))
model.add(DenseLayer(50, 10))
model.add(ActivationLayer(tanh, dtanh))

model.use(mse, dmse)
model.fit(x_train[0:1000], y_train[0:1000], epochs=35, learning_rate=0.1)

# test on 3 samples
out = model.predict(x_test[0:3])
print("\n")
print("predicted values : ")
print(out, end="\n")
print("true values : ")
Example #25
import numpy as np

from network import NeuralNetwork
from layers import DenseLayer, ActivationLayer
from activations import tanh, dtanh
from losses import mse, dmse

x_train = np.array([[[0, 0]], [[0, 1]], [[1, 0]], [[1, 1]]])
y_train = np.array([[[0]], [[1]], [[1]], [[0]]])

model = NeuralNetwork()
model.add(DenseLayer(2, 3))
model.add(ActivationLayer(tanh, dtanh))
model.add(DenseLayer(3, 1))
model.add(ActivationLayer(tanh, dtanh))

model.use(mse, dmse)

model.fit(x_train, y_train, epochs=1000, learning_rate=0.1)

out = model.predict(x_train)
print(out)
Example #26
fp = open(filepath, "w")

config = 0

for epoch in epochs:
    for lr in learning_rates:
        for reg in regularizations:
            for alpha in momentums:
                mean_loss = 0
                mean_validation = 0

                for i in range(k):
                    model = NeuralNetwork()
                    model.add(InputLayer(10))
                    model.add(DenseLayer(50, fanin=10))
                    model.add(DenseLayer(30, fanin=50))
                    model.add(OutputLayer(2, fanin=30))
                    model.compile(size, epoch, lr / size, None, reg, alpha,
                                  "mean_squared_error")
                    (train, val) = data.kfolds(index=i, k=k)
                    mean_loss = mean_loss + model.fit(train[0], train[1])[-1]
                    mean_validation = mean_validation + model.evaluate(
                        val[0], val[1])

                fp.write("{}, {}, {}, {}, {}, {}, {}\n".format(
                    config, epoch, lr, reg, alpha, mean_loss / k,
                    mean_validation / k))

                config = config + 1
Example #27
test_images = test_images.reshape(
    (test_images.shape[0], test_images.shape[1] * test_images.shape[2]))

training_images *= 1.0 / 255.0
test_images *= 1.0 / 255.0

np.random.seed(1345134)

# training_images = training_images[0:1]
# training_labels = training_labels[0:1]

# print (training_images.shape)
# print (training_labels.shape)
# print (test_images.shape)
# print (test_labels.shape)

layers = [DenseLayer(training_images.shape[1], sizes[0], ReLUActivation())]
i = 1
while i < len(sizes):
    layers.append(DenseLayer(sizes[i - 1], sizes[i], ReLUActivation()))
    i += 1
layers.append(SoftmaxCrossEntropyLayer(sizes[i - 1], training_labels.shape[1]))

classifier = Classifier(layers, softmax_cross_entropy)
classifier.train(training_images,
                 training_labels,
                 max_iter=max_iter,
                 learning_rate=learning_rate,
                 target_acc=0.999,
                 batch_size=batch_size)

predictions = classifier.predict(training_images)
Example #28
    def __init__(self, x, y, args):
        self.params_theta = []
        self.params_lambda = []
        self.params_weight = []
        if args.dataset == 'mnist':
            input_size = (None, 1, 28, 28)
        elif args.dataset == 'cifar10':
            input_size = (None, 3, 32, 32)
        else:
            raise AssertionError

        if (args.depth - 1) % args.num_blocks != 0:
            raise ValueError("depth must be num_blocks * n + 1 for some n")

        # input and initial convolution
        layers = [InputLayer(input_size)]
        self.penalty = theano.shared(np.array(0.))

        layers.append(
            Conv2DLayer(args,
                        layers[-1],
                        args.first_output,
                        3,
                        pad='same',
                        W=lasagne.init.HeNormal(gain='relu'),
                        b=None,
                        nonlinearity=None,
                        name='pre_conv'))
        self.add_params_to_self(args, layers[-1])

        layers.append(
            BatchNormLayer(layers[-1], name='pre_bn', beta=None, gamma=None))
        #self.add_params_to_self(args, layers[-1])
        # note: The authors' implementation does *not* have a dropout after the
        #       initial convolution. This was missing in the paper, but important.
        # if dropout:
        #     layers.append(DropoutLayer(network, dropout))
        # dense blocks with transitions in between

        n = (args.depth - 1) // args.num_blocks
        for b in range(args.num_blocks):
            self.dense_block(args,
                             layers,
                             n - 1,
                             args.growth_rate,
                             args.dropout,
                             name_prefix='block%d' % (b + 1))
            if b < args.num_blocks - 1:
                self.transition(args,
                                layers,
                                args.dropout,
                                name_prefix='block%d_trs' % (b + 1))

        # post processing until prediction
        #TODO: treat initialization as hyperparameter, but don't regularize weights
        layers.append(ScaleLayer(args, layers[-1], name='post_scale'))
        self.add_params_to_self(args, layers[-1])
        layers.append(BiasLayer(args, layers[-1], name='post_shift'))
        self.add_params_to_self(args, layers[-1])
        layers.append(
            NonlinearityLayer(layers[-1],
                              nonlinearity=rectify,
                              name='post_relu'))
        layers.append(GlobalPoolLayer(layers[-1], name='post_pool'))
        #TODO: regularize
        layers.append(
            DenseLayer(args,
                       layers[-1],
                       args.classes,
                       nonlinearity=softmax,
                       W=lasagne.init.HeNormal(gain=1),
                       name='output'))
        self.add_params_to_self(args, layers[-1])
        self.layers = layers

        print(self.params_theta)
        print(self.params_weight)
        print(self.params_lambda)

        #training time: deterministic=False
        self.y = ll.get_output(layers[-1], x, deterministic=False)
        self.prediction = T.argmax(self.y, axis=1)
        # cost function
        self.loss = T.mean(categorical_crossentropy(self.y, y))
        self.lossWithPenalty = T.add(self.loss, self.penalty)

        #validation time: deterministic=True
        self.y_det = ll.get_output(layers[-1], x, deterministic=True)
        self.prediction_det = T.argmax(self.y_det, axis=1)
        # cost function
        self.loss_det = T.mean(categorical_crossentropy(self.y_det, y))
        self.lossWithPenalty_det = T.add(self.loss_det, self.penalty)
        print("loss and losswithpenalty", type(self.loss),
              type(self.lossWithPenalty))
Example #29
    def init_comp_graph(self, args, vecs, pretrained, mappings, invmappings, trans_length, feat_dim, log):
        keep_prob = args.keep_prob if self.train else 1.0

        feat_shape = [5] if args.transsys == 'Cov' else ([feat_dim, 5] if self.train else [self.sent_length, 5])

        # build computational graph
        log.info('Building computational graph, this might take a while...')
        # POS BiLSTM
        log.debug('Building computational graph for the POS-BiLSTM...')
        word_emb_dim = vecs.shape[1]
        # Uppercased words are initialized with lowercased vectors, but finetuned separately
        pretrained_base = tf.Variable(vecs[:pretrained], trainable=False)
        pretrained_delta = tf.Variable(np.zeros(vecs[:pretrained].shape, dtype=floatX))
        pretrained_emb = tf.add(pretrained_base, pretrained_delta)
        random_emb = tf.Variable(vecs[pretrained:])
        embeddings = tf.concat([pretrained_emb, random_emb], 0)
        self.words = tf.placeholder(tf.int32, [args.batch_size, self.sent_length])
        self.words2 = tf.placeholder(tf.int32, [args.batch_size, self.sent_length])
        self.sent_lengths = tf.placeholder(tf.int32, [args.batch_size])
        word_emb = tf.nn.embedding_lookup(embeddings, self.words)
        word_emb2 = tf.nn.embedding_lookup(embeddings, self.words2)

        with tf.variable_scope('bilstm1'):
            lstm_fw = rnn_cell.MultiRNNCell([rnn_cell.BasicLSTMCell(args.hidden_size, state_is_tuple=True) for _ in range(args.layers)], state_is_tuple=True)
            lstm_bw = rnn_cell.MultiRNNCell([rnn_cell.BasicLSTMCell(args.hidden_size, state_is_tuple=True) for _ in range(args.layers)], state_is_tuple=True)

            bilstm_outputs, _ = rnn.bidirectional_dynamic_rnn(lstm_fw, lstm_bw, word_emb, sequence_length=self.sent_lengths, dtype=tf.float32)


        # POS
        self.gold_pos = tf.placeholder(tf.int32, [args.batch_size, self.sent_length])
        if args.fpos:
            self.gold_fpos = tf.placeholder(tf.int32, [args.batch_size, self.sent_length])

        # POS system
        log.debug('Building computational graph for the POS-tagging system...')
        if not args.no_pos:
            log.debug('Building computational graph for dense layers following POS-BiLSTM...')
            pos_densesizes = [args.hidden_size] + [int(x) for x in args.pos_dense_layers.split(',')] + [args.pos_emb_dim, len(mappings['pos'])]
            pos_denselayers = len(pos_densesizes) - 1

            pos_dense_inputs = [tf.reshape(x, [-1, args.hidden_size]) for x in bilstm_outputs]
            pos_dense = [MergeLayer(pos_densesizes[0], pos_densesizes[0], pos_densesizes[1], keepProb=keep_prob, combination='affine')]
            pos_dense += [DenseLayer(pos_densesizes[i], pos_densesizes[i+1], keepProb=keep_prob) for i in xrange(1, pos_denselayers - 1)]
            # split representations for head and dependent
            pos_dense += [DenseLayer(pos_densesizes[-2], pos_densesizes[-1], keepProb=keep_prob, nl=lambda x:x)]

            pos_dense_intermediate = pos_dense[0](pos_dense_inputs[0], pos_dense_inputs[1])

            for l in xrange(1, pos_denselayers-1):
                pos_dense_intermediate = pos_dense[l](pos_dense_intermediate)

            pos_dense_outputs = tf.reshape(pos_dense[-1](pos_dense_intermediate), [args.batch_size, -1, len(mappings['pos'])])
            if args.fpos:
                fpos_dense = DenseLayer(args.pos_emb_dim, len(mappings['fpos']), keepProb=keep_prob, nl=lambda x:x)
                fpos_dense_outputs = tf.reshape(fpos_dense(pos_dense_intermediate), [args.batch_size, -1, len(mappings['fpos'])])
        else:
            pos_dense_outputs = [None for _ in range(args.batch_size)]

        pos_trainables = tf.Variable(tf.truncated_normal((len(mappings['pos']), args.pos_emb_dim)),
                dtype=tf.float32, name='pos_trainables')
        pos_untrainable = tf.Variable(tf.zeros((1, args.pos_emb_dim), dtype=tf.float32), trainable=False)
        pos_embeddings = tf.concat([pos_trainables, pos_untrainable], 0)

        pos_loss_pred_ = lambda i: self.pos_loss_pred(i, pos_embeddings, pos_dense_outputs[i], len(mappings['pos']), self.gold_pos, pos_trainables)

        if self.train:
            pos_losses = tf.multiply(args.pos_mult, tf.map_fn(lambda i: pos_loss_pred_(i)[0], tf.range(args.batch_size), parallel_iterations=args.batch_size, dtype=tf.float32))
        else:
            self.pos_preds = tf.map_fn(lambda i: pos_loss_pred_(i)[0], tf.range(args.batch_size), parallel_iterations=args.batch_size)

        self.pos_embs = tf.map_fn(lambda i: pos_loss_pred_(i)[1], tf.range(args.batch_size), parallel_iterations=args.batch_size, dtype=tf.float32)

        if args.fpos:
            fpos_trainables = tf.Variable(tf.truncated_normal((len(mappings['fpos']), args.pos_emb_dim)),
                    dtype=tf.float32, name='fpos_trainables')
            fpos_untrainable = tf.Variable(tf.zeros((1, args.pos_emb_dim), dtype=tf.float32), trainable=False)
            fpos_embeddings = tf.concat([fpos_trainables, fpos_untrainable], 0)

            fpos_loss_pred_ = lambda i: self.pos_loss_pred(i, fpos_embeddings, fpos_dense_outputs[i], len(mappings['fpos']), self.gold_fpos, fpos_trainables)

            if self.train:
                fpos_losses = tf.multiply(args.pos_mult, tf.map_fn(lambda i: fpos_loss_pred_(i)[0], tf.range(args.batch_size), parallel_iterations=args.batch_size, dtype=tf.float32))
                pos_losses = pos_losses + fpos_losses
            else:
                self.fpos_preds = tf.map_fn(lambda i: fpos_loss_pred_(i)[0], tf.range(args.batch_size), parallel_iterations=args.batch_size)

            self.fpos_embs = tf.map_fn(lambda i: fpos_loss_pred_(i)[1], tf.range(args.batch_size), parallel_iterations=args.batch_size, dtype=tf.float32)

        bilstm_outputs = tf.concat([bilstm_outputs[0], bilstm_outputs[1]], 2)

        # Concatenate tagger BiLSTM outputs as Parser BiLSTM input
        concat_list = [bilstm_outputs]
        dim = args.hidden_size * 2

        concat_list += [self.pos_embs]
        dim += args.pos_emb_dim

        if args.fpos:
            concat_list += [self.fpos_embs]
            dim += args.pos_emb_dim

        bilstm2_inputs = tf.reshape(tf.concat(concat_list, 2), [args.batch_size, -1, dim])

        # Parse BiLSTM
        log.debug('Building computational graph for the Parse-BiLSTM...')

        with tf.variable_scope('bilstm2'):
            lstm2_fw = rnn_cell.MultiRNNCell([rnn_cell.BasicLSTMCell(args.hidden_size, state_is_tuple=True) for _ in range(args.layers2)], state_is_tuple=True)
            lstm2_bw = rnn_cell.MultiRNNCell([rnn_cell.BasicLSTMCell(args.hidden_size, state_is_tuple=True) for _ in range(args.layers2)], state_is_tuple=True)

            bilstm2_outputs, _ = rnn.bidirectional_dynamic_rnn(lstm2_fw, lstm2_bw, bilstm2_inputs, sequence_length=self.sent_lengths, dtype=tf.float32)

        # Dense layer(s)
        log.debug('Building computational graph for dense layers following Parse-BiLSTM...')
        densesizes = [args.hidden_size] + [int(x) for x in args.dense_layers.split(',')] + [args.rel_emb_dim]
        denselayers = len(densesizes) - 1

        dense_inputs = [tf.reshape(x, [-1, args.hidden_size]) for x in bilstm2_outputs]
        if denselayers == 1:
            dense = [[MergeLayer(densesizes[0], densesizes[0], densesizes[1], keepProb=keep_prob, combination='affine') for _ in xrange(2)]]
            dense_outputs = [dense[0][j](dense_inputs[0], dense_inputs[1]) for j in xrange(2)]
        else:
            dense = [MergeLayer(densesizes[0], densesizes[0], densesizes[1], keepProb=keep_prob, combination='affine')]
            dense += [DenseLayer(densesizes[i], densesizes[i+1], keepProb=keep_prob) for i in xrange(1, denselayers - 1)]
            # split representations for head and dependent
            dense += [[DenseLayer(densesizes[-2], densesizes[-1], keepProb=keep_prob) for _ in xrange(2)]]

            dense_outputs = dense[0](dense_inputs[0], dense_inputs[1])

            for l in xrange(1, denselayers-1):
                dense_outputs = dense[l](dense_outputs)

            dense_outputs = [dense[-1][j](dense_outputs) for j in xrange(2)]

        dense_outputs = [tf.reshape(x, [args.batch_size, -1, args.rel_emb_dim]) for x in dense_outputs]

        self.combined_head = dense_outputs[0]
        self.combined_dep  = dense_outputs[1]

        # transition system
        log.debug('Building computational graph for the transition system...')
        if self.train:
            self.trans_feat_ids = tf.placeholder(tf.int32, [args.batch_size, trans_length] + feat_shape)
            
            self.trans_feat_sizes = tf.placeholder(tf.int32, [args.batch_size, trans_length])
            self.trans_labels = tf.placeholder(tf.int32, [args.batch_size, trans_length])
            self.trans_lengths = tf.placeholder(tf.int32, [args.batch_size])
        else:
            self.trans_feat_ids = tf.placeholder(tf.int32, [None] + feat_shape)
            self.trans_feat_sizes = tf.placeholder(tf.int32, [None])

        self.rel_merge = MergeLayer(args.rel_emb_dim, args.rel_emb_dim, args.rel_emb_dim,
                keepProb=keep_prob, combination=args.combination)

        if args.transsys == 'NCov' or args.transsys == 'Cov2' or args.transsys == 'Cov3':
            self.rel_dense = DenseLayer(args.rel_emb_dim, len(mappings['rel']), nl=lambda x:x)

            transition_dense = MergeLayer(args.rel_emb_dim, args.rel_emb_dim, 1, nl=lambda x:x, combination=args.combination)
            self.transition_logit = transition_dense(tf.reshape(self.combined_head, [-1, args.rel_emb_dim]),
                    tf.reshape(self.combined_dep, [-1, args.rel_emb_dim]))
            self.transition_logit = tf.reshape(self.transition_logit, (args.batch_size, -1))
        elif args.transsys in ['AER', 'AES', 'Cov']:
            self.rel_dense = DenseLayer(args.rel_emb_dim * 4, 2 + 2 * len(mappings['rel']), nl=lambda x:x)
        elif args.transsys in ['ASd', 'AH']:
            self.rel_dense = DenseLayer(args.rel_emb_dim * 4, 1 + 2 * len(mappings['rel']), nl=lambda x:x)

        SHIFT = mappings['action']['Shift']
        if self.train:
            if args.transsys == 'NCov' or args.transsys == 'Cov3' :
                trans_loss_f = lambda i, j: self.NCov_transition_loss_pred(i, j, self.combined_head[i], self.combined_dep[i], self.transition_logit[i], SHIFT)
            elif args.transsys == 'Cov2':
                
                trans_loss_f = lambda i, j: self.NCov_transition_loss_pred(i, j, self.combined_head[i], self.combined_dep[i], self.transition_logit[i], SHIFT)    
            else:
                trans_loss_f = lambda i, j: self.traditional_transition_loss_pred(i, j, self.combined_head[i], self.combined_dep[i])

            def _ex_loss(i):
                trans_loss = tf.reduce_sum(tf.map_fn(lambda j: trans_loss_f(i, j), tf.range(self.trans_lengths[i]), dtype=tf.float32, parallel_iterations=100))
                if not args.no_pos:
                    loss = tf.add(pos_losses[i], trans_loss)
                else:
                    loss = trans_loss

                return loss

            losses = tf.map_fn(_ex_loss, tf.range(args.batch_size), dtype=tf.float32, parallel_iterations=100)

            self._loss = tf.reduce_mean(losses)
        else:
            self.combined_head_placeholder = tf.placeholder(tf.float32, (None, self.sent_length, args.rel_emb_dim))
            self.combined_dep_placeholder  = tf.placeholder(tf.float32, (None, self.sent_length, args.rel_emb_dim))
            if args.transsys == 'NCov' or args.transsys == 'Cov3':
                self.trans_logit_placeholder = tf.placeholder(tf.float32, (None, self.sent_length))
                trans_pred = lambda i, k: self.NCov_transition_loss_pred(i, k, self.combined_head_placeholder[i], self.combined_dep_placeholder[i], self.trans_logit_placeholder[i], SHIFT)
                self.pred_output_size = self.sent_length * len(mappings['rel']) + 1
            
            elif args.transsys == 'Cov2':
                self.trans_logit_placeholder = tf.placeholder(tf.float32, (None, self.sent_length))
                trans_pred = lambda i, k: self.NCov_transition_loss_pred(i, k, self.combined_head_placeholder[i], self.combined_dep_placeholder[i], self.trans_logit_placeholder[i], SHIFT)
                self.pred_output_size = self.sent_length * len(mappings['rel']) + 1  # There are 2 (NA and SH), but we set it back to 1 because NA becomes an arc-transition
            
            else:
                trans_pred = lambda i, k: self.traditional_transition_loss_pred(i, k, self.combined_head_placeholder[i], self.combined_dep_placeholder[i])
                if args.transsys in ['AES', 'AER', 'Cov']:
                    self.pred_output_size = 2 * len(mappings['rel']) + 2
                elif args.transsys in ['ASd', 'AH']:
                    self.pred_output_size = 2 * len(mappings['rel']) + 1

            self._trans_predictors = [[trans_pred(i,k) for k in range(args.beam_size)] for i in xrange(args.batch_size)]
Example #30
def main(num_epochs=10,
         k=100,
         batch_size=128,
         display_freq=100,
         save_freq=1000,
         load_previous=False,
         attention=True,
         word_by_word=True,
         p=0,
         mode='word_by_word'):
    print('num_epochs: {}'.format(num_epochs))
    print('k: {}'.format(k))
    print('batch_size: {}'.format(batch_size))
    print('display_frequency: {}'.format(display_freq))
    print('save_frequency: {}'.format(save_freq))
    print('load previous: {}'.format(load_previous))
    print('attention: {}'.format(attention))
    print('word_by_word: {}'.format(word_by_word))
    save_filename = './snli/{}_model.npz'.format(mode)
    print("Building network ...")
    premise_var = T.imatrix('premise_var')
    premise_mask = T.imatrix('premise_mask')
    hypo_var = T.imatrix('hypo_var')
    hypo_mask = T.imatrix('hypo_mask')
    unchanged_W = pickle.load(open('./snli/unchanged_W.pkl', 'rb'))
    unchanged_W = unchanged_W.astype('float32')
    unchanged_W_shape = unchanged_W.shape
    oov_in_train_W = pickle.load(open('./snli/oov_in_train_W.pkl', 'rb'))
    oov_in_train_W = oov_in_train_W.astype('float32')
    oov_in_train_W_shape = oov_in_train_W.shape
    print('unchanged_W.shape: {0}'.format(unchanged_W_shape))
    print('oov_in_train_W.shape: {0}'.format(oov_in_train_W_shape))
    # hyperparameters
    learning_rate = 0.001
    l2_weight = 0.
    #Input layers
    l_premise = lasagne.layers.InputLayer(shape=(None, premise_max),
                                          input_var=premise_var)
    l_premise_mask = lasagne.layers.InputLayer(shape=(None, premise_max),
                                               input_var=premise_mask)
    l_hypo = lasagne.layers.InputLayer(shape=(None, hypothesis_max),
                                       input_var=hypo_var)
    l_hypo_mask = lasagne.layers.InputLayer(shape=(None, hypothesis_max),
                                            input_var=hypo_mask)
    #Embedded layers
    premise_embedding = EmbeddedLayer(l_premise,
                                      unchanged_W,
                                      unchanged_W_shape,
                                      oov_in_train_W,
                                      oov_in_train_W_shape,
                                      p=p)
    #weights shared with premise_embedding
    hypo_embedding = EmbeddedLayer(
        l_hypo,
        unchanged_W=premise_embedding.unchanged_W,
        unchanged_W_shape=unchanged_W_shape,
        oov_in_train_W=premise_embedding.oov_in_train_W,
        oov_in_train_W_shape=oov_in_train_W_shape,
        p=p,
        dropout_mask=premise_embedding.dropout_mask)
    #Dense layers
    l_premise_linear = DenseLayer(premise_embedding,
                                  k,
                                  nonlinearity=lasagne.nonlinearities.linear)
    l_hypo_linear = DenseLayer(hypo_embedding,
                               k,
                               W=l_premise_linear.W,
                               b=l_premise_linear.b,
                               nonlinearity=lasagne.nonlinearities.linear)

    encoder = Encoder(l_premise_linear,
                      k,
                      peepholes=False,
                      mask_input=l_premise_mask)
    #initialized with encoder final hidden state
    decoder = Decoder(l_hypo_linear,
                      k,
                      cell_init=encoder,
                      peepholes=False,
                      mask_input=l_hypo_mask,
                      encoder_mask_input=l_premise_mask,
                      attention=attention,
                      word_by_word=word_by_word)
    if p > 0.:
        print('apply dropout rate {} to decoder'.format(p))
        decoder = lasagne.layers.DropoutLayer(decoder, p)
    l_softmax = lasagne.layers.DenseLayer(
        decoder, num_units=3, nonlinearity=lasagne.nonlinearities.softmax)
    target_var = T.ivector('target_var')

    #lasagne.layers.get_output produces a variable for the output of the net
    prediction = lasagne.layers.get_output(l_softmax, deterministic=False)
    #The network output will have shape (n_batch, 3);
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    cost = loss.mean()
    if l2_weight > 0.:
        #apply l2 regularization
        print('apply l2 penalty to all layers, weight: {}'.format(l2_weight))
        regularized_layers = {encoder: l2_weight, decoder: l2_weight}
        l2_penalty = lasagne.regularization.regularize_network_params(
            l_softmax, lasagne.regularization.l2) * l2_weight
        cost += l2_penalty


    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_softmax, trainable=True)
    #Compute adam updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adam(cost,
                                   all_params,
                                   learning_rate=learning_rate)

    test_prediction = lasagne.layers.get_output(l_softmax, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # lasagne.objectives.categorical_accuracy()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train_fn = theano.function(
        [premise_var, premise_mask, hypo_var, hypo_mask, target_var],
        cost,
        updates=updates)
    val_fn = theano.function(
        [premise_var, premise_mask, hypo_var, hypo_mask, target_var],
        [test_loss, test_acc])
    print("Training ...")

    print('train_data.shape: {0}'.format(train_data.shape))
    print('val_data.shape: {0}'.format(val_data.shape))
    print('test_data.shape: {0}'.format(test_data.shape))
    try:
        # Finally, launch the training loop.
        print("Training started...")
        # iterate over epochs:
        for epoch in range(num_epochs):
            # In each epoch, do a full pass over the training data:
            shuffled_train_data = train_data.reindex(
                np.random.permutation(train_data.index))
            train_err = 0
            train_acc = 0
            train_batches = 0
            start_time = time.time()
            display_at = time.time()
            save_at = time.time()
            for start_i in range(0, len(shuffled_train_data), batch_size):
                batched_data = shuffled_train_data[start_i:start_i +
                                                   batch_size]
                ps, p_masks, hs, h_masks, labels = prepare(batched_data)
                train_err += train_fn(ps, p_masks, hs, h_masks, labels)
                err, acc = val_fn(ps, p_masks, hs, h_masks, labels)
                train_acc += acc
                train_batches += 1
                # display
                if train_batches % display_freq == 0:
                    print("Seen {:d} samples, time used: {:.3f}s".format(
                        start_i + batch_size,
                        time.time() - display_at))
                    print("  current training loss:\t\t{:.6f}".format(
                        train_err / train_batches))
                    print("  current training accuracy:\t\t{:.6f}".format(
                        train_acc / train_batches))
                # do tmp save model
                if train_batches % save_freq == 0:
                    print(
                        'saving to ..., time used {:.3f}s'.format(time.time() -
                                                                  save_at))
                    np.savez(save_filename,
                             *lasagne.layers.get_all_param_values(l_softmax))
                    save_at = time.time()

            # And a full pass over the validation data:
            val_err = 0
            val_acc = 0
            val_batches = 0
            for start_i in range(0, len(val_data), batch_size):
                batched_data = val_data[start_i:start_i + batch_size]
                ps, p_masks, hs, h_masks, labels = prepare(batched_data)
                err, acc = val_fn(ps, p_masks, hs, h_masks, labels)
                val_err += err
                val_acc += acc
                val_batches += 1

            # Then we print the results for this epoch:
            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs,
                time.time() - start_time))
            print("  training loss:\t\t{:.6f}".format(train_err /
                                                      train_batches))
            print("  training accuracy:\t\t{:.2f} %".format(
                train_acc / train_batches * 100))
            print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
            print("  validation accuracy:\t\t{:.2f} %".format(
                val_acc / val_batches * 100))

            # After training, we compute and print the test error:
            test_err = 0
            test_acc = 0
            test_batches = 0
            for start_i in range(0, len(test_data), batch_size):
                batched_data = test_data[start_i:start_i + batch_size]
                ps, p_masks, hs, h_masks, labels = prepare(batched_data)
                err, acc = val_fn(ps, p_masks, hs, h_masks, labels)
                test_err += err
                test_acc += acc
                test_batches += 1
            # print("Final results:")
            print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
            print("  test accuracy:\t\t{:.2f} %".format(test_acc /
                                                        test_batches * 100))
            filename = './snli/{}_model_epoch{}.npz'.format(mode, epoch + 1)
            print('saving to {}'.format(filename))
            np.savez(filename, *lasagne.layers.get_all_param_values(l_softmax))

        # Optionally, you could now dump the network weights to a file like this:
        # np.savez('model.npz', *lasagne.layers.get_all_param_values(network))
        #
        # And load them again later on like this:
        # with np.load('model.npz') as f:
        #     param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        # lasagne.layers.set_all_param_values(network, param_values)
    except KeyboardInterrupt:
        print('exit ...')