def ewma(series, axis=None, span=VOL_WINDOW_SPAN, adjust=True, initial=None):
    """
    Exponentially-weighted moving average of a symbolic tensor along one axis.

    The smoothing factor is ``alpha = 2 / (span + 1)``.

    Parameters
    ----------
    series : theano tensor
        Values to smooth.
    axis : int, optional
        Axis the recursion runs over. May be omitted only when ``series``
        is 1-D, in which case axis 0 is used.
    span : number
        Window span; must be >= 1.
    adjust : bool
        If True, each output is the weighted sum of the inputs seen so far
        (weights ``(1 - alpha) ** k``) divided by the sum of those weights.
        If False, the plain recursion
        ``y_t = alpha * x_t + (1 - alpha) * y_{t-1}`` is used.
    initial : optional
        Value of ``y_{-1}`` for the ``adjust=False`` recursion. Defaults to
        the first slice of ``series`` along ``axis``. Must be left as None
        when ``adjust`` is True.

    Returns
    -------
    theano tensor
        The smoothed values, with the axes in the same order as ``series``.

    Raises
    ------
    ValueError
        If ``axis`` is omitted for input that is not 1-D, if ``span`` is not
        >= 1, or if ``initial`` is given together with ``adjust=True``.
    """
    if axis is None:
        if series.ndim != 1:
            raise ValueError("Please specify which axis to compute ewma over (usually time axis)")
        axis = 0
    if not span >= 1:
        raise ValueError("span must be >= 1")
    alpha = 2. / (span + 1)
    # scan iterates over the leading axis, so bring the requested axis to the front.
    series = T.swapaxes(series, axis, 0)
    if adjust:
        if initial is not None:
            raise ValueError("initial cannot be combined with adjust=True")
        initial = T.zeros_like(series[0])
    else:
        if initial is None:
            initial = series[0]
        # The scan below accumulates un-normalised numerators, so the seed is
        # pre-divided by alpha. Rebind rather than use `/=` so that an array
        # passed in by the caller is never modified in place.
        initial = initial / alpha

    def ewma_numerator_step(a_i, prev_ewma):
        return a_i + (1. - alpha) * prev_ewma

    ewma_numerators, _ = theano.scan(ewma_numerator_step, series,
                                     outputs_info=initial, strict=True)

    if adjust:
        # Running sum of the weights (1 - alpha) ** k, one entry per step.
        ewma_denominators = T.cumsum((1 - alpha) ** T.arange(ewma_numerators.shape[0]))
        # Reshape to (steps, 1, ..., 1) so the division broadcasts over the other axes.
        series_ewma = ewma_numerators / ewma_denominators.reshape((-1,)+(1,)*(ewma_numerators.ndim-1))
    else:
        series_ewma = ewma_numerators * alpha
    # Restore the caller's axis order.
    series_ewma = T.swapaxes(series_ewma, 0, axis)
    return series_ewma
Exemple #2
0
        def columnwise_softmax(o):
            """
            Softmax over axis 1 of ``o``, with clipping and a temperature.

            Axis 1 is swapped to the end, all leading axes are collapsed into
            rows, each row is normalised, and the original layout is restored.
            Inputs are clipped to [-5, 5] before being divided by
            ``softmax_temperature`` and exponentiated.
            """
            dist_last = T.swapaxes(o, 1, 2)
            rows = dist_last.reshape((-1, dist_last.shape[-1]))
            # Clip before exponentiating so the exponent stays in a bounded range.
            exps = T.exp(T.clip(rows, -5., 5.) / softmax_temperature)
            probs = exps / exps.sum(axis=1, keepdims=True)
            # Undo the flattening, then the axis swap.
            return T.swapaxes(probs.reshape(dist_last.shape), 1, 2)
Exemple #3
0
 def forward(self, X, previous_state, previous_hidden):
     """
     Run one step through the input layer and then every layer in ``self.layers``.

     Each layer's ``step`` receives the output of the layer below it, that
     layer's slice of ``previous_state`` and of ``previous_hidden`` (both
     indexed along axis 1 by layer position, input layer first), and
     ``self.dropout_probability``.

     Returns a pair ``(hiddens, states)``: the per-layer outputs and states,
     each stacked and then with axes 0 and 1 swapped.
     """
     hiddens, states = [], []
     signal = X
     for depth, layer in enumerate([self.input_layer] + list(self.layers)):
         signal, cell = layer.step(signal,
                                   previous_state[:, depth, :],
                                   previous_hidden[:, depth, :],
                                   self.dropout_probability)
         hiddens.append(signal)
         states.append(cell)
     stacked_hiddens = T.stack(*hiddens)
     stacked_states = T.stack(*states)
     # Stacking puts the layer axis first; move it to position 1.
     return T.swapaxes(stacked_hiddens, 0, 1), T.swapaxes(stacked_states, 0, 1)
def set_sampling_function(decoder_feature_function,
                          decoder_red_function,
                          decoder_green_function,
                          decoder_blue_function):
    """
    Compile a function mapping hidden codes to the most likely RGB image.

    The second output of ``decoder_feature_function`` is fed to the three
    colour decoders. For each colour, the decoder output is treated as
    per-pixel scores over intensities (axis 1); the arg-max intensity is
    taken for every pixel and the three channels are concatenated on axis 1.

    Returns a compiled theano function taking one matrix (``hidden_data``)
    and returning a one-element list holding the image tensor.
    """
    hidden_data = T.matrix(name='hidden_data',
                           dtype=theano.config.floatX)

    # decoder
    decoder_feature = decoder_feature_function(hidden_data)[1]
    red_scores = decoder_red_function(decoder_feature)
    green_scores = decoder_green_function(decoder_feature)
    blue_scores = decoder_blue_function(decoder_feature)

    # All three channels are reshaped using the sizes read from the red output.
    num_samples = red_scores.shape[0]
    num_rows = red_scores.shape[2]
    num_cols = red_scores.shape[3]
    num_pixels = num_rows * num_cols

    def most_likely_intensity(channel_scores):
        # (num_samples, num_intensity, num_pixels)
        flat = T.flatten(channel_scores, 3)
        # (num_samples, num_pixels, num_intensity)
        flat = T.swapaxes(flat, axis1=1, axis2=2)
        # (num_samples*num_pixels, num_intensity)
        flat = flat.reshape((num_samples * num_pixels, -1))
        best = T.argmax(T.nnet.softmax(flat), axis=1)
        return best.reshape((num_samples, 1, num_rows, num_cols))

    channels = [most_likely_intensity(scores)
                for scores in (red_scores, green_scores, blue_scores)]
    decoder_image = T.concatenate(channels, axis=1)

    return theano.function(inputs=[hidden_data],
                           outputs=[decoder_image],
                           on_unused_input='ignore')
Exemple #5
0
        def make_masks(x):
            """
            Draw one binomial dropout mask per layer and stack them.

            Every layer gets its own random stream, seeded from numpy's
            global generator, and a mask of size
            ``(layer.hidden_dim, layer.minibatch_dim)`` drawn with
            ``p=self.dropout``. The argument ``x`` is not used.
            """
            def layer_mask(layer):
                stream = T.shared_randomstreams.RandomStreams(np.random.randint(999999))
                return stream.binomial(p=self.dropout,
                                       size=(layer.hidden_dim, layer.minibatch_dim),
                                       dtype=theano.config.floatX)

            per_layer = [layer_mask(layer) for layer in self.layers]

            # Stacking puts the layer axis first: (layer, hidden_dim, minibatch_dim).
            # The two swaps move it last, giving (hidden_dim, minibatch_dim, layer)
            # for point-wise multiplication.
            stacked = T.stack(per_layer)
            return T.swapaxes(T.swapaxes(stacked, 0, 2), 0, 1)
Exemple #6
0
 def apply(self, input_):
     """
     Swap ``self.axis1`` and ``self.axis2`` of ``input_``.

     When ``self.debug`` is set, the result is wrapped in a theano ``Print``
     op that reports its shape under the label ``'output:'``.
     """
     swapped = tensor.swapaxes(input_, self.axis1, self.axis2)
     if not self.debug:
         return swapped
     import theano
     shape_printer = theano.printing.Print('output:', attrs=('shape', ))
     return shape_printer(swapped)
Exemple #7
0
 def step(self, X, previous_hidden, previous_state):
     """
     Advance every layer by one step.

     Layer 0 consumes ``X`` with the input weights ``Wi/Wf/Wc/Wo``; each
     deeper layer ``l`` consumes the output of layer ``l - 1`` with the
     layer-to-layer weights ``Whi/Whf/Whc/Who[l - 1]``. Every layer reads its
     own slice of ``previous_hidden`` and ``previous_state`` along axis 1.

     Returns ``(outs, states)``: the per-layer outputs and states, each
     stacked and then with axes 0 and 1 swapped.
     """
     layer_out, layer_state = self.forward_with_weights(
         X, previous_hidden[:, 0, :], previous_state[:, 0, :],
         self.Wi, self.Ui[0], self.bi[0],
         self.Wf, self.Uf[0], self.bf[0],
         self.Wc, self.Uc[0], self.bc[0],
         self.Wo, self.Vo[0], self.Uo[0], self.bo[0])
     outs, states = [layer_out], [layer_state]
     for l in xrange(1, self.num_layers):
         below = l - 1  # index into the layer-to-layer weight lists
         layer_out, layer_state = self.forward_with_weights(
             layer_out, previous_hidden[:, l, :], previous_state[:, l, :],
             self.Whi[below], self.Ui[l], self.bi[l],
             self.Whf[below], self.Uf[l], self.bf[l],
             self.Whc[below], self.Uc[l], self.bc[l],
             self.Who[below], self.Vo[l], self.Uo[l], self.bo[l])
         outs.append(layer_out)
         states.append(layer_state)
     stacked_outs = T.stack(*outs)
     stacked_states = T.stack(*states)
     # Stacking puts the layer axis first; move it to position 1.
     return T.swapaxes(stacked_outs, 0, 1), T.swapaxes(stacked_states, 0, 1)
Exemple #8
0
 def compute_output(self, network, in_vw):
     """
     Register the input variable with two axes swapped as the "default" output.

     The axis pair comes from the ``"axes"`` hyperparameter; the recorded
     shape is the input shape with the same two entries exchanged.
     """
     first, second = network.find_hyperparameter(["axes"])
     swapped_shape = list(in_vw.shape)
     held = swapped_shape[first]
     swapped_shape[first] = swapped_shape[second]
     swapped_shape[second] = held
     swapped_variable = T.swapaxes(in_vw.variable, first, second)
     network.create_vw(
         "default",
         variable=swapped_variable,
         shape=swapped_shape,
         tags={"output"},
     )
Exemple #9
0
 def compute_output(self, network, in_vw):
     """
     Create the "default" output: ``in_vw`` with two of its axes exchanged.

     Both the symbolic variable and the recorded shape are swapped at the
     axis pair given by the ``"axes"`` hyperparameter.
     """
     axes = network.find_hyperparameter(["axes"])
     axis1, axis2 = axes
     shape = list(in_vw.shape)
     new_at_1, new_at_2 = shape[axis2], shape[axis1]
     shape[axis1] = new_at_1
     shape[axis2] = new_at_2
     network.create_vw(
         "default",
         variable=T.swapaxes(in_vw.variable, axis1, axis2),
         shape=shape,
         tags={"output"},
     )
Exemple #10
0
  def fwd(self, x, disk=None, layer_begin=None, layer_end=None):
    """
    Apply this layer to a signal using a precomputed patch operator.

    x : signal
    disk : sparse patch operator; the rows for this layer are
        ``disk[layer_begin[self.layer_id]:layer_end[self.layer_id], :]``
    layer_begin, layer_end : per-layer row bounds into ``disk``, indexed by
        ``self.layer_id``

    The patch operator is supplied by the caller rather than built here.
    Returns ``self.activation`` applied to the flattened result of
    convolving the extracted patches with ``self.a``.
    """
    first_row = layer_begin[self.layer_id]
    last_row = layer_end[self.layer_id]
    layer_disk = disk[first_row:last_row, :]
    patch = Tsp.basic.structured_dot(layer_disk, x)

    n_bins = self.ntheta * self.nrho
    patch = T.reshape(patch, [n_bins, x.shape[0], x.shape[1]])
    # Move the bin axis from the front to the end: (x.shape[0], x.shape[1], n_bins).
    patch = T.swapaxes(patch, 0, 1)
    patch = T.swapaxes(patch, 1, 2)
    patch = T.reshape(patch, [x.shape[0], self.nin, n_bins, 1])

    response = theano.tensor.nnet.conv.conv2d(patch, self.a)
    return self.activation(response.flatten(2))
Exemple #11
0
def build_transition_cost(logits, targets, num_transitions):
    """
    Build the symbolic cost and accuracy for parse action prediction.

    ``logits`` has its first two axes swapped and ``targets`` is transposed
    so that the scan runs once per timestep. ``num_transitions`` holds the
    count of real (non-padding) transitions per example.

    Returns ``(cost, acc)``. ``cost`` is the mean cross-entropy over every
    step, padding included. ``acc`` counts errors only on the last
    ``num_transitions`` steps of each example.
    """
    # scan iterates over the leading axis, so put the timestep axis first.
    logits_by_step = T.swapaxes(logits, 0, 1)
    targets_by_step = targets.T

    def step_cost(step_logits, step_targets, num_transitions):
        # TODO(jongauthier): Taper down xent cost as we proceed through
        # sequence?
        probs = T.nnet.softmax(step_logits)
        xent = T.nnet.categorical_crossentropy(probs, step_targets)
        wrong = T.neq(T.argmax(step_logits, axis=1), step_targets)
        return xent, wrong

    (costs, errors), _ = theano.scan(step_cost,
                                     [logits_by_step, targets_by_step],
                                     non_sequences=[num_transitions])

    # Mask selecting only the steps that involve real data: step k (1-based)
    # is kept for an example when k exceeds that example's padding length.
    unrolling_length = T.shape(costs)[0]
    padding = T.reshape(unrolling_length - num_transitions, (1, -1))
    step_number = T.reshape(T.arange(unrolling_length) + 1, (-1, 1))
    mask = T.gt(step_number, padding)

    # Accuracy uses the mask.
    masked_errors = T.sum(errors * mask, dtype=theano.config.floatX)
    real_steps = T.sum(num_transitions, dtype=theano.config.floatX)
    acc = 1.0 - (masked_errors / real_steps)

    # Cost is left unmasked, since we *do* want a cost incentive to get the
    # padding transitions right.
    return T.mean(costs), acc
Exemple #12
0
def apply_except_axis(x, axis, func):
    """
    Apply a contraction function over every axis except one.

    ``axis`` is swapped to the front, the remaining axes are flattened into
    a single second axis, and ``func`` is applied along that second axis.

    Parameters
    ----------
    x: T.Tensor
        Input tensor.
    axis: int
        Axis to exclude from the contraction.
    func: function
        A function with signature ``func(x, axis=)`` eg T.mean, T.std ...

    Returns
    -------
    T.Tensor
        Whatever ``func`` returns for the 2-D view with ``axis=1``; for a
        reduction such as T.mean that is one value per index along ``axis``.
    """
    axis_first = T.swapaxes(x, 0, axis)   # excluded axis goes to the front
    as_matrix = T.flatten(axis_first, 2)  # everything else becomes axis 1
    return func(as_matrix, axis=1)
Exemple #13
0
def build_transition_cost(logits, targets, num_transitions):
    """
    Build a parse action prediction cost function.

    Returns a ``(cost, acc)`` pair of symbolic scalars. ``cost`` averages the
    per-step cross-entropy over all steps, padding included. ``acc`` is one
    minus the error count on real steps divided by the total number of real
    transitions, where an example's real steps are its last
    ``num_transitions`` steps.
    """
    # Timestep axis first, so that scan walks over timesteps.
    time_major_logits = T.swapaxes(logits, 0, 1)
    time_major_targets = targets.T

    def per_step(logits_t, targets_t, num_transitions):
        # TODO(jongauthier): Taper down xent cost as we proceed through
        # sequence?
        distribution = T.nnet.softmax(logits_t)
        step_cost = T.nnet.categorical_crossentropy(distribution, targets_t)
        prediction = T.argmax(logits_t, axis=1)
        step_error = T.neq(prediction, targets_t)
        return step_cost, step_error

    scan_outputs, _ = theano.scan(per_step,
                                  [time_major_logits, time_major_targets],
                                  non_sequences=[num_transitions])
    costs = scan_outputs[0]
    errors = scan_outputs[1]

    # Build a (steps, examples) mask that is true only on real-data steps.
    n_steps = T.shape(costs)[0]
    pad_lengths = T.reshape(n_steps - num_transitions, (1, -1))
    one_based_steps = T.reshape(T.arange(n_steps) + 1, (-1, 1))
    is_real = T.gt(one_based_steps, pad_lengths)

    # Accuracy is computed on the masked errors only.
    error_total = T.sum(errors * is_real, dtype=theano.config.floatX)
    transition_total = T.sum(num_transitions, dtype=theano.config.floatX)
    acc = 1.0 - (error_total / transition_total)

    # The cost stays unmasked, since we *do* want a cost incentive to get the
    # padding transitions right.
    cost = T.mean(costs)
    return cost, acc
Exemple #14
0
def test_th_matmul():
    """
    Check the symbolic ``matmul`` against ``np.matmul`` on random operands.

    For each of 28 rounds, two float64 tensor variables of random rank
    (2 to 5) are created with randomly broadcastable leading axes and full
    trailing matrix axes. The first operand of each pair has its last two
    axes swapped, both symbolically and in the numeric test data. All
    functions are compiled first and compared afterwards.
    """
    def swap_even(items, swap):
        # Swap the last two axes of the items at even positions.
        return tuple(swap(item, -2, -1) if pos % 2 == 0 else item
                     for pos, item in enumerate(items))

    cases = []
    for _ in range(2, 30):
        dims = int(np.random.random() * 4 + 2)

        # Two tensors with potentially different broadcastability.
        symbolic = []
        for _operand in range(2):
            pattern = tuple((p < .3) for p in np.random.ranf(dims - 2))
            # Make full matrices
            pattern = pattern + (False, False)
            symbolic.append(tt.TensorVariable(tt.TensorType('float64', pattern)))
        vs = swap_even(symbolic, tt.swapaxes)

        f = th.function(list(vs), [matmul(*vs)])

        # Default shape for the test ndarrays.
        defshape = tuple(int(np.random.random() * 5 + 1) for _ in range(dims))
        # One test array per variable, with size 1 on its broadcastable axes.
        arrays = tuple(
            np.random.ranf(tuple(1 if v.broadcastable[j] else s
                                 for j, s in enumerate(defshape)))
            for v in vs)
        nds = swap_even(arrays, np.swapaxes)

        cases.append((f, nds))

    for f, nds in cases:
        assert np.allclose(f(*nds), np.matmul(*nds))
Exemple #15
0
def train_conv_net(
    datasets,
    word_vecs,
    windows=[3, 2],
    pool_sizes=[2],
    dim=300,
    feature_maps=[100, 100],
    dropout_rate=[0.5],
    hidden_units=[],
    shuffle_batch=True,
    n_epochs=25,
    batch_size=50,
    lr_decay=0.95,
    conv_non_linear="relu",
    activations=['relu'],
    sqr_norm_lim=9,
):
    """
    Train a two-tower convolutional sentence-pair classifier.

    Each sample is a pair of word-index sequences. Each sequence is embedded
    through ``word_vecs`` and passed through its own stack of conv/pool
    layers; the two tower outputs are concatenated and fed to a dropout MLP
    with two output units. Parameters are updated with
    ``sgd_updates_adadelta``.

    Parameters
    ----------
    datasets : sequence
        ``datasets[0]`` holds the training pairs (an array; element ``[n][0]``
        is the first sequence, ``[n][1]`` the second), ``datasets[1]`` the
        test pairs, ``datasets[2]`` the training labels and ``datasets[3]``
        the test labels.
    word_vecs : array
        Embedding table, indexed by word id.
    windows, feature_maps : list of int
        Filter height and number of feature maps per conv layer; must have
        equal length.
    pool_sizes : list of int
        Pooling height for every conv layer except the last, which pools
        over its whole output.
    dim : int
        Embedding width.
    dropout_rate : list
        Passed to ``MLPDropout`` as ``dropout_rates``.
    hidden_units : list of int
        Sizes of the MLP hidden layers, in order.
    shuffle_batch : bool
        Visit the minibatches in a random order each epoch.
    n_epochs, batch_size : int
        Number of epochs and minibatch size.
    lr_decay, sqr_norm_lim
        Passed through to ``sgd_updates_adadelta``.
    conv_non_linear : str
        Non-linearity for every conv layer except the last.
    activations
        Currently unused; see the note at the ``MLPDropout`` call.

    Returns
    -------
    The best test accuracy seen over all epochs.

    None of the list defaults is modified inside this function.
    """
    assert (len(windows) == len(feature_maps))
    assert (len(windows) == len(pool_sizes) + 1)
    print('\nbuilding model...')
    index = T.lscalar()
    x1 = T.matrix('x1')
    x2 = T.matrix('x2')
    y = T.ivector('y')
    Words = theano.shared(value=word_vecs, name="Words")
    rng = np.random.RandomState(9999)

    ### define model architecture ###
    img_h = len(datasets[0][0][0])
    filter_shapes = [(feature_maps[0], 1, windows[0], dim)]
    for i in range(1, len(windows)):
        filter_shapes.append(
            (feature_maps[i], 1, windows[i], feature_maps[i - 1]))
    next_layer_input_1 = Words[T.cast(x1.flatten(), dtype="int32")].reshape(
        (x1.shape[0], 1, x1.shape[1], dim))
    # Reshape the second input with its own shape (was x1's shape).
    next_layer_input_2 = Words[T.cast(x2.flatten(), dtype="int32")].reshape(
        (x2.shape[0], 1, x2.shape[1], dim))
    conv_layers_1 = []
    conv_layers_2 = []
    for i in range(len(windows) - 1):
        filter_shape = filter_shapes[i]
        pool_size = (pool_sizes[i], 1)
        image_shape = (batch_size, 1, img_h, filter_shape[3])
        conv_layer_1 = LeNetConvPoolLayer(rng,
                                          input=next_layer_input_1,
                                          image_shape=image_shape,
                                          filter_shape=filter_shape,
                                          poolsize=pool_size,
                                          non_linear=conv_non_linear)
        conv_layer_2 = LeNetConvPoolLayer(rng,
                                          input=next_layer_input_2,
                                          image_shape=image_shape,
                                          filter_shape=filter_shape,
                                          poolsize=pool_size,
                                          non_linear=conv_non_linear)
        # Height after a valid convolution and pooling. Floor division keeps
        # it an integer on both Python 2 and 3.
        img_h -= windows[i] - 1
        img_h //= pool_sizes[i]
        # Feature maps of this layer become the "width" of the next one.
        next_layer_input_1 = T.swapaxes(conv_layer_1.output, 1, 3)
        next_layer_input_2 = T.swapaxes(conv_layer_2.output, 1, 3)
        conv_layers_1.append(conv_layer_1)
        conv_layers_2.append(conv_layer_2)
    ###the last convPoolLayer needs different configurations###
    filter_shape = filter_shapes[-1]
    pool_size = (img_h - windows[-1] + 1, 1)
    image_shape = (batch_size, 1, img_h, filter_shape[3])
    conv_layer_1 = LeNetConvPoolLayer(rng,
                                      input=next_layer_input_1,
                                      image_shape=image_shape,
                                      filter_shape=filter_shape,
                                      poolsize=pool_size)
    conv_layer_2 = LeNetConvPoolLayer(rng,
                                      input=next_layer_input_2,
                                      image_shape=image_shape,
                                      filter_shape=filter_shape,
                                      poolsize=pool_size)
    output_1 = conv_layer_1.output.flatten(2)
    output_2 = conv_layer_2.output.flatten(2)
    conv_layers_1.append(conv_layer_1)
    conv_layers_2.append(conv_layer_2)
    next_layer_input = T.concatenate([output_1, output_2], 1)
    ###MLP with dropout###
    layer_sizes = [feature_maps[-1] * 2]
    # Append the hidden sizes themselves (the old loop indexed the list by
    # its own values and failed for any non-empty hidden_units).
    layer_sizes.extend(hidden_units)
    layer_sizes.append(2)
    # NOTE(review): the `activations` argument is not forwarded; the literal
    # 'relu' is kept because MLPDropout's expected type is not visible here.
    classifier = MLPDropout(rng,
                            input=next_layer_input,
                            layer_sizes=layer_sizes,
                            activations='relu',
                            dropout_rates=dropout_rate)
    ###updates the params with adadelta###
    # Copy, so that extending the list does not also grow classifier.params.
    params = list(classifier.params)
    for conv_layer in conv_layers_1:
        params += conv_layer.params
    for conv_layer in conv_layers_2:
        params += conv_layer.params
    dropout_cost = classifier.dropout_negative_log_likelihood(y)
    grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6,
                                        sqr_norm_lim)

    ###creat minibatches for training set###
    np.random.seed(9999)
    # Always pair each sample with its label; the column slicing below relies
    # on that layout even when no padding is needed.
    train_set = np.random.permutation(list(zip(datasets[0], datasets[2])))
    remainder = datasets[0].shape[0] % batch_size
    if remainder > 0:
        # Pad with repeated samples so the set divides evenly into batches.
        extra_data_num = batch_size - remainder
        extra_data = train_set[:extra_data_num]
        train_set = np.append(train_set, extra_data, axis=0)
    train_set = np.random.permutation(train_set)
    n_batches = train_set.shape[0] // batch_size
    train_labels = train_set[:, 1]
    train_set = np.array(train_set[:, 0])
    train_set_x1 = [x[0] for x in train_set]
    train_set_x2 = [x[1] for x in train_set]
    test_set_x1 = [x[0] for x in datasets[1]]
    test_set_x2 = [x[1] for x in datasets[1]]
    test_labels = np.asarray(datasets[3], "int32")
    train_set_x1, train_set_x2, train_labels = shared_dataset(
        (train_set_x1, train_set_x2, train_labels))
    ###theano functions for training and testing###
    train_model = theano.function(
        [index],
        classifier.errors(y),
        updates=grad_updates,
        givens={
            x1: train_set_x1[index * batch_size:(index + 1) * batch_size],
            x2: train_set_x2[index * batch_size:(index + 1) * batch_size],
            y: train_labels[index * batch_size:(index + 1) * batch_size]
        },
        allow_input_downcast=True)
    test_layer_input_1 = Words[T.cast(x1.flatten(), dtype="int32")].reshape(
        (x1.shape[0], 1, x1.shape[1], dim))
    test_layer_input_2 = Words[T.cast(x2.flatten(), dtype="int32")].reshape(
        (x2.shape[0], 1, x2.shape[1], dim))
    n_test = len(test_labels)
    # Each input goes through the tower it was trained with (the second
    # input used to be run through the first tower's layers).
    for layer_1, layer_2 in zip(conv_layers_1[:-1], conv_layers_2[:-1]):
        output_1 = layer_1.predict(test_layer_input_1, n_test)
        output_2 = layer_2.predict(test_layer_input_2, n_test)
        test_layer_input_1 = T.swapaxes(output_1, 1, 3)
        test_layer_input_2 = T.swapaxes(output_2, 1, 3)
    output_1 = conv_layers_1[-1].predict(test_layer_input_1, n_test)
    output_2 = conv_layers_2[-1].predict(test_layer_input_2, n_test)
    next_layer_input = T.concatenate(
        [output_1.flatten(2), output_2.flatten(2)], 1)
    test_y_pred = classifier.predict(next_layer_input)
    test_error = T.mean(T.neq(test_y_pred, y))
    test_model = theano.function([x1, x2, y],
                                 test_error,
                                 allow_input_downcast=True)
    ###training###
    print('training...')
    epoch = 0
    test_accs = []
    # NOTE(review): never reset, so the reported train perf is an average
    # over every minibatch seen so far, not just the current epoch.
    train_losses = []
    while (epoch < n_epochs):
        start_time = time.time()
        epoch = epoch + 1
        if shuffle_batch:
            for minibatch_index in np.random.permutation(range(n_batches)):
                train_losses.append(train_model(minibatch_index))
        else:
            for minibatch_index in range(n_batches):
                train_losses.append(train_model(minibatch_index))
        test_error = test_model(test_set_x1, test_set_x2, test_labels)
        train_perf = 1 - np.mean(train_losses)
        test_perf = 1 - test_error
        test_accs.append(test_perf)
        print(
            'epoch: %i, training time: %.2f secs, train perf: %.2f %% , test perf: %.2f %%'
            % (epoch, time.time() - start_time, train_perf * 100.,
               test_perf * 100.))
    return max(test_accs)
Exemple #16
0
def train_conv_net(datasets,
                   word_vecs,
                   windows=[3,2],
                   pool_sizes=[2],
                   dim=300,
                   feature_maps=[100, 100],
                   dropout_rate=[0.5],
                   hidden_units=[],
                   shuffle_batch=True,
                   n_epochs=25,
                   batch_size=50,
                   lr_decay=0.95,
                   conv_non_linear="relu",
                   activations=['relu'],
                   sqr_norm_lim=9, ):
    """
    Train a two-tower convolutional sentence-pair classifier.

    Each sample is a pair of word-index sequences. Each sequence is embedded
    through ``word_vecs`` and passed through its own stack of conv/pool
    layers; the two tower outputs are concatenated and fed to a dropout MLP
    with two output units. Parameters are updated with
    ``sgd_updates_adadelta``.

    Parameters
    ----------
    datasets : sequence
        ``datasets[0]`` holds the training pairs (an array; element ``[n][0]``
        is the first sequence, ``[n][1]`` the second), ``datasets[1]`` the
        test pairs, ``datasets[2]`` the training labels and ``datasets[3]``
        the test labels.
    word_vecs : array
        Embedding table, indexed by word id.
    windows, feature_maps : list of int
        Filter height and number of feature maps per conv layer; must have
        equal length.
    pool_sizes : list of int
        Pooling height for every conv layer except the last, which pools
        over its whole output.
    dim : int
        Embedding width.
    dropout_rate : list
        Passed to ``MLPDropout`` as ``dropout_rates``.
    hidden_units : list of int
        Sizes of the MLP hidden layers, in order.
    shuffle_batch : bool
        Visit the minibatches in a random order each epoch.
    n_epochs, batch_size : int
        Number of epochs and minibatch size.
    lr_decay, sqr_norm_lim
        Passed through to ``sgd_updates_adadelta``.
    conv_non_linear : str
        Non-linearity for every conv layer except the last.
    activations
        Currently unused; see the note at the ``MLPDropout`` call.

    Returns
    -------
    The best test accuracy seen over all epochs.

    None of the list defaults is modified inside this function.
    """
    assert(len(windows)==len(feature_maps))
    assert (len(windows)==len(pool_sizes)+1)
    print('\nbuilding model...')
    index = T.lscalar()
    x1 = T.matrix('x1')
    x2 = T.matrix('x2')
    y = T.ivector('y')
    Words = theano.shared(value=word_vecs, name="Words")
    rng = np.random.RandomState(9999)

    ### define model architecture ###
    img_h = len(datasets[0][0][0])
    filter_shapes = [(feature_maps[0], 1, windows[0], dim)]
    for i in range(1, len(windows)):
        filter_shapes.append((feature_maps[i], 1, windows[i], feature_maps[i - 1]))
    next_layer_input_1 = Words[T.cast(x1.flatten(), dtype="int32")].reshape((x1.shape[0], 1, x1.shape[1], dim))
    # Reshape the second input with its own shape (was x1's shape).
    next_layer_input_2 = Words[T.cast(x2.flatten(), dtype="int32")].reshape((x2.shape[0], 1, x2.shape[1], dim))
    conv_layers_1 = []
    conv_layers_2 = []
    for i in range(len(windows) - 1):
        filter_shape = filter_shapes[i]
        pool_size = (pool_sizes[i], 1)
        conv_layer_1 = LeNetConvPoolLayer(rng, input=next_layer_input_1,
                                          image_shape=(batch_size, 1, img_h, filter_shape[3]),
                                          filter_shape=filter_shape, poolsize=pool_size, non_linear=conv_non_linear)
        conv_layer_2 = LeNetConvPoolLayer(rng, input=next_layer_input_2,
                                          image_shape=(batch_size, 1, img_h, filter_shape[3]),
                                          filter_shape=filter_shape, poolsize=pool_size, non_linear=conv_non_linear)
        # Height after a valid convolution and pooling. Floor division keeps
        # it an integer on both Python 2 and 3.
        img_h -= windows[i] - 1
        img_h //= pool_sizes[i]
        # Feature maps of this layer become the "width" of the next one.
        next_layer_input_1 = T.swapaxes(conv_layer_1.output,1,3)
        next_layer_input_2 = T.swapaxes(conv_layer_2.output,1,3)
        conv_layers_1.append(conv_layer_1)
        conv_layers_2.append(conv_layer_2)
    ###the last convPoolLayer needs different configurations###
    filter_shape = filter_shapes[-1]
    pool_size = (img_h-windows[-1]+1, 1)
    conv_layer_1 = LeNetConvPoolLayer(rng, input=next_layer_input_1,
                                      image_shape=(batch_size, 1, img_h, filter_shape[3]),
                                      filter_shape=filter_shape, poolsize=pool_size)
    conv_layer_2 = LeNetConvPoolLayer(rng, input=next_layer_input_2,
                                      image_shape=(batch_size, 1, img_h, filter_shape[3]),
                                      filter_shape=filter_shape, poolsize=pool_size)
    output_1 = conv_layer_1.output.flatten(2)
    output_2 = conv_layer_2.output.flatten(2)
    conv_layers_1.append(conv_layer_1)
    conv_layers_2.append(conv_layer_2)
    next_layer_input = T.concatenate([output_1, output_2], 1)
    ###MLP with dropout###
    layer_sizes = [feature_maps[-1] * 2]
    # Append the hidden sizes themselves (the old loop indexed the list by
    # its own values and failed for any non-empty hidden_units).
    layer_sizes.extend(hidden_units)
    layer_sizes.append(2)
    # NOTE(review): the `activations` argument is not forwarded; the literal
    # 'relu' is kept because MLPDropout's expected type is not visible here.
    classifier = MLPDropout(rng, input=next_layer_input, layer_sizes=layer_sizes, activations='relu',
                            dropout_rates=dropout_rate)
    ###updates the params with adadelta###
    # Copy, so that extending the list does not also grow classifier.params.
    params = list(classifier.params)
    for conv_layer in conv_layers_1:
        params += conv_layer.params
    for conv_layer in conv_layers_2:
        params += conv_layer.params
    dropout_cost = classifier.dropout_negative_log_likelihood(y)
    grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6, sqr_norm_lim)

    ###creat minibatches for training set###
    np.random.seed(9999)
    # Always pair each sample with its label; the column slicing below relies
    # on that layout even when no padding is needed.
    train_set = np.random.permutation(list(zip(datasets[0], datasets[2])))
    remainder = datasets[0].shape[0] % batch_size
    if remainder > 0:
        # Pad with repeated samples so the set divides evenly into batches.
        extra_data_num = batch_size - remainder
        extra_data = train_set[:extra_data_num]
        train_set = np.append(train_set, extra_data, axis=0)
    train_set = np.random.permutation(train_set)
    n_batches = train_set.shape[0] // batch_size
    train_labels = train_set[:,1]
    train_set = np.array(train_set[:,0])
    train_set_x1 = [x[0] for x in train_set]
    train_set_x2 = [x[1] for x in train_set]
    test_set_x1 = [x[0] for x in datasets[1]]
    test_set_x2 = [x[1] for x in datasets[1]]
    test_labels = np.asarray(datasets[3], "int32")
    train_set_x1, train_set_x2, train_labels = shared_dataset((train_set_x1, train_set_x2, train_labels))
    ###theano functions for training and testing###
    train_model = theano.function([index], classifier.errors(y), updates=grad_updates,
                                  givens={
                                      x1: train_set_x1[index * batch_size:(index + 1) * batch_size],
                                      x2: train_set_x2[index * batch_size:(index + 1) * batch_size],
                                      y: train_labels[index * batch_size:(index + 1) * batch_size]},
                                  allow_input_downcast=True)
    test_layer_input_1 = Words[T.cast(x1.flatten(),dtype="int32")].reshape((x1.shape[0],1,x1.shape[1],dim))
    test_layer_input_2 = Words[T.cast(x2.flatten(),dtype="int32")].reshape((x2.shape[0],1,x2.shape[1],dim))
    n_test = len(test_labels)
    # Each input goes through the tower it was trained with (the second
    # input used to be run through the first tower's layers).
    for layer_1, layer_2 in zip(conv_layers_1[:-1], conv_layers_2[:-1]):
        output_1 = layer_1.predict(test_layer_input_1, n_test)
        output_2 = layer_2.predict(test_layer_input_2, n_test)
        test_layer_input_1 = T.swapaxes(output_1,1,3)
        test_layer_input_2 = T.swapaxes(output_2,1,3)
    output_1 = conv_layers_1[-1].predict(test_layer_input_1, n_test)
    output_2 = conv_layers_2[-1].predict(test_layer_input_2, n_test)
    next_layer_input = T.concatenate([output_1.flatten(2), output_2.flatten(2)], 1)
    test_y_pred = classifier.predict(next_layer_input)
    test_error = T.mean(T.neq(test_y_pred, y))
    test_model = theano.function([x1,x2,y], test_error, allow_input_downcast = True)
    ###training###
    print('training...')
    epoch = 0
    test_accs = []
    # NOTE(review): never reset, so the reported train perf is an average
    # over every minibatch seen so far, not just the current epoch.
    train_losses = []
    while (epoch < n_epochs):
        start_time = time.time()
        epoch = epoch + 1
        if shuffle_batch:
            for minibatch_index in np.random.permutation(range(n_batches)):
                train_losses.append(train_model(minibatch_index))
        else:
            for minibatch_index in range(n_batches):
                train_losses.append(train_model(minibatch_index))
        test_error = test_model(test_set_x1,test_set_x2,test_labels)
        train_perf = 1 - np.mean(train_losses)
        test_perf = 1 - test_error
        test_accs.append(test_perf)
        print('epoch: %i, training time: %.2f secs, train perf: %.2f %% , test perf: %.2f %%' % (
            epoch, time.time() - start_time, train_perf * 100., test_perf*100.))
    return max(test_accs)
def main(model='mlp', num_epochs=500):
    """Fine-tune a saved CNN with a max-over-rotations hinge loss, then save it.

    Every mini-batch is expanded into ``nRotation`` rotated copies per image
    by ``rotateImage_batch``.  The network output is reshaped to
    ``(nRotation, n, num_classes)`` and reduced with a max over the rotation
    axis before ``multiclass_hinge_loss`` is applied.  Every 10th epoch the
    loss and accuracy are reported, first on the test set and then on the
    training set.  The weights after the last epoch are written with
    ``np.save``.

    :type model: str
    :param model: not used by this function; kept for call compatibility

    :type num_epochs: int
    :param num_epochs: number of passes over the training data
    """
    print("Loading data...")
    print("Using all the training data")

    ## Load Data##
    X_train, y_train, X_test, y_test = load_data("/X_train.npy", "/Y_train.npy", "/X_test.npy", "/Y_test.npy")

    image_size = 227
    X_train = extend_images(X_train, image_size)
    X_test = extend_images(X_test, image_size)

    batch_size = 50
    # Number of rotated copies generated per image (exhaustive search).
    nRotation = 16
    num_classes = 61

    # The network input is fed as (nRotation * n, 1, image_size, image_size).
    input_var = T.tensor4('inputs')
    # One label per original (un-rotated) image.
    vanilla_target_var = T.ivector('vanilla_targets')

    network, weight_decay = build_cnn(input_var, batch_size)

    # Start from previously trained weights.
    saved_weights = np.load("../data/curet_test_hinge_epoch_400_2pool.npy")
    lasagne.layers.set_all_param_values(network, saved_weights)

    one_hot_targets = T.extra_ops.to_one_hot(vanilla_target_var, num_classes)
    target_index = one_hot_targets.nonzero()

    # ---- training expression ------------------------------------------------
    predictions = lasagne.layers.get_output(network)
    # assumes rotateImage_batch lays its output out rotation-major, so that
    # this reshape separates the rotation axis correctly -- TODO confirm
    rests = T.reshape(predictions, (nRotation, -1, num_classes))

    # Best score per (image, class) over all rotations.
    final_rests = T.max(rests, 0)

    # (nRotation, n, classes) -> (n, classes, nRotation); pick the target
    # class row of every image and take its best rotation.
    rests = T.swapaxes(rests, 0, 2)
    rests = T.swapaxes(rests, 0, 1)
    rests = rests[target_index]
    rests = T.max(rests, axis=1)

    final_rests = T.set_subtensor(final_rests[target_index], rests)

    # The third positional argument (5) is the hinge margin used for training;
    # the evaluation loss below relies on the library default instead.
    loss = lasagne.objectives.multiclass_hinge_loss(final_rests, vanilla_target_var, 5)
    loss = loss.mean() + weight_decay

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adagrad(loss, params, learning_rate=0.001)

    # ---- evaluation expression ----------------------------------------------
    # deterministic=True requests the deterministic forward pass (this is what
    # disables dropout layers).
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_prediction = T.reshape(test_prediction, (nRotation, -1, num_classes))
    test_prediction_res = test_prediction.max(axis=0)

    # For the loss, non-target classes use the scores of the first rotation
    # slice while the target class uses its best rotation.
    final_test_prediction_res = test_prediction[0]
    test_prediction_process = T.swapaxes(test_prediction, 0, 2)
    test_prediction_process = T.swapaxes(test_prediction_process, 0, 1)
    test_prediction_process = test_prediction_process[target_index]
    test_prediction_process = T.max(test_prediction_process, axis=1)

    final_test_prediction_res = T.set_subtensor(final_test_prediction_res[target_index], test_prediction_process)

    test_loss = lasagne.objectives.multiclass_hinge_loss(final_test_prediction_res, vanilla_target_var)
    test_loss = test_loss.mean() + weight_decay
    # Accuracy is measured on the max-over-rotations scores.
    test_acc = T.mean(T.eq(T.argmax(test_prediction_res, axis=1), vanilla_target_var),
                      dtype=theano.config.floatX)

    # Training step: returns the loss and applies the parameter updates.
    train_fn = theano.function([input_var, vanilla_target_var], loss, updates=updates)

    # Evaluation step: returns [loss, accuracy] without touching parameters.
    val_fn = theano.function([input_var, vanilla_target_var], [test_loss, test_acc])

    def rotated_batches(X, y, shuffle):
        # Yield (rotated inputs, targets); targets stay one per original image.
        for inputs, targets in iterate_minibatches(X, y, batch_size, shuffle=shuffle):
            inputs = inputs.reshape(batch_size, image_size, image_size)
            inputs = rotateImage_batch(inputs, nRotation).reshape(
                batch_size * nRotation, 1, image_size, image_size)
            yield inputs, targets

    def evaluate(X, y):
        # Return (mean loss, mean accuracy) of val_fn over all batches.
        total_err = 0
        total_acc = 0
        n_batches = 0
        for inputs, targets in rotated_batches(X, y, shuffle=False):
            err, acc = val_fn(inputs, targets)
            total_err += err
            total_acc += acc
            n_batches += 1
        return total_err / n_batches, total_acc / n_batches

    print("Starting training...")
    for epoch in range(num_epochs):
        # Full pass over the training data.
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for inputs, targets in rotated_batches(X_train, y_train, shuffle=True):
            train_err += train_fn(inputs, targets)
            train_batches += 1

        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))

        if epoch % 10 == 0:
            # Periodic report on the held-out test set ...
            mean_err, mean_acc = evaluate(X_test, y_test)
            print("Final results:")
            print("  test loss:\t\t\t{:.6f}".format(mean_err))
            print("  test accuracy:\t\t{:.2f} %".format(
                mean_acc * 100))

            # ... and on the training set.
            # NOTE(review): the labels below still say "test" although these
            # numbers come from X_train; output text kept unchanged on purpose.
            mean_err, mean_acc = evaluate(X_train, y_train)
            print("Final results:")
            print("  test loss:\t\t\t{:.6f}".format(mean_err))
            print("  test accuracy:\t\t{:.2f} %".format(
                mean_acc * 100))

    weightsOfParams = lasagne.layers.get_all_param_values(network)
    np.save("../data/curet_justRotation_try5.npy", weightsOfParams)
Exemple #18
0
def swapaxes(x, axis1, axis2):
    """Return ``x`` with ``axis1`` and ``axis2`` interchanged.

    Thin pass-through to ``T.swapaxes``; the arguments are forwarded unchanged.
    """
    swapped = T.swapaxes(x, axis1, axis2)
    return swapped
Exemple #19
0
    def _create_time_step(self, mask, x_preact, C_in, h_in, h_weights, mem_weights, mem_bias, v_weights, v_bias, q_weights):
        """The LSTM step function for theano.scan(). Creates the structure of
        one time step.

        The inputs do not contain the time step dimension. ``mask`` is a vector
        containing a boolean mask for each sequence. ``x_preact`` is a matrix
        containing the preactivations for each sequence. ``C_in`` and ``h_in``,
        as well as the outputs, are matrices containing the state vectors for
        each sequence.

        Instead of a single cell state, ``C_in`` holds ``self.memory_size``
        slots of size ``self.output_size``. The step attends over all slots
        except the last one, mixes the attended vector with the candidate
        through the forget and input gates, and then shifts the memory by one
        slot, appending the new output.

        :type mask: Variable
        :param mask: a symbolic vector that masks out sequences that are past
                     the last word

        :type x_preact: Variable
        :param x_preact: concatenation of the input x_(t) pre-activations
                         computed using the gate and candidate state weights and
                         biases; shape is (the number of sequences, state size *
                         4)

        :type C_in: Variable
        :param C_in: C_(t-1...t-n), memory (cell output) of the previous time steps; shape
                     is (the number of sequences, state size* memory size)

        :type h_in: Variable
        :param h_in: h_(t-1), hidden state output of the previous time step;
                     shape is (the number of sequences, state size)

        :type h_weights: Variable
        :param h_weights: concatenation of the gate and candidate state weights
                          to be applied to h_(t-1); shape is (state size, state
                          size * 4)

        :type mem_weights: Variable
        :param mem_weights: weights applied to every attended memory slot
                            (presumably shape (state size, state size) --
                            confirm against the caller)

        :type mem_bias: Variable
        :param mem_bias: bias added after ``mem_weights``

        :type v_weights: Variable
        :param v_weights: weights that turn the tanh-activated sum of the
                          memory and query projections into one raw attention
                          score per memory slot

        :type v_bias: Variable
        :param v_bias: bias added after ``v_weights``

        :type q_weights: Variable
        :param q_weights: weights applied to h_(t-1) to form the attention
                          query; the result is reshaped to (the number of
                          sequences, 1, state size)

        :rtype: a tuple of two Variables
        :returns: the updated memory, with the oldest slot dropped and the new
                  output appended, and h_(t), the hidden state output
        """

        # pre-activation of the gates and candidate state
        preact = tensor.dot(h_in, h_weights)
        preact += x_preact
        num_sequences = x_preact.shape[0]
        # input, forget, and output gates
        i = tensor.nnet.sigmoid(get_submatrix(preact, 0, self.output_size))
        f = tensor.nnet.sigmoid(get_submatrix(preact, 1, self.output_size))
        o = tensor.nnet.sigmoid(get_submatrix(preact, 2, self.output_size))

        # hidden state outputs candidate
        h_candidate = tensor.tanh(get_submatrix(preact, 3, self.output_size))

        # calculate the attention weights
        # Unfold the flat memory into (sequence, slot, state).
        mem = C_in.reshape([num_sequences, self.memory_size, self.output_size])
        attended_slots = mem[:, :-1, :]
        hidden = tensor.dot(attended_slots, mem_weights) + mem_bias
        hidden_q = (tensor.dot(h_in, q_weights)).reshape([num_sequences, 1, self.output_size])
        # use V to calculate the attention scores for all attended slots
        raw_attention = tensor.dot(tensor.tanh(hidden + hidden_q), v_weights) + v_bias

        # ``raw_attention`` is already sequence-major because ``mem`` is, so it
        # is reshaped directly. Swapping axes 0 and 1 first (as the time-major
        # attention layer does) and then reshaping back to
        # (num_sequences, memory_size - 1) would interleave scores belonging
        # to different sequences.
        raw_attention = raw_attention.reshape([num_sequences, self.memory_size - 1])

        # with softmax we get the attention weights over the memory slots
        # shape is (num_sequences, memory_size - 1)
        attentions = tensor.nnet.softmax(raw_attention)
        # apply attention to the memory: (n, 1, slots) x (n, slots, state)
        long_memory = tensor.batched_dot(attentions.reshape([attentions.shape[0], 1, attentions.shape[1]]), attended_slots)  # TODO test
        long_memory = long_memory.reshape([long_memory.shape[0], long_memory.shape[2]])
        h_out = o * self._activation(f * long_memory + i * h_candidate)
        # Shift the memory by one slot and append the new output.
        logging.debug("C ndim: %s, h_out ndim: %s", C_in.ndim, h_out.ndim)
        mem = tensor.concatenate([C_in[:, self.output_size:], h_out], axis=1)  # TODO check dimensions!

        # Apply the mask. None creates a new axis with size 1, causing the mask
        # to be broadcast to all the outputs. Only h_out is masked; the memory
        # above was built from the unmasked h_out.
        h_out = tensor.switch(mask[:, None], h_out, h_in)

        return mem, h_out
def set_updater_function(encoder_feature_function,
                         encoder_mean_function,
                         encoder_var_function,
                         decoder_feature_function,
                         decoder_red_function,
                         decoder_green_function,
                         decoder_blue_function,
                         encoder_params,
                         decoder_params,
                         optimizer):
    """Build and compile the Theano training step of the model.

    The cost is the mean VAE cost (per-pixel categorical reconstruction cost
    for the red, green and blue channels plus the KL term) plus a
    moment-matching cost, weighted by ``moment_cost_weight``, between decoder
    activations obtained from encoded data and from ``negative_hidden_data``.

    The compiled function takes, in order, ``positive_visible_data`` (4-D),
    ``positive_hidden_data`` (matrix), ``negative_hidden_data`` (matrix) and
    ``moment_cost_weight`` (scalar). It returns the per-sample VAE cost, the
    moment-matching cost and the total cost, and applies the updates produced
    by ``optimizer(encoder_params + decoder_params, cost)``.
    """
    float_x = theano.config.floatX

    # symbolic inputs
    positive_visible_data = T.tensor4(name='positive_visible_data', dtype=float_x)
    positive_hidden_data = T.matrix(name='positive_hidden_data', dtype=float_x)
    negative_hidden_data = T.matrix(name='negative_hidden_data', dtype=float_x)
    moment_cost_weight = T.scalar(name='moment_cost_weight', dtype=float_x)

    # sizes read off the visible batch
    num_samples = positive_visible_data.shape[0]
    num_pixels = positive_visible_data.shape[2]*positive_visible_data.shape[3]

    ##################
    # positive phase #
    ##################
    # encoder: features -> mean / log-variance -> sample
    encoder_outputs = encoder_feature_function(positive_visible_data/127.5 - 1.)
    encoder_feature = encoder_outputs[1]
    encoder_mean = encoder_mean_function(encoder_feature)
    encoder_log_var = encoder_var_function(encoder_feature)
    encoder_std = T.sqrt(T.exp(encoder_log_var))
    encoder_sample = encoder_mean + encoder_std*positive_hidden_data

    # decoder on the encoded sample
    positive_outputs = decoder_feature_function(encoder_sample)
    positive_decoder_hiddens = positive_outputs[0]
    positive_decoder_feature = positive_outputs[1]

    def channel_recon_cost(channel_decoder, channel_index):
        # Per-sample categorical cross-entropy of one colour channel.
        # shape = (num_samples, num_intensity, num_rows, num_cols)
        scores = channel_decoder(positive_decoder_feature)
        # shape = (num_samples, num_intensity, num_pixels)
        scores = T.flatten(scores, 3)
        # shape = (num_samples, num_pixels, num_intensity)
        scores = T.swapaxes(scores, axis1=1, axis2=2)
        # shape = (num_samples*num_pixels, num_intensity)
        scores = scores.reshape((num_samples*num_pixels, -1))
        probabilities = T.nnet.softmax(scores)
        # raw pixel values, cast to int64, serve as the target indices
        target = T.flatten(T.cast(positive_visible_data[:, channel_index, :, :], 'int64'), 1)
        per_pixel = T.nnet.categorical_crossentropy(probabilities, target)
        return per_pixel.reshape((num_samples, -1)).sum(axis=1)

    red_cost = channel_recon_cost(decoder_red_function, 0)
    green_cost = channel_recon_cost(decoder_green_function, 1)
    blue_cost = channel_recon_cost(decoder_blue_function, 2)

    # positive lower bound cost
    positive_recon_cost = red_cost + green_cost + blue_cost
    kl_terms = 1.0+encoder_log_var-T.sqr(encoder_mean)-T.exp(encoder_log_var)
    positive_kl_cost = -0.5*T.sum(kl_terms, axis=1)
    positive_vae_cost = positive_recon_cost + positive_kl_cost

    ##################
    # negative phase #
    ##################
    negative_outputs = decoder_feature_function(negative_hidden_data)
    negative_decoder_hiddens = negative_outputs[0]
    negative_decoder_feature = negative_outputs[1]

    def batch_mean_gap(pos, neg):
        # Mean squared difference between the batch means of pos and neg.
        return T.mean(T.sqr(T.mean(pos, axis=0)-T.mean(neg, axis=0)))

    # moment matching: first and second moments of every hidden layer ...
    moment_match_cost = 0
    for layer_index, pos_feat in enumerate(positive_decoder_hiddens):
        neg_feat = negative_decoder_hiddens[layer_index]
        moment_match_cost += batch_mean_gap(pos_feat, neg_feat)
        moment_match_cost += batch_mean_gap(T.sqr(pos_feat), T.sqr(neg_feat))
    # ... and of the final decoder feature, flattened to 2-D
    moment_match_cost += batch_mean_gap(T.flatten(positive_decoder_feature, 2),
                                        T.flatten(negative_decoder_feature, 2))
    moment_match_cost += batch_mean_gap(T.sqr(T.flatten(positive_decoder_feature, 2)),
                                        T.sqr(T.flatten(negative_decoder_feature, 2)))

    model_updater_cost = T.mean(positive_vae_cost) + moment_cost_weight*T.mean(moment_match_cost)
    model_updater_dict = optimizer(encoder_params+decoder_params,
                                   model_updater_cost)

    return theano.function(inputs=[positive_visible_data,
                                   positive_hidden_data,
                                   negative_hidden_data,
                                   moment_cost_weight],
                           outputs=[positive_vae_cost,
                                    moment_match_cost,
                                    model_updater_cost],
                           updates=model_updater_dict,
                           on_unused_input='ignore')
def main(model='mlp', num_epochs=2000):
    """Fine-tune a saved CNN on rotated MNIST with a max-over-rotations hinge loss.

    Every mini-batch is expanded into ``nRotation`` rotated copies per image
    by ``rotateImage_batch``.  The network output is reshaped to
    ``(nRotation, n, num_classes)`` and reduced with a max over the rotation
    axis before ``multiclass_hinge_loss`` is applied.  Every 50th epoch the
    loss and accuracy are reported, first on the test set and then on the
    training set.  The weights after the last epoch are written with
    ``np.save``.

    :type model: str
    :param model: not used by this function; kept for call compatibility

    :type num_epochs: int
    :param num_epochs: number of passes over the training data
    """
    print("Loading data...")
    print("Using all the training data")

    X_train, y_train, X_test, y_test = load_data("/mnistROT.npy", "/mnistROTLabel.npy", "/mnistROTTEST.npy", "/mnistROTLABELTEST.npy", "ROT_MNIST")

    image_size = 40
    X_train = extend_image(X_train, image_size)
    X_test = extend_image(X_test, image_size)

    # Number of rotated copies generated per image.
    nRotation = 8
    num_classes = 10
    train_batch_size = 100
    eval_batch_size = 500

    # The network input is fed as (nRotation * n, 1, image_size, image_size).
    input_var = T.tensor4('inputs')
    # One label per original (un-rotated) image; the dimension is (n, ).
    vanilla_target_var = T.ivector('vanilla_targets')

    network, weight_decay = build_cnn(input_var)

    # Start from previously trained weights.
    saved_weights = np.load("../data/mnist_CNN_params_drop_out_Chi_2017_hinge.npy")
    lasagne.layers.set_all_param_values(network, saved_weights)

    one_hot_targets = T.extra_ops.to_one_hot(vanilla_target_var, num_classes)
    target_index = one_hot_targets.nonzero()

    # ---- training expression ------------------------------------------------
    # The dimension would be (nRotation * n, num_classes).
    predictions = lasagne.layers.get_output(network)
    # assumes rotateImage_batch lays its output out rotation-major, so that
    # this reshape separates the rotation axis correctly -- TODO confirm
    rests = T.reshape(predictions, (nRotation, -1, num_classes))

    # Best score per (image, class) over all rotations.
    final_rests = T.max(rests, 0)

    # (nRotation, n, classes) -> (n, classes, nRotation); pick the target
    # class row of every image and take its best rotation.
    rests = T.swapaxes(rests, 0, 2)
    rests = T.swapaxes(rests, 0, 1)
    rests = rests[target_index]
    rests = T.max(rests, axis=1)

    final_rests = T.set_subtensor(final_rests[target_index], rests)

    loss = lasagne.objectives.multiclass_hinge_loss(final_rests, vanilla_target_var)
    loss = loss.mean() + weight_decay

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adagrad(loss, params, learning_rate=0.01)

    # ---- evaluation expression ----------------------------------------------
    # deterministic=True requests the deterministic forward pass (this is what
    # disables dropout layers).
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_prediction = T.reshape(test_prediction, (nRotation, -1, num_classes))
    test_prediction_res = test_prediction.max(axis=0)

    # For the loss, non-target classes use the scores of the first rotation
    # slice while the target class uses its best rotation.
    final_test_prediction_res = test_prediction[0]
    test_prediction_process = T.swapaxes(test_prediction, 0, 2)
    test_prediction_process = T.swapaxes(test_prediction_process, 0, 1)
    test_prediction_process = test_prediction_process[target_index]
    test_prediction_process = T.max(test_prediction_process, axis=1)

    final_test_prediction_res = T.set_subtensor(final_test_prediction_res[target_index], test_prediction_process)

    test_loss = lasagne.objectives.multiclass_hinge_loss(final_test_prediction_res, vanilla_target_var)
    test_loss = test_loss.mean() + weight_decay
    # Accuracy is measured on the max-over-rotations scores.
    test_acc = T.mean(T.eq(T.argmax(test_prediction_res, axis=1), vanilla_target_var),
                      dtype=theano.config.floatX)

    # Training step: returns the loss and applies the parameter updates.
    train_fn = theano.function([input_var, vanilla_target_var], loss, updates=updates)

    # Evaluation step: returns [loss, accuracy] without touching parameters.
    val_fn = theano.function([input_var, vanilla_target_var], [test_loss, test_acc])

    def rotated_batches(X, y, size, shuffle):
        # Yield (rotated inputs, targets); targets stay one per original image.
        for inputs, targets in iterate_minibatches(X, y, size, shuffle=shuffle):
            inputs = inputs.reshape(size, image_size, image_size)
            inputs = rotateImage_batch(inputs, nRotation).reshape(
                size * nRotation, 1, image_size, image_size)
            yield inputs, targets

    def evaluate(X, y):
        # Return (mean loss, mean accuracy) of val_fn over all batches.
        total_err = 0
        total_acc = 0
        n_batches = 0
        for inputs, targets in rotated_batches(X, y, eval_batch_size, shuffle=False):
            err, acc = val_fn(inputs, targets)
            total_err += err
            total_acc += acc
            n_batches += 1
        return total_err / n_batches, total_acc / n_batches

    print("Starting training...")
    for epoch in range(num_epochs):
        # Full pass over the training data.
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for inputs, targets in rotated_batches(X_train, y_train, train_batch_size, shuffle=True):
            train_err += train_fn(inputs, targets)
            train_batches += 1

        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))

        if epoch % 50 == 0:
            # Periodic report on the held-out test set ...
            mean_err, mean_acc = evaluate(X_test, y_test)
            print("Final results:")
            print("  test loss:\t\t\t{:.6f}".format(mean_err))
            print("  test accuracy:\t\t{:.2f} %".format(
                mean_acc * 100))

            # ... and on the training set.
            # NOTE(review): the labels below still say "test" although these
            # numbers come from X_train; output text kept unchanged on purpose.
            mean_err, mean_acc = evaluate(X_train, y_train)
            print("Final results:")
            print("  test loss:\t\t\t{:.6f}".format(mean_err))
            print("  test accuracy:\t\t{:.2f} %".format(
                mean_acc * 100))

    weightsOfParams = lasagne.layers.get_all_param_values(network)
    np.save("../data/mnist_CNN_params_drop_out_Chi_2017_ROT_hinge_2000.npy", weightsOfParams)
    def _create_time_step(self, t, mask, S, x, x_preact, h_weights, h_bias, v_weights, v_bias):
        """The attention step function for theano.scan(). Builds the graph of
        one time step.

        At step ``t`` the previous output ``S`` is projected with
        ``h_weights``/``h_bias`` and added to the input pre-activations of
        steps ``0..t``. ``v_weights``/``v_bias`` turn the tanh of that sum
        into one raw score per step, a softmax over the steps gives the
        attention weights, and the output is the attention-weighted sum of
        the inputs ``x[0..t]``.

        :type t: Variable
        :param t: the current time index

        :type mask: Variable
        :param mask: a symbolic vector that masks out sequences that are past
                     the last word

        :type S: Variable
        :param S: C_(t-1), layer output of the previous time step; shape
                  is (the number of sequences, state size)

        :type x: Variable
        :param x: the input vectors of all time steps, indexed as
                  (time step, sequence, feature)

        :type x_preact: Variable
        :param x_preact: input pre-activations of all time steps, computed
                         using the W1 weights and biases; indexed like ``x``

        :type h_weights: Variable
        :param h_weights: weights to be applied to S_(t-1);
                          shape is (output_size, hidden_size)

        :type h_bias: Variable
        :param h_bias: bias to be applied with h_weights

        :type v_weights: Variable
        :param v_weights: value weights to calculate the raw attention score;
                          shape is (hidden_size, 1)

        :type v_bias: Variable
        :param v_bias: bias to be applied with v_weights

        :rtype: Variable
        :returns: attended context vector for time step t; sequences whose
                  mask is off keep their previous output ``S``
        """
        # NOTE(review): this is a Python-level comparison. If ``t`` is a
        # symbolic Theano variable, as documented, the branch may never be
        # taken -- confirm against the caller.
        if t == 0:
            return x[0,:,:]

        steps_so_far = t + 1

        # Project the previous output into the hidden space.
        prev_projection = tensor.dot(S, h_weights) + h_bias

        # One raw score per visited time step and sequence.
        scores = tensor.dot(tensor.tanh(x_preact[:steps_so_far,:,:] + prev_projection), v_weights) + v_bias

        # Bring the sequence axis to the front and drop the trailing axis so
        # that softmax normalises over the time steps of each sequence.
        batch = x.shape[1]
        scores = tensor.swapaxes(scores, 0, 1)
        scores = scores.reshape([batch, scores.shape[1]])

        # shape is (num_sequences, t + 1)
        weights = tensor.nnet.softmax(scores)

        # Weighted sum of the inputs: (n, 1, steps) x (n, steps, features).
        weights_row = weights.reshape([weights.shape[0], 1, weights.shape[1]])
        history = x[:steps_so_far, :, :].dimshuffle((1,0,2))
        context = tensor.batched_dot(weights_row, history)
        context = context.reshape([context.shape[0], context.shape[2]])

        # Apply the mask. None creates a new axis with size 1, causing the mask
        # to be broadcast to all the outputs.
        return tensor.switch(mask[:, None], context, S)