def add_softmax_layer(self):
    layer = SoftmaxLayer(self.layers[-1])
    self.layers.append(layer)
import numpy as np
import theano
import theano.tensor as T
from theano import Param  # renamed theano.In in later Theano releases
from sklearn.metrics import roc_curve, auc

# FeatureExtractor, SoftmaxLayer and RandomTrainIterator come from the
# surrounding project.


class ConvNet(object):
    def __init__(self, param_dict):
        self.param_dict = param_dict
        self.training_batch_size = param_dict['training_batch_size']
        nkerns = param_dict['nkerns']
        recept_width = param_dict['recept_width']
        pool_width = param_dict['pool_width']
        stride = param_dict['stride']
        dropout_prob = param_dict['dropout_prob']
        weight_decay = param_dict['l2_reg']
        activation = param_dict['activation']
        weights_variance = param_dict['weights_variance']
        n_channels = param_dict['n_channels']
        n_timesteps = param_dict['n_timesteps']
        n_fbins = param_dict['n_fbins']
        global_pooling = param_dict['global_pooling']
        rng = np.random.RandomState(23455)

        self.training_mode = T.iscalar('training_mode')
        self.x = T.tensor4('x')
        self.y = T.bvector('y')
        self.batch_size = theano.shared(self.training_batch_size)
        self.input = self.x.reshape((self.batch_size, 1, n_channels * n_fbins, n_timesteps))

        self.feature_extractor = FeatureExtractor(rng, self.input, nkerns, recept_width,
                                                  pool_width, stride, self.training_mode,
                                                  dropout_prob[0], activation, weights_variance,
                                                  n_channels, n_timesteps, n_fbins, global_pooling)
        self.classifier = SoftmaxLayer(rng=rng, input=self.feature_extractor.output,
                                       n_in=nkerns[-1], training_mode=self.training_mode,
                                       dropout_prob=dropout_prob[-1])
        self.weights = self.feature_extractor.weights + self.classifier.weights

        # ---------------------- BACKPROP
        self.cost = self.classifier.cross_entropy_cost(self.y)
        # L2 penalty on the weight matrices only (biases sit at odd indices).
        L2_sqr = sum((weight ** 2).sum() for weight in self.weights[::2])
        self.grads = T.grad(self.cost + weight_decay * L2_sqr, self.weights)
        self.updates = self.adadelta_updates(self.grads, self.weights)
        # self.updates = self.nesterov_momentum(self.grads, self.weights)

        # --------------------- FUNCTIONS
        self.train_model = theano.function(
            [self.x, self.y, Param(self.training_mode, default=1)],
            outputs=self.cost, updates=self.updates)
        self.validate_model = theano.function(
            [self.x, self.y, Param(self.training_mode, default=0)], self.cost)
        self.test_model = theano.function(
            [self.x, Param(self.training_mode, default=0)],
            self.classifier.p_y_given_x[:, 1])

    def train(self, train_set, max_iter):
        print 'training for', max_iter, 'iterations'
        self.batch_size.set_value(self.training_batch_size)
        train_set_iterator = RandomTrainIterator(train_set, self.training_batch_size)
        done_looping = False
        iter = 0
        while not done_looping:
            for train_x, train_y in train_set_iterator:
                self.train_model(train_x, train_y)
                # if iter % 10 == 0:
                #     self.batch_size.set_value(train_set[0].shape[0])
                #     print self.validate_model(train_set[0], train_set[1])
                #     self.batch_size.set_value(self.training_batch_size)
                if iter > max_iter:
                    done_looping = True
                    break
                iter += 1

    def validate(self, train_set, valid_set, valid_freq, max_iter, fname_out):
        train_set_iterator = RandomTrainIterator(train_set, self.training_batch_size)
        valid_set_size = len(valid_set[1])
        f_out = open(fname_out, 'w')

        # ------------------------------ TRAINING
        epoch = 0
        iter = 0
        best_ce = np.inf
        best_iter_ce = 0
        best_auc = 0
        best_iter_auc = 0
        done_looping = False
        patience = 100000
        patience_increase = 2
        improvement_threshold = 0.995
        while iter < max_iter and not done_looping:
            epoch += 1
            for train_x, train_y in train_set_iterator:
                self.train_model(train_x, train_y)
                iter += 1
                # ------------------------ VALIDATION
                if iter % valid_freq == 0:
                    self.batch_size.set_value(valid_set_size)
                    cost_valid = self.validate_model(valid_set[0], valid_set[1])
                    auc_valid = self.get_auc(valid_set)
                    # print "%4s %7s %15s %15s %10s " % (epoch, iter, auc_valid, cost_valid, patience)
                    f_out.write("%s \t %s \t %s \n" % (iter, auc_valid, cost_valid))
                    self.batch_size.set_value(self.training_batch_size)
                    if cost_valid <= best_ce:
                        if cost_valid < best_ce * improvement_threshold:
                            patience = max(patience, iter * patience_increase)
                        best_iter_ce = iter
                        best_ce = cost_valid
                    if auc_valid >= best_auc:
                        best_iter_auc = iter
                        best_auc = auc_valid
                if patience <= iter:
                    done_looping = True
        print 'best_iter_cost:', best_iter_ce, 'best_cost:', best_ce
        print 'best_iter_auc:', best_iter_auc, 'best_auc:', best_auc
        f_out.close()
        return max(best_iter_ce, best_iter_auc)

    def get_auc(self, data_xy):
        x, y = data_xy[0], data_xy[1]
        p_y_given_x = self.get_test_proba(x)
        fpr, tpr, thresholds = roc_curve(y, p_y_given_x, pos_label=1)
        roc_auc = auc(fpr, tpr)
        return roc_auc

    def get_test_proba(self, x_test):
        self.batch_size.set_value(len(x_test))
        p_y_given_x = self.test_model(x_test)
        return p_y_given_x

    def nesterov_momentum(self, grads, weights, learning_rate=0.001, momentum=0.9):
        updates = []
        for param_i, grad_i in zip(weights, grads):
            mparam_i = theano.shared(np.zeros(param_i.get_value().shape,
                                              dtype=theano.config.floatX))
            v = momentum * mparam_i - learning_rate * grad_i
            w = param_i + momentum * v - learning_rate * grad_i
            updates.append((mparam_i, v))
            updates.append((param_i, w))
        return updates

    def adadelta_updates(self, grads, weights, learning_rate=0.01, rho=0.95, epsilon=1e-6):
        accumulators = [theano.shared(np.zeros_like(param_i.get_value()))
                        for param_i in weights]
        delta_accumulators = [theano.shared(np.zeros_like(param_i.get_value()))
                              for param_i in weights]
        updates = []
        for param_i, grad_i, acc_i, acc_delta_i in zip(weights, grads, accumulators,
                                                       delta_accumulators):
            acc_i_new = rho * acc_i + (1 - rho) * grad_i ** 2
            updates.append((acc_i, acc_i_new))
            update_i = grad_i * T.sqrt(acc_delta_i + epsilon) / T.sqrt(acc_i_new + epsilon)
            updates.append((param_i, param_i - learning_rate * update_i))
            acc_delta_i_new = rho * acc_delta_i + (1 - rho) * update_i ** 2
            updates.append((acc_delta_i, acc_delta_i_new))
        return updates

    def get_state(self):
        state = {}
        state['params'] = self.param_dict
        weights_vals = []
        for p in self.weights:
            weights_vals.append(p.get_value())
        state['weights'] = weights_vals
        return state

    def set_weights(self, weights_vals):
        for i, w in enumerate(weights_vals):
            self.weights[i].set_value(w)
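# A hedged usage sketch for ConvNet above: every key mirrors a lookup in
# __init__, but the concrete values, data shapes, and file name are
# illustrative assumptions, not settings from the original experiments.
# train_set / valid_set are (x, y) tuples of numpy arrays, as indexed above.
param_dict = {
    'training_batch_size': 64,
    'nkerns': (32, 64),          # feature maps per conv stage
    'recept_width': (5, 3),
    'pool_width': (2, 2),
    'stride': (1, 1),
    'dropout_prob': (0.2, 0.5),  # [0] feeds the extractor, [-1] the softmax
    'l2_reg': 1e-4,
    'activation': 'relu',
    'weights_variance': 0.01,
    'n_channels': 16,
    'n_timesteps': 10,
    'n_fbins': 6,
    'global_pooling': True,
}
net = ConvNet(param_dict)
# One plausible flow: pick an iteration count on the validation split,
# then retrain and score the held-out data.
best_iter = net.validate(train_set, valid_set, valid_freq=100,
                         max_iter=20000, fname_out='valid_log.txt')
net.train(train_set, max_iter=best_iter)
probs = net.get_test_proba(x_test)  # P(y = 1 | x) for each test example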
# One-hot encoding: the label 3 becomes [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
y_train_enc = one_hot(y_train)

# Number of classes / pixels per image (samples are stored column-wise,
# so axis 0 holds the feature or class dimension)
num_classes = y_train_enc.shape[0]
num_pixels = x_train.shape[0]

# Build the network structure
net = NeuralNetwork()
net.add(FCLayer(num_pixels, 100, activation=TanH(), optimizer=Adam()))
net.add(DropOut(rate=0.0))
net.add(FCLayer(100, 50, activation=TanH(), optimizer=Adam()))
net.add(DropOut(rate=0.0))
net.add(FCLayer(50, 25, activation=TanH(), optimizer=Adam()))
net.add(DropOut(rate=0.0))
net.add(SoftmaxLayer(25, num_classes, activation=Softmax(), optimizer=Adam()))

# Train
net.use(loss=MultiClassCrossEntropy(), regularizer=L2Regularizer(lambd=0.01))
net.train(x_train, y_train_enc, epochs=50, learning_rate=0.001, batch_size=256)

# Check training accuracy
train_results = net.predict(x_train)
train_results = np.argmax(train_results, axis=0)
print("Accuracy on training set:", np.mean(train_results == y_train) * 100, "%")

# Evaluate the model on the test set
x_test = normalize_images(x_test)
test_results = net.predict(x_test)
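# The script above assumes one_hot() and normalize_images() helpers that are
# not shown. A minimal sketch under the column-wise layout the script uses
# (shape[0] is classes/pixels, so each sample is a column); the names and
# defaults here are assumptions.
import numpy as np

def one_hot(labels, num_classes=10):
    # Build a (num_classes x num_samples) matrix with a single 1 per column,
    # e.g. label 3 -> column [0, 0, 0, 1, 0, 0, 0, 0, 0, 0].
    encoded = np.zeros((num_classes, labels.shape[0]))
    encoded[labels.astype(int), np.arange(labels.shape[0])] = 1
    return encoded

def normalize_images(images):
    # Scale 8-bit pixel intensities into [0, 1].
    return images.astype(np.float32) / 255.0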
class TemporalModel(Model):
    def __init__(self, inputs, bs, max_time, classes, feature_dim, hidden_size,
                 method='max', seed=12345):
        self._inputs = inputs
        self.method = method
        self.batch_size = bs
        self.classes = classes
        self.max_time = max_time
        self.feature_dim = feature_dim
        self.dropout = True
        self.hidden = HiddenLayer(input_size=feature_dim, hidden_size=hidden_size,
                                  batch_size=bs, name='hidden', dropout=0.5,
                                  activation=act.LeakyRelu())
        self.softmax = SoftmaxLayer(input_size=hidden_size, classes=self.classes,
                                    batch_size=bs, name='softmax', dropout=0.5)

    @property
    def params(self):
        return self.softmax.params + self.hidden.params

    @property
    def inputs(self):
        return self._inputs

    @property
    def outputs(self):
        return self._outputs

    @property
    def updates(self):
        return self._updates

    @property
    def test_algorithm(self):
        if not hasattr(self, '_talgorithm'):
            d = self.dropout
            self.dropout = False
            o = self.run(*self.inputs)
            for i, ot in enumerate(self.outputs):
                o[i].name = ot.name
            self._talgorithm = theano.function(inputs=self.inputs, outputs=o,
                                               on_unused_input='warn')
            self.dropout = d
        return self._talgorithm

    def run(self, x, mask, y):
        # Pool each feature over all frames (max / sum / mean), using the mask
        # so padded frames do not contribute.
        if self.method == 'max':
            # Push masked-out frames to a large negative value before the max.
            m = (-100 * (1 - mask)).dimshuffle([0, 1, 'x'])
            x = T.max(x + m, axis=1)
        elif self.method in ('sum', 'mean'):
            x = T.sum(x, axis=1)
            if self.method == 'mean':
                # mean = sum over time / number of valid frames
                x = x / T.sum(mask, axis=1).dimshuffle([0, 'x'])
        x = x.astype(theano.config.floatX)
        x = self.hidden.run(x, self.dropout)
        prob, pred = self.softmax.run(x, self.dropout)
        y = y.reshape((y.shape[0],))
        loss = (self.softmax.loss(prob, y)
                + T.sum(self.hidden.w ** 2) * 0.001
                + T.sum(self.softmax.w ** 2) * 0.0001)
        y = T.extra_ops.to_one_hot(y, self.classes)
        error = self.softmax.error(pred, y)
        acc = 1 - error
        return prob, pred, loss, error, acc
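# A hedged instantiation sketch for the pooling TemporalModel above; the
# symbolic variables, batch size, class count, and 4096-d frame features are
# illustrative assumptions, not part of the original code.
import theano.tensor as T

x = T.tensor3('x')       # batch x time x features
mask = T.matrix('mask')  # batch x time, 1 where a frame is valid
y = T.ivector('y')       # one class index per clip

model = TemporalModel(inputs=[x, mask, y], bs=32, max_time=100, classes=51,
                      feature_dim=4096, hidden_size=512, method='mean')
# run() wires the full graph; test_algorithm compiles it with dropout off.
prob, pred, loss, error, acc = model.run(x, mask, y)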
class TemporalModel(Model):
    def __init__(self, inputs, bs, max_time, classes, feature_dim, hidden_size,
                 levels, N=1, pool=None, seed=12345):
        self._inputs = inputs
        self.N = N
        self.batch_size = bs
        self.classes = classes
        self.max_time = max_time
        self.levels = levels
        self.feature_dim = feature_dim
        self.pool = pool
        self.dropout = True

        # Create a pyramid of filters: level l holds 2**l attention filters
        # with fixed centers (g), widths (d) and variance (sigma), so the
        # pyramid covers the clip at every scale.
        self.temporal_pyramid = []
        for l in range(self.levels):
            for f in range(2 ** l):
                tf = TemporalAttentionLayer(
                    batch_size=bs, N=N, channels=feature_dim,
                    name='temporal-attention-layer-' + str(l) + '-filter-' + str(f))
                tf.test = True
                tf.d = theano.shared(value=np.asarray([1. / 2 ** (l + 1)]).astype('float32'),
                                     name='d', borrow=True, broadcastable=[True])
                tf.g = theano.shared(value=np.asarray([(1. / 2 ** l) + (2 * f / 2. ** l)]).astype('float32'),
                                     name='g', borrow=True, broadcastable=[True])
                tf.sigma = theano.shared(value=np.asarray([5.0]).astype('float32'),
                                         name='sigma', borrow=True, broadcastable=[True])
                self.temporal_pyramid.append(tf)

        input_size = feature_dim * N * (len(self.temporal_pyramid) if pool is None else 1)
        self.hidden = HiddenLayer(input_size=input_size, hidden_size=hidden_size,
                                  activation=act.LeakyRelu(), batch_size=bs,
                                  name='hidden', dropout=0.5)
        self.softmax = SoftmaxLayer(input_size=hidden_size, classes=self.classes,
                                    batch_size=bs, name='softmax', dropout=0.5)

    @property
    def params(self):
        # The pyramid parameters are fixed, so only the classifier is trained:
        # + [p for f in self.temporal_pyramid for p in f.params]
        return self.softmax.params + self.hidden.params

    @property
    def inputs(self):
        return self._inputs

    @property
    def outputs(self):
        return self._outputs

    @property
    def updates(self):
        return self._updates

    @property
    def test_algorithm(self):
        if not hasattr(self, '_talgorithm'):
            d = self.dropout
            self.dropout = False
            o = self.run(*self.inputs)
            for i, ot in enumerate(self.outputs):
                o[i].name = ot.name
            self._talgorithm = theano.function(inputs=self.inputs, outputs=o,
                                               on_unused_input='warn')
            self.dropout = d
        return self._talgorithm

    def run(self, x, mask, y):
        # Apply the temporal filters to x, reshaped to batch x features x time.
        results = []
        x = x.transpose([0, 2, 1])
        for tf in self.temporal_pyramid:
            # res is batch x features x N; flatten to batch x features*N
            res, (g, s2, d) = tf.run(x, mask)
            if self.pool is None:
                results.append(res.reshape((x.shape[0], self.feature_dim * self.N)))
            else:
                results.append(res.reshape((x.shape[0], 1, self.feature_dim * self.N)))
        # Concatenate on axis 1: batch x filters*features*N when not pooling,
        # batch x filters x features*N when pooling across filters.
        x = T.concatenate(results, axis=1)
        if self.pool == 'max':
            x = T.max(x, axis=1)
        elif self.pool == 'sum':
            x = T.sum(x, axis=1)
        elif self.pool == 'mean':
            x = T.mean(x, axis=1)
        x = self.hidden.run(x, self.dropout)
        prob, pred = self.softmax.run(x, self.dropout)
        loss = self.softmax.loss(prob, y)
        error = self.softmax.error(pred, y)
        acc = 1 - error
        return prob, pred, loss, error, acc
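# How the fixed pyramid parameters tile the clip: at level l there are 2**l
# filters of width d = 1/2**(l+1) with evenly spaced centers g, presumably
# mapped to absolute frame positions inside TemporalAttentionLayer. A quick
# check of the values set in __init__ above:
for l in range(3):
    d = 1. / 2 ** (l + 1)
    g = [(1. / 2 ** l) + (2 * f / 2. ** l) for f in range(2 ** l)]
    print('level', l, 'd =', d, 'g =', g)
# level 0: d = 0.5,   g = [1.0]
# level 1: d = 0.25,  g = [0.5, 1.5]
# level 2: d = 0.125, g = [0.25, 0.75, 1.25, 1.75]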
# net, sp and the head of the first CLM(...) call are restored from the
# commented-out duplicate below; fake_image / fake_label are defined
# elsewhere in the surrounding script.
net = Network()
sp = SciPlot('Curve of softmax output')
CLM(net, 25, 25, 3, 11, 11, 48, 2, 4, Relu(), 0.05, momentum_rate=0.0, decay_rate=0.1)
# CLh(net, 2, 2, 10, 1, 2, Relu(), 0.05)
LrnLayer(net, 2, 0.0001, 5, 0.75)
MaxPoolingLayer(net, 3, 3, 2)
FcLayer(net, 5, Relu(), momentum_rate=0.0, decay_rate=0.1)
DropoutLayer(net, dropout_prob=0.5)
SoftmaxLayer(net, 10, momentum_rate=0.0, decay_rate=0.1)

# net1 = Network()
# sp = SciPlot('Curve of softmax output')
# CLM(net1, 25, 25, 3, 11, 11, 48, 2, 4, Relu(), 0.05)
# CLMh(net1, 2, 2, 10, 1, 2, Relu(), 0.05)
# LrnLayer(net1, 2, 0.0001, 5, 0.75)
# MaxPoolingLayer(net1, 3, 3, 2)
# FcLayer(net1, 5, Relu())
# DropoutLayer(net1, dropout_prob=0.5)
# SoftmaxLayer(net1, 10)

sp.plot(net.predict(fake_image, training=False), desc='episode-' + str(0))
for i in range(0, 5):
    net.train_one_sample(fake_label, fake_image, 1)
class TemporalModel(Model):
    def __init__(self, inputs, bs, max_time, classes, feature_dim, hidden_size,
                 filters, N=1, pool=None, lstm_dim=4096, steps=8, seed=12345):
        self._inputs = inputs
        self.N = N
        self.batch_size = bs
        self.classes = classes
        self.max_time = max_time
        self.filters = filters
        self.feature_dim = feature_dim
        self.pool = pool
        self.dropout = True
        self.steps = steps

        self.temporal_filters = []
        for f in range(filters):
            tf = TemporalAttentionLayer(batch_size=bs, N=N, channels=feature_dim,
                                        input_hidden_size=lstm_dim,
                                        name='temporal-attention-layer-filter-' + str(f))
            self.temporal_filters.append(tf)

        input_size = feature_dim * len(self.temporal_filters) * (N if pool is None else 1)
        # One projection produces all four LSTM gate pre-activations at once.
        self.lstm_in = HiddenLayer(input_size=input_size, hidden_size=lstm_dim * 4,
                                   batch_size=bs)
        self.lstm = LSTMLayer(input_size=lstm_dim, hidden_size=lstm_dim)
        self.hidden = HiddenLayer(input_size=lstm_dim, hidden_size=hidden_size,
                                  activation=act.relu, batch_size=bs,
                                  name='hidden', dropout=0.5)
        self.softmax = SoftmaxLayer(input_size=hidden_size, classes=self.classes,
                                    batch_size=bs, name='softmax', dropout=0.5)

    @property
    def params(self):
        return (self.softmax.params + self.hidden.params + self.lstm_in.params +
                self.lstm.params + [p for f in self.temporal_filters for p in f.params])

    @property
    def inputs(self):
        return self._inputs

    @property
    def outputs(self):
        return self._outputs

    @property
    def updates(self):
        return self._updates

    @property
    def test_algorithm(self):
        if not hasattr(self, '_talgorithm'):
            d = self.dropout
            self.dropout = False
            o = self.run(*self.inputs)
            for i, ot in enumerate(self.outputs):
                o[i].name = ot.name
            self._talgorithm = theano.function(inputs=self.inputs, outputs=o,
                                               on_unused_input='warn')
            self.dropout = d
        return self._talgorithm

    def run(self, x, mask, y):
        # Make x batch x features x time, then let the LSTM attend to the clip
        # for a fixed number of glimpse steps.
        x = x.transpose([0, 2, 1])
        h, c = self.lstm.get_initial_hidden(x)
        outputs_info = [dict(initial=h, taps=[-1]),  # h
                        dict(initial=c, taps=[-1])]  # c
        [h, c], _ = theano.scan(fn=self.step, non_sequences=[x, mask],
                                outputs_info=outputs_info, n_steps=self.steps)
        x = self.hidden.run(h[-1], self.dropout)
        prob, pred = self.softmax.run(x, self.dropout)
        loss = self.softmax.loss(prob, y)
        error = self.softmax.error(pred, y)
        acc = 1 - error
        return prob, pred, loss, error, acc

    def step(self, h, c, x, mask):
        results = []
        for tf in self.temporal_filters:
            # res is batch x features x N; flatten (or pool over N) per filter.
            res, (g, s2, d) = tf.run(x, h, mask)
            if self.pool is None:
                results.append(res.reshape((x.shape[0], self.feature_dim * self.N)))
            elif self.pool == 'max':
                results.append(T.max(res, axis=2).reshape((x.shape[0], self.feature_dim)))
            elif self.pool == 'sum':
                results.append(T.sum(res, axis=2).reshape((x.shape[0], self.feature_dim)))
            elif self.pool == 'mean':
                results.append(T.mean(res, axis=2).reshape((x.shape[0], self.feature_dim)))
        # Concatenate on axis 1 to get batch x filters*features*N.
        x = T.concatenate(results, axis=1)
        x = self.lstm_in.run(x)
        h, c = self.lstm.run(x, h, c)
        return h, c
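# A hedged instantiation sketch for the recurrent-attention TemporalModel
# above; the sizes and symbolic inputs are illustrative assumptions.
import theano.tensor as T

x = T.tensor3('x')       # batch x time x features
mask = T.matrix('mask')  # batch x time validity mask
y = T.ivector('y')

model = TemporalModel(inputs=[x, mask, y], bs=16, max_time=100, classes=101,
                      feature_dim=4096, hidden_size=512, filters=4, N=8,
                      pool='mean', lstm_dim=1024, steps=8)
# theano.scan passes the recurrent taps (h, c) first and the non_sequences
# (x, mask) after them, which is why step() is ordered step(h, c, x, mask).
prob, pred, loss, error, acc = model.run(x, mask, y)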