def load_data(self):
    """Create the data handlers and evaluators and load the labeled instances.

    ``self.filenames`` is unpacked into three paths: the training
    embeddings, the test embeddings and the instance labels.  The method
    then fills ``self.train_data``, ``self.train_eval``, ``self.test_eval``
    and ``self.instance_eval``.
    """
    train_path, test_path, labels_path = self.filenames

    # Training handler is kept on the instance; its evaluator wraps it.
    self.train_data = DataHandler(train_path, max_size=200000)
    self.train_eval = GroupEvaluator(data=self.train_data)

    # The test handler is only reachable through its evaluator.
    test_handler = DataHandler(test_path, max_size=200000)
    self.test_eval = GroupEvaluator(data=test_handler)

    # Instance-level evaluator reads its labels from the third file.
    self.instance_eval = InstanceEvaluator()
    self.instance_eval.load_labeled_instances(labels_path)
Beispiel #2
0
#skip = int(100/fps)

flen = []
for line in open(data_dir + '01_test_framenum.txt'):  # test
    flen.append(line.strip())

maxlen = int(flen[tbidx])  # to get the alphas for the whole tbidx-th video
print 'Video length:', maxlen

print '-----'
#print 'Skip set at', skip
print 'Booting up the data handler'

data_pb = TestTestProto(batch_size, maxlen, maxlen, dataset, data_dir,
                        fps)  # or TestTrainProto or TestValidProto
dh = DataHandler(data_pb)
dataset_size = dh.GetDatasetSize()
num_batches = dataset_size / batch_size

print 'Data handler ready'
print '-----'
params = src.actrec.init_params(options)
params = src.actrec.load_params(model, params)
tparams = src.actrec.init_tparams(params)

trng, use_noise, inps, alphas, cost, opt_outs, preds = src.actrec.build_model(
    tparams, options)
f_alpha = theano.function(inps,
                          alphas,
                          name='f_alpha',
                          on_unused_input='ignore')
Beispiel #3
0
def train(
        dim_out=500,  # hidden layer dim for outputs
        ctx_dim=1024,  # context vector dimensionality
        dim=1024,  # the number of LSTM units
        n_actions=3101,  # number of actions to predict
        n_layers_att=1,
        n_layers_out=1,
        n_layers_init=1,
        ctx2out=False,
        patience=50,
        max_epochs=5000,
        dispFreq=100,
        decay_c=0.,
        alpha_c=0.,
        temperature_inverse=1.0,
        lrate=0.001,
        selector=False,
        maxlen=5,  # maximum length of the video
        optimizer='sgd',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        dataset='flickr8k',  # dummy dataset, replace with video ones
        dictionary=None,  # word dictionary
        use_dropout=False,
        reload_=False,
        training_stride=1,
        testing_stride=8,
        last_n=16,
        fps=30):

    # Model options
    model_options = locals().copy()
    #model_options = validate_options(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print '-----'
    print 'Booting up all data handlers'
    data_pb = TrainProto(batch_size, maxlen, training_stride, dataset, fps)
    dh = DataHandler(data_pb)
    dataset_size = dh.GetDatasetSize()
    num_train_batches = dataset_size / batch_size
    if dataset_size % batch_size != 0:
        num_train_batches += 1

    valid = True  # not None
    test = True  # not None

    data_test_train_pb = TestTrainProto(valid_batch_size, maxlen,
                                        testing_stride, dataset, fps)
    dh_test_train = DataHandler(data_test_train_pb)
    test_train_dataset_size = dh_test_train.GetDatasetSize()
    num_test_train_batches = test_train_dataset_size / valid_batch_size
    if test_train_dataset_size % valid_batch_size != 0:
        num_test_train_batches += 1

    data_test_valid_pb = TestValidProto(valid_batch_size, maxlen,
                                        testing_stride, dataset, fps)
    dh_test_valid = DataHandler(data_test_valid_pb)
    test_valid_dataset_size = dh_test_valid.GetDatasetSize()
    num_test_valid_batches = test_valid_dataset_size / valid_batch_size
    if test_valid_dataset_size % valid_batch_size != 0:
        num_test_valid_batches += 1

    data_test_test_pb = TestTestProto(valid_batch_size, maxlen, testing_stride,
                                      dataset, fps)
    dh_test_test = DataHandler(data_test_test_pb)
    test_test_dataset_size = dh_test_test.GetDatasetSize()
    num_test_test_batches = test_test_dataset_size / valid_batch_size
    if test_test_dataset_size % valid_batch_size != 0:
        num_test_test_batches += 1
    print 'Data handlers ready'
    print '-----'

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        print "Reloading model"
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
          inps,\
          cost, \
          opts_out, preds, i_gate = \
          build_model(tparams, model_options)
    '''
    get_i_gate = theano.function(inps[0:2], i_gate, profile=False, on_unused_input='ignore')
    print 'build get_i_gate felished'

    x, vid, n_ex = dh_test_train.GetBatch(data_test_train_pb)
    mask = numpy.ones((maxlen, batch_size)).astype('float32')
    if n_ex != batch_size:
        mask[:,n_ex:] = numpy.zeros((maxlen, batch_size-n_ex)).astype('float32')

    i_gate_np = get_i_gate(x,mask)
    print len(i_gate_np)
    print len(i_gate_np[0])
    print len(i_gate_np[0][0])
    print i_gate_np[0][0][0].shape
    weig = numpy.zeros((7,7,30,batch_size))
    for i in xrange(7):
        for j in xrange(7):
            for k in xrange(30):
                weig[i,j,k,:] = numpy.mean(i_gate_np[k][j][i],axis=1)
    dic = {'weig':weig, 'vid':vid}
    sio.savemat('weig.mat', {'dic':dic})

    train_err = 0
    valid_err = 0
    test_err = 0

    '''

    # before any regularizer
    f_log_probs = theano.function(inps, -cost, profile=False)
    f_preds = theano.function(inps,
                              preds,
                              profile=False,
                              on_unused_input='ignore')

    cost = cost.mean()
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    cost += 0.0001 * i_gate.sum()

    #if alpha_c > 0.:
    #    alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
    #    alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean()
    #    cost += alpha_reg

    # gradient computation
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = numpy.load(saveto)['history_errs'].tolist()
    best_p = None
    bad_count = 0

    uidx = 0

    try:

        for epochidx in xrange(max_epochs):
            # If the input sequences are of variable length get mask from the data loader instead of setting them all to one
            mask = numpy.ones((maxlen, batch_size)).astype('float32')
            print 'Epoch ', epochidx
            n_examples_seen = 0
            estop = False
            if epochidx > 0:
                dh.Reset()

            for tbidx in xrange(num_train_batches):
                n_examples_seen += batch_size
                uidx += 1
                use_noise.set_value(1.)

                pd_start = time.time()
                x, y, n_ex = dh.GetBatch(data_pb)
                if n_ex != batch_size:
                    mask[:, n_ex:] = numpy.zeros(
                        (maxlen, batch_size - n_ex)).astype('float32')
                pd_duration = time.time() - pd_start

                if x == None:
                    print 'Minibatch with zero sample under length ', maxlen
                    continue
                ud_start = time.time()

                cost = f_grad_shared(x, mask, y)
                if uidx == 1:
                    print 'Original Cost ', cost / x.shape[3]
                f_update(lrate)
                ud_duration = time.time() - ud_start

                if n_ex != batch_size:
                    mask[:, n_ex:] = numpy.ones(
                        (maxlen, batch_size - n_ex)).astype('float32')

                if numpy.isnan(cost):
                    print 'NaN detected in cost'
                    return 1., 1., 1.
                if numpy.isinf(cost):
                    print 'INF detected in cost'
                    return 1., 1., 1.

                if numpy.mod(uidx, dispFreq) == 0:
                    print 'Epoch ', epochidx, 'Update ', uidx, 'Cost ', cost / x.shape[
                        3], 'PD ', pd_duration, 'UD ', ud_duration

                if numpy.mod(uidx, saveFreq) == 0:
                    print 'Saving...',

                    if best_p != None:
                        params = copy.copy(best_p)
                    else:
                        params = unzip(tparams)
                    numpy.savez(saveto, history_errs=history_errs, **params)
                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                    print 'Done'

                if numpy.mod(uidx, validFreq) == 0:

                    use_noise.set_value(0.)
                    train_err = 0
                    valid_err = 0
                    test_err = 0
                    print 'Computing predictions (This will take a while. Set the verbose flag if you want to see the progress)'
                    #train_err = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_train_pb, dh_test_train, test_train_dataset_size, num_test_train_batches, last_n, test=False)
                    if valid is not None:
                        valid_err = pred_acc(saveto,
                                             valid_batch_size,
                                             f_preds,
                                             maxlen,
                                             data_test_valid_pb,
                                             dh_test_valid,
                                             test_valid_dataset_size,
                                             num_test_valid_batches,
                                             last_n,
                                             test=True)
                    #if test is not None:
                    #    test_err = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_test_pb, dh_test_test, test_test_dataset_size, num_test_test_batches, last_n, test=True)

                    history_errs.append([valid_err, test_err])
                    if epochidx == 0 or valid_err >= numpy.array(
                            history_errs)[:, 0].max():
                        best_p = unzip(
                            tparams)  # p for min valid err / max valid acc

                    print 'Accuracy: Train', train_err, 'Valid', valid_err, 'Test', test_err

            if n_ex == batch_size:
                print 'Seen %d training examples' % (n_examples_seen)
            else:
                print 'Seen %d training examples' % (n_examples_seen -
                                                     batch_size + n_ex)
            use_noise.set_value(0.)
            train_err = 0
            valid_err = 0
            test_err = 0
            print 'Computing predictions (This will take a while. Set the verbose flag if you want to see the progress)'
            #train_err = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_train_pb, dh_test_train, test_train_dataset_size, num_test_train_batches, last_n, test=False)
            if valid is not None:
                valid_err = pred_acc(saveto,
                                     valid_batch_size,
                                     f_preds,
                                     maxlen,
                                     data_test_valid_pb,
                                     dh_test_valid,
                                     test_valid_dataset_size,
                                     num_test_valid_batches,
                                     last_n,
                                     test=True)
            if test is not None:
                test_err = pred_acc(saveto,
                                    valid_batch_size,
                                    f_preds,
                                    maxlen,
                                    data_test_test_pb,
                                    dh_test_test,
                                    test_test_dataset_size,
                                    num_test_test_batches,
                                    last_n,
                                    test=True)

            history_errs.append([valid_err, test_err])

            if epochidx == 0 or valid_err >= numpy.array(
                    history_errs)[:, 0].max():
                best_p = unzip(tparams)  # p for min valid err / max valid acc

            print 'Accuracy: Train', train_err, 'Valid', valid_err, 'Test', test_err
    finally:  #except KeyboardInterrupt:

        if best_p is not None:
            zipp(best_p, tparams)

        use_noise.set_value(0.)
        train_err = 0
        valid_err = 0
        test_err = 0
        print 'Computing predictions (This will take a while. Set the verbose flag if you want to see the progress)'
        #train_err = pred_acc(saveto, valid_batch_size, f_preds, maxlen, data_test_train_pb, dh_test_train, test_train_dataset_size, num_test_train_batches, last_n, test=False)
        if valid is not None:
            valid_err = pred_acc(saveto,
                                 valid_batch_size,
                                 f_preds,
                                 maxlen,
                                 data_test_valid_pb,
                                 dh_test_valid,
                                 test_valid_dataset_size,
                                 num_test_valid_batches,
                                 last_n,
                                 test=True)
        if test is not None:
            test_err = pred_acc(saveto,
                                valid_batch_size,
                                f_preds,
                                maxlen,
                                data_test_test_pb,
                                dh_test_test,
                                test_test_dataset_size,
                                num_test_test_batches,
                                last_n,
                                test=True)

        print 'Accuracy: Train', train_err, 'Valid', valid_err, 'Test', test_err
        params = copy.copy(best_p)
        numpy.savez(saveto,
                    zipped_params=best_p,
                    train_err=train_err,
                    valid_err=valid_err,
                    test_err=test_err,
                    history_errs=history_errs,
                    **params)

        print model_options

    return train_err, valid_err, test_err
class GICF(object):
    """Defines the model described in the paper. Creates filenames for each dataset and loads the data.
    Then optimizes the cost function through the train method"""
    def __init__(self, dataset='movies'):
        """Initialize variables, get filenames according to dataset, load data and set parameters.

        Args:
            dataset: one of ``'movies'``, ``'yelp'`` or ``'amazon'``.  Any
                other value makes ``get_filenames`` return ``None`` and the
                subsequent ``load_data`` call fails when unpacking it.
        """
        self.dataset = dataset
        self.filenames = self.get_filenames(dataset)
        self.test_groups = []
        self.test_instances = []
        self.train_data = None
        self.train_eval = None
        self.test_eval = None
        self.instance_eval = None
        # Per-call histories filled by _print_progress.
        self.group_acc = []
        self.instance_auc = []
        self.instance_acc = []
        self.train_acc = []
        self.total_iterations = 0

        self.load_data()  # takes a few seconds
        self.set_parameters()  # set default parameters. Can be changed later
        self.embeddings_dimension = self.train_data.get_embeddings_dimension()
        self._print_titles = '#iter\tACC Train\tAUC Train\tACC Test\tAUC Test\t\t|\tACC Sent\tAUC Sent\t\tPRC Sent'

    def get_filenames(self, dataset='movies'):
        """Return the data file paths for ``dataset``.

        Returns:
            A tuple ``(train_embeddings, test_embeddings, instance_labels)``
            of relative paths, or ``None`` (after printing a message) when
            the dataset name is not recognised.
        """
        if dataset == 'movies':
            dir_name = 'data/movies/'
        elif dataset == 'yelp':
            dir_name = 'data/yelp/'
        elif dataset == 'amazon':
            dir_name = 'data/amazon/cells/'
        else:
            print('Wrong dataset Name.')
            return None

        embeddings_file_train = dir_name + 'train.emb'  # emb\t emb\t score\n
        embeddings_file_test = dir_name + 'test.emb'
        instance_labels = dir_name + 'test_sentences.emb'
        return embeddings_file_train, embeddings_file_test, instance_labels

    def load_data(self):
        """Makes the data handlers, and loads the data from disk"""
        emb_train, emb_test, instance_labels = self.filenames
        self.train_data = DataHandler(emb_train, max_size=200000)
        self.train_eval = GroupEvaluator(data=self.train_data)
        self.test_eval = GroupEvaluator(
            data=DataHandler(emb_test, max_size=200000))
        self.instance_eval = InstanceEvaluator()
        self.instance_eval.load_labeled_instances(instance_labels)

    @property
    def _param_str(self):
        """A string with the parameters of the experiment.

        Layout: ``<epochs>x<batch_size>_<lr>_<alpha_balance><similarity_fn><sim_variance>``.
        """
        return str(self.epochs) + 'x' + str(self.batch_size) + '_' + str(
            self.lr) + '_' + str(
                self.alpha_balance) + self.similarity_fn + str(
                    self.sim_variance)

    def set_parameters(self,
                       batch_size=500,
                       alpha_balance=0.04,
                       lr=0.1,
                       momentum_value=0.7,
                       similarity_fn='rbf',
                       sim_variance=0.7071,
                       epochs=3):
        """Set the parameters for the run/experiment.

        Besides storing the arguments, this derives ``run_name``,
        ``dir_name`` and ``output_name`` and forwards ``batch_size`` to the
        training data handler, so ``load_data`` must have run first.
        """
        self.alpha_balance = alpha_balance
        self.momentum_value = momentum_value
        self.similarity_fn = similarity_fn
        self.sim_variance = sim_variance
        self.epochs = epochs
        self.batch_size = batch_size

        self.lr = lr * self.batch_size  # learning rate is a function of batch size
        self.run_name = self._param_str
        self.dir_name = self.similarity_fn + '_' + str(
            self.batch_size) + '_' + str(self.epochs)
        self.output_name = './training_output/' + self.dataset + '/' + self.dir_name + '_'
        self.train_data.set_batch_size(batch_size)

    def train(self):
        """Where the magic happens. Optimizes the cost function of the paper, based on the parameters given before.

        After every mini-batch the current theta is evaluated through
        ``_print_progress``; the theta with the highest returned (test
        group) accuracy is kept and saved.  The early-termination hook
        (``_terminate_conditions``) is currently disabled, so all epochs
        run.  At the end the last theta is saved and detailed stats for the
        best theta are printed.

        Returns:
            ``(train_acc, group_acc, instance_acc, instance_auc)`` — the
            history lists accumulated by ``_print_progress``.
        """
        print('Optimizing for ', self._param_str)
        self.total_iterations = 0
        accs = []
        #theta = np.random.random(self.embeddings_dimension)
        theta = np.zeros(self.embeddings_dimension)
        #theta=np.loadtxt('training_output/movies/rbf_100_300_300x100_10.0_0.04rbf0.7071_last_theta', delimiter=',')
        print(theta)
        best_theta = theta
        best_acc = 0
        terminate = False

        for epoch in range(self.epochs):
            self.train_data.rewind_dataset(True)  # reset and shuffle data

            if terminate:
                break
            print('-------epoch ', epoch, '-----------')
            print(self._print_titles)

            X, gs, gl = self.train_data.get_next_batch()

            while X is not None:  # for each mini-batch # do gd step

                W_ij = similarity.get_sim_matrix(X, self.similarity_fn,
                                                 self.sim_variance)

                # calculate y_hat and derivative
                Y_ij = af.calculate_y(X, theta)
                Y_der_ij = af.calculate_y_der(Y_ij, X)

                # calculate cost
                similarity_cost = af.similarity_derivative(
                    Y_ij, Y_der_ij, W_ij) / (X.shape[0]**2)
                group_cost = self.alpha_balance * af.group_derivative(
                    Y_ij, Y_der_ij, gs, gl) / float(len(gs))
                #if self.total_iterations %8==0:
                theta_der = similarity_cost + group_cost
                #else:
                #theta_der = similarity_cost
                #print(theta_der)

                # new theta: theta itself is scaled by the momentum value and
                # the step size decays as 1 / (epoch + 1).  This builds a new
                # array each step, so best_theta never aliases the live theta.
                theta = self.momentum_value * theta - self.lr / (epoch +
                                                                 1) * theta_der
                #theta = theta - (1 - self.momentum_value) * self.lr / (epoch + 1) * theta_der#(1 - self.momentum_value) *

                self.total_iterations += 1

                # print progress
                #if self.total_iterations % 50 == 0:
                acc = self._print_progress(theta)
                accs.append(acc)
                # Jilu is not defined in this class; it is expected to be a
                # non-empty module-level list (presumably the running-best
                # accuracy record) — TODO confirm where it is initialised.
                if Jilu[-1] < acc:
                    Jilu.append(acc)
                else:
                    Jilu.append(Jilu[-1])
                if acc > best_acc:  # save best theta
                    best_acc = acc
                    best_theta = theta
                    io.save_theta(theta,
                                  self.output_name + self._param_str,
                                  best=True)

                #if self._terminate_conditions(theta, accs):
                #if self.total_iterations == 100:
                #   terminate = True
                #   break
                X, gs, gl = self.train_data.get_next_batch()

        io.save_theta(theta, self.output_name + self._param_str + '_last')

        print('\n\n\n\t\t\t---BEST THETA VALUE (in training group)---')

        self._print_progress(best_theta, print_details=True)
        return self.train_acc, self.group_acc, self.instance_acc, self.instance_auc

    def _terminate_conditions(self, theta, accs):
        """Decide whether optimization should stop early.

        Args:
            theta: current parameter vector; only ``theta[0]`` is inspected.
            accs: accuracies recorded so far, oldest first.

        Returns:
            ``True`` when theta has diverged to NaN, or when more than 1500
            iterations have run and the variance of the last 50 accuracies
            is below 0.00005; ``False`` otherwise.
        """
        if np.isnan(theta[0]):
            return True

        # Last 50 values (or all of them when fewer are available).  The
        # previous slice ``[:-50]`` selected everything *except* the last 50.
        recent = np.array(accs)[-50:]
        var = np.var(recent)

        return bool(self.total_iterations > 1500 and var < 0.00005)

    def _print_progress(self, theta, print_details=False):
        """Evaluate ``theta`` on all evaluators, print and record the scores.

        Appends to ``train_acc``, ``group_acc``, ``instance_acc`` and
        ``instance_auc``.  When ``print_details`` is true the ``Jilu``
        history is also plotted with matplotlib (blocking ``plt.show()``).

        Returns:
            The test-group accuracy, which the caller uses to pick the best
            theta.
        """
        # iterations  train accuracy, train AUC, test accuracy, test AUC | instance accuracy, instance auc, instance PRC
        print('%6d\t' % self.total_iterations)
        acc_train, auc_train = self.train_eval.evaluate_groups(
            theta, print_details)

        # NOTE: stored as a one-element list, unlike the other histories.
        self.train_acc.append([acc_train])
        print(
            round(100 * acc_train, 2),
            ' \t\t(',
            round(100 * auc_train, 2),
            ')\t',
            'acc_train',
        )

        acc, auc = self.test_eval.evaluate_groups(theta, print_details)
        self.group_acc.append(acc)
        accback = acc  # keep the group accuracy; `acc` is reused below
        print(
            round(100 * acc, 2),
            ' \t\t(',
            round(100 * auc, 2),
            ')\t',
            'evagroup',
        )
        print('\t|\t')
        acc, auc = self.instance_eval.evaluate_instances(theta)
        auprc = self.instance_eval.evaluate_instances(theta, prc=True)

        self.instance_acc.append(acc)
        self.instance_auc.append(auc)
        # Increasing sequence starting at 0, used as the x-axis data.
        XX = range(len(Jilu))
        if print_details:
            # Simplest possible plot: only x and y values are supplied.
            plt.plot(XX, Jilu)
            plt.show()
        print(Jilu)
        print(
            round(100 * acc, 2),
            '\t\t(',
            round(100 * auc, 2),
            ')\t',
            ' \t(',
            round(100 * auprc, 2),
            ')\t',
            'evainstance',
        )
        return accback  #acc_train  # based on this we decide best theta
Beispiel #5
0
def train(
        dim_out=100,  # hidden layer dim for outputs
        ctx_dim=512,  # context vector dimensionality
        dim=1000,  # the number of LSTM units
        n_actions=3,  # number of actions to predict
        n_layers_att=1,
        n_layers_out=1,
        n_layers_init=1,
        ctx2out=False,
        patience=10,
        max_epochs=5000,
        dispFreq=100,
        decay_c=0.,
        alpha_c=0.,
        temperature_inverse=1.0,
        lrate=0.01,
        selector=False,
        maxlen=30,  # maximum length of the video
        optimizer='adam',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        dataset='flickr8k',  # dummy dataset, replace with video ones
        dictionary=None,  # word dictionary
        use_dropout=False,
        reload_=False,
        training_stride=1,
        testing_stride=8,
        last_n=16,
        fps=100,
        data_dir='/home/pmorerio/datasets/IIT_IFM/'):

    # Model options
    model_options = locals().copy()
    #model_options = validate_options(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print '-----'
    print 'Booting up all data handlers'
    print 'Training set for actual training (randomized)'
    data_pb = TrainProto(batch_size, maxlen, training_stride, dataset,
                         data_dir, fps)
    dh = DataHandler(data_pb)
    dataset_size = dh.GetDatasetSize()
    num_train_batches = dataset_size / batch_size
    if dataset_size % batch_size != 0:
        num_train_batches += 1
    print num_train_batches, ' batches'

    valid = None  # not None
    test = True  # not None

    print 'Training set for training accuracy'  # the training set is loaded twice: for actual training and for computing training error
    data_test_train_pb = TestTrainProto(valid_batch_size, maxlen,
                                        testing_stride, dataset, data_dir, fps)
    dh_test_train = DataHandler(data_test_train_pb)
    test_train_dataset_size = dh_test_train.GetDatasetSize()
    num_test_train_batches = test_train_dataset_size / valid_batch_size
    if test_train_dataset_size % valid_batch_size != 0:
        num_test_train_batches += 1
    print num_test_train_batches, ' batches'

    if valid == True:
        print 'Validation set for validation accuracy'
        data_test_valid_pb = TestValidProto(valid_batch_size, maxlen,
                                            testing_stride, dataset, data_dir,
                                            fps)
        dh_test_valid = DataHandler(data_test_valid_pb)
        test_valid_dataset_size = dh_test_valid.GetDatasetSize()
        num_test_valid_batches = test_valid_dataset_size / valid_batch_size
        if test_valid_dataset_size % valid_batch_size != 0:
            num_test_valid_batches += 1
        print num_test_valid_batches, ' batches'

    print 'Test set for test accuracy'
    data_test_test_pb = TestTestProto(valid_batch_size, maxlen, testing_stride,
                                      dataset, data_dir, fps)
    dh_test_test = DataHandler(data_test_test_pb)
    test_test_dataset_size = dh_test_test.GetDatasetSize()
    num_test_test_batches = test_test_dataset_size / valid_batch_size
    if test_test_dataset_size % valid_batch_size != 0:
        num_test_test_batches += 1
    print num_test_test_batches, ' batches'

    print 'Data handlers ready'
    print '-----'

    print 'Building model'
    params = init_params(model_options)  # actual parameter initialization
    # reload parameters
    if reload_ and os.path.exists(saveto):
        print "Reloading model"
        params = load_params(saveto, params)

    # simply initializes Theano shared variable according to param
    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #	4) alphas - the attention weigths
    #   4) cost - log likelihood for each sentence
    #   5) opts_out - optional outputs (e.g selector)
    #	6) preds - the computed labels

    trng, use_noise, \
          inps, alphas, \
          cost, \
          opts_out, preds = \
          build_model(tparams, model_options)   # builds the whole computation graph

    # before any regularizer
    f_log_probs = theano.function(inps, -cost, profile=False)
    f_preds = theano.function(inps,
                              preds,
                              profile=False,
                              on_unused_input='ignore')

    cost = cost.mean()

    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # add attention penalty to the cost
    #if alpha_c > 0.:
    #alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
    #alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean()
    #cost += alpha_reg

    # add ATTENTION FOCUS to the cost
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = -alpha_c * (
            alphas * tensor.log(alphas + 1e-8)).sum(0).sum(0).mean()
        cost += alpha_reg

    # Backpropagation
    # gradient computation
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print 'Optimization'

    history_acc = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_acc = numpy.load(saveto)['history_acc'].tolist()
    best_p = None
    bad_count = 0

    uidx = 0

    train_acc = 0
    valid_acc = 0
    test_acc = 0

    for epochidx in xrange(max_epochs):
        # If the input sequences are of variable length get mask from the data loader instead of setting them all to one
        mask = numpy.ones((maxlen, batch_size)).astype('float32')
        print 'Epoch ', epochidx
        n_examples_seen = 0
        estop = False  # not used
        #if epochidx > 0:
        dh.Reset()  #  training data is shuffled at each epoch in Reset()

        udtime = 0
        pdtime = 0
        for tbidx in xrange(num_train_batches):
            n_examples_seen += batch_size
            uidx += 1
            use_noise.set_value(1.)

            pd_start = time.time()
            x, y, n_ex = dh.GetBatch(
                data_pb
            )  # looks really slow. this is maybe why also predictions are slow (must get batches for all train/test/valid)
            if n_ex != batch_size:
                mask[:, n_ex:] = numpy.zeros(
                    (maxlen, batch_size - n_ex)).astype('float32')
            pdtime += time.time() - pd_start  # pd stands for prepare data?

            #if x == None: # this gives a Warning. Replaced with -> if x is None:
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                continue
            ud_start = time.time()

            cost = f_grad_shared(x, mask, y)
            f_update(lrate)
            udtime += time.time() - ud_start  # ud stands for use data?

            if n_ex != batch_size:
                mask[:, n_ex:] = numpy.ones(
                    (maxlen, batch_size - n_ex)).astype('float32')

            if numpy.isnan(cost):
                print 'NaN detected in cost'
                return 1., 1., 1.
            if numpy.isinf(cost):
                print 'INF detected in cost'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', epochidx, ' Update', uidx, ' Cost', cost, ' PD', pdtime / float(
                    dispFreq), ' UD', udtime / float(dispFreq)
                pdtime = 0
                udtime = 0

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p != None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_acc=history_acc, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                train_acc = 0
                valid_acc = 0
                test_acc = 0
                print 'Computing predictions (This will take a while. Set the verbose flag if you want to see the progress)'
                train_acc = pred_acc(saveto,
                                     valid_batch_size,
                                     f_preds,
                                     maxlen,
                                     data_test_train_pb,
                                     dh_test_train,
                                     test_train_dataset_size,
                                     num_test_train_batches,
                                     last_n,
                                     test=False)
                if valid is not None:
                    valid_acc = pred_acc(saveto,
                                         valid_batch_size,
                                         f_preds,
                                         maxlen,
                                         data_test_valid_pb,
                                         dh_test_valid,
                                         test_valid_dataset_size,
                                         num_test_valid_batches,
                                         last_n,
                                         test=True)
                if test is not None:
                    test_acc = pred_acc(saveto,
                                        valid_batch_size,
                                        f_preds,
                                        maxlen,
                                        data_test_test_pb,
                                        dh_test_test,
                                        test_test_dataset_size,
                                        num_test_test_batches,
                                        last_n,
                                        test=True)

                history_acc.append([train_acc, valid_acc, test_acc])

                if uidx == 0 or valid_acc >= numpy.array(history_acc)[:,
                                                                      1].max():
                    best_p = unzip(
                        tparams)  # p for min valid err / max valid acc

                print 'Accuracy: Train', train_acc, 'Valid', valid_acc, 'Test', test_acc
            # end of the loop over the batches
        if n_ex == batch_size:
            print 'Seen %d training examples' % (n_examples_seen)
        else:
            print 'Seen %d training examples' % (n_examples_seen - batch_size +
                                                 n_ex)
        use_noise.set_value(0.)
        train_acc = 0
        valid_acc = 0
        test_acc = 0
        print 'Computing predictions (This will take a while. Set the verbose flag if you want to see the progress)'
        train_acc = pred_acc(saveto,
                             valid_batch_size,
                             f_preds,
                             maxlen,
                             data_test_train_pb,
                             dh_test_train,
                             test_train_dataset_size,
                             num_test_train_batches,
                             last_n,
                             test=False)
        if valid is not None:
            valid_acc = pred_acc(saveto,
                                 valid_batch_size,
                                 f_preds,
                                 maxlen,
                                 data_test_valid_pb,
                                 dh_test_valid,
                                 test_valid_dataset_size,
                                 num_test_valid_batches,
                                 last_n,
                                 test=True)
        if test is not None:
            test_acc = pred_acc(saveto,
                                valid_batch_size,
                                f_preds,
                                maxlen,
                                data_test_test_pb,
                                dh_test_test,
                                test_test_dataset_size,
                                num_test_test_batches,
                                last_n,
                                test=True)

        history_acc.append([train_acc, valid_acc, test_acc])

        if epochidx == 0 or valid_acc >= numpy.array(history_acc)[:, 1].max():
            best_p = unzip(tparams)  # p for min valid err / max valid acc

        print 'Accuracy: Train', train_acc, 'Valid', valid_acc, 'Test', test_acc
        # here ends the cycle over the epochs

    # use the best parameters for the final checkpoint (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)

    # if the best parameters were found with validation, compute the accuracies with them
    if valid is not None:
        use_noise.set_value(0.)
        train_acc = 0
        valid_acc = 0
        test_acc = 0
        print 'Computing predictions (This will take a while. Set the verbose flag if you want to see the progress)'
        train_acc = pred_acc(saveto,
                             valid_batch_size,
                             f_preds,
                             maxlen,
                             data_test_train_pb,
                             dh_test_train,
                             test_train_dataset_size,
                             num_test_train_batches,
                             last_n,
                             test=False)
        valid_acc = pred_acc(saveto,
                             valid_batch_size,
                             f_preds,
                             maxlen,
                             data_test_valid_pb,
                             dh_test_valid,
                             test_valid_dataset_size,
                             num_test_valid_batches,
                             last_n,
                             test=True)
        if test is not None:
            test_acc = pred_acc(saveto,
                                valid_batch_size,
                                f_preds,
                                maxlen,
                                data_test_test_pb,
                                dh_test_test,
                                test_test_dataset_size,
                                num_test_test_batches,
                                last_n,
                                test=True)

        print 'Accuracy: Train', train_acc, 'Valid', valid_acc, 'Test', test_acc

    params = copy.copy(best_p)
    numpy.savez(saveto,
                zipped_params=best_p,
                train_acc=train_acc,
                valid_acc=valid_acc,
                test_acc=test_acc,
                history_acc=history_acc,
                **params)

    print model_options

    return train_acc, valid_acc, test_acc