def __init__(self, mode):
        """
        Set up the classifier either for training or for evaluation.

        Parameters
        ----------
        mode: 'train' or 'eval'.
            'train': loads the labeled data of cross-validation fold 0,
                     concatenates its three splits and calls self.train().
            'eval':  loads the thresholds and the fitted models from
                     ../data/logistic_regression.pkl.
        """
        assert mode in ['train', 'eval']

        if mode == 'train':
            dl = Data_loader(labeled_only=True, option='both')
            # fetch fold 0 once and reuse it, instead of rebuilding the fold
            # for every split that is concatenated
            fold_splits = dl.cv_data(0)
            self.train_test_val_data = (fold_splits[0] + fold_splits[1] +
                                        fold_splits[2])
            self.train()
        else:
            # NOTE: pickle executes arbitrary code while loading; this file
            # must come from a trusted source.
            # the context manager closes the file handle after loading
            with open("../data/logistic_regression.pkl", 'rb') as model_file:
                model_dict = pkl.load(model_file)
            self.thresholds = model_dict['thresholds']
            self.classifiers = model_dict['models']
Exemple #2
0
def visualize_labeled_dataset():
    """Plot every labeled tweet of cross-validation fold 0 using averaged w2v embeddings."""
    print('Initializing Data Loader')
    loader = Data_loader()

    # the three splits of fold 0 are merged back into one labeled collection
    train_split, val_split, test_split = loader.cv_data(fold_idx=0)
    id_label_pairs = []
    for tweet in train_split + val_split + test_split:
        id_label_pairs.append((tweet['tweet_id'], tweet['label']))
    print('Number of labeled tweets:', len(id_label_pairs))

    # alternative view:
    # plot_tweets(labeled_tweets, emb_type='splex', rep_mode='sum', include_sub=False, force_TSNE=True)
    plot_tweets(id_label_pairs, emb_type='w2v', rep_mode='avg')
                        batch_size=batch_size,
                        sample=True),
            create_data(input_name2id2np,
                        val,
                        return_generators=return_generators,
                        batch_size=batch_size,
                        sample=False),
            create_data(input_name2id2np,
                        test,
                        return_generators=return_generators,
                        batch_size=batch_size,
                        sample=False))


if __name__ == '__main__':
    # Manual smoke test: build word-level data for fold 0 and print the first
    # training example.
    from data_loader import Data_loader

    loader = Data_loader(vocab_size=40000, max_len=50, option='word')
    train_split, _val_split, _test_split = loader.cv_data(0)
    print(train_split[0])

    # Disabled check of the classifier-data conversion:
    # (X_train, y_train), (X_val, y_val), (X_test, y_test) = create_clf_data(simplest_tweet2data,
    #                                                                        data_fold)
    # for key in X_train:
    #     print(X_train[key])
Exemple #4
0
class Experiment:
    """
    Runs cross validation for a neural tweet classifier and writes the
    per-fold predictions, ground truths and aggregated metrics into the
    experiment directory.
    """

    def __init__(self, experiment_dir, input_name2id2np=None, adapt_train_vocab=False,
                 comments='', epochs=100, patience=4, noise_function=None, filter_function=None, fold=5,
                 predict_ens_test=True, by_fold=False,
                 **kwargs):
        """
        an experiment class that runs cross validation
        designed to enable easy experiments with combinations of:
        1) context representation:
            handled by input_name2id2np
        2) pre-training methods:
            handled by pretrained_weight_dirs in the kwargs argument
            None if there is no pretraining weight available
        3) char vs. word:
            specified in "options"
            options = ['char', 'word'] if you want to include both

        Parameters
        ----------
        experiment_dir: name of the directory (created under ../experiments/) where the
                            experiment weights and results will be saved.
                            an existing directory with the same name is deleted first.
        input_name2id2np: context representation inputs; defaults to an empty dictionary
        adapt_train_vocab: under supervised training without pretraining,
                            some vocab will not be seen (twice) in the training set.
                            if set to True, then vocab occuring less than twice will be removed.
        comments: comments intended for the README.
                            NOTE: writing the README header is currently disabled, so this
                            value is not used.
        epochs: number of epochs of training during cross validation
        patience: number of epochs allowable for not having any improvement on the validation set
        noise_function: if given, it is applied to a copy of every row of the char-level
                            training inputs before each training epoch
        filter_function: if given, it is applied to a copy of every row of the char-level
                            inputs of all data splits before training
        fold: number of cross validation folds to run
        predict_ens_test: whether to also predict on the ensemble data and the held-out test data
        by_fold: if True, kwargs['pretrained_weight_dirs'] is indexed by the fold index
                            before each model is built
        kwargs: arguments that will be passed to the data loader and to the
                            neural network model (shown below)

        ========== below is the parameters needed by the neural network model ==========

        options: an array containing all the options considered in the neural network model ['char', 'word']
                    (probably splex in the future)
                    for each option, the input is mapped to a lower dimension,
                    then the lower dimension representation of each option is concatenated
                    and is followed by the final classification layer
        word_vocab_size: number of word level vocabs to be considered
        word_max_len: number of words in a tweet sentence
        char_vocab_size: number of char level vocabs to be considered
        char_max_len: number of chars in a tweet sentence
        drop_out: dropout rate for regularization
        filter: number of filters for each kernel size
        dense_size: the size of the dense layer following the max pooling layer
        embed_dim: embedding dimension for character and word level
        kernel_range: range of kernel sizes
        pretrained_weight_dirs: a dictionary containing the pretrained weight.
                    e.g. {'char': '../weights/char_ds.weights'} means that the pretrained weight for character level model
                    is in ../weights/char_ds.weights
        weight_in_keras: whether the weight is in Keras
        context_dim: the dimension of context representation
        context_dense_size: the dense layer size right before the context representation
        splex_dense_size: dense layer size right before the splex reps
        """
        # creating the experiment dir (any previous content is discarded)
        experiment_dir = self._resolve_experiment_dir(experiment_dir)
        self.experiment_dir, self.kwargs = experiment_dir, kwargs
        subprocess.call(['rm', '-rf', experiment_dir])
        subprocess.call(['mkdir', experiment_dir])
        self.adapt_train_vocab = adapt_train_vocab
        self.predict_ens_test = predict_ens_test
        if input_name2id2np is None:
            input_name2id2np = {}
        self.input_name2id2np = input_name2id2np
        self.fold = fold
        self.dl = Data_loader(option='both', labeled_only=True, **kwargs)
        self.epochs, self.patience = epochs, patience
        self.noise_function, self.filter_function = noise_function, filter_function
        self.pretrained_weight_dirs = self.kwargs.get('pretrained_weight_dirs')
        self.by_fold = by_fold

    @staticmethod
    def _resolve_experiment_dir(experiment_dir):
        """Return '../experiments/<experiment_dir>' ending with exactly one added '/' if it was missing."""
        if not experiment_dir.endswith('/'):
            experiment_dir += '/'
        return '../experiments/' + experiment_dir

    @staticmethod
    def _map_char_inputs(X, fn):
        """Return a new input dict where fn was applied to a copy of each row of the char-level inputs."""
        mapped = {}
        for key in X:
            if 'char' in key and 'input' in key:
                mapped[key] = np.array([fn(x[:]) for x in X[key]])
            else:
                mapped[key] = X[key]
        return mapped

    @staticmethod
    def _tune_threshold(y_true, scores, assigned):
        """Return the threshold in [0.01, 0.99] with the best F1 on samples not yet assigned a class."""
        best_t, best_f = 0, -1
        for t in np.arange(0.01, 1, 0.01):
            candidate = [1 if assigned[idx] is None and scores[idx] >= t else 0
                         for idx in range(len(assigned))]
            f = f1_score(y_true, candidate)
            if f > best_f:
                best_f, best_t = f, t
        return best_t

    @staticmethod
    def _assign_class(assigned, scores, threshold, class_idx):
        """Label (in place) every still-unassigned sample whose score reaches the threshold."""
        for idx in range(len(assigned)):
            if assigned[idx] is None and scores[idx] >= threshold:
                assigned[idx] = class_idx

    def _build_model(self, fold_idx):
        """Build a fresh network, selecting the fold-specific pretrained weights when by_fold is set."""
        if self.by_fold:
            self.kwargs['pretrained_weight_dirs'] = self.pretrained_weight_dirs[fold_idx]
        return NN_architecture(**self.kwargs).model

    def cv(self):
        """
        Run cross validation and write all results to the experiment directory
        (see read_results for retrieving the performance).

        Raises
        ------
        ValueError: if kwargs['mode'] is not None, 'cascade' or 'ternary'.
        """
        mode = self.kwargs.get('mode')
        if mode not in (None, 'cascade', 'ternary'):
            # previously this surfaced as a NameError after the first fold was prepared
            raise ValueError("unsupported mode: %r" % (mode,))

        results = []

        for fold_idx in range(self.fold):
            print('cross validation fold %d.' % (fold_idx + 1))

            # retrieving cross validation data
            fold_data = self.dl.cv_data(fold_idx)
            ((X_train, y_train), (X_val, y_val), (X_test, y_test)) = \
                create_clf_data(self.input_name2id2np, fold_data, return_generators=False)

            if self.predict_ens_test:
                # retrieving the ensemble data and the held-out test data;
                # only the inputs are needed, the labels are not used here
                X_ensemble, _ = create_data(self.input_name2id2np, self.dl.ensemble_data())
                X_held_out, _ = create_data(self.input_name2id2np, self.dl.test_data())

            if self.filter_function is not None:
                X_train, X_val, X_test = (self._map_char_inputs(X, self.filter_function)
                                          for X in (X_train, X_val, X_test))
                if self.predict_ens_test:
                    X_ensemble = self._map_char_inputs(X_ensemble, self.filter_function)
                    X_held_out = self._map_char_inputs(X_held_out, self.filter_function)

            # if no pretrained weights, adapting vocabulary so that those who appear in
            # X_train less than twice would not be counted
            if self.adapt_train_vocab:
                if self.predict_ens_test:
                    adapt_vocab(X_train, (X_val, X_test, X_ensemble, X_held_out))
                else:
                    adapt_vocab(X_train, (X_val, X_test))

            class_weight = calculate_class_weight(y_train)

            # initializing model, train and predict
            K.clear_session()
            self.kwargs['input_dim_map'] = extract_dim_input_name2id2np(self.input_name2id2np)

            # cross validation test data in categorical form
            y_test = np.argmax(y_test, axis=-1)

            # OBSOLETE
            if mode == 'ternary':
                self.model = self._build_model(fold_idx)
                self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[macro_f1])

                # call backs
                es = EarlyStopping(patience=self.patience, monitor='val_loss', verbose=1)
                weight_dir = self.experiment_dir + str(fold_idx) + '.weight'
                mc = ModelCheckpoint(weight_dir, save_best_only=True, save_weights_only=True)
                callbacks = [es, mc]

                # fit for at least 1 epoch
                self.model.fit(x=X_train, y=y_train, validation_data=(X_val, y_val), class_weight=class_weight)

                # training
                self.model.fit(x=X_train, y=y_train, validation_data=(X_val, y_val), callbacks=callbacks,
                               epochs=self.epochs, class_weight=class_weight)
                self.model.load_weights(weight_dir)

                # prediction
                y_pred = self.model.predict(x=X_test)
                y_pred_val_score = self.model.predict(x=X_val)

                # saving predictions for ensembles
                np.savetxt(self.experiment_dir + 'pred_test' + str(fold_idx) + '.np', y_pred)
                np.savetxt(self.experiment_dir + 'pred_val' + str(fold_idx) + '.np', y_pred_val_score)
                np.savetxt(self.experiment_dir + 'truth_test' + str(fold_idx) + '.np', y_test)
                # the validation truth file holds the validation labels (it used to hold y_test)
                np.savetxt(self.experiment_dir + 'truth_val' + str(fold_idx) + '.np', y_val)

                # make the predictions categorical; these are the names the
                # shared saving/evaluation code after this branch relies on
                y_pred_test = np.argmax(y_pred, axis=-1)
                y_pred_val = np.argmax(y_pred_val_score, axis=-1)

            # only cascade model is used and it is the default
            else:
                # initialize the predictions
                num_val, num_test = y_val.shape[0], y_test.shape[0]
                y_pred_val, y_pred_test = [None] * num_val, [None] * num_test

                for class_idx in range(2):
                    # time the training (seconds)
                    start = time.time()

                    # create layer name that has prefix
                    # since for each fold we train model for aggression and loss models separately
                    self.kwargs['prefix'] = 'aggression' if class_idx == 0 else 'loss'

                    # initialize a model
                    self.model = self._build_model(fold_idx)
                    self.model.compile(optimizer='adam', loss='binary_crossentropy')

                    # create the label for this binary classification task
                    _y_train_, _y_val_ = y_train[:, class_idx], y_val[:, class_idx]

                    weight_dir = self.experiment_dir + str(fold_idx) + '_' + str(class_idx) + '.weight'

                    # training
                    if self.noise_function is None:
                        es = EarlyStopping(patience=self.patience, monitor='val_loss', verbose=1)
                        mc = ModelCheckpoint(weight_dir, save_best_only=True, save_weights_only=True)
                        # fit for at least 1 epoch before early stopping applies
                        self.model.fit(x=X_train, y=_y_train_,
                                       validation_data=(X_val, _y_val_))
                        self.model.fit(x=X_train, y=_y_train_,
                                       validation_data=(X_val, _y_val_),
                                       callbacks=[es, mc], epochs=self.epochs)
                    else:
                        # the noise is re-sampled for every epoch, so early stopping
                        # and checkpointing are done by hand
                        self.model.fit(x=self._map_char_inputs(X_train, self.noise_function),
                                       y=_y_train_)

                        best_val_loss, best_epoch = float('inf'), 0
                        for epoch_idx in range(1, self.epochs + 1):
                            self.model.fit(x=self._map_char_inputs(X_train, self.noise_function),
                                           y=_y_train_)
                            val_loss = self.model.evaluate(x=X_val, y=_y_val_)
                            print('validation loss for epoch %d: %.3f' % (epoch_idx, val_loss))
                            if val_loss < best_val_loss:
                                best_epoch = epoch_idx
                                best_val_loss = val_loss
                                self.model.save_weights(weight_dir)

                            if epoch_idx - best_epoch >= self.patience:
                                break

                    # load back the best weights
                    self.model.load_weights(weight_dir)

                    _y_pred_val_score = self.model.predict(X_val).flatten()
                    _y_pred_test_score = self.model.predict(X_test).flatten()

                    prefix = self.experiment_dir + 'fold_%d_class_%d_' % (fold_idx, class_idx)

                    np.savetxt(prefix + 'pred_val.np', _y_pred_val_score)
                    np.savetxt(prefix + 'pred_test.np', _y_pred_test_score)

                    if self.predict_ens_test:
                        np.savetxt(prefix + 'pred_ensemble.np',
                                   self.model.predict(X_ensemble).flatten())
                        np.savetxt(prefix + 'pred_held_out.np',
                                   self.model.predict(X_held_out).flatten())

                    # threshold tuning on the validation samples
                    # that the previous model did not claim
                    best_t = self._tune_threshold(_y_val_, _y_pred_val_score, y_pred_val)

                    # predictions made only when predictions not made by the previous model
                    # and larger than the best threshold
                    # true for both val_pred and test_pred
                    self._assign_class(y_pred_val, _y_pred_val_score, best_t, class_idx)
                    self._assign_class(y_pred_test, _y_pred_test_score, best_t, class_idx)

                    # write how many time it takes for a run into the readme;
                    # start and end are both in seconds
                    duration = int(round(time.time() - start))
                    with open(self.experiment_dir + 'README', 'a') as readme:
                        readme.write('fold %d class %d takes %d seconds\n'
                                     % (fold_idx, class_idx, duration))

                # predict the rest as the "Other" class
                y_pred_test = [2 if pred is None else pred for pred in y_pred_test]
                y_pred_val = [2 if pred is None else pred for pred in y_pred_val]

            np.savetxt(self.experiment_dir + 'fold_%d_pred_val.np' % fold_idx, y_pred_val)
            np.savetxt(self.experiment_dir + 'fold_%d_pred_test.np' % fold_idx, y_pred_test)
            np.savetxt(self.experiment_dir + 'fold_%d_truth_val.np' % fold_idx, y_val)
            np.savetxt(self.experiment_dir + 'fold_%d_truth_test.np' % fold_idx, y_test)

            # append the result on this fold to results
            results.append(precision_recall_fscore_support(y_test, y_pred_test))

        # saving results
        results = np.array(results)
        np.savetxt(self.experiment_dir + 'result_by_fold.np', results.flatten())
        np.savetxt(self.experiment_dir + 'result_averaged.np', np.mean(results, axis=0))
        np.savetxt(self.experiment_dir + 'result_std.np', np.std(results, axis=0))

        # row 2 of the fold-averaged result holds the per-class F-scores
        avg_macro_f = np.mean(np.mean(results, axis=0)[2])
        with open(self.experiment_dir + 'README', 'a') as readme:
            readme.write('macro F-score: %.4f\n' % avg_macro_f)
Exemple #5
0
class Experiment:
    def __init__(self,
                 mode,
                 experiment_dir,
                 input_name2id2np=None,
                 adapt_train_vocab=False,
                 epochs=100,
                 patience=4,
                 min_epochs=2,
                 lr=0.003,
                 fold=5,
                 lambda_attn=4,
                 input_format='discrete',
                 by_fold=False,
                 use_generator=False,
                 word_dropper=None,
                 random_mask_option=None,
                 random_mask_prob=0.5,
                 use_rationale=False,
                 rationale_with_UNK=1,
                 comment=None,
                 elmo_representation_dir=None,
                 batch_size=32,
                 **kwargs):
        """
        an experiment class that runs cross validation
        designed to enable easy experiments with combinations of:
        1) context representation:
            handled by input_name2id2np
        2) pre-training methods:
            handled by pretrained_weight_dir in the kwargs argument
            None if there is no pretraining weight available
        Parameters
        ----------
        mode: one of 'standard', 'lstm_attention', 'cnn_attention'
        experiment_dir: name of the directory (under ../experiments/) where the experiment
                            weights and results will be saved; previous content is deleted
        input_name2id2np: context representation inputs; defaults to an empty dictionary
        adapt_train_vocab: under supervised training without pretraining,
                            some vocab will not be seen (twice) in the training set.
                            if set to True, then vocab occuring less than twice will be removed.
        epochs: number of epochs of training during cross validation
        patience: number of epochs allowable for not having any improvement on the validation set
        min_epochs: minimum number of epochs of training
        lr: learning rate for the model training
        fold: if by_fold is true, fold [0-4] means train/test on that single fold. if by_fold is false, fold[1-5] means
        train/test on the first fold number of folds.
        lambda_attn: the weight of the KL divergence loss for attention, only applicable if the model uses attention
        input_format: stored and also passed to the model as kwargs['input_format'];
                            when it is not 'discrete', elmo_representation_dir is stored as well
        by_fold: train/test on a single fold or several folds
        use_generator: whether the training data is provided as a generator
        word_dropper: stored on the instance; its string form is recorded in the README
        random_mask_option: one of 'individual', 'sentence', 'sentence_with_prob' or None
        random_mask_prob: stored together with random_mask_option
        use_rationale, rationale_with_UNK: only stored for the attention modes
        comment: an optional comment that will be appended to the README
        elmo_representation_dir: only stored when input_format is not 'discrete'
        batch_size: batch size used for training
        kwargs: arguments that will be passed to the data loader and to
                            initializing the neural network model
        """
        assert mode in ['standard', 'lstm_attention', 'cnn_attention']

        # creating the experiment dir
        # (check the last character, so an existing trailing slash is not doubled)
        if not experiment_dir.endswith('/'):
            experiment_dir += '/'
        experiment_dir = '../experiments/' + experiment_dir
        self.experiment_dir, self.kwargs = experiment_dir, kwargs
        if not os.path.exists(self.experiment_dir):
            subprocess.call(['mkdir', self.experiment_dir])

        self.by_fold = by_fold
        if self.by_fold:
            self.experiment_dir += 'fold_%d/' % fold
        # start from an empty directory and keep a snapshot of the source code with the results
        subprocess.call(['rm', '-rf', self.experiment_dir])
        subprocess.call(['mkdir', self.experiment_dir])
        subprocess.call(['cp', '-r', '../src', self.experiment_dir + 'src'])

        # parameters for training
        self.mode = mode
        self.adapt_train_vocab = adapt_train_vocab
        if input_name2id2np is None:
            input_name2id2np = {}
        self.input_name2id2np = input_name2id2np
        self.fold = fold
        self.dl = Data_loader(option='both', labeled_only=True, **kwargs)
        self.epochs, self.patience, self.min_epochs, self.lr = epochs, patience, min_epochs, lr
        # The former guard `input_name2id2np is not {}` compared identity with a
        # fresh literal and was therefore always true; the map is computed
        # unconditionally, which is what effectively happened before.
        self.kwargs['input_dim_map'] = extract_dim_input_name2id2np(
            self.input_name2id2np)
        self.kwargs['input_format'] = input_format
        self.input_format = input_format
        self.random_mask_option, self.random_mask_prob = random_mask_option, random_mask_prob
        # attention-specific settings exist only for the attention modes
        if self.mode != 'standard':
            self.lambda_attn = lambda_attn
            self.use_rationale = use_rationale
            self.rationale_with_UNK = rationale_with_UNK
        if self.input_format != 'discrete':
            self.elmo_representation_dir = elmo_representation_dir
        self.batch_size = batch_size
        self.use_generator = use_generator
        self.word_dropper = word_dropper

        # record experiment parameters (and the optional comment) in the README
        experiment_parameters = {
            'mode': mode,
            'input context features': self.input_name2id2np != {},
            'epochs': epochs,
            'patience': patience,
            'min_epochs': min_epochs,
            'lr': lr,
            'lambda_attn': lambda_attn,
            'input_format': input_format,
            'random_mask_option': random_mask_option,
            'use generator': use_generator,
            'word dropout': str(self.word_dropper),
            'random_mask_prob': random_mask_prob,
            'use rationale': use_rationale
        }
        with open(self.experiment_dir + 'README', 'a') as readme:
            readme.write(str(experiment_parameters) + '\n')
            if comment is not None:
                readme.write(comment + '\n')

    # fitting a binary classification model given the data
    def fit_model(self, tr, X_val, _y_val_, weight_dir):
        """
        Train a binary classifier according to self.mode and reload its best weights.

        After the call self.model holds the trained model; for the attention
        modes self.model_wrapper holds the training wrapper as well.

        Parameters
        ----------
        tr: the training data.
            (X_train, y_train) if self.use_generator is False,
            (training generator, number of steps per epoch) otherwise.
        X_val: validation inputs
        _y_val_: validation labels, must be one-dimensional
        weight_dir: file path where the best weights are saved and loaded back from

        Raises
        ------
        NotImplementedError: if mode is 'cnn_attention' and use_generator is True
        """
        assert len(_y_val_.shape) == 1
        assert self.random_mask_option in {
            'individual', 'sentence', 'sentence_with_prob', None
        }
        if not self.use_generator:
            X_train, _y_train_ = tr
        else:
            tr_generator, num_steps = tr

        from model_def import NN_architecture
        from keras.callbacks import EarlyStopping, ModelCheckpoint
        if self.mode == 'standard':
            # initialize a model
            architecture = NN_architecture(**self.kwargs)
            self.model = architecture.model
            with open(self.experiment_dir + 'README', 'a') as readme:
                for property_description in architecture.property:
                    readme.write(property_description + '\n')

            self.model.compile(optimizer='adam', loss='binary_crossentropy')

            # call backs
            es = EarlyStopping(patience=self.patience,
                               monitor='val_loss',
                               verbose=1)
            mc = ModelCheckpoint(weight_dir,
                                 save_best_only=True,
                                 save_weights_only=True)
            callbacks = [es, mc]
            if not self.use_generator:
                # fit for at least one epoch
                self.model.fit(x=X_train,
                               y=_y_train_,
                               validation_data=(X_val, _y_val_),
                               batch_size=self.batch_size)
                self.model.fit(x=X_train,
                               y=_y_train_,
                               batch_size=self.batch_size,
                               validation_data=(X_val, _y_val_),
                               callbacks=callbacks,
                               epochs=self.epochs)
            else:
                # warm-up epochs without early stopping
                self.model.fit_generator(tr_generator,
                                         steps_per_epoch=num_steps,
                                         validation_data=(X_val, _y_val_),
                                         epochs=self.min_epochs - 1)
                self.model.fit_generator(tr_generator,
                                         steps_per_epoch=num_steps,
                                         validation_data=(X_val, _y_val_),
                                         callbacks=callbacks,
                                         epochs=self.epochs)
            # load back the best model
            self.model.load_weights(weight_dir)

        elif self.mode == 'lstm_attention':
            import torch
            from LSTM_attn import Attn_LSTM
            from model import Model
            torch.set_num_threads(1)

            self.model = Attn_LSTM(**self.kwargs)
            with open(self.experiment_dir + 'README', 'a') as readme:
                for property_description in self.model.property:
                    readme.write(property_description + '\n')

            # only the way the training data is handed over differs
            # between the generator and the in-memory case
            if self.use_generator:
                train_source = {'tr_generator': tr_generator,
                                'num_steps': num_steps}
            else:
                train_source = {'train_X': X_train, 'train_y': _y_train_}
            self.model_wrapper = Model(
                self.model,
                mode='train',
                dev_X=X_val,
                dev_y=_y_val_,
                model_dir=weight_dir,
                output_dir=self.experiment_dir + 'README',
                num_epochs=self.epochs,
                patience=self.patience,
                min_epochs=self.min_epochs,
                lr=self.lr,
                lambda_attn=self.lambda_attn,
                use_rationale=self.use_rationale,
                rationale_with_UNK=self.rationale_with_UNK,
                **train_source)
            self.model_wrapper.train()

            # load back the best model
            self.model.load_state_dict(torch.load(weight_dir))

        elif self.mode == 'cnn_attention':
            if self.use_generator:
                # this branch needs the in-memory training arrays, which are
                # not available when a generator is supplied
                raise NotImplementedError(
                    "mode 'cnn_attention' does not support use_generator=True")

            import torch
            from CNN_attn import Attn_CNN
            from model import Model
            torch.set_num_threads(1)

            self.model = Attn_CNN(**self.kwargs)
            # NOTE(review): unlike the lstm_attention branch, the data is passed
            # positionally and neither `mode` nor `rationale_with_UNK` is given -
            # confirm that this still matches the signature of Model.
            self.model_wrapper = Model(self.model,
                                       X_train,
                                       _y_train_,
                                       X_val,
                                       _y_val_,
                                       model_dir=weight_dir,
                                       output_dir=self.experiment_dir +
                                       'README',
                                       num_epochs=self.epochs,
                                       patience=self.patience,
                                       min_epochs=self.min_epochs,
                                       lr=self.lr,
                                       use_rationale=self.use_rationale,
                                       lambda_attn=self.lambda_attn)
            self.model_wrapper.train()

            # load back the best model
            self.model.load_state_dict(torch.load(weight_dir))

    # running train, val, test experiment on a fold/split of data
    def experiment_with(self, data, fold_idx=-1):
        """Train, tune and evaluate the two binary classifiers on one split.

        For class index 0 (layer prefix 'aggression') and then 1 ('loss') a
        model is fit through ``self.fit_model`` and scored on the validation
        and test inputs, and a decision threshold is chosen on the validation
        set by F-score.  A tweet is assigned the first class whose score
        reaches that class's threshold; whatever is still unassigned after
        both passes is handed to ``make_remaining_predictions``.

        AUC, chosen thresholds, run time and the final macro F-score are
        appended to ``README`` under ``self.experiment_dir``, and every entry
        of the returned dictionary is also written there with ``np.savetxt``.

        Args:
            data: three collections of tweet dicts, in the order
                train, validation, test.
            fold_idx: number used in the log lines and output file names.

        Returns:
            dict with ``class_<i>_scores_val`` / ``class_<i>_scores_test`` for
            i in (0, 1), plus ``pred_val``, ``pred_test``, ``truth_val`` and
            ``truth_test``.
        """

        def log(line):
            # Append to the README, reopening it for every message exactly as
            # the inline ``with open(...)`` blocks did.
            with open(self.experiment_dir + 'README', 'a') as readme:
                readme.write(line)

        # NOTE(review): an earlier comment here said "y_train, y_val are
        # one-hot represented labels of dimension 3, y_test is one-dimensional
        # 0,1,2".  y_val and y_test both come from extract_y_from_tweet_dicts
        # below, so that description may be stale -- confirm.

        if self.mode == 'standard':
            # local import: keras is only touched in 'standard' mode
            import keras.backend as K
            K.clear_session()

        self.kwargs['input_dim_map'] = extract_dim_input_name2id2np(
            self.input_name2id2np)

        # Unpacking into three names also checks that data has three parts.
        _num_train, num_val, num_test = [
            len(tweet_dicts) for tweet_dicts in data
        ]
        y_val, y_test = [extract_y_from_tweet_dicts(data[i]) for i in [1, 2]]

        # None marks "no class assigned yet".
        y_pred_val, y_pred_test = [None] * num_val, [None] * num_test

        # all the results will be returned in a dictionary
        result_dict = {}

        for class_idx in range(2):
            # Layer names get a per-task prefix because the aggression and
            # loss models are trained separately for each fold.
            self.kwargs['prefix'] = 'aggression' if class_idx == 0 else 'loss'

            # time the training
            start = int(round(time.time()))  # seconds
            weight_dir = self.experiment_dir + str(fold_idx) + '_' + str(
                class_idx) + '.weight'

            (tr, (X_val, _y_val_),
             X_test) = self.prepare_for_clf(data, class_idx)
            self.fit_model(tr, X_val, _y_val_, weight_dir)

            if self.mode == 'standard':
                _y_pred_val_score = self.model.predict(X_val).flatten()
                _y_pred_test_score = self.model.predict(X_test).flatten()
            else:
                _y_pred_val_score = self.model_wrapper.predict(X_val)
                _y_pred_test_score = self.model_wrapper.predict(X_test)

            # save the prediction scores
            prefix = 'class_%d_scores_' % class_idx
            result_dict[prefix + 'val'] = _y_pred_val_score
            result_dict[prefix + 'test'] = _y_pred_test_score

            # Area under the ROC curve for "this class vs. everything else".
            # np.asarray keeps the comparison element-wise even when the
            # labels arrive as a plain list (list == int is a single bool).
            one_hot_y_test = [
                1 if b else 0 for b in (np.asarray(y_test) == class_idx)
            ]
            fpr, tpr, thresholds = metrics.roc_curve(one_hot_y_test,
                                                     _y_pred_test_score)
            auc_roc = metrics.auc(fpr, tpr)
            log('Fold %d class %d auc_roc: %.4f\n' %
                (fold_idx, class_idx, auc_roc))

            # Validation tweets that no earlier class has claimed.  This set
            # does not change while thresholds are being tried, so it is
            # computed once instead of once per candidate threshold.
            open_val_idx = [
                idx for idx in range(num_val) if y_pred_val[idx] is None
            ]

            # threshold tuning; the strict '>' keeps the lowest threshold
            # among ties
            best_t, best_f_val = 0, -1
            for t in np.arange(0.01, 1, 0.01):
                y_val_pred_ = [0] * num_val
                for idx in open_val_idx:
                    if _y_pred_val_score[idx] >= t:
                        y_val_pred_[idx] = 1
                f = f1_score(_y_val_, y_val_pred_)
                if f > best_f_val:
                    best_f_val = f
                    best_t = t

            log('fold %d class %d threshold best_t: %.2f\n' %
                (fold_idx, class_idx, best_t))

            # A prediction is made only where the previous model made none
            # and the score reaches the tuned threshold; same rule for the
            # validation and the test predictions.
            for idx in open_val_idx:
                if _y_pred_val_score[idx] >= best_t:
                    y_pred_val[idx] = class_idx

            for idx in range(num_test):
                if y_pred_test[idx] is None and _y_pred_test_score[
                        idx] >= best_t:
                    y_pred_test[idx] = class_idx

            end = int(round(time.time()))  # seconds

            # write how long this run took into the readme
            duration = (end - start) // 60  # minutes
            log('fold %d class %d takes %d minutes\n' %
                (fold_idx, class_idx, duration))

        # predict the rest as the "Other" class
        make_remaining_predictions(y_pred_test)
        make_remaining_predictions(y_pred_val)

        # put all the predictions and ground truth in the return dictionary
        result_dict['pred_val'] = y_pred_val
        result_dict['pred_test'] = y_pred_test
        result_dict['truth_val'] = y_val
        result_dict['truth_test'] = y_test

        macro_fscore = f1_score(y_test, y_pred_test, average='macro')
        log('Fold %d f score: %.3f\n' % (fold_idx, macro_fscore))

        # save the results of the experiment on this fold
        for key, result_arr in result_dict.items():
            np.savetxt(self.experiment_dir + 'fold_%d_%s.np' % (fold_idx, key),
                       result_arr)

        return result_dict

    def prepare_for_clf(self, data, class_idx):
        """Build the train/val/test inputs for one binary classification task.

        Everything is delegated to ``create_clf_data``; only the extra keyword
        arguments depend on ``self.mode`` and ``self.input_format``:

        * non-discrete input: ``pad_elmo`` is True in 'standard' mode and
          False otherwise, with ``elmo_dir=self.elmo_representation_dir``;
        * discrete input: ``batch_size`` and ``word_dropper`` come from
          ``self`` in 'standard' mode or when ``self.use_generator`` is set,
          and are both None otherwise.

        Returns:
            ``(tr, val, X_test)``; the second element of the test pair that
            ``create_clf_data`` returns is dropped.
        """
        # Carried over from the original notes: for the Keras CNN, ELMo
        # representations must all be padded to exactly the same shape.
        # pad_elmo=True makes create_clf_data produce data padded to max
        # length; the padding itself is done by create_data in generator_util.
        standard_mode = self.mode == 'standard'

        if self.input_format != 'discrete':
            extra = {
                'pad_elmo': bool(standard_mode),
                'elmo_dir': self.elmo_representation_dir,
            }
        elif standard_mode or self.use_generator:
            extra = {
                'batch_size': self.batch_size,
                'word_dropper': self.word_dropper,
            }
        else:
            # LSTM Attention, discrete input, no generator
            extra = {'batch_size': None, 'word_dropper': None}

        tr, val, (X_test, _) = create_clf_data(self.input_name2id2np,
                                               data,
                                               input_format=self.input_format,
                                               class_idx=class_idx,
                                               **extra)

        # Disabled step kept for reference (it was an inert string literal):
        #   if no pretrained weights, adapt the vocabulary so that words
        #   appearing in X_train less than twice are not counted:
        #       if self.adapt_train_vocab:
        #           adapt_vocab(X_train, (X_val, X_test))

        return (tr, val, X_test)

    # cross validation
    # write all results to the directory
    # see read_results for retrieving the performance
    def cv(self):
        """Run cross validation and write the scores to ``self.experiment_dir``.

        When ``self.by_fold`` is ``False`` every fold in ``range(self.fold)``
        is run; per-fold, averaged and standard-deviation results are saved
        with ``np.savetxt`` and the averaged macro F-score is appended to the
        README.  Otherwise ``self.fold`` is used as the index of the single
        fold to run and only that fold's macro F-score is appended.
        """
        if self.by_fold is not False:
            # train a single fold
            print('cross validation fold %d.' % (self.fold))

            # the fold data holds everything needed for train, val and test
            single_fold = self.dl.cv_data(self.fold)
            outcome = self.experiment_with(single_fold, fold_idx=self.fold)
            scores = precision_recall_fscore_support(outcome['truth_test'],
                                                     outcome['pred_test'])
            with open(self.experiment_dir + 'README', 'a') as readme:
                readme.write('fold macro F-score: %.4f\n' % np.mean(scores[2]))
            return

        per_fold = []
        for fold_idx in range(self.fold):
            print('cross validation fold %d.' % (fold_idx + 1))

            # retrieve the cross validation data for this fold and run the
            # full train/val/test experiment on it
            outcome = self.experiment_with(self.dl.cv_data(fold_idx),
                                           fold_idx=fold_idx)

            # keep this fold's precision / recall / F-score / support
            scores = precision_recall_fscore_support(outcome['truth_test'],
                                                     outcome['pred_test'])
            per_fold.append(scores)

        # saving results
        out_dir = self.experiment_dir
        stacked = np.array(per_fold)
        np.savetxt(out_dir + 'result_by_fold.np', stacked.flatten())

        fold_mean = np.mean(stacked, axis=0)
        np.savetxt(out_dir + 'result_averaged.np', fold_mean)
        np.savetxt(out_dir + 'result_std.np', np.std(stacked, axis=0))

        # index 2 is the F-score entry of precision_recall_fscore_support
        avg_macro_f = np.mean(fold_mean[2])
        with open(out_dir + 'README', 'a') as readme:
            readme.write('macro F-score: %.4f\n' % avg_macro_f)

    # examine whether data generalize across time well
    def examine_time_effect(self, group_by_label):
        """Run one experiment on the split produced by ``self.dl.data_by_time``.

        Args:
            group_by_label: forwarded unchanged to ``data_by_time`` as the
                keyword argument of the same name.

        Returns:
            Whatever ``self.experiment_with`` returns for that split (called
            with its default ``fold_idx``).
        """
        return self.experiment_with(
            self.dl.data_by_time(group_by_label=group_by_label))