class Experiment:
    """Cross-validation harness for a sequence classifier over a corpus.

    Two evaluation protocols are provided:

    * ``cv`` splits at the level of individual labelled sentences: each
      labelled ``(seq_idx, sent_idx)`` position is assigned to one fold, the
      fold's labels are masked out for training, and predictions are scored
      on the masked positions only.
    * ``_cv`` splits at the level of whole sequences: each round draws a fresh
      random 90/10 train/test split of the sequences.

    Attributes set by ``__init__``:
        corpus: the loaded ``Corpus`` object.
        X_seqs, y_seqs: per-sequence features and labels taken from the
            corpus; ``y_seqs[i][j]`` may be ``None`` for unlabelled positions.
        seq_count: number of sequences.
        feature_dim: length of the first feature vector of the first sequence.
    """

    # Number of repeated fits used when the caller does not specify one.
    DEFAULT_NUM_REPEATS = 10

    def __init__(self, verbose=False):
        """Load the corpus and validate that features and labels line up.

        Args:
            verbose: forwarded to ``Corpus``.

        Raises:
            ValueError: if the number of label sequences differs from the
                number of feature sequences, or if any sequence has a
                different number of feature rows and labels.
        """
        print('Loading corpus ...')
        self.corpus = Corpus(verbose=verbose)
        self.corpus.create_data()
        self.X_seqs, self.y_seqs = self.corpus.X_seqs, self.corpus.y_seqs
        self.seq_count = len(self.X_seqs)
        # Explicit checks instead of ``assert`` so validation survives
        # ``python -O``.
        if len(self.y_seqs) != self.seq_count:
            raise ValueError(
                'X_seqs has %d sequences but y_seqs has %d.' %
                (self.seq_count, len(self.y_seqs)))
        for seq_idx in range(self.seq_count):
            n_rows = len(self.X_seqs[seq_idx])
            n_labels = len(self.y_seqs[seq_idx])
            if n_rows != n_labels:
                raise ValueError(
                    'Sequence %d has %d feature rows but %d labels.' %
                    (seq_idx, n_rows, n_labels))
        # NOTE: requires a non-empty corpus whose first sequence is non-empty.
        self.feature_dim = len(self.X_seqs[0][0])

    # ==================== cross validation by sentence ====================
    def cv(self, fold):
        """Run sentence-level cross validation.

        Args:
            fold: number of folds.

        Returns:
            A list with one entry per fold, each being the value returned by
            ``train_test`` for that fold.
        """
        fold_inds = self.create_train_test_split(fold)
        results = []
        for fold_idx in range(fold):
            print('Cross validating fold %d.' % fold_idx)
            results.append(self.train_test(fold_inds[fold_idx]))
        return results

    def create_train_test_split(self, fold):
        """Randomly partition all labelled positions into ``fold`` folds.

        Each data point is addressed by its ``(seq_idx, sent_idx)``
        coordinate; positions whose label is ``None`` are excluded.

        Args:
            fold: number of folds.

        Returns:
            A list of ``fold`` lists of ``(seq_idx, sent_idx)`` tuples. The
            folds are disjoint and together cover every labelled position.
        """
        labelled = [
            (seq_idx, sent_idx)
            for seq_idx, y_seq in enumerate(self.y_seqs)
            for sent_idx, label in enumerate(y_seq)
            if label is not None
        ]

        # Shuffle so that fold membership is random.
        random.shuffle(labelled)

        # Integer arithmetic keeps the cut points exact (no float rounding).
        total = len(labelled)
        cutoffs = [total * fold_idx // fold for fold_idx in range(fold)]
        cutoffs.append(total)
        return [
            labelled[cutoffs[fold_idx]:cutoffs[fold_idx + 1]]
            for fold_idx in range(fold)
        ]

    def train_test(self, fold_ind, num_repeats=None):
        """Fit and score the classifier on one fold, averaged over repeats.

        The labels at ``fold_ind`` are masked out of the training labels via
        ``mask_y_seqs``; the classifier is fitted on all sequences with the
        masked labels, and its predictions are scored at ``fold_ind`` only.

        Args:
            fold_ind: list of ``(seq_idx, sent_idx)`` test positions.
            num_repeats: how many times to re-fit and score. When omitted, a
                module-level ``num_repeats`` is used if one is defined
                (the behaviour the original code relied on), otherwise
                ``DEFAULT_NUM_REPEATS``.

        Returns:
            The result of ``average_dictionary`` over the per-repeat dicts
            with keys ``'macro_f1'``, ``'weighted_f1'`` and ``'accuracy'``.
        """
        if num_repeats is None:
            # Previously this name was read as an undeclared global, which
            # raises NameError when the module does not define it.
            num_repeats = globals().get('num_repeats',
                                        self.DEFAULT_NUM_REPEATS)

        # Training labels with the test positions taken out.
        train_y_seqs = mask_y_seqs(self.y_seqs, fold_ind)
        results = []
        for _ in range(num_repeats):
            self.clf = Seqs(self.feature_dim, order=2)
            self.clf.fit(self.X_seqs, train_y_seqs)
            pred_y_seqs = self.clf.predict(self.X_seqs, train_y_seqs)
            # Compare gold and predicted labels at the held-out positions.
            y_test = extract_labels(self.y_seqs, fold_ind)
            y_pred = extract_labels(pred_y_seqs, fold_ind)
            results.append({
                'macro_f1': f1_score(y_test, y_pred, average='macro'),
                'weighted_f1': f1_score(y_test, y_pred, average='weighted'),
                'accuracy': accuracy_score(y_test, y_pred),
            })
        return average_dictionary(results)

    def create_unstructured_data(self):
        """Flatten the sequences into parallel lists of labelled examples.

        Returns:
            A tuple ``(X, y)`` where ``X`` holds the feature rows and ``y``
            the labels of every position whose label is not ``None``, in
            sequence order.
        """
        X, y = [], []
        for X_seq, y_seq in zip(self.X_seqs, self.y_seqs):
            for features, label in zip(X_seq, y_seq):
                if label is not None:
                    X.append(features)
                    y.append(label)
        return X, y

    # ==================== cross validation by sequence ====================
    def _cv(self, fold):
        """Run ``fold`` rounds of sequence-level evaluation.

        Each round draws a fresh random 90/10 split (see
        ``_create_train_test_split``), so test sets of different rounds may
        overlap; this is repeated random sub-sampling rather than a strict
        k-fold partition.

        Args:
            fold: number of rounds.

        Returns:
            A flat list of all per-repeat results from every round.
        """
        results_agg = []
        for fold_idx in range(fold):
            print('cross validation for %d fold.' % (fold_idx + 1))
            # Fixed: this used to call the non-existent ``experiment_once``.
            results_agg += self._experiment_once()
        return results_agg

    def _experiment_once(self):
        """Draw one train/test split and evaluate it repeatedly.

        Returns:
            A list of ``DEFAULT_NUM_REPEATS`` results from ``_train_test``.
        """
        self._create_train_test_split()
        return [self._train_test() for _ in range(self.DEFAULT_NUM_REPEATS)]

    def _create_train_test_split(self):
        """Randomly split whole sequences 90/10 into train and test sets.

        Sets ``train_size``, ``train_ind``, ``test_ind``, ``X_seqs_train``,
        ``y_seqs_train``, ``X_seqs_test`` and ``y_seqs_test`` on ``self``.
        """
        shuffle_order = list(range(len(self.X_seqs)))
        random.shuffle(shuffle_order)
        self.train_size = int(0.9 * self.seq_count)
        self.train_ind = shuffle_order[:self.train_size]
        self.test_ind = shuffle_order[self.train_size:]
        self.X_seqs_train = [self.X_seqs[idx] for idx in self.train_ind]
        self.y_seqs_train = [self.y_seqs[idx] for idx in self.train_ind]
        self.X_seqs_test = [self.X_seqs[idx] for idx in self.test_ind]
        self.y_seqs_test = [self.y_seqs[idx] for idx in self.test_ind]

    def _train_test(self):
        """Fit a fresh classifier on the current split and evaluate it.

        Requires ``_create_train_test_split`` to have been called first.

        Returns:
            Whatever ``Seqs.evaluate`` returns for the test sequences.
        """
        self.clf = Seqs(self.feature_dim, order=2)
        self.clf.fit(self.X_seqs_train, self.y_seqs_train)
        return self.clf.evaluate(self.X_seqs_test, self.y_seqs_test)