    def __init__(self, opts):
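        """Build padded train/test tensors for the POS tagging / supertagging models.

        Depending on opts, this reads the POS or supertag data directories,
        tokenizes words plus capitalization, digit and suffix features,
        tokenizes the tag files, optionally prepares 10-fold jackknife splits,
        and loads word embeddings (random, word2vec, or a GloVe-style text file).
        """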

        if opts.task == 'POS_models':
            data_dir = 'data/pos_data'
            self.kfold = False
        elif opts.jackknife:
            data_dir = 'data/super_data'
            jk_data_dir = 'data/pos_data'
            path_to_k_fold = os.path.join(jk_data_dir, 'train_y.txt')
            path_to_k_fold_test = os.path.join(jk_data_dir, 'test_y.txt')
            self.kfold = True

        else:
            data_dir = 'data/super_data'
            self.kfold = False

        path_to_text = os.path.join(data_dir, 'train_x.txt')
        path_to_text_test = os.path.join(data_dir, 'test_x.txt')
        path_to_POS = os.path.join(data_dir, 'train_y.txt')
        path_to_POS_test = os.path.join(data_dir, 'test_y.txt')

        self.MAX_NB_WORDS = 200000000000  # effectively no cap on the vocabulary size

        # first, build index mapping words in the embeddings set
        # to their embedding vector

        f_train = open(path_to_text)
        f_test = open(path_to_text_test)

        texts = f_train.readlines()
        nb_train_samples = len(texts)
        self.nb_train_samples = nb_train_samples
        texts = texts + f_test.readlines()

        f_train.close()
        f_test.close()

        print('length', len(texts))

        # finally, vectorize the text samples into a 2D integer tensor
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)
        indicator = tokenizer.cap_indicator(texts)
        num_indicator = tokenizer.num_indicator(texts)
        suffix = tokenizer.suffix_extract(texts)
        suffix_tokenizer = Tokenizer()
        suffix_tokenizer.fit_on_texts(suffix, non_split=True)

        suffix_sequences = suffix_tokenizer.texts_to_sequences(suffix,
                                                               non_split=True)
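        # sequences, indicator, num_indicator and suffix_sequences are parallel
        # per-token feature lists: word ids, capitalization flags, digit flags
        # and suffix ids for the same tokens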
        # debugging
        #        for i in xrange(len(sequences)):
        #            assert len(sequences[i]) == len(suffix_sequences[i])

        word_index = tokenizer.word_index
        self.word_index = word_index

        suffix_index = suffix_tokenizer.word_index
        print('Found %s unique words.' % len(word_index))
        data = pad_sequences(sequences, opts, True)
        suffix_data = pad_sequences(suffix_sequences, opts)
        cap_indicator = pad_sequences(indicator, opts)
        num_indicator = pad_sequences(num_indicator, opts)
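        # after padding, each of these is a (num_sentences, seq_length) integer
        # matrix with the training rows first and the test rows last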

        f_train = open(path_to_POS)
        f_test = open(path_to_POS_test)
        texts = f_train.readlines() + f_test.readlines()
        f_train.close()
        f_test.close()
        lab_tokenizer = Tokenizer()
        lab_tokenizer.fit_on_texts(texts)
        lab_sequences = lab_tokenizer.texts_to_sequences(texts)
        tag_index = lab_tokenizer.word_index
        self.tag_index = tag_index
        self.tag_size = len(tag_index)
        print('Found %s unique tags.' % len(tag_index))
        labels = pad_sequences(lab_sequences, opts)
        # labels = np.expand_dims(labels, -1)  # not needed for TensorFlow

        if opts.jackknife:

            f_train = open(path_to_k_fold)
            f_test = open(path_to_k_fold_test)
            texts = f_train.readlines() + f_test.readlines()
            f_train.close()
            f_test.close()
            jk_tokenizer = Tokenizer()
            jk_tokenizer.fit_on_texts(texts)
            jk_sequences = jk_tokenizer.texts_to_sequences(texts)
            jk_index = jk_tokenizer.word_index
            self.jk_index = jk_index
            self.jk_size = len(jk_index)
            print('Found %s unique jackknife tags.' % len(jk_index))
            jk_labels = pad_sequences(jk_sequences, opts)
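            # jk_labels rows follow the same train-then-test order as data, so
            # the same shuffled indices can be applied to them below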

        indices = np.arange(nb_train_samples)
        np.random.shuffle(indices)

        nb_validation_samples = data.shape[0] - nb_train_samples
        self.nb_validation_samples = nb_validation_samples
        ### allocate zero matrices first, then fill in the train/test splits

        seq_length = labels.shape[1]
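        # attention variants 100-103 need the row counts padded up to the next
        # multiple of 10 (always adding one extra group of 10); the surplus
        # rows remain zero padding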
        if opts.attention in [100, 101, 102, 103]:
            self.nb_train_added = nb_train_samples // 10 * 10 + 10
            self.nb_validation_added = nb_validation_samples // 10 * 10 + 10
        else:
            self.nb_train_added = nb_train_samples
            self.nb_validation_added = nb_validation_samples

        self.X_train = np.zeros([self.nb_train_added, seq_length])

        self.X_train[:nb_train_samples] = \
            data[:-nb_validation_samples][indices]
        if opts.jackknife:
            self.jk_labels = np.zeros([self.nb_train_added, seq_length])
            self.jk_labels[:nb_train_samples] = jk_labels[indices]
            self.jk_labels_test = np.zeros(
                [self.nb_validation_added, seq_length])
            self.jk_labels_test[:nb_validation_samples] = jk_labels[
                -nb_validation_samples:]

        self.train_cap_indicator = np.zeros([self.nb_train_added, seq_length])
        self.train_cap_indicator[:nb_train_samples] = \
            cap_indicator[:-nb_validation_samples][indices]
        self.train_num_indicator = np.zeros([self.nb_train_added, seq_length])
        self.train_num_indicator[:nb_train_samples] = \
            num_indicator[:-nb_validation_samples][indices]
        self.suffix_train = np.zeros([self.nb_train_added, seq_length])
        self.suffix_train[:nb_train_samples] = \
            suffix_data[:-nb_validation_samples][indices]
        self.y_train = np.zeros([self.nb_train_added, seq_length])
        self.y_train[:nb_train_samples] = \
            labels[:-nb_validation_samples][indices]
        if opts.joint:
            self.pos_train = self.jk_labels
        self.X_test = np.zeros([self.nb_validation_added, seq_length])
        self.X_test[:nb_validation_samples] = data[-nb_validation_samples:]
        self.test_cap_indicator = np.zeros(
            [self.nb_validation_added, seq_length])
        self.test_cap_indicator[:nb_validation_samples] = cap_indicator[
            -nb_validation_samples:]
        self.test_num_indicator = np.zeros(
            [self.nb_validation_added, seq_length])
        self.test_num_indicator[:nb_validation_samples] = num_indicator[
            -nb_validation_samples:]
        self.suffix_test = np.zeros([self.nb_validation_added, seq_length])

        self.suffix_test[:nb_validation_samples] = suffix_data[
            -nb_validation_samples:]
        self.y_test = np.zeros([self.nb_validation_added, seq_length])
        self.y_test[:nb_validation_samples] = labels[-nb_validation_samples:]
        if opts.joint:
            self.pos_test = self.jk_labels_test

        if opts.jackknife:
            K = 10
            #k_fold_samples = nb_train_samples//K*K
            samples_per_group = (nb_train_samples // K) + 1
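            # ceil-style fold size: each fold gets up to samples_per_group rows,
            # so the last fold may be shorter and may contain zero-padded rows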

            print('splitting into {} folds'.format(K))

            ## don't get rid of the remainders. We will save all of them

            #            self.X_train = self.X_train[:k_fold_samples] # get rid of the remaining examples for kfold
            #            self.train_cap_indicator = self.train_cap_indicator[:k_fold_samples]
            #            self.train_num_indicator = self.train_num_indicator[:k_fold_samples]
            #            self.suffix_train = self.suffix_train[:k_fold_samples]
            #
            #            self.X_train_k_fold = np.split(self.X_train[:k_fold_samples], K)
            #            self.train_cap_indicator_k_fold = np.split(self.train_cap_indicator[:k_fold_samples], K)
            #            self.train_num_indicator_k_fold = np.split(self.train_num_indicator[:k_fold_samples], K)
            #            self.suffix_train_k_fold = np.split(self.suffix_train[:k_fold_samples], K)
            #            self.y_train_k_fold = np.split(self.jk_labels[:k_fold_samples], K)
            #
            # adding everything back

            self.X_train_k_fold = []
            self.train_cap_indicator_k_fold = []
            self.train_num_indicator_k_fold = []
            self.suffix_train_k_fold = []
            self.y_train_k_fold = []

            for k in range(K):
                start = samples_per_group * k
                end = samples_per_group * (k + 1)
                self.X_train_k_fold.append(self.X_train[start:end])
                self.train_cap_indicator_k_fold.append(
                    self.train_cap_indicator[start:end])
                self.train_num_indicator_k_fold.append(
                    self.train_num_indicator[start:end])
                self.suffix_train_k_fold.append(self.suffix_train[start:end])
                self.y_train_k_fold.append(self.jk_labels[start:end])
                #if opts.joint:
                #    self.pos_train = self.pos_train[:k_fold_samples]

            print('end splitting')

        self.nb_suffix = len(suffix_index)
        self.suffix_embedding_mat = np.random.randn(self.nb_suffix + 1, 10)
        self.nb_words = min(self.MAX_NB_WORDS, len(word_index))
        ## cond entropy
        self.cond_matrix = np.ones((self.nb_words + 1, 1))
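        # cond_matrix starts at 1 for every word id; entries for the "certain"
        # words listed in certain.pkl are zeroed out below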
        with open('certain.pkl', 'rb') as fhand:  # pickle files should be read in binary mode
            certain = pickle.load(fhand)
        self.certain_words = []
        for certain_word in certain:
            self.certain_words.append(self.word_index[certain_word])
        for certain_word in self.certain_words:
            self.cond_matrix[certain_word] = 0.0
        ### cond entropy ends
        if opts.embedding_name == 'random':
            np.random.seed(opts.seed)
            self.embedding_matrix = np.random.uniform(
                -2, 2, size=(self.nb_words + 1, opts.embedding_dim))
        elif opts.embedding_name == 'word2vec':
            if not opts.embedding_dim == 300:  # word2vec is of 300 dim
                sys.exit('error in dim')
            filename = os.path.join('../word2vec',
                                    'GoogleNews-vectors-negative300.bin')

            import gensim

            self.embedding_matrix = np.zeros(
                (self.nb_words + 1, opts.embedding_dim))
            self.word2vec_model = gensim.models.word2vec.Word2Vec.load_word2vec_format(
                filename, binary=True)
            print('Found %s word vectors.' % len(self.word2vec_model.vocab))
            for word, i in word_index.items():
                if i <= self.MAX_NB_WORDS and word in self.word2vec_model.vocab:
                    self.embedding_matrix[i] = self.word2vec_model[word]
        else:
            self.embeddings_index = {}
            print('Indexing word vectors.')
            f = open(opts.embedding_name)
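            # each line is expected to hold a word followed by its
            # embedding_dim float components (GloVe-style text format)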
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                self.embeddings_index[word] = coefs
            f.close()

            print('Found %s word vectors.' % len(self.embeddings_index))

            self.embedding_matrix = np.zeros(
                (self.nb_words + 1, opts.embedding_dim))
            for word, i in word_index.items():
                if i > self.MAX_NB_WORDS:
                    continue
                embedding_vector = self.embeddings_index.get(word)
                if embedding_vector is not None:
                    # words not found in embedding index will be all-zeros.
                    if not self.embedding_matrix.shape[1] == len(
                            embedding_vector):
                        sys.exit('error in dim')

                    self.embedding_matrix[i] = embedding_vector

        # load pre-trained word embeddings into an Embedding layer

        self._index_in_epoch = 0
        self._num_examples = self.X_train.shape[0]
        self._num_test_examples = self.X_test.shape[0]
        self._epoch_completed = 0
        self._index_in_test = 0
        if opts.jackknife:
            #            self._num_hold_in_examples = self.X_train_k_fold[0].shape[0]*(K-1)
            #            self._num_hold_out_examples = self.X_train_k_fold[0].shape[0]
            self.k = 0
        self.opts = opts

    def __init__(self, opts, test_opts=None):
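        """Index words, suffixes, capitalization/digit indicators, jackknife tags,
        characters and supertags from the training and test files given in opts.

        Also builds a GloVe-backed word embedding matrix that covers words seen
        only in the embedding file, and stores padded (and, for training,
        shuffled) input dictionaries in self.inputs_train / self.inputs_test.
        """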
        path_to_text = opts.text_train
        path_to_tag = opts.tag_train
        path_to_jk = opts.jk_train
        if test_opts is None:
            path_to_text_test = opts.text_test
            path_to_tag_test = opts.tag_test
            path_to_jk_test = opts.jk_test
        else:
            path_to_text_test = test_opts.text_test
            path_to_tag_test = test_opts.tag_test
            path_to_jk_test = test_opts.jk_test

        self.inputs_train = {}
        self.inputs_test = {}

        ## indexing sents files
        f_train = io.open(path_to_text, encoding='utf-8')
        texts = f_train.readlines()
        self.nb_train_samples = len(texts)
        f_train.close()
        tokenizer = Tokenizer(lower=True)
        tokenizer.fit_on_texts(texts)
        #print(tokenizer.word_index['-unseen-'])
        self.word_index = tokenizer.word_index
        sorted_freqs = tokenizer.sorted_freqs
        self.nb_words = len(self.word_index)
        print('Found {} unique lowercased words including -unseen-.'.format(self.nb_words))

        # lookup the glove word embeddings
        # need to reserve indices for testing file. 
        glove_size = opts.embedding_dim
        self.embeddings_index = {}
        print('Indexing word vectors.')
        #f = open('glovevector/glove.6B.{}d.txt'.format(glove_size))
        f = io.open(opts.word_embeddings_file, encoding='utf-8')
        for line in f:
            values = line.strip().split(' ')
            if len(values) == opts.embedding_dim+1:
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                self.embeddings_index[word] = coefs
        f.close()

        print('Found {} word vectors.'.format(len(self.embeddings_index)))

        unseens = list(set(self.embeddings_index.keys()) - set(self.word_index.keys())) ## list of words that appear in glove but not in the training set
        nb_unseens = len(unseens)
        print('Found {} words not in the training set'.format(nb_unseens))

        self.word_embeddings = np.zeros((self.nb_words+1+nb_unseens, glove_size)) ## +1 for padding (idx 0)

        ## Get Frequencies for Adversarial Training (Yasunaga et al. 2017)
        self.word_freqs = np.zeros([self.nb_words+1+nb_unseens])
        self.word_freqs[1:self.nb_words] = sorted_freqs ## Skip Zero Padding (Index 0)
        self.word_freqs = self.word_freqs.astype(np.float32)
        self.word_freqs = self.word_freqs/np.sum(self.word_freqs)
        for word, i in self.word_index.items(): ## first index the words in the training set
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None: ## otherwise zero vector
                self.word_embeddings[i] = embedding_vector
        for unseen in unseens:
            self.word_index[unseen] = len(self.word_index) + 1 ## add unseen words to the word_index dictionary
            self.word_embeddings[self.word_index[unseen]] = self.embeddings_index[unseen]
        self.idx_to_word = invert_dict(self.word_index)
        print('end glove indexing')
        f_test = io.open(path_to_text_test, encoding='utf-8')
        texts = texts +  f_test.readlines()
        self.nb_validation_samples = len(texts) - self.nb_train_samples
        f_test.close()
        text_sequences = tokenizer.texts_to_sequences(texts)
        #print(map(lambda x: self.idx_to_word[x], text_sequences[self.nb_train_samples]))
        self.inputs_train['words'] = text_sequences[:self.nb_train_samples]
        self.inputs_test['words'] = text_sequences[self.nb_train_samples:]
        ## indexing sents files ends
        ## indexing suffixes 
        if opts.suffix_dim > 0:
            suffix = tokenizer.suffix_extract(texts)
            suffix_tokenizer = Tokenizer()
            suffix_tokenizer.fit_on_texts(suffix[:self.nb_train_samples], non_split=True)
            self.suffix_index = suffix_tokenizer.word_index
            self.nb_suffixes = len(self.suffix_index)
            sorted_freqs = suffix_tokenizer.sorted_freqs
            self.suffix_freqs = np.zeros([self.nb_suffixes+1]).astype(np.float32) ## +1 for zero padding
            self.suffix_freqs[1:self.nb_suffixes] = sorted_freqs ## Skip Zero Padding (Index 0)
            self.suffix_freqs = self.suffix_freqs/np.sum(self.suffix_freqs)
            self.idx_to_suffix = invert_dict(self.suffix_index)
            print('Found {} unique suffixes including -unseen-.'.format(self.nb_suffixes))
            suffix_sequences = suffix_tokenizer.texts_to_sequences(suffix, non_split=True)
            #print(map(lambda x: self.idx_to_suffix[x], suffix_sequences[self.nb_train_samples]))
            self.inputs_train['suffix'] = suffix_sequences[:self.nb_train_samples]
            self.inputs_test['suffix'] = suffix_sequences[self.nb_train_samples:]
            ## indexing suffixes ends
        ## indexing capitalization 
        if opts.cap:
            cap_sequences = tokenizer.cap_indicator(texts)
            #print(cap_sequences[self.nb_train_samples])
            self.inputs_train['cap'] = cap_sequences[:self.nb_train_samples]
            self.inputs_test['cap'] = cap_sequences[self.nb_train_samples:]
            ## indexing capitalization ends
            ## indexing numbers
        if opts.num:
            num_sequences = tokenizer.num_indicator(texts)
            #print(num_sequences[self.nb_train_samples])
            self.inputs_train['num'] = num_sequences[:self.nb_train_samples]
            self.inputs_test['num'] = num_sequences[self.nb_train_samples:]
            ## indexing numbers ends
        ## indexing jackknife files
        if opts.jk_dim > 0:
            f_train = io.open(path_to_jk, encoding='utf-8')
            texts = f_train.readlines()
            f_train.close()
            tokenizer = Tokenizer(lower=False) 
            tokenizer.fit_on_texts(texts)
            self.jk_index = tokenizer.word_index
            self.nb_jk = len(self.jk_index)
            sorted_freqs = tokenizer.sorted_freqs
            self.jk_freqs = np.zeros([self.nb_jk+1]).astype(np.float32) ## +1 for zero padding
            self.jk_freqs[1:self.nb_jk] = sorted_freqs ## Skip Zero Padding (Index 0)
            self.jk_freqs = self.jk_freqs/np.sum(self.jk_freqs)
            self.idx_to_jk = invert_dict(self.jk_index)
            print('Found {} unique tags including -unseen-.'.format(self.nb_jk))
            f_test = io.open(path_to_jk_test, encoding='utf-8')
            texts = texts + f_test.readlines() ## do not lowercase tCO
            f_test.close()
            jk_sequences = tokenizer.texts_to_sequences(texts)
            #print(map(lambda x: self.idx_to_jk[x], jk_sequences[self.nb_train_samples]))
            self.inputs_train['jk'] = jk_sequences[:self.nb_train_samples]
            self.inputs_test['jk'] = jk_sequences[self.nb_train_samples:]
            ## indexing jackknife files ends
        ## indexing char files
        if opts.chars_dim > 0:
            f_train = io.open(path_to_text, encoding='utf-8')
            texts = f_train.readlines()
            f_train.close()
            tokenizer = Tokenizer(lower=False,char_encoding=True) 
            tokenizer.fit_on_texts(texts)
            self.char_index = tokenizer.word_index
            self.nb_chars = len(self.char_index)
            sorted_freqs = tokenizer.sorted_freqs
            self.char_freqs = np.zeros([self.nb_chars+1]).astype(np.float32) ## +1 for zero padding
            self.char_freqs[1:self.nb_chars] = sorted_freqs ## Skip Zero Padding (Index 0)
            self.char_freqs = self.char_freqs/np.sum(self.char_freqs)
            self.idx_to_char = invert_dict(self.char_index)
            print('Found {} unique characters including -unseen-.'.format(self.nb_chars))
            f_test = io.open(path_to_text_test, encoding='utf-8')
            texts = texts + f_test.readlines() ## do not lowercase tCO
            f_test.close()
            char_sequences = tokenizer.texts_to_sequences(texts)
            #print(map(lambda x: self.idx_to_char[x], char_sequences[self.nb_train_samples]))
            self.inputs_train['chars'] = char_sequences[:self.nb_train_samples]
            self.inputs_test['chars'] = char_sequences[self.nb_train_samples:]
            ## indexing char files ends
        ## indexing stag files
        f_train = open(path_to_tag)
        texts = f_train.readlines()
        f_train.close()
        tokenizer = Tokenizer(lower=False) ## for tCO
        tokenizer.fit_on_texts(texts, zero_padding=False)
        #print(tokenizer.word_index['-unseen-'])
        self.tag_index = tokenizer.word_index
        self.nb_tags = len(self.tag_index)
        self.idx_to_tag = invert_dict(self.tag_index)
        print('Found {} unique tags including -unseen-.'.format(self.nb_tags))
        f_test = open(path_to_tag_test)
        texts = texts + f_test.readlines() ## do not lowercase tCO
        f_test.close()
        tag_sequences = tokenizer.texts_to_sequences(texts)
        #print(map(lambda x: self.idx_to_tag[x], tag_sequences[self.nb_train_samples+8]))
        self.inputs_train['tags'] = tag_sequences[:self.nb_train_samples]
        self.inputs_test['tags'] = tag_sequences[self.nb_train_samples:]

        ## indexing stag files ends
        self.test_gold = np.hstack(tag_sequences[self.nb_train_samples:]) ## for calculation of accuracy
        ## padding the train inputs and test inputs
        #self.inputs_train = [pad_sequences(x) for x in self.inputs_train]
        self.inputs_train = {key: pad_sequences(x, key) for key, x in self.inputs_train.items()}
        random.seed(0)
        perm = np.arange(self.nb_train_samples)
        random.shuffle(perm)
        self.inputs_train = {key: x[perm] for key, x in self.inputs_train.items()}
        #self.inputs_train = [x[perm] for x in self.inputs_train]

        #self.inputs_test = [pad_sequences(x) for x in self.inputs_test]
        self.inputs_test = {key: pad_sequences(x, key) for key, x in self.inputs_test.items()}
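        ## training inputs were shuffled above with a fixed seed; test inputs keep
        ## their original sentence order so predictions line up with self.test_gold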

        ## setting the current indices
        self._index_in_epoch = 0
        self._epoch_completed = 0
        self._index_in_test = 0