def make_data(self,
              trainfilename,
              maxseqlen=None,
              maxclauselen=None,
              label_ind=None,
              train=False):
        """Prepare a BERT-based discourse generator from trainfilename.

        Reads and cleans passages, converts labels to BIO, resolves
        self.maxseqlen / self.maxclauselen (explicit argument first, then
        self.params, then values derived from the data), builds the label
        index on first use, and wraps everything in a BertDiscourseGenerator.

        Args:
            trainfilename: Path to the passages file read by read_passages.
            maxseqlen: Optional cap on clauses per passage.
            maxclauselen: Optional cap on tokens per clause.
            label_ind: Optional pre-built label -> index mapping; when falsy,
                a fresh mapping seeded with {"none": 0} is grown from the data.
            train: Whether the file is labeled training data.

        Returns:
            (seq_lengths, discourse_generator) where seq_lengths lists the
            number of clauses in each passage.
        """
        params = self.params
        use_attention = params["use_attention"]
        batch_size = params["batch_size"]

        str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
        print("Filtering data")
        str_seqs = clean_words(str_seqs)
        label_seqs = to_BIO(label_seqs)

        self.label_ind = label_ind if label_ind else {"none": 0}
        seq_lengths = [len(seq) for seq in str_seqs]

        if self.maxseqlen is None:
            if maxseqlen:
                self.maxseqlen = maxseqlen
            elif params["maxseqlen"] is not None:
                self.maxseqlen = params["maxseqlen"]
            else:
                self.maxseqlen = max(seq_lengths)

        if self.maxclauselen is None:
            if maxclauselen:
                self.maxclauselen = maxclauselen
            elif params["maxclauselen"] is not None:
                self.maxclauselen = params["maxclauselen"]
            elif use_attention:
                # Estimate a clause-length cap from the tokenized data:
                # mean + 3 standard deviations covers nearly all clauses
                # without padding to the extreme outlier.
                sentence_lens = [
                    len(self.tokenizer.tokenize(clause.lower()))
                    for str_seq in str_seqs
                    for clause in str_seq
                ]
                self.maxclauselen = np.round(
                    np.mean(sentence_lens)
                    + 3 * np.std(sentence_lens)).astype(int)

        if len(self.label_ind) <= 1:
            # First call: assign indices 0,1,2,... as new labels appear.
            for _, label_seq in zip(str_seqs, label_seqs):
                for label in label_seq:
                    if label not in self.label_ind:
                        self.label_ind[label] = len(self.label_ind)
        self.rev_label_ind = {
            index: label for label, index in self.label_ind.items()
        }

        discourse_generator = BertDiscourseGenerator(
            self.bert, self.tokenizer, str_seqs, label_seqs,
            self.label_ind, batch_size, use_attention,
            self.maxseqlen, self.maxclauselen, train)
        # Labels are one-hot encoded inside the generator.
        return seq_lengths, discourse_generator
    def make_data(self,
                  trainfilename,
                  maxseqlen=None,
                  maxclauselen=None,
                  label_ind=None,
                  train=False):
        """Read, clean and index passages and wrap them in a DiscourseGenerator.

        Args:
            trainfilename: Path to the passages file read by read_passages.
            maxseqlen: Max clauses per passage; falls back to
                self.params["maxseqlen"], then to the longest passage seen.
            maxclauselen: Max tokens per clause; falls back to
                self.params["maxclauselen"], then (when attention is used)
                to mean + 3*std of the observed clause lengths.
            label_ind: Optional pre-built label -> index mapping; when falsy,
                a fresh mapping seeded with {"none": 0} is grown from the data.
            train: Whether the file is labeled training data.

        Returns:
            (seq_lengths, discourse_generator) where seq_lengths lists the
            number of clauses in each passage.
        """
        use_attention = self.params["use_attention"]
        batch_size = self.params["batch_size"]

        str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
        print("Filtering data")
        str_seqs = clean_words(str_seqs)
        label_seqs = to_BIO(label_seqs)
        if not label_ind:
            self.label_ind = {"none": 0}
        else:
            self.label_ind = label_ind
        seq_lengths = [len(seq) for seq in str_seqs]
        # BUG FIX: the explicit maxseqlen/maxclauselen arguments used to be
        # clobbered unconditionally by self.params, making both parameters
        # dead. Resolution order is now: explicit argument, then params,
        # then values derived from the data (matching the BERT variant).
        if not maxseqlen:
            maxseqlen = self.params["maxseqlen"]
        if not maxseqlen:
            maxseqlen = max(seq_lengths)
        if not maxclauselen:
            maxclauselen = self.params["maxclauselen"]
        if not maxclauselen and use_attention:
            clauselens = []
            for str_seq in str_seqs:
                clauselens.extend(
                    [len(clause.split()) for clause in str_seq])
            # mean + 3*std covers nearly all clauses without padding to the
            # extreme outlier.
            maxclauselen = np.round(
                np.mean(clauselens) + 3 * np.std(clauselens)).astype(int)
        # First call: assign indices 0,1,2,... as new labels appear.
        # (Removed unused locals X, Y, Y_inds, init_word_rep_len.)
        if len(self.label_ind) <= 1:
            for str_seq, label_seq in zip(str_seqs, label_seqs):
                for label in label_seq:
                    if label not in self.label_ind:
                        self.label_ind[label] = len(self.label_ind)
        self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
        discourse_generator = DiscourseGenerator(self.rep_reader, str_seqs,
                                                 label_seqs, self.label_ind,
                                                 batch_size, use_attention,
                                                 maxseqlen, maxclauselen,
                                                 train, self.input_size)
        # Cache the resolved limits for later calls (e.g. test-time data).
        self.maxseqlen = maxseqlen
        self.maxclauselen = maxclauselen
        return seq_lengths, discourse_generator  # Labels one-hot encoded inside the generator