Example #1
def get_daily_dialog_tokenizer(tokenizer_location=None):
    '''
    Get the daily dialog tokenizer. Trains a new one if no location is provided.
    :param tokenizer_location: Path to a JSON file containing a saved tokenizer.
    :return: A tokenizer with padding enabled.
    '''
    if tokenizer_location:
        tokenizer = Tokenizer.from_file(tokenizer_location)
        tokenizer.enable_padding()
        return tokenizer
    else:
        dataset_train = datasets.load_dataset("daily_dialog", split="train")
        utterances = [special_tokens["sep_token"].join(dialogue["dialog"]) for dialogue in dataset_train]

        trainer = WordPieceTrainer(
            vocab_size=2048,
            special_tokens=list(token_utils.special_tokens.values())
        )

        custom_tokenizer = Tokenizer(WordPiece(unk_token=special_tokens["unk_token"]))
        custom_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
        custom_tokenizer.pre_tokenizer = Whitespace()
        custom_tokenizer.train_from_iterator(utterances, trainer)
        custom_tokenizer.enable_padding()

        # Save the trained tokenizer to disk
        location = './daily_dialog/'
        custom_tokenizer.save(location + "tokenizer.json")

        return custom_tokenizer
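A minimal, self-contained sketch of the same train-then-reload pattern, using a tiny in-memory corpus instead of the daily_dialog split (the corpus strings, vocab size, and file name below are illustrative):

from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordPiece
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

corpus = ["hello there , how are you ?", "fine , thanks . and you ?"]

tok = Tokenizer(WordPiece(unk_token="[UNK]"))
tok.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
tok.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(vocab_size=128, special_tokens=["[PAD]", "[UNK]", "[SEP]"])
tok.train_from_iterator(corpus, trainer)
tok.enable_padding()  # default pad token "[PAD]" with pad_id 0 matches the vocab above

# Padding is applied per batch: both encodings come back the same length.
print([len(e) for e in tok.encode_batch(corpus)])

tok.save("tokenizer.json")
reloaded = Tokenizer.from_file("tokenizer.json")
reloaded.enable_padding()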
Example #2
def get_recurrent_tokenizer(vocab,
                            max_context_tokens,
                            unk_token,
                            pad_token,
                            device="cpu"):
    """
    Return a tokenizer to be used with recurrent-based models
    """
    question_tokenizer = Tokenizer(WordLevel(vocab, unk_token=unk_token))
    question_tokenizer.normalizer = Sequence(
        [StripAccents(), Lowercase(), Strip()])
    question_tokenizer.pre_tokenizer = PreSequence(
        [Whitespace(), Punctuation()])
    question_tokenizer.enable_padding(direction="right",
                                      pad_id=vocab[pad_token],
                                      pad_type_id=1,
                                      pad_token=pad_token)

    context_tokenizer = Tokenizer(WordLevel(vocab, unk_token=unk_token))
    context_tokenizer.normalizer = Sequence(
        [StripAccents(), Lowercase(), Strip()])
    context_tokenizer.pre_tokenizer = PreSequence(
        [Whitespace(), Punctuation()])
    context_tokenizer.enable_padding(
        direction="right",
        pad_id=vocab[pad_token],
        pad_type_id=1,
        pad_token=pad_token,
    )
    context_tokenizer.enable_truncation(max_context_tokens)

    return RecurrentSquadTokenizer(question_tokenizer,
                                   context_tokenizer,
                                   device=device)
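Both tokenizers above follow the same pattern: a WordLevel model plus right padding with the pad id looked up in the vocab. A self-contained toy version of that pattern (hand-built vocab; PreSequence in the snippet is assumed to be an alias for pre_tokenizers.Sequence):

from tokenizers import Tokenizer, pre_tokenizers
from tokenizers.models import WordLevel
from tokenizers.normalizers import Lowercase, Sequence, Strip, StripAccents
from tokenizers.pre_tokenizers import Punctuation, Whitespace

vocab = {"<pad>": 0, "<unk>": 1, "what": 2, "is": 3, "this": 4, "?": 5}
tok = Tokenizer(WordLevel(vocab, unk_token="<unk>"))
tok.normalizer = Sequence([StripAccents(), Lowercase(), Strip()])
tok.pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Punctuation()])
tok.enable_padding(direction="right", pad_id=vocab["<pad>"], pad_token="<pad>")

batch = tok.encode_batch(["What is this?", "What?"])
print([e.tokens for e in batch])  # the second row is right-padded with <pad>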
Example #3
def wordpiece_tokenize(line):
    # `wordpiece_dict3` is a WordPiece vocab defined elsewhere in the original module
    tokenizer = Tokenizer(WordPiece(wordpiece_dict3))
    tokenizer.enable_padding(length=200)
    tokenizer.enable_truncation(max_length=200)
    tokenizer.pre_tokenizer = WhitespaceSplit()
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    output = tokenizer.encode(line)
    return output.ids
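The special_tokens ids in the template only line up if "[CLS]" and "[SEP]" really have ids 1 and 2 in the model's vocab. A self-contained toy version with an explicit vocab that makes that true:

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.processors import TemplateProcessing

vocab = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[UNK]": 3, "hello": 4, "world": 5}
tok = Tokenizer(WordPiece(vocab, unk_token="[UNK]"))
tok.pre_tokenizer = WhitespaceSplit()
tok.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)
tok.enable_padding(length=8)

print(tok.encode("hello world").tokens)
# ['[CLS]', 'hello', 'world', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']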
Example #4
    def test_post_process(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.enable_truncation(2)
        tokenizer.enable_padding(length=4)

        encoding = tokenizer.encode("my name is john")
        pair_encoding = tokenizer.encode("pair")

        # Can post process a single encoding
        output = tokenizer.post_process(encoding)
        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]

        # Can post process a pair of encodings
        output = tokenizer.post_process(encoding, pair_encoding)
        assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
Example #5
def create_tokenizer(sentence_list):
    filename = f'temp_{time.strftime("%Y%m%d-%H%M%S")}.txt'
    with open(filename, 'w') as f:
        for s in sentence_list:
            f.write(f'{s}\n')

    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.enable_padding(pad_token='[PAD]', pad_id=0)

    trainer = WordPieceTrainer(
        vocab_size=3000, special_tokens=['[PAD]', '[S]', '[/S]', '[UNK]'])
    tokenizer.train([filename], trainer)

    os.remove(filename)

    return tokenizer
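Hypothetical usage of create_tokenizer, assuming the imports used above (time, os, Tokenizer, WordPiece, WordPieceTrainer, Whitespace, decoders) are in scope:

sentences = ["the cat sat on the mat", "the dog barked"]
tok = create_tokenizer(sentences)

encodings = tok.encode_batch(sentences)
print([e.tokens for e in encodings])  # the shorter sentence is padded with [PAD] (id 0)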
Example #6
    def test_padding(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # By default it does nothing when encoding single sequence
        tokenizer.enable_padding()
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name"]

        # Can pad to the longest in a batch
        output = tokenizer.encode_batch(["my name", "my name is john"])
        assert all([len(encoding) == 4 for encoding in output])

        # Can pad to the specified length otherwise
        tokenizer.enable_padding(length=4)
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
        output = tokenizer.encode("my name", "pair")
        assert output.tokens == ["my", "name", "pair", "[PAD]"]
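A companion check in the same style: padding can be switched back off with no_padding() (again a toy BPE tokenizer built only from added tokens):

from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
tokenizer.add_tokens(["my", "name"])

tokenizer.enable_padding(length=4)
assert tokenizer.encode("my name").tokens == ["my", "name", "[PAD]", "[PAD]"]

tokenizer.no_padding()
assert tokenizer.encode("my name").tokens == ["my", "name"]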
Example #7
 def __init__(
     self,
     inputs: List[str],
     targets: List[Tensor],
     key: Tensor,
     tokenizer: Tokenizer,
     max_len: int = 200,
     batch_size: int = 32,
 ) -> None:
     self.inputs = inputs
     self.targets = np.array(onp.array(targets))
     tokenizer.enable_truncation(max_len)
     tokenizer.enable_padding()
     self.tokenizer = tokenizer
     self.max_len = max_len
     self.batch_size = batch_size
     self.key = key
     self.len = len(self.inputs)
     self.num_batches = math.ceil(self.len / batch_size)
Example #8
def main(args):
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(
            files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name),
                       pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(
            os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
Example #9
def preprocess_data(args):

    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:

        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()

        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word
        # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                out_f.write(
                    json.dumps((examples_per_file[input_file],
                                len(label_counter))) + '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    for example, labels in g:

                        example_batch.append(example)
                        labels_batch.append(labels)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(
                                example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels in zip(example_batch,
                                                       labels_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                # print(labels);input()
                                labels = labels.nonzero()[1].tolist()
                                out_f.write(
                                    json.dumps([example.ids, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels in zip(example_batch, labels_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        # print(labels);input()
                        labels = labels.nonzero()[1].tolist()
                        out_f.write(json.dumps([example.ids, labels]) + '\n')
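Both branches above finish with the same fixed-length padding and truncation. An isolated sketch of the BertProcessing + enable_padding/enable_truncation combination, with a tiny hand-built WordPiece vocab (values are illustrative):

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing

vocab = {"[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3, "m": 4, "##k": 5, "##v": 6}
tok = Tokenizer(WordPiece(vocab, unk_token="[UNK]"))
tok.pre_tokenizer = BertPreTokenizer()
tok.post_processor = BertProcessing(
    ("[SEP]", tok.token_to_id("[SEP]")),
    ("[CLS]", tok.token_to_id("[CLS]")))
tok.enable_padding(length=8)
tok.enable_truncation(max_length=8)

print(tok.encode("mkv mk").tokens)
# ['[CLS]', 'm', '##k', '##v', 'm', '##k', '[SEP]', '[PAD]']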
Example #10
class SentencePieceBPETokenizer:
    """Custom SentencePiece tokenizer"""
    unk_token = '<unk>'
    pad_token = '<pad>'

    def __init__(self,
                 vocab: Dict[str, int] = None,
                 merges: List[Tuple[str, str]] = None,
                 dropout: float = None,
                 max_length: Optional[int] = 64) -> None:
        """Constructor

        Args:
            vocab (Dict[str, int]): A dictionary of string keys and their ids.
            merges (List[Tuple[str, str]]): A list of pairs of tokens.
            dropout (float): BPE dropout
            max_length (int, optional): The max length at which to truncate.
                Defaults to `64`.
        """
        self.tokenizer = Tokenizer(
            BPE(vocab, merges, dropout=dropout, unk_token=self.unk_token))
        self.tokenizer.normalizer = BertNormalizer()  # noqa
        self.tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()  # noqa
        self.tokenizer.decoder = decoders.Metaspace()  # noqa
        self.tokenizer.add_special_tokens([self.pad_token, self.unk_token])

        self.tokenizer.enable_padding(pad_token=self.pad_token)
        self.tokenizer.enable_truncation(max_length)

    @classmethod
    def train(cls,
              dataset: Sequence[str],
              vocab_size: int = 1000,
              min_frequency: int = 2,
              dropout: float = 0.0,
              max_length: Optional[int] = 64) -> 'SentencePieceBPETokenizer':
        instance = cls(dropout=dropout, max_length=max_length)
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=[cls.pad_token, cls.unk_token])
        instance.tokenizer.train_from_iterator(dataset, trainer=trainer)
        instance.tokenizer.model.dropout = None
        return instance

    @property
    def vocab_size(self):
        return len(self.tokenizer.get_vocab())

    def serialize(self):
        return self.tokenizer.to_str()

    @classmethod
    def deserialize(cls, s: str) -> 'SentencePieceBPETokenizer':
        tokenizer = cls()
        tokenizer.tokenizer = Tokenizer.from_str(s)
        return tokenizer

    def encode(self, text: str) -> Dict[str, Any]:
        encoding = self.tokenizer.encode(text)
        outputs = {
            'ids': torch.tensor(encoding.ids),
            'mask': torch.tensor(encoding.attention_mask),
            'spans': encoding.offsets,
        }
        return outputs

    def encode_batch(self, batch: List[str]):
        encodings = self.tokenizer.encode_batch(batch)
        outputs = {
            'ids': torch.tensor([e.ids for e in encodings]),
            'mask': torch.tensor([e.attention_mask for e in encodings]),
            'spans': [e.offsets for e in encodings],
        }
        return outputs
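Hypothetical usage of the wrapper above (the corpus, sizes, and dropout value are made up; dropout is reset to None after training, per the train() classmethod):

corpus = ["hello world", "hello there tokenizer", "hello hello world"]
sp_tok = SentencePieceBPETokenizer.train(corpus, vocab_size=120, dropout=0.1, max_length=16)

out = sp_tok.encode_batch(["hello world", "hello"])
print(out["ids"].shape)   # both rows padded to the same width
print(out["mask"][1])     # 0 wherever <pad> was inserted

restored = SentencePieceBPETokenizer.deserialize(sp_tok.serialize())
assert restored.vocab_size == sp_tok.vocab_size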
Example #11
class LitTokenizer:
    def __init__(self,
                 padding=False,
                 truncation=False,
                 max_length=None,
                 lower=False,
                 lang=None):
        super().__init__()
        self.UNK_WORD = '[UNK]'
        self.PAD_WORD = '[PAD]'
        self.MASK_WORD = '[MASK]'
        self.SOS_WORD = '[SOS]'
        self.EOS_WORD = '[EOS]'
        self.special_tokens = [
            self.UNK_WORD, self.PAD_WORD, self.MASK_WORD, self.SOS_WORD,
            self.EOS_WORD
        ]

        # Define tokenizer
        self.tokenizer = None
        self.configure_tokenizers(padding, truncation, max_length, lower)

        # Other
        self.lang = lang

    def get_vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def configure_tokenizers(self, padding, truncation, max_length, lower):
        # Settings
        pad_length = None
        if padding in {True, "longest"}:
            pass
        elif padding in {"max_length"}:
            pad_length = max_length
        elif padding in {False, "do_not_pad"}:
            pass
        else:
            raise ValueError("Unknown padding type")

        # SRC tokenizer
        tok_normalizers = [NFD(), Strip()]
        if lower:
            tok_normalizers += [Lowercase()]

        self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
        self.tokenizer.add_special_tokens(self.special_tokens)
        self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [WhitespaceSplit()])
        self.tokenizer.normalizer = normalizers.Sequence(
            tok_normalizers)  # StripAccents requires NFD
        self.tokenizer.decoder = tok_decoder()

        # Define template (Needed for the sos/eos tokens)
        basic_template = TemplateProcessing(
            single=f"{self.SOS_WORD} $A {self.EOS_WORD}",
            pair=f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}",
            special_tokens=[
                (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)),
                (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD))
            ],
        )
        self.tokenizer.post_processor = basic_template

        if padding:
            self.tokenizer.enable_padding(pad_id=self.tokenizer.token_to_id(
                self.PAD_WORD),
                                          pad_token=self.PAD_WORD,
                                          length=pad_length)
        if truncation:
            self.tokenizer.enable_truncation(max_length,
                                             stride=0,
                                             strategy='longest_first')

    def load_vocab(self, vocab, merges):
        vocab, merges = tok_model.read_file(vocab, merges)
        self.tokenizer.model = tok_model(vocab, merges)

    def train_vocab(self, files, vocab_size=32000, min_frequency=3):
        # Train trainer
        trainer = tok_trainer(vocab_size=vocab_size,
                              min_frequency=min_frequency)
        self.tokenizer.train(files, trainer)

    def save_vocab(self, output_dir, prefix):
        self.tokenizer.model.save(output_dir, prefix)

    def pad(self, examples, keys=None):
        pad_idx = self.special_tokens.index(self.PAD_WORD)

        # Keys to modify
        if not keys:
            keys = list(examples[0].keys())

        d = {}
        for k in keys:
            # Collect same-type items (list of IDs, list of masks,...)
            d[k] = [x[k] for x in examples]

            # Get max length (value to pad)
            max_length = max([x.shape[-1] for x in d[k]])

            # Apply padding
            for i, x in enumerate(examples):
                unpadded_t = x[k]
                if k == "ids":
                    tmp = torch.full((max_length, ),
                                     fill_value=pad_idx,
                                     device=unpadded_t.device)  # All padding
                elif k == "attention_mask":
                    tmp = torch.full(
                        (max_length, ), fill_value=0,
                        device=unpadded_t.device)  # No attention mask
                else:
                    raise TypeError("Unknown key")
                tmp[:unpadded_t.shape[-1]] = unpadded_t
                d[k][i] = tmp
        return d

    def encode(self, x):
        return self.tokenizer.encode(x)

    def decode(self, x):
        if isinstance(x, torch.Tensor):
            assert len(x.shape) == 2
            x = x.detach().cpu().numpy()
        return [self.tokenizer.decode(x_i) for x_i in x]
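Hypothetical usage of LitTokenizer, assuming the module-level aliases behind it are something like tok_model = models.BPE, tok_trainer = trainers.BpeTrainer, and tok_decoder = decoders.ByteLevel (they are not shown in the snippet):

lit_tok = LitTokenizer(padding="longest", truncation=True, max_length=128, lower=True)
lit_tok.train_vocab(["corpus.txt"], vocab_size=8000)   # any plain-text training file

enc = lit_tok.encode("Hello world")
print(enc.tokens)   # [SOS] ... [EOS], lower-cased; padding kicks in when batches are encoded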
Example #12
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import WordLevel

VOCAB_FILE = "data/tx1_vocab.txt"

with open(VOCAB_FILE, "r") as f:
    words = list(set(f.read().strip().split("\n")))

vocab = {}
for i, word in enumerate(["<pad>", "<unk>"] + words):
    vocab[word] = i

tokenizer = Tokenizer(WordLevel(vocab, unk_token="<unk>"))
tokenizer.enable_padding(pad_token="<pad>")
tokenizer.pre_tokenizer = Whitespace()

tokenizer.save("data/tokenizer-LakhNES-tx1.json")
Example #13
def preprocess_data(args):

    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    if args.labels_in:
        labels = json.load(open(args.labels_in))
        mlb = mlb.fit([labels])
    else:
        mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:

        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()

        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word
        # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                if args.labels_in:
                    n_labels = len(json.load(open(args.labels_in)))
                else:
                    n_labels = len(label_counter)
                out_f.write(
                    json.dumps((examples_per_file[input_file], n_labels)) +
                    '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []
                doc_idx_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    START_POS = int(args.window_start) / 100
                    for doc_idx, (example, labels) in enumerate(g):
                        #example = ' '.join(example.split(' ')[-510:])
                        example_batch.append(example)
                        labels_batch.append(labels)
                        doc_idx_batch.append(doc_idx)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(
                                example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels, doc_idx in zip(
                                    example_batch, labels_batch,
                                    doc_idx_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                # print(labels);input()
                                labels = labels.nonzero()[1].tolist()
                                """try:
                                    [][0]
                                    print("DOC_LEN:",len(example.overflowing)+1)
                                    mid = len(example.overflowing)//2
                                    out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n')
                                except IndexError:
                                    out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')"""

                                if args.all_blocks or args.n_blocks > 0:
                                    blocks = [example.ids] + [
                                        blk.ids for blk in example.overflowing
                                    ]
                                    #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                                    for b, block in enumerate(blocks, 2):
                                        if b > args.n_blocks and args.n_blocks > 0:
                                            break
                                        out_f.write(
                                            json.dumps(
                                                [block, labels, doc_idx]) +
                                            '\n')
                                else:
                                    window = get_window(example, START_POS)
                                    assert len(window) == 512
                                    assert all(
                                        [type(y) is int for y in window])
                                    out_f.write(
                                        json.dumps([window, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels, doc_idx in zip(
                            example_batch, labels_batch, doc_idx_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        # print(labels);input()
                        labels = labels.nonzero()[1].tolist()
                        """try:
                            [][0]
                            print("DOC_LEN:",len(example.overflowing)+1)
                            mid = len(example.overflowing)//2
                            out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n')
                        except IndexError:
                            out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')"""

                        if args.all_blocks or args.n_blocks > 0:
                            blocks = [example.ids] + [
                                blk.ids for blk in example.overflowing
                            ]
                            #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                            for b, block in enumerate(blocks, 2):
                                if b > args.n_blocks and args.n_blocks > 0:
                                    break
                                out_f.write(
                                    json.dumps([block, labels, doc_idx]) +
                                    '\n')
                        else:
                            out_f.write(
                                json.dumps(
                                    [get_window(example, START_POS), labels]) +
                                '\n')