Example #1
    def setUp(self):
        self.data = SessionTSVDataSource(
            SafeFileWrapper(tests_module.test_file("seq_tagging_example.tsv")),
            field_names=["session_id", "intent", "goal", "label"],
            schema={
                "intent": List[str],
                "goal": List[str],
                "label": List[str],
            },
        )
Example #2
def _get_data_source(test_path, field_names, task):
    if test_path:
        data_source = TSVDataSource(
            test_file=SafeFileWrapper(test_path),
            schema=task.data.data_source.schema,
            field_names=field_names,
        )
    else:
        data_source = task.data.data_source
    return data_source
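
A minimal, hypothetical call site for the helper above ("my_test.tsv" and task are placeholders for a real path and a configured task):

source = _get_data_source("my_test.tsv", field_names=["label", "text"], task=task)
for row in source.test:  # assuming the source exposes a .test split, mirroring .train elsewhere
    print(row)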
Example #3
    def setUp(self):
        self.data_source = TSVDataSource(
            SafeFileWrapper(
                tests_module.test_file("train_dense_features_tiny.tsv")),
            SafeFileWrapper(
                tests_module.test_file("test_dense_features_tiny.tsv")),
            eval_file=None,
            field_names=["label", "slots", "text", "dense"],
            schema={
                "text": str,
                "label": str
            },
        )

        self.tensorizers = {
            "tokens": TokenTensorizer(text_column="text"),
            "labels": LabelTensorizer(label_column="label",
                                      allow_unknown=True),
        }
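
The _initialize_tensorizer helper used throughout these tests is not shown here; judging from the explicit generator pattern in Examples #10 and #12, it presumably looks like this sketch (an inference, not library code):

    def _initialize_tensorizer(self, tensorizer, data=None):
        # sketch: tensorizer.initialize() returns a generator that builds
        # the vocab from the rows sent into it (see Examples #10 and #12)
        data = data or self.data  # whichever source the test class stores
        init = tensorizer.initialize()
        init.send(None)  # kick
        for row in data.train:
            init.send(row)
        init.close()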
Example #4
    def test_gazetteer_tensor(self):
        tensorizer = GazetteerTensorizer()

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_dict_features.tsv")),
            test_file=None,
            eval_file=None,
            field_names=["text", "dict"],
            schema={
                "text": str,
                "dict": Gazetteer
            },
        )

        self._initialize_tensorizer(tensorizer, data)
        # UNK + PAD + 5 labels
        self.assertEqual(7, len(tensorizer.vocab))

        # only two rows in test file:
        # "Order coffee from Starbucks please"
        # "Order some fries from McDonalds please"
        for i, row in enumerate(data.train):
            if i == 0:
                idx, weights, lens = tensorizer.numberize(row)
                self.assertEqual([1, 1, 2, 3, 1, 1, 4, 1, 1, 1], idx)
                self.assertEqual(
                    [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
                    weights)
                self.assertEqual([1, 2, 1, 1, 1], lens)
            if i == 1:
                idx, weights, lens = tensorizer.numberize(row)
                self.assertEqual([1, 1, 5, 1, 6, 1], idx)
                self.assertEqual([0.0, 0.0, 1.0, 0.0, 1.0, 0.0], weights)
                self.assertEqual([1, 1, 1, 1, 1, 1], lens)

        feats, weights, lens = tensorizer.tensorize(
            tensorizer.numberize(row) for row in data.train)
        self.assertEqual(
            [
                [1, 1, 2, 3, 1, 1, 4, 1, 1, 1, 1, 1],
                [1, 1, 1, 1, 5, 1, 1, 1, 6, 1, 1, 1],
            ],
            feats.numpy().tolist(),
        )
        self.assertEqual(
            str([
                [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
            ]),
            str([[round(w, 2) for w in utt_weights]
                 for utt_weights in weights.numpy()]),
        )
        self.assertEqual([[1, 2, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]],
                         lens.numpy().tolist())
Example #5
    def test_read_data_source_with_column_remapping(self):
        data_source = TSVDataSource(
            SafeFileWrapper(
                tests_module.test_file("train_dense_features_tiny.tsv")),
            SafeFileWrapper(
                tests_module.test_file("test_dense_features_tiny.tsv")),
            eval_file=None,
            field_names=["remapped_label", "slots", "remapped_text", "dense"],
            column_mapping={
                "remapped_label": "label",
                "remapped_text": "text"
            },
            schema={
                "text": str,
                "label": str
            },
        )

        data = list(data_source.train)
        self.assertEqual(10, len(data))
        example = next(iter(data))
        self.assertEqual(2, len(example))
        self.assertEqual({"label", "text"}, set(example))
Example #6
    def test_csv(self):
        data_source = TSVDataSource(
            SafeFileWrapper(tests_module.test_file("test_data_tiny_csv.tsv")),
            test_file=None,
            eval_file=None,
            field_names=["label", "slots", "text"],
            delimiter=",",
            schema={"text": str, "label": str},
            quoted=True,
        )

        for row in data_source.train:
            self.assertEqual("alarm/set_alarm", row["label"])
            self.assertTrue(row["text"].startswith("this is the text"))
Example #7
    @classmethod
    def from_config(cls, config: Config, schema: Dict[str, Type], **kwargs):
        args = config._asdict()
        language = args.pop("language")
        train_filename = args.pop("train_filename")
        test_filename = args.pop("test_filename")
        eval_filename = args.pop("eval_filename")
        train_file = (
            SafeFileWrapper(train_filename, encoding="utf-8", errors="replace")
            if train_filename else None
        )
        test_file = (
            SafeFileWrapper(test_filename, encoding="utf-8", errors="replace")
            if test_filename else None
        )
        eval_file = (
            SafeFileWrapper(eval_filename, encoding="utf-8", errors="replace")
            if eval_filename else None
        )
        return cls(
            language=language,
            train_file=train_file,
            test_file=test_file,
            eval_file=eval_file,
            schema=schema,
            **args,
            **kwargs,
        )
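
The conditional wrapping above repeats three times; a hypothetical helper that factors out the pattern (the name _maybe_wrap is illustrative, not part of the library):

def _maybe_wrap(filename):
    # wrap a path in a SafeFileWrapper, passing None through unchanged
    if not filename:
        return None
    return SafeFileWrapper(filename, encoding="utf-8", errors="replace")

# e.g. train_file = _maybe_wrap(train_filename), and likewise for test/eval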
Example #8
    def process_squad_tsv(self, fname):
        if not fname:
            print("Empty file name!")
            return

        field_names = [
            "doc", "question", "answers", "answer_starts", "has_answer"
        ]
        tsv_file = SafeFileWrapper(get_absolute_path(fname),
                                   encoding="utf-8",
                                   errors="replace")
        tsv = TSV(
            tsv_file,
            field_names=field_names,
            delimiter=self.delimiter,
            quoted=self.quoted,
            drop_incomplete_rows=True,
        )

        for id, row in enumerate(tsv):
            parts = (row[f] for f in field_names)
            doc, question, answers, answer_starts, has_answer = parts
            try:
                # if we have paraphrases for question
                question = json.loads(question)
                if isinstance(question, list):
                    question = choice(question)
            except ValueError:
                pass
            answers = json.loads(answers)
            answer_starts = json.loads(answer_starts)

            if has_answer != "True":
                answers = []
                answer_starts = []

            for piece_dict in _split_document(
                    id,
                    doc,
                    question,
                    answers,
                    answer_starts,
                    has_answer == "True",
                    self.ignore_impossible,
                    self.max_character_length,
                    self.min_overlap,
            ):
                yield piece_dict
Example #9
    def setUp(self):
        self.data = TSVDataSource(
            SafeFileWrapper(
                tests_module.test_file("compositional_seq2seq_unit.tsv")),
            test_file=None,
            eval_file=None,
            field_names=["text", "seqlogical"],
            schema={
                "text": str,
                "seqlogical": str
            },
        )
        self.masked_tensorizer = MaskedTokenTensorizer.from_config(
            MaskedTokenTensorizer.Config(column="seqlogical",
                                         masking_function=TreeMask.Config()))
        self._initialize_tensorizer(self.masked_tensorizer)
Example #10
    def test_gazetteer_tensor_bad_json(self):
        tensorizer = GazetteerTensorizer()

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_dict_features_bad_json.tsv")
            ),
            test_file=None,
            eval_file=None,
            field_names=["text", "dict"],
            schema={"text": str, "dict": Gazetteer},
        )

        init = tensorizer.initialize()
        init.send(None)  # kick
        with self.assertRaises(Exception):
            for row in data.train:
                init.send(row)
        init.close()
Example #11
    def test_seq_tensor_max_turn(self):
        tensorizer = SeqTokenTensorizer(max_turn=1)

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_seq_features.tsv")),
            test_file=None,
            eval_file=None,
            field_names=["text_seq"],
            schema={"text_seq": List[str]},
        )

        self._initialize_tensorizer(tensorizer, data)

        # only one row in test file:
        # ["where do you wanna meet?", "MPK"]
        for row in data.train:
            idx, sentence_lens, seq_len = tensorizer.numberize(row)
            self.assertEqual(1, seq_len)
            self.assertEqual([[2, 3, 4, 5, 6]], idx)
            self.assertEqual([5], sentence_lens)
Example #12
    def test_seq_tensor_with_bos_eos_eol_bol(self):
        tensorizer = SeqTokenTensorizer(
            add_bos_token=True,
            add_eos_token=True,
            add_bol_token=True,
            add_eol_token=True,
        )

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_seq_features.tsv")
            ),
            test_file=None,
            eval_file=None,
            field_names=["text_seq"],
            schema={"text_seq": List[str]},
        )

        init = tensorizer.initialize()
        init.send(None)  # kick
        for row in data.train:
            init.send(row)
        init.close()
        # UNK + PAD + BOS + EOS + BOL + EOL + 6 tokens
        self.assertEqual(12, len(tensorizer.vocab))

        # only one row in test file:
        # ["where do you wanna meet?", "MPK"]
        for row in data.train:
            idx, lens = tensorizer.numberize(row)
            self.assertEqual(4, lens)
            self.assertEqual(
                [
                    [2, 4, 3, 1, 1, 1, 1],
                    [2, 6, 7, 8, 9, 10, 3],
                    [2, 11, 3, 1, 1, 1, 1],
                    [2, 5, 3, 1, 1, 1, 1],
                ],
                idx,
            )
Example #13
    def test_seq_tensor(self):
        tensorizer = SeqTokenTensorizer()

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_seq_features.tsv")),
            test_file=None,
            eval_file=None,
            field_names=["text_seq"],
            schema={"text_seq": List[str]},
        )

        self._initialize_tensorizer(tensorizer, data)
        # UNK + PAD + 6 tokens
        self.assertEqual(8, len(tensorizer.vocab))

        # only one row in test file:
        # ["where do you wanna meet?", "MPK"]
        for row in data.train:
            idx, lens = tensorizer.numberize(row)
            self.assertEqual(2, lens)
            self.assertEqual([[2, 3, 4, 5, 6], [7, 1, 1, 1, 1]], idx)
Example #14
    def test_quoting(self):
        """
        The text column of the first row of this file opens a quote but
        does not close it.
        """
        data_source = BlockShardedTSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("test_tsv_quoting.tsv")),
            test_file=None,
            eval_file=None,
            field_names=["label", "text"],
            schema={
                "text": str,
                "label": str
            },
        )

        data = list(data_source.train_unsharded)
        self.assertEqual(4, len(data))

        data = list(data_source.train)
        self.assertEqual(4, len(data))
Example #15
    def __init__(
        self,
        path: str,
        columns: Optional[List[Any]] = None,
        column_mapping: Optional[Dict[str, str]] = None,
        delimiter: str = "\t",
        batch_size: Optional[int] = None,
        is_shuffle: bool = True,
        transform: Optional[nn.Module] = None,
        custom_batcher: Optional[Batcher] = None,
        collate_fn: Optional[Callable] = None,
        chunk_size: int = 1000,
        is_cycle: bool = False,
        length: Optional[int] = None,
        rank: int = 0,
        world_size: int = 1,
        *args,
        **kwargs,
    ):
        logger.debug(f"init TsvDataset from: {path}")
        columns = columns or ["text", "label"]
        if column_mapping:
            raise NotImplementedError(
                "column mapping is not supported for tsv yet!")
        self.file = SafeFileWrapper(path, encoding="utf-8", errors="replace")
        tsv_iterator = TSV(self.file, field_names=columns, delimiter=delimiter)
        super().__init__(
            iterable=tsv_iterator,
            batch_size=batch_size,
            is_shuffle=is_shuffle,
            transform=transform,
            custom_batcher=custom_batcher,
            collate_fn=collate_fn,
            chunk_size=chunk_size,
            is_cycle=is_cycle,
            length=length,
            rank=rank,
            world_size=world_size,
        )
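
A minimal, hypothetical instantiation using only the parameters shown above (the path and batch size are placeholders; the remaining arguments keep their defaults):

dataset = TsvDataset(
    path="train.tsv",           # placeholder path
    columns=["text", "label"],  # same as the default layout
    delimiter="\t",
    batch_size=32,              # placeholder batch size
)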
Example #16
    def test_seq_tensor(self):
        tensorizer = SeqTokenTensorizer()

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_seq_features.tsv")),
            test_file=None,
            eval_file=None,
            field_names=["text_seq"],
            schema={"text_seq": List[str]},
        )

        self._initialize_tensorizer(tensorizer, data)
        # UNK + PAD + 6 tokens
        self.assertEqual(8, len(tensorizer.vocab))

        # only one row in test file:
        # ["where do you wanna meet?", "MPK"]
        for row in data.train:
            tokens, token_lens, seq_lens = tensorizer.prepare_input(row)
            idx, sentence_lens, lens = tensorizer.numberize(row)
            self.assertEqual(2, lens)
            self.assertEqual([[2, 3, 4, 5, 6], [7, 1, 1, 1, 1]], idx)
            self.assertEqual([5, 1], sentence_lens)
            self.assertEqual(2, seq_lens)
            self.assertEqual(
                [
                    ["where", "do", "you", "wanna", "meet?"],
                    [
                        "mpk",
                        str(SpecialTokens.PAD),
                        str(SpecialTokens.PAD),
                        str(SpecialTokens.PAD),
                        str(SpecialTokens.PAD),
                    ],
                ],
                tokens,
            )
Example #17
    def test_annotation_num(self):
        data = TSVDataSource(
            SafeFileWrapper(
                tests_module.test_file("compositional_seq2seq_unit.tsv")),
            test_file=None,
            eval_file=None,
            field_names=["text", "seqlogical"],
            schema={
                "text": str,
                "seqlogical": str
            },
        )
        nbrz = AnnotationNumberizer()
        init = nbrz.initialize()
        init.send(None)  # kick
        for row in data.train:
            init.send(row)
        init.close()

        # vocab = {'IN:GET_INFO_TRAFFIC': 0, 'SHIFT': 1, 'SL:LOCATION': 2,
        # 'REDUCE': 3, 'IN:GET_DIRECTIONS': 4, 'SL:DESTINATION': 5, 'SL:SOURCE': 6,
        # 'IN:GET_LOCATION_HOME': 7, 'SL:CONTACT': 8, 'SL:DATE_TIME_DEPARTURE': 9,
        # 'IN:UNSUPPORTED_NAVIGATION': 10, 'IN:GET_ESTIMATED_DURATION': 11,
        # 'IN:GET_LOCATION_WORK': 12, 'SL:PATH_AVOID': 13, 'IN:GET_DISTANCE': 14}

        self.assertEqual(15, len(nbrz.vocab))
        self.assertEqual(1, nbrz.shift_idx)
        self.assertEqual(3, nbrz.reduce_idx)
        self.assertEqual([10], nbrz.ignore_subNTs_roots)
        self.assertEqual([0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
                         nbrz.valid_NT_idxs)
        self.assertEqual([0, 4, 7, 10, 11, 12, 14], nbrz.valid_IN_idxs)
        self.assertEqual([2, 5, 6, 8, 9, 13], nbrz.valid_SL_idxs)

        for row, expected in zip(data.train, EXPECTED_ACTIONS):
            actions = nbrz.numberize(row)
            self.assertEqual(expected, actions)
Example #18
def process_squad_tsv(
    fname, ignore_impossible, max_character_length, min_overlap, delimiter, quoted
):
    if not fname:
        print(f"Empty file name!")
        return

    field_names = ["doc", "question", "answers", "answer_starts", "has_answer"]
    tsv_file = SafeFileWrapper(
        get_absolute_path(fname), encoding="utf-8", errors="replace"
    )
    tsv = TSV(
        tsv_file,
        field_names=field_names,
        delimiter=delimiter,
        quoted=quoted,
        drop_incomplete_rows=True,
    )

    for id, row in enumerate(tsv):
        parts = (row[f] for f in field_names)
        doc, question, answers, answer_starts, has_answer = parts
        answers = json.loads(answers)
        answer_starts = json.loads(answer_starts)

        for piece_dict in _split_document(
            id,
            doc,
            question,
            answers,
            answer_starts,
            has_answer == "True",
            ignore_impossible,
            max_character_length,
            min_overlap,
        ):
            yield piece_dict
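
A hedged usage sketch of the generator above (the file name and thresholds are placeholders, not values from the library):

pieces = process_squad_tsv(
    "squad_train.tsv",           # placeholder path
    ignore_impossible=True,
    max_character_length=20000,  # placeholder limit
    min_overlap=0.1,             # placeholder threshold
    delimiter="\t",
    quoted=False,
)
for piece in pieces:
    ...  # each piece is a dict produced by _split_document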
Example #19
    def test_seq_tensor_pad_batch(self):
        tensorizer = SeqTokenTensorizer()

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_seq_features.tsv")
            ),
            test_file=None,
            eval_file=None,
            field_names=["text_seq"],
            schema={"text_seq": List[str]},
        )

        self._initialize_tensorizer(tensorizer, data)
        token_idx_1 = [[2, 3], [2, 1]]
        token_count_1 = [2, 1]
        seq_len_1 = 2
        token_idx_2 = [[2, 3, 4]]
        token_count_2 = [3]
        seq_len_2 = 1
        token_idx_tensor, token_count_tensor, seq_len_tensor = tensorizer.tensorize(
            [
                (token_idx_1, token_count_1, seq_len_1),
                (token_idx_2, token_count_2, seq_len_2),
            ]
        )
        np.testing.assert_array_almost_equal(
            np.array([[[2, 3, 1], [2, 1, 1]], [[2, 3, 4], [1, 1, 1]]]),
            token_idx_tensor.detach().numpy(),
        )
        np.testing.assert_array_almost_equal(
            np.array([[2, 1], [3, 1]]), token_count_tensor.detach().numpy()
        )
        np.testing.assert_array_almost_equal(
            np.array([2, 1]), seq_len_tensor.detach().numpy()
        )
Example #20
    def process_squad_tsv(self, fname):
        # Process SQUAD TSV for KD
        if not fname:
            print("Empty file name!")
            return
        field_names = [
            "id1",
            "doc",
            "question",
            "answers",
            "answer_starts",
            "has_answer",
            "id2",
            "start_logits",
            "end_logits",
            "has_answer_logits",
            "pad_mask",
            "segment_labels",
        ]
        tsv_file = SafeFileWrapper(get_absolute_path(fname),
                                   encoding="utf-8",
                                   errors="replace")
        tsv = TSV(
            tsv_file,
            field_names=field_names,
            delimiter=self.delimiter,
            quoted=self.quoted,
            drop_incomplete_rows=True,
        )

        for id, row in enumerate(tsv):
            parts = (row[f] for f in field_names)
            # All model output for KD are dumped using json serialization.
            (
                id1,
                doc,
                question,
                answers,
                answer_starts,
                has_answer,
                id2,
                start_logits,
                end_logits,
                has_answer_logits,
                pad_mask,
                segment_labels,
            ) = (json.loads(s) for s in parts)
            if isinstance(question, list):
                # if we have paraphrases for question
                question = choice(question)
            for piece_dict in _split_document(
                    id,
                    doc,
                    question,
                    answers,
                    answer_starts,
                    has_answer == "True",
                    self.ignore_impossible,
                    self.max_character_length,
                    self.min_overlap,
            ):
                piece_dict.update({
                    "start_logits": start_logits,
                    "end_logits": end_logits,
                    "has_answer_logits": has_answer_logits,
                    "pad_mask": pad_mask,
                    "segment_labels": segment_labels,
                })
                yield piece_dict
Example #21
    def __iter__(self):
        logger.debug(f"Initializing JSONL iterator for {self.path}.")
        file = SafeFileWrapper(self.path, encoding="utf-8")
        for line in file:
            yield json.loads(line)
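
For reference, a standalone stdlib equivalent of the JSONL iteration above (no SafeFileWrapper), assuming one JSON object per line:

import json

with open("data.jsonl", encoding="utf-8") as f:  # placeholder path
    rows = [json.loads(line) for line in f]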
Example #22
    def __iter__(self):
        logger.debug(f"Initializing TSV iterator for {self.path}.")
        file = SafeFileWrapper(self.path, encoding="utf-8")
        yield from csv.DictReader(
            file, delimiter=self.delimiter, fieldnames=self.column_names)
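
And the matching stdlib-only equivalent of the TSV iteration, assuming a tab-separated two-column file:

import csv

with open("data.tsv", encoding="utf-8") as f:  # placeholder path
    for row in csv.DictReader(f, delimiter="\t", fieldnames=["text", "label"]):
        print(row["text"], row["label"])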
Example #23
    def test_create_normalized_float_list_tensor(self):
        def round_list(l):
            return [float("%.4f" % n) for n in l]

        data = TSVDataSource(
            SafeFileWrapper(tests_module.test_file("train_dense_features_tiny.tsv")),
            eval_file=None,
            field_names=["label", "slots", "text", "dense_feat"],
            schema={"text": str, "label": str, "dense_feat": List[float]},
        )
        tensorizer = FloatListTensorizer(
            column="dense_feat", dim=10, error_check=True, normalize=True
        )
        self._initialize_tensorizer(tensorizer, data)
        self.assertEqual(10, tensorizer.normalizer.num_rows)
        self.assertEqual(
            round_list(
                [
                    7.56409,
                    8.2388,
                    0.5531,
                    0.2403,
                    1.03130,
                    6.2888,
                    3.1595,
                    0.1538,
                    0.2403,
                    5.3463,
                ]
            ),
            round_list(tensorizer.normalizer.feature_sums),
        )
        self.assertEqual(
            round_list(
                [
                    5.80172,
                    7.57586,
                    0.30591,
                    0.05774,
                    0.52762,
                    5.22811,
                    2.51727,
                    0.02365,
                    0.05774,
                    4.48798,
                ]
            ),
            round_list(tensorizer.normalizer.feature_squared_sums),
        )
        self.assertEqual(
            round_list(
                [
                    0.75640,
                    0.82388,
                    0.05531,
                    0.02403,
                    0.10313,
                    0.62888,
                    0.31595,
                    0.01538,
                    0.02403,
                    0.53463,
                ]
            ),
            round_list(tensorizer.normalizer.feature_avgs),
        )
        self.assertEqual(
            round_list(
                [
                    0.08953,
                    0.28072,
                    0.16593,
                    0.07209,
                    0.20524,
                    0.35682,
                    0.38974,
                    0.04614,
                    0.07209,
                    0.40369,
                ]
            ),
            round_list(tensorizer.normalizer.feature_stddevs),
        )

        row = [0.64840776, 0.7575, 0.5531, 0.2403, 0, 0.9481, 0, 0.1538, 0.2403, 0.3564]
        output = tensorizer.numberize({"dense_feat": row})

        self.assertEqual(
            round_list(
                [
                    -1.20619,
                    -0.23646,
                    2.99999,
                    3.0,
                    -0.50246,
                    0.89462,
                    -0.81066,
                    2.99999,
                    3.0,
                    -0.44149,
                ]
            ),
            round_list(output),
        )
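
The assertions above are consistent with plain z-scoring, output[i] = (row[i] - mean[i]) / stddev[i], apparently clamped to about ±3 standard deviations (note the repeated 2.99999/3.0 outputs). A quick check of feature 0 against the asserted means and stddevs:

mean, stddev = 0.75640, 0.08953                # feature 0, asserted above
print(round((0.64840776 - mean) / stddev, 5))  # ~ -1.2062, matching the asserted
                                               # -1.20619 up to rounding of mean/stddev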
Example #24
    def test_seq_tensor_with_bos_eos_eol_bol(self):
        tensorizer = SeqTokenTensorizer(
            add_bos_token=True,
            add_eos_token=True,
            add_bol_token=True,
            add_eol_token=True,
        )

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_seq_features.tsv")
            ),
            test_file=None,
            eval_file=None,
            field_names=["text_seq"],
            schema={"text_seq": List[str]},
        )

        self._initialize_tensorizer(tensorizer, data)
        # UNK + PAD + BOS + EOS + BOL + EOL + 6 tokens
        self.assertEqual(12, len(tensorizer.vocab))

        # only one row in test file:
        # ["where do you wanna meet?", "MPK"]
        for row in data.train:
            idx, sen_lens, lens = tensorizer.numberize(row)
            tokens, token_lens, seq_lens = tensorizer.prepare_input(row)
            self.assertEqual(4, lens)
            self.assertEqual(4, seq_lens)
            self.assertEqual([3, 7, 3, 3], token_lens)
            self.assertEqual(
                [
                    [2, 4, 3, 1, 1, 1, 1],
                    [2, 6, 7, 8, 9, 10, 3],
                    [2, 11, 3, 1, 1, 1, 1],
                    [2, 5, 3, 1, 1, 1, 1],
                ],
                idx,
            )
            self.assertEqual(
                [
                    [
                        "__BEGIN_OF_SENTENCE__",
                        "__BEGIN_OF_LIST__",
                        "__END_OF_SENTENCE__",
                        "__PAD__",
                        "__PAD__",
                        "__PAD__",
                        "__PAD__",
                    ],
                    [
                        "__BEGIN_OF_SENTENCE__",
                        "where",
                        "do",
                        "you",
                        "wanna",
                        "meet?",
                        "__END_OF_SENTENCE__",
                    ],
                    [
                        "__BEGIN_OF_SENTENCE__",
                        "mpk",
                        "__END_OF_SENTENCE__",
                        "__PAD__",
                        "__PAD__",
                        "__PAD__",
                        "__PAD__",
                    ],
                    [
                        "__BEGIN_OF_SENTENCE__",
                        "__END_OF_LIST__",
                        "__END_OF_SENTENCE__",
                        "__PAD__",
                        "__PAD__",
                        "__PAD__",
                        "__PAD__",
                    ],
                ],
                tokens,
            )