def __init__(self,
             filename: str,
             tokenizers: Dict[str, BaseTokenizer] = None):
    # default to a single word-token namespace so the default matches the annotation
    if tokenizers is None:
        tokenizers = {"tokens": WordTokenizer()}
    super().__init__(filename, tokenizers)
    self.filename = filename
    self.tokenizers = tokenizers
    self.lines, self.labels = self.get_lines_labels()
Example 2
    def __init__(self,
                 text: str,
                 context: List[str],
                 tokenizers: Dict[str, BaseTokenizer] = None):
        if tokenizers is None:
            tokenizers = {"tokens": WordTokenizer()}
        self.text = text
        self.context = context
        self.tokenizers = tokenizers
        self.tokens: Dict[str, List[Any]] = defaultdict(list)
        self.namespaces = list(tokenizers.keys())
        for namespace in tokenizers.keys():
            self.namespaces.append(f"contextual_{namespace}")

        # add tokens for the word tokens
        for namespace, tokenizer in self.tokenizers.items():
            tokens = tokenizer.tokenize(text)
            for token in tokens:
                self.add_token(token=token, namespace=namespace)

        # add tokens for the contextual lines
        for namespace, tokenizer in self.tokenizers.items():
            for contextual_line in self.context:
                tokens = tokenizer.tokenize(contextual_line)
                tokens = [Token(tok) for tok in tokens]
                self.tokens[f"contextual_{namespace}"].append(tokens)

        self.line = Line(text=text, tokenizers=self.tokenizers)
        self.context_lines = []
        for text in self.context:
            context_line = Line(text=text, tokenizers=self.tokenizers)
            self.context_lines.append(context_line)
Example 3
    def test_sents_word_tokenizers(self):
        sents = ["Nice people", "Great weather"]
        sent = SeqSentence(sents=sents, tokenizers={"tokens": WordTokenizer()})
        tokens = sent.tokens
        assert [[token.text for token in sent_tokens]
                for sent_tokens in tokens["tokens"]] == [["Nice", "people"],
                                                         ["Great", "weather"]]
Example 4
    def test_get_lines_labels_len(self, test_file):
        dataset = CoNLLDataset(filename=test_file,
                               tokenizers={"tokens": WordTokenizer()})

        lines, labels = dataset.get_lines_labels()
        assert len(lines) == 1
        assert len(labels) == 1
Example 5
    def test_spacy_whitespace_tokenizer(self):
        tokenizer = WordTokenizer(tokenizer="spacy-whitespace")
        tokenized = tokenizer.tokenize(
            "(1999). & P., W. The Control of Discrete Event Systems.")
        assert tokenized == [
            "(1999).",
            "&",
            "P.,",
            "W.",
            "The",
            "Control",
            "of",
            "Discrete",
            "Event",
            "Systems.",
        ]
Example 6
def get_tokenized_data(get_parsect_data):
    parsect_json = get_parsect_data
    parsect_lines = parsect_json["parse_sect"]
    parsect_lines = parsect_lines[:100]
    tokenizer = WordTokenizer()

    lines = []
    labels = []

    for line_json in parsect_lines:
        text = line_json["text"]
        label = line_json["label"]
        lines.append(text)
        labels.append(label)

    instances = tokenizer.tokenize_batch(lines)

    return instances, labels
Example 7
def conll_yago_dataset(request):
    train_filename = DATA_DIR.joinpath(request.param)
    dataset = ConllYagoDataset(
        filename=str(train_filename),
        tokenizers={"tokens": WordTokenizer(tokenizer="vanilla")},
        column_names=["NER"],
    )

    return dataset
Example 8
def setup_lines():
    texts = ["First sentence", "Second Sentence"]
    lines = []
    for text in texts:
        line = Line(
            text=text,
            tokenizers={"tokens": WordTokenizer(), "char_tokens": CharacterTokenizer()},
        )
        lines.append(line)
    return lines
Example 9
def lines():
    texts = ["First line", "Second Line which is longer"]
    lines = []
    for text in texts:
        line = Line(
            text=text, tokenizers={"tokens": WordTokenizer(tokenizer="vanilla")}
        )
        lines.append(line)

    return lines
Example 10
    def __init__(
        self,
        train_filename: str,
        dev_filename: str,
        test_filename: str,
        tokenizers: Dict[str, BaseTokenizer] = None,
        namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
        namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
        batch_size: int = 10,
    ):

        self.train_filename = train_filename
        self.dev_filename = dev_filename
        self.test_filename = test_filename
        self.tokenizers = tokenizers or {
            "tokens": WordTokenizer(tokenizer="vanilla"),
            "char_tokens": CharacterTokenizer(),
        }
        self.namespace_vocab_options = namespace_vocab_options or {
            "char_tokens": {
                "start_token": " ",
                "end_token": " ",
                "pad_token": " ",
                "unk_token": " ",
            }
        }
        self.namespace_numericalizer_map = namespace_numericalizer_map or {
            "tokens": Numericalizer(),
            "char_tokens": Numericalizer(),
        }
        self.namespace_numericalizer_map["seq_label"] = Numericalizer()

        self.batch_size = batch_size

        self.train_dataset = SeqLabellingDataset(
            filename=self.train_filename, tokenizers=self.tokenizers
        )

        self.dev_dataset = SeqLabellingDataset(
            filename=self.dev_filename, tokenizers=self.tokenizers
        )

        self.test_dataset = SeqLabellingDataset(
            filename=self.test_filename, tokenizers=self.tokenizers
        )

        super(SeqLabellingDatasetManager, self).__init__(
            train_dataset=self.train_dataset,
            dev_dataset=self.dev_dataset,
            test_dataset=self.test_dataset,
            namespace_vocab_options=self.namespace_vocab_options,
            namespace_numericalizer_map=self.namespace_numericalizer_map,
            batch_size=batch_size,
        )
Example 11
    def test_line_word_tokenizers(self):
        text = "This is a single line"
        line = Line(text=text, tokenizers={"tokens": WordTokenizer()})
        tokens = line.tokens
        assert [token.text for token in tokens["tokens"]] == [
            "This",
            "is",
            "a",
            "single",
            "line",
        ]
Example 12
    def test_get_item(self, test_file):
        dataset = SeqLabellingDataset(filename=str(test_file),
                                      tokenizers={"tokens": WordTokenizer()})
        num_instances = len(dataset)

        for idx in range(num_instances):
            line, label = dataset[idx]
            word_tokens = line.tokens["tokens"]
            label_tokens = label.tokens["seq_label"]
            print(f"label tokens {label.tokens}")
            assert len(word_tokens) == len(label_tokens)
Example 13
    def __init__(self, sents: List[str], tokenizers: Dict[str, BaseTokenizer] = None):
        if tokenizers is None:
            tokenizers = {"tokens": WordTokenizer()}
        self.sents = sents
        self.tokenizers = tokenizers
        self.tokens: Dict[str, List[List[Any]]] = defaultdict(list)
        self.namespaces = list(tokenizers.keys())

        for namespace, tokenizer in tokenizers.items():
            for sent in sents:
                sent_tokens = tokenizer.tokenize(sent)
                self.add_sent_tokens(tokens=sent_tokens, namespace=namespace)
Example 14
File: line.py Project: yyht/sciwing
    def __init__(self, text: str, tokenizers: Dict[str, BaseTokenizer] = None):
        if tokenizers is None:
            tokenizers = {"tokens": WordTokenizer()}
        self.text = text
        self.tokenizers = tokenizers
        self.tokens: Dict[str, List[Any]] = defaultdict(list)
        self.namespaces = list(tokenizers.keys())

        for namespace, tokenizer in tokenizers.items():
            tokens = tokenizer.tokenize(text)
            for token in tokens:
                self.add_token(token=token, namespace=namespace)
Example 15
    def test_get_item(self, test_file):
        classification_dataset = TextClassificationDataset(
            filename=str(test_file), tokenizers={"tokens": WordTokenizer()})
        num_instances = len(classification_dataset)
        tokens = ["line1", "line2"]
        line_tokens = []
        for idx in range(num_instances):
            line, label = classification_dataset[idx]
            line_tokens.extend(line.tokens["tokens"])

        line_tokens = list(map(lambda token: token.text, line_tokens))

        assert set(tokens) == set(line_tokens)
Example 16
    def test_restricted_namespaces(self, test_file, train_only):
        dataset = CoNLLDataset(
            filename=test_file,
            tokenizers={"tokens": WordTokenizer()},
            column_names=["POS", "DEP", "NER"],
            train_only=train_only,
        )
        lines, labels = dataset.get_lines_labels()

        for label in labels:
            namespaces = label.namespace
            assert len(namespaces) == 1
            assert train_only.upper() in namespaces
Example 17
    def test_labels_namespaces(self, test_file):
        dataset = CoNLLDataset(
            filename=test_file,
            tokenizers={"tokens": WordTokenizer()},
            column_names=["NER", "POS", "DEP"],
        )
        lines, labels = dataset.get_lines_labels()
        for label in labels:
            namespaces = label.namespace
            assert len(namespaces) == 3
            assert "NER" in namespaces
            assert "POS" in namespaces
            assert "DEP" in namespaces
Example 18
    def test_len_lines_labels_equal(self, test_file):
        dataset = CoNLLDataset(
            filename=test_file,
            tokenizers={"tokens": WordTokenizer()},
            column_names=["NER", "POS", "DEP"],
        )
        lines, labels = dataset.get_lines_labels()
        for line, label in zip(lines, labels):
            line_tokens = line.tokens["tokens"]
            labels_ner = label.tokens["NER"]
            labels_pos = label.tokens["POS"]
            labels_dep = label.tokens["DEP"]
            assert (len(line_tokens) == len(labels_ner) == len(labels_pos) ==
                    len(labels_dep))
Example 19
    def __init__(
        self,
        train_filename: str,
        dev_filename: str,
        test_filename: str,
        tokenizers: Dict[str, BaseTokenizer] = None,
        namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
        namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
        batch_size: int = 10,
    ):
        self.train_filename = train_filename
        self.dev_filename = dev_filename
        self.test_filename = test_filename
        self.tokenizers = tokenizers or {
            "tokens": WordTokenizer(),
            "char_tokens": CharacterTokenizer(),
        }
        self.namespace_vocab_options = namespace_vocab_options or {
            "char_tokens": {
                "start_token": " ",
                "end_token": " ",
                "pad_token": " ",
                "unk_token": " ",
            },
            "label": {
                "include_special_vocab": False
            },
        }
        self.namespace_numericalizer_map = namespace_numericalizer_map or {
            "tokens": Numericalizer(),
            "char_tokens": Numericalizer(),
        }
        self.namespace_numericalizer_map["label"] = Numericalizer()
        self.batch_size = batch_size

        self.train_dataset = TextClassificationDataset(
            filename=self.train_filename, tokenizers=self.tokenizers)
        self.dev_dataset = TextClassificationDataset(
            filename=self.dev_filename, tokenizers=self.tokenizers)
        self.test_dataset = TextClassificationDataset(
            filename=self.test_filename, tokenizers=self.tokenizers)

        super(TextClassificationDatasetManager, self).__init__(
            train_dataset=self.train_dataset,
            dev_dataset=self.dev_dataset,
            test_dataset=self.test_dataset,
            namespace_vocab_options=self.namespace_vocab_options,
            namespace_numericalizer_map=self.namespace_numericalizer_map,
            batch_size=batch_size,
        )
Example 20
    def test_line_char_tokenizer(self):
        text = "Word"
        line = Line(
            text=text,
            tokenizers={
                "tokens": WordTokenizer(),
                "chars": CharacterTokenizer()
            },
        )
        tokens = line.tokens
        word_tokens = tokens["tokens"]
        char_tokens = tokens["chars"]

        word_tokens = [tok.text for tok in word_tokens]
        char_tokens = [tok.text for tok in char_tokens]

        assert word_tokens == ["Word"]
        assert char_tokens == ["W", "o", "r", "d"]
Example 21
def setup_char_embedder(request, clf_dataset_manager):
    char_embedding_dim, hidden_dim = request.param
    dataset_manager = clf_dataset_manager
    embedder = CharEmbedder(
        char_embedding_dimension=char_embedding_dim,
        hidden_dimension=hidden_dim,
        datasets_manager=dataset_manager,
    )
    )
    texts = ["This is sentence", "This is another sentence"]
    lines = []
    for text in texts:
        line = Line(
            text=text,
            tokenizers={"tokens": WordTokenizer(), "char_tokens": CharacterTokenizer()},
        )
        lines.append(line)

    return embedder, lines
Example 22
    def test_get_item(self, test_file):
        summarization_dataset = AbstractiveSummarizationDataset(
            filename=str(test_file), tokenizers={"tokens": WordTokenizer()}
        )
        num_instances = len(summarization_dataset)
        defined_line_tokens = ["word11_train", "word21_train", "word12_train", "word22_train", "word32_train"]
        defined_label_tokens = ["word11_label", "word12_label", "word21_label"]
        line_tokens = []
        label_tokens = []
        for idx in range(num_instances):
            line, label = summarization_dataset[idx]
            line_tokens.extend(line.tokens["tokens"])
            label_tokens.extend(label.tokens["tokens"])

        line_tokens = list(map(lambda token: token.text, line_tokens))
        label_tokens = list(map(lambda token: token.text, label_tokens))

        assert set(defined_line_tokens) == set(line_tokens)
        assert set(defined_label_tokens) == set(label_tokens)
Example 23
    def test_sents_char_tokenizer(self):
        sents = ["Hello", "World"]
        sent = SeqSentence(
            sents=sents,
            tokenizers={
                "tokens": WordTokenizer(),
                "chars": CharacterTokenizer()
            },
        )
        tokens = sent.tokens
        word_tokens = tokens["tokens"]
        char_tokens = tokens["chars"]

        word_tokens = [[tok.text for tok in sent_word_tokens]
                       for sent_word_tokens in word_tokens]
        char_tokens = [[tok.text for tok in sent_char_tokens]
                       for sent_char_tokens in char_tokens]

        assert word_tokens == [["Hello"], ["World"]]
        assert char_tokens == [["H", "e", "l", "l", "o"],
                               ["W", "o", "r", "l", "d"]]
Example 24
    def __init__(
        self,
        filename: str,
        dataset_type: str,
        max_num_words: int,
        max_instance_length: int,
        word_vocab_store_location: str,
        debug: bool = False,
        debug_dataset_proportion: float = 0.1,
        word_embedding_type: Union[str, None] = None,
        word_embedding_dimension: Union[int, None] = None,
        word_start_token: str = "<SOS>",
        word_end_token: str = "<EOS>",
        word_pad_token: str = "<PAD>",
        word_unk_token: str = "<UNK>",
        train_size: float = 0.8,
        test_size: float = 0.2,
        validation_size: float = 0.5,
        word_tokenizer=WordTokenizer(),
        word_tokenization_type="vanilla",
    ):
        """ Base Text Classification Dataset to be inherited by all text classification datasets

        Parameters
        ----------
        filename : str
            Path of the file where the text classification dataset is stored. Ideally this should
            have an example text and label separated by a space, but it is left to the specific
            dataset to handle the different ways in which the file could be structured
        dataset_type : str
            One of ``[train, valid, test]``
        max_num_words : int
            The top ``max_num_words`` will be considered for building vocab
        max_instance_length : int
            Every instance in the dataset will be padded or curtailed to ``max_instance_length``
            tokens
        word_vocab_store_location : str
            Vocabulary once built will be stored in this location
            If the vocabulary already exists then it will be loaded from the filepath
        debug : bool
            Useful for building a small dataset for debugging. If ``True``, a smaller random
            subset of the dataset is returned, with ``debug_dataset_proportion`` controlling the
            proportion that is kept
        debug_dataset_proportion : float
            Proportion of the dataset that will be returned for debugging. Should be between 0 and 1
        word_embedding_type : str
            The kind of word embedding that will be associated with the words in the dataset.
            Any of the ``allowed_types`` in vocab.EmbeddingLoader is allowed here
        word_embedding_dimension : int
            Dimension of word embedding
        word_start_token : str
            Start token appended at the beginning of every instance
        word_end_token : str
            End token appended at the end of every instance
        word_pad_token : str
            Pad token to be used for padding
        word_unk_token : str
            All OOV words (words that fall outside the top ``max_num_words`` most frequent words,
            or that appear in test but not in train) will be mapped to ``word_unk_token``
        train_size : float
            Proportion of the instances to be used for training
        test_size : float
            Remaining proportion that will be used for testing
        validation_size : float
            Proportion of the test data that will be used for validation
        word_tokenizer : WordTokenizer
            Word Tokenizer to be used for the dataset. You can reference
            ``tokenizers.WordTokenizer`` for more information
        word_tokenization_type : str
            The type of word tokenization that the word tokenizer represents
        """
        pass
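For orientation, the parameters documented above combine roughly as follows. This is a minimal usage sketch, not code from the sciwing sources: the subclass name and file paths are hypothetical placeholders, WordTokenizer is assumed to be imported as in the surrounding examples, and only the keyword arguments come from the base-class signature shown above (the first five are also the arguments that the decorator in Example 28 below requires to be passed explicitly).

# Minimal sketch; MyClassificationDataset is a hypothetical concrete subclass
# of the base text classification dataset documented in Example 24.
dataset = MyClassificationDataset(
    filename="data/train.txt",               # placeholder path: one text/label example per line
    dataset_type="train",                    # one of [train, valid, test]
    max_num_words=10000,                     # keep the top 10000 words when building the vocab
    max_instance_length=50,                  # pad or curtail every instance to 50 tokens
    word_vocab_store_location="vocab.json",  # placeholder path; the vocab is cached and reloaded from here
    debug=False,
    debug_dataset_proportion=0.1,
    train_size=0.8,
    test_size=0.2,
    validation_size=0.5,
    word_tokenizer=WordTokenizer(),
    word_tokenization_type="vanilla",
)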
Example 25
    def test_len(self, test_file):
        dataset = SeqLabellingDataset(filename=str(test_file),
                                      tokenizers={"tokens": WordTokenizer()})
        assert len(dataset) == 2
Example 26
    def test_get_lines_labels(self, test_file):
        dataset = SeqLabellingDataset(filename=str(test_file),
                                      tokenizers={"tokens": WordTokenizer()})
        lines, labels = dataset.get_lines_labels()
        assert len(lines) == 2
Example 27
    def __init__(
        self,
        train_filename: str,
        dev_filename: str,
        test_filename: str,
        tokenizers: Dict[str, BaseTokenizer] = None,
        namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
        namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
        batch_size=10,
        column_names: List[str] = None,
        train_only: Optional[str] = None,
    ):

        self.train_filename = train_filename
        self.dev_filename = dev_filename
        self.test_filename = test_filename
        self.tokenizers = tokenizers or {
            "tokens": WordTokenizer(tokenizer="vanilla"),
            "char_tokens": CharacterTokenizer(),
        }

        if namespace_vocab_options is None:
            namespace_vocab_options = {}

        namespace_vocab_options_defaults = {
            "char_tokens": {
                "start_token": " ",
                "end_token": " ",
                "pad_token": " ",
                "unk_token": " ",
            }
        }
        self.namespace_vocab_options = {}

        vocab_namespaces = set(namespace_vocab_options.keys()).union(
            namespace_vocab_options_defaults.keys())

        for namespace in vocab_namespaces:
            user_passed = namespace_vocab_options.get(namespace, {})
            defaults = namespace_vocab_options_defaults.get(namespace, {})
            self.namespace_vocab_options[namespace] = {
                **defaults,
                **user_passed
            }

        self.namespace_numericalizer_map = namespace_numericalizer_map or {
            "tokens": Numericalizer(),
            "char_tokens": Numericalizer(),
        }

        self.batch_size = batch_size

        if column_names is None:
            column_names = ["label_1"]

        valid_column_names = [column_names[0]]

        for column_name in valid_column_names:
            self.namespace_numericalizer_map[column_name] = Numericalizer()

        self.train_dataset = BioNerDataset(
            filename=self.train_filename,
            tokenizers=self.tokenizers,
            column_names=column_names,
            train_only=train_only,
        )

        self.dev_dataset = BioNerDataset(
            filename=self.dev_filename,
            tokenizers=self.tokenizers,
            column_names=column_names,
            train_only=train_only,
        )

        self.test_dataset = BioNerDataset(
            filename=self.test_filename,
            tokenizers=self.tokenizers,
            column_names=column_names,
            train_only=train_only,
        )

        super(BioNERDatasetManager, self).__init__(
            train_dataset=self.train_dataset,
            dev_dataset=self.dev_dataset,
            test_dataset=self.test_dataset,
            namespace_vocab_options=self.namespace_vocab_options,
            namespace_numericalizer_map=self.namespace_numericalizer_map,
            batch_size=batch_size,
        )
Example 28
    def __call__(self, wrapped, instance, args, kwargs):
        self.wrapped_cls = wrapped
        self.init_signature = inspect.signature(wrapped.__init__)
        instance = wrapped(*args, **kwargs)
        for idx, (name, param) in enumerate(self.init_signature.parameters.items()):
            if name == "self":
                continue

            # These are values that must be passed
            if name in [
                "filename",
                "dataset_type",
                "max_num_words",
                "max_instance_length",
                "word_vocab_store_location",
            ]:
                try:
                    value = args[idx]
                except IndexError:
                    try:
                        value = kwargs[name]
                    except KeyError:
                        raise ValueError(
                            f"Dataset {self.wrapped_cls.__name__} should be instantiated with {name}"
                        )
                if self.autoset_attrs:
                    setattr(instance, name, value)
                setattr(self, name, value)

            # These can be passed but have default values
            else:
                try:
                    value = args[idx]
                except IndexError:
                    try:
                        value = kwargs[name]
                    except KeyError:
                        value = param.default

                if self.autoset_attrs:
                    setattr(instance, name, value)
                setattr(self, name, value)

        # set the lines and labels
        self.lines, self.labels = instance.get_lines_labels(self.filename)
        self.word_instances = None
        self.word_vocab = None

        if "word_vocab" in self.vocab_pipe:
            self.word_tokenizer = WordTokenizer(self.word_tokenization_type)
            self.set_word_vocab()
            instance.word_tokenizer = self.word_tokenizer
            instance.word_numericalizer = self.word_numericalizer
            instance.word_vocab = copy.deepcopy(self.word_vocab)
            instance.word_instances = copy.deepcopy(self.word_instances)
            instance.num_instances = len(self.word_instances)
            instance.instance_max_len = max(
                [len(instance) for instance in self.word_instances]
            )

        if "char_vocab" in self.vocab_pipe:
            self.char_tokenizer = CharacterTokenizer()
            self.set_char_vocab()
            instance.char_vocab = copy.deepcopy(self.char_vocab)
            instance.char_instances = copy.deepcopy(self.char_instances)
            instance.char_tokenizer = self.char_tokenizer
            instance.char_numericalizer = self.char_numericalizer

        if self.is_get_label_stats_table:
            label_stats_table = self._get_label_stats_table()
            instance.label_stats_table = label_stats_table

        return instance
Example 29
    def __init__(
        self,
        train_filename: str,
        dev_filename: str,
        test_filename: str,
        tokenizers: Dict[str, BaseTokenizer] = None,
        namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
        namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
        batch_size=10,
        column_names: List[str] = None,
    ):
        self.train_filename = train_filename
        self.dev_filename = dev_filename
        self.test_filename = test_filename
        self.tokenizers = tokenizers or {
            "tokens": WordTokenizer(tokenizer="vanilla"),
            "char_tokens": CharacterTokenizer(),
        }

        namespace_vocab_options_defaults = {
            "char_tokens": {
                "start_token": " ",
                "end_token": " ",
                "pad_token": " ",
                "unk_token": " ",
            }
        }

        if namespace_vocab_options is None:
            namespace_vocab_options = {}

        self.namespace_vocab_options = copy.deepcopy(
            namespace_vocab_options_defaults)

        for namespace, options in self.namespace_vocab_options.items():
            user_passed = namespace_vocab_options.get(namespace, {})
            self.namespace_vocab_options[namespace] = {
                **options,
                **user_passed
            }

        self.namespace_numericalizer_map = namespace_numericalizer_map or {
            "tokens": Numericalizer(),
            "char_tokens": Numericalizer(),
        }

        self.batch_size = batch_size

        if column_names is None:
            column_names = ["NER"]

        for column_name in column_names:
            self.namespace_numericalizer_map[column_name] = Numericalizer()

        self.train_dataset = ConllYagoDataset(
            filename=self.train_filename,
            tokenizers=self.tokenizers,
            column_names=column_names,
        )

        self.dev_dataset = ConllYagoDataset(
            filename=self.dev_filename,
            tokenizers=self.tokenizers,
            column_names=column_names,
        )

        self.test_dataset = ConllYagoDataset(
            filename=self.test_filename,
            tokenizers=self.tokenizers,
            column_names=column_names,
        )

        super(ConllYagoDatasetsManager, self).__init__(
            train_dataset=self.train_dataset,
            dev_dataset=self.dev_dataset,
            test_dataset=self.test_dataset,
            namespace_vocab_options=self.namespace_vocab_options,
            namespace_numericalizer_map=self.namespace_numericalizer_map,
            batch_size=batch_size,
        )
Example 30
    def test_get_item(self, test_file):
        dataset = ExtractiveSummarizationDataset(
            filename=str(test_file), tokenizers={"tokens": WordTokenizer()})

        doc0, label0, ref0 = dataset[0]
        assert len(doc0) == len(label0.tokens["seq_label"])