def __init__(
    self,
    text: Optional[Union[str, List[str]]] = None,
    use_tokenizer: Union[bool, Tokenizer] = True,
    language_code: Optional[str] = None,
    start_position: Optional[int] = None,
):
    """
    Class to hold all metadata related to a text (tokens, predictions, language code, ...).

    :param text: original string (sentence), or a list of string tokens (words)
    :param use_tokenizer: a custom tokenizer (default is :class:`SegtokTokenizer`). Other options are
        :class:`SpaceTokenizer` to split on whitespace only, or :class:`SpacyTokenizer` to use the spaCy
        library (if available). Check the implementations of the abstract class Tokenizer, or implement
        your own subclass if you need to. If, instead of a Tokenizer, this parameter is simply set to
        True (deprecated), :class:`SegtokTokenizer` will be used.
    :param language_code: language of the sentence
    :param start_position: start character offset of the sentence in the superordinate document
    """
    super().__init__()

    self.tokens: List[Token] = []
    self._embeddings: Dict = {}
    self.language_code: Optional[str] = language_code
    self.start_pos = start_position
    self.end_pos = (
        start_position + len(text) if start_position is not None else None
    )

    # resolve the tokenizer: a Tokenizer instance, a plain callable (deprecated), or a bool
    if isinstance(use_tokenizer, Tokenizer):
        tokenizer = use_tokenizer
    elif callable(use_tokenizer):
        from flair.tokenization import TokenizerWrapper

        tokenizer = TokenizerWrapper(use_tokenizer)
    elif isinstance(use_tokenizer, bool):
        from flair.tokenization import SegtokTokenizer, SpaceTokenizer

        tokenizer = SegtokTokenizer() if use_tokenizer else SpaceTokenizer()
    else:
        raise AssertionError(
            "Unexpected type of parameter 'use_tokenizer'. "
            "Parameter should be bool, Callable[[str], List[Token]] (deprecated) or Tokenizer"
        )

    # if text is passed, instantiate sentence with tokens (words)
    if text is not None:
        if isinstance(text, (list, tuple)):
            # text is already a list of words: add them as tokens directly
            for token in text:
                self.add_token(self._restore_windows_1252_characters(token))
        else:
            # text is a plain string: tokenize it first
            text = self._restore_windows_1252_characters(text)
            for token in tokenizer.tokenize(text):
                self.add_token(token)

    # log a warning if the dataset is empty
    if text == "":
        log.warning(
            "Warning: An empty Sentence was created! Are there empty strings in your dataset?"
        )

    self.tokenized = None
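# A minimal usage sketch of the constructor above (assuming the usual flair import paths,
# i.e. Sentence/Token in flair.data and the tokenizers in flair.tokenization):
from flair.data import Sentence, Token
from flair.tokenization import SegtokTokenizer, SpaceTokenizer

# default: use_tokenizer=True (deprecated shorthand) falls back to SegtokTokenizer
sentence = Sentence("I love Berlin.")

# explicit Tokenizer instances are passed through unchanged
sentence_space = Sentence("I love Berlin.", use_tokenizer=SpaceTokenizer())
sentence_segtok = Sentence("I love Berlin.", use_tokenizer=SegtokTokenizer())

# a plain callable is wrapped in a TokenizerWrapper internally (deprecated)
sentence_callable = Sentence("I love Berlin.", use_tokenizer=lambda text: [Token(text, 0)])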
def test_newline_sentence_splitter():
    newline_splitter = NewlineSentenceSplitter()

    sentences = newline_splitter.split("I love Berlin\nMe too")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 3
    assert sentences[1].start_pos == 14
    assert len(sentences[1].tokens) == 2

    newline_splitter = NewlineSentenceSplitter(tokenizer=TokenizerWrapper(no_op_tokenizer))

    sentences = newline_splitter.split("I love Berlin\nMe too")
    assert len(sentences) == 2
    assert len(sentences[0].tokens) == 1
    assert sentences[1].start_pos == 14
    assert len(sentences[1].tokens) == 1

    sentences = newline_splitter.split("I love Berlin Me too")
    assert len(sentences) == 1

    sentences = newline_splitter.split("I love Berlin\n\nMe too")
    assert len(sentences) == 2

    sentences = newline_splitter.split("I love Berlin\n \nMe too")
    assert len(sentences) == 2
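# The splitter tests above and below pass a TokenizerWrapper(no_op_tokenizer); the helper itself is
# not shown in this excerpt. A minimal sketch of what it presumably looks like, given that the
# single-token assertions imply the whole text comes back as one Token (name and signature assumed):
def no_op_tokenizer(text: str) -> List[Token]:
    return [Token(text, idx=0, start_position=0)]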
def test_tag_sentence_splitter():
    tag_splitter = TagSentenceSplitter(tag="#!")

    sentences = tag_splitter.split("I love Berlin#!Me too")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 3
    assert sentences[1].start_pos == 15
    assert len(sentences[1].tokens) == 2

    tag_splitter = TagSentenceSplitter(tag="#!", tokenizer=TokenizerWrapper(no_op_tokenizer))

    sentences = tag_splitter.split("I love Berlin#!Me too")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 1
    assert sentences[1].start_pos == 15
    assert len(sentences[1].tokens) == 1

    sentences = tag_splitter.split("I love Berlin Me too")
    assert len(sentences) == 1

    sentences = tag_splitter.split("I love Berlin#!#!Me too")
    assert len(sentences) == 2

    sentences = tag_splitter.split("I love Berlin#! #!Me too")
    assert len(sentences) == 2
def test_create_sentence_with_custom_tokenizer():
    def custom_tokenizer(text: str) -> List[Token]:
        return [Token(text, 0)]

    sentence: Sentence = Sentence("I love Berlin.", use_tokenizer=TokenizerWrapper(custom_tokenizer))
    assert 1 == len(sentence.tokens)
    assert "I love Berlin." == sentence.tokens[0].text
def test_split_text_spacy():
    spacy_splitter = SpacySentenceSplitter("en_core_sci_sm")

    sentences = spacy_splitter.split("This a sentence. And here is another one.")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 4
    assert sentences[1].start_pos == 17
    assert len(sentences[1].tokens) == 6

    sentences = spacy_splitter.split("VF inhibits something. ACE-dependent (GH+) issuses too.")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 4
    assert sentences[1].start_pos == 23
    assert len(sentences[1].tokens) == 7

    spacy_splitter = SpacySentenceSplitter("en_core_sci_sm", tokenizer=TokenizerWrapper(no_op_tokenizer))

    sentences = spacy_splitter.split("This a sentence. And here is another one.")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 1
    assert sentences[1].start_pos == 17
    assert len(sentences[1].tokens) == 1
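# "en_core_sci_sm" is a scispacy model, so the test above only works if that model is installed.
# A hedged sketch of a skip guard one could use (hypothetical helper, not part of the original tests):
import pytest

def _require_en_core_sci_sm():
    spacy = pytest.importorskip("spacy")
    try:
        spacy.load("en_core_sci_sm")
    except OSError:
        pytest.skip("scispacy model 'en_core_sci_sm' is not installed")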
def test_no_sentence_splitter():
    no_splitter = NoSentenceSplitter()

    sentences = no_splitter.split("I love Berlin")
    assert len(sentences) == 1
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 3

    no_splitter = NoSentenceSplitter(TokenizerWrapper(no_op_tokenizer))

    sentences = no_splitter.split("I love Berlin")
    assert len(sentences) == 1
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 1
def test_segtok_sentence_splitter():
    segtok_splitter = SegtokSentenceSplitter()

    sentences = segtok_splitter.split("I love Berlin. Berlin is a great city.")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 4
    assert sentences[1].start_pos == 15
    assert len(sentences[1].tokens) == 6

    segtok_splitter = SegtokSentenceSplitter(tokenizer=TokenizerWrapper(no_op_tokenizer))

    sentences = segtok_splitter.split("I love Berlin. Berlin is a great city.")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 1
    assert sentences[1].start_pos == 15
    assert len(sentences[1].tokens) == 1
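# A short usage sketch beyond the assertions above (the import path is assumed to be
# flair.tokenization, as for the tokenizers; adjust if your flair version places splitters elsewhere):
from flair.tokenization import SegtokSentenceSplitter

splitter = SegtokSentenceSplitter()
for sentence in splitter.split("I love Berlin. Berlin is a great city."):
    # each Sentence carries its character offset in the document and its tokens
    print(sentence.start_pos, [token.text for token in sentence.tokens])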
def test_conll_writer_whitespace_after():
    text = f"A sentence with cardio-dependent. {SENTENCE_TAG}Clark et al. reported that"
    dataset = InternalBioNerDataset(
        documents={"1": text},
        entities_per_document={"1": []},
    )

    assert_conll_writer_output(
        dataset,
        [
            "A O +",
            "sentence O +",
            "with O +",
            "cardio O -",
            "dependent. O +",
            "Clark O +",
            "et O +",
            "al. O +",
            "reported O +",
            "that O -",
        ],
        TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=TokenizerWrapper(simple_tokenizer)),
    )
def test_create_sentence_with_custom_tokenizer():
    sentence: Sentence = Sentence("I love Berlin.", use_tokenizer=TokenizerWrapper(no_op_tokenizer))
    assert 1 == len(sentence.tokens)
    assert 0 == sentence.tokens[0].start_pos
    assert "I love Berlin." == sentence.tokens[0].text