def _tokenize(text: str, label: List[str]) -> List[str]:
    tokens = SpaceTokenizer().run_tokenize(text)
    token_texts = [token.text for token in tokens]
    assert len(token_texts) == len(label), "Tokenization does not match the available labels"
    return token_texts
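# Illustrative usage sketch (not part of the original code): demonstrates the contract
# enforced by _tokenize above, i.e. exactly one label per space-separated token.
# The example text and labels are made up, and _tokenize is assumed to be in scope.
example_tokens = _tokenize("I love Berlin", ["O", "O", "B-LOC"])
print(example_tokens)  # -> ['I', 'love', 'Berlin']
# A mismatched label list, e.g. _tokenize("I love Berlin", ["O", "O"]), fails the assertion.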
def __init__(
    self,
    text: Union[str, List[str]] = None,
    use_tokenizer: Union[bool, Tokenizer] = True,
    language_code: str = None,
    start_position: int = None
):
    """
    Class to hold all metadata related to a text (tokens, predictions, language code, ...)
    :param text: original string (sentence), or a list of string tokens (words)
    :param use_tokenizer: a custom tokenizer. More advanced options are :class:`SegtokTokenizer`
        to use segtok or :class:`SpacyTokenizer` to use the Spacy library (if available).
        Check the implementations of the abstract class Tokenizer or implement your own
        subclass if you need to. If, instead of providing a Tokenizer, this parameter is set
        to True (deprecated), :class:`SegtokTokenizer` will be used; if set to False,
        :class:`SpaceTokenizer` will be used.
    :param language_code: language of the sentence
    :param start_position: start char offset of the sentence in the superordinate document
    """
    super().__init__()

    self.tokens: List[Token] = []
    self._embeddings: Dict = {}
    self.language_code: str = language_code
    self.start_pos = start_position
    self.end_pos = (
        start_position + len(text) if start_position is not None else None
    )

    if isinstance(use_tokenizer, Tokenizer):
        tokenizer = use_tokenizer
    elif callable(use_tokenizer):
        from flair.tokenization import TokenizerWrapper
        tokenizer = TokenizerWrapper(use_tokenizer)
    elif isinstance(use_tokenizer, bool):
        from flair.tokenization import SegtokTokenizer, SpaceTokenizer
        tokenizer = SegtokTokenizer() if use_tokenizer else SpaceTokenizer()
    else:
        raise AssertionError(
            "Unexpected type of parameter 'use_tokenizer'. "
            "Parameter should be bool, Callable[[str], List[Token]] (deprecated) or Tokenizer"
        )

    # if text is passed, instantiate sentence with tokens (words)
    if text is not None:
        if isinstance(text, (list, tuple)):
            for token in text:
                self.add_token(self._restore_windows_1252_characters(token))
        else:
            text = self._restore_windows_1252_characters(text)
            for token in tokenizer.tokenize(text):
                self.add_token(token)

    # log a warning if the sentence is empty
    if text == "":
        log.warning(
            "Warning: An empty Sentence was created! Are there empty strings in your dataset?"
        )

    self.tokenized = None
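# Illustrative usage sketch (not part of the original code): shows the accepted forms of
# 'use_tokenizer' handled by the constructor above, using flair's public API. The example
# sentences are made up; exact tokenization may vary slightly between flair versions.
from flair.data import Sentence
from flair.tokenization import SegtokTokenizer, SpaceTokenizer

s1 = Sentence("I love Berlin.", use_tokenizer=SegtokTokenizer())  # explicit Tokenizer instance
s2 = Sentence("I love Berlin .", use_tokenizer=SpaceTokenizer())  # plain whitespace splitting
s3 = Sentence("I love Berlin.", use_tokenizer=True)               # deprecated bool: True -> SegtokTokenizer
print([token.text for token in s1.tokens])  # -> ['I', 'love', 'Berlin', '.']
print([token.text for token in s2.tokens])  # -> ['I', 'love', 'Berlin', '.']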
def test_token_positions_when_creating_with_tokenizer():
    sentence = Sentence("I love Berlin .", use_tokenizer=SpaceTokenizer())

    assert 0 == sentence.tokens[0].start_position
    assert 1 == sentence.tokens[0].end_position
    assert 2 == sentence.tokens[1].start_position
    assert 6 == sentence.tokens[1].end_position
    assert 7 == sentence.tokens[2].start_position
    assert 13 == sentence.tokens[2].end_position

    # note the leading space and the double space before "Berlin":
    # the character offsets asserted below depend on them
    sentence = Sentence(" I love  Berlin.", use_tokenizer=SegtokTokenizer())

    assert 1 == sentence.tokens[0].start_position
    assert 2 == sentence.tokens[0].end_position
    assert 3 == sentence.tokens[1].start_position
    assert 7 == sentence.tokens[1].end_position
    assert 9 == sentence.tokens[2].start_position
    assert 15 == sentence.tokens[2].end_position
def __init__(
    self,
    texts: Union[str, List[str]],
    use_tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(),
):
    """
    Instantiate StringDataset
    :param texts: a string or a list of strings that make up the StringDataset
    :param use_tokenizer: custom tokenizer to use (default is SpaceTokenizer; more advanced
        options are SegtokTokenizer to use segtok or SpacyTokenizer to use Spacy library
        models if available). Check the code of the subclasses of Tokenizer to implement
        your own (if you need it). If instead of providing a function, this parameter is
        just set to True, SegtokTokenizer will be used.
    """
    # cast a single string to a list if necessary
    if isinstance(texts, str):
        texts = [texts]
    self.texts = texts
    self.use_tokenizer = use_tokenizer
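# Illustrative usage sketch (not part of the original code): assumed usage of StringDataset,
# where items are tokenized into Sentence objects only when accessed. The texts are made up,
# and the import path assumes StringDataset is exposed via flair.datasets as in recent releases.
from flair.datasets import StringDataset
from flair.tokenization import SegtokTokenizer

dataset = StringDataset(["I love Berlin.", "Paris is nice too."], use_tokenizer=SegtokTokenizer())
first_sentence = dataset[0]              # tokenization happens on access
print(len(dataset), first_sentence)      # -> 2 and the first tokenized Sentence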
def assert_conll_writer_output(
    dataset: InternalBioNerDataset,
    expected_output: List[str],
    sentence_splitter: SentenceSplitter = None,
):
    fd, outfile_path = tempfile.mkstemp()
    try:
        sentence_splitter = (
            sentence_splitter
            if sentence_splitter
            else NoSentenceSplitter(tokenizer=SpaceTokenizer())
        )

        writer = CoNLLWriter(sentence_splitter=sentence_splitter)
        writer.write_to_conll(dataset, Path(outfile_path))
        with open(outfile_path) as f:
            contents = [line.strip() for line in f.readlines() if line.strip()]
    finally:
        os.close(fd)
        os.remove(outfile_path)

    assert contents == expected_output
def predict(self, sentence: str, model_path: str = ''):
    """
    Predict the sense of a preposition in a sentence
    :param sentence: sentence to predict
    :param model_path: path to the model
    :return: sense id of the predicted preposition
    """
    # (Try to) load a classifier if none has been loaded yet
    if self.__classifier is None:
        self._load_classifier(model_path)

    if self.__classifier is None:
        raise ValueError('Unable to load a classifier. Prediction not possible')

    # Tokenize the sentence with the space tokenizer
    sentence = Sentence(sentence, SpaceTokenizer())

    self.__classifier.predict(
        sentences=sentence,
        mini_batch_size=self.__mini_batch_size,
        verbose=self.__verbose,
    )

    # Return the sense id (number only)
    return str(sentence.labels).split(" ")[0].split("__")[2]
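# Illustrative sketch (not part of the original code): the return statement above assumes
# fastText-style label values such as "__label__12", so that str(sentence.labels) looks
# roughly like "[__label__12 (0.87)]". The string below is a made-up example of that
# assumed format, used only to show how the parsing extracts the numeric sense id.
example_label_repr = "[__label__12 (0.87)]"
sense_id = example_label_repr.split(" ")[0].split("__")[2]
print(sense_id)  # -> "12"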
def space_tokenizer(text: str) -> List[Token]:
    # We don't want to create a SpaceTokenizer object each time this function is called,
    # so delegate the call directly to the static run_tokenize method
    from flair.tokenization import SpaceTokenizer
    return SpaceTokenizer.run_tokenize(text)
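# Illustrative usage sketch (not part of the original code): space_tokenizer is a thin
# functional wrapper around SpaceTokenizer.run_tokenize. The example text is made up and
# assumes the function above is in scope; character offsets assume the tokenizer sets them.
tokens = space_tokenizer("I love Berlin .")
print([token.text for token in tokens])                  # -> ['I', 'love', 'Berlin', '.']
print(tokens[2].start_position, tokens[2].end_position)  # -> 7 13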