def test_split_with_regex(self):
    tokenizer = Tokenizer(split_regex=r"[\s,;!.?\"\(\)\-]+")
    sentence = """
        Your bones don't break, mine do. That's clear. Your cells react to
        bacteria and viruses differently than mine. You don't get sick,
        I do. That's also clear. But for some reason, you and I react the
        exact same way to water. We swallow it too fast, we choke. We get
        some in our lungs, we drown. However unreal it may seem, we are
        connected, you and I. We're on the same curve, just on opposite
        ends.
    """
    expected = """
        your bones don't break mine do that's clear your cells react to
        bacteria and viruses differently than mine you don't get sick i do
        that's also clear but for some reason you and i react the exact
        same way to water we swallow it too fast we choke we get some in
        our lungs we drown however unreal it may seem we are connected
        you and i we're on the same curve just on opposite ends
    """.split()
    tokens = tokenizer.tokenize(sentence)
    self.assertListEqual(expected, [t.value for t in tokens])

    sentence = '"Please, buy me a coffee?" He implored-in vain.'
    expected = "please buy me a coffee he implored in vain".split()
    tokens = tokenizer.tokenize(sentence)
    self.assertListEqual(expected, [t.value for t in tokens])
def test_tokenize_use_byte_offsets(self):
    tokenizer = Tokenizer(use_byte_offsets=True)
    sentence = "Ordér mê å ćoƒfee"
    expected = [
        Token("ordér", 0, 6),
        Token("mê", 7, 10),
        Token("å", 11, 13),
        Token("ćoƒfee", 14, 22),
    ]
    tokens = tokenizer.tokenize(sentence)
    self.assertListEqual(expected, tokens)
def test_tokenize_no_byte_offsets(self):
    tokenizer = Tokenizer()
    sentence = "Ordér mê å ćoƒfee"
    expected = [
        Token("ordér", 0, 5),
        Token("mê", 6, 8),
        Token("å", 9, 10),
        Token("ćoƒfee", 11, 17),
    ]
    tokens = tokenizer.tokenize(sentence)
    self.assertListEqual(expected, tokens)
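# Hedged illustration (plain Python, not part of the module under test): the
# offset differences between the two tests above come from UTF-8 multi-byte
# characters. With use_byte_offsets=True a token span is measured in encoded
# bytes; otherwise it is measured in characters, with an exclusive end index.
assert len("Ordér") == 5 and len("Ordér".encode("utf-8")) == 6  # "é" -> 2 bytes
assert len("mê") == 2 and len("mê".encode("utf-8")) == 3  # "ê" -> 2 bytes
assert len("å") == 1 and len("å".encode("utf-8")) == 2
assert len("ćoƒfee") == 6 and len("ćoƒfee".encode("utf-8")) == 8  # "ć", "ƒ" -> 2 bytes each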
def test_lookup_tokens(self):
    text = "let's tokenize this"
    tokenizer = Tokenizer()
    vocab = Vocabulary(text.split() + [BOS, EOS])

    tokens, start_idx, end_idx = lookup_tokens(
        text,
        tokenizer=tokenizer,
        vocab=vocab,
        add_bos_token=False,
        add_eos_token=False,
    )
    self.assertEqual(tokens, [0, 1, 2])
    self.assertEqual(start_idx, (0, 6, 15))
    self.assertEqual(end_idx, (5, 14, 19))

    tokens, start_idx, end_idx = lookup_tokens(
        text,
        tokenizer=tokenizer,
        vocab=vocab,
        add_bos_token=True,
        add_eos_token=True,
    )
    self.assertEqual(tokens, [3, 0, 1, 2, 4])
    self.assertEqual(start_idx, (-1, 0, 6, 15, -1))
    self.assertEqual(end_idx, (-1, 5, 14, 19, -1))
def __init__(
    self,
    text_column,
    tokenizer=None,
    add_bos_token=Config.add_bos_token,
    add_eos_token=Config.add_eos_token,
    use_eos_token_for_bos=Config.use_eos_token_for_bos,
    max_seq_len=Config.max_seq_len,
    build_vocab=True,
    vocab=None,
    vocab_file=Config.vocab_file,
    vocab_file_size_limit=Config.vocab_file_size_limit,
):
    super().__init__([(text_column, str)])
    self.text_column = text_column
    self.tokenizer = tokenizer or Tokenizer()
    self.vocab = vocab
    self.add_bos_token = add_bos_token
    self.add_eos_token = add_eos_token
    self.use_eos_token_for_bos = use_eos_token_for_bos
    self.max_seq_len = max_seq_len or 2**30  # large number
    self.build_vocab = build_vocab
    self.vocab_builder = None
    self.vocab_file = vocab_file
    self.vocab_file_size_limit = vocab_file_size_limit
def tokenize(
    text: str = None,
    pre_tokenized: List[Token] = None,
    tokenizer: Tokenizer = None,
    bos_token: Optional[str] = None,
    eos_token: Optional[str] = None,
    pad_token: str = PAD,
    use_eos_token_for_bos: bool = False,
    max_seq_len: int = 2**30,
):
    tokenized = pre_tokenized or tokenizer.tokenize(text)[
        : max_seq_len - (bos_token is not None) - (eos_token is not None)
    ]
    if bos_token:
        if use_eos_token_for_bos:
            bos_token = eos_token
        tokenized = [Token(bos_token, -1, -1)] + tokenized
    if eos_token:
        tokenized.append(Token(eos_token, -1, -1))
    if not tokenized:
        tokenized = [Token(pad_token, -1, -1)]
    tokenized_texts, start_idx, end_idx = zip(
        *((t.value, t.start, t.end) for t in tokenized)
    )
    return tokenized_texts, start_idx, end_idx
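# Hedged usage sketch for the tokenize() helper above (assumes Tokenizer, Token,
# and EOS are in this module's scope; the helper function name below is
# hypothetical and only illustrative). Offsets follow the character-based,
# end-exclusive convention shown in the offset tests, and marker tokens carry
# the sentinel offsets (-1, -1).
def _example_tokenize_with_eos():
    texts, starts, ends = tokenize(
        text="order me a coffee",
        tokenizer=Tokenizer(),
        eos_token=EOS,
    )
    # texts  -> ("order", "me", "a", "coffee", EOS)
    # starts -> (0, 6, 9, 11, -1)
    # ends   -> (5, 8, 10, 17, -1)
    return texts, starts, ends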
def setUp(self):
    self.json_data_source = SquadDataSource.from_config(
        SquadDataSource.Config(
            train_filename=tests_module.test_file("squad_tiny.json"),
            eval_filename=None,
            test_filename=None,
        )
    )
    self.tsv_data_source = SquadDataSource.from_config(
        SquadDataSource.Config(
            train_filename=tests_module.test_file("squad_tiny.tsv"),
            eval_filename=None,
            test_filename=None,
        )
    )

    self.tensorizer_with_wordpiece = SquadTensorizer.from_config(
        SquadTensorizer.Config(
            tokenizer=WordPieceTokenizer.Config(
                wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
            ),
            max_seq_len=250,
        )
    )
    self.tensorizer_with_alphanumeric = SquadTensorizer.from_config(
        SquadTensorizer.Config(
            tokenizer=Tokenizer.Config(split_regex=r"\W+"),
            max_seq_len=250,
        )
    )
class Config(Tensorizer.Config):
    # BERT style models support multiple text inputs
    columns: List[str] = ["text"]
    tokenizer: Tokenizer.Config = Tokenizer.Config()
    # base token-level tokenizer for sequence labeling tasks
    base_tokenizer: Optional[Tokenizer.Config] = None
    vocab_file: str = ""
    max_seq_len: int = 256
class Config(Tensorizer.Config): #: The name of the text column to parse from the data source. column: str = "text" #: The tokenizer to use to split input text into tokens. tokenizer: Tokenizer.Config = Tokenizer.Config() add_bos_token: bool = False add_eos_token: bool = False use_eos_token_for_bos: bool = False max_seq_len: Optional[int] = None
def __init__(
    self,
    text_column: str = Config.text_column,
    dict_column: str = Config.dict_column,
    tokenizer: Tokenizer = None,
):
    self.text_column = text_column
    self.dict_column = dict_column
    self.tokenizer = tokenizer or Tokenizer()
def __init__(
    self,
    text_column: str = Config.text_column,
    dict_column: str = Config.dict_column,
    tokenizer: Tokenizer = None,
):
    super().__init__([(text_column, str), (dict_column, Gazetteer)])
    self.text_column = text_column
    self.dict_column = dict_column
    self.tokenizer = tokenizer or Tokenizer()
def __init__(
    self,
    text_column: str = Config.text_column,
    dict_column: str = Config.dict_column,
    tokenizer: Tokenizer = None,
):
    self.text_column = text_column
    self.dict_column = dict_column
    self.tokenizer = tokenizer or Tokenizer()
    self.vocab_builder = VocabBuilder()
    self.vocab = None
class Config(Tensorizer.Config): #: The name of the text column to parse from the data source. column: str = "text" #: The tokenizer to use to split input text into tokens. tokenizer: Tokenizer.Config = Tokenizer.Config() #: The max token length for input text. max_seq_len: Optional[int] = None #: The max byte length for a token. max_byte_len: int = 15 #: Offset to add to all non-padding bytes offset_for_non_padding: int = 0
def __init__(
    self,
    actions_vocab,
    channels: List[Channel],
    text_column_name: str = Config.text_column_name,
    tokenizer: Tokenizer = None,
) -> None:
    super().__init__(channels)
    self.actions_vocab = actions_vocab
    self.text_column_name = text_column_name
    self.tokenizer = tokenizer or Tokenizer()
class Config(TokenTensorizer.Config):
    # for model inputs
    doc_column: str = "doc"
    ques_column: str = "question"
    # for labels
    answers_column: str = "answers"
    answer_starts_column: str = "answer_starts"
    # Since Tokenizer is __EXPANSIBLE__, we don't need a Union type to
    # support WordPieceTokenizer.
    tokenizer: Tokenizer.Config = Tokenizer.Config(split_regex=r"\W+")
    max_ques_seq_len: int = 64
    max_doc_seq_len: int = 256
def __init__(
    self,
    slot_column: str = Config.slot_column,
    text_column: str = Config.text_column,
    tokenizer: Tokenizer = None,
    allow_unknown: bool = Config.allow_unknown,
):
    self.slot_column = slot_column
    self.text_column = text_column
    self.allow_unknown = allow_unknown
    self.tokenizer = tokenizer or Tokenizer()
    self.pad_idx = Padding.DEFAULT_LABEL_PAD_IDX
def __init__(
    self,
    slot_column: str = Config.slot_column,
    text_column: str = Config.text_column,
    tokenizer: Tokenizer = None,
    allow_unknown: bool = Config.allow_unknown,
):
    super().__init__([(text_column, str), (slot_column, List[Slot])])
    self.slot_column = slot_column
    self.text_column = text_column
    self.allow_unknown = allow_unknown
    self.tokenizer = tokenizer or Tokenizer()
class Config(Tensorizer.Config): #: The name of the text column to parse from the data source. column: str = "text" #: The tokenizer to use to split input text into tokens. tokenizer: Tokenizer.Config = Tokenizer.Config() add_bos_token: bool = False add_eos_token: bool = False use_eos_token_for_bos: bool = False max_seq_len: Optional[int] = None #: If False, will not create token vocab during initialization. The vocab will #: need to be set during model initialization (e.g. see WordEmbedding) build_vocab: bool = True
class Config(BERTTensorizer.Config):
    vocab_file: str = "/mnt/vol/nlp_technologies/xlm/vocab_xnli_15"
    tokenizer: Tokenizer.Config = Tokenizer.Config()
    is_fairseq: bool = False
    pretraining: bool = False
    max_seq_len: Optional[int] = 256
    max_vocab: int = 95000
    min_count: int = 0
    language_columns: List[str] = ["language"]
    lang2id: Dict[str, int] = LANG2ID_15
    reset_positions: bool = False
    has_language_in_data: bool = False
    use_language_embeddings: bool = True
class Config(Tensorizer.Config):
    #: The name of the slot label column to parse from the data source.
    slot_column: str = "slots"
    #: The name of the text column to parse from the data source.
    #: We need this to be able to generate tensors which correspond to input text.
    text_column: str = "text"
    #: The tokenizer to use to split input text into tokens. This should be
    #: configured in a way which yields tokens consistent with the tokens input to
    #: or output by a model, so that the labels generated by this tensorizer
    #: will match the indices of the model's tokens.
    tokenizer: Tokenizer.Config = Tokenizer.Config()
    #: Whether to allow for unknown labels at test/prediction time
    allow_unknown: bool = False
class Config(Tensorizer.Config): column: str = "text_seq" max_seq_len: Optional[int] = None #: sentence markers add_bos_token: bool = False add_eos_token: bool = False use_eos_token_for_bos: bool = False #: list markers add_bol_token: bool = False add_eol_token: bool = False use_eol_token_for_bol: bool = False #: The tokenizer to use to split input text into tokens. tokenizer: Tokenizer.Config = Tokenizer.Config()
def __init__(
    self,
    text_column,
    tokenizer=None,
    max_seq_len=Config.max_seq_len,
    max_byte_len=Config.max_byte_len,
    offset_for_non_padding=Config.offset_for_non_padding,
):
    self.text_column = text_column
    self.tokenizer = tokenizer or Tokenizer()
    self.max_seq_len = max_seq_len or 2**30  # large number
    self.max_byte_len = max_byte_len
    self.offset_for_non_padding = offset_for_non_padding
class Config(BERTTensorizerBase.Config):
    vocab_file: str = "/mnt/vol/nlp_technologies/xlm/vocab_xnli_15"
    tokenizer: Tokenizer.Config = Tokenizer.Config()
    max_vocab: int = 95000
    min_count: int = 0
    # language identifiers for extracting the language from a row of data
    # during numberize
    language_column: str = "language"
    # language-to-id mapping used to obtain language embeddings
    lang2id: Dict[str, int] = LANG2ID_15
    # Controls whether language is being read from the data file (which
    # is what happens for finetuning) or being added during processing
    # (which is what happens during pretraining)
    has_language_in_data: bool = False
    # controls whether we train with language embeddings or not
    use_language_embeddings: bool = True
def __init__(
    self,
    slot_column: str = Config.slot_column,
    text_column: str = Config.text_column,
    tokenizer: Tokenizer = None,
    allow_unknown: bool = Config.allow_unknown,
):
    self.slot_column = slot_column
    self.text_column = text_column
    self.allow_unknown = allow_unknown
    self.tokenizer = tokenizer or Tokenizer()
    self.pad_idx = Padding.DEFAULT_LABEL_PAD_IDX
    self.vocab_builder = VocabBuilder()
    self.vocab_builder.add(NO_LABEL)
    self.vocab_builder.use_pad = False
    self.vocab_builder.use_unk = self.allow_unknown
    self.vocab = None
def __init__(
    self,
    text_column,
    tokenizer=None,
    max_seq_len=Config.max_seq_len,
    max_byte_len=Config.max_byte_len,
    offset_for_non_padding=Config.offset_for_non_padding,
):
    if isinstance(tokenizer, DoNothingTokenizer):
        super().__init__([(text_column, List[str])])
    else:
        super().__init__([(text_column, str)])
    self.text_column = text_column
    self.tokenizer = tokenizer or Tokenizer()
    self.max_seq_len = max_seq_len or 2**30  # large number
    self.max_byte_len = max_byte_len
    self.offset_for_non_padding = offset_for_non_padding
def __init__(
    self,
    text_column,
    tokenizer=None,
    add_bos_token=Config.add_bos_token,
    add_eos_token=Config.add_eos_token,
    use_eos_token_for_bos=Config.use_eos_token_for_bos,
    max_seq_len=Config.max_seq_len,
    vocab=None,
):
    super().__init__([(text_column, str)])
    self.text_column = text_column
    self.tokenizer = tokenizer or Tokenizer()
    self.vocab = vocab
    self.add_bos_token = add_bos_token
    self.add_eos_token = add_eos_token
    self.use_eos_token_for_bos = use_eos_token_for_bos
    self.max_seq_len = max_seq_len or 2**30  # large number
def __init__(
    self,
    text_column,
    tokenizer=None,
    max_seq_len=Config.max_seq_len,
    max_byte_len=Config.max_byte_len,
    offset_for_non_padding=Config.offset_for_non_padding,
    add_bos_token=Config.add_bos_token,
    add_eos_token=Config.add_eos_token,
    use_eos_token_for_bos=Config.use_eos_token_for_bos,
):
    self.text_column = text_column
    self.tokenizer = tokenizer or Tokenizer()
    self.max_seq_len = max_seq_len or 2**30  # large number
    self.max_byte_len = max_byte_len
    self.offset_for_non_padding = offset_for_non_padding
    self.add_bos_token = add_bos_token
    self.add_eos_token = add_eos_token
    self.use_eos_token_for_bos = use_eos_token_for_bos
def __init__(
    self,
    text_column,
    tokenizer=None,
    add_bos_token=Config.add_bos_token,
    add_eos_token=Config.add_eos_token,
    use_eos_token_for_bos=Config.use_eos_token_for_bos,
    max_seq_len=Config.max_seq_len,
    vocab_config=None,
    vocab=None,
):
    self.text_column = text_column
    self.tokenizer = tokenizer or Tokenizer()
    self.vocab = vocab
    self.add_bos_token = add_bos_token
    self.add_eos_token = add_eos_token
    self.use_eos_token_for_bos = use_eos_token_for_bos
    self.max_seq_len = max_seq_len or 2**30  # large number
    self.vocab_builder = None
    self.vocab_config = vocab_config or VocabConfig()
def __init__(
    self,
    column: str = Config.column,
    tokenizer=None,
    add_bos_token: bool = Config.add_bos_token,
    add_eos_token: bool = Config.add_eos_token,
    use_eos_token_for_bos: bool = Config.use_eos_token_for_bos,
    add_bol_token: bool = Config.add_bol_token,
    add_eol_token: bool = Config.add_eol_token,
    use_eol_token_for_bol: bool = Config.use_eol_token_for_bol,
    max_seq_len=Config.max_seq_len,
    vocab=None,
):
    self.column = column
    self.tokenizer = tokenizer or Tokenizer()
    self.vocab = vocab
    self.add_bos_token = add_bos_token
    self.add_eos_token = add_eos_token
    self.use_eos_token_for_bos = use_eos_token_for_bos
    self.add_bol_token = add_bol_token
    self.add_eol_token = add_eol_token
    self.use_eol_token_for_bol = use_eol_token_for_bol
    self.max_seq_len = max_seq_len or 2**30  # large number
def tokenize(
    text: str = None,
    pre_tokenized: List[Token] = None,
    tokenizer: Tokenizer = None,
    add_bos_token: bool = False,
    add_eos_token: bool = False,
    use_eos_token_for_bos: bool = False,
    max_seq_len: int = 2**30,
):
    tokenized = pre_tokenized or tokenizer.tokenize(text)[
        : max_seq_len - add_bos_token - add_eos_token
    ]
    if add_bos_token:
        bos = EOS if use_eos_token_for_bos else BOS
        tokenized = [Token(bos, -1, -1)] + tokenized
    if add_eos_token:
        tokenized.append(Token(EOS, -1, -1))
    if not tokenized:
        tokenized = [Token(PAD, -1, -1)]
    tokenized_texts, start_idx, end_idx = zip(
        *((t.value, t.start, t.end) for t in tokenized)
    )
    return tokenized_texts, start_idx, end_idx
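# Hedged usage sketch for the flag-based tokenize() variant above (assumes
# Tokenizer, Token, BOS, EOS are in this module's scope; the helper function
# name below is hypothetical and only illustrative). With
# use_eos_token_for_bos=True the leading marker is EOS rather than BOS, and
# marker tokens carry the sentinel offsets (-1, -1).
def _example_tokenize_with_markers():
    texts, starts, ends = tokenize(
        text="hi there",
        tokenizer=Tokenizer(),
        add_bos_token=True,
        add_eos_token=True,
        use_eos_token_for_bos=True,
    )
    # texts  -> (EOS, "hi", "there", EOS)
    # starts -> (-1, 0, 3, -1)
    # ends   -> (-1, 2, 8, -1)
    return texts, starts, ends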