Example #1
    def test_split_with_regex(self):
        tokenizer = Tokenizer(split_regex=r"[\s,;!.?\"\(\)\-]+")
        sentence = """
            Your bones don't break, mine do. That's clear. Your cells react to
            bacteria and viruses differently than mine. You don't get sick,
            I do. That's also clear. But for some reason, you and I react the
            exact same way to water. We swallow it too fast, we choke. We get
            some in our lungs, we drown. However unreal it may seem, we are
            connected, you and I. We're on the same curve, just on opposite
            ends.
        """
        expected = """
            your bones don't break mine do that's clear your cells react to
            bacteria and viruses differently than mine you don't get sick
            i do that's also clear but for some reason you and i react the
            exact same way to water we swallow it too fast we choke we get
            some in our lungs we drown however unreal it may seem we are
            connected you and i we're on the same curve just on opposite ends
        """.split()
        tokens = tokenizer.tokenize(sentence)
        self.assertListEqual(expected, [t.value for t in tokens])

        sentence = '"Please, buy me a coffee?" He implored-in vain.'
        expected = "please buy me a coffee he implored in vain".split()
        tokens = tokenizer.tokenize(sentence)
        self.assertListEqual(expected, [t.value for t in tokens])
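A rough re-creation of what the assertions above rely on, using only the standard library: the tokenizer appears to lowercase its input, split on the configured pattern, and drop empty strings. The regex_split helper below is a hypothetical illustration, not the Tokenizer implementation itself.

import re

def regex_split(text, split_regex=r"\s+"):
    # Lowercase, split on the pattern, and drop the empty strings that
    # re.split produces at punctuation-only boundaries.
    return [tok for tok in re.split(split_regex, text.lower()) if tok]

print(regex_split('"Please, buy me a coffee?" He implored-in vain.',
                  r"[\s,;!.?\"\(\)\-]+"))
# ['please', 'buy', 'me', 'a', 'coffee', 'he', 'implored', 'in', 'vain']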
Example #2
 def test_tokenize_use_byte_offsets(self):
     tokenizer = Tokenizer(use_byte_offsets=True)
     sentence = "Ordér mê å ćoƒfee"
     expected = [
         Token("ordér", 0, 6),
         Token("mê", 7, 10),
         Token("å", 11, 13),
         Token("ćoƒfee", 14, 22),
     ]
     tokens = tokenizer.tokenize(sentence)
     self.assertListEqual(expected, tokens)
Example #3
 def test_tokenize_no_byte_offsets(self):
     tokenizer = Tokenizer()
     sentence = "Ordér mê å ćoƒfee"
     expected = [
         Token("ordér", 0, 5),
         Token("mê", 6, 8),
         Token("å", 9, 10),
         Token("ćoƒfee", 11, 17),
     ]
     tokens = tokenizer.tokenize(sentence)
     self.assertListEqual(expected, tokens)
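Examples #2 and #3 differ only in how offsets are counted. With use_byte_offsets=True the start/end values index into the UTF-8 encoding of the sentence, so "ordér" is reported as (0, 6) because "é" encodes to two bytes; with the default character offsets the same token is (0, 5). A quick check of that length difference in plain Python:

word = "Ordér"
print(len(word))                  # 5 characters
print(len(word.encode("utf-8")))  # 6 bytes; "é" takes two bytes in UTF-8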
Example #4
    def test_lookup_tokens(self):
        text = "let's tokenize this"
        tokenizer = Tokenizer()
        vocab = Vocabulary(text.split() + [BOS, EOS])
        tokens, start_idx, end_idx = lookup_tokens(
            text,
            tokenizer=tokenizer,
            vocab=vocab,
            add_bos_token=False,
            add_eos_token=False,
        )
        self.assertEqual(tokens, [0, 1, 2])
        self.assertEqual(start_idx, (0, 6, 15))
        self.assertEqual(end_idx, (5, 14, 19))

        tokens, start_idx, end_idx = lookup_tokens(
            text,
            tokenizer=tokenizer,
            vocab=vocab,
            add_bos_token=True,
            add_eos_token=True,
        )
        self.assertEqual(tokens, [3, 0, 1, 2, 4])
        self.assertEqual(start_idx, (-1, 0, 6, 15, -1))
        self.assertEqual(end_idx, (-1, 5, 14, 19, -1))
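The assertions above show the contract of lookup_tokens: it returns the vocabulary index of every token plus parallel tuples of character start and end offsets, with -1/-1 standing in for BOS and EOS markers that have no span in the original text. A simplified sketch of that behaviour, not the library implementation; it assumes Token, BOS and EOS come from the same module as in the tests and that the vocabulary exposes a token-to-index mapping (called idx here purely as an assumption):

def lookup_tokens_sketch(text, tokenizer, vocab,
                         add_bos_token=False, add_eos_token=False):
    tokenized = tokenizer.tokenize(text)
    if add_bos_token:
        tokenized = [Token(BOS, -1, -1)] + tokenized
    if add_eos_token:
        tokenized = tokenized + [Token(EOS, -1, -1)]
    # Map token strings to ids; keep the character offsets aligned with them.
    token_ids = [vocab.idx[t.value] for t in tokenized]
    starts = tuple(t.start for t in tokenized)
    ends = tuple(t.end for t in tokenized)
    return token_ids, starts, ends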
Example #5
 def __init__(
     self,
     text_column,
     tokenizer=None,
     add_bos_token=Config.add_bos_token,
     add_eos_token=Config.add_eos_token,
     use_eos_token_for_bos=Config.use_eos_token_for_bos,
     max_seq_len=Config.max_seq_len,
     build_vocab=True,
     vocab=None,
     vocab_file=Config.vocab_file,
     vocab_file_size_limit=Config.vocab_file_size_limit,
 ):
     super().__init__([(text_column, str)])
     self.text_column = text_column
     self.tokenizer = tokenizer or Tokenizer()
     self.vocab = vocab
     self.add_bos_token = add_bos_token
     self.add_eos_token = add_eos_token
     self.use_eos_token_for_bos = use_eos_token_for_bos
     self.max_seq_len = max_seq_len or 2**30  # large number
     self.build_vocab = build_vocab
     self.vocab_builder = None
     self.vocab_file = vocab_file
     self.vocab_file_size_limit = vocab_file_size_limit
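When max_seq_len is unset (None or 0), the `or 2**30` idiom effectively disables truncation by substituting a number far larger than any realistic sequence length; the same pattern recurs in most of the __init__ methods below.

print(None or 2**30)  # 1073741824, i.e. effectively unlimited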
Example #6
def tokenize(
    text: str = None,
    pre_tokenized: List[Token] = None,
    tokenizer: Tokenizer = None,
    bos_token: Optional[str] = None,
    eos_token: Optional[str] = None,
    pad_token: str = PAD,
    use_eos_token_for_bos: bool = False,
    max_seq_len: int = 2**30,
):
    tokenized = (pre_tokenized
                 or tokenizer.tokenize(text)[:max_seq_len -
                                             (bos_token is not None) -
                                             (eos_token is not None)])
    if bos_token:
        if use_eos_token_for_bos:
            bos_token = eos_token
        tokenized = [Token(bos_token, -1, -1)] + tokenized
    if eos_token:
        tokenized.append(Token(eos_token, -1, -1))
    if not tokenized:
        tokenized = [Token(pad_token, -1, -1)]

    tokenized_texts, start_idx, end_idx = zip(*((t.value, t.start, t.end)
                                                for t in tokenized))
    return tokenized_texts, start_idx, end_idx
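A usage sketch for this helper, reusing the sentence and character offsets from Example #3; Tokenizer, Token, BOS and EOS are assumed to be imported from the same modules the other examples use:

tokenizer = Tokenizer()
texts, starts, ends = tokenize(
    "Ordér mê å ćoƒfee",
    tokenizer=tokenizer,
    bos_token=BOS,
    eos_token=EOS,
)
# texts  -> (BOS, "ordér", "mê", "å", "ćoƒfee", EOS)
# starts -> (-1, 0, 6, 9, 11, -1)
# ends   -> (-1, 5, 8, 10, 17, -1)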
Example #7
    def setUp(self):
        self.json_data_source = SquadDataSource.from_config(
            SquadDataSource.Config(
                train_filename=tests_module.test_file("squad_tiny.json"),
                eval_filename=None,
                test_filename=None,
            )
        )
        self.tsv_data_source = SquadDataSource.from_config(
            SquadDataSource.Config(
                train_filename=tests_module.test_file("squad_tiny.tsv"),
                eval_filename=None,
                test_filename=None,
            )
        )

        self.tensorizer_with_wordpiece = SquadTensorizer.from_config(
            SquadTensorizer.Config(
                tokenizer=WordPieceTokenizer.Config(
                    wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
                ),
                max_seq_len=250,
            )
        )
        self.tensorizer_with_alphanumeric = SquadTensorizer.from_config(
            SquadTensorizer.Config(
                tokenizer=Tokenizer.Config(split_regex=r"\W+"), max_seq_len=250
            )
        )
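The second tensorizer configures the base Tokenizer with split_regex=r"\W+", so any run of non-word characters acts as a separator (apostrophes split words, unlike the pattern in Example #1). Roughly, and assuming the same re.split-style behaviour sketched earlier:

import re
print([t for t in re.split(r"\W+", "What's in a name?".lower()) if t])
# ['what', 's', 'in', 'a', 'name']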
Example #8
 class Config(Tensorizer.Config):
     # BERT style models support multiple text inputs
     columns: List[str] = ["text"]
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     # base token-level tokenizer for sequence labeling tasks
     base_tokenizer: Optional[Tokenizer.Config] = None
     vocab_file: str = ""
     max_seq_len: int = 256
Example #9
 class Config(Tensorizer.Config):
     #: The name of the text column to parse from the data source.
     column: str = "text"
     #: The tokenizer to use to split input text into tokens.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     add_bos_token: bool = False
     add_eos_token: bool = False
     use_eos_token_for_bos: bool = False
     max_seq_len: Optional[int] = None
Example #10
 def __init__(
     self,
     text_column: str = Config.text_column,
     dict_column: str = Config.dict_column,
     tokenizer: Tokenizer = None,
 ):
     self.text_column = text_column
     self.dict_column = dict_column
     self.tokenizer = tokenizer or Tokenizer()
Example #11
 def __init__(
     self,
     text_column: str = Config.text_column,
     dict_column: str = Config.dict_column,
     tokenizer: Tokenizer = None,
 ):
     super().__init__([(text_column, str), (dict_column, Gazetteer)])
     self.text_column = text_column
     self.dict_column = dict_column
     self.tokenizer = tokenizer or Tokenizer()
Example #12
 def __init__(
     self,
     text_column: str = Config.text_column,
     dict_column: str = Config.dict_column,
     tokenizer: Tokenizer = None,
 ):
     self.text_column = text_column
     self.dict_column = dict_column
     self.tokenizer = tokenizer or Tokenizer()
     self.vocab_builder = VocabBuilder()
     self.vocab = None
Example #13
 class Config(Tensorizer.Config):
     #: The name of the text column to parse from the data source.
     column: str = "text"
     #: The tokenizer to use to split input text into tokens.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     #: The max token length for input text.
     max_seq_len: Optional[int] = None
     #: The max byte length for a token.
     max_byte_len: int = 15
     #: Offset to add to all non-padding bytes
     offset_for_non_padding: int = 0
Example #14
 def __init__(
     self,
     actions_vocab,
     channels: List[Channel],
     text_column_name: str = Config.text_column_name,
     tokenizer: Tokenizer = None,
 ) -> None:
     super().__init__(channels)
     self.actions_vocab = actions_vocab
     self.text_column_name = text_column_name
     self.tokenizer = tokenizer or Tokenizer()
Example #15
 class Config(TokenTensorizer.Config):
     # for model inputs
     doc_column: str = "doc"
     ques_column: str = "question"
     # for labels
     answers_column: str = "answers"
     answer_starts_column: str = "answer_starts"
     # Since Tokenizer is __EXPANSIBLE__, we don't need a Union type to
     # support WordPieceTokenizer.
     tokenizer: Tokenizer.Config = Tokenizer.Config(split_regex=r"\W+")
     max_ques_seq_len: int = 64
     max_doc_seq_len: int = 256
Example #16
 def __init__(
     self,
     slot_column: str = Config.slot_column,
     text_column: str = Config.text_column,
     tokenizer: Tokenizer = None,
     allow_unknown: bool = Config.allow_unknown,
 ):
     self.slot_column = slot_column
     self.text_column = text_column
     self.allow_unknown = allow_unknown
     self.tokenizer = tokenizer or Tokenizer()
     self.pad_idx = Padding.DEFAULT_LABEL_PAD_IDX
Example #17
 def __init__(
     self,
     slot_column: str = Config.slot_column,
     text_column: str = Config.text_column,
     tokenizer: Tokenizer = None,
     allow_unknown: bool = Config.allow_unknown,
 ):
     super().__init__([(text_column, str), (slot_column, List[Slot])])
     self.slot_column = slot_column
     self.text_column = text_column
     self.allow_unknown = allow_unknown
     self.tokenizer = tokenizer or Tokenizer()
Example #18
 class Config(Tensorizer.Config):
     #: The name of the text column to parse from the data source.
     column: str = "text"
     #: The tokenizer to use to split input text into tokens.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     add_bos_token: bool = False
     add_eos_token: bool = False
     use_eos_token_for_bos: bool = False
     max_seq_len: Optional[int] = None
     #: If False, will not create token vocab during initialization. The vocab will
     #: need to be set during model initialization (e.g. see WordEmbedding)
     build_vocab: bool = True
Example #19
 class Config(BERTTensorizer.Config):
     vocab_file: str = "/mnt/vol/nlp_technologies/xlm/vocab_xnli_15"
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     is_fairseq: bool = False
     pretraining: bool = False
     max_seq_len: Optional[int] = 256
     max_vocab: int = 95000
     min_count: int = 0
     language_columns: List[str] = ["language"]
     lang2id: Dict[str, int] = LANG2ID_15
     reset_positions: bool = False
     has_language_in_data: bool = False
     use_language_embeddings: bool = True
Example #20
 class Config(Tensorizer.Config):
     #: The name of the slot label column to parse from the data source.
     slot_column: str = "slots"
     #: The name of the text column to parse from the data source.
     #: We need this to be able to generate tensors which correspond to input text.
     text_column: str = "text"
     #: The tokenizer to use to split input text into tokens. This should be
     #: configured in a way which yields tokens consistent with the tokens input to
     #: or output by a model, so that the labels generated by this tensorizer
     #: will match the indices of the model's tokens.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     #: Whether to allow for unknown labels at test/prediction time
     allow_unknown: bool = False
Example #21
 class Config(Tensorizer.Config):
     column: str = "text_seq"
     max_seq_len: Optional[int] = None
     #: sentence markers
     add_bos_token: bool = False
     add_eos_token: bool = False
     use_eos_token_for_bos: bool = False
     #: list markers
     add_bol_token: bool = False
     add_eol_token: bool = False
     use_eol_token_for_bol: bool = False
     #: The tokenizer to use to split input text into tokens.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
Example #22
 def __init__(
     self,
     text_column,
     tokenizer=None,
     max_seq_len=Config.max_seq_len,
     max_byte_len=Config.max_byte_len,
     offset_for_non_padding=Config.offset_for_non_padding,
 ):
     self.text_column = text_column
     self.tokenizer = tokenizer or Tokenizer()
     self.max_seq_len = max_seq_len or 2**30  # large number
     self.max_byte_len = max_byte_len
     self.offset_for_non_padding = offset_for_non_padding
Example #23
 class Config(BERTTensorizerBase.Config):
     vocab_file: str = "/mnt/vol/nlp_technologies/xlm/vocab_xnli_15"
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     max_vocab: int = 95000
     min_count: int = 0
     # language identifiers for extracting the language from a row of data
     # during numberize
     language_column: str = "language"
     # language-to-id mapping used to obtain language embeddings
     lang2id: Dict[str, int] = LANG2ID_15
     # Controls whether language is being read from the data file (which
     # is what happens for finetuning) or being added during processing
     # (which is what happens during pretraining)
     has_language_in_data: bool = False
     # controls whether we train with language embeddings or not
     use_language_embeddings: bool = True
Example #24
 def __init__(
     self,
     slot_column: str = Config.slot_column,
     text_column: str = Config.text_column,
     tokenizer: Tokenizer = None,
     allow_unknown: bool = Config.allow_unknown,
 ):
     self.slot_column = slot_column
     self.text_column = text_column
     self.allow_unknown = allow_unknown
     self.tokenizer = tokenizer or Tokenizer()
     self.pad_idx = Padding.DEFAULT_LABEL_PAD_IDX
     self.vocab_builder = VocabBuilder()
     self.vocab_builder.add(NO_LABEL)
     self.vocab_builder.use_pad = False
     self.vocab_builder.use_unk = self.allow_unknown
     self.vocab = None
Example #25
 def __init__(
     self,
     text_column,
     tokenizer=None,
     max_seq_len=Config.max_seq_len,
     max_byte_len=Config.max_byte_len,
     offset_for_non_padding=Config.offset_for_non_padding,
 ):
     if isinstance(tokenizer, DoNothingTokenizer):
         super().__init__([(text_column, List[str])])
     else:
         super().__init__([(text_column, str)])
     self.text_column = text_column
     self.tokenizer = tokenizer or Tokenizer()
     self.max_seq_len = max_seq_len or 2**30  # large number
     self.max_byte_len = max_byte_len
     self.offset_for_non_padding = offset_for_non_padding
Example #26
 def __init__(
     self,
     text_column,
     tokenizer=None,
     add_bos_token=Config.add_bos_token,
     add_eos_token=Config.add_eos_token,
     use_eos_token_for_bos=Config.use_eos_token_for_bos,
     max_seq_len=Config.max_seq_len,
     vocab=None,
 ):
     super().__init__([(text_column, str)])
     self.text_column = text_column
     self.tokenizer = tokenizer or Tokenizer()
     self.vocab = vocab
     self.add_bos_token = add_bos_token
     self.add_eos_token = add_eos_token
     self.use_eos_token_for_bos = use_eos_token_for_bos
     self.max_seq_len = max_seq_len or 2**30  # large number
Example #27
 def __init__(
     self,
     text_column,
     tokenizer=None,
     max_seq_len=Config.max_seq_len,
     max_byte_len=Config.max_byte_len,
     offset_for_non_padding=Config.offset_for_non_padding,
     add_bos_token=Config.add_bos_token,
     add_eos_token=Config.add_eos_token,
     use_eos_token_for_bos=Config.use_eos_token_for_bos,
 ):
     self.text_column = text_column
     self.tokenizer = tokenizer or Tokenizer()
     self.max_seq_len = max_seq_len or 2**30  # large number
     self.max_byte_len = max_byte_len
     self.offset_for_non_padding = offset_for_non_padding
     self.add_bos_token = add_bos_token
     self.add_eos_token = add_eos_token
     self.use_eos_token_for_bos = use_eos_token_for_bos
Example #28
 def __init__(
     self,
     text_column,
     tokenizer=None,
     add_bos_token=Config.add_bos_token,
     add_eos_token=Config.add_eos_token,
     use_eos_token_for_bos=Config.use_eos_token_for_bos,
     max_seq_len=Config.max_seq_len,
     vocab_config=None,
     vocab=None,
 ):
     self.text_column = text_column
     self.tokenizer = tokenizer or Tokenizer()
     self.vocab = vocab
     self.add_bos_token = add_bos_token
     self.add_eos_token = add_eos_token
     self.use_eos_token_for_bos = use_eos_token_for_bos
     self.max_seq_len = max_seq_len or 2**30  # large number
     self.vocab_builder = None
     self.vocab_config = vocab_config or VocabConfig()
Example #29
 def __init__(
     self,
     column: str = Config.column,
     tokenizer=None,
     add_bos_token: bool = Config.add_bos_token,
     add_eos_token: bool = Config.add_eos_token,
     use_eos_token_for_bos: bool = Config.use_eos_token_for_bos,
     add_bol_token: bool = Config.add_bol_token,
     add_eol_token: bool = Config.add_eol_token,
     use_eol_token_for_bol: bool = Config.use_eol_token_for_bol,
     max_seq_len=Config.max_seq_len,
     vocab=None,
 ):
     self.column = column
     self.tokenizer = tokenizer or Tokenizer()
     self.vocab = vocab
     self.add_bos_token = add_bos_token
     self.add_eos_token = add_eos_token
     self.use_eos_token_for_bos = use_eos_token_for_bos
     self.add_bol_token = add_bol_token
     self.add_eol_token = add_eol_token
     self.use_eol_token_for_bol = use_eol_token_for_bol
     self.max_seq_len = max_seq_len or 2**30  # large number
Example #30
def tokenize(
    text: str = None,
    pre_tokenized: List[Token] = None,
    tokenizer: Tokenizer = None,
    add_bos_token: bool = False,
    add_eos_token: bool = False,
    use_eos_token_for_bos: bool = False,
    max_seq_len: int = 2**30,
):
    tokenized = (pre_tokenized
                 or tokenizer.tokenize(text)[:max_seq_len - add_bos_token -
                                             add_eos_token])
    if add_bos_token:
        bos = EOS if use_eos_token_for_bos else BOS
        tokenized = [Token(bos, -1, -1)] + tokenized
    if add_eos_token:
        tokenized.append(Token(EOS, -1, -1))
    if not tokenized:
        tokenized = [Token(PAD, -1, -1)]

    tokenized_texts, start_idx, end_idx = zip(*((t.value, t.start, t.end)
                                                for t in tokenized))
    return tokenized_texts, start_idx, end_idx
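This variant differs from the tokenize helper in Example #6 only in its interface: boolean add_bos_token/add_eos_token flags plus the module-level BOS, EOS and PAD constants replace the explicit bos_token/eos_token/pad_token arguments, and the slice budget subtracts the booleans directly (True counts as 1). Under the same assumptions as the earlier usage sketch, an equivalent call:

texts, starts, ends = tokenize(
    "Ordér mê å ćoƒfee",
    tokenizer=Tokenizer(),
    add_bos_token=True,
    add_eos_token=True,
)
# texts -> (BOS, "ordér", "mê", "å", "ćoƒfee", EOS)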