def __init__(self, vocab_path, language="en", tokenizer=None, subtokenizer=None, subtokenizer_codes=None, glossaries=None, reverse_sequence=False, **kwargs): """ Initializes the data pipeline for text data. Args: language: The language. vocab_path: The path to the vocabulary file, or a list of word tokens. tokenizer: The tokenizer name. subtokenizer: The name of tokenizer for subword encoding. subtokenizer_codes: The subword codes. glossaries: The glossaries that will not be split by tokenizer/subtokenizer. reverse_sequence: A bool, whether to reverse the sequence. """ DataPipeline.__init__(self, vocab_path=vocab_path, language=language, tokenizer=tokenizer, subtokenizer=subtokenizer, subtokenizer_codes=subtokenizer_codes, glossaries=glossaries, reverse_sequence=reverse_sequence, **kwargs) self._language = language self._reverse_sequence = reverse_sequence self._tokenizer = build_tokenizer(tokenizer, language=language, glossaries=glossaries) self._subtokenizer = None self._subtokenizer = build_tokenizer(subtokenizer, language=language, glossaries=glossaries, vocabulary=vocab_path) if self._subtokenizer is not None: if subtokenizer_codes is None: logging.info( "No codes provided for subtokenizer: {}. " "We assume this was done on purpose.".format(subtokenizer)) else: self._subtokenizer.init_subtokenizer(subtokenizer_codes) if isinstance(vocab_path, list): tokens = Vocab.load_tokens(tokens=vocab_path) else: tokens = Vocab.load_tokens(vocab_path=vocab_path) unk_token = Vocab.get_unique(tokens, "<UNK>") bos_token = Vocab.get_unique(tokens, "<SEQ_BEG>") eos_token = Vocab.get_unique(tokens, "<SEQ_END>") assert unk_token != bos_token != eos_token Vocab.__init__(self, tokens, [unk_token, bos_token, eos_token], lowercase=False) self._eos_id = Vocab.map_token_to_id(self, eos_token) self._bos_id = Vocab.map_token_to_id(self, bos_token) self._unk_id = Vocab.map_token_to_id(self, unk_token)
def __init__(self,
             vocab_path,
             spm_model,
             languages,
             reverse_sequence=False,
             **kwargs):
    """ Initializes the data pipeline for multilingual text data.

    Args:
        vocab_path: The path to the vocabulary file, or a list of word tokens.
        spm_model: The path to the SentencePiece model.
        languages: A list of languages. The corresponding language tags will be
            appended to the vocabulary automatically.
        reverse_sequence: A bool, whether to reverse the sequence.
    """
    DataPipeline.__init__(self, vocab_path=vocab_path, languages=languages,
                          reverse_sequence=reverse_sequence, **kwargs)
    self._reverse_sequence = reverse_sequence
    self._tokenizer = SentencePiece()
    self._tokenizer.init_subtokenizer(spm_model)
    if isinstance(vocab_path, list):
        tokens = Vocab.load_tokens(tokens=vocab_path)
    else:
        tokens = Vocab.load_tokens(vocab_path=vocab_path)
    if isinstance(languages, str):
        languages = yaml.load(languages, Loader=yaml.FullLoader)
    assert isinstance(languages, list), (
        f"`languages` must be a list of strings, but got {languages}")
    # Reserve one unique tag token per language, e.g. "<en>", "<de>".
    lang2tags = {}
    for lang in languages:
        lang2tags[lang] = Vocab.get_unique(tokens, "<" + lang + ">")
    unk_token = Vocab.get_unique(tokens, "<UNK>")
    bos_token = Vocab.get_unique(tokens, "<SEQ_BEG>")
    eos_token = Vocab.get_unique(tokens, "<SEQ_END>")
    assert unk_token != bos_token != eos_token
    Vocab.__init__(self, tokens,
                   [unk_token, bos_token, eos_token] + list(lang2tags.values()),
                   lowercase=False)
    self._eos_id = Vocab.map_token_to_id(self, eos_token)
    self._bos_id = Vocab.map_token_to_id(self, bos_token)
    self._unk_id = Vocab.map_token_to_id(self, unk_token)
    self._lang_ids = {lang: Vocab.map_token_to_id(self, lang2tags[lang])
                      for lang in languages}
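# Usage sketch (not part of the original source). The enclosing class name
# (`MultilingualTextDataPipeline`) and the paths are assumptions; one tag token
# such as "<en>" or "<de>" is reserved per entry of `languages`.
pipeline = MultilingualTextDataPipeline(
    vocab_path="data/mling.vocab",
    spm_model="data/spm.model",           # trained SentencePiece model
    languages=["en", "de", "fr"])
de_tag_id = pipeline._lang_ids["de"]      # id of the reserved "<de>" tag (private attr, shown for illustration)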
def __init__(self, name, language="en", vocab_path=None, tokens=None, **kwargs): """ Initializes the data pipeline for text data. Args: name: The key of the BERT model, for creating the tokenizer and loading vocabulary. language: The language. tokens: A list of word tokens. vocab_path: The path to the vocabulary file. """ if tokens is None and vocab_path is None: path = GoogleBert.download(name) if path is None: raise ValueError( f"Unknown BERT model name={name} for downloading.") vocab_path = os.path.join(path, "vocab.txt") else: if tokens is not None: vocab_path = None tokens = Vocab.load_tokens(vocab_path, tokens) vocab_path = None # to handle with customized vocabulary for spec_token in ["[UNK]", "[CLS]", "[SEP]", "[MASK]", "[PAD]"]: if spec_token not in tokens: tokens.insert(0, spec_token) assert tokens[0] == "[PAD]" Vocab.__init__(self, Vocab.load_tokens(vocab_path, tokens), lowercase=False) DataPipeline.__init__(self, name=name, language=language, tokens=self.tokens, vocab_path=None, **kwargs) self._language = language self._tokenizer = HuggingFaceTokenizer(language=language) self._tokenizer.init_subtokenizer(name) self._unk_id = Vocab.map_token_to_id(self, "[UNK]") self._pad_id = Vocab.map_token_to_id(self, "[PAD]") self._cls_id = Vocab.map_token_to_id(self, "[CLS]") self._sep_id = Vocab.map_token_to_id(self, "[SEP]") self._mask_id = Vocab.map_token_to_id(self, "[MASK]")
def __init__(self, language="en", tokens=None, vocab_path=None): """ Initializes the data pipeline from OpenAI released GPT-2. Args: language: The language. tokens: A list of word tokens. vocab_path: The path to the vocabulary file. """ if tokens is None and vocab_path is None: path = OpenAIGPT2.download("117M") vocab_path = os.path.join(path, "encoder.json") Vocab.__init__(self, Vocab.load_tokens(vocab_path, tokens), lowercase=False) DataPipeline.__init__(self, language=language, tokens=self.tokens, vocab_path=None) self._language = language self._tokenizer = HuggingFaceTokenizer(language=language) self._tokenizer.init_subtokenizer("gpt2") self._eos_id = Vocab.map_token_to_id(self, "<|endoftext|>")