def __init__(self,
             vocab_path,
             language="en",
             tokenizer=None,
             subtokenizer=None,
             subtokenizer_codes=None,
             glossaries=None,
             reverse_sequence=False,
             **kwargs):
    """ Initializes the data pipeline for text data.

    Args:
        language: The language.
        vocab_path: The path to the vocabulary file, or a list of word tokens.
        tokenizer: The tokenizer name.
        subtokenizer: The name of the tokenizer for subword encoding.
        subtokenizer_codes: The subword codes.
        glossaries: The glossaries that will not be split by tokenizer/subtokenizer.
        reverse_sequence: A bool, whether to reverse the sequence.
    """
    DataPipeline.__init__(self, vocab_path=vocab_path, language=language,
                          tokenizer=tokenizer, subtokenizer=subtokenizer,
                          subtokenizer_codes=subtokenizer_codes, glossaries=glossaries,
                          reverse_sequence=reverse_sequence, **kwargs)
    self._language = language
    self._reverse_sequence = reverse_sequence
    # Word-level tokenizer and optional subword tokenizer.
    self._tokenizer = build_tokenizer(tokenizer, language=language, glossaries=glossaries)
    self._subtokenizer = build_tokenizer(subtokenizer, language=language,
                                         glossaries=glossaries, vocabulary=vocab_path)
    if self._subtokenizer is not None:
        if subtokenizer_codes is None:
            logging.info("No codes provided for subtokenizer: {}. "
                         "We assume this was done on purpose.".format(subtokenizer))
        else:
            self._subtokenizer.init_subtokenizer(subtokenizer_codes)
    # Build the vocabulary and resolve the special tokens.
    if isinstance(vocab_path, list):
        tokens = Vocab.load_tokens(tokens=vocab_path)
    else:
        tokens = Vocab.load_tokens(vocab_path=vocab_path)
    unk_token = Vocab.get_unique(tokens, "<UNK>")
    bos_token = Vocab.get_unique(tokens, "<SEQ_BEG>")
    eos_token = Vocab.get_unique(tokens, "<SEQ_END>")
    assert unk_token != bos_token != eos_token
    Vocab.__init__(self, tokens, [unk_token, bos_token, eos_token], lowercase=False)
    self._eos_id = Vocab.map_token_to_id(self, eos_token)
    self._bos_id = Vocab.map_token_to_id(self, bos_token)
    self._unk_id = Vocab.map_token_to_id(self, unk_token)
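# A minimal usage sketch for the constructor above, assuming a word-level vocabulary
# file containing <UNK>, <SEQ_BEG> and <SEQ_END>, plus Moses/BPE-style (sub)tokenizers.
# The file names and tokenizer identifiers are illustrative assumptions, not values
# taken from this repository, and TextDataPipeline is assumed to be importable from
# the surrounding package.
example_pipeline = TextDataPipeline(
    vocab_path="vocab.en.txt",        # assumed: one token per line
    language="en",
    tokenizer="moses",                # assumed tokenizer name
    subtokenizer="bpe",               # assumed subword tokenizer name
    subtokenizer_codes="codes.bpe",   # assumed path to BPE merge codes
    reverse_sequence=False)
# After construction, _unk_id / _bos_id / _eos_id hold the ids of the special tokens
# resolved via Vocab.get_unique above.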
def __init__(self,
             vocab_path,
             language="en",
             tokenizer=None,
             subtokenizer=None,
             subtokenizer_codes=None,
             glossaries=None,
             reverse_sequence=False,
             **kwargs):
    """ Initializes the data pipeline for text data.

    Args:
        language: The language.
        vocab_path: The path to the vocabulary file.
        tokenizer: The tokenizer name.
        subtokenizer: The name of the tokenizer for subword encoding.
        subtokenizer_codes: The subword codes.
        glossaries: The glossaries that will not be split by tokenizer/subtokenizer.
        reverse_sequence: A bool, whether to reverse the sequence.
    """
    super(TextDataPipeline, self).__init__(
        vocab_path=vocab_path, language=language, tokenizer=tokenizer,
        subtokenizer=subtokenizer, subtokenizer_codes=subtokenizer_codes,
        glossaries=glossaries, reverse_sequence=reverse_sequence, **kwargs)
    self._language = language
    # Word-level tokenizer and optional subword tokenizer.
    self._tokenizer = build_tokenizer(tokenizer, language=language, glossaries=glossaries)
    self._subtokenizer = build_tokenizer(subtokenizer, language=language,
                                         glossaries=glossaries, vocabulary=vocab_path)
    if self._subtokenizer is not None:
        if subtokenizer_codes is None:
            logging.info("No codes provided for subtokenizer: {}. "
                         "We assume this was done on purpose.".format(subtokenizer))
        else:
            self._subtokenizer.init_subtokenizer(subtokenizer_codes)
    # Token<->id mapping (handles sequence reversal if requested).
    self._symbols_mapper = SymbolsMapper(vocab_path=vocab_path, reverse=reverse_sequence)
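# Illustrative only: a minimal stand-in showing the kind of token<->id mapping (with
# optional sequence reversal) that the SymbolsMapper above is responsible for. The
# class name, method names and behavior below are a sketch, not the API of the real
# SymbolsMapper in this repository.
class _ToySymbolsMapper(object):
    def __init__(self, tokens, unk="<UNK>", reverse=False):
        self._token2id = {t: i for i, t in enumerate(tokens)}
        self._id2token = {i: t for t, i in self._token2id.items()}
        self._unk_id = self._token2id[unk]
        self._reverse = reverse

    def tokens_to_ids(self, tokens):
        # Unknown tokens fall back to the <UNK> id; reverse the sequence if requested.
        ids = [self._token2id.get(t, self._unk_id) for t in tokens]
        return ids[::-1] if self._reverse else ids

    def ids_to_tokens(self, ids):
        # Undo the reversal before mapping ids back to tokens.
        ids = ids[::-1] if self._reverse else ids
        return [self._id2token[i] for i in ids]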
def _main(_):
    # Parse command-line flags, build the tokenizer, then tokenize the input file
    # line by line and write the result to the output file.
    arg_parser = flags_core.define_flags(FLAG_LIST, with_config_file=False)
    args, remaining_argv = flags_core.intelligent_parse_flags(FLAG_LIST, arg_parser)
    flags_core.verbose_flags(FLAG_LIST, args, remaining_argv)
    tokenizer = build_tokenizer(args)
    with tf.io.gfile.GFile(args["input"]) as fp:
        with tf.io.gfile.GFile(args["output"], "w") as fw:
            for line in fp:
                line = lowercase_and_remove_punctuations(
                    tokenizer.language, line.strip(),
                    args["lowercase"], args["remove_punctuation"])
                fw.write(tokenizer.tokenize(line, return_str=True) + "\n")
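# Illustrative invocation of the entry point above. Only "input", "output", "lowercase"
# and "remove_punctuation" are read from `args` in _main; the script name and any
# tokenizer-selection flags defined in FLAG_LIST are assumptions.
#
#   python3 text_tokenizer.py \
#       --input raw.en.txt --output tok.en.txt \
#       --lowercase --remove_punctuation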