Example #1
0
    def __init__(self,
                 vocab_path,
                 language="en",
                 tokenizer=None,
                 subtokenizer=None,
                 subtokenizer_codes=None,
                 glossaries=None,
                 reverse_sequence=False,
                 **kwargs):
        """ Initializes the data pipeline for text data.

        Both base initializers (`DataPipeline.__init__` and `Vocab.__init__`)
        are invoked explicitly, because the vocabulary can only be set up
        after the token list has been loaded.

        Args:
            vocab_path: The path to the vocabulary file, or a list of word tokens.
            language: The language.
            tokenizer: The tokenizer name.
            subtokenizer: The name of tokenizer for subword encoding.
            subtokenizer_codes: The subword codes.
            glossaries: The glossaries that will not be split by tokenizer/subtokenizer.
            reverse_sequence: A bool, whether to reverse the sequence.
            **kwargs: Extra arguments forwarded unchanged to `DataPipeline.__init__`.

        Raises:
            AssertionError: If the UNK/BOS/EOS tokens are not pairwise distinct.
        """
        DataPipeline.__init__(self,
                              vocab_path=vocab_path,
                              language=language,
                              tokenizer=tokenizer,
                              subtokenizer=subtokenizer,
                              subtokenizer_codes=subtokenizer_codes,
                              glossaries=glossaries,
                              reverse_sequence=reverse_sequence,
                              **kwargs)
        self._language = language
        self._reverse_sequence = reverse_sequence
        self._tokenizer = build_tokenizer(tokenizer,
                                          language=language,
                                          glossaries=glossaries)
        self._subtokenizer = build_tokenizer(subtokenizer,
                                             language=language,
                                             glossaries=glossaries,
                                             vocabulary=vocab_path)
        if self._subtokenizer is not None:
            if subtokenizer_codes is None:
                logging.info(
                    "No codes provided for subtokenizer: {}. "
                    "We assume this was done on purpose.".format(subtokenizer))
            else:
                self._subtokenizer.init_subtokenizer(subtokenizer_codes)

        # `vocab_path` is either an in-memory token list or a file path.
        if isinstance(vocab_path, list):
            tokens = Vocab.load_tokens(tokens=vocab_path)
        else:
            tokens = Vocab.load_tokens(vocab_path=vocab_path)

        # Special symbols are obtained through Vocab.get_unique
        # (presumably to avoid clashing with existing tokens -- TODO confirm).
        unk_token = Vocab.get_unique(tokens, "<UNK>")
        bos_token = Vocab.get_unique(tokens, "<SEQ_BEG>")
        eos_token = Vocab.get_unique(tokens, "<SEQ_END>")
        # A chained `a != b != c` only compares neighbours and would miss
        # `unk_token == eos_token`; check every pair explicitly.
        assert (unk_token != bos_token
                and bos_token != eos_token
                and unk_token != eos_token), (
            "Special tokens must be pairwise distinct.")
        Vocab.__init__(self,
                       tokens, [unk_token, bos_token, eos_token],
                       lowercase=False)
        self._eos_id = Vocab.map_token_to_id(self, eos_token)
        self._bos_id = Vocab.map_token_to_id(self, bos_token)
        self._unk_id = Vocab.map_token_to_id(self, unk_token)
Example #2
0
    def __init__(self,
                 vocab_path,
                 language="en",
                 tokenizer=None,
                 subtokenizer=None,
                 subtokenizer_codes=None,
                 glossaries=None,
                 reverse_sequence=False,
                 **kwargs):
        """ Sets up tokenization and symbol mapping for text data.

        Args:
            vocab_path: The path to the vocabulary file.
            language: The language.
            tokenizer: The tokenizer name.
            subtokenizer: The name of tokenizer for subword encoding.
            subtokenizer_codes: The subword codes; when None, the subword
                tokenizer (if any) is left uninitialized and a notice is logged.
            glossaries: The glossaries that will not be split by tokenizer/subtokenizer.
            reverse_sequence: A bool, whether to reverse the sequence.
            **kwargs: Extra arguments passed through to the parent initializer.
        """
        super(TextDataPipeline, self).__init__(vocab_path=vocab_path,
                                               language=language,
                                               tokenizer=tokenizer,
                                               subtokenizer=subtokenizer,
                                               subtokenizer_codes=subtokenizer_codes,
                                               glossaries=glossaries,
                                               reverse_sequence=reverse_sequence,
                                               **kwargs)
        self._language = language

        # Options shared by the word-level and subword-level tokenizers.
        common_opts = dict(language=language, glossaries=glossaries)
        self._tokenizer = build_tokenizer(tokenizer, **common_opts)

        self._subtokenizer = None
        subword = build_tokenizer(subtokenizer, **common_opts, vocabulary=vocab_path)
        self._subtokenizer = subword

        if subword is not None and subtokenizer_codes is not None:
            subword.init_subtokenizer(subtokenizer_codes)
        elif subword is not None:
            notice = ("No codes provided for subtokenizer: {}. "
                      "We assume this was done on purpose.".format(subtokenizer))
            logging.info(notice)

        self._symbols_mapper = SymbolsMapper(vocab_path=vocab_path,
                                             reverse=reverse_sequence)
Example #3
0
def _main(_):
    """ Tokenizes a text file line by line and writes the result.

    Flags are parsed from FLAG_LIST; the file at `args["input"]` is read,
    each stripped line is passed through `lowercase_and_remove_punctuations`
    and the tokenizer, and the tokenized string is written to
    `args["output"]` followed by a newline.

    Args:
        _: Unused positional argument.
    """
    parser = flags_core.define_flags(FLAG_LIST, with_config_file=False)
    args, remaining_argv = flags_core.intelligent_parse_flags(FLAG_LIST, parser)
    flags_core.verbose_flags(FLAG_LIST, args, remaining_argv)

    tokenizer = build_tokenizer(args)
    with tf.io.gfile.GFile(args["input"]) as reader, \
            tf.io.gfile.GFile(args["output"], "w") as writer:
        for raw_line in reader:
            cleaned = lowercase_and_remove_punctuations(
                tokenizer.language,
                raw_line.strip(),
                args["lowercase"],
                args["remove_punctuation"])
            tokenized = tokenizer.tokenize(cleaned, return_str=True)
            writer.write(tokenized + "\n")