import logging
from pathlib import Path

import sentencepiece as spm

# `const` is assumed to be a project-local module that defines TRAINING_DATA
# (the file name of the training corpus inside the model directory).
import const


def _log_sample_data(model_dir: str, sp: spm.SentencePieceProcessor):
    """Log the tokenizer vocabulary size and a sample tokenization of the
    first line of the training data, both as subword pieces and as ids."""
    training_data_path = Path(model_dir) / const.TRAINING_DATA
    if not training_data_path.is_file():
        logging.info("Training data not found for SP sampling")
        return

    with open(training_data_path) as fin:
        sample = fin.readline().strip()

    logging.info(f"Tokenizer model vocabulary size: {len(sp)} tokens")
    logging.info(
        "Mapping first line of training data\n\n{}\n ---- sample tokens mapped to pieces ---- > \n{}\n"
        .format(repr(sample), ", ".join(sp.SampleEncodeAsPieces(sample, -1, 0.1))))
    logging.info(
        "Mapping first line of training data\n\n{}\n ---- sample tokens mapped to int ---- > \n{}\n"
        .format(repr(sample), ", ".join([str(idx) for idx in sp.EncodeAsIds(sample)])))
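# Illustrative only: a minimal sketch of how `_log_sample_data` might be invoked
# once a SentencePiece model has been trained. The directory "model_dir" and the
# file name "sp.model" are hypothetical placeholders, not values from the original code.
logging.basicConfig(level=logging.INFO)
sp = spm.SentencePieceProcessor()
sp.Load("model_dir/sp.model")  # load a previously trained SentencePiece model
_log_sample_data("model_dir", sp)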
from typing import List

from sentencepiece import SentencePieceProcessor

# `Tokenizer`, `Token`, and `cached_path` are assumed to come from the surrounding
# project (an AllenNLP-style tokenizer interface and file-caching helper).


class SubwordTokenizer(Tokenizer):
    """Tokenizer that splits text into subword pieces using a SentencePiece model.

    If both `nbest_size` and `alpha` are given, subword regularization is used
    and segmentations are sampled; otherwise the single best segmentation is used.
    """

    def __init__(self,
                 model_path: str = None,
                 nbest_size: int = None,
                 alpha: float = None):
        self._model_path = cached_path(model_path)
        self._processor = SentencePieceProcessor()
        self._processor.Load(self._model_path)
        self._nbest_size = nbest_size
        self._alpha = alpha

    def tokenize(self, text: str) -> List[Token]:
        if self._nbest_size and self._alpha:
            # Sample a segmentation (subword regularization).
            subwords = self._processor.SampleEncodeAsPieces(
                text, self._nbest_size, self._alpha)
        else:
            # Deterministic best segmentation.
            subwords = self._processor.EncodeAsPieces(text)
        return [Token(s) for s in subwords]

    def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
        return [self.tokenize(text) for text in texts]
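# Illustrative only: a minimal usage sketch of SubwordTokenizer. The model path,
# nbest_size, and alpha values below are hypothetical placeholders. Passing both
# nbest_size and alpha turns on sampled segmentations, so repeated calls on the
# same text may return different subword splits; omitting them always returns
# the single best segmentation.
deterministic = SubwordTokenizer("models/sp.model")
sampled = SubwordTokenizer("models/sp.model", nbest_size=64, alpha=0.1)

print(deterministic.tokenize("unsupervised subword tokenization"))
print(sampled.tokenize("unsupervised subword tokenization"))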