def _predict_chars(
    model: tf.keras.Sequential,
    sp: spm.SentencePieceProcessor,
    start_string: str,
    store: _BaseConfig,
) -> str:
    """
    Evaluation step (generating text using the learned model).

    Args:
        model: tf.keras.Sequential model
        sp: SentencePiece tokenizer
        start_string: string to bootstrap model
        store: our config object

    Returns:
        A line of generated text
    """
    # Converting our start string to numbers (vectorizing)
    input_eval = sp.EncodeAsIds(start_string)
    input_eval = tf.expand_dims(input_eval, 0)

    # Accumulates the generated token ids for the current line
    sentence_ids = []

    # Here batch size == 1
    model.reset_states()

    while True:
        predictions = model(input_eval)

        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a categorical distribution to
        # predict the word returned by the model
        predictions = predictions / store.gen_temp
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # We pass the predicted word as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        sentence_ids.append(int(predicted_id))

        decoded = sp.DecodeIds(sentence_ids)
        if store.field_delimiter is not None:
            decoded = decoded.replace(
                store.field_delimiter_token, store.field_delimiter
            )

        if "<n>" in decoded:
            return _pred_string(decoded.replace("<n>", ""))
        elif 0 < store.gen_chars <= len(decoded):
            return _pred_string(decoded)
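
# --- Hedged illustration (not from the source): the core sampling step of
# _predict_chars in isolation. `logits` stands for one model output of shape
# [seq_len, vocab_size]; dividing by a temperature < 1.0 sharpens the
# distribution and > 1.0 flattens it before drawing from tf.random.categorical.
import tensorflow as tf

def sample_last_token(logits: tf.Tensor, temperature: float) -> int:
    scaled = logits / temperature
    # tf.random.categorical samples one id per row; keep the last timestep only
    return int(tf.random.categorical(scaled, num_samples=1)[-1, 0].numpy())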
def preprocess_raw_example(
    rawe: RawExample,
    tokenizer: BertTokenizer,
    cond_tokenizer: spm.SentencePieceProcessor,
) -> Tuple[str, Example]:
    e = Example(
        title_token_ids=tokenizer.encode(rawe.title, add_special_tokens=False),
        description_token_ids=tokenizer.encode(rawe.description, add_special_tokens=False),
        condition_token_ids=cond_tokenizer.EncodeAsIds(rawe.condition),
        fact_token_ids=tokenizer.encode(rawe.fact, add_special_tokens=False),
        description=rawe.description,
    )
    return hashlib.sha1(json.dumps(e.__dict__).encode()).hexdigest(), e
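
# A minimal sketch (not from the source) of the key returned above: sha1 over
# the JSON-serialized fields yields a stable, content-addressed id, so
# identical examples hash to the same key and can be deduplicated.
import hashlib
import json

key = hashlib.sha1(json.dumps({"title": "t", "fact": "f"}).encode()).hexdigest()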
def _log_sample_data(model_dir: str, sp: spm.SentencePieceProcessor):
    training_data_path = Path(model_dir) / const.TRAINING_DATA
    if not training_data_path.is_file():
        logging.info("Training data not found for SP sampling")
        return

    with open(training_data_path) as fin:
        sample = fin.readline().strip()

    logging.info(f"Tokenizer model vocabulary size: {len(sp)} tokens")
    logging.info(
        "Mapping first line of training data\n\n{}\n ---- sample tokens mapped to pieces ---- > \n{}\n"
        .format(repr(sample), ", ".join(sp.SampleEncodeAsPieces(sample, -1, 0.1))))
    logging.info(
        "Mapping first line of training data\n\n{}\n ---- sample tokens mapped to int ---- > \n{}\n"
        .format(repr(sample), ", ".join([str(idx) for idx in sp.EncodeAsIds(sample)])))
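
# Hedged illustration (assumes a trained model at the hypothetical path
# "m.model"): SampleEncodeAsPieces(text, nbest_size=-1, alpha=0.1), as used
# above, draws a random segmentation on each call (subword regularization),
# whereas EncodeAsPieces is deterministic.
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("m.model")
print(sp.EncodeAsPieces("hello world"))                  # same pieces every call
print(sp.SampleEncodeAsPieces("hello world", -1, 0.1))   # may differ per call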
def encode_comment(sp_model: sentencepiece.SentencePieceProcessor,
                   comment: str, max_len=None) -> List[int]:
    """ Encode one comment with sentencepiece model.
    """
    # TODO we can do sub-word augmentation here
    start = sp_model.PieceToId('<s>')
    end = sp_model.PieceToId('</s>')
    eol = sp_model.PieceToId(EOL)
    encoded = [start]
    for i, line in enumerate(comment.split('\n')):
        if i:
            encoded.append(eol)
        encoded.extend(sp_model.EncodeAsIds(line))
    encoded.append(end)
    if max_len is not None:
        encoded = encoded[:max_len]
    return encoded
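
# Hypothetical usage of encode_comment ("comment.model" and the EOL value are
# assumptions, not from the source): newlines become an explicit EOL id and
# the whole comment is wrapped in <s> ... </s>.
import sentencepiece

EOL = '<eol>'  # assumed user-defined piece; must exist in the trained model

sp = sentencepiece.SentencePieceProcessor()
sp.Load('comment.model')
ids = encode_comment(sp, 'first line\nsecond line', max_len=128)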
def process_to_ids_in_sparse_format(
        sentence_piece_processor: spm.SentencePieceProcessor,
        sentences: Iterable[str]) -> Tuple[List, List, Tuple[int, int]]:
    """ A utility method that processes sentences with the sentence piece
    processor `sp` and returns the results in tf.SparseTensor-like format:
    (values, indices, dense_shape)
    """
    ids = [sentence_piece_processor.EncodeAsIds(x) for x in sentences]
    max_len = max(len(x) for x in ids)
    dense_shape = (len(ids), max_len)
    values = list(flatten(ids))
    indices = [[row, col]
               for row in range(len(ids))
               for col in range(len(ids[row]))]
    return values, indices, dense_shape
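
# A minimal sketch (not from the source) of consuming the returned triple:
# it maps directly onto tf.SparseTensor, e.g. for sparse-input sentence
# encoders. `sp` is assumed to be an already-loaded SentencePieceProcessor.
import tensorflow as tf

values, indices, dense_shape = process_to_ids_in_sparse_format(sp, ["a b c", "d"])
sparse_ids = tf.SparseTensor(indices=indices, values=values, dense_shape=dense_shape)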
def _create_dataset(store: _BaseConfig, text: str,
                    sp: spm.SentencePieceProcessor) -> tf.data.Dataset:
    """
    Before training, we need to map strings to a numerical representation:
    encode the raw text into SentencePiece token ids, then cut the id stream
    into fixed-length input/target sequences.
    """
    logging.info("Tokenizing training data")
    ids = []
    for line in tqdm(text.split("\n")):
        ids.extend(sp.EncodeAsIds(line))

    logging.info("Creating and shuffling tensorflow dataset")
    char_dataset = tf.data.Dataset.from_tensor_slices(ids)
    sequences = char_dataset.batch(store.seq_length + 1, drop_remainder=True)
    dataset = sequences.map(_split_input_target)
    dataset = dataset.shuffle(store.buffer_size).batch(store.batch_size, drop_remainder=True)
    return dataset
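
# _split_input_target is referenced above but not shown. A plausible
# definition (a sketch, not necessarily the source's) shifts each chunk by one
# token, which is why chunks are batched with length seq_length + 1:
def _split_input_target(chunk):
    input_ids = chunk[:-1]   # tokens 0 .. n-1
    target_ids = chunk[1:]   # tokens 1 .. n
    return input_ids, target_ids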
def _encode_batch(self, texts):
    from sentencepiece import SentencePieceProcessor
    tok = SentencePieceProcessor()
    tok.Load(str(self.sp_model))
    return [np.array(tok.EncodeAsIds(t)) for t in texts]
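
# Design note with a hedged alternative: re-loading the tokenizer on every
# call keeps the owning object picklable (useful for multiprocessing) at the
# cost of repeated Load() calls. A module-level cache is one common option:
from functools import lru_cache
from sentencepiece import SentencePieceProcessor

@lru_cache(maxsize=1)
def _load_tokenizer(model_path: str) -> SentencePieceProcessor:
    tok = SentencePieceProcessor()
    tok.Load(model_path)
    return tok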
class SentencepieceFasttextEmbed(EmbedderInterface):

    class Config(EmbedderInterface.Config):
        pass

    @classmethod
    def from_config(cls, config: Config):
        spm_model_file = os.path.join(config.preproc_dir, "spm.model")
        fasttext_model_file = os.path.join(config.preproc_dir, "fasttext-model.bin")
        return cls(spm_model_file, fasttext_model_file, config.max_pieces)

    def __init__(self, spm_model_file: str, fasttext_model_file: str = '',
                 max_pieces: int = -1):
        super().__init__(max_pieces=max_pieces)
        self.spm = SentencePieceProcessor()
        self.spm.Load(spm_model_file)
        self.pad_idx = self.spm.pad_id()
        self.pad_token = self.spm.IdToPiece(self.pad_idx)
        self.unk_idx = self.spm.unk_id()
        self.unk_token = self.spm.IdToPiece(self.unk_idx)
        self.bos_idx = self.spm.bos_id()
        self.bos_token = self.spm.IdToPiece(self.bos_idx)
        self.eos_idx = self.spm.eos_id()
        self.eos_token = self.spm.IdToPiece(self.eos_idx)
        if fasttext_model_file:
            self.fasttext = fasttext.load_model(fasttext_model_file)

    @property
    def embed_dim(self):
        return self.fasttext.dim

    @property
    def n_vocab(self):
        return self.spm.get_piece_size()

    def encode_text_as_ids(self, text: str) -> np.array:
        """
        Doesn't produce BOS, EOS ids.
        """
        return np.asarray(self.spm.EncodeAsIds(text)[self.pieces_slice], dtype=np.int32)

    def encode_text_as_tokens(self, text: str) -> List[str]:
        """
        Doesn't produce BOS, EOS tokens.
        """
        return self.spm.EncodeAsPieces(text)[self.pieces_slice]

    def tokenize(self, text: str) -> List[str]:
        """
        Alias for `encode_text_as_tokens`.
        Doesn't produce BOS, EOS tokens.
        """
        # encode_text_as_tokens already applies pieces_slice; slicing the
        # result a second time here would be redundant.
        return self.encode_text_as_tokens(text)

    def decode_ids_as_text(self, ids: List[int], strip_special=True) -> str:
        """
        Doesn't produce PAD, BOS, or EOS text.
        i.e. PAD, BOS, EOS ids are stripped out before decoding.
        UNK is decoded but unintelligible.
        """
        if strip_special:
            ids = [
                int(id) for id in ids
                if id not in (self.pad_idx, self.bos_idx, self.eos_idx)
            ]
        else:
            ids = [int(id) for id in ids]
        return self.spm.DecodeIds(ids)

    def decode_tokens_as_text(self, toks: List[str]) -> str:
        """
        Doesn't produce PAD, BOS, or EOS text.
        i.e. PAD, BOS, EOS tokens are stripped out before decoding.
        UNK is decoded but unintelligible.
        """
        return self.spm.DecodePieces(toks[self.pieces_slice])

    @functools.lru_cache(maxsize=1024)
    def decode_id_as_token(self, id: int) -> str:
        return self.spm.IdToPiece(id)

    def decode_ids_as_tokens(self, ids: List[int],
                             strip_special: bool = True) -> List[str]:
        """
        By default, doesn't produce PAD, BOS, EOS tokens.

        Avoids the problematic intermediate string representation that causes
        a length mismatch. In other words, SentencePiece isn't isomorphic with
        respect to the string representation.
        """
        if strip_special:
            ids = [
                id for id in ids
                if id not in (self.pad_idx, self.bos_idx, self.eos_idx)
            ]
        return [self.decode_id_as_token(int(ix)) for ix in ids]

    @functools.lru_cache(maxsize=1024)
    def embed_tok(self, tok: str) -> np.array:
        """
        When given PAD, returns all zeros
        """
        if tok == self.pad_token:
            return np.zeros(self.fasttext.dim)
        return np.asarray(self.fasttext[tok])

    def embed_text(self, text: str) -> np.array:
        """
        Doesn't produce PAD, BOS, EOS embeddings.
        i.e. PAD, BOS, EOS are stripped out during tokenization before embedding.
        """
        return np.asarray([self.embed_tok(tok) for tok in self.tokenize(text)])

    def embed_ids(self, ids: List[int],
                  strip_special: bool = True) -> List[np.array]:
        """
        By default, doesn't produce PAD, BOS, EOS tokens.

        Avoids the problematic intermediate string representation that causes
        a length mismatch. In other words, SentencePiece isn't isomorphic with
        respect to the string representation.
        """
        return [
            self.embed_tok(t)
            for t in self.decode_ids_as_tokens(ids, strip_special=strip_special)
        ]

    def embed_ids_batch(self, ids: np.array) -> torch.Tensor:
        emb = [self.embed_ids(turn, strip_special=False) for turn in ids]
        emb = torch.tensor(emb)
        return emb
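
# Hedged usage sketch for SentencepieceFasttextEmbed (both file paths are
# assumptions): ids come from SentencePiece, vectors from fastText, and PAD
# embeds to all zeros.
emb = SentencepieceFasttextEmbed("spm.model", "fasttext-model.bin")
ids = emb.encode_text_as_ids("hello world")   # int32 ids, no BOS/EOS
vecs = emb.embed_text("hello world")          # shape: [n_pieces, embed_dim]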
def test_encoding_using_sp_model(tokenizer: SentencePieceProcessor):
    assert tokenizer.EncodeAsIds("welcome home") == [3441, 4984, 1004]
class BPE_Dictionary(object):

    def __init__(
        self,
        dict,
        dict_type,
        pad=constants.PAD,
        eos=constants.EOS,
        unk=constants.UNK,
        bos=constants.BOS,
    ):
        self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos
        self.dict = os.path.expanduser(dict)
        self.dict_type = dict_type
        if self.dict_type == SENTENCEPIECE:
            assert self.exists(self.dict, self.dict_type)
            self.bpe_dict = SentencePieceProcessor()
            self.bpe_dict.Load(f'{self.dict}.model')
            self.pad_index = self.bpe_dict.pad_id()
            self.bos_index = self.bpe_dict.bos_id()
            self.eos_index = self.bpe_dict.eos_id()
            self.unk_index = self.bpe_dict.unk_id()

    @staticmethod
    def exists(dict, dict_type='sentencepiece'):
        dict = os.path.expanduser(dict)
        if dict_type == SENTENCEPIECE:
            dict_file = f'{dict}.model'
            vocab_file = f'{dict}.vocab'
            return os.path.exists(dict_file) and os.path.exists(vocab_file)
        else:
            raise NotImplementedError

    def save(self, dict_name):
        dict_name = os.path.expanduser(dict_name)
        os.makedirs(os.path.dirname(dict_name), exist_ok=True)
        if self.dict_type == SENTENCEPIECE:
            shutil.copy(f'{self.dict}.model', f'{dict_name}.model')
            shutil.copy(f'{self.dict}.vocab', f'{dict_name}.vocab')
        else:
            raise NotImplementedError

    def encode_tokens(self, sentence):
        return self.bpe_dict.EncodeAsPieces(sentence)

    def encode_ids(self, sentence):
        return self.bpe_dict.EncodeAsIds(sentence)

    def string(self, tensor: torch.Tensor, bpe_symbol=None,
               escape_unk=None, trunc_eos=None):
        if torch.is_tensor(tensor) and tensor.dim() == 2:
            return "\n".join(
                self.string(t, bpe_symbol, escape_unk, trunc_eos)
                for t in tensor)
        return self.bpe_dict.Decode(tensor.tolist())

    def __getitem__(self, idx):
        return self.bpe_dict.IdToPiece(idx)

    def __contains__(self, sym):
        return self.index(sym) != self.unk()

    def index(self, sym):
        return self.bpe_dict[sym]

    def __len__(self):
        return len(self.bpe_dict)

    def bos(self):
        """Helper to get index of beginning-of-sentence symbol"""
        return self.bos_index

    def pad(self):
        """Helper to get index of pad symbol"""
        return self.pad_index

    def eos(self):
        """Helper to get index of end-of-sentence symbol"""
        return self.eos_index

    def unk(self):
        """Helper to get index of unk symbol"""
        return self.unk_index
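
# Hypothetical usage of BPE_Dictionary (the "bpe/dict" prefix is an
# assumption): the class expects `{dict}.model` and `{dict}.vocab` as produced
# by SentencePiece training.
d = BPE_Dictionary("bpe/dict", SENTENCEPIECE)
piece_ids = d.encode_ids("hello world")       # list of subword ids
pieces = d.encode_tokens("hello world")       # list of subword pieces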
import os
import subprocess

import gin
import numpy as onp
import trax
from sentencepiece import SentencePieceProcessor

# %%
sentencepiece_dir = os.path.expanduser('ujimaru_text_generate/sentencepiece')
reformer_dir = os.path.expanduser('ujimaru_text_generate/reformer')

with open(os.path.join(sentencepiece_dir, "wiki-daimyo.txt")) as f:
    text = f.read().strip()

# %%
TOKENIZER = SentencePieceProcessor()
TOKENIZER.Load(os.path.join(sentencepiece_dir, 'wiki-daimyo.model'))

# %%
IDS = TOKENIZER.EncodeAsIds(text)
IDS = onp.asarray(IDS, dtype=onp.int32)
print("Number of tokens:", IDS.shape[0])

# Configure hyperparameters.
gin.parse_config_file('ujimaru_text_generate/config.gin')

# %%
# As we report in the Reformer paper, increasing the number of hashing rounds
# helps with quality. We can even increase the number of hashing rounds at
# evaluation time only.
gin.parse_config("""LSHCausalAttention.n_hashes = 4""")

model_infer = trax.models.ReformerLM(mode='predict')

# Set up the initial state for sampling.
class BPEDataset(SummarizationDataset):
    """Summarization dataset with Byte-Pair encoding.

    Args:
        directory (str): Dataset directory.
        prefix (str): Dataset preprocessing prefix.
        part (str): Dataset part name. :attr:`directory` must contain
            :attr:`part`.tsv file. Use `None` for sampling.
        max_sequence_length (int, optional): Defaults to 150. Maximum sequence length.

    Note:
        Use **kwargs to set up preprocessing arguments.
    """

    def __init__(self, directory: str, prefix: str, part: str,
                 max_sequence_length=150, **kwargs):
        self.data_workdir = os.path.join(directory, prefix)
        self.spm_file = os.path.join(self.data_workdir, "spm.model")

        if part is None:
            self._sample_init(self.spm_file, max_sequence_length)
            return

        self.source_part_file = os.path.join(directory, part + ".tsv")
        self.part_file = os.path.join(self.data_workdir, part + ".npy")

        if not self.exist(directory, prefix, part):
            logger.info("Dataset part {}/{} not found".format(self.data_workdir, part))
            self.preprocess(directory, prefix, part, **kwargs)

        self.data = np.load(self.part_file, allow_pickle=True)

        if "spm" in kwargs:
            logger.info("Use existing spm model")
            self.spm = kwargs["spm"]
        else:
            logger.info("Load spm model")
            self.spm = SentencePieceProcessor()
            self.spm.load(self.spm_file)

        self.pad_symbol = self.spm.pad_id()
        self.eos_symbol = self.spm.eos_id()

        self._len = self.data.shape[0]

        sequence_lens = [
            len(seq) for example in self.data for seq in example
        ]
        self.max_sequence_length = min(max_sequence_length, max(sequence_lens))

    def _sample_init(self, spm_file_name, max_sequence_length):
        if not os.path.exists(spm_file_name):
            raise RuntimeError("Preprocess the dataset first")
        self.spm = SentencePieceProcessor()
        self.spm.load(spm_file_name)
        self.pad_symbol = self.spm.pad_id()
        self.eos_symbol = self.spm.eos_id()
        self._len = 0
        self.data = []
        self.max_sequence_length = max_sequence_length

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self._len

    @staticmethod
    def exist(directory: str, prefix: str, part: str) -> bool:
        """Check dataset existence.

        Args:
            directory (str): Dataset directory.
            prefix (str): Dataset prefix.
            part (str): Dataset part.

        Returns:
            bool: Existence status.
        """
        data_workdir = os.path.join(directory, prefix)
        part_filename = os.path.join(data_workdir, part + ".npy")
        spm_filename = os.path.join(data_workdir, "spm.model")

        necessary_files = [part_filename, spm_filename]
        existing = [os.path.exists(filename) for filename in necessary_files]
        return all(existing)

    @staticmethod
    def preprocess(directory: str, prefix: str, part: str,
                   spm: SentencePieceProcessor = None,
                   pretrain_emb=True,
                   vocab_size=30000,
                   embedding_size=300,
                   max_sentence_length=16384,
                   workers=3,
                   skip_gramm=False):
        """Preprocess dataset.

        Args:
            directory (str): Dataset directory.
            prefix (str): Dataset preprocessing prefix.
            part (str): Dataset part. :attr:`directory` must contain
                :attr:`part`.tsv file with data.
            spm (SentencePieceProcessor, optional): Defaults to None. SentencePiece model.
            pretrain_emb (bool, optional): Defaults to True. Whether to pretrain embeddings.
            vocab_size (int, optional): Defaults to 30000. Vocabulary size.
            embedding_size (int, optional): Defaults to 300. Pretrained embedding size.
            max_sentence_length (int, optional): Defaults to 16384.
                Maximum sentence length for sentencepiece.
            workers (int, optional): Defaults to 3. Number of workers.
            skip_gramm (bool, optional): Defaults to False.
                Whether to use skip-gram type of Word2Vec training.

        Raises:
            FileNotFoundError: Raised if the source data file doesn't exist.
        """
        data_workdir = os.path.join(directory, prefix)

        part_source_filename = os.path.join(directory, part + ".tsv")
        part_exported_filename = os.path.join(data_workdir, part + ".npy")
        spm_filename = os.path.join(data_workdir, "spm.model")
        spm_directory = os.path.join(data_workdir, "spm")
        w2v_model_filename = os.path.join(data_workdir, "word2vec.model")
        embeddings_filename = os.path.join(data_workdir, "embedding.npy")

        logger.info("Preprocess {}/{} dataset.".format(data_workdir, part))
        os.makedirs(data_workdir, exist_ok=True)

        if not os.path.exists(part_source_filename):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    part_source_filename)

        if part not in ["train", "dev"]:
            assert spm is not None, "For non train part, `spm` must be specified."
        else:
            logger.info("Start training sentencepiece")

            spm_params = (
                "--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                "--input={} --model_prefix={} --vocab_size={} --max_sentence_length={}".format(
                    part_source_filename, spm_directory, vocab_size, max_sentence_length
                )
            )
            SentencePieceTrainer.Train(spm_params)
            spm = SentencePieceProcessor()
            spm.load(spm_filename)

            if pretrain_emb:
                logger.info("Start training Word2Vec embeddings")
                train_sentences = SentenceIterator(part_source_filename, spm)
                logger.info("Loaded train sentences")
                w2v_model = Word2Vec(train_sentences, min_count=0, workers=workers,
                                     size=embedding_size, sg=int(skip_gramm))
                w2v_model.save(w2v_model_filename)

                # Export embeddings
                logger.info("Export embeddings")
                export_embeddings(embeddings_filename, spm, w2v_model)
                logger.info("Embeddings have been saved into {}".format(embeddings_filename))

        logger.info("Start exporting data file")
        sentence_iterator = SentenceIterator(part_source_filename, spm)
        sentence_iterator.export(part_exported_filename)
        logger.info("{} exported".format(part_exported_filename))

    def get_embeddings(self) -> np.array:
        """Load pretrained embeddings.

        Returns:
            np.array: Array with word2vec embeddings if it exists, otherwise `None`.
        """
        embeddings_path = os.path.join(self.data_workdir, "embedding.npy")
        if not os.path.exists(embeddings_path):
            logging.info("Embedding file not found")
            return None
        else:
            logging.info("Loading embedding dump file")
            return np.load(embeddings_path)

    def get_spm(self) -> SentencePieceProcessor:
        return self.spm

    def encode(self, sequences):
        sequences = [self.spm.EncodeAsIds(s)[:self.max_sequence_length]
                     for s in sequences]
        return torch.LongTensor(sequences)

    def decode(self, sequences):
        sequences = [list(takewhile(lambda x: x != self.eos_symbol, sequence))
                     for sequence in sequences]
        return [self.spm.DecodeIds([token.item() for token in sentence])
                for sentence in sequences]

    def collate_function(self, batch):
        src_list, src_length_list = self._pad_sequence(
            [example[0][:self.max_sequence_length] for example in batch],
            self.pad_symbol)
        trg_list, trg_length_list = self._pad_sequence(
            [example[1][:self.max_sequence_length] for example in batch],
            self.pad_symbol)
        batch = {
            "src": torch.LongTensor(src_list),
            "trg": torch.LongTensor(trg_list),
            "src_length": src_length_list,
            "trg_length": trg_length_list,
        }
        return batch

    @staticmethod
    def _pad_sequence(sequences, pad_symbol=0):
        sequence_lengths = [len(sequence) for sequence in sequences]
        max_len = max(sequence_lengths)
        for i, length in enumerate(sequence_lengths):
            to_add = max_len - length
            sequences[i] += [pad_symbol] * to_add
        return sequences, sequence_lengths
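
# A minimal sketch (not from the source) of what _pad_sequence produces:
# shorter sequences are right-padded with pad_symbol up to the batch maximum,
# and the original lengths are returned alongside.
seqs, lens = BPEDataset._pad_sequence([[5, 6, 7], [8]], pad_symbol=0)
# seqs -> [[5, 6, 7], [8, 0, 0]]; lens -> [3, 1]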