Example #1
def _predict_chars(
    model: tf.keras.Sequential,
    sp: spm.SentencePieceProcessor,
    start_string: str,
    store: _BaseConfig,
) -> str:
    """
    Evaluation step (generating text using the learned model).

    Args:
        model: tf.keras.Sequential model
        sp: SentencePiece tokenizer
        start_string: string to bootstrap model
        store: our config object
    Returns:
        A line of generated text
    """

    # Converting our start string to numbers (vectorizing)
    input_eval = sp.EncodeAsIds(start_string)
    input_eval = tf.expand_dims(input_eval, 0)

    # List to accumulate the generated token ids
    sentence_ids = []

    # Here batch size == 1
    model.reset_states()

    while True:
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a categorical distribution to
        # predict the word returned by the model
        predictions = predictions / store.gen_temp
        predicted_id = tf.random.categorical(predictions,
                                             num_samples=1)[-1, 0].numpy()

        # We pass the predicted word as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)
        sentence_ids.append(int(predicted_id))

        decoded = sp.DecodeIds(sentence_ids)
        if store.field_delimiter is not None:
            decoded = decoded.replace(store.field_delimiter_token,
                                      store.field_delimiter)

        if "<n>" in decoded:
            return _pred_string(decoded.replace("<n>", ""))
        elif 0 < store.gen_chars <= len(decoded):
            return _pred_string(decoded)
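The sampling step above is the heart of the generator: the logits are divided by a temperature before tf.random.categorical draws the next token id. A minimal sketch of just that step, with an illustrative vocabulary size and temperature standing in for a real model's output:

import tensorflow as tf

vocab_size = 8
temperature = 0.8  # lower -> sharper, more conservative sampling

logits = tf.random.normal((1, vocab_size))  # stand-in for model output, [batch=1, vocab]
scaled = logits / temperature
predicted_id = tf.random.categorical(scaled, num_samples=1)[-1, 0].numpy()
print(int(predicted_id))  # an id in [0, vocab_size)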
Example #2
def preprocess_raw_example(
    rawe: RawExample,
    tokenizer: BertTokenizer,
    cond_tokenizer: spm.SentencePieceProcessor,
) -> Tuple[str, Example]:

    e = Example(
        title_token_ids=tokenizer.encode(rawe.title, add_special_tokens=False),
        description_token_ids=tokenizer.encode(rawe.description,
                                               add_special_tokens=False),
        condition_token_ids=cond_tokenizer.EncodeAsIds(rawe.condition),
        fact_token_ids=tokenizer.encode(rawe.fact, add_special_tokens=False),
        description=rawe.description,
    )
    return hashlib.sha1(json.dumps(e.__dict__).encode()).hexdigest(), e
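The SHA-1 over the JSON-serialized example gives a stable deduplication key for the preprocessed record. The same idea in isolation (the dict contents are illustrative):

import hashlib
import json

record = {"title": [101, 2003], "fact": [42]}  # illustrative token ids
key = hashlib.sha1(json.dumps(record).encode()).hexdigest()

Note that json.dumps preserves insertion order, so the key is only stable if the fields are always built in the same order; passing sort_keys=True would make it order-independent.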
Example #3
def _log_sample_data(model_dir: str, sp: spm.SentencePieceProcessor):
    training_data_path = Path(model_dir) / const.TRAINING_DATA
    if not training_data_path.is_file():
        logging.info("Training data not found for SP sampling")
        return

    with open(training_data_path) as fin:
        sample = fin.readline().strip()

    logging.info(f"Tokenizer model vocabulary size: {len(sp)} tokens")
    logging.info(
        "Mapping first line of training data\n\n{}\n ---- sample tokens mapped to pieces ---- > \n{}\n"
        .format(repr(sample),
                ", ".join(sp.SampleEncodeAsPieces(sample, -1, 0.1))))
    logging.info(
        "Mapping first line of training data\n\n{}\n ---- sample tokens mapped to int ---- > \n{}\n"
        .format(repr(sample),
                ", ".join([str(idx) for idx in sp.EncodeAsIds(sample)])))
Example #4
def encode_comment(sp_model: sentencepiece.SentencePieceProcessor,
                   comment: str,
                   max_len=None) -> List[int]:
    """ Encode one comment with sentencepiece model.
    """
    # TODO we can do sub-word augmentation here
    start = sp_model.PieceToId('<s>')
    end = sp_model.PieceToId('</s>')
    eol = sp_model.PieceToId(EOL)
    encoded = [start]
    for i, line in enumerate(comment.split('\n')):
        if i:
            encoded.append(eol)
        encoded.extend(sp_model.EncodeAsIds(line))
    encoded.append(end)
    if max_len is not None:
        encoded = encoded[:max_len]
    return encoded
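A usage sketch, assuming the model was trained with '<s>', '</s>', and the EOL string as user-defined symbols (the path and symbol are illustrative; PieceToId returns the unk id for pieces the model doesn't know):

import sentencepiece as spm

EOL = '<eol>'  # must match the symbol used when training the model

sp_model = spm.SentencePieceProcessor()
sp_model.Load('spm.model')  # hypothetical path

ids = encode_comment(sp_model, "first line\nsecond line", max_len=64)
# ids = [<s>] + line 1 pieces + [<eol>] + line 2 pieces + [</s>], truncated to 64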
Example #5
def process_to_ids_in_sparse_format(
        sentence_piece_processor: spm.SentencePieceProcessor,
        sentences: Iterable[str]) -> Tuple[List, List, Tuple[int, int]]:
    """
    An utility method that processes sentences with the sentence piece processor
    `sp` and returns the results in tf.SparseTensor-similar format:
    (values, indices, dense_shape)
    """

    ids = [sentence_piece_processor.EncodeAsIds(x) for x in sentences]
    max_len = max(len(x) for x in ids)

    dense_shape = (len(ids), max_len)
    values = list(flatten(ids))
    indices = [[row, col] for row in range(len(ids))
               for col in range(len(ids[row]))]

    return values, indices, dense_shape
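The three return values line up with the arguments of tf.SparseTensor, which is presumably the intended consumer. A sketch with illustrative ids in place of tokenizer output:

import tensorflow as tf

values = [4, 8, 15, 16, 23]                        # flattened ids
indices = [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]]
dense_shape = (2, 3)                               # (num_sentences, max_len)

sparse = tf.SparseTensor(indices=indices, values=values, dense_shape=dense_shape)
print(tf.sparse.to_dense(sparse))                  # short rows padded with 0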
Example #6
def _create_dataset(store: _BaseConfig, text: str,
                    sp: spm.SentencePieceProcessor) -> tf.data.Dataset:
    """
    Before training, we need to map strings to a numerical representation.
    Create two lookup tables: one mapping characters to numbers,
    and another for numbers to characters.
    """
    logging.info("Tokenizing training data")
    ids = []
    for line in tqdm(text.split("\n")):
        ids.extend(sp.EncodeAsIds(line))

    logging.info("Creating and shuffling tensorflow dataset")
    char_dataset = tf.data.Dataset.from_tensor_slices(ids)
    sequences = char_dataset.batch(store.seq_length + 1, drop_remainder=True)
    dataset = sequences.map(_split_input_target)
    dataset = dataset.shuffle(store.buffer_size).batch(store.batch_size,
                                                       drop_remainder=True)
    return dataset
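_split_input_target is not shown here; in the standard TensorFlow text-generation recipe it shifts each length seq_length + 1 window into aligned input/target pairs. A plausible sketch of what it does:

def _split_input_target(chunk):
    # e.g. ids for "hello" -> input ids for "hell", target ids for "ello"
    input_ids = chunk[:-1]
    target_ids = chunk[1:]
    return input_ids, target_ids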
Example #7
def _encode_batch(self, texts):
    from sentencepiece import SentencePieceProcessor
    tok = SentencePieceProcessor()
    tok.Load(str(self.sp_model))
    return [np.array(tok.EncodeAsIds(t)) for t in texts]
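Constructing the processor inside the method, rather than holding it on self, keeps the method usable from worker processes, since the SWIG-backed processor doesn't pickle. A sketch of fanning batches out across processes (the corpus, chunk size, and encoder object are hypothetical):

from concurrent.futures import ProcessPoolExecutor

texts = ["some text"] * 5000          # illustrative corpus
chunks = [texts[i:i + 1000] for i in range(0, len(texts), 1000)]
with ProcessPoolExecutor() as pool:
    # `encoder` is a hypothetical instance of the class owning _encode_batch.
    encoded = [ids for batch in pool.map(encoder._encode_batch, chunks)
               for ids in batch]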
Example #8
class SentencepieceFasttextEmbed(EmbedderInterface):
    class Config(EmbedderInterface.Config):
        pass

    @classmethod
    def from_config(cls, config: Config):
        spm_model_file = os.path.join(config.preproc_dir, "spm.model")
        fasttext_model_file = os.path.join(config.preproc_dir,
                                           "fasttext-model.bin")
        return cls(spm_model_file, fasttext_model_file, config.max_pieces)

    def __init__(self,
                 spm_model_file: str,
                 fasttext_model_file: str = '',
                 max_pieces: int = -1):
        super().__init__(max_pieces=max_pieces)

        self.spm = SentencePieceProcessor()
        self.spm.Load(spm_model_file)
        self.pad_idx = self.spm.pad_id()
        self.pad_token = self.spm.IdToPiece(self.pad_idx)
        self.unk_idx = self.spm.unk_id()
        self.unk_token = self.spm.IdToPiece(self.unk_idx)
        self.bos_idx = self.spm.bos_id()
        self.bos_token = self.spm.IdToPiece(self.bos_idx)
        self.eos_idx = self.spm.eos_id()
        self.eos_token = self.spm.IdToPiece(self.eos_idx)

        if fasttext_model_file:
            self.fasttext = fasttext.load_model(fasttext_model_file)

    @property
    def embed_dim(self):
        return self.fasttext.dim

    @property
    def n_vocab(self):
        return self.spm.get_piece_size()

    def encode_text_as_ids(self, text: str) -> np.array:
        """
        Doesn't produce BOS, EOS ids.
        """
        return np.asarray(self.spm.EncodeAsIds(text)[self.pieces_slice],
                          dtype=np.int32)

    def encode_text_as_tokens(self, text: str) -> List[str]:
        """
        Doesn't produce BOS, EOS tokens.
        """
        return self.spm.EncodeAsPieces(text)[self.pieces_slice]

    def tokenize(self, text: str) -> List[str]:
        """
        Alias for `encode_text_as_tokens`.
        Doesn't produce BOS, EOS tokens.
        """
        # encode_text_as_tokens already applies pieces_slice; don't slice twice.
        return self.encode_text_as_tokens(text)

    def decode_ids_as_text(self, ids: List[int], strip_special=True) -> str:
        """
        Doesn't produce PAD, BOS, or EOS text.
        i.e. PAD, BOS, EOS ids are stripped out before decoding.
        UNK is decoded but unintelligible.
        """
        if strip_special:
            ids = [
                int(id) for id in ids
                if id not in (self.pad_idx, self.bos_idx, self.eos_idx)
            ]
        else:
            ids = [int(id) for id in ids]
        return self.spm.DecodeIds(ids)

    def decode_tokens_as_text(self, toks: List[str]) -> str:
        """
        Doesn't produce PAD, BOS, or EOS text.
        i.e. PAD, BOS, EOS tokens are stripped out before decoding.
        UNK is decoded but unintelligible.
        """
        return self.spm.DecodePieces(toks[self.pieces_slice])

    @functools.lru_cache(maxsize=1024)
    def decode_id_as_token(self, id: int) -> str:
        return self.spm.IdToPiece(id)

    def decode_ids_as_tokens(self,
                             ids: List[int],
                             strip_special: bool = True) -> List[str]:
        """
        By default, doesn't produce PAD, BOS, EOS tokens.

        Avoids a problematic intermediate string representation that causes
        length mismatches; in other words, SentencePiece isn't isomorphic
        with respect to the string representation.
        """
        if strip_special:
            ids = [
                id for id in ids
                if id not in (self.pad_idx, self.bos_idx, self.eos_idx)
            ]
        return [self.decode_id_as_token(int(ix)) for ix in ids]

    @functools.lru_cache(maxsize=1024)
    def embed_tok(self, tok: str) -> np.array:
        """
        When given PAD, returns all zeros.
        """
        if tok == self.pad_token:
            return np.zeros(self.fasttext.dim)
        return np.asarray(self.fasttext[tok])

    def embed_text(self, text: str) -> np.array:
        """
        Doesn't produce PAD, BOS, EOS embeddings.
        i.e. PAD, BOS, EOS are stripped out during tokenization before embedding.
        """
        return np.asarray([self.embed_tok(tok) for tok in self.tokenize(text)])

    def embed_ids(self,
                  ids: List[int],
                  strip_special: bool = True) -> List[np.array]:
        """
        By default, doesn't produce PAD, BOS, EOS tokens.

        Avoids a problematic intermediate string representation that causes
        length mismatches; in other words, SentencePiece isn't isomorphic
        with respect to the string representation.
        """
        return [
            self.embed_tok(t)
            for t in self.decode_ids_as_tokens(ids,
                                               strip_special=strip_special)
        ]

    def embed_ids_batch(self, ids: np.array) -> torch.tensor:
        emb = [self.embed_ids(turn, strip_special=False) for turn in ids]
        emb = torch.tensor(emb)
        return emb
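A usage sketch, assuming a preprocessing directory with a trained spm.model and fasttext-model.bin (the paths are illustrative):

embedder = SentencepieceFasttextEmbed(
    spm_model_file="preproc/spm.model",
    fasttext_model_file="preproc/fasttext-model.bin",
    max_pieces=128)

ids = embedder.encode_text_as_ids("hello world")   # int32 ids, no BOS/EOS
vecs = embedder.embed_text("hello world")          # [n_pieces, embed_dim] array
text = embedder.decode_ids_as_text(ids.tolist())   # round-trip, specials stripped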
Example #9
def test_encoding_using_sp_model(tokenizer: SentencePieceProcessor):
    assert tokenizer.EncodeAsIds("welcome home") == [3441, 4984, 1004]
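The expected ids are specific to one trained model, so a test like this needs a model artifact pinned in the repository. A plausible pytest fixture behind the tokenizer argument:

import pytest
from sentencepiece import SentencePieceProcessor

@pytest.fixture
def tokenizer() -> SentencePieceProcessor:
    sp = SentencePieceProcessor()
    sp.Load("tests/data/sp.model")  # hypothetical path to the pinned model
    return sp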
Example #10
class BPE_Dictionary(object):
    def __init__(
        self,
        dict,
        dict_type,
        pad=constants.PAD,
        eos=constants.EOS,
        unk=constants.UNK,
        bos=constants.BOS,
    ):
        self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos
        self.dict = os.path.expanduser(dict)
        self.dict_type = dict_type

        if self.dict_type == SENTENCEPIECE:
            assert self.exists(self.dict, self.dict_type)
            self.bpe_dict = SentencePieceProcessor()
            self.bpe_dict.load(f'{self.dict}.model')
            self.pad_index = self.bpe_dict.pad_id()
            self.bos_index = self.bpe_dict.bos_id()
            self.eos_index = self.bpe_dict.eos_id()
            self.unk_index = self.bpe_dict.unk_id()

    @staticmethod
    def exists(dict, dict_type='sentencepiece'):
        dict = os.path.expanduser(dict)
        if dict_type == SENTENCEPIECE:
            dict_file = f'{dict}.model'
            vocab_file = f'{dict}.vocab'
            if os.path.exists(dict_file) and os.path.exists(vocab_file):
                return True
            else:
                return False
        else:
            raise NotImplementedError

    def save(self, dict_name):
        dict_name = os.path.expanduser(dict_name)
        os.makedirs(os.path.dirname(dict_name), exist_ok=True)
        if self.dict_type == SENTENCEPIECE:
            shutil.copy(f'{self.dict}.model', f'{dict_name}.model')
            shutil.copy(f'{self.dict}.vocab', f'{dict_name}.vocab')
        else:
            raise NotImplementedError

    def encode_tokens(self, sentence):
        return self.bpe_dict.EncodeAsPieces(sentence)

    def encode_ids(self, sentence):
        return self.bpe_dict.EncodeAsIds(sentence)

    def string(self,
               tensor: torch.Tensor,
               bpe_symbol=None,
               escape_unk=None,
               trunc_eos=None):
        if torch.is_tensor(tensor) and tensor.dim() == 2:
            return "\n".join(
                self.string(t, bpe_symbol, escape_unk, trunc_eos)
                for t in tensor)
        return self.bpe_dict.Decode(tensor.tolist())

    def __getitem__(self, idx):
        return self.bpe_dict.IdToPiece(idx)

    def __contains__(self, sym):
        return self.index(sym) != self.unk()

    def index(self, sym):
        return self.bpe_dict[sym]

    def __len__(self):
        return len(self.bpe_dict)

    def bos(self):
        """Helper to get index of beginning-of-sentence symbol"""
        return self.bos_index

    def pad(self):
        """Helper to get index of pad symbol"""
        return self.pad_index

    def eos(self):
        """Helper to get index of end-of-sentence symbol"""
        return self.eos_index

    def unk(self):
        """Helper to get index of unk symbol"""
        return self.unk_index
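A usage sketch, assuming SENTENCEPIECE and the constants module from the surrounding project, plus a trained pair ~/data/bpe.model / ~/data/bpe.vocab (all illustrative):

import torch

d = BPE_Dictionary("~/data/bpe", SENTENCEPIECE)
pieces = d.encode_tokens("hello world")    # subword pieces
ids = d.encode_ids("hello world")          # the matching ids
print(d.string(torch.tensor(ids)))         # a 1-D tensor decodes back to text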
Example #11
import subprocess

# %%
sentencepiece_dir = os.path.expanduser('ujimaru_text_generate/sentencepiece')
reformer_dir = os.path.expanduser('ujimaru_text_generate/reformer')

with open(os.path.join(sentencepiece_dir, "wiki-daimyo.txt")) as f:
    text = f.read().strip()

# %%
TOKENIZER = SentencePieceProcessor()
TOKENIZER.load(os.path.join(sentencepiece_dir, 'wiki-daimyo.model'))

# %%
IDS = TOKENIZER.EncodeAsIds(text)
IDS = onp.asarray(IDS, dtype=onp.int32)
print("Number of tokens:", IDS.shape[0])

# Configure hyperparameters.
gin.parse_config_file('ujimaru_text_generate/config.gin')

# %%

# As we report in the Reformer paper, increasing the number of hashing rounds
# helps with quality. We can even increase the number of hashing rounds at
# evaluation time only.
gin.parse_config("""LSHCausalAttention.n_hashes = 4""")
model_infer = trax.models.ReformerLM(mode='predict')

# Set up the initial state for sampling.
Example #12
class BPEDataset(SummarizationDataset):
    """Summarization dataset with Byte-Pair encoding.

    Args:
        directory (str): Dataset directory.
        prefix (str): Dataset preprocessing prefix.
        part (str): Dataset part name. :attr:`directory` must contain :attr:`part`.tsv file.
          Use `None` for sampling.
        max_sequence_length (int, optional): Defaults to 150. Maximum sequence length.

    Note:
        Use **kwargs to set up preprocessing arguments.
    """

    def __init__(self, directory: str, prefix: str, part: str, max_sequence_length=150, **kwargs):
        self.data_workdir = os.path.join(directory, prefix)
        self.spm_file = os.path.join(self.data_workdir, "spm.model")

        if part is None:
            self._sample_init(self.spm_file, max_sequence_length)
            return

        self.source_part_file = os.path.join(directory, part + ".tsv")
        self.part_file = os.path.join(self.data_workdir, part + ".npy")

        if not self.exist(directory, prefix, part):
            logger.info("Dataset part {}/{} not founded".format(self.data_workdir, part))
            self.preprocess(directory, prefix, part, **kwargs)

        self.data = np.load(self.part_file, allow_pickle=True)

        if "spm" in kwargs:
            logger.info("Use existing spm model")
            self.spm = kwargs["spm"]
        else:
            logger.info("Load spm model")
            self.spm = SentencePieceProcessor()
            self.spm.load(self.spm_file)

        self.pad_symbol = self.spm.pad_id()
        self.eos_symbol = self.spm.eos_id()

        self._len = self.data.shape[0]

        sequence_lens = [
            len(seq) for example in self.data for seq in example
        ]
        self.max_sequence_length = min(max_sequence_length, max(sequence_lens))

    def _sample_init(self, spm_file_name, max_sequence_length):
        if not os.path.exists(spm_file_name):
            raise RuntimeError("Firstly preprocess dataset")

        self.spm = SentencePieceProcessor()
        self.spm.load(spm_file_name)

        self.pad_symbol = self.spm.pad_id()
        self.eos_symbol = self.spm.eos_id()

        self._len = 0
        self.data = []

        self.max_sequence_length = max_sequence_length

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self._len

    @staticmethod
    def exist(directory: str, prefix: str, part: str) -> bool:
        """Check dataset existence,

        Args:
            directory (str): Dataset directory.
            prefix (str): Dataset prefix.
            part (str): Dataset part.

        Returns:
            bool: Existence status.
        """

        data_workdir = os.path.join(directory, prefix)
        part_filename = os.path.join(data_workdir, part + ".npy")
        spm_filename = os.path.join(data_workdir, "spm.model")

        necessary_files = [part_filename, spm_filename]
        existing = [os.path.exists(filename) for filename in necessary_files]
        return all(existing)

    @staticmethod
    def preprocess(directory: str,
                   prefix: str,
                   part: str,
                   spm: SentencePieceProcessor = None,
                   pretrain_emb=True,
                   vocab_size=30000,
                   embedding_size=300,
                   max_sentence_length=16384,
                   workers=3,
                   skip_gramm=False):
        """Preprocess dataset.

        Args:
            directory (str): Dataset directory.
            prefix (str): Dataset preprocessing prefix.
            part (str): Dataset part. :attr:`directory` must contain :attr:`part`.tsv file with data.
            spm (SentencePieceProcessor, optional): Defaults to None. SentencePiece model.
            pretrain_emb (bool, optional): Defaults to True. Whether to pretrain embeddings.
            vocab_size (int, optional): Defaults to 30000. Vocabulary size.
            embedding_size (int, optional): Defaults to 300. Pretrained embedding size.
            max_sentence_length (int, optional): Defaults to 16384. Maximum sentence length for sentencepiece.
            workers (int, optional): Defaults to 3. Number of workers.
            skip_gramm (bool, optional): Defaults to False. Whether to use skip-gram type of Word2Vec training.

        Raises:
            FileNotFoundError: Raises if source data file doesn't exist.
        """

        data_workdir = os.path.join(directory, prefix)
        part_source_filename = os.path.join(directory, part + ".tsv")
        part_exported_filename = os.path.join(data_workdir, part + ".npy")
        spm_filename = os.path.join(data_workdir, "spm.model")
        spm_directory = os.path.join(data_workdir, "spm")
        w2v_model_filename = os.path.join(data_workdir, "word2vec.model")
        embeddings_filename = os.path.join(data_workdir, "embedding.npy")

        logger.info("Preprocess {}/{} dataset.".format(data_workdir, part))
        os.makedirs(data_workdir, exist_ok=True)

        if not os.path.exists(part_source_filename):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), part_source_filename)

        if part not in ["train", "dev"]:
            assert spm is not None, "For non-train parts, `spm` must be specified."
        else:
            logger.info("Start training sentencepiece")
            spm_params = (
                "--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                "--input={} --model_prefix={} --vocab_size={} --max_sentence_length={}".format(
                    part_source_filename,
                    spm_directory,
                    vocab_size,
                    max_sentence_length
                )
            )
            SentencePieceTrainer.Train(spm_params)
            spm = SentencePieceProcessor()
            spm.load(spm_filename)

            if pretrain_emb:
                logger.info("Start training Word2Vec embeddings")

                train_sentences = SentenceIterator(part_source_filename, spm)
                logger.info("Loaded train sentences")
                w2v_model = Word2Vec(train_sentences, min_count=0, workers=workers,
                                     size=embedding_size, sg=int(skip_gramm))
                w2v_model.save(w2v_model_filename)

                # Export embeddings
                logger.info("Export embeddings")
                export_embeddings(embeddings_filename, spm, w2v_model)
                logger.info("Embeddings have been saved into {}".format(embeddings_filename))

        logger.info("Start exporting data file")
        sentence_iterator = SentenceIterator(part_source_filename, spm)
        sentence_iterator.export(part_exported_filename)
        logger.info("{} exported".format(part_exported_filename))

    def get_embeddings(self) -> np.array:
        """Load pretrain embeddings.
        Returns:
            np.array: Array with word2vec embeddings if the dump file exists, otherwise `None`.
        """

        embeddings_path = os.path.join(self.data_workdir, "embedding.npy")
        if not os.path.exists(embeddings_path):
            logging.info("Embedding file not found")
            return None
        else:
            logging.info("Loading embedding dump file")
            return np.load(embeddings_path)

    def get_spm(self) -> SentencePieceProcessor:
        return self.spm

    def encode(self, sequences):
        sequences = [self.spm.EncodeAsIds(s)[:self.max_sequence_length] for s in sequences]
        return torch.LongTensor(sequences)

    def decode(self, sequences):
        sequences = [list(takewhile(lambda x: x != self.eos_symbol, sequence)) for sequence in sequences]
        return [self.spm.DecodeIds([token.item() for token in sentence])
                for sentence in sequences]

    def collate_function(self, batch):
        src_list, src_length_list = self._pad_sequence(
            [example[0][:self.max_sequence_length] for example in batch], self.pad_symbol)
        trg_list, trg_length_list = self._pad_sequence(
            [example[1][:self.max_sequence_length] for example in batch], self.pad_symbol)
        batch = {
            "src": torch.LongTensor(src_list),
            "trg": torch.LongTensor(trg_list),
            "src_length": src_length_list,
            "trg_length": trg_length_list,
        }
        return batch

    @staticmethod
    def _pad_sequence(sequences, pad_symbol=0):
        sequence_lengths = [len(sequence) for sequence in sequences]
        max_len = max(sequence_lengths)
        for i, length in enumerate(sequence_lengths):
            to_add = max_len - length
            sequences[i] += [pad_symbol] * to_add
        return sequences, sequence_lengths
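The collate_function plugs directly into a PyTorch DataLoader. A usage sketch, assuming a dataset directory that has already been preprocessed (the paths and sizes are illustrative):

from torch.utils.data import DataLoader

dataset = BPEDataset("data/summaries", prefix="bpe-30k", part="train")
loader = DataLoader(dataset, batch_size=32, shuffle=True,
                    collate_fn=dataset.collate_function)

batch = next(iter(loader))
print(batch["src"].shape, batch["trg"].shape)  # [batch, padded_len] each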