Code example #1
def _predict_chars(
    model: tf.keras.Sequential,
    sp: spm.SentencePieceProcessor,
    start_string: str,
    store: _BaseConfig,
) -> str:
    """
    Evaluation step (generating text using the learned model).

    Args:
        model: tf.keras.Sequential model
        sp: SentencePiece tokenizer
        start_string: string to bootstrap model
        store: our config object
    Returns:
        A line of generated text
    """

    # Converting our start string to numbers (vectorizing)
    input_eval = sp.EncodeAsIds(start_string)
    input_eval = tf.expand_dims(input_eval, 0)

    # List to accumulate the generated token ids
    sentence_ids = []

    # Here batch size == 1
    model.reset_states()

    while True:
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a categorical distribution to
        # predict the next token returned by the model
        predictions = predictions / store.gen_temp
        predicted_id = tf.random.categorical(predictions,
                                             num_samples=1)[-1, 0].numpy()

        # We pass the predicted token as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)
        sentence_ids.append(int(predicted_id))

        decoded = sp.DecodeIds(sentence_ids)
        if store.field_delimiter is not None:
            decoded = decoded.replace(store.field_delimiter_token,
                                      store.field_delimiter)

        if "<n>" in decoded:
            return _pred_string(decoded.replace("<n>", ""))
        elif 0 < store.gen_chars <= len(decoded):
            return _pred_string(decoded)
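A minimal driver sketch for the function above, assuming a trained checkpoint and a SentencePiece model are already on disk. `build_model`, `load_config`, and all paths are hypothetical placeholders, not part of the snippet.

import sentencepiece as spm

# Hypothetical setup: load_config() and build_model() stand in for whatever
# produces the config object and the tf.keras.Sequential generator used above.
store = load_config("synthetics_config.json")
sp = spm.SentencePieceProcessor()
sp.Load("spm.model")

model = build_model(store)              # assumed to be built with batch size 1
model.load_weights("generator_checkpoint")

for _ in range(5):
    # Using the newline token "<n>" as the bootstrap string is an assumption.
    line = _predict_chars(model, sp, start_string="<n>", store=store)
    print(line)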
Code example #2
class SentencepieceFasttextEmbed(EmbedderInterface):
    class Config(EmbedderInterface.Config):
        pass

    @classmethod
    def from_config(cls, config: Config):
        spm_model_file = os.path.join(config.preproc_dir, "spm.model")
        fasttext_model_file = os.path.join(config.preproc_dir,
                                           "fasttext-model.bin")
        return cls(spm_model_file, fasttext_model_file, config.max_pieces)

    def __init__(self,
                 spm_model_file: str,
                 fasttext_model_file: str = '',
                 max_pieces: int = -1):
        super().__init__(max_pieces=max_pieces)

        self.spm = SentencePieceProcessor()
        self.spm.Load(spm_model_file)
        self.pad_idx = self.spm.pad_id()
        self.pad_token = self.spm.IdToPiece(self.pad_idx)
        self.unk_idx = self.spm.unk_id()
        self.unk_token = self.spm.IdToPiece(self.unk_idx)
        self.bos_idx = self.spm.bos_id()
        self.bos_token = self.spm.IdToPiece(self.bos_idx)
        self.eos_idx = self.spm.eos_id()
        self.eos_token = self.spm.IdToPiece(self.eos_idx)

        if fasttext_model_file:
            self.fasttext = fasttext.load_model(fasttext_model_file)

    @property
    def embed_dim(self):
        return self.fasttext.dim

    @property
    def n_vocab(self):
        return self.spm.get_piece_size()

    def encode_text_as_ids(self, text: str) -> np.array:
        """
    Doesn't produce BOS, EOS ids.
    """
        return np.asarray(self.spm.EncodeAsIds(text)[self.pieces_slice],
                          dtype=np.int32)

    def encode_text_as_tokens(self, text: str) -> List[str]:
        """
    Doesn't produce BOS, EOS tokens.
    """
        return self.spm.EncodeAsPieces(text)[self.pieces_slice]

    def tokenize(self, text: str) -> List[str]:
        """
        Alias for `encode_text_as_tokens`.
        Doesn't produce BOS, EOS tokens.
        """
        return self.encode_text_as_tokens(text)

    def decode_ids_as_text(self, ids: List[int], strip_special=True) -> str:
        """
    Doesn't produce PAD, BOS, or EOS text.
    i.e. PAD, BOS, EOS ids are stripped out before decoding.
    UNK is decoded but unintelligible.
    """
        if strip_special:
            ids = [
                int(id) for id in ids
                if id not in (self.pad_idx, self.bos_idx, self.eos_idx)
            ]
        else:
            ids = [int(id) for id in ids]
        return self.spm.DecodeIds(ids)

    def decode_tokens_as_text(self, toks: List[str]) -> str:
        """
    Doesn't produce PAD, BOS, or EOS text.
    i.e. PAD, BOS, EOS tokens are stripped out before decoding.
    UNK is decoded but unintelligible.
    """
        return self.spm.DecodePieces(toks[self.pieces_slice])

    @functools.lru_cache(maxsize=1024)
    def decode_id_as_token(self, id: int) -> str:
        return self.spm.IdToPiece(id)

    def decode_ids_as_tokens(self,
                             ids: List[int],
                             strip_special: bool = True) -> List[str]:
        """
    By default, doesn't produce PAD, BOS, EOS tokens.

    Avoids problematic intermediate string representation that causes length mismatch.
    In other words, SentencePiece isn't isomorphic with respect to the string representation.
    """
        if strip_special:
            ids = [
                id for id in ids
                if id not in (self.pad_idx, self.bos_idx, self.eos_idx)
            ]
        return [self.decode_id_as_token(int(ix)) for ix in ids]

    @functools.lru_cache(maxsize=1024)
    def embed_tok(self, tok: str) -> np.array:
        """
    When given PAD, returns all zeros
    """
        if tok == self.pad_token:
            return np.zeros(self.fasttext.dim)
        return np.asarray(self.fasttext[tok])

    def embed_text(self, text: str) -> np.array:
        """
    Doesn't produce PAD, BOS, EOS embeddings.
    i.e. PAD, BOS, EOS are stripped out during tokenization before embedding.
    """
        return np.asarray([self.embed_tok(tok) for tok in self.tokenize(text)])

    def embed_ids(self,
                  ids: List[int],
                  strip_special: bool = True) -> List[np.array]:
        """
    By default, doesn't produce PAD, BOS, EOS tokens.

    Avoids problematic intermediate string representation that causes length mismatch.
    In other words, SentencePiece isn't isomorphic with respect to the string representation.
    """
        return [
            self.embed_tok(t)
            for t in self.decode_ids_as_tokens(ids,
                                               strip_special=strip_special)
        ]

    def embed_ids_batch(self, ids: np.array) -> torch.tensor:
        emb = [self.embed_ids(turn, strip_special=False) for turn in ids]
        emb = torch.tensor(emb)
        return emb
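A short usage sketch for the embedder, assuming a preprocessing directory that already holds the spm.model and fasttext-model.bin files the class expects; the preproc/ path and the example text are placeholders.

# Hypothetical paths; the file names match what from_config() looks for.
embedder = SentencepieceFasttextEmbed(
    spm_model_file="preproc/spm.model",
    fasttext_model_file="preproc/fasttext-model.bin",
    max_pieces=128,
)

ids = embedder.encode_text_as_ids("the quick brown fox")  # int32 piece ids, no BOS/EOS
print(embedder.decode_ids_as_text(ids.tolist()))          # round-trips back to text
vectors = embedder.embed_text("the quick brown fox")      # one fastText vector per piece
print(embedder.n_vocab, embedder.embed_dim, vectors.shape)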
Code example #3
def prediction(length, prompt=None):
    # Signature inferred from the call site below; `model_infer`, `initial_state`,
    # and `TOKENIZER` are assumed to be defined at module level.
    # Token id 0 is the equivalent of a "start" token
    model_infer.state = initial_state  # initialize the decoder state
    cur_inputs = np.zeros((1, 1), dtype=np.int32)  # seed the loop with token id 0
    all_samples = []
    if prompt is not None:
        prompt = np.asarray(TOKENIZER.EncodeAsIds(prompt))

    for iteration in range(length):
        logits = model_infer(cur_inputs)

        if prompt is not None and iteration < prompt.shape[0]:
            cur_samples = onp.array(prompt[iteration], dtype=int)
        else:
            logits = onp.array(logits)[0, 0, :]
            probs = onp.exp(logits)
            cur_samples = onp.random.choice(probs.shape[-1], p=probs[:])
            cur_samples = onp.array(cur_samples, dtype=int)

        all_samples.append(cur_samples)
        cur_inputs = np.array(cur_samples[None, None])

    all_samples = onp.stack(all_samples, -1)

    return all_samples


prefix = [5, 3, 5, 2, 1, 6]
pred = prediction(10, "家康は")

print(TOKENIZER.DecodeIds(pred.tolist()))
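The loop above relies on module-level TOKENIZER, model_infer, and initial_state objects. A minimal sketch of the tokenizer side only, assuming a SentencePiece model trained on the same corpus (the path is a placeholder):

import sentencepiece as spm

TOKENIZER = spm.SentencePieceProcessor()
TOKENIZER.Load("ja_corpus.model")  # hypothetical path to the trained SentencePiece model

# Sanity check: the prompt round-trips through EncodeAsIds / DecodeIds.
ids = TOKENIZER.EncodeAsIds("家康は")
print(ids, TOKENIZER.DecodeIds(ids))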
Code example #4
File: dataset.py  Project: roholazandie/t-vae
class TVAEDataset(Dataset):
    def __init__(self, root, prefix, part, max_sequence_length=150, **kwargs):
        self.root = root
        self.prefix = prefix
        self.preprocess_args = kwargs

        data_file_name = os.path.join(root, prefix, part + ".npy")
        spm_file_name = os.path.join(root, prefix, "spm.model")

        if not TVAEDataset.exist(root, prefix, part):
            logging.info("Start preprocessing %s/%s/%s dataset", root, prefix, part)
            self.preprocess(root, prefix, part, **kwargs)

        if 'spm_model' in self.preprocess_args:
            logging.info("Use existed sentencepiece model")
            self.spm_model = self.preprocess_args['spm_model']
        else:
            logging.info("Load sentencepiece model from disk")
            self.spm_model = SentencePieceProcessor()
            self.spm_model.load(spm_file_name)

        self._data = np.load(data_file_name)

        self.pad_symbol = self.spm_model.pad_id()
        self.eos_symbol = self.spm_model.eos_id()

        self._len = self._data.shape[0]
        self.limit = max_sequence_length

        sequence_lens = [
            len(seq) for seq in self._data
        ]
        self.max_sequence_length = min(self.limit, max(sequence_lens))

    def __getitem__(self, index):
        return self._data[index]

    def __len__(self):
        return self._len

    def preprocess(self, directory: str, prefix: str, part: str, spm_model: SentencePieceProcessor = None,
                   pretrain_emb=True, vocab_size=3000, embedding_size=600,
                   max_sentence_length=16384, workers=3, skip_gramm=False):

        # Check that the data files exist
        workdir = os.path.join(directory, prefix)
        os.makedirs(workdir, exist_ok=True)

        data_part_file = os.path.join(directory, part + ".tsv")
        if not os.path.exists(data_part_file):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), data_part_file)

        if part not in ['train', 'develop']:
            assert spm_model is not None, "For non-train parts, `spm_model` must be specified."
        else:
            # Train sentencepiece:
            logging.info("Start training sentencepiece")
            spm_directory = os.path.join(workdir, "spm")
            spm_params = (
                "--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                "--input={} --model_prefix={} --vocab_size={} --max_sentence_length={}".format(
                    data_part_file, spm_directory, vocab_size, max_sentence_length
                )
            )
            SentencePieceTrainer.Train(spm_params)
            spm_model = SentencePieceProcessor()
            spm_model.load(spm_directory + ".model")

            if pretrain_emb:
                # Train word2vec
                logging.info("Start training word2vec")
                train_sentences = SentenceIterator(data_part_file, spm_model)
                logging.info("Loaded train sentences")
                w2v_model = Word2Vec(train_sentences, min_count=0, workers=workers,
                                     size=embedding_size, sg=int(skip_gramm))
                w2v_model_filename = os.path.join(workdir, "word2vec.model")
                w2v_model.save(w2v_model_filename)

                # Export embeddings
                logging.info("Export embeddings")
                embeddings_filename = os.path.join(workdir, "embedding.npy")
                export_embeddings(embeddings_filename, spm_model, w2v_model)
                logging.info("Embeddings have been saved into {}".format(embeddings_filename))

        logging.info("Start exporting data file")
        source_file_name = os.path.join(directory, part + ".tsv")
        exported_file_name = os.path.join(workdir, part + ".npy")
        sentence_iterator = SentenceIterator(source_file_name, spm_model)
        sentence_iterator.export(exported_file_name)
        logging.info("{} exported".format(exported_file_name))
        logging.info("Data preprocessing completed")

    @staticmethod
    def exist(root: str, prefix: str, parts: TypeVar("P", str, List[str])) -> bool:
        if isinstance(parts, str):
            parts = [parts]
        parts_file_name = [os.path.join(root, prefix, part + ".npy") for part in parts]
        smp_file_name = os.path.join(root, prefix, "spm.model")

        necessary_files = parts_file_name + [smp_file_name]
        existing = [os.path.exists(filename) for filename in necessary_files]
        return reduce(and_, existing)

    @staticmethod
    def _pad_sequence(sequences, pad_symbol=0):
        sequence_lengths = [len(sequence) for sequence in sequences]
        max_len = max(sequence_lengths)
        for i, length in enumerate(sequence_lengths):
            to_add = max_len - length
            sequences[i] += [pad_symbol] * to_add
        return sequences, sequence_lengths

    def collate_function(self, batch):
        src_list, src_length_list = TVAEDataset._pad_sequence(
            [example[:self.limit] for example in batch], self.pad_symbol)
        batch = {
            "src": torch.LongTensor(src_list)
        }
        return batch

    def get_embeddings(self):
        """Load pretrain embeddings.
        Returns:
            np.array: Array with word2vec embeddings if this one exists, otherwise `None`.
        """

        embeddings_path = os.path.join(self.root, self.prefix, "embedding.npy")
        if not os.path.exists(embeddings_path):
            logging.info("Embedding file not found")
            return None
        else:
            logging.info("Loading embedding dump file")
            return np.load(embeddings_path)

    def decode(self, sequences):
        sequences = [list(takewhile(lambda x: x != self.eos_symbol, sequence)) for sequence in sequences]
        return [self.spm_model.DecodeIds([token.item() for token in sentence])
                for sentence in sequences]
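A usage sketch for the dataset, assuming <root>/train.tsv exists (the layout preprocess expects) and that the exported .npy stores Python-list token sequences; the paths and hyperparameters below are placeholders.

from torch.utils.data import DataLoader

# Hypothetical paths/settings; preprocessing runs automatically inside __init__
# when data/exp1/train.npy or data/exp1/spm.model is missing.
dataset = TVAEDataset(root="data", prefix="exp1", part="train",
                      max_sequence_length=150, vocab_size=3000)

loader = DataLoader(dataset, batch_size=32, shuffle=True,
                    collate_fn=dataset.collate_function)

batch = next(iter(loader))
print(batch["src"].shape)                # (batch_size, padded_length) LongTensor
print(dataset.decode(batch["src"][:2]))  # back to text, stopping at EOS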
Code example #5
class BPEDataset(SummarizationDataset):
    """Summarization dataset with Byte-Pair encoding.

    Args:
        directory (str): Dataset directory.
        prefix (str): Dataset preprocessing prefix.
        part (str): Dataset part name. :attr:`directory` must contain :attr:`part`.tsv file.
          Use `None` for sampling.
        max_sequence_length (int, optional): Defaults to 150. Maximum sequence length.

    Note:
        Use **kwargs to set up preprocessing arguments.
    """

    def __init__(self, directory: str, prefix: str, part: str, max_sequence_length=150, **kwargs):
        self.data_workdir = os.path.join(directory, prefix)
        self.spm_file = os.path.join(self.data_workdir, "spm.model")

        if part is None:
            self._sample_init(self.spm_file, max_sequence_length)
            return

        self.source_part_file = os.path.join(directory, part + ".tsv")
        self.part_file = os.path.join(self.data_workdir, part + ".npy")

        if not self.exist(directory, prefix, part):
            logger.info("Dataset part {}/{} not founded".format(self.data_workdir, part))
            self.preprocess(directory, prefix, part, **kwargs)

        self.data = np.load(self.part_file, allow_pickle=True)

        if "spm" in kwargs:
            logger.info("Use existing spm model")
            self.spm = kwargs["spm"]
        else:
            logger.info("Load spm model")
            self.spm = SentencePieceProcessor()
            self.spm.load(self.spm_file)

        self.pad_symbol = self.spm.pad_id()
        self.eos_symbol = self.spm.eos_id()

        self._len = self.data.shape[0]

        sequence_lens = [
            len(seq) for example in self.data for seq in example
        ]
        self.max_sequence_length = min(max_sequence_length, max(sequence_lens))

    def _sample_init(self, spm_file_name, max_sequence_length):
        if not os.path.exists(spm_file_name):
            raise RuntimeError("Dataset must be preprocessed first")

        self.spm = SentencePieceProcessor()
        self.spm.load(spm_file_name)

        self.pad_symbol = self.spm.pad_id()
        self.eos_symbol = self.spm.eos_id()

        self._len = 0
        self.data = []

        self.max_sequence_length = max_sequence_length

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self._len

    @staticmethod
    def exist(directory: str, prefix: str, part: str) -> bool:
        """Check dataset existence,

        Args:
            directory (str): Dataset directory.
            prefix (str): Dataset prefix.
            part (str): Dataset part.

        Returns:
            bool: Existence status.
        """

        data_workdir = os.path.join(directory, prefix)
        part_filename = os.path.join(data_workdir, part + ".npy")
        spm_filename = os.path.join(data_workdir, "spm.model")

        necessary_files = [part_filename, spm_filename]
        existing = [os.path.exists(filename) for filename in necessary_files]
        return all(existing)

    @staticmethod
    def preprocess(directory: str,
                   prefix: str,
                   part: str,
                   spm: SentencePieceProcessor = None,
                   pretrain_emb=True,
                   vocab_size=30000,
                   embedding_size=300,
                   max_sentence_length=16384,
                   workers=3,
                   skip_gramm=False):
        """Preprocess dataset.

        Args:
            directory (str): Dataset directory.
            prefix (str): Dataset preprocessing prefix.
            part (str): Dataset part. :attr:`directory` must contain :attr:`part`.tsv file with data.
            spm (SentencePieceProcessor, optional): Defaults to None. SentencePiece model.
            pretrain_emb (bool, optional): Defaults to True. Whether to pretrain embeddings.
            vocab_size (int, optional): Defaults to 30000. Vocabulary size.
            embedding_size (int, optional): Defaults to 300. Pretrained embedding size.
            max_sentence_length (int, optional): Defaults to 16384. Maximum sentence length for sentencepiece.
            workers (int, optional): Defaults to 3. Number of workers.
            skip_gramm (bool, optional): Defaults to False. Whether to use skip-gram type of Word2Vec training.

        Raises:
            FileNotFoundError: Raises if source data file doesn't exist.
        """

        data_workdir = os.path.join(directory, prefix)
        part_source_filename = os.path.join(directory, part + ".tsv")
        part_exported_filename = os.path.join(data_workdir, part + ".npy")
        spm_filename = os.path.join(data_workdir, "spm.model")
        spm_directory = os.path.join(data_workdir, "spm")
        w2v_model_filename = os.path.join(data_workdir, "word2vec.model")
        embeddings_filename = os.path.join(data_workdir, "embedding.npy")

        logger.info("Preprocess {}/{} dataset.".format(data_workdir, part))
        os.makedirs(data_workdir, exist_ok=True)

        if not os.path.exists(part_source_filename):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), part_source_filename)

        if part not in ["train", "dev"]:
            assert spm is not None, "For non-train parts, `spm` must be specified."
        else:
            logger.info("Start training sentencepiece")
            spm_params = (
                "--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                "--input={} --model_prefix={} --vocab_size={} --max_sentence_length={}".format(
                    part_source_filename,
                    spm_directory,
                    vocab_size,
                    max_sentence_length
                )
            )
            SentencePieceTrainer.Train(spm_params)
            spm = SentencePieceProcessor()
            spm.load(spm_filename)

            if pretrain_emb:
                logger.info("Start training Word2Vec embeddings")

                train_sentences = SentenceIterator(part_source_filename, spm)
                logger.info("Loaded train sentences")
                w2v_model = Word2Vec(train_sentences, min_count=0, workers=workers,
                                     size=embedding_size, sg=int(skip_gramm))
                w2v_model.save(w2v_model_filename)

                # Export embeddings
                logger.info("Export embeddings")
                export_embeddings(embeddings_filename, spm, w2v_model)
                logger.info("Embeddings have been saved into {}".format(embeddings_filename))

        logger.info("Start exporting data file")
        sentence_iterator = SentenceIterator(part_source_filename, spm)
        sentence_iterator.export(part_exported_filename)
        logger.info("{} exported".format(part_exported_filename))

    def get_embeddings(self) -> np.array:
        """Load pretrain embeddings.
        Returns:
            np.array: Array with word2vec embeddings if this one exists, otherwise `None`.
        """

        embeddings_path = os.path.join(self.data_workdir, "embedding.npy")
        if not os.path.exists(embeddings_path):
            logger.info("Embedding file not found")
            return None
        else:
            logger.info("Loading embedding dump file")
            return np.load(embeddings_path)

    def get_spm(self) -> SentencePieceProcessor:
        return self.spm

    def encode(self, sequences):
        sequences = [self.spm.EncodeAsIds(s)[:self.max_sequence_length] for s in sequences]
        return torch.LongTensor(sequences)

    def decode(self, sequences):
        sequences = [list(takewhile(lambda x: x != self.eos_symbol, sequence)) for sequence in sequences]
        return [self.spm.DecodeIds([token.item() for token in sentence])
                for sentence in sequences]

    def collate_function(self, batch):
        src_list, src_length_list = self._pad_sequence(
            [example[0][:self.max_sequence_length] for example in batch], self.pad_symbol)
        trg_list, trg_length_list = self._pad_sequence(
            [example[1][:self.max_sequence_length] for example in batch], self.pad_symbol)
        batch = {
            "src": torch.LongTensor(src_list),
            "trg": torch.LongTensor(trg_list),
            "src_length": src_length_list,
            "trg_length": trg_length_list,
        }
        return batch

    @staticmethod
    def _pad_sequence(sequences, pad_symbol=0):
        sequence_lengths = [len(sequence) for sequence in sequences]
        max_len = max(sequence_lengths)
        for i, length in enumerate(sequence_lengths):
            to_add = max_len - length
            sequences[i] += [pad_symbol] * to_add
        return sequences, sequence_lengths
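A usage sketch under similar assumptions: data/train.tsv exists and the directory/prefix names are placeholders. encode/decode round-trip a sentence through the shared SentencePiece model.

# Hypothetical paths; the first construction triggers preprocess(), which trains
# SentencePiece (and optionally Word2Vec) and exports data/sum/train.npy.
dataset = BPEDataset("data", "sum", "train",
                     max_sequence_length=150, vocab_size=30000)

ids = dataset.encode(["a short document to summarize"])  # LongTensor of piece ids
print(dataset.decode(ids))                               # back to text, stopping at EOS

spm_model = dataset.get_spm()                            # shared SentencePieceProcessor
print(spm_model.get_piece_size())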