Example No. 1
    def train(self, train_dset, vocab_size, prefix='sample', **kwargs):

        self.vocab_size = vocab_size
        # Dump the raw text (first field of each sample) to a CSV file for SentencePiece training.
        df = pd.DataFrame([x[0] for x in train_dset])
        df.to_csv('sample.csv', index=False, header=False)
        # Train a SentencePiece model on the CSV, then load it back as a tokenizer generator.
        generate_sp_model('sample.csv',
                          vocab_size=vocab_size,
                          model_prefix=prefix)
        vocab_tokenizer = load_sp_model(prefix + ".model")
        self.tokenizer = sentencepiece_tokenizer(sp_model=vocab_tokenizer)
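The tokenizer stored in self.tokenizer above is a generator function: it takes an iterable of strings and yields one list of subword pieces per input string. A minimal, illustrative usage sketch (the sample texts and the trainer variable are hypothetical, not part of the example above):

# Hypothetical usage of a trained tokenizer (illustrative sketch only).
texts = ["first example sentence", "another sentence to tokenize"]
# The generator yields one list of subword pieces per input string.
token_lists = list(trainer.tokenizer(texts))
print(token_lists[0])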
Example No. 2
    def test_sentencepiece_tokenizer(self):
        test_sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
        model_path = get_asset_path('spm_example.model')
        sp_model = load_sp_model(open(model_path, 'rb'))
        self.assertEqual(sp_model.GetPieceSize(), 20000)
        spm_generator = sentencepiece_tokenizer(sp_model)

        ref_results = [
            '\u2581Sent', 'ence', 'P', 'ie', 'ce', '\u2581is', '\u2581an',
            '\u2581un', 'super', 'vis', 'ed', '\u2581text', '\u2581to', 'ken',
            'izer', '\u2581and', '\u2581de', 'to', 'ken', 'izer'
        ]

        self.assertEqual(list(spm_generator([test_sample]))[0], ref_results)
Example No. 3
    def test_sentencepiece_tokenizer(self):

        test_sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
        model_path = 'test/asset/spm_example.model'
        sp_model = load_sp_model(model_path)
        self.assertEqual(len(sp_model), 20000)
        spm_generator = sentencepiece_tokenizer(sp_model)

        # Handle byte string in Python2 and Unicode string in Python3, respectively
        if sys.version_info < (3, 0):
            ref_results = [
                '\xe2\x96\x81Sent', 'ence', 'P', 'ie', 'ce', '\xe2\x96\x81is',
                '\xe2\x96\x81an', '\xe2\x96\x81un', 'super', 'vis', 'ed',
                '\xe2\x96\x81text', '\xe2\x96\x81to', 'ken', 'izer',
                '\xe2\x96\x81and', '\xe2\x96\x81de', 'to', 'ken', 'izer'
            ]
        else:
            ref_results = [
                '\u2581Sent', 'ence', 'P', 'ie', 'ce', '\u2581is', '\u2581an',
                '\u2581un', 'super', 'vis', 'ed', '\u2581text', '\u2581to',
                'ken', 'izer', '\u2581and', '\u2581de', 'to', 'ken', 'izer'
            ]

        self.assertEqual(list(spm_generator([test_sample]))[0], ref_results)
Example No. 4
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Let's define our device now and make sure to run on the GPU if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# %% [markdown]
# We use the same tokenizer as in the data preprocessing step

# %%
sp_deu = load_sp_model("preprocessed_data/sp_model/de.wiki.bpe.vs10000.model")
sp_nds = load_sp_model("preprocessed_data/sp_model/nds.wiki.bpe.vs10000.model")

# %%

sp_deu_tokens_generator = sentencepiece_tokenizer(sp_deu)
list_a = [
    "Komplizierte Wörter sind Baustelle.",
    "Morgen soll es regnen und übermorgen scheint die Sonne"
]
print(list(sp_deu_tokens_generator(list_a)))
sp_numericalize_generator = sentencepiece_numericalizer(sp_deu)
print(list(sp_numericalize_generator(list_a)))

# %%

sp_deu_tokens_generator = sentencepiece_tokenizer(sp_deu)
sp_nds_tokens_generator = sentencepiece_tokenizer(sp_nds)


def tokenize_de(text):
Example No. 5
from typing import List, Union

import janome
import spacy
from janome.tokenizer import Tokenizer as JanomeTokenizer
from torchtext.data.functional import load_sp_model, sentencepiece_tokenizer

from src.constants import SENTENCE_PIECE_MODEL_PATH
from src.utils import log_decorator

janome_tokenizer = JanomeTokenizer()
ginza_tokenizer = spacy.load("ja_ginza")

sp_model = load_sp_model(SENTENCE_PIECE_MODEL_PATH)
sp_tokens_generator = sentencepiece_tokenizer(sp_model)


@log_decorator
def wakachi_by_sentencepiece(sentence: str) -> List[str]:
    """SentencePieceを用いた分かち書き

    Parameters
    ----------
    sentence : str
        分かち書きしたい文章

    Returns
    -------
    List[str]
        分かち書き結果
    """