Code example #1
0
File: TestBPE.py — Project: erelcan/keras-transformer
def test_encoding():
    """Run every BPEmb encoder variant on a small sample and print the output."""
    sample = ["This is Stratford", "<pad>"]

    encoder = BPEmb(lang="en", add_pad_emb=True)

    # Start/end tokens can be auto-added during encoding, but the encoder
    # cannot handle <pad> directly: padding must be done outside, using the
    # pad index (the last vocabulary entry when add_pad_emb is True).
    for variant in (
        "encode",
        "encode_with_eos",
        "encode_with_bos_eos",
        "encode_ids",
        "encode_ids_with_eos",
        "encode_ids_with_bos_eos",
    ):
        print(getattr(encoder, variant)(sample))
Code example #2
0
class BPETokenizer:
    """Use byte pair encoding to transform text"""

    def __init__(self, lang='ru', pretrained=True, vocab_size=100000, dim=300):
        """Build the underlying BPEmb subword model for the given language.

        vs_fallback lets BPEmb fall back to the nearest available vocabulary
        size if the requested one is not published for this language.
        """
        self.lang = lang
        self.pretrained = pretrained
        self.bpe = BPEmb(
            lang=self.lang, vs=vocab_size, dim=dim, vs_fallback=True
        )

    def fit(self, text):
        """Training is intentionally unsupported — only pretrained models are used."""
        raise NotImplementedError('fit is not supported')

    def transform(self, text: Union[str, List[str]], get_ids=True):
        """Encode *text* as subword ids (default) or subword pieces.

        Each sequence is wrapped with BOS/EOS markers by the encoder.
        """
        encode = (
            self.bpe.encode_ids_with_bos_eos
            if get_ids
            else self.bpe.encode_with_bos_eos
        )
        return encode(text)