def test_encoding():
    """Smoke-check the BPEmb encoding variants on a sample containing <pad>.

    Prints the subword and id encodings, with and without BOS/EOS markers.
    NOTE: the encoder does not handle a literal <pad> token itself; with
    add_pad_emb=True the pad embedding is appended last, so padding should
    be done outside the encoder using that final index.
    """
    sample = ["This is Stratford", "<pad>"]
    bpemb_en = BPEmb(lang="en", add_pad_emb=True)
    # Exercise every encode flavour in the same order as before.
    for encode in (
        bpemb_en.encode,
        bpemb_en.encode_with_eos,
        bpemb_en.encode_with_bos_eos,
        bpemb_en.encode_ids,
        bpemb_en.encode_ids_with_eos,
        bpemb_en.encode_ids_with_bos_eos,
    ):
        print(encode(sample))
class BPETokenizer:
    """Tokenizer that applies byte pair encoding to raw text via BPEmb."""

    def __init__(self, lang='ru', pretrained=True, vocab_size=100000, dim=300):
        """Load a pretrained BPEmb model.

        vs_fallback=True lets BPEmb fall back to the closest available
        vocabulary size when the requested one does not exist.
        """
        self.lang = lang
        self.pretrained = pretrained  # NOTE(review): stored but not read here
        self.bpe = BPEmb(lang=self.lang, vs=vocab_size, dim=dim, vs_fallback=True)

    def fit(self, text):
        """Training is unsupported — only pretrained models are used."""
        raise NotImplementedError('fit is not supported')

    def transform(self, text: Union[str, List[str]], get_ids=True):
        """Encode *text* with BOS/EOS markers.

        Returns subword ids when get_ids is True, subword strings otherwise.
        """
        encoder = (self.bpe.encode_ids_with_bos_eos if get_ids
                   else self.bpe.encode_with_bos_eos)
        return encoder(text)