def from_file(
    vocab_filename: str,
    merges_filename: Union[str, None],
    **kwargs,
):
    """Build a ``BrainBertTokenizer`` from serialized vocab/merges files.

    Args:
        vocab_filename: Path to the BPE vocabulary file.
        merges_filename: Path to the BPE merges file.
        **kwargs: Forwarded verbatim to the ``BrainBertTokenizer`` constructor.

    Returns:
        A ``BrainBertTokenizer`` initialized from the parsed files.
    """
    # NOTE(review): merges_filename is annotated as possibly None but is passed
    # to BPE.read_file unconditionally (the sibling CustomTokenizer.from_file
    # branches on it) — confirm BPE.read_file tolerates None.
    parsed_vocab, parsed_merges = BPE.read_file(vocab_filename, merges_filename)
    return BrainBertTokenizer(parsed_vocab, parsed_merges, **kwargs)
def from_file(
    vocab_filename: str,
    merges_filename: Union[str, None],
    **kwargs,
):
    """Build a ``CustomTokenizer`` from files on disk.

    If ``merges_filename`` is provided, the two files are parsed as a BPE
    vocab/merges pair via ``BPE.read_file``. Otherwise ``vocab_filename`` is
    parsed as a Unigram vocabulary with one tab-separated ``token<TAB>score``
    pair per line.

    Args:
        vocab_filename: Path to the vocabulary file.
        merges_filename: Path to the BPE merges file, or ``None``/empty for
            the Unigram format.
        **kwargs: Forwarded verbatim to the ``CustomTokenizer`` constructor.

    Returns:
        A ``CustomTokenizer`` initialized from the parsed files.
    """
    # BPE
    if merges_filename:
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
    # Unigram
    else:
        vocab = []
        merges = None
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(vocab_filename, "r", encoding="utf-8") as f_in:
            # Iterate the file directly instead of readlines(): streams lines
            # rather than materializing the whole file in memory.
            for line in f_in:
                stripped = line.strip()
                if not stripped:
                    # A trailing/blank line would otherwise crash the unpack.
                    continue
                token, score = stripped.split("\t")
                vocab.append((token, float(score)))
    return CustomTokenizer(vocab, merges, **kwargs)
def from_file(vocab_filename: str, merges_filename: str, **kwargs):
    """Instantiate a ``SentencePieceBPETokenizer`` from vocab/merges files.

    Args:
        vocab_filename: Path to the BPE vocabulary file.
        merges_filename: Path to the BPE merges file.
        **kwargs: Forwarded verbatim to the ``SentencePieceBPETokenizer``
            constructor.

    Returns:
        A ``SentencePieceBPETokenizer`` built from the parsed files.
    """
    loaded = BPE.read_file(vocab_filename, merges_filename)
    return SentencePieceBPETokenizer(*loaded, **kwargs)
def from_file(vocab_filename: str, merges_filename: str, **kwargs):
    """Instantiate a ``ByteLevelBPETokenizer`` from vocab/merges files.

    Args:
        vocab_filename: Path to the BPE vocabulary file.
        merges_filename: Path to the BPE merges file.
        **kwargs: Forwarded verbatim to the ``ByteLevelBPETokenizer``
            constructor.

    Returns:
        A ``ByteLevelBPETokenizer`` built from the parsed files.
    """
    loaded = BPE.read_file(vocab_filename, merges_filename)
    return ByteLevelBPETokenizer(*loaded, **kwargs)