Example #1
from typing import Union

from tokenizers.models import BPE

# Factory for BrainBertTokenizer (the enclosing tokenizer class): read a
# serialized BPE vocab/merges pair and build the tokenizer from it.
def from_file(
    vocab_filename: str,
    merges_filename: Union[str, None],
    **kwargs,
):
    vocab, merges = BPE.read_file(vocab_filename, merges_filename)
    return BrainBertTokenizer(vocab, merges, **kwargs)
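A minimal usage sketch, assuming from_file is attached to BrainBertTokenizer as a @staticmethod and that the class follows the tokenizers BaseTokenizer interface; the file names are hypothetical:

# Hypothetical paths to a serialized BPE vocab/merges pair.
tokenizer = BrainBertTokenizer.from_file("vocab.json", "merges.txt")
print(tokenizer.encode("hello world").tokens)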
Example #2
from typing import Union

from tokenizers.models import BPE

# Factory for CustomTokenizer (the enclosing tokenizer class): load either a
# BPE vocab/merges pair or a Unigram vocab, depending on whether a merges
# file is given.
def from_file(
    vocab_filename: str,
    merges_filename: Union[str, None],
    **kwargs,
):
    # BPE: both a vocab file and a merges file are provided.
    if merges_filename:
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)

    # Unigram: the vocab file holds one token<TAB>score pair per line.
    else:
        vocab = []
        merges = None
        with open(vocab_filename, "r", encoding="utf-8") as f_in:
            for line in f_in:
                token, score = line.strip().split("\t")
                vocab.append((token, float(score)))

    return CustomTokenizer(vocab, merges, **kwargs)
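In the Unigram branch the vocab file holds one token<TAB>score pair per line. A sketch of writing such a file and loading it, assuming from_file is exposed on CustomTokenizer as a static method; the file name and scores are invented:

# Write a tiny Unigram-style vocab: token and log-probability, tab-separated.
with open("unigram_vocab.tsv", "w", encoding="utf-8") as f_out:
    for token, score in [("hello", -1.5), ("world", -2.0)]:
        f_out.write(f"{token}\t{score}\n")

# Passing merges_filename=None routes through the Unigram branch above.
tokenizer = CustomTokenizer.from_file("unigram_vocab.tsv", None)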
Example #3
from tokenizers.models import BPE

# @staticmethod on SentencePieceBPETokenizer in the Hugging Face tokenizers library.
def from_file(vocab_filename: str, merges_filename: str, **kwargs):
    vocab, merges = BPE.read_file(vocab_filename, merges_filename)
    return SentencePieceBPETokenizer(vocab, merges, **kwargs)
Example #4
from tokenizers.models import BPE

# @staticmethod on ByteLevelBPETokenizer in the Hugging Face tokenizers library.
def from_file(vocab_filename: str, merges_filename: str, **kwargs):
    vocab, merges = BPE.read_file(vocab_filename, merges_filename)
    return ByteLevelBPETokenizer(vocab, merges, **kwargs)
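Examples #3 and #4 match the stock implementations shipped with the Hugging Face tokenizers library, so they can be called directly on the library classes; a short usage sketch with placeholder paths:

from tokenizers import ByteLevelBPETokenizer

# Placeholder paths; point these at the output files of a trained BPE tokenizer.
tokenizer = ByteLevelBPETokenizer.from_file("vocab.json", "merges.txt")
encoding = tokenizer.encode("hello world")
print(encoding.tokens, encoding.ids)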