import json
import os

from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
from tokenizers.implementations import SentencePieceUnigramTokenizer
from tokenizers.implementations.base_tokenizer import BaseTokenizer
from tokenizers.models import Unigram


def from_spm(filename: str):
    try:
        import sys

        sys.path.append(".")

        import sentencepiece_model_pb2 as model
    except Exception:
        raise Exception(
            "You don't seem to have the required protobuf file. In order to use this function you need to run "
            "`pip install protobuf` and "
            "`wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_model_pb2.py` "
            "so that we can read the internals of your spm file. `pip install sentencepiece` is not required."
        )

    # Parse the serialized SentencePiece model
    m = model.ModelProto()
    with open(filename, "rb") as f:
        m.ParseFromString(f.read())

    precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
    vocab = [(piece.piece, piece.score) for piece in m.pieces]
    unk_id = m.trainer_spec.unk_id
    model_type = m.trainer_spec.model_type
    if model_type != 1:
        raise Exception(
            "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
        )

    data = {"unk_id": unk_id, "vocab": vocab}

    replacement = "▁"
    add_prefix_space = True

    # Dump the vocab to a temporary JSON file the Unigram model can load, then clean it up
    out_vocab_filename = f"{filename}.json"
    try:
        with open(out_vocab_filename, "w") as f:
            json.dump(data, f, indent=4)

        tokenizer = Tokenizer(Unigram(out_vocab_filename))
    finally:
        os.remove(out_vocab_filename)

    tokenizer.normalizer = normalizers.Precompiled(precompiled_charsmap)
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [
            pre_tokenizers.WhitespaceSplit(),
            pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
        ]
    )
    tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)

    parameters = {
        "model": "SentencePieceUnigram",
    }

    # Build a SentencePieceUnigramTokenizer around the configured Tokenizer
    obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
    BaseTokenizer.__init__(obj, tokenizer, parameters)
    return obj
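# Usage sketch for from_spm. "spm.model" is a hypothetical path to a
# unigram-type SentencePiece model; the snippet assumes the downloaded
# sentencepiece_model_pb2.py sits in the working directory.
def _example_from_spm_usage():
    tok = from_spm("spm.model")
    # Encoding objects from the tokenizers library expose .tokens and .ids
    print(tok.encode("Hello world").tokens)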
def get_proto(filename: str):
    try:
        import sys

        sys.path.append(".")

        import sentencepiece_model_pb2 as model
    except Exception:
        raise Exception(
            "You don't seem to have the required protobuf file. In order to use this function you need to run "
            "`pip install protobuf` and "
            "`wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_model_pb2.py` "
            "so that we can read the internals of your spm file. `pip install sentencepiece` is not required."
        )

    # Parse the serialized SentencePiece model and return the raw proto
    m = model.ModelProto()
    with open(filename, "rb") as f:
        m.ParseFromString(f.read())
    return m
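# Inspection sketch for get_proto ("spm.model" is a hypothetical path). The
# returned ModelProto exposes the trainer spec, normalizer spec and pieces.
def _example_get_proto_usage():
    proto = get_proto("spm.model")
    print(proto.trainer_spec.model_type, proto.trainer_spec.unk_id)
    print(len(proto.pieces), proto.pieces[0].piece, proto.pieces[0].score)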
import argparse
import sys

import sentencepiece_model_pb2 as model


def main():
    # Argument parser
    parser = argparse.ArgumentParser(
        description="Append unknown vocabulary to a pretrained sentencepiece model"
    )
    parser.add_argument("-m", "--model", type=str, required=True,
                        help="path to the existing sentencepiece model to use")
    parser.add_argument("-i", "--input", type=str, required=True,
                        help="path to the input unknown-vocabulary file")
    parser.add_argument("-o", "--output", type=str, required=True,
                        help="path to the new sentencepiece model")
    args = parser.parse_args()
    print(args, file=sys.stderr)

    # Load & parse the old sentencepiece model
    # Ref: https://github.com/google/sentencepiece/issues/121#issuecomment-400362011
    m = model.ModelProto()
    with open(args.model, "rb") as mf:
        m.ParseFromString(mf.read())

    # Reuse the score of the last (lowest-scoring) piece for every new piece
    min_score = m.pieces[-1].score

    # Loop through the input file: one unknown token per line
    with open(args.input, "r") as uf:
        for line in uf:
            # Create a new SentencePiece and strip the trailing newline
            new_piece = model.ModelProto.SentencePiece()
            new_piece.piece = line.strip("\n")
            new_piece.score = min_score

            # Append the new piece to the model
            m.pieces.append(new_piece)

    # Save the model
    with open(args.output, "wb") as of:
        of.write(m.SerializeToString())
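# Invocation sketch (file names are hypothetical): the input file holds one
# new surface form per line, and every appended piece reuses the current
# lowest score so the extended model stays loadable by sentencepiece.
#
#     python append_unknowns.py -m old.model -i unknown_tokens.txt -o extended.model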
import argparse

import sentencepiece_model_pb2 as model


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="SP model conversion")
    parser.add_argument(
        "--sp_model",
        help="path to the existing sentencepiece model",
    )
    parser.add_argument(
        "--tokens",
        help="path to the replacement token list, one token per line",
    )
    args = parser.parse_args()

    m = model.ModelProto()
    with open(args.sp_model, "rb") as f:
        m.ParseFromString(f.read())

    vocab = []
    with open(args.tokens, "r") as f:
        for line in f:
            # Map plain underscores to the sentencepiece meta symbol (U+2581) and drop the newline
            line = line.replace("_", "\u2581").replace("\n", "")
            vocab.append(line)

    print(f"Vocab len {len(vocab)}")
    print(f"Pieces len {len(m.pieces)}")

    # Overwrite the surface form of every scored piece; pieces with score 0
    # (typically the leading control symbols such as <unk>, <s>, </s>) are
    # skipped, hence the offset of 3 into the replacement vocab.
    for i, p in enumerate(m.pieces):
        if p.score != 0:
            p.piece = vocab[i - 3]

    with open("new.model", "wb") as f:
        f.write(m.SerializeToString())
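# Sanity-check sketch (assumes `pip install sentencepiece` and that new.model
# was written by the conversion above): reload the model and inspect the
# remapped pieces.
def _check_converted_model():
    import sentencepiece as spm

    sp = spm.SentencePieceProcessor(model_file="new.model")
    print(sp.get_piece_size())
    print([sp.id_to_piece(i) for i in range(10)])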