def __init__(self, num_class=4): """Constructor""" super(BagOfEmbeddings, self).__init__() tokenizer = CharBPETokenizer('../Tokenize/thyme-tokenizer-vocab.json', '../Tokenize/thyme-tokenizer-merges.txt') vocab_size = tokenizer.get_vocab_size() self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=cfg.getint('model', 'emb_dim')) self.hidden1 = nn.Linear(in_features=cfg.getint('model', 'emb_dim'), out_features=cfg.getint( 'model', 'hidden_size')) self.relu = nn.ReLU() self.hidden2 = nn.Linear( in_features=cfg.getint('model', 'hidden_size'), out_features=cfg.getint('model', 'hidden_size')) self.dropout = nn.Dropout(cfg.getfloat('model', 'dropout')) self.classif = nn.Linear(in_features=cfg.getint( 'model', 'hidden_size'), out_features=num_class)
def test_basic_encode(self, openai_files):
    tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"])
    output = tokenizer.encode("My name is John", "pair")

    assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]
    assert output.tokens == [
        "<unk>",
        "y</w>",
        "name</w>",
        "is</w>",
        "<unk>",
        "o",
        "hn</w>",
        "pair</w>",
    ]
    assert output.offsets == [
        (0, 1),
        (1, 2),
        (3, 7),
        (8, 10),
        (11, 12),
        (12, 13),
        (13, 15),
        (0, 4),
    ]
    assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1]

def __init__(self, num_classes=3):
    """We have some of the best constructors in the world"""
    super(TransformerClassifier, self).__init__()

    tokenizer = CharBPETokenizer(
        '../Tokenize/thyme-tokenizer-vocab.json',
        '../Tokenize/thyme-tokenizer-merges.txt')
    vocab_size = tokenizer.get_vocab_size()

    self.embedding = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=cfg.getint('model', 'emb_dim'))

    self.position = PositionalEncoding(
        embedding_dim=cfg.getint('model', 'emb_dim'))

    encoder_layer = nn.TransformerEncoderLayer(
        d_model=cfg.getint('model', 'emb_dim'),
        nhead=cfg.getint('model', 'num_heads'),
        dim_feedforward=cfg.getint('model', 'feedforw_dim'))

    self.trans_encoder = nn.TransformerEncoder(
        encoder_layer=encoder_layer,
        num_layers=cfg.getint('model', 'num_layers'))

    self.dropout = nn.Dropout(cfg.getfloat('model', 'dropout'))

    self.linear = nn.Linear(
        in_features=cfg.getint('model', 'emb_dim'),
        out_features=num_classes)

    self.init_weights()

def test_train_from_iterator(self):
    text = ["A first sentence", "Another sentence", "And a last one"]
    tokenizer = CharBPETokenizer()
    tokenizer.train_from_iterator(text, show_progress=False)

    output = tokenizer.encode("A sentence")
    assert output.tokens == ["A</w>", "sentence</w>"]

def __init__(self, text_list, vocab_size, lazy=False):
    if not lazy:
        self.tokenizer = CharBPETokenizer()
        self.tokenizer.train(
            text_list,
            vocab_size=vocab_size,
            special_tokens=[PAD, BOS, EOS, "<unk>"])
        self.tokenizer.add_special_tokens([PAD, BOS, EOS])
    else:
        self.tokenizer = None

def test_lowercase(self, openai_files):
    tokenizer = CharBPETokenizer(
        openai_files["vocab"], openai_files["merges"], lowercase=True)
    output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)

    assert output.ids == [547, 1362, 544, 2476, 2688]
    assert output.tokens == [
        "my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"
    ]
    assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
    assert output.type_ids == [0, 0, 0, 0, 1]

def __init__(self, num_class=4): """Constructor""" super(LstmClassifier, self).__init__() tokenizer = CharBPETokenizer('../Tokenize/thyme-tokenizer-vocab.json', '../Tokenize/thyme-tokenizer-merges.txt') vocab_size = tokenizer.get_vocab_size() self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=cfg.getint('model', 'emb_dim')) self.lstm = nn.LSTM(input_size=cfg.getint('model', 'emb_dim'), hidden_size=cfg.getint('model', 'hidden_size')) self.dropout = nn.Dropout(p=cfg.getfloat('model', 'dropout')) self.linear = nn.Linear(in_features=cfg.getint('model', 'hidden_size'), out_features=num_class)
def load(vocab_file=None):
    if not os.path.exists(vocab_file):
        raise Exception("{} does not exist".format(vocab_file))

    path, filename = os.path.split(vocab_file)
    # The tokenizer type is encoded as the filename prefix, e.g. "char_*".
    ttype = filename.split("_")[0]
    merges_file = os.path.join(
        path, filename.replace("vocab.json", "merges.txt"))

    if ttype == "byte":
        if not os.path.exists(merges_file):
            raise Exception("{} does not exist".format(merges_file))
        tokenizer = ByteLevelBPETokenizer(
            add_prefix_space=True,  # required
            lowercase=True,  # required
            unicode_normalizer=None,  # required
            vocab_file=vocab_file,
            merges_file=merges_file,
            dropout=None,
            continuing_subword_prefix=None,
            end_of_word_suffix=None)
    elif ttype == "char":
        if not os.path.exists(merges_file):
            raise Exception("{} does not exist".format(merges_file))
        tokenizer = CharBPETokenizer(
            unk_token=unk_token,  # required
            suffix=suffix_token,  # required
            lowercase=True,  # required
            unicode_normalizer=None,  # required
            vocab_file=vocab_file,
            merges_file=merges_file,
            dropout=None)
    elif ttype == "bert":
        tokenizer = BertWordPieceTokenizer(
            clean_text=True,  # required
            handle_chinese_chars=True,  # required
            strip_accents=True,  # required
            lowercase=True,  # required
            vocab_file=vocab_file,
            # add_special_tokens=True,
            unk_token=BUNK,
            sep_token=BSEP,
            cls_token=BCLS,
            wordpieces_prefix=BPRE)
    elif ttype == "sent":
        if not os.path.exists(merges_file):
            raise Exception("{} does not exist".format(merges_file))
        tokenizer = SentencePieceBPETokenizer(
            add_prefix_space=True,  # required
            unk_token=unk_token,
            replacement=rep_token,
            vocab_file=vocab_file,
            merges_file=merges_file,
            dropout=None)
    else:
        raise Exception("Not implemented yet")

    return tokenizer

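# A minimal usage sketch for load() above: the tokenizer type is inferred from the
# filename prefix ("byte_", "char_", "bert_" or "sent_") and the merges file is
# expected next to the vocab file. The path below is hypothetical, and unk_token /
# suffix_token are assumed to be defined as in the surrounding module.
tokenizer = load("vocab/char_corpus-vocab.json")  # also expects vocab/char_corpus-merges.txt
ids = tokenizer.encode("hello world").ids
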
def get_data():
    transcript_folder = os.path.join('data', 'transcripts')
    summary_folder = os.path.join('data', 'summary')

    train_files, train_result_files, test_files, test_result_files = \
        get_dataset_files(transcript_folder, summary_folder)
    train_data, train_results, test_data, test_results = \
        get_dataset(train_files, train_result_files, test_files, test_result_files)

    tokenizer = CharBPETokenizer()
    all_files = np.concatenate(
        [train_files, train_result_files, test_files, test_result_files])
    tokenizer.train(list(all_files))

    train_data = tokenize_data(tokenizer, train_data)
    test_data = tokenize_data(tokenizer, test_data)

    return train_data, train_results, test_data, test_results

def test(): """Test trained tokenizer""" tokenizer = CharBPETokenizer('./thyme-tokenizer-vocab.json', './thyme-tokenizer-merges.txt') vocab = tokenizer.get_vocab() print('vocab size:', len(vocab)) encoded = tokenizer.encode('patient dr. who diagnosed with brain abc') encoded.pad(15) print('encoded:', encoded.ids) print('decoded:', tokenizer.decode(encoded.ids)) print(encoded.tokens) print(encoded.attention_mask)
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
    kwargs.setdefault("unk_token", unk_token)
    super().__init__(
        CharBPETokenizer(
            vocab_file=vocab_file,
            merges_file=merges_file,
            unk_token=unk_token,
            lowercase=True),
        **kwargs,
    )

def main():
    # argparser
    parser = argparse.ArgumentParser(
        prog="train_mlm_camembert_thai.py",
        description="train mlm for Camembert with huggingface Trainer",
    )

    # required
    parser.add_argument("--bpe_tokenizer", type=str, default='sentencepiece',
                        help='Specify the name of BPE Tokenizer')
    parser.add_argument("--vocab_size", type=int, default=52000)
    parser.add_argument("--min_frequency", type=int, default=2)
    parser.add_argument(
        "--train_dir",
        type=str,
    )
    parser.add_argument(
        "--output_dir",
        type=str,
    )
    parser.add_argument("--ext", type=str, default='.txt')

    args = parser.parse_args()

    fnames = [str(x) for x in glob.glob(f"{args.train_dir}/*{args.ext}")]

    # Initialize a tokenizer
    if args.bpe_tokenizer == 'byte_level':
        _BPE_TOKENIZER = ByteLevelBPETokenizer()
    if args.bpe_tokenizer == 'char':
        _BPE_TOKENIZER = CharBPETokenizer()
    if args.bpe_tokenizer == 'sentencepiece':
        _BPE_TOKENIZER = SentencePieceBPETokenizer()

    tokenizer = _BPE_TOKENIZER

    # Customize training
    tokenizer.train(files=fnames,
                    vocab_size=args.vocab_size,
                    min_frequency=args.min_frequency,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])

    # Save files to disk
    tokenizer.save_model(args.output_dir)

    # test
    tokenizer = CamembertTokenizer.from_pretrained(args.output_dir)
    print(tokenizer.encode_plus('สวัสดีครับ hello world'))

class SubwordEncoder:
    "Subword tokenization"

    def __init__(self, path='subword/'):
        """
        Args:
            path: str, a path to vocab file.
        """
        # Load vocab
        self.subword_tokenizer = CharBPETokenizer(
            vocab_file=path + "/bpe-vocab.json",
            merges_file=path + "/bpe-merges.txt")
        self.encode = self._encode_subwords
        self.id_to_token = self._id_to_subword()
        self.token_to_id = self._subword_to_id()

    def get_vocab_size(self):
        return self.subword_tokenizer.get_vocab_size()

    def _encode_subwords(self, sentence, with_eos):
        """
        Args:
            sentence: str, texts to be encoded.
            with_eos: end with <EOS> token.
        Returns:
            tokens: list, encoded sequence.
        """
        tokens = self.subword_tokenizer.encode(sentence).ids
        if with_eos:
            tokens += [2]  # 2 is the id of <EOS> token
        return tokens

    def _id_to_subword(self):
        id2subword = {}
        for i in range(self.get_vocab_size()):
            id2subword[i] = self.subword_tokenizer.id_to_token(i)
        return id2subword

    def _subword_to_id(self):
        subword2id = {}
        for i in range(self.get_vocab_size()):
            subword2id[self.subword_tokenizer.id_to_token(i)] = i
        return subword2id

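# A minimal usage sketch for SubwordEncoder above, assuming bpe-vocab.json and
# bpe-merges.txt already exist under the (hypothetical) subword/ directory.
encoder = SubwordEncoder(path='subword/')
ids = encoder.encode("hello world", with_eos=True)   # ends with the <EOS> id (2)
tokens = [encoder.id_to_token[i] for i in ids]
print(encoder.get_vocab_size(), ids, tokens)
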
def to_lstm_inputs(texts, max_len=None):
    """Padded at the beginning rather than at the end"""
    tokenizer = CharBPETokenizer(
        '../Tokenize/thyme-tokenizer-vocab.json',
        '../Tokenize/thyme-tokenizer-merges.txt')
    seqs = [tokenizer.encode(text).ids for text in texts]

    if max_len is None:
        # set max_len to the length of the longest sequence
        max_len = max(len(id_seq) for id_seq in seqs)

    ids = torch.zeros(len(seqs), max_len, dtype=torch.long)
    for i, seq in enumerate(seqs):
        if len(seq) > max_len:
            seq = seq[:max_len]
        ids[i, -len(seq):] = torch.tensor(seq)

    return ids

def to_token_id_sequences(texts, max_len=None):
    """Matrix of token ids"""
    tokenizer = CharBPETokenizer(
        '../Tokenize/thyme-tokenizer-vocab.json',
        '../Tokenize/thyme-tokenizer-merges.txt')
    seqs = [tokenizer.encode(text).ids for text in texts]

    if max_len is None:
        # set max_len to the length of the longest sequence
        max_len = max(len(id_seq) for id_seq in seqs)

    ids = torch.zeros(len(seqs), max_len, dtype=torch.long)
    for i, seq in enumerate(seqs):
        if len(seq) > max_len:
            seq = seq[:max_len]
        ids[i, :len(seq)] = torch.tensor(seq)

    return ids

def train(args):
    tokenizer = CharBPETokenizer()
    tokenizer.train([args.corpus], vocab_size=1000)
    tokenizer.save("src/dev_scripts/tokenizer.json")

def create_tokenizer_imbd(data_path, file_name, vocab_size):
    # df = pd.read_csv(os.path.join(data_path, file_name))
    tokenizer = CharBPETokenizer()
    tokenizer.train(
        os.path.join(data_path, file_name),
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[CLS]", "[PAD]", "[MASK]", "[UNK]", "[SEP]"])

    print("[CLS]: {}, [PAD]: {}, [MASK]: {}, [UNK]: {}, [SEP]: {}".format(
        str(tokenizer.token_to_id("[CLS]")),
        str(tokenizer.token_to_id("[PAD]")),
        str(tokenizer.token_to_id("[MASK]")),
        str(tokenizer.token_to_id("[UNK]")),
        str(tokenizer.token_to_id("[SEP]"))))

    tokenizer.save(data_path, "tokenizer")

def to_transformer_inputs(texts, max_len=None):
    """Matrix of token ids and a square attention mask for each sample"""
    tokenizer = CharBPETokenizer(
        '../Tokenize/thyme-tokenizer-vocab.json',
        '../Tokenize/thyme-tokenizer-merges.txt')
    seqs = [tokenizer.encode(text).ids for text in texts]

    if max_len is None:
        # set max_len to the length of the longest sequence
        max_len = max(len(id_seq) for id_seq in seqs)

    ids = torch.zeros(len(seqs), max_len, dtype=torch.long)
    mask = torch.zeros(len(seqs), max_len, max_len, dtype=torch.long)
    for i, seq in enumerate(seqs):
        if len(seq) > max_len:
            seq = seq[:max_len]
        ids[i, :len(seq)] = torch.tensor(seq)
        mask[i, :len(seq), :len(seq)] = 1

    return ids, mask

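# Shape check for to_transformer_inputs above, assuming the THYME tokenizer files
# referenced in it are present; the input texts are made up. ids is (batch, max_len)
# and mask is a square (batch, max_len, max_len) attention mask with 1s over the
# non-padded positions.
texts = ['patient diagnosed with pneumonia', 'follow-up in two weeks']
ids, mask = to_transformer_inputs(texts, max_len=10)
print(ids.shape, mask.shape)  # torch.Size([2, 10]) torch.Size([2, 10, 10])
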
def __init__(self, num_class=3):
    """Constructor"""
    super(BagOfEmbeddings, self).__init__()

    tokenizer = CharBPETokenizer(
        '../Tokenize/thyme-tokenizer-vocab.json',
        '../Tokenize/thyme-tokenizer-merges.txt')
    vocab_size = tokenizer.get_vocab_size()

    self.embed = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=cfg.getint('model', 'emb_dim'))

    self.posit = positions.BertPositionalEncoding.from_pretrained(
        'bert-base-uncased')

    self.dropout = nn.Dropout(cfg.getfloat('model', 'dropout'))

    self.classif = nn.Linear(
        in_features=cfg.getint('model', 'emb_dim'),
        out_features=num_class)

def __init__(self, args):
    self.args = args

    if self.args.type == "byte":
        self.tokenizer = ByteLevelBPETokenizer(
            add_prefix_space=True,  # required
            lowercase=True,  # required
            unicode_normalizer=None,  # required
            vocab_file=None,
            merges_file=None,
            dropout=None,
            continuing_subword_prefix=None,
            end_of_word_suffix=None)
    elif self.args.type == "char":
        self.tokenizer = CharBPETokenizer(
            unk_token=unk_token,  # required
            suffix=suffix_token,  # required
            lowercase=True,  # required
            unicode_normalizer=None,  # required
            vocab_file=None,
            merges_file=None,
            dropout=None)
    elif self.args.type == "bert":
        self.tokenizer = BertWordPieceTokenizer(
            clean_text=True,  # required
            handle_chinese_chars=True,  # required
            strip_accents=True,  # required
            lowercase=True,  # required
            vocab_file=None,
            # add_special_tokens=True,
            unk_token=BUNK,
            sep_token=BSEP,
            cls_token=BCLS,
            wordpieces_prefix=BPRE)
    elif self.args.type == "sent":
        self.tokenizer = SentencePieceBPETokenizer(
            add_prefix_space=True,  # required
            unk_token=unk_token,
            replacement=rep_token,
            vocab_file=None,
            merges_file=None,
            dropout=None)
    else:
        raise Exception("Not implemented yet")

def _cbpe(self):
    tokenizer = CharBPETokenizer(
        vocab=self.conf.vocab,
        merges=self.conf.merges,
        unk_token=self.conf.cbpe_unk_token,
        suffix=self.conf.suffix,
        dropout=self.conf.dropout,
        lowercase=self.conf.lowercase,
        unicode_normalizer=self.conf.unicode_normalizer,
        bert_normalizer=self.conf.bert_normalizer,
        split_on_whitespace_only=self.conf.split_on_whitespace_only,
    )
    tokenizer.train(
        files=self.files,
        vocab_size=self.conf.vocab_size,
        min_frequency=self.conf.min_frequency,
        special_tokens=self.conf.special_tokens,
        limit_alphabet=self.conf.limit_alphabet,
        initial_alphabet=self.conf.initial_alphabet,
        suffix=self.conf.cpbe_train_shuffix,
    )
    return tokenizer

class HuggingFaceTokenizer:

    def __init__(self, cache_dir, max_length=None, vocab_size=400):
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.cache_dir = cache_dir
        self.name = "%d-%s" % (vocab_size, max_length)
        self.tokenizer = None

        vocab = os.path.join(self.cache_dir, self.name + '-vocab.json')
        merges = os.path.join(self.cache_dir, self.name + '-merges.txt')
        if os.path.exists(vocab) and os.path.exists(merges):
            self.tokenizer = CharBPETokenizer(vocab, merges, lowercase=True)
            print('Using cached HuggingFaceTokenizer')

    def build(self, texts):
        if self.tokenizer is not None:
            return

        tmp_file = tempfile.NamedTemporaryFile()
        with open(tmp_file.name, "w") as f:
            f.write(' '.join(texts).lower())

        self.tokenizer = CharBPETokenizer(lowercase=True)
        self.tokenizer.train(
            [tmp_file.name],
            vocab_size=self.vocab_size,
            special_tokens=[
                NUL_token,
                PAD_token,
                BOS_token,
                UNK_token,
            ],
        )

        os.makedirs(self.cache_dir, exist_ok=True)
        self.tokenizer.save(self.cache_dir, self.name)

    def encode(self, text):
        token_ids = self.tokenizer.encode(text.lower()).ids
        token_ids = token_ids[:self.max_length]
        return token_ids

    def decode(self, tokens, skip_special_tokens=True):
        # Note: the special tokens are not skipped here even when
        # skip_special_tokens=True; filtering the ids manually
        # (e.g. [token for token in tokens if token > 3]) works around that.
        text = self.tokenizer.decode(
            tokens,
            skip_special_tokens=skip_special_tokens,
        )
        return text

    def decode_plus(self, token_batch):
        sentences = []
        for tokens in token_batch:
            sentences.append(self.decode(tokens))
        return sentences

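# A minimal usage sketch for HuggingFaceTokenizer above; the cache directory and
# training texts are made up, and NUL_token / PAD_token / BOS_token / UNK_token
# are assumed to be defined as in the surrounding module.
hf_tok = HuggingFaceTokenizer(cache_dir='cache/', max_length=32, vocab_size=400)
hf_tok.build(["a first training sentence", "another training sentence"])
ids = hf_tok.encode("A first sentence")
print(ids, hf_tok.decode(ids))
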
def __init__(self, tokenizers=None, cleaner=english_cleaners):
    if tokenizers is None:
        tokenizers = CharBPETokenizer(
            './BPE-1024/-vocab.json',
            './BPE-1024/-merges.txt',
            lowercase=True,
        )

    punctuation = string.punctuation
    punctuation = punctuation.replace("+", "")
    punctuation = punctuation.replace("&", "")
    table = str.maketrans(punctuation, " " * len(punctuation))

    if cleaner is not None:
        print('Using cleaner!')

    self.table = table
    self.cleaner = cleaner
    self.token = tokenizers
    self.vocab_size = self.token.get_vocab_size()

def create_tokenizer(data_path, vocab_size):
    tokenizer = CharBPETokenizer()
    chunk_files = [f for f in os.listdir(data_path)
                   if f.find("uncased_chunk") != -1][:20]
    tokenizer.train(
        [os.path.join(data_path, file) for file in chunk_files],
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[CLS]", "[PAD]", "[MASK]", "[UNK]", "[SEP]"])

    print("[CLS]: {}, [PAD]: {}, [MASK]: {}, [UNK]: {}, [SEP]: {}".format(
        str(tokenizer.token_to_id("[CLS]")),
        str(tokenizer.token_to_id("[PAD]")),
        str(tokenizer.token_to_id("[MASK]")),
        str(tokenizer.token_to_id("[UNK]")),
        str(tokenizer.token_to_id("[SEP]"))))

    tokenizer.save(data_path, "tokenizer")

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False,
                        help='which GPUs to use')
    parser.add_argument('--raw_data_path', default='data/train.txt', type=str,
                        required=False, help='raw training corpus')
    parser.add_argument('--batch_size', default=2, type=int, required=False,
                        help='batch size for model inference')
    parser.add_argument('--model_path', default='./model/epoch_5/model.bin',
                        type=str, required=False, help='path of the saved model')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    model_path = args.model_path

    # device
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # set which GPUs the program uses
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    # tokenizer
    tokenizer = CharBPETokenizer(
        "./vocab/bpe.tokenizer.json-vocab.json",
        './vocab/bpe.tokenizer.json-merges.txt')

    # model
    with open('./config/model_config.json', 'r', encoding='utf-8') as f:
        text = f.read()
    config = json.loads(text)
    model = GPT2LMHeadModel(config)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    model.to(device)

def train(): """My main man""" base = os.environ['DATA_ROOT'] corpus_path = base + 'Thyme/Text/train+dev+test/*' files = glob.glob(corpus_path) tokenizer = CharBPETokenizer(lowercase=True) tokenizer.train(files=files, vocab_size=10000, min_frequency=3, show_progress=True) tokenizer.save('.', name='thyme-tokenizer')
def build_tokenizer(args):
    tokenizer = None
    if args.tokenizer_type == "bbpe":
        tokenizer = ByteLevelBPETokenizer(unicode_normalizer="nfkc")
    elif args.tokenizer_type == "cbpe":
        tokenizer = CharBPETokenizer(
            unk_token="<unk>",
            unicode_normalizer="nfkc",
            bert_normalizer=False,
            split_on_whitespace_only=True,
        )
    elif args.tokenizer_type == "wp":
        tokenizer = BertWordPieceTokenizer(
            clean_text=False,
            handle_chinese_chars=True,
            strip_accents=False,
            lowercase=False,
        )
    return tokenizer

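# A minimal usage sketch for build_tokenizer above; argparse.Namespace stands in
# for the parsed command-line arguments, and "cbpe" is an arbitrary choice.
import argparse
args = argparse.Namespace(tokenizer_type="cbpe")
tokenizer = build_tokenizer(args)  # CharBPETokenizer configured as above
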