import argparse

from fairseq.data import Dictionary
from fairseq.data.encoders.fastbpe import fastBPE

# Assumes a module-level argparse parser that defines --bpe-codes, pointing
# at BERTweet's bpe.codes file, e.g.:
parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes',
                    default='../input/bertweet-transformer-private/bpe.codes',
                    type=str)


class BERTweetTokenizer():
    def __init__(self,
                 pretrained_path='../input/bertweet-transformer-private/',
                 parser=parser):
        self.bpe = fastBPE(args=parser.parse_args(args=[]))
        self.vocab = Dictionary()
        self.vocab.add_from_file(pretrained_path + "dict.txt")
        self.cls_token_id = 0
        self.pad_token_id = 1
        self.sep_token_id = 2
        self.pad_token = '<pad>'
        # The surrounding spaces let the special tokens be concatenated
        # directly with the BPE-encoded string in encode().
        self.cls_token = '<s> '
        self.sep_token = ' </s>'

    def bpe_encode(self, text):
        return self.bpe.encode(text)

    def encode(self, text, add_special_tokens=False):
        # Special tokens are always added; the flag exists only for API
        # compatibility with HuggingFace-style tokenizers.
        subwords = self.cls_token + self.bpe.encode(text) + self.sep_token
        # add_if_not_exist must be False: silently growing the vocabulary at
        # encode time would yield ids outside the model's embedding table.
        input_ids = self.vocab.encode_line(
            subwords, append_eos=False, add_if_not_exist=False).long().tolist()
        return input_ids

    def tokenize(self, text):
        return self.bpe_encode(text).split()

    def convert_tokens_to_ids(self, tokens):
        input_ids = self.vocab.encode_line(
            ' '.join(tokens), append_eos=False,
            add_if_not_exist=False).long().tolist()
        return input_ids

    def decode_id(self, id):
        return self.vocab.string(id, bpe_symbol='@@')
from types import SimpleNamespace

from fairseq.data import Dictionary
from fairseq.data.encoders.fastbpe import fastBPE


class BERTweetTokenizer():
    def __init__(self, pretrained_path='./bertweet/'):
        self.bpe = fastBPE(
            SimpleNamespace(bpe_codes=pretrained_path + "bpe.codes"))
        self.vocab = Dictionary()
        self.vocab.add_from_file(pretrained_path + "dict.txt")
        self.cls_token_id = 0
        self.pad_token_id = 1
        self.sep_token_id = 2
        self.pad_token = '<pad>'
        self.cls_token = '<s>'
        self.sep_token = '</s>'

    def bpe_encode(self, text):
        return self.bpe.encode(text)

    def encode(self, text, add_special_tokens=False):
        # Note: unlike the variant above, this encode() does not add the
        # special tokens; the flag is accepted but ignored.
        subwords = self.bpe.encode(text)
        input_ids = self.vocab.encode_line(
            subwords, append_eos=False, add_if_not_exist=False).long().tolist()
        return input_ids

    def tokenize(self, text):
        return self.bpe_encode(text).split()

    def convert_tokens_to_ids(self, tokens):
        input_ids = self.vocab.encode_line(
            ' '.join(tokens), append_eos=False,
            add_if_not_exist=False).long().tolist()
        return input_ids

    def decode(self, ids, clean_up_tokenization_spaces=False):
        return self.vocab.string(ids, bpe_symbol='@@')
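# A minimal usage sketch (not from the original sources; the path and the
# example tweet are assumptions). It shows the round trip and that this
# variant's encode() does not prepend the <s>/</s> ids itself:
tokenizer = BERTweetTokenizer(pretrained_path='./bertweet/')
text = "SC has first two presumptive cases of coronavirus"
tokens = tokenizer.tokenize(text)              # BPE subword strings
ids = tokenizer.convert_tokens_to_ids(tokens)  # vocabulary ids
assert tokenizer.encode(text) == ids
full = [tokenizer.cls_token_id] + ids + [tokenizer.sep_token_id]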
import torch

from fairseq.data import Dictionary


def _clean_cats_attrs(ldict: Dictionary, schema, pred_cats: torch.Tensor,
                      pred_attrs: torch.Tensor):
    """Pair each predicted category with its list of predicted attribute
    labels, mapping label ids back to strings via the label dictionary."""
    cats = ldict.string(pred_cats).split(" ")
    attrs = []
    if len(pred_attrs.shape) == 1:
        # A single category: treat the 1-D tensor as one attribute row.
        split_pred_attrs = [pred_attrs]
    else:
        split_pred_attrs = pred_attrs.split(1, dim=0)
    for (_cat_idx, attr_idxs) in zip(pred_cats.tolist(), split_pred_attrs):
        seq_attrs = ldict.string(attr_idxs.squeeze()).split(" ")
        # An all-empty split means no attributes were predicted for this
        # category.
        if not any(seq_attrs):
            seq_attrs = []
        attrs.append(seq_attrs)
    return list(zip(cats, attrs))
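# A toy check of _clean_cats_attrs (hypothetical labels, not from the
# original source): one predicted category with two attribute labels.
# `schema` is never referenced in the function body, so None is passed.
ldict = Dictionary()
food = ldict.add_symbol("FOOD")
spicy = ldict.add_symbol("spicy")
sweet = ldict.add_symbol("sweet")
pred_cats = torch.tensor([food])
pred_attrs = torch.tensor([spicy, sweet])  # 1-D: attributes of the one category
print(_clean_cats_attrs(ldict, None, pred_cats, pred_attrs))
# -> [('FOOD', ['spicy', 'sweet'])]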
class BERTweetTokenizer(): def __init__(self, pretrained_path="../pretrained/bertweet/"): self.bpe = fastBPE( SimpleNamespace( bpe_codes=os.path.join(pretrained_path, "bpe.codes"))) self.vocab = Dictionary() self.vocab.add_from_file(os.path.join(pretrained_path, "dict.txt")) self.cls_token_id = 0 self.pad_token_id = 1 self.sep_token_id = 2 self.pad_token = '<pad>' self.cls_token = '<s>' self.sep_token = '</s>' def bpe_encode(self, text): return self.bpe.encode(text) def encode(self, text, add_special_tokens=False): subwords = self.bpe.encode(text) input_ids = self.vocab.encode_line( subwords, append_eos=False, add_if_not_exist=False).long().tolist() return input_ids def tokenize(self, text): return self.bpe_encode(text).split() def convert_tokens_to_ids(self, tokens): input_ids = self.vocab.encode_line( ' '.join(tokens), append_eos=False, add_if_not_exist=False).long().tolist() return input_ids #from: https://www.kaggle.com/nandhuelan/bertweet-first-look def decode_id(self, id): return self.vocab.string(id, bpe_symbol='@@') def decode_id_nospace(self, id): return self.vocab.string(id, bpe_symbol='@@ ')
def process_predictions(args, hypos, sp, tgt_dict: Dictionary, target_tokens,
                        res_files, speaker, id):
    for hypo in hypos[:min(len(hypos), args.nbest)]:
        hyp_pieces = tgt_dict.string(hypo["tokens"].int().cpu())
        # hyp_words = sp.DecodePieces(hyp_pieces.split())
        # Manual detokenization in place of sp.DecodePieces: fuse the pieces,
        # then turn the word-boundary marker back into spaces. This assumes
        # the dictionary stores '_' as the marker (sentencepiece itself uses
        # '\u2581').
        hyp_words = hyp_pieces.replace(' ', '').replace('_', ' ')
        print(hyp_words)
        print("{} ({}-{})".format(hyp_pieces, speaker, id),
              file=res_files["hypo.units"])
        print("{} ({}-{})".format(hyp_words, speaker, id),
              file=res_files["hypo.words"])

        tgt_pieces = tgt_dict.string(target_tokens)
        # tgt_words = sp.DecodePieces(tgt_pieces.split())
        tgt_words = tgt_pieces.replace(' ', '').replace('_', ' ')
        print("{} ({}-{})".format(tgt_pieces, speaker, id),
              file=res_files["ref.units"])
        print("{} ({}-{})".format(tgt_words, speaker, id),
              file=res_files["ref.words"])

        # only score top hypothesis
        if not args.quiet:
            logger.debug("HYPO:" + hyp_words)
            logger.debug("TARGET:" + tgt_words)
            logger.debug("___________________")
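# A tiny illustration (hypothetical pieces) of the manual piece-to-word
# conversion used above: deleting spaces fuses the pieces, then each '_'
# word-boundary marker becomes a space again.
pieces = "_the _cat s _sat"
words = pieces.replace(' ', '').replace('_', ' ')
print(words)  # -> " the cats sat" (note the leading space is kept)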
import torch

from fairseq.data import Dictionary


class PhoBertTokenizer(object):
    def __init__(self):
        self.vocab = Dictionary()
        self.vocab.add_from_file(
            "/content/drive/My Drive/PhoBERT_EMPATHETICDIALOGUES/EmpatheticDialogues/PhoBert/PhoBERT_base_transformers/dict.txt"
        )

    def tokenize(self, inp_string):
        # `bpe` is a module-level fastBPE instance, loaded elsewhere with
        # PhoBERT's bpe.codes file.
        return bpe.encode(inp_string).split(" ")

    def convert_tokens_to_ids(self, tokens):
        return self.vocab.encode_line(
            " ".join(tokens), append_eos=False,
            add_if_not_exist=False).long().tolist()

    def convert_ids_to_tokens(self, ids):
        return self.vocab.string(torch.tensor([ids], dtype=torch.long))
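# A hypothetical round-trip sketch (the word-segmented Vietnamese example is
# an assumption): the global `bpe` must be loaded before the tokenizer is
# used, and convert_ids_to_tokens returns the BPE token string.
tok = PhoBertTokenizer()
tokens = tok.tokenize("Tôi là sinh_viên")
ids = tok.convert_tokens_to_ids(tokens)
print(tok.convert_ids_to_tokens(ids))  # back to the BPE token string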
from os.path import join as pjoin

from fairseq.data import Dictionary


class XLMRobertaTokenizer:
    def __init__(self, pretrained_file):
        # Load the sentencepiece BPE model and the fairseq vocab file.
        # SentencepieceBPE is assumed to be a wrapper around
        # sentencepiece.SentencePieceProcessor that accepts the model path
        # and exposes encode()/decode() over space-joined pieces.
        bpe_model_file = pjoin(pretrained_file, 'sentencepiece.bpe.model')
        vocab_file = pjoin(pretrained_file, 'dict.txt')
        self.sp = SentencepieceBPE(bpe_model_file)
        self.bpe_dict = Dictionary().load(vocab_file)
        self.cls_token = "<s>"
        self.sep_token = "</s>"
        self.pad_token_id = 1

    def tokenize(self, sentence):
        return self.sp.encode(sentence).split(' ')

    def convert_tokens_to_ids(self, tokens):
        bpe_sentence = ' '.join(tokens)
        bpe_ids = self.bpe_dict.encode_line(
            bpe_sentence, add_if_not_exist=False, append_eos=False).tolist()
        return bpe_ids

    # def encode(self, sentence, add_bos=False, add_eos=False):
    #     bpe_sentence = '<s> ' + self.sp.encode(sentence) + ' </s>'
    #     bpe_ids = self.bpe_dict.encode_line(bpe_sentence, append_eos=False).tolist()
    #     if not add_bos:
    #         bpe_ids = bpe_ids[1:]
    #     if not add_eos:
    #         bpe_ids = bpe_ids[:-1]
    #     return bpe_ids

    def decode(self, tokens):
        sentences = [self.sp.decode(self.bpe_dict.string(s)) for s in tokens]
        return sentences

    def encodeAsPieces(self, sentence):
        bpe_sentence = '<s> ' + self.sp.encode(sentence) + ' </s>'
        return bpe_sentence

    @property
    def vocab_size(self):
        return len(self.bpe_dict)
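# A brief usage sketch under the same assumptions as the class (a local
# XLM-R checkpoint directory with sentencepiece.bpe.model and dict.txt; the
# sentence is an arbitrary example): tokenize into sentencepiece pieces, map
# them to ids, then decode the ids back to text.
xlmr_tok = XLMRobertaTokenizer('./xlmr.base/')
pieces = xlmr_tok.tokenize("Hello world!")
ids = xlmr_tok.convert_tokens_to_ids(pieces)
print(xlmr_tok.vocab_size)
print(xlmr_tok.decode([ids]))  # -> list containing the detokenized sentence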