from types import SimpleNamespace

from fairseq.data import Dictionary
from fairseq.data.encoders.fastbpe import fastBPE


class BERTweetTokenizer():

    def __init__(self, pretrained_path='./bertweet/'):
        # fastBPE applies the pretrained byte-pair-encoding merges;
        # the fairseq Dictionary maps subword tokens to vocabulary ids.
        self.bpe = fastBPE(
            SimpleNamespace(bpe_codes=pretrained_path + "bpe.codes"))
        self.vocab = Dictionary()
        self.vocab.add_from_file(pretrained_path + "dict.txt")

        # Special-token ids follow the RoBERTa convention.
        self.cls_token_id = 0
        self.pad_token_id = 1
        self.sep_token_id = 2
        self.pad_token = '<pad>'
        self.cls_token = '<s>'
        self.sep_token = '</s>'

    def bpe_encode(self, text):
        return self.bpe.encode(text)

    def encode(self, text, add_special_tokens=False):
        # Segment into subwords, then map each subword to its dictionary id.
        # (add_special_tokens is accepted for API compatibility but unused.)
        subwords = self.bpe.encode(text)
        input_ids = self.vocab.encode_line(
            subwords, append_eos=False,
            add_if_not_exist=False).long().tolist()
        return input_ids

    def tokenize(self, text):
        return self.bpe_encode(text).split()

    def convert_tokens_to_ids(self, tokens):
        input_ids = self.vocab.encode_line(
            ' '.join(tokens), append_eos=False,
            add_if_not_exist=False).long().tolist()
        return input_ids

    def decode(self, ids, clean_up_tokenization_spaces=False):
        # Strip the '@@' BPE continuation markers when rendering ids as text.
        return self.vocab.string(ids, bpe_symbol='@@')
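# A minimal usage sketch for the tokenizer above (a sketch, not part of the
# original: it assumes bpe.codes and dict.txt are present under ./bertweet/,
# and the example tweet is purely illustrative):
if __name__ == '__main__':
    tokenizer = BERTweetTokenizer('./bertweet/')
    text = "SC has first two presumptive cases of coronavirus"
    tokens = tokenizer.tokenize(text)              # BPE subword strings
    ids = tokenizer.convert_tokens_to_ids(tokens)
    # encode() should equal tokenize() followed by convert_tokens_to_ids().
    assert ids == tokenizer.encode(text)
    print(tokens)
    print(ids)
    print(tokenizer.decode(ids))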
from types import SimpleNamespace

import torch
from fairseq.data import Dictionary
from fairseq.data.encoders.fastbpe import fastBPE
from transformers import RobertaConfig, RobertaModel


# AbstractEmbedding is the project's own base class, defined elsewhere.
class RobertaTweetEmbedding(AbstractEmbedding):

    def __init__(self, device):
        super(RobertaTweetEmbedding, self).__init__(device=device)
        self.config = RobertaConfig.from_pretrained(
            '../data/models/BERTweet_base_transformers/config.json')
        self.model = RobertaModel.from_pretrained(
            '../data/models/BERTweet_base_transformers/model.bin',
            config=self.config)
        self.model.eval()  # disable dropout (or leave in train mode to fine-tune)
        self.model.to(self.device)
        self.pad_token_id = self.config.pad_token_id
        self.embedding_dim = self.model.config.hidden_size

        # Load the fastBPE encoder. The original recipe built a throwaway
        # ArgumentParser here, but parse_args() re-parses sys.argv and can
        # crash when the surrounding program takes its own CLI arguments, so
        # a SimpleNamespace carrying the single required attribute is safer.
        self.bpe = fastBPE(
            SimpleNamespace(
                bpe_codes="../data/models/BERTweet_base_transformers/bpe.codes"))

        # Load the subword-to-id dictionary.
        self.vocab = Dictionary()
        self.vocab.add_from_file(
            "../data/models/BERTweet_base_transformers/dict.txt")

    def forward(self, sentences):
        all_input_ids = []
        for sentence in sentences:
            # Encode the line using fastBPE and add the <s> / </s> special tokens.
            subwords = '<s> ' + self.bpe.encode(sentence) + ' </s>'
            # Map subword tokens to corresponding indices in the dictionary.
            input_ids = self.vocab.encode_line(
                subwords, append_eos=False,
                add_if_not_exist=False).long().tolist()
            all_input_ids.append(input_ids)

        # Right-pad every sequence to the length of the longest one.
        # Note that no attention mask is passed below, so padded positions
        # are still attended to.
        max_seq_length = max(map(len, all_input_ids))
        pad_all_input_ids = [
            input_ids + [self.pad_token_id] * (max_seq_length - len(input_ids))
            for input_ids in all_input_ids
        ]

        # Extract features without tracking gradients.
        with torch.no_grad():
            features = self.model(
                torch.tensor(pad_all_input_ids,
                             dtype=torch.long).to(self.device))
        # features[0] is the last hidden state, shaped
        # (batch, max_seq_length, hidden_size).
        return features[0]
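# A minimal usage sketch (a sketch, not part of the original: it assumes the
# BERTweet_base_transformers files exist at the relative paths hard-coded
# above and that AbstractEmbedding accepts a torch device as shown; the
# sample tweets are illustrative):
if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    embedder = RobertaTweetEmbedding(device)
    feats = embedder.forward([
        "SC has first two presumptive cases of coronavirus",
        "Stay home and stay safe",
    ])
    print(feats.shape)  # (2, max_seq_length, 768) for the base model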
import os
from types import SimpleNamespace

from fairseq.data import Dictionary
from fairseq.data.encoders.fastbpe import fastBPE


class BERTweetTokenizer():

    def __init__(self, pretrained_path="../pretrained/bertweet/"):
        self.bpe = fastBPE(
            SimpleNamespace(
                bpe_codes=os.path.join(pretrained_path, "bpe.codes")))
        self.vocab = Dictionary()
        self.vocab.add_from_file(os.path.join(pretrained_path, "dict.txt"))

        # Special-token ids follow the RoBERTa convention.
        self.cls_token_id = 0
        self.pad_token_id = 1
        self.sep_token_id = 2
        self.pad_token = '<pad>'
        self.cls_token = '<s>'
        self.sep_token = '</s>'

    def bpe_encode(self, text):
        return self.bpe.encode(text)

    def encode(self, text, add_special_tokens=False):
        subwords = self.bpe.encode(text)
        input_ids = self.vocab.encode_line(
            subwords, append_eos=False,
            add_if_not_exist=False).long().tolist()
        return input_ids

    def tokenize(self, text):
        return self.bpe_encode(text).split()

    def convert_tokens_to_ids(self, tokens):
        input_ids = self.vocab.encode_line(
            ' '.join(tokens), append_eos=False,
            add_if_not_exist=False).long().tolist()
        return input_ids

    # from: https://www.kaggle.com/nandhuelan/bertweet-first-look
    def decode_id(self, id):
        # Strips the '@@' markers but keeps the spaces between subword
        # pieces, so pieces remain visibly separated.
        return self.vocab.string(id, bpe_symbol='@@')

    def decode_id_nospace(self, id):
        # Strips '@@ ' (marker plus its trailing space), merging subword
        # pieces back into whole words.
        return self.vocab.string(id, bpe_symbol='@@ ')
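# A round-trip sketch contrasting the two decoders (a sketch, not part of the
# original: it assumes the pretrained files live under ../pretrained/bertweet/,
# and the tweet text and subword split shown are illustrative):
if __name__ == '__main__':
    tok = BERTweetTokenizer("../pretrained/bertweet/")
    ids = tok.encode("vaccines are coming")
    print(tok.decode_id(ids))          # e.g. 'vacc ines are coming'
    print(tok.decode_id_nospace(ids))  # e.g. 'vaccines are coming'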