class BERTweetTokenizer():

    def __init__(self, pretrained_path='../input/bertweet-transformer-private/', parser=parser):
        self.bpe = fastBPE(args=parser.parse_args(args=[]))
        self.vocab = Dictionary()
        self.vocab.add_from_file(pretrained_path + "dict.txt")
        self.cls_token_id = 0
        self.pad_token_id = 1
        self.sep_token_id = 2
        self.pad_token = '<pad>'
        # the surrounding spaces let encode() concatenate the special tokens directly
        self.cls_token = '<s> '
        self.sep_token = ' </s>'

    def bpe_encode(self, text):
        return self.bpe.encode(text)

    def encode(self, text, add_special_tokens=False):
        subwords = self.cls_token + self.bpe.encode(text) + self.sep_token
        # add_if_not_exist=False keeps the pretrained vocabulary fixed; unseen
        # subwords map to <unk> instead of being appended to the dictionary
        input_ids = self.vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False).long().tolist()
        return input_ids

    def tokenize(self, text):
        return self.bpe_encode(text).split()

    def convert_tokens_to_ids(self, tokens):
        input_ids = self.vocab.encode_line(' '.join(tokens), append_eos=False, add_if_not_exist=False).long().tolist()
        return input_ids

    def decode_id(self, id):
        return self.vocab.string(id, bpe_symbol='@@')
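A minimal usage sketch for the class above, assuming fairseq and the fastBPE package are installed and that bpe.codes/dict.txt live under the notebook's input path; note that `parser` is read as a default argument, so it has to exist before the class definition:

import argparse
from fairseq.data import Dictionary
from fairseq.data.encoders.fastbpe import fastBPE

# defined before the class above, since it is captured as a default argument
parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes', type=str,
                    default='../input/bertweet-transformer-private/bpe.codes',
                    help='path to the fastBPE codes file')

tokenizer = BERTweetTokenizer(parser=parser)
ids = tokenizer.encode("an example tweet")       # <s> ... </s>-wrapped subword ids
tokens = tokenizer.tokenize("an example tweet")  # BPE pieces as strings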
Example #2
class BERTweetTokenizer():
    def __init__(self, pretrained_path='./bertweet/'):

        self.bpe = fastBPE(
            SimpleNamespace(bpe_codes=pretrained_path + "bpe.codes"))
        self.vocab = Dictionary()
        self.vocab.add_from_file(pretrained_path + "dict.txt")
        self.cls_token_id = 0
        self.pad_token_id = 1
        self.sep_token_id = 2
        self.pad_token = '<pad>'
        self.cls_token = '<s>'
        self.sep_token = '</s>'

    def bpe_encode(self, text):
        return self.bpe.encode(text)

    def encode(self, text, add_special_tokens=False):
        subwords = self.bpe.encode(text)
        input_ids = self.vocab.encode_line(
            subwords, append_eos=False,
            add_if_not_exist=False).long().tolist()
        return input_ids

    def tokenize(self, text):
        return self.bpe_encode(text).split()

    def convert_tokens_to_ids(self, tokens):
        input_ids = self.vocab.encode_line(
            ' '.join(tokens), append_eos=False,
            add_if_not_exist=False).long().tolist()
        return input_ids

    def decode(self, ids, clean_up_tokenization_spaces=False):
        return self.vocab.string(ids, bpe_symbol='@@')
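encode() in this variant does not add special tokens, so a common pattern is to wrap and pad the ids by hand; a quick sketch, assuming ./bertweet/ holds bpe.codes and dict.txt, with max_len and the sentence purely illustrative:

tokenizer = BERTweetTokenizer()
ids = tokenizer.encode("an example tweet")
ids = [tokenizer.cls_token_id] + ids + [tokenizer.sep_token_id]

max_len = 96                                     # illustrative fixed length
attention_mask = [1] * len(ids) + [0] * (max_len - len(ids))
input_ids = ids + [tokenizer.pad_token_id] * (max_len - len(ids))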
Example #3
def _clean_cats_attrs(ldict: Dictionary, schema, pred_cats: torch.Tensor,
                      pred_attrs: torch.Tensor):
    """Map predicted category/attribute ids back to label strings via `ldict`,
    returning one (category, [attributes]) pair per predicted category.
    `schema` is unused in the body."""
    cats = ldict.string(pred_cats).split(" ")
    attrs = []

    # pred_attrs is either a single 1-D row or a 2-D tensor with one row per category
    if len(pred_attrs.shape) == 1:
        split_pred_attrs = [pred_attrs]
    else:
        split_pred_attrs = pred_attrs.split(1, dim=0)
    for (_cat_idx, attr_idxs) in zip(pred_cats.tolist(), split_pred_attrs):
        seq_attrs = ldict.string(attr_idxs.squeeze()).split(" ")
        # an all-empty split means no attributes were predicted for this category
        if not any(seq_attrs):
            seq_attrs = []
        attrs.append(seq_attrs)
    return list(zip(cats, attrs))
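An illustrative call of the helper above; the label names and indices are made up, a fresh fairseq Dictionary reserves ids 0-3 for <s>, <pad>, </s>, <unk> (so added labels start at 4), and `schema` is unused by the body, so None is passed:

import torch
from fairseq.data import Dictionary

ldict = Dictionary()
for lbl in ["shirt", "dress", "striped", "red"]:
    ldict.add_symbol(lbl)                        # ids 4, 5, 6, 7

pred_cats = torch.tensor([4, 5])                 # "shirt", "dress"
pred_attrs = torch.tensor([[6, 7], [7, 6]])      # one attribute row per category
print(_clean_cats_attrs(ldict, None, pred_cats, pred_attrs))
# [('shirt', ['striped', 'red']), ('dress', ['red', 'striped'])]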
Example #4
class BERTweetTokenizer():
    def __init__(self, pretrained_path="../pretrained/bertweet/"):

        self.bpe = fastBPE(
            SimpleNamespace(
                bpe_codes=os.path.join(pretrained_path, "bpe.codes")))
        self.vocab = Dictionary()
        self.vocab.add_from_file(os.path.join(pretrained_path, "dict.txt"))
        self.cls_token_id = 0
        self.pad_token_id = 1
        self.sep_token_id = 2
        self.pad_token = '<pad>'
        self.cls_token = '<s>'
        self.sep_token = '</s>'

    def bpe_encode(self, text):
        return self.bpe.encode(text)

    def encode(self, text, add_special_tokens=False):
        subwords = self.bpe.encode(text)
        input_ids = self.vocab.encode_line(
            subwords, append_eos=False,
            add_if_not_exist=False).long().tolist()
        return input_ids

    def tokenize(self, text):
        return self.bpe_encode(text).split()

    def convert_tokens_to_ids(self, tokens):
        input_ids = self.vocab.encode_line(
            ' '.join(tokens), append_eos=False,
            add_if_not_exist=False).long().tolist()
        return input_ids

    #from: https://www.kaggle.com/nandhuelan/bertweet-first-look
    def decode_id(self, id):
        return self.vocab.string(id, bpe_symbol='@@')

    def decode_id_nospace(self, id):
        return self.vocab.string(id, bpe_symbol='@@ ')
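The two decode helpers differ only in the bpe_symbol that fairseq strips; a rough illustration, with invented subword pieces that only round-trip like this if they actually exist in dict.txt:

tok = BERTweetTokenizer()
ids = tok.convert_tokens_to_ids("fan@@ tas@@ tic day".split())
tok.decode_id(ids)          # '@@'  removed, pieces left apart:  "fan tas tic day"
tok.decode_id_nospace(ids)  # '@@ ' removed, pieces merged back: "fantastic day"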
Example #5
def process_predictions(args, hypos, sp, tgt_dict: Dictionary, target_tokens,
                        res_files, speaker, id):
    for hypo in hypos[:min(len(hypos), args.nbest)]:
        hyp_pieces = tgt_dict.string(hypo["tokens"].int().cpu())
        # hyp_words = sp.DecodePieces(hyp_pieces.split())
        hyp_words = hyp_pieces.replace(' ', '').replace('_', ' ')  # join pieces; '_' marks word boundaries
        print(hyp_words)
        print("{} ({}-{})".format(hyp_pieces, speaker, id),
              file=res_files["hypo.units"])
        print("{} ({}-{})".format(hyp_words, speaker, id),
              file=res_files["hypo.words"])

        tgt_pieces = tgt_dict.string(target_tokens)
        # tgt_words = sp.DecodePieces(tgt_pieces.split())
        tgt_words = tgt_pieces.replace(' ', '').replace('_', ' ')  # same detokenization for the reference
        print("{} ({}-{})".format(tgt_pieces, speaker, id),
              file=res_files["ref.units"])
        print("{} ({}-{})".format(tgt_words, speaker, id),
              file=res_files["ref.words"])
        # only score top hypothesis
        if not args.quiet:
            logger.debug("HYPO:" + hyp_words)
            logger.debug("TARGET:" + tgt_words)
            logger.debug("___________________")
class PhoBertTokenizer(object):
    def __init__(self):
        self.vocab = Dictionary()
        self.vocab.add_from_file(
            "/content/drive/My Drive/PhoBERT_EMPATHETICDIALOGUES/EmpatheticDialogues/PhoBert/PhoBERT_base_transformers/dict.txt"
        )

    def tokenize(self, inp_string):
        # relies on a module-level fastBPE instance named `bpe` (see the sketch below this class)
        return bpe.encode(inp_string).split(" ")

    def convert_tokens_to_ids(self, tokens):
        return self.vocab.encode_line(" ".join(tokens),
                                      append_eos=False,
                                      add_if_not_exist=False).long().tolist()

    def convert_ids_to_tokens(self, ids):
        return self.vocab.string(torch.tensor([ids], dtype=torch.long))
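tokenize() above relies on a module-level `bpe` that is not shown in the snippet; in the PhoBERT setup this is a fastBPE instance built from the bpe.codes sitting next to dict.txt (the exact path below is an assumption based on the dict.txt path above), and convert_ids_to_tokens additionally needs torch imported:

import argparse
import torch
from fairseq.data.encoders.fastbpe import fastBPE

parser = argparse.ArgumentParser()
parser.add_argument(
    '--bpe-codes', type=str,
    default="/content/drive/My Drive/PhoBERT_EMPATHETICDIALOGUES/EmpatheticDialogues/PhoBert/PhoBERT_base_transformers/bpe.codes")
bpe = fastBPE(parser.parse_args(args=[]))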
class XLMRobertaTokenizer:
    def __init__(self, pretrained_file):
        # load bpe model and vocab file
        bpe_model_file = pjoin(pretrained_file, 'sentencepiece.bpe.model')
        vocab_file = pjoin(pretrained_file, 'dict.txt')
        self.sp = SentencepieceBPE(bpe_model_file)
        self.bpe_dict = Dictionary().load(vocab_file)
        self.cls_token = "<s>"
        self.sep_token = "</s>"
        self.pad_token_id = 1

    def tokenize(self, sentence):
        return self.sp.encode(sentence).split(' ')

    def convert_tokens_to_ids(self, tokens):
        bpe_sentence = ' '.join(tokens)
        bpe_ids = self.bpe_dict.encode_line(bpe_sentence,
                                            add_if_not_exist=False,
                                            append_eos=False).tolist()

        # def encode(self, sentence, add_bos=False, add_eos=False):
        #     bpe_sentence = '<s> ' + self.sp.encode(sentence) + ' </s>'
        #     bpe_ids = self.bpe_dict.encode_line(bpe_sentence, append_eos=False).tolist()
        #     if not add_bos:
        #         bpe_ids = bpe_ids[1:]
        #     if not add_eos:
        #         bpe_ids = bpe_ids[:-1]
        return bpe_ids

    def decode(self, tokens):
        sentences = [self.sp.decode(self.bpe_dict.string(s)) for s in tokens]
        return sentences

    def encodeAsPieces(self, sentence):
        bpe_sentence = '<s> ' + self.sp.encode(sentence) + ' </s>'
        return bpe_sentence

    @property
    def vocab_size(self):
        return len(self.bpe_dict)
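A minimal round trip with the class above, assuming a directory that contains sentencepiece.bpe.model and dict.txt (for example an unpacked fairseq XLM-R checkpoint) and a SentencepieceBPE wrapper that, as here, accepts the model path directly:

tokenizer = XLMRobertaTokenizer('./xlmr.base/')
tokens = tokenizer.tokenize("Hello world")        # sentencepiece pieces
ids = tokenizer.convert_tokens_to_ids(tokens)     # no <s>/</s> added here
print(tokenizer.vocab_size)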