Example #1
from types import SimpleNamespace

from fairseq.data import Dictionary
from fairseq.data.encoders.fastbpe import fastBPE


class BERTweetTokenizer:
    def __init__(self, pretrained_path='./bertweet/'):
        self.bpe = fastBPE(
            SimpleNamespace(bpe_codes=pretrained_path + "bpe.codes"))
        self.vocab = Dictionary()
        self.vocab.add_from_file(pretrained_path + "dict.txt")
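        # Special-token ids follow the fairseq convention: <s>=0, <pad>=1, </s>=2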
        self.cls_token_id = 0
        self.pad_token_id = 1
        self.sep_token_id = 2
        self.pad_token = '<pad>'
        self.cls_token = '<s>'
        self.sep_token = '</s>'

    def bpe_encode(self, text):
        return self.bpe.encode(text)

    def encode(self, text, add_special_tokens=False):
        subwords = self.bpe.encode(text)
        input_ids = self.vocab.encode_line(
            subwords, append_eos=False,
            add_if_not_exist=False).long().tolist()
        return input_ids

    def tokenize(self, text):
        return self.bpe_encode(text).split()

    def convert_tokens_to_ids(self, tokens):
        input_ids = self.vocab.encode_line(
            ' '.join(tokens), append_eos=False,
            add_if_not_exist=False).long().tolist()
        return input_ids

    def decode(self, ids, clean_up_tokenization_spaces=False):
        return self.vocab.string(ids, bpe_symbol='@@')
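
A minimal usage sketch, assuming the pretrained BERTweet files (bpe.codes, dict.txt) sit under ./bertweet/; the tweet text is purely illustrative:

tokenizer = BERTweetTokenizer(pretrained_path='./bertweet/')

text = "SC has first two presumptive cases of coronavirus , DHEC confirms"
tokens = tokenizer.tokenize(text)                     # BPE subword strings
ids = tokenizer.encode(text)                          # vocabulary indices
assert ids == tokenizer.convert_tokens_to_ids(tokens)
print(tokenizer.decode(ids))                          # strips the '@@' markers

Note that add_special_tokens is accepted but never used: callers must wrap the ids in cls_token_id / sep_token_id themselves.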
Example #2
import torch
from types import SimpleNamespace

from fairseq.data import Dictionary
from fairseq.data.encoders.fastbpe import fastBPE
from transformers import RobertaConfig, RobertaModel

# AbstractEmbedding is a project-local base class; its import path depends on
# the surrounding project layout.


class RobertaTweetEmbedding(AbstractEmbedding):
    def __init__(self, device):
        super(RobertaTweetEmbedding, self).__init__(device=device)
        self.config = RobertaConfig.from_pretrained(
            '../data/models/BERTweet_base_transformers/config.json')
        self.model = RobertaModel.from_pretrained(
            '../data/models/BERTweet_base_transformers/model.bin',
            config=self.config)
        self.model.eval()  # disable dropout (or leave in train mode to fine-tune)
        self.model.to(self.device)
        self.pad_token_id = self.config.pad_token_id
        self.embedding_dim = self.model.config.hidden_size

        # Load the fastBPE encoder; a SimpleNamespace stands in for the
        # argparse namespace fastBPE expects, so constructing the class
        # does not touch sys.argv
        self.bpe = fastBPE(
            SimpleNamespace(
                bpe_codes="../data/models/BERTweet_base_transformers/bpe.codes"))

        # Load the dictionary
        self.vocab = Dictionary()
        self.vocab.add_from_file(
            "../data/models/BERTweet_base_transformers/dict.txt")

    def forward(self, sentences):
        all_input_ids = []
        for sentence in sentences:
            # Encode the line with fastBPE and add the <s>/</s> special tokens
            subwords = '<s> ' + self.bpe.encode(sentence) + ' </s>'

            # Map subword tokens to corresponding indices in the dictionary
            input_ids = self.vocab.encode_line(
                subwords, append_eos=False,
                add_if_not_exist=False).long().tolist()
            all_input_ids.append(input_ids)

        # Right-pad every sequence to the batch maximum length with <pad> ids
        max_seq_length = max(map(len, all_input_ids))
        pad_all_input_ids = [
            input_ids + [self.pad_token_id] * (max_seq_length - len(input_ids))
            for input_ids in all_input_ids
        ]

        # Extract features
        with torch.no_grad():
            features = self.model(
                torch.tensor(pad_all_input_ids,
                             dtype=torch.long).to(self.device))

        return features[0]
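
A minimal usage sketch, assuming the BERTweet_base_transformers files exist at the paths above and that AbstractEmbedding stores the device passed to it; the sentence is illustrative:

device = 'cuda' if torch.cuda.is_available() else 'cpu'
embedding = RobertaTweetEmbedding(device=device)

sentences = ["SC has first two presumptive cases of coronavirus , DHEC confirms"]
features = embedding.forward(sentences)  # features[0] is the last hidden state
print(features.shape)  # (len(sentences), max_seq_length, 768) for the base model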
Example #3
import os
from types import SimpleNamespace

from fairseq.data import Dictionary
from fairseq.data.encoders.fastbpe import fastBPE


class BERTweetTokenizer:
    def __init__(self, pretrained_path="../pretrained/bertweet/"):
        self.bpe = fastBPE(
            SimpleNamespace(
                bpe_codes=os.path.join(pretrained_path, "bpe.codes")))
        self.vocab = Dictionary()
        self.vocab.add_from_file(os.path.join(pretrained_path, "dict.txt"))
        self.cls_token_id = 0
        self.pad_token_id = 1
        self.sep_token_id = 2
        self.pad_token = '<pad>'
        self.cls_token = '<s>'
        self.sep_token = '</s>'

    def bpe_encode(self, text):
        return self.bpe.encode(text)

    def encode(self, text, add_special_tokens=False):
        subwords = self.bpe.encode(text)
        input_ids = self.vocab.encode_line(
            subwords, append_eos=False,
            add_if_not_exist=False).long().tolist()
        return input_ids

    def tokenize(self, text):
        return self.bpe_encode(text).split()

    def convert_tokens_to_ids(self, tokens):
        input_ids = self.vocab.encode_line(
            ' '.join(tokens), append_eos=False,
            add_if_not_exist=False).long().tolist()
        return input_ids

    # from: https://www.kaggle.com/nandhuelan/bertweet-first-look
    def decode_id(self, id):
        return self.vocab.string(id, bpe_symbol='@@')

    def decode_id_nospace(self, id):
        return self.vocab.string(id, bpe_symbol='@@ ')
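
The two decode variants differ only in how the '@@' BPE continuation markers are stripped; a small sketch (the input word is illustrative):

tokenizer = BERTweetTokenizer()

ids = tokenizer.encode("coronavirus")      # the word may split into several subwords
print(tokenizer.decode_id(ids))            # '@@' removed, subword boundaries keep their spaces
print(tokenizer.decode_id_nospace(ids))    # '@@ ' removed, subwords merged back into full words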