Code Example #1
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


def test_tokenizer(test_sentence, vocab_path, merge_path):
    r"""
        Illustrates how the individual Tokenizer works

        Args:
            test_sentence (:obj:`str`):
                Sentence for demonstration purposes
            vocab_path (:obj:`str`):
                Path where the vocabulary (most frequent tokens ranked by frequency) is saved
            merge_path (:obj:`str`):
                Path where the merges file is saved
    """

    tokenizer = ByteLevelBPETokenizer(vocab_path, merge_path)

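    # Wrap every encoding with RoBERTa-style special tokens: <s> ... </s>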
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")))
    tokenizer.enable_truncation(max_length=512)

    print("Original sentence " + test_sentence)
    print("Encoded string: {}".format(tokenizer.encode(test_sentence).tokens))

    encoding = tokenizer.encode(test_sentence)
    decoded = tokenizer.decode(encoding.ids)
    print("Decoded string: {}".format(decoded))
Code Example #2
from fairseq import file_utils  # provides cached_path for local/remote files


class HuggingFaceByteLevelBPE(object):
    def __init__(self, cfg):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError("Please install huggingface/tokenizers with: "
                              "pip install tokenizers")

        bpe_vocab = file_utils.cached_path(cfg.bpe_vocab)
        bpe_merges = file_utils.cached_path(cfg.bpe_merges)

        self.bpe = ByteLevelBPETokenizer(
            bpe_vocab,
            bpe_merges,
            add_prefix_space=cfg.bpe_add_prefix_space,
        )

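    # fairseq passes BPE output around as a space-delimited string of token ids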
    def encode(self, x: str) -> str:
        return " ".join(map(str, self.bpe.encode(x).ids))

    def decode(self, x: str) -> str:
        return self.bpe.decode([
            int(tok) if tok not in {"<unk>", "<mask>"} else tok
            for tok in x.split()
        ])

    def is_beginning_of_word(self, x: str) -> bool:
        return self.decode(x).startswith(" ")
Code Example #3
    def test_basic_encode(self, roberta_files):
        tokenizer = ByteLevelBPETokenizer(roberta_files["vocab"], roberta_files["merges"])
        output = tokenizer.encode("The quick brown fox jumps over the lazy dog")

        assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
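        # "Ġ" is the byte-level marker for a leading space before the token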
        assert output.tokens == [
            "The",
            "Ġquick",
            "Ġbrown",
            "Ġfox",
            "Ġjumps",
            "Ġover",
            "Ġthe",
            "Ġlazy",
            "Ġdog",
        ]
        assert output.offsets == [
            (0, 3),
            (3, 9),
            (9, 15),
            (15, 19),
            (19, 25),
            (25, 30),
            (30, 34),
            (34, 39),
            (39, 43),
        ]
Code Example #4
    def test_train_from_iterator(self):
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)

        output = tokenizer.encode("A sentence")
        assert output.tokens == ["A", "Ġsentence"]
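
A trained tokenizer can be persisted and reloaded from the generated vocab/merges files. A minimal sketch, assuming a writable output directory "./bpe" (hypothetical path):

tokenizer.save_model("./bpe")  # writes ./bpe/vocab.json and ./bpe/merges.txt
reloaded = ByteLevelBPETokenizer("./bpe/vocab.json", "./bpe/merges.txt")
assert reloaded.encode("A sentence").tokens == ["A", "Ġsentence"]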
Code Example #5
class HuggingFaceByteLevelBPE(object):
    @staticmethod
    def add_args(parser):
        # fmt: off
        parser.add_argument('--bpe-merges', help='path to merges.txt')
        parser.add_argument('--bpe-vocab', help='path to vocab.json')
        parser.add_argument('--bpe-add-prefix-space',
                            action='store_true',
                            help='add prefix space before encoding')
        # fmt: on

    def __init__(self, args):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError("Please install huggingface/tokenizers with: "
                              "pip install tokenizers")

        self.bpe = ByteLevelBPETokenizer(
            args.bpe_vocab,
            args.bpe_merges,
            add_prefix_space=getattr(args, "bpe_add_prefix_space", False),
        )

    def encode(self, x: str) -> str:
        return " ".join(map(str, self.bpe.encode(x).ids))

    def decode(self, x: str) -> str:
        return self.bpe.decode([
            int(tok) if tok not in {"<unk>", "<mask>"} else tok
            for tok in x.split()
        ])

    def is_beginning_of_word(self, x: str) -> bool:
        return self.decode(x).startswith(" ")
Code Example #6
import logging
import os

import torch
from torch.utils.data import Dataset
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import RobertaProcessing

logger = logging.getLogger(__name__)


class LineByLineTextDataset(Dataset):
    def __init__(self, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)

        self.block_size = block_size

        self.tokenizer = ByteLevelBPETokenizer(
            os.path.join(args.tokenizer_name, "vocab.json"),
            os.path.join(args.tokenizer_name, "merges.txt"),
        )

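        # Wrap encodings with RoBERTa-style special tokens: <s> ... </s>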
        self.tokenizer._tokenizer.post_processor = RobertaProcessing(
            ("</s>", self.tokenizer.token_to_id("</s>")),
            ("<s>", self.tokenizer.token_to_id("<s>")),
        )
        self.tokenizer.enable_truncation(max_length=block_size)

        logger.info("Creating features from dataset file at %s", file_path)

        self.examples = []

        with open(file_path, encoding="utf-8") as f:
            for line in f:
                if len(line) > 0 and not line.isspace():
                    self.examples.append(line)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.tokenizer.encode(self.examples[i]).ids[: self.block_size - 2], dtype=torch.long)
Code Example #7
import os
from typing import List

from parlai.core.opt import Opt


class HuggingFaceBpeHelper(object):
    @staticmethod
    def add_cmdline_args(argparser):
        parser = argparser.add_argument_group('ByteLevelBPE Arguments')
        parser.add_argument('--bpe-vocab',
                            type=str,
                            help='path to pre-trained tokenizer vocab')
        parser.add_argument('--bpe-merge',
                            type=str,
                            help='path to pre-trained tokenizer merge')
        parser.add_argument(
            '--bpe-add-prefix-space',
            type='bool',
            hidden=True,
            default=True,
            help='add prefix space before encoding',
        )
        return parser

    def __init__(self, opt: Opt, shared=None):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if 'bpe_vocab' not in opt:
            raise ValueError(
                '--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError(
                '--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError('--bpe-vocab and --bpe-merge are mandatory with '
                          '--dict-tokenizer bytelevelbpe')

        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        self.tokenizer = ByteLevelBPETokenizer(self.vocab_path,
                                               self.merge_path,
                                               self.add_prefix_space)

    def encode(self, text: str) -> List[str]:
        return self.tokenizer.encode(text).tokens

    def decode(self, x: List[str]) -> str:
        # the bindings expect a concrete list of ids, not a generator
        return self.tokenizer.decode([self.tokenizer.token_to_id(c) for c in x])
Code Example #8
def inference(checkpoint_path,
              hyperparameters_path,
              tokenizer_path,
              merges_path,
              input='In 1691 Moscow established ',
              generated_length=64,
              random_selection=True):

    # Initialize tokenizer and model from files
    tokenizer = ByteLevelBPETokenizer(
        tokenizer_path,
        merges_path,
        add_prefix_space=True,
    )

    #tokenizer2 = Tokenizer(BPE(unk_token="[UNK]"))
    #tokenizer2.pre_tokenizer2 = Whitespace()
    #tokenizer2 = Tokenizer.from_file("example/tokenizer.json")

    #initialize model
    model = LMModel.load_from_checkpoint(checkpoint_path=checkpoint_path,
                                         hparams_file=hyperparameters_path)

    # Tokenize input sample
    encoded_sample = tokenizer.encode(input).ids

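    # Autoregressive generation: feed the running sequence back through the
    # model and append one predicted token per step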
    for i in range(generated_length):
        input_ids = torch.unsqueeze(torch.tensor(encoded_sample,
                                                 dtype=torch.long),
                                    dim=0)

        # Inference
        output, attn = model(input_ids)
        last_word = output[0][-1]

        if not random_selection:
            # Pick the highest-probability token from the distribution
            prediction = torch.argmax(output,
                                      dim=2).squeeze(dim=0).tolist()[-1]
        else:
            # Sample a token according to the (sharpened) probabilities
            prediction = torch.multinomial(torch.softmax(last_word, 0)**10,
                                           1)[0]
        # Add prediction to sequence
        encoded_sample.append(prediction)

    # Detokenize output sample
    decoded_output = tokenizer.decode(encoded_sample)
    #decoded_output2 = tokenizer2.decode(encoded_sample)

    output_tokens = [tokenizer.id_to_token(int(id)) for id in encoded_sample]
    #output_tokens2 = [tokenizer2.id_to_token(int(id)) for id in encoded_sample]
    #print('\n========================\n      ORIGINAL BPE        \n========================')
    #print(output_tokens2, decoded_output2, sep='\n')
    #print('\n========================\n      MODIFIED BPE        \n========================')
    return decoded_output, output_tokens, attn
Code Example #9
def load_sentence_piece_model():
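    # path_vocab and path_model are assumed to be defined at module level;
    # despite the function name, this loads a byte-level BPE model, not a
    # SentencePiece one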
    tokenizer = ByteLevelBPETokenizer(path_vocab, path_model)
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")))

    tokenizer.enable_truncation(max_length=512)
    encoding = tokenizer.encode("배고파요")
    print(encoding.tokens)
    print(encoding.special_tokens_mask)
    print(encoding.ids)
    print(encoding.normalized_str)
Code Example #10
def inference():

    from tokenizers import ByteLevelBPETokenizer
    from tokenizers.processors import BertProcessing
    '''
    initialize tokenizer with saved model files
    '''
    tokenizer = ByteLevelBPETokenizer(
        "./tok_checkpoints/tokenizer_model-vocab.json",
        "./tok_checkpoints/tokenizer_model-merges.txt",
    )
    '''
    optional step : preprocess the strings
    Ex: add <s> and </s> as BOS and EOS tokens to the string
        pad string to some max length and truncate string to some max length
    '''
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='<pad>',
                             pad_id=tokenizer.get_vocab()['<pad>'],
                             length=20)
    tokenizer.enable_truncation(max_length=20)
    '''
    tokenize/encode strings
    '''
    encoding = tokenizer.encode("Hello World, Whats up!!!")
    input_ids = encoding.ids
    print("input ids", input_ids)
    tokens = encoding.tokens
    print("tokens", tokens)
    '''
    tokenize/encode batch of string
    '''
    batch_tokenized = tokenizer.encode_batch(
        ["Hello World, Whats up!!!", "Whata whata wa wada wada"])
    input_ids = [i.ids for i in batch_tokenized]
    print("input ids", input_ids)
    tokens = [i.tokens for i in batch_tokenized]
    print("tokens", tokens)
Code Example #11
def tokenize_hf(df, text_col='text', outfile=None):
    tokenizer = ByteLevelBPETokenizer(
        merges_file="/home/ubuntu/data/mimic/bbpe_tokenizer/mimic-merges.txt",
        vocab_file="/home/ubuntu/data/mimic/bbpe_tokenizer/mimic-vocab.json")
    tok_snts = []
    if outfile is not None:
        f = open(outfile, 'w', encoding='utf8')
    data = df if text_col is None else df[text_col]
    for snt in data:
        tokenized_snt = tokenizer.encode(snt)
        if outfile is not None:
            f.write("{}\n".format("\t".join(tokenized_snt.tokens)))
        else:
            tok_snts.append(tokenized_snt.tokens)
    if outfile is not None:
        f.close()  # flush and close the output file
    return tok_snts
Code Example #12
File: tokenization.py (project: ksjae/KoGPT2-train)
class FullTokenizer(object):
    """Runs end-to-end tokenziation."""
    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.tokenizer = ByteLevelBPETokenizer(vocab_file + '/vocab.json',
                                               vocab_file + '/merges.txt')

    def tokenize(self, text):
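        # Note: despite its name, this returns token ids, not token strings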
        return self.tokenizer.encode(text).ids

    def convert_tokens_to_ids(self, tokens):
        return [self.tokenizer.token_to_id(tok) for tok in tokens]

    def convert_ids_to_tokens(self, ids):
        return self.tokenizer.decode(ids)
Code Example #13
    def test_lowerspace(self, roberta_files):
        tokenizer = ByteLevelBPETokenizer(
            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True
        )
        output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")

        assert output.ids == [5, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
        assert output.tokens == [
            "Ġthe",
            "Ġquick",
            "Ġbrown",
            "Ġfox",
            "Ġjumps",
            "Ġover",
            "Ġthe",
            "Ġlazy",
            "Ġdog",
        ]
Code Example #14
def get_french_vocab(model_name):
    root = Path(os.getcwd()).parent.parent.parent
    french_corpus = "Datasets/corpora/fr/text"
    fr_corpus_path = os.path.join(root, french_corpus)
    files = []
    for dir_ in os.listdir(fr_corpus_path):
        fr_corpus_dir = os.path.join(fr_corpus_path, dir_)
        for text_file in os.listdir(fr_corpus_dir):
            text_file = os.path.join(fr_corpus_dir, text_file)
            files.append(text_file)

    tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
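    # Caution: the next assignment only sets an attribute on the Python
    # wrapper; it does not replace the underlying ByteLevel pre-tokenizer
    # (that would be tokenizer._tokenizer.pre_tokenizer), so it is a no-op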
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer.train(files,
                    vocab_size=20000,
                    min_frequency=2,
                    show_progress=True,
                    special_tokens=["<sos>", "<pad>", "<eos>", "<unk>"])

    print(tokenizer.encode("c'est la meilleure des phrases françaises").tokens)
    tokenizer.save(model_name)
Code Example #15
class ByteBPETokenizer:
    def __init__(self, vocab_json, merge_txt, max_length=750):
        self.tokenizer = ByteLevelBPETokenizer(vocab_json, merge_txt)
        self.tokenizer.enable_truncation(max_length=max_length)
        # enable_padding takes `length`, not `max_length`
        self.tokenizer.enable_padding(length=max_length)
        self.tokenizer.add_special_tokens(["[PAD]", "[CLS]"])
        # self.tokenizer.post_processor = RobertaProcessing(("</s>", 2), ("<s>", 1))
        # self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    def encode(self, review):
        review = clean_sentence(review)
        encoded = self.tokenizer.encode(review.lower())
        # pp_encoded = self.tokenizer.post_process(encoded)
        return encoded

    def tokenize2Index(self, review, should_stem=False):
        encoded = self.encode(review)

        return encoded.ids

    def trainBPE(self, paths, vocab_size=30000, min_frequency=10, special_tokens=["[PAD]", "[CLS]"]):
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(files=paths, vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=special_tokens)
        tokenizer.save("yelp_bpe/", "yelp-bpe")
Code Example #16
)

i = 0
j = 0
with open(input_file) as ifile:
    with open(output_file, "w") as ofile:
        line_write = ""
        token_count = 0
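        # Pack consecutive input lines into chunks; once a chunk reaches
        # 256 tokens it is flushed as a single output line and the current
        # line starts the next chunk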
        for i, line in enumerate(ifile, start=1):
            # print('>>>>>>>', line, '<<<<<<')
            if i % 50000 == 0:
                print('total:', i, line)
            if line == "\n":
                continue
            line = line.replace("\n", "")
            line = line.replace("\r", "")
            encoded = tokenizer.encode(line)
            token_count = token_count + len(encoded.tokens)
            if (token_count >= 256):
                # print(token_count, '>>>>>>>',line_write, '<<<<<<')
                j = j + 1
                if j % 10000 == 0:
                    print(j, token_count, line_write)
                ofile.write(line_write + "\n")
                line_write = line
                token_count = len(encoded.tokens)
                continue
            line_write = line_write + " " + line
        ofile.write(line_write + "\n")
Code Example #17
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./EsperBERTo/vocab.json",
    "./EsperBERTo/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

tokenizer.encode("Mi estas Julien.")
tokenizer.encode("Mi estas Julien.").tokens

# Check that PyTorch sees it
import torch
torch.cuda.is_available()

from transformers import RobertaConfig
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
Code Example #18
                    "<unk>",
                    "<mask>",
                ])
tokenizer.save_model("BERT/sumerianBERTo")

tokenizer = ByteLevelBPETokenizer(
    "BERT/sumerianBERTo/vocab.json",
    "BERT/sumerianBERTo/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
tokenizer.encode("dumu a-li2-wa-aq-rum")
print(tokenizer.encode("dumu a-li2-wa-aq-rum").tokens)

# Configuration

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=512,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
print(config)
tokenizer = RobertaTokenizerFast.from_pretrained("BERT/sumerianBERTo",
                                                 max_len=512)
model = RobertaForMaskedLM(config=config)
Code Example #19
    print("Files processed: %d, Total files: %d" % (count, len(files)))

    # load raw html data
    fileData = io.open(file, "r", errors="ignore").readlines()
    fileData = ''.join(str(line) for line in fileData)
    fileData = fileData.replace("\n", " ")

    # ignore the website if language is other than english
    if isIgnoreOtherLanguages == 1:
        inputLanguage = detectLanguage(fileData)
        if inputLanguage != "en":
            ignoredFiles[file] = True
            continue

    # tokenize html code
    output = tokenizer.encode(fileData)
    outputDict = collections.Counter(output.ids)

    # add counts to a dictionary for tfidf scores.
    for token in outputDict:
        docDict[token].append(file)

print("\nAssigning tfidf weights to tokens...\n")
features = []
htmlLabels = []
totalFilesUnderConsideration = len(files) - len(ignoredFiles)
count = 0
for i in range(0, len(files)):
    file = files[i]
    label = labels[i]
    count = count + 1
Code Example #20
                  allow_pickle=True).item()
print("Document frequency dictionary loaded...")

# Testing
print("Loading webpage...")
try:
    request = requests.get(websiteToTest)
    webpageHtml = str(request.text)
    webpageHtml = webpageHtml.replace("\n", " ")
except Exception as e:
    print('\n', e)
    print("\nAn error occurred, exiting now... ")
    exit()

# Convert text into feature vector
output = tokenizer.encode(webpageHtml)
outputDict = collections.Counter(output.ids)

# Apply tfidf weighting
totalFilesUnderConsideration = docDict["totalFilesUnderConsideration"]
array = [0] * tokenizerVocabSize
for item in outputDict:
    if len(docDict[item]) > 0:
        array[item] = (outputDict[item]) * (math.log10(
            totalFilesUnderConsideration / len(docDict[item])))

# Getting predictions
predictionProbability = model.predict_proba([array])[0][1]
print(
    "\n****************************\n--> Probability that the website is phishing: %.2f%%"
    % (predictionProbability * 100))
Code Example #21
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(["sp_data/mono/all.en-fr"], vocab_size=60000)

# with open("sp_data/mono/all.en-fr") as r, open("sp_data/mono/all.en-fr.wordpiece", "w") as w:
#     lines = r.readlines()
#     for line in lines:
#         encoded = tokenizer.encode(line[:-1])
#         w.write(" ".join(encoded.tokens))
#         w.write("\n")

with open("sp_data/para/dev/newstest2013-ref.en") as r, open(
        "sp_data/para/dev/newstest2013-ref.en.bytebpe", "w") as w:
    lines = r.readlines()
    for line in lines:
        encoded = tokenizer.encode(line[:-1])
        w.write(" ".join(encoded.tokens))
        w.write("\n")

with open("sp_data/para/dev/newstest2013-ref.fr") as r, open(
        "sp_data/para/dev/newstest2013-ref.fr.bytebpe", "w") as w:
    lines = r.readlines()
    for line in lines:
        encoded = tokenizer.encode(line[:-1])
        w.write(" ".join(encoded.tokens))
        w.write("\n")

with open("sp_data/para/dev/newstest2014-fren-src.en") as r, open(
        "sp_data/para/dev/newstest2014-fren-src.en.bytebpe", "w") as w:
    lines = r.readlines()
    for line in lines:
Code Example #22
File: meta_cat.py (project: halloju/MedCAT)
class MetaCAT(object):
    r''' TODO: Add documentation
    '''
    def __init__(self,
                 tokenizer=None,
                 embeddings=None,
                 cntx_left=20,
                 cntx_right=20,
                 save_dir='./meta_cat/',
                 pad_id=30000,
                 device='cpu'):
        self.tokenizer = tokenizer
        if embeddings is not None:
            self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        else:
            self.embeddings = None
        self.cntx_left = cntx_left
        self.cntx_right = cntx_right
        self.save_dir = save_dir
        self.pad_id = pad_id
        self.device = torch.device(device)

        self.category_name = None
        self.category_values = {}
        self.i_category_values = {}

        self.model = None

        # TODO: hacky solution, fix properly at some point
        if not self.save_dir.endswith("/"):
            self.save_dir = self.save_dir + "/"

    def train(self,
              json_path,
              category_name=None,
              model_name='bert_gru',
              Bio_BERT_PATH=None,
              lr=0.01,
              test_size=0.1,
              batch_size=100,
              nepochs=20,
              lowercase=True,
              class_weights=None,
              cv=0,
              ignore_cpos=False,
              model_config={},
              tui_filter=None,
              fine_tune=False,
              auto_save_model=True,
              score_average='weighted',
              replace_center=None,
              seed=11):
        r''' TODO: Docs
        '''
        set_all_seeds(seed)
        data = json.load(open(json_path, 'r'))

        # Create directories if they don't exist
        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)

        # Prepare the data
        data = prepare_from_json(data,
                                 self.cntx_left,
                                 self.cntx_right,
                                 self.tokenizer,
                                 lowercase=lowercase,
                                 tui_filter=tui_filter,
                                 replace_center=replace_center)

        if category_name is not None:
            self.category_name = category_name

        # Check that the category name is present
        if self.category_name not in data:
            raise Exception(
                "The category name does not exist in this json file. You've provided '{}', while the possible options are: {}"
                .format(self.category_name, " | ".join(list(data.keys()))))

        data = data[self.category_name]

        if not fine_tune:
            # Encode the category values
            data, self.category_values = encode_category_values(data)
            self.i_category_values = {
                v: k
                for k, v in self.category_values.items()
            }
        else:
            # We already have everything, just get the data
            data, _ = encode_category_values(data, vals=self.category_values)

        # Convert data tkns to ids
        data = tkns_to_ids(data, self.tokenizer)

        if not fine_tune:
            if model_name == 'lstm':
                from medcat.utils.models import LSTM
                nclasses = len(self.category_values)
                bid = model_config.get("bid", True)
                num_layers = model_config.get("num_layers", 2)
                input_size = model_config.get("input_size", 300)
                hidden_size = model_config.get("hidden_size", 300)
                dropout = model_config.get("dropout", 0.5)

                self.model = LSTM(self.embeddings,
                                  self.pad_id,
                                  nclasses=nclasses,
                                  bid=bid,
                                  num_layers=num_layers,
                                  input_size=input_size,
                                  hidden_size=hidden_size,
                                  dropout=dropout)

            if model_name == 'bert_gru':
                from medcat.utils.models import BERT_GRU
                nclasses = len(self.category_values)
                bid = model_config.get("bid", True)
                num_layers = model_config.get("num_layers", 5)
                input_size = model_config.get("input_size", 768)
                hidden_size = model_config.get("hidden_size", 768)
                dropout = model_config.get("dropout", 0.5)

                self.model = BERT_GRU(Bio_BERT_PATH,
                                      nclasses=nclasses,
                                      bid=bid,
                                      num_layers=num_layers,
                                      input_size=input_size,
                                      hidden_size=hidden_size,
                                      dropout=dropout)

        if cv == 0:
            (f1, p, r, cls_report) = train_network(
                self.model,
                data,
                max_seq_len=(self.cntx_left + self.cntx_right + 1),
                lr=lr,
                test_size=test_size,
                pad_id=self.pad_id,
                batch_size=batch_size,
                nepochs=nepochs,
                device=self.device,
                class_weights=class_weights,
                ignore_cpos=ignore_cpos,
                save_dir=self.save_dir,
                auto_save_model=auto_save_model,
                score_average=score_average)
        elif cv > 0:
            # Mainly for testing, not really used in a normal workflow
            f1s = []
            ps = []
            rs = []
            cls_reports = []
            for i in range(cv):
                # Reset the model
                if fine_tune:
                    self.load_model(model=model_name)
                else:
                    if model_name == 'lstm':
                        from medcat.utils.models import LSTM
                        nclasses = len(self.category_values)
                        self.model = LSTM(self.embeddings,
                                          self.pad_id,
                                          nclasses=nclasses)

                (_f1, _p, _r, _cls_report) = train_network(
                    self.model,
                    data,
                    max_seq_len=(self.cntx_left + self.cntx_right + 1),
                    lr=lr,
                    test_size=test_size,
                    pad_id=self.pad_id,
                    batch_size=batch_size,
                    nepochs=nepochs,
                    device=self.device,
                    class_weights=class_weights,
                    ignore_cpos=ignore_cpos,
                    save_dir=self.save_dir,
                    score_average=score_average)
                f1s.append(_f1)
                ps.append(_p)
                rs.append(_r)
                cls_reports.append(_cls_report)
            f1 = np.average(f1s)
            p = np.average(ps)
            r = np.average(rs)

            # Average cls reports
            cls_report = {}
            _cls_report = cls_reports[0]
            for label in _cls_report.keys():
                cls_report[label] = {}
                if type(_cls_report[label]) == dict:
                    for score in _cls_report[label].keys():
                        cls_report[label][score] = sum(
                            [r[label][score]
                             for r in cls_reports]) / len(cls_reports)

        print("Best/Average scores: F1: {}, P: {}, R: {}".format(f1, p, r))

        return {'f1': f1, 'p': p, 'r': r, 'cls_report': cls_report}

    def eval(self,
             json_path,
             batch_size=100,
             lowercase=True,
             ignore_cpos=False,
             tui_filter=None,
             score_average='weighted',
             replace_center=None):
        data = json.load(open(json_path, 'r'))

        # Prepare the data
        data = prepare_from_json(data,
                                 self.cntx_left,
                                 self.cntx_right,
                                 self.tokenizer,
                                 lowercase=lowercase,
                                 tui_filter=tui_filter,
                                 replace_center=replace_center)

        # Check that the category name is present
        if self.category_name not in data:
            raise Exception(
                "The category name does not exist in this json file.")

        data = data[self.category_name]

        # We already have everything, just get the data
        data, _ = encode_category_values(data, vals=self.category_values)

        # Convert data tkns to ids
        data = tkns_to_ids(data, self.tokenizer)

        # Run evaluation
        result = eval_network(self.model,
                              data,
                              max_seq_len=(self.cntx_left + self.cntx_right +
                                           1),
                              pad_id=self.pad_id,
                              batch_size=batch_size,
                              device=self.device,
                              ignore_cpos=ignore_cpos,
                              score_average=score_average)

        return result

    def predict_one(self, text, start, end):
        """ A test function, not useful in any other case
        """
        text = text.lower()

        doc_text = self.tokenizer.encode(text)
        ind = 0
        for ind, pair in enumerate(doc_text.offsets):
            if start >= pair[0] and start <= pair[1]:
                break
        _start = max(0, ind - self.cntx_left)
        _end = min(len(doc_text.tokens), ind + 1 + self.cntx_right)
        tkns = doc_text.ids[_start:_end]
        cpos = self.cntx_left + min(0, ind - self.cntx_left)

        x = torch.tensor([tkns], dtype=torch.long).to(self.device)
        cpos = torch.tensor([cpos], dtype=torch.long).to(self.device)

        self.model.eval()
        outputs_test = self.model(x, cpos)

        inv_map = {v: k for k, v in self.category_values.items()}
        return inv_map[int(np.argmax(outputs_test.detach().numpy()[0]))]

    def save(self, full_save=False):
        if full_save:
            # Save tokenizer and embeddings, slightly redundant
            if hasattr(self.tokenizer, 'save_model'):
                # Support the new save in tokenizer 0.8.2+
                self.tokenizer.save_model(self.save_dir, name='bbpe')
            else:
                # Old way of saving models
                self.tokenizer.save(self.save_dir, name='bbpe')
            # Save embeddings
            np.save(open(self.save_dir + "embeddings.npy", 'wb'),
                    np.array(self.embeddings))

        # The lstm model is saved during training, don't do it here.
        # Save the config.
        self.save_config()

    def save_config(self):
        # TODO: Add other parameters, e.g replace_center, ignore_cpos etc.
        path = self.save_dir + "vars.dat"
        to_save = {
            'category_name': self.category_name,
            'category_values': self.category_values,
            'i_category_values': self.i_category_values,
            'pad_id': self.pad_id,
            'cntx_left': self.cntx_left,
            'cntx_right': self.cntx_right
        }
        with open(path, 'wb') as f:
            pickle.dump(to_save, f)

    def load_config(self):
        """ Loads variables of this object
        """
        path = self.save_dir + "vars.dat"
        with open(path, 'rb') as f:
            to_load = pickle.load(f)

        self.category_name = to_load['category_name']
        self.category_values = to_load['category_values']
        self.i_category_values = to_load['i_category_values']
        self.cntx_left = to_load['cntx_left']
        self.cntx_right = to_load['cntx_right']
        self.pad_id = to_load.get('pad_id', 0)

    def load_model(self, model='lstm'):
        # Load MODEL
        if model == 'lstm':
            from medcat.utils.models import LSTM
            nclasses = len(self.category_values)
            self.model = LSTM(self.embeddings, self.pad_id, nclasses=nclasses)
            path = self.save_dir + "lstm.dat"

        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def load(self, model='lstm', tokenizer_name='bbpe'):
        """ Loads model and config for this meta annotation
        """
        # Load tokenizer if it is None
        if self.tokenizer is None:
            vocab_file = self.save_dir + "{}-vocab.json".format(tokenizer_name)
            merges_file = self.save_dir + "{}-merges.txt".format(
                tokenizer_name)
            self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                                   merges_file=merges_file,
                                                   lowercase=True)

        # Load embeddings if None
        if self.embeddings is None:
            embeddings = np.load(open(self.save_dir + "embeddings.npy", 'rb'))
            self.embeddings = torch.tensor(embeddings, dtype=torch.float32)

        # Load configuration
        self.load_config()

        # Load MODEL
        self.load_model(model=model)
        self.model.to(self.device)

    def __call__(self, doc, lowercase=True):
        """ Spacy pipe method """
        data = []
        id2row = {}
        text = doc.text
        if lowercase:
            text = text.lower()
        doc_text = self.tokenizer.encode(text)
        x = []
        cpos = []

        # Only loop through non overlapping entities
        for ent in doc.ents:
            start = ent.start_char
            end = ent.end_char
            ind = 0
            for ind, pair in enumerate(doc_text.offsets):
                if start >= pair[0] and start <= pair[1]:
                    break
            _start = max(0, ind - self.cntx_left)
            _end = min(len(doc_text.tokens), ind + 1 + self.cntx_right)
            _ids = doc_text.ids[_start:_end]
            _cpos = self.cntx_left + min(0, ind - self.cntx_left)

            id2row[ent._.id] = len(x)
            x.append(_ids)
            cpos.append(_cpos)

        max_seq_len = (self.cntx_left + self.cntx_right + 1)
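        # Right-pad each context window with pad_id (or cut it) so every
        # sample is exactly max_seq_len ids long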
        x = np.array([
            (sample +
             [self.pad_id] * max(0, max_seq_len - len(sample)))[0:max_seq_len]
            for sample in x
        ])

        x = torch.tensor(x, dtype=torch.long).to(self.device)
        cpos = torch.tensor(cpos, dtype=torch.long).to(self.device)

        # Nearly impossible that we need batches, so I'll ignore it
        if len(x) > 0:
            self.model.eval()
            outputs = self.model(x, cpos).detach().to('cpu').numpy()
            outputs = np.argmax(outputs, axis=1)

            for ent in doc.ents:
                val = self.i_category_values[outputs[id2row[ent._.id]]]
                if ent._.meta_anns is None:
                    ent._.meta_anns = {self.category_name: val}
                else:
                    ent._.meta_anns[self.category_name] = val

        return doc
Code Example #23
class HuggingFaceBpeHelper(BPEHelper):
    """
    HuggingFace's ByteLevelBPE Tokenizer.

    Fast because Rust.
    """

    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.special_tok_map = {}  # map from HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if PathManager.exists(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if PathManager.exists(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.bpe_dropout:
            raise NotImplementedError(
                '--bpe-dropout is not supported with ByteLevelBPE because tokenizers '
                'library does not allow dynamically turning BPE on/off. You can use '
                '--dict-tokenizer slow_bytelevel_bpe to gain this feature.'
            )

        if self.lower:
            warn_once('Are you sure you want to lower case your BPE dictionary?')
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter vocabulary when using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).'
            )
        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError(
                '--bpe-vocab and --bpe-merge are mandatory with '
                '--dict-tokenizer bytelevelbpe'
            )

        if not PathManager.exists(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not PathManager.exists(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(
            self.vocab_path, self.merge_path, self.add_prefix_space
        )

    def helper_encode(self, text: str) -> List[str]:
        """
        Encode a text string into a list of tokens.

        :param text:
            text to encode

        :return tokens:
            encoded list of tokens
        """
        return self.tokenizer.encode(text).tokens

    def helper_decode(
        self, tokens: List[str], token_ids: List[int], delimiter: str
    ) -> str:
        """
        Decode list of tokens into text string.

        :param tokens:
            list of tokens
        :param token_ids:
            list of token ids
        :param delimiter:
            string delimiter for tokens

        :return text:
            decoded text
        """
        text = self.tokenizer.decode(token_ids, skip_special_tokens=False)

        return text

    def add_special_tokens(self, dict_agent, special_tokens: List[str]):
        """
        Add special tokens to the tokenizer and dict_agent.
        """
        logging.debug(f'adding the following special tokens: {special_tokens}')
        self.tokenizer.add_special_tokens(special_tokens)  # add to HF

        for tok in special_tokens:
            parlai_key = dict_agent[tok]
            hf_key = self.tokenizer.token_to_id(tok)
            self.special_tok_map[parlai_key] = hf_key

    def sync_with_dict(self, dict_agent):
        """
        Sync the dictionary agent with Hugging Face tokenizer's BPE dict.

        Called only once on initialization.
        """
        special_tokens = [
            dict_agent.null_token,
            dict_agent.start_token,
            dict_agent.end_token,
            dict_agent.unk_token,
        ]
        self.add_special_tokens(dict_agent, special_tokens)

        for i in range(self.tokenizer.get_vocab_size() - len(special_tokens)):
            token = self.tokenizer.id_to_token(i)
            dict_agent.add_token(token)
            # We don't have access to the hugging face word frequency table,
            # just set it to 1 instead
            dict_agent.freq[token] = 1

    def save(self, dir_name: str, file_name: str):
        """
        Save appropriate files.

        :param dir_name:
            directory to save.
        :param file_name:
            file to save.
        """
        self.tokenizer.save_model(dir_name, file_name)
Code Example #24
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Tokenizer

# TRAIN_BASE and paths are assumed to be defined earlier in the script
if TRAIN_BASE:
    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])

    # Save files to disk
    tokenizer.save_model("tokenizer")

inp = 'print("Hello World!")'

tokenizer = GPT2Tokenizer.from_pretrained('tokenizer')

tokenizer.add_special_tokens({
    "bos_token": "<s>",
    "pad_token": "<pad>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "mask_token": "<mask>",
})

t = tokenizer.encode(inp)

print(t)
Code Example #25
    'Initial alphabet for ByteLevel BPE as defined in pre_tokenizers.ByteLevel.alphabet(): ',
    alphabet)
# And then train
tokenizer.train(
    files,
    vocab_size=args.vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'],
)

# Save the files
tokenizer.save(args.out, args.name)

# Restoring model from learned vocab/merges
tokenizer = ByteLevelBPETokenizer(
    join(args.out, '{}-vocab.json'.format(args.name)),
    join(args.out, '{}-merges.txt'.format(args.name)),
    add_prefix_space=True,
)

# Test encoding
logger.info(
    'Tokens and their ids from ByteLevelBPETokenizer with GFP protein sequence: \n MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT'
)
# Note: tokenizers' encode() takes no pad_to_max_length argument (that is the
# transformers API); padding is enabled via tokenizer.enable_padding() instead
encoded = tokenizer.encode('MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT')
logger.info(encoded.tokens)
logger.info(encoded.ids)
logger.info('done!')
Code Example #26
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

end = datetime.now()
print("train ByteLevelBPETokenizer : %s" % str(end - start))

tokenizer.save_model("vocab")

# Check the result
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "vocab/vocab.json",
    "vocab/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

tokenizer.encode("확인 문장(형태소 분석된 형태로 입력)").tokens
Code Example #27
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

paths = [
    str(x)
    for x in Path('/Users/uri/Documents/Uri/Projects/Bertnik/data/for_training'
                  ).glob("*.txt")
]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=70000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

print(tokenizer.encode("איך בין א סטודענט פון תל אביב").tokens)
# Save files to disk
tokenizer.save_model(".", "bertnik")
Code Example #28
                ])

tokenizer.save_model(SAVE_MODEL)

tokenizer = ByteLevelBPETokenizer(
    SAVE_MODEL + "/vocab.json",
    SAVE_MODEL + "/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

print(tokenizer.encode("For it is in reality vain to profess"))

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = RobertaTokenizerFast.from_pretrained(SAVE_MODEL, max_len=512)
model = RobertaForMaskedLM(config=config)

print(model.num_parameters())

dataset = LineByLineTextDataset(
Code Example #29
# Customize training
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

# Save files to disk
tokenizer.save(".", "rubinberto")

tokenizer = ByteLevelBPETokenizer(
    "rubinberto-vocab.json",
    "rubinberto-merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

print(
    tokenizer.encode(
        "А можно вспоминать не о событиях, а, например, о чувствах, испытываемых нами за «отчетный период»."
    ).tokens)
Code Example #30
def load_french_vocab(model_name):
    #tokenizer = PreTrainedTokenizerFast(tokenizer_object=model_name)
    #print(tokenizer.encode("c'est la meilleure des phrases françaises").tokens)
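    # Note: ByteLevelBPETokenizer expects separate vocab/merges files; a single
    # tokenizer.json file is normally loaded with tokenizers.Tokenizer.from_file()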
    tokenizer = ByteLevelBPETokenizer("wiki_fr_tokenizer.json",
                                      add_prefix_space=True)
    print(tokenizer.encode("c'est la meilleure des phrases françaises").tokens)