Example #1
def train(args):

    tokenizer = CharBPETokenizer()

    tokenizer.train([args.corpus], vocab_size=1000)

    tokenizer.save("src/dev_scripts/tokenizer.json")
Example #2
class HuggingFaceTokenizer:
    def __init__(self, cache_dir, max_length=None, vocab_size=400):
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.cache_dir = cache_dir
        self.name = "%d-%s" % (vocab_size, max_length)
        self.tokenizer = None

        vocab = os.path.join(self.cache_dir, self.name + '-vocab.json')
        merges = os.path.join(self.cache_dir, self.name + '-merges.txt')
        if os.path.exists(vocab) and os.path.exists(merges):
            self.tokenizer = CharBPETokenizer(vocab, merges, lowercase=True)
            print('Using cached HuggingFaceTokenizer')

    def build(self, texts):
        if self.tokenizer is not None:
            return

        tmp_file = tempfile.NamedTemporaryFile()

        with open(tmp_file.name, "w") as f:
            f.write(' '.join(texts).lower())

        self.tokenizer = CharBPETokenizer(lowercase=True)
        self.tokenizer.train(
            [tmp_file.name],
            vocab_size=self.vocab_size,
            special_tokens=[
                NUL_token,
                PAD_token,
                BOS_token,
                UNK_token,
            ],
        )
        os.makedirs(self.cache_dir, exist_ok=True)
        self.tokenizer.save(self.cache_dir, self.name)

    def encode(self, text):
        token_ids = self.tokenizer.encode(text.lower()).ids
        token_ids = token_ids[:self.max_length]

        return token_ids

    def decode(self, tokens, skip_special_tokens=True):
        # NOTE: in practice the special tokens were not being stripped even with
        # skip_special_tokens=True; filtering the ids manually, e.g.
        # [token for token in tokens if token > 3], is the workaround.
        text = self.tokenizer.decode(
            tokens,
            skip_special_tokens=skip_special_tokens,
        )
        return text

    def decode_plus(self, token_batch):
        sentences = []
        for tokens in token_batch:
            sentences.append(self.decode(tokens))
        return sentences
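
A minimal usage sketch for the wrapper above (which targets the pre-0.8 tokenizers API, where save() takes a directory and a name). Its imports (os, tempfile, CharBPETokenizer) are assumed to be in scope, and the NUL_token / PAD_token / BOS_token / UNK_token constants are not shown in the snippet, so the values below are placeholders.

# Placeholder values for the special-token constants the class refers to.
NUL_token, PAD_token, BOS_token, UNK_token = "<nul>", "<pad>", "<bos>", "<unk>"

texts = ["might have a solution", "it might take a long time nobody"]

tok = HuggingFaceTokenizer(cache_dir="./bpe-cache", max_length=32, vocab_size=400)
tok.build(texts)              # trains, or reuses a cached vocab/merges pair

ids = tok.encode("might have a solution")
print(ids)                    # token ids, truncated to max_length
print(tok.decode(ids))        # back to (lowercased) text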
Example #3
def train():
    """My main man"""

    base = os.environ['DATA_ROOT']
    corpus_path = base + 'Thyme/Text/train+dev+test/*'
    files = glob.glob(corpus_path)

    tokenizer = CharBPETokenizer(lowercase=True)
    tokenizer.train(files=files,
                    vocab_size=10000,
                    min_frequency=3,
                    show_progress=True)
    tokenizer.save('.', name='thyme-tokenizer')
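
The two-argument save above is the pre-0.8 tokenizers API, which writes thyme-tokenizer-vocab.json and thyme-tokenizer-merges.txt in the current directory; assuming that layout, a reload sketch looks like this:

from tokenizers import CharBPETokenizer

# Reload the vocab/merges pair written by save('.', name='thyme-tokenizer').
tokenizer = CharBPETokenizer('thyme-tokenizer-vocab.json',
                             'thyme-tokenizer-merges.txt',
                             lowercase=True)

encoding = tokenizer.encode('The patient was seen in follow-up.')
print(encoding.tokens)
print(encoding.ids)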
Example #4
def train_subword_tokenizer(size, special_tokens, path):
    """Train subword tokenizers for subword encoding
    ref: https://github.com/huggingface/tokenizers

    Args:
        path: path of training corpus.
    """
    tokenizer = CharBPETokenizer()
    tokenizer.train(
        [path+"/corpus_all.txt"],
        vocab_size=size,
        min_frequency=2,
        show_progress=True,
        special_tokens=special_tokens[:3]+["<unk>"],
    )
    tokenizer.save(path, "bpe")
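
A hypothetical call to the helper above; note that only the first three entries of special_tokens are kept (with "<unk>" appended), and the corpus is expected at <path>/corpus_all.txt.

# Hypothetical invocation; "<mask>" is dropped by the special_tokens[:3] slice.
train_subword_tokenizer(size=8000,
                        special_tokens=["<pad>", "<s>", "</s>", "<mask>"],
                        path="./data")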
Example #5
def create_tokenizer_imbd(data_path, file_name, vocab_size):
    #df = pd.read_csv(os.path.join(data_path, file_name))
    tokenizer = CharBPETokenizer()
    tokenizer.train(
        os.path.join(data_path, file_name),
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[CLS]", "[PAD]", "[MASK]", "[UNK]", "[SEP]"])

    print("[CLS]: {}, [PAD]: {}, [MASK]: {}, [UNK]: {}, [SEP]: {}".format(
        str(tokenizer.token_to_id("[CLS]")),
        str(tokenizer.token_to_id("[PAD]")),
        str(tokenizer.token_to_id("[MASK]")),
        str(tokenizer.token_to_id("[UNK]")),
        str(tokenizer.token_to_id("[SEP]"))))

    tokenizer.save(data_path, "tokenizer")
Example #6
def create_tokenizer(data_path, vocab_size):

    tokenizer = CharBPETokenizer()
    chunk_files = [
        f for f in os.listdir(data_path) if f.find("uncased_chunk") != -1
    ][:20]
    tokenizer.train(
        [os.path.join(data_path, f) for f in chunk_files],
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[CLS]", "[PAD]", "[MASK]", "[UNK]", "[SEP]"],
    )

    print("[CLS]: {}, [PAD]: {}, [MASK]: {}, [UNK]: {}, [SEP]: {}".format(
        str(tokenizer.token_to_id("[CLS]")),
        str(tokenizer.token_to_id("[PAD]")),
        str(tokenizer.token_to_id("[MASK]")),
        str(tokenizer.token_to_id("[UNK]")),
        str(tokenizer.token_to_id("[SEP]"))))

    tokenizer.save(data_path, "tokenizer")
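
The ids printed above are typically fed to a padding or masking step; a sketch of that, assuming the pre-0.8 save produced tokenizer-vocab.json and tokenizer-merges.txt inside data_path:

import os
from tokenizers import CharBPETokenizer

data_path = "./data"   # hypothetical: the same directory passed to create_tokenizer

tokenizer = CharBPETokenizer(os.path.join(data_path, "tokenizer-vocab.json"),
                             os.path.join(data_path, "tokenizer-merges.txt"))

# Pad a small batch to a common length with the learned [PAD] id.
tokenizer.enable_padding(pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]")
batch = tokenizer.encode_batch(["first sentence", "a somewhat longer second sentence"])
print([e.ids for e in batch])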
Example #7
class BPETokenizer:
    def __init__(self, text_list, vocab_size, lazy=False):
        if not lazy:
            self.tokenizer = CharBPETokenizer()
            self.tokenizer.train(text_list,
                                 vocab_size=vocab_size,
                                 special_tokens=[PAD, BOS, EOS, "<unk>"])
            self.tokenizer.add_special_tokens([PAD, BOS, EOS])
        else:
            self.tokenizer = None

    def tokens_to_ids(self, tokens):
        return [self.tokenizer.token_to_id(t) for t in tokens]

    def ids_to_tokens(self, ids):
        return [self.tokenizer.id_to_token(i) for i in ids]

    def encode(self, text):
        encodes = self.tokenizer.encode(text)
        return encodes.ids

    def decode(self, ids, skip_special=True):
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special)

    def save(self, path, file_name):
        self.tokenizer.save(path, file_name)

    @classmethod
    def load(cls, vocab, merges):
        tkz = cls(None, None, lazy=True)
        tkz.tokenizer = CharBPETokenizer(vocab, merges)
        tkz.tokenizer.add_special_tokens([PAD, BOS, EOS])
        return tkz

    def __len__(self):
        return self.tokenizer.get_vocab_size()
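
A round-trip sketch for the wrapper above. Despite its name, text_list is passed straight to train(), so it must be a list of file paths; PAD, BOS and EOS are module-level constants not shown in the snippet, so the values here are placeholders.

# Placeholder values for the constants the class expects.
PAD, BOS, EOS = "<pad>", "<bos>", "<eos>"

# train() reads files, so dump a toy corpus to disk first.
with open("toy_corpus.txt", "w") as f:
    f.write("the cat sat on the mat\nthe dog sat on the log\n")

bpe = BPETokenizer(["toy_corpus.txt"], vocab_size=200)
ids = bpe.encode("the cat sat on the log")
print(ids, bpe.decode(ids))
print(len(bpe))                       # vocabulary size actually learned

bpe.save(".", "toy")                  # pre-0.8 API: writes toy-vocab.json / toy-merges.txt
restored = BPETokenizer.load("toy-vocab.json", "toy-merges.txt")
print(restored.decode(ids))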
Example #8
def main():
    batch_size = 4
    vocab_size = 16384
    max_source_length = 1024
    max_target_length = 1024
    num_workers = 3

    dataset = nlp.load_dataset("iwslt2017.py", "nl-en")

    # Train tokenizer
    tokenizer_filename = "tokenizer.json"
    if os.path.exists(tokenizer_filename):
        tokenizer = Tokenizer.from_file(tokenizer_filename)
    else:
        data_filename = "whole_data.txt"
        with open(data_filename, "w") as f:
            for item in dataset["train"]:
                f.write(item["source"] + "\n")
                f.write(item["target"] + "\n\n")

        tokenizer = CharBPETokenizer()
        tokenizer.train([data_filename], vocab_size=vocab_size)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        tokenizer.add_tokens([pad_token])
        tokenizer.save(tokenizer_filename)

    tokenizer.pad_token_id = vocab_size

    # Loaders
    train_dataset = Seq2SeqDataset(tokenizer, dataset["train"],
                                   max_source_length, max_target_length)
    val_dataset = Seq2SeqDataset(tokenizer, dataset["validation"],
                                 max_source_length, max_target_length)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=val_dataset.collate_fn,
        num_workers=num_workers,
    )

    # Train model
    config = BartConfig(
        vocab_size=vocab_size + 1,  # Pad
        d_model=1024,
        encoder_ffn_dim=1024,
        encoder_layers=6,
        encoder_attention_heads=4,
        decoder_ffn_dim=1024,
        decoder_layers=6,
        decoder_attention_heads=4,
    )
    model = BartForConditionalGeneration(config)
    translator = Translate(model, tokenizer)

    trainer = pl.Trainer(gpus=1)
    trainer.fit(translator, train_loader, val_loader)
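
Seq2SeqDataset and its collate_fn are not shown here; a minimal sketch of the padding they would need, using the [PAD] id that main() stores as tokenizer.pad_token_id, might look like this:

import torch

def pad_batch(id_lists, pad_id, max_length):
    """Truncate and right-pad a list of id sequences into a LongTensor."""
    id_lists = [ids[:max_length] for ids in id_lists]
    width = max(len(ids) for ids in id_lists)
    return torch.tensor([ids + [pad_id] * (width - len(ids)) for ids in id_lists],
                        dtype=torch.long)

# e.g. inside a collate_fn:
# source_ids = pad_batch([tokenizer.encode(item["source"]).ids for item in batch],
#                        tokenizer.pad_token_id, max_source_length)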
Example #9
import json
import argparse
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

from tokenizers import CharBPETokenizer

parser = argparse.ArgumentParser()

parser.add_argument("--corpus",
                    help="Path to text training corpus",
                    default="/home/benet/IRI/How2Sign/metadata/metadata.txt")
parser.add_argument("--saveto",
                    help="Path where to save the model",
                    default="steps/tokenizer.json")
parser.add_argument("--size",
                    help="Number of tokens / vocabulary size",
                    type=int,
                    default=1000)

if __name__ == '__main__':

    args = parser.parse_args()

    tokenizer = CharBPETokenizer()

    tokenizer.train([args.corpus], vocab_size=args.size)

    tokenizer.save(args.saveto)
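
With tokenizers >= 0.8 (assumed here), save() on a single path writes one self-contained JSON file, which can be reloaded without a separate vocab/merges pair:

from tokenizers import Tokenizer

# Reload the file written by tokenizer.save(args.saveto).
tokenizer = Tokenizer.from_file("steps/tokenizer.json")

encoding = tokenizer.encode("hello world")
print(encoding.tokens, encoding.ids)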
Example #10
# Initialize an empty character-level BPE tokenizer (no constructor arguments needed)
tokenizer = CharBPETokenizer()

# And then train
tokenizer.train(
    files,
    vocab_size=args.vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=['<unk>'],
    suffix='</w>',
    limit_alphabet=args.limit_alphabet,
)

# Save the files
tokenizer.save(args.out, args.name)

# Restoring model from learned vocab/merges
tokenizer = CharBPETokenizer(
    join(args.out, '{}-vocab.json'.format(args.name)),
    join(args.out, '{}-merges.txt'.format(args.name)),
)

# Test encoding
logger.info(
    'Tokens and their ids from CharBPETokenizer with GFP protein sequence: \n MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT'
)
encoded = tokenizer.encode('MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT')
logger.info(encoded.tokens)
logger.info(encoded.ids)
logger.info('done!')
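
The two-argument save/load above is the pre-0.8 interface. Under tokenizers >= 0.8 the rough equivalents (an assumption, not part of the original snippet) are save_model() for the vocab/merges pair and save() for a single JSON file:

# tokenizers >= 0.8 (assumed): vocab/merges pair vs. single-file JSON.
tokenizer.save_model(args.out, args.name)            # <name>-vocab.json / <name>-merges.txt
tokenizer.save(join(args.out, '{}.json'.format(args.name)))

# Reloading the single-file form:
from tokenizers import Tokenizer
restored = Tokenizer.from_file(join(args.out, '{}.json'.format(args.name)))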
Example #11
# coding:utf-8
from tokenizers import CharBPETokenizer
from pathlib import Path

# Initialize a tokenizer
tokenizer = CharBPETokenizer()

# Then train it!
tokenizer.train(["./data/wiki_sunyang.txt"])

# And you can use it
encoded = tokenizer.encode(
    "In 2012, Sun became the first Chinese man to win an Olympic gold medal in swimming."
)
# print(encoded.tokens)

# And finally save it somewhere
saved_path = Path("./saved_tokenizer/wiki_sunyang")
saved_path.mkdir(exist_ok=True, parents=True)
tokenizer.save(str(saved_path))
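
Saving to a directory as above is the pre-0.8 behaviour, producing vocab.json and merges.txt inside saved_tokenizer/wiki_sunyang; assuming that layout, the tokenizer can be reloaded like this:

# Reload from the directory-style save (pre-0.8 tokenizers, as assumed above).
restored = CharBPETokenizer(str(saved_path / "vocab.json"),
                            str(saved_path / "merges.txt"))
print(restored.encode("Sun Yang won Olympic gold in 2012.").tokens)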
Example #12
    # Assumes `texts` holds the raw training sentences; write them to
    # raw_corpus.txt so the BPE trainer can read them from disk.
    with open('raw_corpus.txt', 'w') as f:
        for t in texts:
            f.write(t + '\n')
    tokenizer = CharBPETokenizer(lowercase=True)

    tokenizer.train(
        ["raw_corpus.txt"],
        vocab_size=1000,
        min_frequency=2,
        special_tokens=[
            "<blank>",
            "<bos>",
            "<unk>",
        ],
    )

    # os.makedirs('./BPE-1000', exist_ok=True)
    tokenizer.save('./BPE-1000', '')

    tokenizer = CharBPETokenizer('./BPE-1000/-vocab.json',
                                 './BPE-1000/-merges.txt')
    # with open('.test.pkl', 'w') as f:
    #     pickle.dump(tokenizer, f)

    tokenizer = HuggingFaceTokenizer()
    print(
        tokenizer.encode(
            'might have a solution it might take a long time nobody'))

    print(
        tokenizer.decode(
            tokenizer.encode(
                'might have a solution it might take a long time nobody'), ))
Example #13
def train(args):

    tokenizer = CharBPETokenizer()

    tokenizer.train([args.corpus], vocab_size=args.size)
    tokenizer.save(args.output_file)
Example #14
from tokenizers import CharBPETokenizer
import json
import tqdm

if __name__ == "__main__":
    # Initialize a tokenizer
    tokenizer = CharBPETokenizer()

    # Then train it!
    tokenizer.train(
        [
            "data\\train.txt",
            "D:/数据/wikitext-2-raw-v1/wikitext-2-raw/wiki.train.raw",
            "D:/数据/webtext2019zh/web_text_raw.txt"
        ],
        vocab_size=30000,
        min_frequency=2,
        special_tokens=['<UNK>', '<BOS>', '<EOS>', '<PAD>', '<CLS>', '<SEP>'])

    # Now, let's use it:
    encoded = tokenizer.encode("I can feel the magic, can you?")

    # And finally save it somewhere
    tokenizer.save("./", "bpe.tokenizer.json")