Example No. 1
    def from_corpus(cls, corpus, corpus_save_path, tokenizer_save_path,
                    tokenizer_name, vocab_size, min_frequency, strip_accents,
                    clean_text, lowercase):
        with open(corpus_save_path, 'wb') as f:
            f.write('\n'.join(corpus).encode())

        tokenizer = BertWordPieceTokenizer(
            strip_accents=strip_accents,
            clean_text=clean_text,
            lowercase=lowercase,
        )
        tokenizer.train(
            [corpus_save_path],
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=True,
            special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
            wordpieces_prefix="##",
        )

        if os.path.exists(tokenizer_save_path):
            shutil.rmtree(tokenizer_save_path)
        os.mkdir(tokenizer_save_path)

        tokenizer.save_model(tokenizer_save_path, tokenizer_name)
        vocab_path = os.path.join(tokenizer_save_path,
                                  f'{tokenizer_name}-vocab.txt')
        return cls(vocab_path, strip_accents, clean_text, lowercase)
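A minimal, self-contained sketch of the same pattern (train a WordPiece vocabulary from an in-memory corpus by writing it to disk first). The corpus, file name, and settings below are placeholders, not values from the original code.

from tokenizers import BertWordPieceTokenizer

corpus = ["first training sentence", "second training sentence"]  # placeholder corpus

# Dump the in-memory corpus to a text file, one document per line.
with open("corpus.txt", "wb") as f:
    f.write("\n".join(corpus).encode())

tokenizer = BertWordPieceTokenizer(lowercase=True)
tokenizer.train(
    ["corpus.txt"],
    vocab_size=1000,
    min_frequency=1,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
)

print(tokenizer.encode("first training sentence").tokens)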
Example No. 2
def train_tokenizer(captions):
    print('Create training file...')
    # Flatten the nested caption lists and write one caption per line.
    training_samples = [sample for samples in captions for sample in samples]
    with open('train_tokenizer.txt', 'w') as f:
        for sample in training_samples:
            f.write(sample.rstrip('\n') + '\n')
    # init
    bwpt = BertWordPieceTokenizer(vocab_file=None,
                                  unk_token='[UNK]',
                                  sep_token='[SEP]',
                                  cls_token='[CLS]',
                                  clean_text=True,
                                  handle_chinese_chars=True,
                                  strip_accents=True,
                                  lowercase=True,
                                  wordpieces_prefix='##')
    print('Tokenizer training...')
    bwpt.train(files=['train_tokenizer.txt'],
               vocab_size=30000,
               min_frequency=5,
               limit_alphabet=1000,
               special_tokens=['[PAD]', '[UNK]', '[CLS]', '[MASK]', '[SEP]'])

    bwpt.save('.', 'captions')

    # initialization of a trained tokenizer
    tokenizer = BertWordPieceTokenizer('captions-vocab.txt')
    tokenizer.enable_truncation(max_length=16)
    print('Tokenizer is ready to use...')
    return tokenizer
Example No. 3
    def _wordpiece(self):
        tokenizer = BertWordPieceTokenizer(
            vocab=self.conf.vocab,
            unk_token=self.conf.unk_token,
            sep_token=self.conf.sep_token,
            cls_token=self.conf.cls_token,
            pad_token=self.conf.pad_token,
            mask_token=self.conf.mask_token,
            clean_text=self.conf.clean_text,
            handle_chinese_chars=self.conf.handle_chinese_chars,
            strip_accents=self.conf.strip_accents,
            lowercase=self.conf.lowercase,
            wordpieces_prefix=self.conf.wordpieces_prefix,
        )

        tokenizer.train(
            files=self.files,
            vocab_size=self.conf.vocab_size,
            min_frequency=self.conf.min_frequency,
            limit_alphabet=self.conf.limit_alphabet,
            initial_alphabet=self.conf.initial_alphabet,
            special_tokens=self.conf.word_piece_special_tokens,
            wordpieces_prefix=self.conf.wordpieces_prefix,
        )

        return tokenizer
Example No. 4
def train_bert_tokenizer(dataset_base_path: str,
                         target_path: str,
                         tokenizer_name: str,
                         files_pattern: str = '**/*',
                         vocab_size: int = 30000,
                         lower_case: bool = False):
    """
    Trains a BERT WordPiece Tokenizer based on data
    located in dataset_base_path.

    By default it reads all files in dataset_base_path. One can
    specify `files_pattern` for filtering.

    The files generated by the tokenizer will be saved under
    <target_path>/<tokenizer_name> namespace.
    """
    files = [
        str(f) for f in Path(dataset_base_path).glob(files_pattern)
        if os.path.isfile(f)
    ]

    logger.info(f'Found {len(files)} files to use for training.')
    logger.debug(f'Files are: {files}')

    tokenizer_args = {
        'lowercase': lower_case,
        'strip_accents': False,
    }

    wordpiece_tokenizer = BertWordPieceTokenizer(**tokenizer_args)
    wordpiece_tokenizer.train(files=files, vocab_size=vocab_size)

    save_out = wordpiece_tokenizer.save(target_path, tokenizer_name)

    logger.info(f'Train finish. Result is in {save_out}')
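A hypothetical invocation of train_bert_tokenizer, assuming the names used above (os, pathlib.Path, a configured logger) are in scope; every path and name here is a placeholder.

train_bert_tokenizer(
    dataset_base_path="data/corpus",       # directory scanned for training files
    target_path="artifacts/tokenizers",    # output root
    tokenizer_name="my-wordpiece",         # files are saved under this namespace
    files_pattern="**/*.txt",              # restrict training to .txt files
    vocab_size=30000,
    lower_case=True,
)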
Example No. 5
def train_bert():
    # https://huggingface.co/transformers/_modules/transformers/tokenization_bert.html

    files = [
        "Corpora/CS_V0_normalized_sent_per_line.txt",
        "Corpora/AsoSoft_Large_sent_per_line.txt",
        "Corpora/KTC_all_cleaned.txt", "Corpora/Lyrics_all_cleaned.txt",
        "Corpora/Tanztil_ku_normalized.txt"
    ]

    vocab_size = 50000
    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(clean_text=True,
                                       handle_chinese_chars=False,
                                       strip_accents=True,
                                       lowercase=False)

    # And then train
    tokenizer.train(
        files,
        vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=1000,
        wordpieces_prefix="##",
    )

    tokenizer.save('./', 'ckb-wordpiece_%s' % str(vocab_size))
Example No. 6
def main():
    tokenizer = BertWordPieceTokenizer()
    tokenizer.train(files=[configs.data.raw],
                    vocab_size=52_000,
                    min_frequency=5)
    tokenizer.save_model(configs.data.path)
    print(f"save to {configs.data.path}")
Example No. 7
def train_tokenizer(files: List[str],
                    tokenizer_name: str,
                    base_path: str,
                    vocab_size: int,
                    lowercase: bool = False,
                    strip_accents: bool = False):

    tokenizer = BertWordPieceTokenizer(lowercase=lowercase,
                                       strip_accents=strip_accents)

    tokenizer_path = os.path.join(base_path, tokenizer_name)
    os.makedirs(tokenizer_path, exist_ok=True)

    initial_alphabet = get_bert_initial_alphabet()

    tokenizer.train(files,
                    special_tokens=initial_alphabet,
                    vocab_size=vocab_size)

    tokenizer.save(tokenizer_path)

    # Creating a default config for the tokenizer
    config = {'do_lower_case': lowercase, 'strip_accents': strip_accents}
    config_file_path = os.path.join(tokenizer_path, 'tokenizer_config.json')

    with open(config_file_path, 'w+') as config_file:
        json.dump(config, config_file)
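Since the snippet above also writes a tokenizer_config.json next to the vocab, the trained tokenizer can be reloaded through the transformers library. This is a sketch with placeholder paths, assuming transformers is installed and the target directory already contains the vocab file produced by tokenizer.save().

from transformers import BertTokenizer

# Placeholder path: <base_path>/<tokenizer_name> from the call to train_tokenizer above.
loaded = BertTokenizer.from_pretrained("models/my-wordpiece")
print(loaded.tokenize("reloading a freshly trained wordpiece vocabulary"))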
Example No. 8
def main():
    random.seed(1)
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--corpus-filelist-path",
                        required=True,
                        help="Location of pre-training text files.")
    args = parser.parse_args()

    paths = []
    with open(args.corpus_filelist_path) as f:
        for line in f:
            line = line.strip()
            if line:
                paths.append(line)

    random.shuffle(paths)
    print(f'Nrof files: {len(paths)}')
    paths = paths[:100_000]
    print(f'Nrof filtered files: {len(paths)}')

    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(lowercase=False)

    # Customize training
    tokenizer.train(
        files=paths,
        vocab_size=40_000,
        min_frequency=4,
    )

    # Save vocab.txt to disk (passing a prefix here would produce "<prefix>-vocab.txt")
    tokenizer.save_model(".")
Example No. 9
    def train_wordpiece_tokenizer(self) -> None:
        wordpiece_tokenizer = BertWordPieceTokenizer()
        wordpiece_tokenizer.train(
            files=["./train.txt", "./test.txt"],
            vocab_size=10000,
        )

        wordpiece_tokenizer.save_model("nlpbook/wordpiece")
Example No. 10
    def convert_to_ratt(self,
                        ratt_dir,
                        do_lower=True,
                        max_sequence_length=128,
                        data_type="train"):
        if not os.path.exists(ratt_dir):
            os.mkdir(ratt_dir)
        # Build dictionary
        text_list, label_list = self._read_csv(self.raw_data_file)

        # Token vocab
        token_vocab_name = "ratt"
        vocab_file = os.path.join(ratt_dir, token_vocab_name + "-vocab.txt")
        if not os.path.isfile(vocab_file):
            tokenizer = BertWordPieceTokenizer(lowercase=do_lower)
            tokenizer.train(files=[self.raw_data_file], vocab_size=8192)
            tokenizer.save_model(ratt_dir, token_vocab_name)
        else:
            tokenizer = BertWordPieceTokenizer(vocab_file=vocab_file,
                                               lowercase=do_lower)

        # Label vocab
        label_vocab_file = os.path.join(ratt_dir, "label_dict.txt")
        if not os.path.isfile(label_vocab_file):
            labels = set(label_list)
            label_map = {str(l): i for i, l in enumerate(labels)}
            with open(label_vocab_file, "w", encoding="utf-8") as fout:
                for l in labels:
                    fout.write("%s\n" % l)
        else:
            label_map = {}
            with open(label_vocab_file, encoding="utf-8") as fin:
                for i, line in enumerate(fin):
                    label_map[line.rstrip()] = i

        if data_type not in ["train", "dev", "test"]:
            data_types = ["train", "dev", "test"]
        else:
            data_types = [data_type]

        for data_type in data_types:
            logging.info("Converting %s.." %
                         eval("self.raw_%s_file" % data_type))
            text_list, label_list = self._read_csv(
                eval("self.raw_%s_file" % data_type))

            outputs = tokenizer.encode_batch(text_list,
                                             add_special_tokens=True)
            input_ids = [output.ids for output in outputs]
            padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
                input_ids,
                padding="post",
                maxlen=max_sequence_length,
                truncating="post")

            label_ids = [label_map[str(label)] for label in label_list]
            save_file = os.path.join(ratt_dir, data_type + ".npz")
            np.savez(save_file, inputs=padded_inputs, targets=label_ids)
Example No. 11
    def generate_custom_vocab(self):

        try:
            tokenizer = None
            # root dir path check and generate
            if not os.path.isdir(self.vocab_root_dir):
                os.makedirs(self.vocab_root_dir, exist_ok=True)

            # generate models directory
            self.vocab_dir = '/BERT_TRAINING_VOCAB_' + self.getCurrent_time()[2] + '/'
            os.makedirs(self.vocab_root_dir + self.vocab_dir, exist_ok=True)

            user_defined_symbols = ['[BOS]', '[EOS]', '[UNK]', '[UNK1]', '[UNK2]', '[UNK3]', '[UNK4]', '[UNK5]',
                                    '[UNK6]', '[UNK7]', '[UNK8]', '[UNK9]']
            unused_token_num = 200
            unused_list = ['[unused{}]'.format(n) for n in range(unused_token_num)]
            user_defined_symbols = user_defined_symbols + unused_list

            if self.tokenizer_type == 'word':
                # if lowercase is False, strip_accents must also be set to False
                tokenizer = BertWordPieceTokenizer(strip_accents=False,
                                                   lowercase=True,
                                                   clean_text=True,
                                                   handle_chinese_chars=True,
                                                   wordpieces_prefix="##"
                                                   )

            # when 'base' is selected, the bert-base-uncased tokenizer is used instead, so no training is done here

            # training vocab start
            corpus_file = [self.corpus_path]
            vocab_size = 32000
            limit_alphabet = 6000
            min_frequency = 3
            tokenizer.train(files=corpus_file,
                            vocab_size=vocab_size,
                            special_tokens=user_defined_symbols,
                            min_frequency=min_frequency,  # minimum token frequency, here 3
                            limit_alphabet=limit_alphabet,  # comment this out when training a ByteLevelBPETokenizer
                            show_progress=True)

            self.setPrint('Custom Tokenizer Training is completed')

            sentence = '전화 통화가 정상적으로 안됨.'
            output = tokenizer.encode(sentence)
            self.setPrint('Tokenizer test sentence: {}'.format(sentence))
            self.setPrint('Tokenizer analysis result\n=>idx: {}\n=>tokens: {}\n=>offset: {}\n=>decode: {}\n'.
                          format(output.ids, output.tokens, output.offsets, tokenizer.decode(output.ids)))

            # save tokenizer
            tokenizer.save_model(self.vocab_root_dir + self.vocab_dir)

        except:
            self.setPrint('Error: {}. {}, line: {}'.format(sys.exc_info()[0],
                                                           sys.exc_info()[1],
                                                           sys.exc_info()[2].tb_lineno))
Example No. 12
def create_vocab(file_path, output_path, least_freq=2):
    tokenizer = BertWordPieceTokenizer(clean_text=False,
                                       strip_accents=False,
                                       lowercase=True)
    files = [file_path]
    tokenizer.train(files,
                    vocab_size=1000,
                    min_frequency=least_freq,
                    show_progress=True,
                    special_tokens=['[PAD]', '[UNK]', '[SOS]', '[EOS]'],
                    limit_alphabet=1000,
                    wordpieces_prefix="##")
    tokenizer.save(output_path)
    print(f"Vocabulary created at location {output_path}")
Example No. 13
def train_tokenizer(data_file_paths, vocab_size):
    t = BertWordPieceTokenizer(handle_chinese_chars=False)
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    wordpieces_prefix = "##"
    t.train(
        files=data_file_paths,
        vocab_size=vocab_size,
        min_frequency=10,
        show_progress=True,
        special_tokens=special_tokens,
        limit_alphabet=1000,
        wordpieces_prefix=wordpieces_prefix,
    )
    return t
Example No. 14
def train_tokenizer(
    corpus: Union[str, List[str]],
    vocab_size: int = 30519,
    overwrite: bool = True,
    lowercase: bool = True,
    save_vocab: bool = False,
    dst: Optional[str] = None,
    in_domain_vocab: str = VOCAB_CACHE_PREFIX,
) -> BertWordPieceTokenizer:
    """Train a WordPiece tokenizer from scratch.

    Arguments:
        corpus {Union[str, List[str]]} -- In-domain corpus / corpora

    Keyword Arguments:
        vocab_size {int} -- Size of trained vocabulary (default: 30519)
        lowercase {bool} -- If True, perform lowercasing (default: True)
        save_vocab {bool} -- If True, save vocab to `in_domain_vocab`
                             (default: False)
        in_domain_vocab {str} -- Path to save trained tokenizer vocabulary
                                 (default: {'in-domain-vocab.txt'})

    Returns:
        A BertWordPieceTokenizer trained on in-domain corpora.
    """
    if not isinstance(corpus, list):
        corpus = [corpus]

    # Load cached vocab if possible
    if not overwrite:
        cached_vocab = Path(dst) / (VOCAB_CACHE_PREFIX + '-vocab.txt')

        if cached_vocab.exists():
            logger.info(f'Loading cached vocabulary at {cached_vocab}')
            return BertWordPieceTokenizer(str(cached_vocab))
        else:
            logger.info(f'Cached vocabulary not found at {cached_vocab}')

    # Train tokenizer
    logger.info('Training new WordPiece tokenizer on in-domain corpora')
    tokenizer = BertWordPieceTokenizer(lowercase=lowercase)
    tokenizer.train(corpus, vocab_size=vocab_size)

    if save_vocab:
        tokenizer.save('.' if dst is None else dst, in_domain_vocab)
        logger.info('Saved in-domain vocabulary to '
                    f'{Path(dst) / (in_domain_vocab + "-vocab.txt")}')
    return tokenizer
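A hypothetical call to the function above, with placeholder corpus paths and destination directory; overwrite=False exercises the cache branch described in the code.

tokenizer = train_tokenizer(
    corpus=["data/in_domain_a.txt", "data/in_domain_b.txt"],  # placeholder corpora
    vocab_size=30519,
    overwrite=False,        # reuse <dst>/<VOCAB_CACHE_PREFIX>-vocab.txt when it exists
    lowercase=True,
    save_vocab=True,
    dst="artifacts/vocab",
)
print(tokenizer.encode("an in-domain sentence").tokens)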
Example No. 15
class BertWordPiece:

    def __init__(self, clean_text: bool, strip_accents: bool, lowercase: bool):
        self.clean = clean_text
        self.strip = strip_accents
        self.lower = lowercase

        self.tokenizer = BertWordPieceTokenizer(
            clean_text=self.clean,
            strip_accents=self.strip,
            lowercase=self.lower,
            handle_chinese_chars=True,
        )

    def train(self, files, vocab_size, min_frequency, limit_alphabet):
        self.tokenizer.train(
            files,
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=True,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
            limit_alphabet=limit_alphabet,
            wordpieces_prefix="##",
        )

    def save(self, path, filename):
        self.tokenizer.save(path, filename)
Example No. 16
    def gen_tokenizer(self, min_frequency=6, limit_alphabet=150):
        '''Create a WordPiece tokenizer from the parsed data'''

        # Store the flattened text in a temporary file
        f = tempfile.NamedTemporaryFile()
        text = self.flatten()
        f.write(text.encode("utf8"))

        # Create the tokenizer
        tokenizer = BertWordPieceTokenizer()
        tokenizer.train([f.name],
                        min_frequency=min_frequency,
                        limit_alphabet=limit_alphabet)
        f.close()

        return tokenizer
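A standalone sketch of the same temp-file pattern, with an explicit flush so the trainer is guaranteed to see the buffered text before reading the file; the sample text is made up.

import tempfile

from tokenizers import BertWordPieceTokenizer

text = "some flattened training text\nanother line of training text\n"  # placeholder

with tempfile.NamedTemporaryFile(suffix=".txt") as f:
    f.write(text.encode("utf8"))
    f.flush()  # push the buffered bytes to disk before training reads the file
    tokenizer = BertWordPieceTokenizer()
    tokenizer.train([f.name], min_frequency=1, limit_alphabet=150)

print(tokenizer.encode("training text").tokens)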
Example No. 17
def main(args):
    print(args)
    if args['train']:
        tokenizer = BertWordPieceTokenizer(
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=True,  # Must be False if cased model
            lowercase=True,
            wordpieces_prefix="##"
        )

        tokenizer.train(
            files=['/data2/BERT/data/naver_news/news_3_preprocessed/naver_news.txt'],
            limit_alphabet=6000,
            vocab_size=32000
        )

        print(tokenizer.save_model("../BertWordPieceTokenizer_32000"))

    elif args['test']:
        test_str = '나는 워드피스 토크나이저를 써요. 성능이 좋은지 테스트 해보려 합니다.'

        print("=========== tokenizer ===========")
        tokenizer = BertWordPieceTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str.ids)
        decoded_str = tokenizer.decode(encoded_str.ids)
        print(decoded_str)

        print("=========== BertTokenizer ===========")
        tokenizer = BertTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)

        print("=========== BertTokenizer2 ===========")
        tokenizer = BertTokenizer.from_pretrained("../BertWordPieceTokenizer_32000")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)
Example No. 18
def build_vocab(args):
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    special_tokens += ["[unused{}]".format(idx) for idx in range(args.unused_size)]

    if args.tokenizer_model == "mecab_wordpiece":
        mecab_wordpiece_notag_trainer = KoNLPyBertWordPieceTrainer(
            Mecab(), use_tag=False
        )
        mecab_wordpiece_notag_trainer.train(
            files=[args.corpus],
            vocab_size=args.vocab_size,
            min_frequency=args.min_frequency,
            limit_alphabet=args.limit_alphabet,
            special_tokens=special_tokens,
        )
        mecab_wordpiece_notag_trainer.save_model(
            "./data/vocab/mecab_normalize_{}".format(args.vocab_size), "notag"
        )

    elif args.tokenizer_model == "wordpiece":
        tokenizer = BertWordPieceTokenizer(
            vocab=None,
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=False,
            lowercase=False,
            wordpieces_prefix="##",
        )

        tokenizer.train(
            files=[args.corpus],
            vocab_size=args.vocab_size,
            min_frequency=args.min_frequency,
            limit_alphabet=args.limit_alphabet,
            special_tokens=special_tokens,
        )

        tokenizer.save_model("./data/vocab/wordpiece")

    else:
        logger.info("tokenizer model : wordpiece / mecab_wordpiece")
        sys.exit(1)
Example No. 19
def get_vocabulary(infile: Text, vocabsize: int, outfolder: Text):
    # get special token maps and config
    autotok = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    autotok.save_pretrained(outfolder)
    os.remove(os.path.join(outfolder, "vocab.txt"))

    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(strip_accents=False,
                                       lowercase=False,
                                       clean_text=False)

    # Then train it!
    tokenizer.train([infile],
                    vocab_size=vocabsize,
                    limit_alphabet=int(1e9))

    # And finally save it somewhere
    tokenizer.save(outfolder, "vocab")
    os.rename(os.path.join(outfolder, "vocab-vocab.txt"),
              os.path.join(outfolder, "vocab.txt"))
Example No. 20
class Tokenizer:
    def __init__(self, lang):
        """
        A Tokenizer class to load and train a custom tokenizer
        Using the Hugging Face tokenization library for the same
        """
        self.tokenizer_dir = r"data/{}".format(lang)
        if not os.path.exists(self.tokenizer_dir):
            os.mkdir(self.tokenizer_dir)
        self.vocab = self.tokenizer_dir + "/vocab.txt"
        if os.path.exists(self.vocab):
            print("Initialized tokenizer using cached vocab file {}".format(self.vocab))
            self.tokenizer = BertWordPieceTokenizer(vocab_file=self.vocab)
        else:
            self.tokenizer = BertWordPieceTokenizer()

        self.tokenizer.enable_padding(max_length=MAX_LENGTH)
        self.tokenizer.enable_truncation(max_length=MAX_LENGTH)

    def train_tokenizer(self, sentences):
        """
        Train a tokenizer with a list of sentences
        """

        if not os.path.exists(self.vocab):
            print("Training tokenizer for {}".format(self.tokenizer_dir))
            # The trainer reads from files, so write the sentences to a temporary text file
            with open(self.tokenizer_dir + "/data.txt", "w+", encoding="utf-8") as f:
                for sentence in sentences:
                    f.write(sentence + "\n")
            self.tokenizer.train([self.tokenizer_dir + "/data.txt"])
            self.tokenizer.save(self.tokenizer_dir)
            print("Trained a tokenizer with vocab size {}".format(self.tokenizer.get_vocab_size()))

            # Removing the temp file
            os.remove(self.tokenizer_dir + "/data.txt")

    def encode(self, decoded):
        return self.tokenizer.encode(decoded)

    def decode(self, encoded):
        return self.tokenizer.decode_batch(encoded)
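A hypothetical use of the Tokenizer wrapper above; the language code and sentences are placeholders, and the MAX_LENGTH constant used in __init__ is assumed to be defined elsewhere in the module.

tok = Tokenizer("en")                       # placeholder language code
tok.train_tokenizer(["a first sentence.", "a second sentence."])
encoding = tok.encode("a first sentence.")
print(encoding.ids)
print(encoding.tokens)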
Example No. 21
def tokenize(inputPath, outputPath):
    paths = [str(x) for x in Path(inputPath).glob("*.ns")]
    print(paths)
    # Initialize a tokenizer

    tokenizer = BertWordPieceTokenizer(vocab_file=None,
                                       clean_text=True,
                                       handle_chinese_chars=True,
                                       strip_accents=False,
                                       lowercase=False,
                                       wordpieces_prefix="##")

    # Customize training
    tokenizer.train(
        files=paths,
        vocab_size=50000,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    )

    tokenizer.save(outputPath)
Example No. 22
def train_tokenizer(filename, params):
    """
    Train a BertWordPieceTokenizer with the specified params and save it
    """
    # Get tokenization params
    save_location = params["tokenizer_path"]
    max_length = params["max_length"]
    min_freq = params["min_freq"]
    vocabsize = params["vocab_size"]

    # lowercase must be set at construction time; assigning do_lower_case afterwards has no effect
    tokenizer = BertWordPieceTokenizer(lowercase=False)
    special_tokens = ["[S]", "[PAD]", "[/S]", "[UNK]", "[MASK]", "[SEP]", "[CLS]"]
    tokenizer.train(files=[filename], vocab_size=vocabsize, min_frequency=min_freq, special_tokens=special_tokens)

    tokenizer._tokenizer.post_processor = BertProcessing(("[SEP]", tokenizer.token_to_id("[SEP]")), ("[CLS]", tokenizer.token_to_id("[CLS]")),)
    tokenizer.enable_truncation(max_length=max_length)

    print("Saving tokenizer ...")
    if not os.path.exists(save_location):
        os.makedirs(save_location)
    tokenizer.save(save_location)
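A hypothetical params dict and call for the function above (the snippet assumes BertProcessing has been imported from tokenizers.processors); the paths and values are placeholders.

params = {
    "tokenizer_path": "artifacts/tokenizer",  # where the vocab is saved
    "max_length": 128,                        # truncation length
    "min_freq": 2,
    "vocab_size": 30000,
}
train_tokenizer("data/corpus.txt", params)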
Example No. 23
def load_from_files_bert_tokenizer(path_to_files=None, vocab_size=30000):
    """
    Adapted from:
    https://github.com/huggingface/tokenizers/tree/master/bindings/python/examples
    If used frequently, save the model to avoid reloading (see example above)
    """

    if path_to_files is None:
        # default to a glob pattern covering everything in the bundled sample_files directory
        path_to_files = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "sample_files", "*"
        )
    # path_to_files is treated as a glob pattern; pass more complex patterns if needed
    files = glob.glob(path_to_files)

    # Create the WordPiece tokenizer
    tokenizer = BertWordPieceTokenizer(
        strip_accents=True,
        # following arguments are all same as default, listed for clarity
        clean_text=True,
        handle_chinese_chars=True,
        lowercase=True,
    )

    # And finally train
    tokenizer.train(
        files,
        # following arguments are all same as default, listed for clarity
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=1000,
        wordpieces_prefix="##",
    )

    return tokenizer
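The docstring above suggests saving the trained model when it is used frequently; a sketch of that, with a placeholder corpus pattern and output directory.

import os

from tokenizers import BertWordPieceTokenizer

tokenizer = load_from_files_bert_tokenizer("data/corpus/*.txt", vocab_size=30000)

# Persist the vocabulary once...
os.makedirs("artifacts/wordpiece", exist_ok=True)
tokenizer.save_model("artifacts/wordpiece")  # writes artifacts/wordpiece/vocab.txt

# ...and reload it on later runs instead of retraining.
reloaded = BertWordPieceTokenizer("artifacts/wordpiece/vocab.txt")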
Example No. 24
def main(language):
    # Initialize an empty BERT tokenizer
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=False,
        strip_accents=False,
        lowercase=False,
    )

    cleaned_dir = BASE_DIR / "data/wikiextracted" / language / "cleaned"

    # prepare text files to train vocab on them
    # use only one subdir
    # files = [str(file_path) for file_path in cleaned_dir.glob("AA/wiki_*")]
    # use all wiki articles (in the given language)
    files = [str(file_path) for file_path in cleaned_dir.glob("**/wiki_*")]

    # train BERT tokenizer
    tokenizer.train(
        files,
        # vocab_size=100, # default value is 30000
        min_frequency=MIN_FREQ,
        show_progress=True,
        special_tokens=SPEC_TOKENS,
        limit_alphabet=SIZE_OF_ALPHABET, # default value is 1000
        wordpieces_prefix="##"
    )

    # save the vocab
    os.makedirs(str(BASE_DIR / "data/tokenizer" / language), exist_ok=True)
    tokenizer.save(str(BASE_DIR / "data/tokenizer" / language / "vocab"))

    # save the alphabet
    vocab = json.loads(read_vocab(language))['model']['vocab']
    alphabet = prepare_alphabet(vocab)
    write_alphabet_to_file(alphabet, language)
Example No. 25
# CharBPETokenizer: The original BPE
# ByteLevelBPETokenizer: The byte level version of the BPE
# SentencePieceBPETokenizer: A BPE implementation compatible with the one used by SentencePiece
# BertWordPieceTokenizer: The famous Bert tokenizer, using WordPiece

DATAFILE = '../data/pg16457.txt'
MODELDIR = 'models'

input_text = 'This is a test'

# Training the tokenizers

print("========= CharBPETokenizer ==========")
# CharBPETokenizer
tokenizer = CharBPETokenizer()
tokenizer.train([DATAFILE], vocab_size=500)

tokenizer.save(MODELDIR, 'char_bpe')

output = tokenizer.encode(input_text)
print(output.tokens)  # ['T', 'his</w>', 'is</w>', 'a</w>', 't', 'est</w>']

print("========= ByteLevelBPETokenizer ==========")
# ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train([DATAFILE], vocab_size=500)

tokenizer.save(MODELDIR, 'byte_bpe')
output = tokenizer.encode(input_text)
print(output.tokens)  # ['T', 'h', 'is', 'Ġis', 'Ġa', 'Ġt', 'est']
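The header comment also lists BertWordPieceTokenizer; here is a sketch in the same style as the two trainings above, using the same two-argument save call (older tokenizers API) as the rest of this script. The import is shown inline because the snippet's top-level imports are not included, and no expected tokens are shown since they depend on the trained vocabulary.

print("========= BertWordPieceTokenizer ==========")
# BertWordPieceTokenizer
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer()
tokenizer.train([DATAFILE], vocab_size=500)

tokenizer.save(MODELDIR, 'bert_wordpiece')

output = tokenizer.encode(input_text)
print(output.tokens)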
Example No. 26
    os.makedirs('./vocab', exist_ok=True)
    train_files = [
        f"./inputs/pretrain/{f}" for f in os.listdir('./inputs/pretrain')
    ]
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=False,
        strip_accents=False,
        lowercase=False,
    )
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

    tokenizer.train(
        files=train_files,
        vocab_size=32000,
        min_frequency=2,
        special_tokens=special_tokens,
        limit_alphabet=500,
        wordpieces_prefix="##",
    )

    tokenizer.save_model('./vocab')

vocab_file = f'./vocab/{os.listdir("./vocab")[0]}'
print(vocab_file)

with open(vocab_file) as f:
    for vocab_size, _ in enumerate(f, 1):
        pass

print(f'Vocab size: {vocab_size}')
Example No. 27
# dev_corpus_file = './mimicdata/bio-mimic3/dev_50.csv'
# test_corpus_file = './mimicdata/bio-mimic3/test_50.csv'

train_corpus_file = './mimicdata/mimic3/train_full.csv'
dev_corpus_file = './mimicdata/mimic3/dev_full.csv'
test_corpus_file = './mimicdata/mimic3/test_full.csv'

limit_alphabet = 100
vocab_size = 100000

tokenizer = BertWordPieceTokenizer(
    vocab_file=None,
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,  # Must be False if cased model
    lowercase=True,
    wordpieces_prefix="##",
)

tokenizer.train(
    files=[train_corpus_file, dev_corpus_file, test_corpus_file],
    limit_alphabet=limit_alphabet,
    vocab_size=vocab_size,
    min_frequency=1,
)

# tokenizer.save("./tokenizers", "bert-tiny-mimic3-50-{}-limit-{}".format(limit_alphabet, vocab_size))
tokenizer.save(
    "./tokenizers",
    "bert-tiny-mimic3-full-{}-limit-{}".format(limit_alphabet, vocab_size))
Example No. 28
    def train_tokenizer(self, train_files, tokenizer_name=None, output_dir=None, use_trained_tokenizer=True):
        """
        Train a new tokenizer on `train_files`.

        Args:

        - train_files: List of files to be used when training the tokenizer.

        - tokenizer_name: Name of a pretrained tokenizer or a path to a directory containing a tokenizer.

        - output_dir (optional): The directory where model files will be saved. If not given, self.args['output_dir']
        will be used.

        - use_trained_tokenizer (optional): Load the trained tokenizer once training completes.

        Returns: None
        """

        if not self.args["vocab_size"]:
            raise AttributeError(
                "Cannot train a new tokenizer as vocab_size is not specified in args dict. "
                "Either provide a tokenizer or specify vocab_size."
            )

        if not isinstance(train_files, list):
            train_files = [train_files]

        if not output_dir:
            output_dir = self.args["output_dir"]

        if self.args["model_type"] in ["bert", "electra"]:
            tokenizer = BertWordPieceTokenizer()
            self.args["special_tokens"] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
            self.args["wordpieces_prefix"] = "##"

            tokenizer.train(
                files=train_files,
                vocab_size=self.args["vocab_size"],
                min_frequency=self.args["min_frequency"],
                special_tokens=self.args["special_tokens"],
                wordpieces_prefix="##",
            )
        else:
            tokenizer = ByteLevelBPETokenizer()

            tokenizer.train(
                files=train_files,
                vocab_size=self.args["vocab_size"],
                min_frequency=self.args["min_frequency"],
                special_tokens=self.args["special_tokens"],
            )

        os.makedirs(output_dir, exist_ok=True)

        tokenizer.save(output_dir)
        logger.info(" Training of {} tokenizer complete. Saved to {}.".format(tokenizer_name, output_dir))

        _, _, tokenizer_class = MODEL_CLASSES[self.args["model_type"]]
        tokenizer = tokenizer_class.from_pretrained(output_dir)

        if use_trained_tokenizer:
            self.tokenizer = tokenizer
            self.args["tokenizer_name"] = output_dir
            try:
                if self.args["model_type"] == "electra":
                    model_to_resize = (
                        self.model.generator_model.module
                        if hasattr(self.model.generator_model, "module")
                        else self.model.generator_model
                    )
                    model_to_resize.resize_token_embeddings(len(self.tokenizer))

                    model_to_resize = (
                        self.model.discriminator_model.module
                        if hasattr(self.model.discriminator_model, "module")
                        else self.model.discriminator_model
                    )
                    model_to_resize.resize_token_embeddings(len(self.tokenizer))

                model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
                model_to_resize.resize_token_embeddings(len(self.tokenizer))
            except AttributeError:
                pass
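A hypothetical call on an instance of the enclosing model wrapper (the class itself is not shown in this excerpt); the args keys set here and the file paths are placeholders.

model.args["vocab_size"] = 30000
model.args["min_frequency"] = 2
model.train_tokenizer(
    train_files=["data/pretraining_corpus.txt"],
    output_dir="outputs/tokenizer",
    use_trained_tokenizer=True,  # attach the newly trained tokenizer to the model
)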
Example No. 29
from pathlib import Path

from tokenizers import BertWordPieceTokenizer

#paths = [str(x) for x in Path("./eo_data/").glob("**/*.txt")]
paths = ['../../data/jw300.en-tw.tw','../../data/asante_twi_bible.txt']

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()

# Customize training
# And then train
tokenizer.train(
    paths,
    vocab_size=30000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save files to disk
tokenizer.save("abena-base-v2-akuapem-twi-cased")
Example No. 30
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer()
tokenizer.train(["sp_data/mono/all.en-fr"], vocab_size=60000)

# with open("sp_data/mono/all.en-fr") as r, open("sp_data/mono/all.en-fr.wordpiece", "w") as w:
#     lines = r.readlines()
#     for line in lines:
#         encoded = tokenizer.encode(line[:-1])
#         w.write(" ".join(encoded.tokens))
#         w.write("\n")

# with open("sp_data/para/dev/newstest2013-ref.en") as r, open("sp_data/para/dev/newstest2013-ref.en.wordpiece", "w") as w:
#     lines = r.readlines()
#     for line in lines:
#         encoded = tokenizer.encode(line[:-1])
#         w.write(" ".join(encoded.tokens))
#         w.write("\n")
#
# with open("sp_data/para/dev/newstest2013-ref.fr") as r, open("sp_data/para/dev/newstest2013-ref.fr.wordpiece", "w") as w:
#     lines = r.readlines()
#     for line in lines:
#         encoded = tokenizer.encode(line[:-1])
#         w.write(" ".join(encoded.tokens))
#         w.write("\n")
#
# with open("sp_data/para/dev/newstest2014-fren-src.en") as r, open("sp_data/para/dev/newstest2014-fren-src.en.wordpiece", "w") as w:
#     lines = r.readlines()
#     for line in lines:
#         encoded = tokenizer.encode(line[:-1])
#         w.write(" ".join(encoded.tokens))