Code Example #1
def main():
    tokenizer = BertWordPieceTokenizer()
    tokenizer.train(files=[configs.data.raw],
                    vocab_size=52_000,
                    min_frequency=5)
    tokenizer.save_model(configs.data.path)
    print(f"save to {configs.data.path}")
Code Example #2
def main():
    random.seed(1)
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--corpus-filelist-path",
                        required=True,
                        help="Location of pre-training text files.")
    args = parser.parse_args()

    paths = []
    with open(args.corpus_filelist_path) as f:
        for line in f:
            line = line.strip()
            if line:
                paths.append(line)

    random.shuffle(paths)
    print(f'Number of files: {len(paths)}')
    paths = paths[:100_000]
    print(f'Number of files after filtering: {len(paths)}')

    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(lowercase=False)

    # Customize training
    tokenizer.train(
        files=paths,
        vocab_size=40_000,
        min_frequency=4,
    )

    # Save files to disk.  Note: the second argument is used as a file name
    # prefix, so this writes "./vocab.txt-vocab.txt"; omit it to get a plain
    # "./vocab.txt".
    tokenizer.save_model(".", "vocab.txt")
Code Example #3
    def from_corpus(cls, corpus, corpus_save_path, tokenizer_save_path,
                    tokenizer_name, vocab_size, min_frequency, strip_accents,
                    clean_text, lowercase):
        with open(corpus_save_path, 'wb') as f:
            f.write('\n'.join(corpus).encode())

        tokenizer = BertWordPieceTokenizer(
            strip_accents=strip_accents,
            clean_text=clean_text,
            lowercase=lowercase,
        )
        tokenizer.train(
            [corpus_save_path],
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=True,
            special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
            wordpieces_prefix="##",
        )

        if os.path.exists(tokenizer_save_path):
            shutil.rmtree(tokenizer_save_path)
        os.mkdir(tokenizer_save_path)

        tokenizer.save_model(tokenizer_save_path, tokenizer_name)
        vocab_path = os.path.join(tokenizer_save_path,
                                  f'{tokenizer_name}-vocab.txt')
        return cls(vocab_path, strip_accents, clean_text, lowercase)
Code Example #4
    def train_wordpiece_tokenizer(self) -> None:
        wordpiece_tokenizer = BertWordPieceTokenizer()
        wordpiece_tokenizer.train(
            files=["./train.txt", "./test.txt"],
            vocab_size=10000,
        )

        wordpiece_tokenizer.save_model("nlpbook/wordpiece")
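A hedged follow-up sketch: the vocab.txt written by save_model() above can also be consumed by the transformers fast tokenizer (the path is assumed to exist).

from transformers import BertTokenizerFast

# Load the WordPiece vocab produced above; lowercase matches the
# BertWordPieceTokenizer() default used during training.
hf_tokenizer = BertTokenizerFast(vocab_file="nlpbook/wordpiece/vocab.txt",
                                 do_lower_case=True)
print(hf_tokenizer.tokenize("an example sentence"))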
Code Example #5
    def convert_to_ratt(self,
                        ratt_dir,
                        do_lower=True,
                        max_sequence_length=128,
                        data_type="train"):
        if not os.path.exists(ratt_dir):
            os.mkdir(ratt_dir)
        # Build dictionary
        text_list, label_list = self._read_csv(self.raw_data_file)

        # Token vocab
        token_vocab_name = "ratt"
        vocab_file = os.path.join(ratt_dir, token_vocab_name + "-vocab.txt")
        if not os.path.isfile(vocab_file):
            tokenizer = BertWordPieceTokenizer(lowercase=do_lower)
            tokenizer.train(files=[self.raw_data_file], vocab_size=8192)
            tokenizer.save_model(ratt_dir, token_vocab_name)
        else:
            tokenizer = BertWordPieceTokenizer(vocab_file=vocab_file,
                                               lowercase=do_lower)

        # Label vocab
        label_vocab_file = os.path.join(ratt_dir, "label_dict.txt")
        if not os.path.isfile(label_vocab_file):
            labels = set(label_list)
            label_map = {str(l): i for i, l in enumerate(labels)}
            with open(label_vocab_file, "w", encoding="utf-8") as fout:
                for l in labels:
                    fout.write("%s\n" % l)
        else:
            label_map = {}
            with open(label_vocab_file, encoding="utf-8") as fin:
                for i, line in enumerate(fin):
                    label_map[line.rstrip()] = i

        if data_type not in ["train", "dev", "test"]:
            data_types = ["train", "dev", "test"]
        else:
            data_types = [data_type]

        for data_type in data_types:
            raw_file = getattr(self, "raw_%s_file" % data_type)
            logging.info("Converting %s..", raw_file)
            text_list, label_list = self._read_csv(raw_file)

            outputs = tokenizer.encode_batch(text_list,
                                             add_special_tokens=True)
            input_ids = [output.ids for output in outputs]
            padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
                input_ids,
                padding="post",
                maxlen=max_sequence_length,
                truncating="post")

            label_ids = [label_map[str(label)] for label in label_list]
            save_file = os.path.join(ratt_dir, data_type + ".npz")
            np.savez(save_file, inputs=padded_inputs, targets=label_ids)
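For reference, a minimal sketch of reading one of the .npz files written above back into memory; the path is illustrative and assumes ratt_dir was "ratt".

import numpy as np

# Hypothetical path following the save_file pattern above (ratt_dir assumed to be "ratt").
data = np.load("ratt/train.npz")
padded_inputs, label_ids = data["inputs"], data["targets"]
print(padded_inputs.shape, label_ids.shape)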
Code Example #6
    def generate_custom_vocab(self):

        try:
            tokenizer = None
            # root dir path check and generate
            if not os.path.isdir(self.vocab_root_dir):
                os.makedirs(self.vocab_root_dir, exist_ok=True)

            # generate models directory
            self.vocab_dir = '/BERT_TRAINING_VOCAB_' + self.getCurrent_time()[2] + '/'
            os.makedirs(self.vocab_root_dir + self.vocab_dir, exist_ok=True)

            user_defined_symbols = ['[BOS]', '[EOS]', '[UNK]', '[UNK1]', '[UNK2]', '[UNK3]', '[UNK4]', '[UNK5]',
                                    '[UNK6]', '[UNK7]', '[UNK8]', '[UNK9]']
            unused_token_num = 200
            unused_list = ['[unused{}]'.format(n) for n in range(unused_token_num)]
            user_defined_symbols = user_defined_symbols + unused_list

            if self.tokenizer_type == 'word':
                # if lowercase is False, strip_accents must also be set to False
                tokenizer = BertWordPieceTokenizer(strip_accents=False,
                                                   lowercase=True,
                                                   clean_text=True,
                                                   handle_chinese_chars=True,
                                                   wordpieces_prefix="##"
                                                   )

            # when 'base' is selected, the pretrained bert-base-uncased tokenizer is used instead, so there is nothing to train here

            # training vocab start
            corpus_file = [self.corpus_path]
            vocab_size = 32000
            limit_alphabet = 6000
            min_frequency = 3
            tokenizer.train(files=corpus_file,
                            vocab_size=vocab_size,
                            special_tokens=user_defined_symbols,  # replaces the default [PAD]/[UNK]/[CLS]/[SEP]/[MASK] set
                            min_frequency=min_frequency,  # minimum number of occurrences per word, here 3
                            limit_alphabet=limit_alphabet,  # must be removed when training a ByteLevelBPETokenizer
                            show_progress=True)

            self.setPrint('Custom tokenizer training is completed')

            sentence = '전화 통화가 정상적으로 안됨.'
            output = tokenizer.encode(sentence)
            self.setPrint('Tokenizer test sentence: {}'.format(sentence))
            self.setPrint('Tokenizer analysis result\n=>idx: {}\n=>tokens: {}\n=>offset: {}\n=>decode: {}\n'.
                          format(output.ids, output.tokens, output.offsets, tokenizer.decode(output.ids)))

            # save tokenizer
            tokenizer.save_model(self.vocab_root_dir + self.vocab_dir)

        except Exception:
            self.setPrint('Error: {}. {}, line: {}'.format(sys.exc_info()[0],
                                                           sys.exc_info()[1],
                                                           sys.exc_info()[2].tb_lineno))
Code Example #7
def build_vocab(args):
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    special_tokens += ["[unused{}]".format(idx) for idx in range(args.unused_size)]

    if args.tokenizer_model == "mecab_wordpiece":
        mecab_wordpiece_notag_trainer = KoNLPyBertWordPieceTrainer(
            Mecab(), use_tag=False
        )
        mecab_wordpiece_notag_trainer.train(
            files=[args.corpus],
            vocab_size=args.vocab_size,
            min_frequency=args.min_frequency,
            limit_alphabet=args.limit_alphabet,
            special_tokens=special_tokens,
        )
        mecab_wordpiece_notag_trainer.save_model(
            "./data/vocab/mecab_normalize_{}".format(args.vocab_size), "notag"
        )

    elif args.tokenizer_model == "wordpiece":
        tokenizer = BertWordPieceTokenizer(
            vocab=None,
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=False,
            lowercase=False,
            wordpieces_prefix="##",
        )

        tokenizer.train(
            files=[args.corpus],
            vocab_size=args.vocab_size,
            min_frequency=args.min_frequency,
            limit_alphabet=args.limit_alphabet,
            special_tokens=special_tokens,
        )

        tokenizer.save_model("./data/vocab/wordpiece")

    else:
        logger.info("tokenizer model : wordpiece / mecab_wordpiece")
        sys.exit(1)
Code Example #8
def train_tokenizer(filename, params):
    """
    Train a BertWordPieceTokenizer with the specified params and save it
    """
    # Get tokenization params
    save_location = params["tokenizer_path"]
    max_length = params["max_length"]
    min_freq = params["min_freq"]
    vocabsize = params["vocab_size"]

    # lowercase must be set at construction time; assigning do_lower_case
    # afterwards does not change the underlying normalizer.
    tokenizer = BertWordPieceTokenizer(lowercase=False)
    special_tokens = ["[S]", "[PAD]", "[/S]", "[UNK]", "[MASK]", "[SEP]", "[CLS]"]
    tokenizer.train(files=[filename], vocab_size=vocabsize,
                    min_frequency=min_freq, special_tokens=special_tokens)

    tokenizer._tokenizer.post_processor = BertProcessing(("[SEP]", tokenizer.token_to_id("[SEP]")), ("[CLS]", tokenizer.token_to_id("[CLS]")),)
    tokenizer.enable_truncation(max_length=max_length)

    print("Saving tokenizer ...")
    if not os.path.exists(save_location):
        os.makedirs(save_location)
    tokenizer.save_model(save_location)
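A minimal sketch of exercising the saved vocab with the same BertProcessing setup; the directory name and sample sentence are assumptions, not part of the original params dict.

import os
from tokenizers import BertWordPieceTokenizer
from tokenizers.processors import BertProcessing

save_location = "tokenizer_out"  # hypothetical stand-in for params["tokenizer_path"]
tok = BertWordPieceTokenizer(os.path.join(save_location, "vocab.txt"), lowercase=False)
tok._tokenizer.post_processor = BertProcessing(
    ("[SEP]", tok.token_to_id("[SEP]")),
    ("[CLS]", tok.token_to_id("[CLS]")),
)
enc = tok.encode("a quick smoke test")
print(enc.tokens)  # expected to start with [CLS] and end with [SEP]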
Code Example #9
def train_tokenizer(file_iterator):

    # Initialize an empty tokenizer
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )

    # And then train
    tokenizer.train_from_iterator(
        file_iterator,
        vocab_size=1000,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=1000,
        wordpieces_prefix="##",
    )

    # Save the files
    tokenizer.save_model(args.out, args.name)
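train_from_iterator consumes any iterator of raw strings; a minimal sketch of such an iterator over plain-text files (the file names are assumptions) is shown here.

def iter_lines(paths):
    # Yield one non-empty, stripped line at a time from each file.
    for path in paths:
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    yield line

# Hypothetical usage with the function above:
# train_tokenizer(iter_lines(["corpus_part1.txt", "corpus_part2.txt"]))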
Code Example #10
File: build_tokenizer.py  Project: SH-NLP/shin_bert
def main(args):
    print(args)
    if args['train']:
        tokenizer = BertWordPieceTokenizer(
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=True,  # Must be False if cased model
            lowercase=True,
            wordpieces_prefix="##"
        )

        tokenizer.train(
            files=['/data2/BERT/data/naver_news/news_3_preprocessed/naver_news.txt'],
            limit_alphabet=6000,
            vocab_size=32000
        )

        print(tokenizer.save_model("../BertWordPieceTokenizer_32000"))

    elif args['test']:
        test_str = '나는 워드피스 토크나이저를 써요. 성능이 좋은지 테스트 해보려 합니다.'

        print("=========== tokenizer ===========")
        tokenizer = BertWordPieceTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str.ids)
        decoded_str = tokenizer.decode(encoded_str.ids)
        print(decoded_str)

        print("=========== BertTokenizer ===========")
        tokenizer = BertTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)

        print("=========== BertTokenizer2 ===========")
        tokenizer = BertTokenizer.from_pretrained("../BertWordPieceTokenizer_32000")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)
Code Example #11
from tokenizers import BertWordPieceTokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=False,
    handle_chinese_chars=False,
    strip_accents=False,  # Must be False if cased model
    lowercase=True,
    wordpieces_prefix="##")
path = '../data'
tokenizer.train(files=[
    '../data/wiki/train.txt', '../data/news/train.txt',
    '../data/twitter/train.txt', '../data/books/train.txt'
],
                vocab_size=30000)
tokenizer.save_model(".", "bert")
Code Example #12
parser.add_argument('--txtfolder',
                    type=str,
                    help='the FOLDER where are those txt files')
args = parser.parse_args()

paths = [str(x) for x in Path(str(args.txtfolder)).glob("**/*.txt")]

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()

#trainer = BpeTrainer(vocab_size= VOCAB_SIZE, show_progress=True, initial_alphabet=ByteLevel.alphabet())
#tokenizer.train(trainer, paths)
# Customize training
'''
tokenizer._tokenizer.post_processor = BertProcessing(("[CLS]", tokenizer.token_to_id("[CLS]")),
                                                     ("[SEP]", tokenizer.token_to_id("[SEP]")),
                                                     )
                                                    '''
tokenizer.train(files=paths,
                vocab_size=VOCAB_SIZE,
                min_frequency=2,
                special_tokens=[
                    "[PAD]",
                    "[UNK]",
                    "[CLS]",
                    "[SEP]",
                    "[MASK]",
                ])
print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))
tokenizer.save_model('./lm_model')
print('tokenizer saved; the output file is vocab.txt')
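If the commented-out post-processor block above is ever re-enabled, note that BertProcessing takes the [SEP] pair first and the [CLS] pair second (see also Code Example #8); a hedged sketch with the order corrected:

from tokenizers.processors import BertProcessing

tokenizer._tokenizer.post_processor = BertProcessing(
    ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ("[CLS]", tokenizer.token_to_id("[CLS]")),
)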
Code Example #13
from tokenizers import BertWordPieceTokenizer
from glob import glob

tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=False,
)
files = glob('splitted/*')
# train() modifies the tokenizer in place and returns None
tokenizer.train(
    files,
    vocab_size=32000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

tokenizer.save_model('./', 'bert-standard')
Code Example #14
import argparse
import os
from pathlib import Path
from tokenizers import BertWordPieceTokenizer

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Process code into pre-trained tokenizer.")
    parser.add_argument("--vocab_size", type=int, default=8000)
    parser.add_argument("--text_file_path", type=str)
    parser.add_argument("--out_path", type=str)
    parser.add_argument("--out_name", type=str)
    args = parser.parse_args()

    paths = [str(x) for x in Path(args.text_file_path).glob("**/*")]
    tokenizer = BertWordPieceTokenizer(clean_text=True,
                                       lowercase=False,
                                       strip_accents=True)
    tokenizer.train(
        files=paths,
        vocab_size=args.vocab_size,
        min_frequency=2,
        special_tokens=[
            "<s>", "</s>", "<cls>", "<pad>", "<unk>", "<mask>", "[CLS]",
            "[SEP]", "[MASK]", "[EOL]", "[URL]", "[PAD]", "[UNK]"
        ],
    )
    tokenizer.save_model(args.out_path, args.out_name)
    tokenizer.save(os.path.join(args.out_path, "vocab.json"))
Code Example #15
# use this when training a WordPiece tokenizer from scratch
from pathlib import Path

from tokenizers import BertWordPieceTokenizer

paths = ['../../data/jw300.en-tw.tw',
         '../../data/asante_twi_bible.txt']  # dataset location

# Initialize a tokenizer (note: lowercase defaults to True, so this produces
# an uncased vocab despite the "cased" output directory name below)
tokenizer = BertWordPieceTokenizer()

# Customize training
tokenizer.train(
    paths,
    vocab_size=30000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save tokenizer to disk - make sure these directories exist
tokenizer.save_model("distilabena-base-v2-akuapem-twi-cased")  # akuapem
Code Example #16
from pathlib import Path

#from tokenizers import ByteLevelBPETokenizer
from tokenizers import BertWordPieceTokenizer

paths = ['../../results_file_clean.txt']

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()

# Customize training.  Note: these RoBERTa-style special tokens replace the
# default [PAD]/[UNK]/[CLS]/[SEP]/[MASK] set that BertWordPieceTokenizer
# otherwise expects to find in its vocab.
tokenizer.train(files=paths, vocab_size=50_000, min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

tokenizer.save_model('.')

Code Example #17
    wp_tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=False,
        lowercase=False,
    )

    wp_tokenizer.train(
        files='/opt/ml/code/KBOBERT/KBOBERT_Data.txt',
        vocab_size=32000,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        wordpieces_prefix="##")

    wp_tokenizer.save_model('./')

    tokenizer = BertTokenizerFast(
        vocab_file="/opt/ml/code/KBOBERT/vocab.txt",
        max_len=512,
        do_lower_case=False,
    )

    tokenizer.add_special_tokens({'mask_token': '[MASK]'})

    # https://huggingface.co/transformers/model_doc/bert.html#bertconfig

    config = BertConfig(vocab_size=32000,
                        hidden_size=256,
                        num_hidden_layers=6,
                        num_attention_heads=4,
Code Example #18
def preprocessing(args):

    start_time = time.time()

    print('Start preprocessing!')

    #===================================#
    #=============Data Load=============#
    #===================================#

    # 1) Comment data open
    train = pd.read_csv(os.path.join(args.data_path, 'train.hate.csv'))
    valid = pd.read_csv(os.path.join(args.data_path, 'dev.hate.csv'))
    test = pd.read_csv(os.path.join(args.data_path, 'test.hate.no_label.csv'))

    # 2) Title data open
    with open(os.path.join(args.data_path, 'train.news_title.txt'), 'r') as f:
        train_title = [x.replace('\n', '') for x in f.readlines()]
    with open(os.path.join(args.data_path, 'dev.news_title.txt'), 'r') as f:
        valid_title = [x.replace('\n', '') for x in f.readlines()]
    with open(os.path.join(args.data_path, 'test.news_title.txt'), 'r') as f:
        test_title = [x.replace('\n', '') for x in f.readlines()]

    # 3) Unlabeld data open
    if args.unlabeled_data_processing:
        unlabel_title = pd.read_csv(os.path.join(
            args.data_path, 'unlabeled_comments.news_title.txt'),
                                    names=['title'])
        unlabel_comments = pd.read_csv(os.path.join(args.data_path,
                                                    'unlabeled_comments.txt'),
                                       names=['comments'])

    # 4) Path setting
    if not os.path.exists(args.preprocess_path):
        os.mkdir(args.preprocess_path)

    #===================================#
    #=============Tokenizer=============#
    #===================================#

    print('Tokenizer setting...')

    # 1) Tokenizer open
    if args.custom_training_tokenizer:
        tokenizer = BertWordPieceTokenizer(lowercase=False)
        unlabeled_file = os.path.join(args.preprocess_path, 'unlabeled.txt')
        title_list = unlabel_title['title'].tolist()
        comment_list = unlabel_comments['comments'].tolist()
        with open(unlabeled_file, 'w') as f:
            for title, comment in zip(title_list, comment_list):
                f.write(title)
                f.write('\n')
                f.write(comment)
                f.write('\n')
        tokenizer.train(
            [unlabeled_file],
            vocab_size=args.vocab_size,
            limit_alphabet=args.limit_alphabet)
        tokenizer.save_model(args.preprocess_path)
    else:
        tokenizer = BertTokenizer.from_pretrained('beomi/kcbert-base')

    #===================================#
    #=============Cleansing=============#
    #===================================#

    print('Cleansing...')

    # 1) Regular expression compile
    emojis = ''.join(emoji.UNICODE_EMOJI.keys())
    pattern = re.compile(f'[^ .,?!/@$%~%·∼()\x00-\x7Fㄱ-힣{emojis}]+')
    url_pattern = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    )

    # 2) Definition clean
    def clean(x):
        x = pattern.sub(' ', x)
        x = url_pattern.sub('', x)
        x = x.strip()
        x = repeat_normalize(x, num_repeats=2)
        return x

    def encoding_text(list_x, tokenizer):
        encoded_text_list = list_x.map(lambda x: tokenizer.encode(
            clean(str(x)), max_length=args.max_len, truncation=True))
        return encoded_text_list

    # 3) Preprocess comments
    train['comments'] = encoding_text(train['comments'], tokenizer)
    valid['comments'] = encoding_text(valid['comments'], tokenizer)
    test['comments'] = encoding_text(test['comments'], tokenizer)

    # 4) Title parsing
    train['title'] = encoding_text(pd.Series(train_title), tokenizer)
    valid['title'] = encoding_text(pd.Series(valid_title), tokenizer)
    test['title'] = encoding_text(pd.Series(test_title), tokenizer)

    # 5) Unlabel data parsing
    if args.unlabeled_data_processing:
        unlabel_title = encoding_text(unlabel_title['title'], tokenizer)
        unlabel_comments = encoding_text(unlabel_comments['comments'],
                                         tokenizer)

    #===================================#
    #==========Label processing=========#
    #===================================#

    print('Label processing...')

    train.replace({'label': {
        'none': 0,
        'offensive': 1,
        'hate': 2
    }},
                  inplace=True)
    valid.replace({'label': {
        'none': 0,
        'offensive': 1,
        'hate': 2
    }},
                  inplace=True)

    #===================================#
    #==============Saving===============#
    #===================================#

    # 1) Print status
    print('Parsed sentence save setting...')

    max_train_len = max([len(x) for x in train['comments']])
    max_valid_len = max([len(x) for x in valid['comments']])
    max_test_len = max([len(x) for x in test['comments']])

    max_train_title_len = max([len(x) for x in train['title']])
    max_valid_title_len = max([len(x) for x in valid['title']])
    max_test_title_len = max([len(x) for x in test['title']])

    if args.unlabeled_data_processing:
        max_unlabel_title_len = max([len(x) for x in unlabel_title])
        max_unlabel_comments_len = max([len(x) for x in unlabel_comments])

    print(
        f'Train data max length => title: {max_train_title_len} | comment: {max_train_len}',
        end=' | ')
    print(f'total: {max_train_len + max_train_title_len}')
    print(
        f'Valid data max length => title: {max_valid_title_len} | comment: {max_valid_len}',
        end=' | ')
    print(f'total: {max_valid_len + max_valid_title_len}')
    print(
        f'Test data max length => title: {max_test_title_len} | comment: {max_test_len}',
        end=' | ')
    print(f'total: {max_test_len + max_test_title_len}')
    if args.unlabeled_data_processing:
        print(
            f'Unlabel data max length => title: {max_unlabel_title_len} | comment: {max_unlabel_comments_len}',
            end=' | ')
        print(f'total: {max_unlabel_title_len + max_unlabel_comments_len}')

    # 2) Training pickle saving
    with open(os.path.join(args.preprocess_path, 'processed.pkl'), 'wb') as f:
        pickle.dump(
            {
                'train_comment_indices': train['comments'].tolist(),
                'valid_comment_indices': valid['comments'].tolist(),
                'train_title_indices': train['title'].tolist(),
                'valid_title_indices': valid['title'].tolist(),
                'train_label': train['label'].tolist(),
                'valid_label': valid['label'].tolist()
            }, f)

    # 3) Test pickle saving
    with open(os.path.join(args.preprocess_path, 'test_processed.pkl'),
              'wb') as f:
        pickle.dump(
            {
                'test_comment_indices': test['comments'].tolist(),
                'test_title_indices': test['title'].tolist(),
            }, f)

    # 4) Unlabeled pickle saving
    if args.unlabeled_data_processing:
        with open(
                os.path.join(args.preprocess_path, 'unlabeled_processed.pkl'),
                'wb') as f:
            pickle.dump(
                {
                    'unlabel_title': unlabel_title,
                    'unlabel_comments': unlabel_comments,
                }, f)

    print(f'Done! {round((time.time()-start_time)/60, 3)} min spent')
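A minimal sketch of reading the training pickle written above back in; the directory name is a hypothetical stand-in for args.preprocess_path.

import os
import pickle

preprocess_path = "preprocess"  # hypothetical stand-in for args.preprocess_path
with open(os.path.join(preprocess_path, 'processed.pkl'), 'rb') as f:
    processed = pickle.load(f)
print(len(processed['train_comment_indices']), len(processed['train_label']))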
Code Example #19
    def pre_tokenize(self, text):
        return self.moses_tokenizer.tokenize(self.mpn.normalize(text.lower() if self.do_lowercase else text))


if __name__ == '__main__':
    lang = 'fr'
    clean_text = False
    handle_chinese_chars = True
    strip_accents = False
    lowercase = True
    vocab_size = 30000
    min_frequency = 2
    spt = ["<s>", "<pad>", "</s>", "<unk>", "<mask>", "[UNK]", "[SEP]", "[CLS]", "[PAD]", "[MASK]"]
    if lang == "fr":
        train_data = "../.data/wmt19_de_fr/train.fr"
    elif lang == "en":
        train_data = "../.data/wmt19_en_de/train.en"
    else:
        raise ValueError("Undefined language {}".format(lang))

    tokenizer = BertWordPieceTokenizer(clean_text=clean_text, lowercase=lowercase,
                                       handle_chinese_chars=handle_chinese_chars, strip_accents=strip_accents)
    tokenizer.pre_tokenizer = MosesPreTokenizer(lang, lowercase)

    # Customize training
    print("Starting to train ...")
    tokenizer.train(files=train_data, vocab_size=vocab_size, show_progress=True, min_frequency=min_frequency, special_tokens=spt)
    # Save files to disk
    tokenizer.save_model(".", "moses-pre-tokenized-wmt-uncased-{}".format(lang))
Code Example #20
File: _base.py  Project: sycomix/odin-ai
 def tokenizer(
         self) -> Union[BaseTokenizer, CountVectorizer, TfidfVectorizer]:
     pkl_path = os.path.join(self.tokenizer_path, "model.pkl")
     if self._tokenizer is not None:
         return self._tokenizer
     ### get pickled tokenizer
     if os.path.exists(pkl_path) and not self.retrain_tokenizer:
         with open(pkl_path, 'rb') as f:
             tokenizer = pickle.load(f)
     ### train new tokenizer
     else:
         self.retrain_tokenizer = False
         if self.algorithm == 'bert':
             from tokenizers import BertWordPieceTokenizer
             tokenizer = BertWordPieceTokenizer(
                 vocab_file=None if self._init_vocabulary is None else os.
                 path.join(self.cache_path, "bert_vocab.txt"))
             tokenizer.enable_truncation(max_length=self.max_length)
             tokenizer.enable_padding(length=self.max_length)
             # train the tokenizer
             if self._init_vocabulary is None:
                 path = os.path.join(self.cache_path, 'train.txt')
                 with open(path, 'w') as f:
                     for i in chain(self.train_text, self.valid_text,
                                    self.test_text):
                         if len(i) == 0:
                             continue
                         f.write(i + "\n" if i[-1] != "\n" else i)
                 tokenizer.train(files=path,
                                 vocab_size=self.vocab_size,
                                 min_frequency=self.min_frequency,
                                 limit_alphabet=self.limit_alphabet,
                                 show_progress=True)
             tokenizer.save_model(self.tokenizer_path)
         elif self.algorithm in ('count', 'tf', 'tfidf'):
             if self.algorithm == 'count':
                 tokenizer = CountVectorizer(
                     input='content',
                     ngram_range=self.ngram_range,
                     min_df=self.min_frequency,
                     max_df=self.max_frequency,
                     max_features=self.vocab_size,
                     vocabulary=self._init_vocabulary,
                     tokenizer=_simple_tokenizer,
                     stop_words='english')
             elif self.algorithm in ('tf', 'tfidf'):
                 tokenizer = TfidfVectorizer(
                     input='content',
                     ngram_range=self.ngram_range,
                     min_df=self.min_frequency,
                     max_df=self.max_frequency,
                     max_features=self.vocab_size,
                     stop_words='english',
                     vocabulary=self._init_vocabulary,
                     tokenizer=_simple_tokenizer,
                     use_idf=False if self.algorithm == 'tf' else True)
             tokenizer.fit((_simple_preprocess(i) for i in chain(
                 self.train_text, self.valid_text, self.test_text)))
         else:
             raise NotImplementedError
         # save the pickled model
         with open(pkl_path, "wb") as f:
             pickle.dump(tokenizer, f)
     ### assign and return
     self._tokenizer = tokenizer
     return self._tokenizer
Code Example #21
import os
from tokenizers import BertWordPieceTokenizer
from pathlib import Path

save_dir = "vocab"
paths = [
    str(x) for x in Path("/home/phmay/data/nlp/corpus/ready/").glob("*.txt")
]
print(paths)
vocab_size = 32_767  # 2^15-1
min_frequency = 2

os.makedirs(save_dir, exist_ok=True)

special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

for i in range(767 - 5):  # 762 [unused] tokens, so special tokens total 767
    special_tokens.append('[unused{}]'.format(i))

# https://github.com/huggingface/tokenizers/blob/04fb9e4ebe785a6b2fd428766853eb27ee894645/bindings/python/tokenizers/implementations/bert_wordpiece.py#L11
tokenizer = BertWordPieceTokenizer(strip_accents=False)
tokenizer.train(
    files=paths,
    vocab_size=vocab_size,
    min_frequency=min_frequency,
    special_tokens=special_tokens,
)

tokenizer.save_model(save_dir)
tokenizer.save(save_dir + "/tokenizer.json")
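Note that the two save calls above do different things: save_model() writes only the WordPiece vocab.txt, while save() serializes the full tokenizer (normalizer, pre-tokenizer, special tokens) to a single JSON file. A minimal sketch of reloading the latter:

from tokenizers import Tokenizer

# Reload the complete tokenizer from the JSON written by tokenizer.save() above.
reloaded = Tokenizer.from_file("vocab/tokenizer.json")
print(reloaded.get_vocab_size())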
Code Example #22
                    type=str,
                    help="The name of the output vocab files")
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    print(f"File does not exist: {args.files}")
    exit(1)

# Initialize an empty tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
)

# And then train
tokenizer.train(
    files,
    vocab_size=10000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save the files
tokenizer.save_model(args.out, args.name)
Code Example #23
    def train_tokenizer(self, train_files, tokenizer_name=None, output_dir=None, use_trained_tokenizer=True):
        """
        Train a new tokenizer on `train_files`.

        Args:

        - train_files: List of files to be used when training the tokenizer.

        - tokenizer_name: Name of a pretrained tokenizer or a path to a directory containing a tokenizer.

        - output_dir (optional): The directory where model files will be saved. If not given, self.args.output_dir
        will be used.

        - use_trained_tokenizer (optional): Load the trained tokenizer once training completes.

        Returns: None
        """

        if not self.args.vocab_size:
            raise AttributeError(
                "Cannot train a new tokenizer as vocab_size is not specified in args dict. "
                "Either provide a tokenizer or specify vocab_size."
            )

        if not isinstance(train_files, list):
            train_files = [train_files]

        if not output_dir:
            output_dir = self.args.output_dir

        if self.args.model_type in ["bert", "electra"]:
            tokenizer = BertWordPieceTokenizer(
                clean_text=self.args.clean_text,
                handle_chinese_chars=self.args.handle_chinese_chars,
                strip_accents=self.args.strip_accents,
                lowercase=self.args.do_lower_case,
            )
            self.args.special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
            self.args.wordpieces_prefix = "##"

            tokenizer.train(
                files=train_files,
                vocab_size=self.args.vocab_size,
                min_frequency=self.args.min_frequency,
                special_tokens=self.args.special_tokens,
                wordpieces_prefix="##",
            )
        else:
            tokenizer = ByteLevelBPETokenizer(lowercase=self.args.do_lower_case)

            tokenizer.train(
                files=train_files,
                vocab_size=self.args.vocab_size,
                min_frequency=self.args.min_frequency,
                special_tokens=self.args.special_tokens,
            )

        os.makedirs(output_dir, exist_ok=True)

        tokenizer.save_model(output_dir)
        logger.info(" Training of {} tokenizer complete. Saved to {}.".format(tokenizer_name, output_dir))

        _, _, tokenizer_class = MODEL_CLASSES[self.args.model_type]
        tokenizer = tokenizer_class.from_pretrained(output_dir)

        if use_trained_tokenizer:
            self.tokenizer = tokenizer
            self.args.tokenizer_name = output_dir
            try:
                if self.args.model_type == "electra":
                    model_to_resize = (
                        self.model.generator_model.module
                        if hasattr(self.model.generator_model, "module")
                        else self.model.generator_model
                    )
                    model_to_resize.resize_token_embeddings(len(self.tokenizer))

                    model_to_resize = (
                        self.model.discriminator_model.module
                        if hasattr(self.model.discriminator_model, "module")
                        else self.model.discriminator_model
                    )
                    model_to_resize.resize_token_embeddings(len(self.tokenizer))

                model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
                model_to_resize.resize_token_embeddings(len(self.tokenizer))
            except AttributeError:
                pass
Code Example #24
        handle_chinese_chars=False,
        strip_accents=False,
        lowercase=False,
    )
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

    tokenizer.train(
        files=train_files,
        vocab_size=32000,
        min_frequency=2,
        special_tokens=special_tokens,
        limit_alphabet=500,
        wordpieces_prefix="##",
    )

    tokenizer.save_model('./vocab')

vocab_file = f'./vocab/{os.listdir("./vocab")[0]}'
print(vocab_file)

with open(vocab_file) as f:
    for vocab_size, _ in enumerate(f, 1):
        pass

print(f'Vocab size: {vocab_size}')

ELECTRA_SMALL_DEFAULT = {
    'generator_config': {
        "attention_probs_dropout_prob": 0.1,
        "embedding_size": 128,
        "hidden_act": "gelu",
Code Example #25
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='train wordpiece tokenizer')
    parser.add_argument("--input_file", type=str, default="")
    parser.add_argument("--vocab_size", type=int, default="40000")
    parser.add_argument(
        "--limit_alphabet", type=int,
        default="1000")  # 30000 for chinese and similar languages
    parser.add_argument("--output_path", type=str, default="")
    args = parser.parse_args()

    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(clean_text=True,
                                       handle_chinese_chars=True,
                                       strip_accents=True,
                                       lowercase=False)

    tokenizer.train(
        files=[args.input_file],
        vocab_size=args.vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=args.limit_alphabet,
        wordpieces_prefix="##",
    )

    # Save files to disk
    tokenizer.save_model(args.output_path)