def __init__(self, filename, tokenizer, max_seq_length, short_seq_prob,
             masked_lm_prob, max_predictions_per_seq, vocab):
    logging.debug('start to load file %s ...', filename)
    instances = create_training_instances([filename], tokenizer, max_seq_length,
                                          short_seq_prob, masked_lm_prob,
                                          max_predictions_per_seq, vocab,
                                          nworker=1)
    super(BERTPretrainDataset, self).__init__(*instances)
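# Usage sketch for the dataset above (names and file are assumptions, not from
# the source): build a vocabulary and tokenizer with gluon-nlp, then load one
# hypothetical one-sentence-per-line shard 'part-001.txt'.
import gluonnlp as nlp

_, vocab = nlp.model.get_model('bert_12_768_12',
                               dataset_name='book_corpus_wiki_en_uncased',
                               pretrained=False)
tokenizer = nlp.data.BERTTokenizer(vocab, lower=True)
dataset = BERTPretrainDataset('part-001.txt', tokenizer, max_seq_length=128,
                              short_seq_prob=0.1, masked_lm_prob=0.15,
                              max_predictions_per_seq=20, vocab=vocab)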
def generate_dev_set(tokenizer, vocab, cache_file, args):
    """Generate validation set."""
    # set random seeds to generate dev data deterministically
    np.random.seed(0)
    random.seed(0)
    mx.random.seed(0)
    worker_pool = multiprocessing.Pool()
    eval_files = nlp.utils.glob(args.data_eval)
    num_files = len(eval_files)
    assert num_files > 0, 'Number of eval files must be greater than 0. ' \
                          'Only found %d files at %s' % (num_files, args.data_eval)
    logging.info('Generating validation set from %d files on rank 0.', len(eval_files))
    create_training_instances((eval_files, tokenizer, args.max_seq_length,
                               args.short_seq_prob, args.masked_lm_prob,
                               args.max_predictions_per_seq, args.whole_word_mask,
                               vocab, 1, args.num_data_workers,
                               worker_pool, cache_file))
    logging.info('Done generating validation set on rank 0.')
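# Hedged usage sketch for generate_dev_set: `args` only needs the attributes
# read above; the SimpleNamespace construction, glob pattern, and cache file
# name are illustrative assumptions.
from types import SimpleNamespace

args = SimpleNamespace(data_eval='eval-data/*.txt', max_seq_length=128,
                       short_seq_prob=0.1, masked_lm_prob=0.15,
                       max_predictions_per_seq=20, whole_word_mask=True,
                       num_data_workers=4)
generate_dev_set(tokenizer, vocab, 'eval-cache', args)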
def __init__(self, filename, tokenizer, max_seq_length, short_seq_prob,
             masked_lm_prob, max_predictions_per_seq, vocab,
             num_workers=1, worker_pool=None):
    logging.debug('start to load file %s ...', filename)
    dupe_factor = 1
    instances = create_training_instances(([filename], tokenizer, max_seq_length,
                                           short_seq_prob, masked_lm_prob,
                                           max_predictions_per_seq, vocab,
                                           dupe_factor, num_workers,
                                           worker_pool, None))
    super(BERTPretrainDataset, self).__init__(*instances)
def prepare_pretrain_text_dataset(filename, tokenizer, max_seq_length, short_seq_prob,
                                  masked_lm_prob, max_predictions_per_seq,
                                  whole_word_mask, random_next_sentence, vocab):
    """Create a dataset based on raw text files."""
    dupe_factor = 1
    if not isinstance(filename, (list, tuple)):
        filename = [filename]
    logging.debug('start to load files %s ...', filename)
    instances = create_training_instances((filename, tokenizer, max_seq_length,
                                           short_seq_prob, masked_lm_prob,
                                           max_predictions_per_seq, whole_word_mask,
                                           vocab, dupe_factor, 1,
                                           None, None, random_next_sentence))
    return mx.gluon.data.ArrayDataset(*instances)
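# Minimal sketch (assumed setup, illustrative file name): build the dataset
# from one raw text file and inspect it; each item packs the fields of one
# training instance.
dataset = prepare_pretrain_text_dataset('part-001.txt', tokenizer,
                                        max_seq_length=128, short_seq_prob=0.1,
                                        masked_lm_prob=0.15,
                                        max_predictions_per_seq=20,
                                        whole_word_mask=True,
                                        random_next_sentence=True, vocab=vocab)
print('number of instances:', len(dataset))
first_instance = dataset[0]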
def loop(files):
    # `directory` is assumed to be a module-level output directory.
    client = storage.Client()
    bucket = client.bucket('mesolitica-tpu-general')
    files, index, postfix = files
    output_files = f'{directory}/albert-{index}-{postfix}.tfrecord'
    print(f'Output filename: {output_files}')
    files = ','.join(files)
    tokenizer = tokenization.FullTokenizer(
        vocab_file='sp10m.cased.albert.vocab',
        do_lower_case=False,
        spm_model_file='sp10m.cased.albert.model',
    )
    input_files = []
    for input_pattern in files.split(','):
        input_files.extend(tf.gfile.Glob(input_pattern))
    tf.logging.info('*** Reading from input files ***')
    for input_file in input_files:
        tf.logging.info(' %s', input_file)
    rng = random.Random(random.randint(1, 999999))
    instances = create_training_instances(
        input_files,
        tokenizer,
        max_seq_length=128,
        dupe_factor=2,
        short_seq_prob=0.1,
        masked_lm_prob=0.15,
        max_predictions_per_seq=20,
        rng=rng,
    )
    tf.logging.info('number of instances: %i', len(instances))
    write_instance_to_example_files(
        instances,
        tokenizer,
        max_seq_length=128,
        max_predictions_per_seq=20,
        output_files=output_files.split(','),
    )
    blob = bucket.blob(f'albert-data/{output_files}')
    blob.upload_from_filename(output_files)
    os.system(f'rm {output_files}')
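# Hypothetical driver for loop() above (not from the source): shard the corpus
# file list into chunks and fan the chunks out across worker processes. The
# glob pattern, chunk size, and pool size are assumptions.
import multiprocessing

def chunks(file_list, n):
    # yield (files, index, postfix) tuples matching loop()'s unpacking
    for i in range(0, len(file_list), n):
        yield (file_list[i:i + n], i // n, 0)

if __name__ == '__main__':
    all_files = tf.gfile.Glob('dumping-*.txt')
    with multiprocessing.Pool(6) as pool:
        pool.map(loop, chunks(all_files, 20))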
def loop(files):
    # `tokenizer` and `directory` are assumed to be defined at module level,
    # as in the ALBERT variant above.
    client = storage.Client()
    bucket = client.bucket('mesolitica-tpu-general')
    input_files, index = files
    output_file = f'{directory}/bert-{index}.tfrecord'
    print('*** Reading from input files ***')
    for input_file in input_files:
        print(input_file)
    max_seq_length = 128
    dupe_factor = 5
    max_predictions_per_seq = 20
    masked_lm_prob = 0.15
    short_seq_prob = 0.1
    rng = random.Random(12345)
    instances = create_training_instances(
        input_files,
        tokenizer,
        max_seq_length,
        dupe_factor,
        short_seq_prob,
        masked_lm_prob,
        max_predictions_per_seq,
        rng,
    )
    print('*** Writing to output files ***')
    write_instance_to_example_files(
        instances,
        tokenizer,
        max_seq_length,
        max_predictions_per_seq,
        [output_file],
    )
    blob = bucket.blob(f'bert-data/{output_file}')
    blob.upload_from_filename(output_file)
    os.system(f'rm {output_file}')
def create_pretraining_data_from_docs(docs, save_path, vocab_path,
                                      token_method='wordpiece', language='en',
                                      max_seq_length=128, dupe_factor=10,
                                      short_seq_prob=0.1, masked_lm_prob=0.15,
                                      max_predictions_per_seq=20):
    """Create BERT pretraining data from documents and save it to `save_path`.

    Args:
        docs: Sequence of sequences. `docs` is a sequence of documents,
            and a document is a sequence of sentences.
        save_path: Path to save the pretraining data.
        vocab_path: The vocabulary file that the BERT model was trained on.
            Only used when token_method='wordpiece'.
        token_method: string. 'wordpiece' or 'spacy'.
        language: string. 'en' or 'chn'.
        max_seq_length: integer. Maximum sequence length.
        dupe_factor: integer. Number of times to duplicate the input data
            (with different masks).
        short_seq_prob: float. Probability of creating sequences which are
            shorter than the maximum length.
        masked_lm_prob: float. Masked LM probability.
        max_predictions_per_seq: integer. Maximum number of masked LM
            predictions per sequence.
    """
    if not hasattr(docs, '__len__'):
        raise ValueError('`docs` should be a sequence of sequences.')
    if not hasattr(docs[0], '__len__'):
        raise ValueError('`docs` should be a sequence of sequences.')
    if token_method not in ['wordpiece', 'spacy']:
        raise ValueError('`token_method` must be one of `wordpiece` and `spacy`.')
    if language not in ['en', 'chn']:
        raise ValueError('`language` should be one of `en` and `chn`.')
    if token_method == 'spacy' and language == 'chn':
        raise ValueError('The spacy tokenizer is only available when `language` is `en`.')

    if token_method == 'wordpiece':
        tokenizer = FullTokenizer(vocab_path, do_lower_case=True)
    else:
        tokenizer = SpacyTokenizer(vocab_path, do_lower_case=True)

    instances = create_training_instances(
        docs,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        dupe_factor=dupe_factor,
        short_seq_prob=short_seq_prob,
        masked_lm_prob=masked_lm_prob,
        max_predictions_per_seq=max_predictions_per_seq)

    pretraining_data = dict(tokens=[], segment_ids=[], is_random_next=[],
                            masked_lm_positions=[], masked_lm_labels=[])
    for i, instance in enumerate(instances):
        if i < 10:
            print('num-{}: {}'.format(i, instance))
        pretraining_data['tokens'].append(instance.tokens)
        pretraining_data['segment_ids'].append(instance.segment_ids)
        pretraining_data['is_random_next'].append(int(instance.is_random_next))
        pretraining_data['masked_lm_positions'].append(instance.masked_lm_positions)
        pretraining_data['masked_lm_labels'].append(instance.masked_lm_labels)

    # convert tokens and masked LM labels to vocabulary ids
    tokens_ids = []
    tokens_mask = []
    for tokens in pretraining_data['tokens']:
        sub_ids = tokenizer.convert_tokens_to_ids(tokens)
        sub_mask = [1] * len(sub_ids)
        tokens_ids.append(sub_ids)
        tokens_mask.append(sub_mask)

    masked_lm_ids = []
    for mask_labels in pretraining_data['masked_lm_labels']:
        sub_masked_lm_ids = tokenizer.convert_tokens_to_ids(mask_labels)
        masked_lm_ids.append(sub_masked_lm_ids)

    # input
    tokens_ids = pad_sequences(tokens_ids, maxlen=max_seq_length,
                               padding='post', truncating='post')
    tokens_mask = pad_sequences(tokens_mask, maxlen=max_seq_length,
                                padding='post', truncating='post')
    segment_ids = pad_sequences(pretraining_data['segment_ids'],
                                maxlen=max_seq_length,
                                padding='post', truncating='post')
    masked_lm_positions = pad_sequences(pretraining_data['masked_lm_positions'],
                                        maxlen=max_predictions_per_seq,
                                        padding='post', truncating='post')

    # label
    is_random_next = to_categorical(pretraining_data['is_random_next'],
                                    num_classes=2)
    masked_lm_labels = pad_sequences(masked_lm_ids,
                                     maxlen=max_predictions_per_seq,
                                     padding='post', truncating='post')

    # save
    np.savez(file=save_path,
             tokens_ids=tokens_ids,
             tokens_mask=tokens_mask,
             segment_ids=segment_ids,
             is_random_next=is_random_next,
             masked_lm_positions=masked_lm_positions,
             masked_lm_labels=masked_lm_labels)
    print('[INFO] number of train data:', len(tokens_ids))
    print('[INFO] is_random_next ratio:',
          np.sum(pretraining_data['is_random_next']) / len(is_random_next))
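# Toy usage sketch for create_pretraining_data_from_docs: two tiny "documents",
# each a list of sentences, matching the documented `docs` contract. The paths
# are placeholders; a real corpus needs far more text per document.
docs = [
    ['The cat sat on the mat.', 'It was a sunny day outside.'],
    ['Masked language modeling hides tokens.', 'The model must recover them.'],
]
create_pretraining_data_from_docs(docs, save_path='pretrain_data.npz',
                                  vocab_path='vocab.txt',
                                  token_method='wordpiece', language='en')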
from tokenization import BertTokenizer
import random
from create_pretraining_data import create_training_instances, write_instance_to_example_file
from glob import glob

# line = 'The switches between clarity and intoxication gave me a headache, but at least the silver-haired faery’s explanation of the queens’ “gifts” helped me understand why I could want to wrap my legs around a creature who terrified me.'
vocab_file = '/workspace/bert/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt'
base_dir = ('/home/haris/share/git/DeepLearningExamples/PyTorch/LanguageModeling/BERT/'
            'sharded_training_shards_256_test_shards_256_fraction_0.2/books_wiki_en_corpus/')
random_seed = 123
tokenizer = BertTokenizer(vocab_file)
rng = random.Random(random_seed)
max_seq_length = 128
dupe_factor = 5
short_seq_prob = 0.1
masked_lm_prob = 0.15
max_predictions_per_seq = 20

# tokens = tokenizer.tokenize(line)
# input_files = glob(base_dir + '*.txt')
input_files = [
    '/home/haris/share/git/DeepLearningExamples/PyTorch/LanguageModeling/BERT/test_file.txt'
]
output_file = '/home/haris/share/git/DeepLearningExamples/PyTorch/LanguageModeling/BERT/test_file.h5'

instances = create_training_instances(input_files, tokenizer, max_seq_length,
                                      dupe_factor, short_seq_prob, masked_lm_prob,
                                      max_predictions_per_seq, rng)
write_instance_to_example_file(instances, tokenizer, max_seq_length,
                               max_predictions_per_seq, output_file)
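# Optional sanity check (an assumed follow-up, not part of the original
# script): list the datasets stored in the generated HDF5 shard.
import h5py

with h5py.File(output_file, 'r') as f:
    for name in f.keys():
        print(name, f[name].shape)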