def __init__(self,
             model_name: str,
             start_tokens: List[str] = None,
             end_tokens: List[str] = None) -> None:
    self._tokenizer = AutoTokenizer.from_pretrained(model_name)
    default_start_tokens, default_end_tokens = _guess_start_and_end_token_defaults(model_name)
    self._start_tokens = start_tokens if start_tokens is not None else default_start_tokens
    self._end_tokens = end_tokens if end_tokens is not None else default_end_tokens
def test_as_array_produces_token_sequence(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", do_lowercase=True)
    tokens = tokenizer.tokenize("AllenNLP is great")
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = [Token(token) for token in tokens]
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
    assert indexed["key"] == expected_ids
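# For comparison, a minimal sketch using the Hugging Face tokenizer directly (an illustration,
# not part of the test above): `encode` with add_special_tokens=True wraps the ids in special
# tokens such as [CLS]/[SEP], while `convert_tokens_to_ids` applied to `tokenize` output does not.
hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
hf_tokens = hf_tokenizer.tokenize("AllenNLP is great")
plain_ids = hf_tokenizer.convert_tokens_to_ids(hf_tokens)                    # no special tokens
wrapped_ids = hf_tokenizer.encode("AllenNLP is great", add_special_tokens=True)
assert wrapped_ids[1:-1] == plain_ids                                         # [CLS] ... [SEP] around the same ids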
def __init__(self,
             model_name: str,
             namespace: str = "tags",
             token_min_padding_length: int = 0) -> None:
    super().__init__(token_min_padding_length)
    self._model_name = model_name
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self._namespace = namespace
    self._added_to_vocabulary = False
    self._padding_value = self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0]
    logger.info(f"Using token indexer padding value of {self._padding_value}")
def __init__(self,
             model_name: str,
             do_lowercase: bool,
             start_tokens: List[str] = None,
             end_tokens: List[str] = None) -> None:
    if model_name.endswith("-cased") and do_lowercase:
        logger.warning("Your pretrained model appears to be cased, "
                       "but your tokenizer is lowercasing tokens.")
    elif model_name.endswith("-uncased") and not do_lowercase:
        logger.warning("Your pretrained model appears to be uncased, "
                       "but your tokenizer is not lowercasing tokens.")
    self._tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=do_lowercase)
    default_start_tokens, default_end_tokens = _guess_start_and_end_token_defaults(model_name)
    self._start_tokens = start_tokens if start_tokens is not None else default_start_tokens
    self._end_tokens = end_tokens if end_tokens is not None else default_end_tokens
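# A plausible sketch of what the `_guess_start_and_end_token_defaults` helper referenced above
# might do (this is an assumption, not the actual implementation): BERT-style models typically
# wrap sequences in [CLS]/[SEP]; other families (e.g. RoBERTa's <s>/</s>) would need their own
# defaults or an explicit start_tokens/end_tokens argument.
def _guess_start_and_end_token_defaults(model_name: str):
    if "bert" in model_name:
        return ["[CLS]"], ["[SEP]"]
    return [], []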
def __init__(self,
             xlm_model_name: str,
             do_lowercase: bool,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_sent_len: int = 80,
             dataset_field_name: str = "dataset",
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = AutoTokenizer.from_pretrained(xlm_model_name, do_lower_case=do_lowercase)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._max_sent_len = max_sent_len
    self._dataset_field_name = dataset_field_name
def __init__(self,
             model_name: str,
             do_lowercase: bool,
             namespace: str = "tags",
             token_min_padding_length: int = 0) -> None:
    super().__init__(token_min_padding_length)
    if model_name.endswith("-cased") and do_lowercase:
        logger.warning("Your pretrained model appears to be cased, "
                       "but your indexer is lowercasing tokens.")
    elif model_name.endswith("-uncased") and not do_lowercase:
        logger.warning("Your pretrained model appears to be uncased, "
                       "but your indexer is not lowercasing tokens.")
    self.tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=do_lowercase)
    self._namespace = namespace
    self._added_to_vocabulary = False
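# A minimal sketch (assuming the constructor above belongs to the PretrainedTransformerIndexer
# used in the earlier test): passing a cased checkpoint together with do_lowercase=True should
# trigger the first warning branch, since lowercasing destroys the case information the model
# was trained on.
indexer = PretrainedTransformerIndexer(model_name="bert-base-cased", do_lowercase=True)
# -> logs "Your pretrained model appears to be cased, but your indexer is lowercasing tokens."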
def __init__(self,
             xlm_model_name: str,
             do_lowercase: bool,
             token_indexers: Dict[str, TokenIndexer] = None,
             cuda_device: int = 1,
             max_sent_len: int = 128,
             dataset_field_name: str = "dataset",
             source_fname_prefix: str = "multinli.train.",
             lg_pairs: str = "ar-en bg-en de-en el-en en-es en-fr en-hi en-ru en-sw en-th en-tr en-ur en-vi en-zh",
             target_lang="en",
             scheme: str = "round_robin",
             lazy: bool = False) -> None:
    super().__init__(lazy)
    tokenizer = AutoTokenizer.from_pretrained(xlm_model_name, do_lower_case=do_lowercase)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._max_sent_len = max_sent_len
    self._dataset_field_name = dataset_field_name
    self._lg_pairs = lg_pairs.split(" ")
    self._scheme = scheme
    self._readers: Dict[str, DatasetReader] = {}
    for pair in self._lg_pairs:
        self._readers[pair] = ParaCorpusReader(
            xlm_tokenizer=tokenizer,
            lang_pair=pair,
            xlm_model_name=xlm_model_name,
            do_lowercase=do_lowercase,
            token_indexers=token_indexers,
            max_sent_len=max_sent_len,
            dataset_field_name=dataset_field_name,
            target_lang=target_lang,
            lazy=lazy,
            source_fname_prefix=source_fname_prefix,
            cuda_device=cuda_device)
os.makedirs(args.output_dir, exist_ok=True)

# TPU devices
devices = tpu_xm.get_xla_supported_devices()
if args.one_tpu:
    devices = [devices[0]]
n_tpu = len(devices)
logging.info(f'Found {n_tpu} TPU cores')

# set seed
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.bert_model)
logging.info(f"Saving tokenizer to: {args.output_dir}")
tokenizer.save_pretrained(args.output_dir)

# load model
model = AutoModelWithLMHead.from_pretrained(args.bert_model)  # Only Masked Language Modeling
logging.info(f"Saving initial checkpoint to: {args.output_dir}")
model.save_pretrained(args.output_dir)

# wrap model with TPU stuff
model = tpu_dp.DataParallel(model, device_ids=devices)

# expected total number of updates
total_num_updates = utils.compute_num_updates_in_epoch(
    num_samples=args.total_num_training_examples,
def main():
    parser = utils.get_args_parser_with_general_args()
    parser.add_argument('--one_tpu', action='store_true',
                        help="Run on one tpu core for debugging. Makes it easy to use break points")
    parser.add_argument('--tpu_report', action='store_true', help="Print xla metric report")
    args = parser.parse_args()

    utils.init(args)  # set seeds, init logger, prepare output directory

    devices = tpu_xm.get_xla_supported_devices()
    if args.one_tpu:
        devices = [devices[0]]
    n_tpu = len(devices)
    logging.info(f'Found {n_tpu} TPU cores')

    tokenizer = AutoTokenizer.from_pretrained(args.bert_model)
    tokenizer.save_pretrained(args.output_dir)

    args.start_epoch = utils.prepare_last_checkpoint(args.bert_model)
    model = AutoModelWithLMHead.from_pretrained(args.bert_model)  # Only Masked Language Modeling
    logging.info(f"Saving initial checkpoint to: {args.output_dir}")
    model.save_pretrained(args.output_dir)

    model = tpu_dp.DataParallel(model, device_ids=devices)

    num_data_epochs, num_train_optimization_steps = utils.get_dataset_stats(args, n_tpu)

    def tpu_training_loop(model, loader, device, context):
        """Called by torch_xla_py.data_parallel. This function is executed on each core of the TPU once per epoch."""
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]

        # one optimizer and scheduler per TPU core. Both objects are saved in `context` to be reused the next epoch
        optimizer = context.getattr_or(
            'optimizer',
            AdamW(optimizer_grouped_parameters,
                  lr=args.learning_rate,
                  eps=args.adam_epsilon,
                  betas=tuple(args.betas)))

        # derive warmup info
        if args.warmup_proportion is not None:
            warmup_steps = int(args.warmup_proportion * num_train_optimization_steps + 0.5)
        elif args.warmup_steps is not None:
            warmup_steps = args.warmup_steps
        else:
            raise ValueError('Specify either --warmup_proportion or --warmup_steps.')
        scheduler = context.getattr_or(
            'scheduler',
            WarmupLinearSchedule(optimizer,
                                 warmup_steps=warmup_steps,
                                 t_total=num_train_optimization_steps))

        tr_loss = None
        pbar = None
        if str(pbar_device) == str(device):  # All threads are in sync. Use progress bar only on one of them
            pbar = tqdm(total=int(pbar_steps), desc=f"device {device}", dynamic_ncols=True)

        tracker = tpu_xm.RateTracker()

        model.train()
        for step, batch in loader:
            input_ids, input_mask, segment_ids, lm_label_ids, _ = batch
            outputs = model(input_ids, segment_ids, input_mask, lm_label_ids)
            loss = outputs[0]
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tracker.add(args.train_batch_size)
            tr_loss = loss * args.gradient_accumulation_steps if step == 0 \
                else tr_loss + loss * args.gradient_accumulation_steps
            if pbar is not None:
                pbar.update(1)
                # pbar.set_description(desc=f'LR: {scheduler.get_lr()}')
            if (step + 1) % args.gradient_accumulation_steps == 0:
                tpu_xm.optimizer_step(optimizer)
                prev_lr = scheduler.get_last_lr()[0]
                scheduler.step()
                curr_lr = scheduler.get_last_lr()[0]
                if args.track_learning_rate:
                    if pbar is not None:
                        pbar.set_description(f"Prev LR: {prev_lr} Curr LR: {curr_lr}")
                optimizer.zero_grad()

        # `.item()` requires a trip from TPU to CPU, which is very slow. Use it only once per epoch.
        return tr_loss.item() / step

    for epoch in range(args.start_epoch, args.epochs):
        # Load one training file into memory
        epoch_dataset = utils.PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory)
        train_sampler = RandomSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        pbar_device = devices[0]
        pbar_steps = utils.compute_num_steps_in_epoch(
            num_samples=train_sampler.num_samples,
            batch_size=args.train_batch_size,
            grad_accum_steps=1,  # the pbar steps should not take into account grad accumulation steps
            n_tpu=n_tpu)
        logging.info(f'start training, epoch {epoch} on {len(devices)} cores for {pbar_steps} steps')

        start = time.time()
        losses = model(tpu_training_loop, train_dataloader)  # calls `tpu_training_loop` multiple times, once per TPU core
        logging.info(f'Epoch {epoch} took {round(time.time() - start, 2)} seconds. '
                     f'Average loss: {sum(losses)/len(losses)}')
        utils.save_checkpoint(model._models[0], epoch, args.output_dir)

    if args.tpu_report:
        logging.info(torch_xla._XLAC._xla_metrics_report())
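# A hypothetical sketch of the kind of arithmetic a helper like `utils.compute_num_steps_in_epoch`
# could perform (an assumption for illustration, not the actual implementation): each TPU core sees
# roughly num_samples / n_tpu examples, grouped into batches, and gradient accumulation reduces the
# number of optimizer updates accordingly.
import math

def compute_num_steps_in_epoch_sketch(num_samples: int, batch_size: int,
                                      grad_accum_steps: int, n_tpu: int) -> int:
    batches_per_core = math.ceil(num_samples / (batch_size * n_tpu))
    return batches_per_core // grad_accum_steps

# e.g. 1_000_000 samples, batch size 32, no accumulation, 8 cores -> 3907 steps per core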
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--lang", default=None, type=str, required=True)
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--out_dir", default=None, type=str, required=True)
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

    ## Other parameters
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--layers", default="0", type=str)
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                             "than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('-d', '--emb_dim', type=int, default=1024, help="Embeddings size")
    parser.add_argument('--vocab_file', type=str, default='vocabs/en_50k.vocab',
                        help="Path to vocab file with tokens (one per line) to include in output. "
                             "Should also include <UNK> token. Can use $l as a placeholder for language")
    args = parser.parse_args()

    lang = args.lang
    tokenizer = AutoTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    vocab = vocabulary.Vocabulary()
    vocab.set_from_file(args.vocab_file, oov_token=tokenizer.unk_token)
    print("Loaded vocabulary of size {}".format(vocab.get_vocab_size()))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    examples = read_examples(args.input_file)
    features = convert_examples_to_features(examples=examples,
                                            seq_length=args.max_seq_length,
                                            tokenizer=tokenizer,
                                            lang=lang)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model = AutoModel.from_pretrained(args.bert_model)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    num_occurrences = [0] * vocab.get_vocab_size()
    anchors = {}
    norms = {}
    total_words = 0
    for l in layer_indexes:
        norms[l] = 0.0
        anchors[l] = np.zeros(shape=(vocab.get_vocab_size(), args.emb_dim))
    oov_ind = vocab.get_token_index(vocab._oov_token)

    model.eval()
    for input_ids, input_mask, example_indices in tqdm(eval_dataloader):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        all_encoder_layers = model(input_ids)

        for b, example_index in enumerate(example_indices):
            feature = features[example_index.item()]
            for (i, token) in enumerate(feature.tokens):
                all_layers = []
                w_id = vocab.get_token_index(token)
                if w_id == oov_ind:
                    continue
                n = num_occurrences[w_id]
                for (j, layer_index) in enumerate(layer_indexes):
                    layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
                    layer_output = layer_output[b]
                    layers = collections.OrderedDict()
                    l = layer_index
                    values = layer_output[i]
                    # Running mean of each token's embedding (anchor) and of the embedding norms.
                    anchors[l][w_id, :] = anchors[l][w_id, :] * (n / (n + 1)) + values[:] / (n + 1)
                    norm = np.linalg.norm(values[:])
                    norms[l] = norms[l] * (total_words / (total_words + 1)) + norm / (total_words + 1)
                total_words += 1
                num_occurrences[w_id] += 1

    os.makedirs(args.out_dir, exist_ok=True)
    norm_dict = {}
    print('Saving outputs to {}'.format(args.out_dir))
    for l in tqdm(layer_indexes):
        norm_key = 'avg_norm_layer_{}'.format(l)
        norm_dict[norm_key] = norms[l]
        file_path = os.path.join(args.out_dir, 'avg_embeds_{}.txt'.format(l))
        save_embeds(file_path, anchors[l], vocab, num_occurrences, args.emb_dim)
    norm_dict['occurrences'] = num_occurrences
    file_path = os.path.join(args.out_dir, 'norms.json')
    json.dump(norm_dict, open(file_path, 'w'))
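# Quick sanity check of the running-mean update used above: the recurrence
# mean_{n+1} = mean_n * n / (n + 1) + x / (n + 1) reproduces the ordinary arithmetic mean,
# so the anchors end up as the average embedding of each vocabulary token.
import numpy as np

xs = np.random.randn(100, 4)
running = np.zeros(4)
for n, x in enumerate(xs):
    running = running * (n / (n + 1)) + x / (n + 1)
assert np.allclose(running, xs.mean(axis=0))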
def main():
    parser = ArgumentParser()
    parser.add_argument('--train_corpus', type=str, required=True,
                        help="Path to training corpus in glob format")
    parser.add_argument("--output_dir", type=Path, required=True)
    parser.add_argument("--bert_model", type=str, required=True,
                        choices=["bert-base-uncased", "bert-large-uncased", "bert-large-cased",
                                 "bert-base-cased", "bert-base-multilingual-uncased", "bert-base-chinese",
                                 "bert-base-multilingual-cased", "roberta-base", "roberta-large"])
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--do_whole_word_mask", action="store_true",
                        help="Whether to use whole word masking rather than per-WordPiece masking.")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")
    parser.add_argument("--num_workers", type=int, default=1,
                        help="The number of workers to use to write the files")
    parser.add_argument("--epochs_to_generate", type=int, default=3,
                        help="Number of epochs of data to pregenerate")
    parser.add_argument("--max_seq_len", type=int, default=128)
    parser.add_argument("--short_seq_prob", type=float, default=0.1,
                        help="Probability of making a short sentence as a training example")
    parser.add_argument("--masked_lm_prob", type=float, default=0.15,
                        help="Probability of masking each token for the LM task")
    parser.add_argument("--max_predictions_per_seq", type=int, default=20,
                        help="Maximum number of tokens to mask in each sequence")
    parser.add_argument("--do_next_sent_prediction", action="store_true",
                        help="Add the next sentence prediction task (as in BERT) or ignore it (as in RoBERTa)")
    args = parser.parse_args()

    if args.num_workers > 1 and args.reduce_memory:
        raise ValueError("Cannot use multiple workers while reducing memory")

    args.output_dir.mkdir(exist_ok=True)
    tokenizer = AutoTokenizer.from_pretrained(args.bert_model)
    # BERT-style tokenizers expose their vocabulary as `vocab`, GPT-2/RoBERTa-style tokenizers as `encoder`.
    tokenizer.vocab_list = list((tokenizer.encoder if hasattr(tokenizer, 'encoder') else tokenizer.vocab).keys())
    files = glob.glob(args.train_corpus)

    if args.num_workers > 1:
        pool = Pool(args.num_workers)
        arguments = [(args, input_file, i, tokenizer, len(files)) for i, input_file in enumerate(files)]
        pool.starmap(input_file_to_training_data, arguments)
    else:
        for i, input_file in enumerate(tqdm(files)):
            input_file_to_training_data(args, input_file, i, tokenizer, len(files))
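# A hypothetical invocation of the pregeneration script above (the script file name is an
# assumption; the flags are the ones defined by its argument parser):
#
#   python pregenerate_training_data.py \
#       --train_corpus 'corpus/*.txt' \
#       --output_dir pregenerated/ \
#       --bert_model bert-base-multilingual-cased \
#       --epochs_to_generate 3 \
#       --max_seq_len 128 \
#       --num_workers 4 \
#       --do_whole_word_mask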