def train(index):
    model.train()
    dataloaders = {}
    i = 0
    global global_step
    datalengths = []
    batchs_per_dataset = []

    # Pretraining datasets
    wiki_pretrain_dataset = PreTrainingDataset(
        tokenizer=tokenizer,
        folder=args.train_path,
        logger=logger,
        max_seq_length=max_seq_length,
        index=index,
        data_type=PretrainDataType.WIKIPEDIA,
        max_predictions_per_seq=max_predictions_per_seq,
        masked_lm_prob=masked_lm_prob)
    datalengths.append(len(wiki_pretrain_dataset))
    dataloaders[i] = get_dataloader(wiki_pretrain_dataset)
    num_batches_in_dataset = get_effective_batch(len(wiki_pretrain_dataset))
    logger.info(
        'Wikipedia data file: Number of samples {}, number of batches required to process these samples: {}'
        .format(len(wiki_pretrain_dataset), num_batches_in_dataset))
    batchs_per_dataset.append(num_batches_in_dataset)
    i += 1

    logger.info("Training on Wikipedia dataset")

    total_length = sum(datalengths)

    dataset_batches = []
    for i, batch_count in enumerate(batchs_per_dataset):
        dataset_batches.extend([i] * batch_count)
    logger.info(
        'Number of batches to process *all* data samples in this epoch: {}'.format(
            len(dataset_batches)))

    # Shuffle the order in which dataset batches are visited
    random.shuffle(dataset_batches)

    # We don't want the dataset to appear as alternating chunks if we have more
    # than one dataset type; instead we want contiguous chunks of each data type,
    # hence each dataset_batch_type is repeated gradient_accumulation_steps times.
    # E.g. with gradient_accumulation_steps=2, [0, 1, 0] -> [0, 0, 1, 1, 0, 0].
    dataset_picker = []
    for dataset_batch_type in dataset_batches:
        dataset_picker.extend([dataset_batch_type] * gradient_accumulation_steps)
    logger.info(
        'Number of steps to process all batches in this epoch: {}'.format(
            len(dataset_picker)))

    model.train()

    # Counter of sequences in an "epoch"
    sequences_counter = 0
    global_step_loss = 0
    for step, dataset_type in enumerate(dataset_picker):
        try:
            batch = next(dataloaders[dataset_type])
            sequences_counter += len(batch)

            if n_gpu == 1:
                batch = tuple(t.to(device) for t in batch)  # Move to GPU

            if step > 1 and step % args.log_steps == 0:
                logger.info(
                    "{} Number of sequences processed so far: {} (cumulative in {} steps)"
                    .format(datetime.utcnow(), sequences_counter, step))

            # Forward pass
            loss = model.network(batch)
            if n_gpu > 1:
                # Average the loss for multi-GPU training. In the
                # DistributedDataParallel setting we get a tuple of losses
                # from all processes.
                loss = loss.mean()
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            # Enable optimized reduction: when using the distributed module,
            # reduction only happens in backward if this method is called first.
            if accumulate_gradients:
                if use_multigpu_with_single_device_per_process and (
                        step + 1) % gradient_accumulation_steps == 0:
                    model.network.enable_need_reduction()
                else:
                    model.network.disable_need_reduction()

            if fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            global_step_loss += loss

            if (step + 1) % gradient_accumulation_steps == 0:
                if fp16:
                    # Modify the learning rate with the special warmup BERT uses.
                    # If fp16 is False, BertAdam is used and handles this automatically.
                    lr_this_step = \
                        job_config.get_learning_rate() * warmup_linear_decay_exp(
                            global_step,
                            job_config.get_decay_rate(),
                            job_config.get_decay_step(),
                            job_config.get_total_training_steps(),
                            job_config.get_warmup_proportion())
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step

                    # Record the LR against global_step on tensorboard
                    if check_write_log():
                        summary_writer.add_scalar('Train/lr', lr_this_step,
                                                  global_step)
                else:
                    lr_this_step = job_config.get_learning_rate()

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                if check_write_log() and (global_step % args.log_steps == 0):
                    logger.info("training_loss %f" % float(global_step_loss))
                    logger.info("lr_this_step %f" % float(lr_this_step))
                    logger.info("loss at step %d = %f" %
                                (global_step, float(global_step_loss)))
                    logger.info("lr at step %d = %f" %
                                (global_step, float(lr_this_step)))
                global_step_loss = 0
        except StopIteration:
            continue

    logger.info("Completed {} steps".format(step))
    logger.info("Completed processing {} sequences".format(sequences_counter))

    # Run validation loss
    if max_seq_length == 512:
        logger.info(f"TRAIN BATCH SIZE: {train_batch_size}")
        return pretrain_validation(index)
    else:
        return None
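

# NOTE: `warmup_linear_decay_exp` used above is defined elsewhere in this repo.
# The helper below is only an illustrative sketch, assuming the "linear warmup
# then exponential decay" schedule implied by its call site; it is not the
# project's actual implementation, and the default `warmup` fraction is a guess.
def _warmup_linear_decay_exp_sketch(global_step, decay_rate, decay_steps,
                                    total_steps, warmup=0.002):
    """Return an LR multiplier: linear ramp during warmup, exponential decay after."""
    progress = global_step / total_steps
    if progress < warmup:
        # Ramp linearly from 0 to 1 over the first `warmup * total_steps` steps.
        return progress / warmup
    warmup_end_step = warmup * total_steps
    # Multiply by `decay_rate` once every `decay_steps` steps after warmup ends.
    return decay_rate ** ((global_step - warmup_end_step) / decay_steps)
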
def train():
    model.train()
    global global_step

    # Pretraining datasets
    batchs_per_dataset = []
    shuffle_numbers = 10
    midea_dataset = MideaDataset(
        tokenizer=tokenizer,
        folder=args.train_path,
        max_seq_length=max_seq_length,
        shuffle_numbers=shuffle_numbers,
        max_predictions_per_seq=max_predictions_per_seq,
        masked_lm_prob=masked_lm_prob)
    num_batches = get_effective_batch(len(midea_dataset))
    logger.info('Midea data file: Number of samples {}'.format(
        len(midea_dataset)))
    batchs_per_dataset.append(num_batches)

    logger.info("Training on Midea dataset")

    dataset_batches = []
    for i, batch_count in enumerate(batchs_per_dataset):
        dataset_batches.extend([i] * batch_count)
    random.shuffle(dataset_batches)

    # We don't want the dataset to appear as alternating chunks if we have more
    # than one dataset type; instead we want contiguous chunks of each data type,
    # hence each dataset_batch_type is repeated gradient_accumulation_steps times.
    dataset_picker = []
    for dataset_batch_type in dataset_batches:
        dataset_picker.extend([dataset_batch_type] * gradient_accumulation_steps)
    logger.info("dataset_picker length: {}".format(len(dataset_picker)))

    model.train()

    # Counter of sequences in an "epoch"
    sequences_counter = 0
    global_step_loss = 0
    dataloaders = get_dataloader(midea_dataset)
    step = 0
    best_loss = None

    for index in range(start_epoch, args.epochs):
        logger.info(f"Training epoch: {index + 1}")
        for batch in tqdm(dataloaders):
            sequences_counter += batch[1].shape[0]
            batch = tuple(t.cuda(device, non_blocking=True)
                          for t in batch)  # Move to GPU

            # Forward pass
            loss = model.network(batch)
            if n_gpu > 1:
                # Average the loss for multi-GPU training. In the
                # DistributedDataParallel setting we get a tuple of losses
                # from all processes.
                loss = loss.mean()
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            # Enable optimized reduction: when using the distributed module,
            # reduction only happens in backward if this method is called first.
            if accumulate_gradients:
                if use_multigpu_with_single_device_per_process and (
                        step + 1) % gradient_accumulation_steps == 0:
                    model.network.enable_need_reduction()
                else:
                    model.network.disable_need_reduction()

            if fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            global_step_loss += loss

            if (step + 1) % gradient_accumulation_steps == 0:
                if fp16:
                    # Modify the learning rate with the special warmup BERT uses.
                    # If fp16 is False, BertAdam is used and handles this automatically.
                    lr_this_step = job_config.get_learning_rate(
                    ) * warmup_linear_decay_exp(
                        global_step, job_config.get_decay_rate(),
                        job_config.get_decay_step(),
                        job_config.get_total_training_steps(),
                        job_config.get_warmup_proportion())
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step

                    # Record the LR against global_step on tensorboard
                    if check_write_log():
                        summary_writer.add_scalar('Train/lr', lr_this_step,
                                                  global_step)

                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
                global_step_loss = 0
            step += 1

        logger.info("Completed {} steps".format(step))
        logger.info(
            "Completed processing {} sequences".format(sequences_counter))

        # Run validation loss
        eval_loss = pretrain_validation(index)

        if check_write_log():
            if best_loss is None or eval_loss is None or eval_loss < best_loss * 0.99:
                best_loss = eval_loss
                epoch_ckp_path = os.path.join(
                    saved_model_path,
                    "bert_encoder_epoch_{0:04d}.pt".format(index + 1))
                checkpoint_model(
                    os.path.join(
                        saved_model_path,
                        "training_state_checkpoint_{0:04d}.tar".format(index + 1)),
                    model, optimizer, index, global_step)
                logger.info(
                    f"Saving checkpoint of the model from epoch {index + 1} at {epoch_ckp_path}"
                )
                model.save_bert(epoch_ckp_path)

                # Save the best checkpoint in a separate directory
                if args.best_cp_dir:
                    best_ckp_path = os.path.join(
                        args.best_cp_dir,
                        "bert_encoder_epoch_{0:04d}.pt".format(index + 1))
                    shutil.rmtree(args.best_cp_dir)
                    os.makedirs(args.best_cp_dir, exist_ok=True)
                    model.save_bert(best_ckp_path)

            # Keep the most recent checkpoint in the "latest" directory
            if args.latest_cp_dir:
                shutil.rmtree(args.latest_cp_dir)
                os.makedirs(args.latest_cp_dir, exist_ok=True)
                checkpoint_model(
                    os.path.join(
                        args.latest_cp_dir,
                        "training_state_checkpoint_{0:04d}.tar".format(index + 1)),
                    model, optimizer, index, global_step)
                latest_ckp_path = os.path.join(
                    args.latest_cp_dir,
                    "bert_encoder_epoch_{0:04d}.pt".format(index + 1))
                model.save_bert(latest_ckp_path)
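

# NOTE: `checkpoint_model` is assumed to be defined elsewhere in this repo. The
# sketch below only illustrates what a training-state checkpoint matching the
# call sites above (path, model, optimizer, epoch index, global step) might
# contain; it is not the project's actual implementation.
import torch  # assumed to already be imported at the top of this file


def _checkpoint_model_sketch(path, model, optimizer, epoch, global_step):
    """Persist enough state to resume training: weights, optimizer state, counters."""
    torch.save(
        {
            'epoch': epoch,
            'global_step': global_step,
            'model_state_dict': model.network.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, path)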