def test_lm_finetuning(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 5
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=64,
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    save_dir = "testsave/lm_finetuning"
    model.save(save_dir)
    processor.save(save_dir)
# 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
processor = BertStyleLMProcessor(data_dir="../data/lm_finetune_nips",
                                 tokenizer=tokenizer,
                                 max_seq_len=128,
                                 max_docs=30)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
# and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size, max_multiprocessing_chunksize=20)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
lm_prediction_head = BertLMHead.load(lang_model)
next_sentence_head = NextSentenceHead.load(lang_model)

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[lm_prediction_head, next_sentence_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)

# 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    device=device,
    n_batches=len(data_silo.loaders["train"]),
def train_from_scratch():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="")
    ml_logger.init_experiment(experiment_name="from_scratch", run_name="debug")

    #########################
    ######## Settings
    ########################
    set_all_seeds(seed=39)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    evaluate_every = 5000
    vocab_size = 30522
    # dev_filename = None
    save_dir = Path("saved_models/train_from_scratch")

    n_epochs = 10
    learning_rate = 1e-4
    warmup_proportion = 0.05
    batch_size = 16  # (probably only possible via gradient accumulation steps)
    max_seq_len = 64

    data_dir = Path("data/lm_finetune_nips")
    train_filename = "train.txt"
    dev_filename = "dev.txt"

    # 1. Create a tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = BertStyleLMProcessor(
        data_dir=data_dir,
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and
    # calculates a few descriptive statistics of our datasets
    stream_data_silo = StreamingDataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.from_scratch("bert", vocab_size)
    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead(768, vocab_size)
    next_sentence_head = NextSentenceHead([768, 2], task_name="nextsentence")

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=learning_rate,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": warmup_proportion},
        n_batches=len(stream_data_silo.get_data_loader("train")),
        n_epochs=n_epochs,
        device=device,
        grad_acc_steps=8,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=stream_data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
        grad_acc_steps=8,
        checkpoint_root_dir=Path("saved_models/train_from_scratch/checkpoints"),
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
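# The examples above and below assume that `train_filename`/`dev_filename` point to plain-text corpora in the
# usual BERT-style LM format (one sentence per line, documents separated by a blank line). A minimal sketch of
# how such a file could be produced is shown here; the sentences and the "data/lm_finetune_nips" target path
# are illustrative placeholders, not part of the original scripts.
from pathlib import Path


def write_lm_sample(path=Path("data/lm_finetune_nips/train.txt")):
    """Write a tiny two-document corpus: one sentence per line, blank line between documents."""
    docs = [
        ["Deep learning changed natural language processing.",
         "Pretrained language models are a big part of that."],
        ["Farming is a tough business.",
         "It still feeds most of the planet."],
    ]
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        for doc in docs:
            f.write("\n".join(doc) + "\n\n")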
def test_lm_finetuning(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
        never_split_chars=["-", "_"])

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    save_dir = "testsave/lm_finetuning"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Farmer's life is great."},
        {"text": "It's nothing for big city kids though."},
    ]
    model = Inferencer.load(save_dir, embedder_only=True)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == ['Farmer', "'", 's', 'life', 'is', 'great', '.']
    assert result[0]["vec"].shape == (768,)
    # TODO check why results vary across runs with same seed
    assert isinstance(result[0]["vec"][0], np.float32)
def train_from_scratch():
    args = parse_arguments()
    use_amp = "O2"  # using "O2" here allows roughly 30% larger batch_sizes and a 45% speed up

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    # Only the main process should log here
    if args.local_rank in [-1, 0]:
        ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
        ml_logger.init_experiment(experiment_name="train_from_scratch", run_name="run")

    set_all_seeds(seed=39)
    device, n_gpu = initialize_device_settings(use_cuda=True,
                                               local_rank=args.local_rank,
                                               use_amp=use_amp)

    save_dir = Path("saved_models/train_from_scratch")
    data_dir = Path("data/test")

    # Option A) just using a single file
    # train_filename = "train.txt"
    # Option B) (recommended when using StreamingDataSilo):
    # split and shuffle that file to have random order within and across epochs
    randomize_and_split_file(data_dir / "train.txt",
                             output_dir=Path("data/split_files"),
                             docs_per_file=1000)
    train_filename = Path("data/split_files")

    dev_filename = "dev.txt"
    distributed = args.local_rank != -1
    max_seq_len = 128
    batch_size = 8  # if distributed: this is the per-GPU batch size
    grad_acc = 1
    learning_rate = 1e-4
    warmup_proportion = 0.05
    n_epochs = 2
    evaluate_every = 15000
    log_loss_every = 2
    checkpoint_every = 500
    checkpoint_root_dir = Path("checkpoints")
    checkpoints_to_keep = 4
    next_sent_pred_style = "bert-style"  # or "sentence"
    max_docs = None

    # Choose enough workers to queue sufficient batches during training.
    # The optimal number depends on your GPU speed, CPU speed and number of cores.
    # 16 works well on a 4x V100 machine with 16 cores (AWS: p3.8xlarge). For a single GPU you will need fewer.
    data_loader_workers = 1

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load("bert-base-uncased", do_lower_case=True)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = BertStyleLMProcessor(data_dir=data_dir,
                                     tokenizer=tokenizer,
                                     max_seq_len=max_seq_len,
                                     train_filename=train_filename,
                                     dev_filename=dev_filename,
                                     test_filename=None,
                                     next_sent_pred_style=next_sent_pred_style,
                                     max_docs=max_docs)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and
    # calculates a few descriptive statistics of our datasets
    # stream_data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=distributed)
    stream_data_silo = StreamingDataSilo(processor=processor,
                                         batch_size=batch_size,
                                         distributed=distributed,
                                         dataloader_workers=data_loader_workers)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.from_scratch("bert", tokenizer.vocab_size)
    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead(768, tokenizer.vocab_size)
    next_sentence_head = NextSentenceHead(num_labels=2, task_name="nextsentence")

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=learning_rate,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": warmup_proportion},
        n_batches=len(stream_data_silo.get_data_loader("train")),
        n_epochs=n_epochs,
        device=device,
        grad_acc_steps=grad_acc,
        distributed=distributed,
        use_amp=use_amp,
        local_rank=args.local_rank)

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=stream_data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        log_loss_every=log_loss_every,
        device=device,
        grad_acc_steps=grad_acc,
        local_rank=args.local_rank,
        checkpoint_every=checkpoint_every,
        checkpoint_root_dir=checkpoint_root_dir,
        checkpoints_to_keep=checkpoints_to_keep,
        use_amp=use_amp)

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)

    if args.local_rank != -1:
        torch.distributed.destroy_process_group()
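# `parse_arguments()` is not shown in the snippet above. For distributed training it only needs to expose the
# `--local_rank` flag that `torch.distributed.launch` passes to every worker process. The helper below is a
# minimal sketch under that assumption, not the original function.
import argparse


def parse_arguments():
    parser = argparse.ArgumentParser()
    # torch.distributed.launch injects --local_rank=<rank> into each spawned process;
    # -1 means "no distributed training".
    parser.add_argument("--local_rank", type=int, default=-1)
    return parser.parse_args()

# Typical multi-GPU invocation (assuming the script is called train_from_scratch.py):
#   python -m torch.distributed.launch --nproc_per_node=4 train_from_scratch.py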
def test_lm_finetuning_custom_vocab(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    tokenizer.add_tokens(["aaaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbbbbbbbb", "ccccccccccccccccccccccc"])

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
        next_sent_pred=True
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    language_model = LanguageModel.load(lang_model, n_added_tokens=len(tokenizer.added_tokens_decoder))
    lm_prediction_head = BertLMHead.load(lang_model, n_added_tokens=len(tokenizer.added_tokens_decoder))
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'CosineWarmup', 'warmup_proportion': 0.1}
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    # LM embeddings and the weight of the decoder in the head are shared and should therefore be equal
    assert torch.all(
        torch.eq(model.language_model.model.embeddings.word_embeddings.weight,
                 model.prediction_heads[0].decoder.weight))

    save_dir = "testsave/lm_finetuning"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Farmer's life is great."},
        {"text": "It's nothing for big city kids though."},
    ]
    model = Inferencer.load(save_dir, embedder_only=True)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == ['Farmer', "'", 's', 'life', 'is', 'great', '.']
    assert result[0]["vec"].shape == (768,)
    # TODO check why results vary across runs with same seed
    assert isinstance(result[0]["vec"][0], np.float32)
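# Tokens added via `add_tokens` are kept as atomic units by the tokenizer, which is why the embedding matrix and
# the tied LM head decoder above have to be resized via `n_added_tokens`. An illustrative check (a sketch, not
# part of the original test; the import path assumes FARM's Tokenizer wrapper):
from farm.modeling.tokenization import Tokenizer

tok = Tokenizer.load(pretrained_model_name_or_path="bert-base-cased", do_lower_case=False)
tok.add_tokens(["aaaaaaaaaaaaaaaa"])
# the custom token is not split into word pieces
assert "aaaaaaaaaaaaaaaa" in tok.tokenize("A sentence with aaaaaaaaaaaaaaaa inside.")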
def train_from_scratch():
    # We need the local rank argument for DDP
    args = parse_arguments()
    use_amp = "O2"

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="")
    ml_logger.init_experiment(experiment_name="train_from_scratch", run_name="run")

    set_all_seeds(seed=39)
    # device, n_gpu = initialize_device_settings(use_cuda=True)
    device, n_gpu = initialize_device_settings(use_cuda=True,
                                               local_rank=args.local_rank,
                                               use_amp=use_amp)
    evaluate_every = 10000

    save_dir = Path("saved_models/train_from_scratch")
    data_dir = Path("data/lm_finetune_nips")
    train_filename = "train.txt"
    # dev_filename = "dev.txt"

    max_seq_len = 128
    batch_size = 80
    grad_acc = 3
    learning_rate = 0.0001
    warmup_proportion = 0.01
    n_epochs = 5
    vocab_file = "bert-base-uncased-vocab.txt"

    # 1. Create a tokenizer
    tokenizer = BertTokenizer(data_dir / vocab_file, do_lower_case=True)
    # tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    # limiting max docs to a number divisible by 64 (world_size * num_workers)
    processor = BertStyleLMProcessor(data_dir=data_dir,
                                     tokenizer=tokenizer,
                                     max_seq_len=max_seq_len,
                                     train_filename=train_filename,
                                     dev_filename=None,
                                     test_filename=None)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and
    # calculates a few descriptive statistics of our datasets
    stream_data_silo = StreamingDataSilo(processor=processor,
                                         batch_size=batch_size,
                                         distributed=True,
                                         dataloader_workers=16)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.from_scratch("bert", tokenizer.vocab_size)
    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead(768, tokenizer.vocab_size)
    next_sentence_head = NextSentenceHead([768, 2], task_name="nextsentence")

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=learning_rate,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": warmup_proportion},
        n_batches=len(stream_data_silo.get_data_loader("train")),
        n_epochs=n_epochs,
        device=device,
        grad_acc_steps=grad_acc,
        distributed=True,
        use_amp=use_amp,
        local_rank=args.local_rank)

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    # if args.get("checkpoint_every"):
    #     checkpoint_every = int(args["checkpoint_every"])
    #     checkpoint_root_dir = Path("/opt/ml/checkpoints/training")
    # else:
    checkpoint_every = None
    checkpoint_root_dir = None

    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=stream_data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
        grad_acc_steps=grad_acc,
        checkpoint_every=checkpoint_every,
        checkpoint_root_dir=checkpoint_root_dir,
        use_amp=use_amp,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()
def train_from_scratch(args):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    # TODO prettify this loading of params from two sources (cmd + json)
    cmd_args = parse_arguments()
    args["local_rank"] = cmd_args.local_rank
    logging.info(f'local_rank: {args["local_rank"]}')

    next_sent_task = bool(int(args.get("next_sent_task", 1)))
    distributed = True
    use_amp = args.get("use_amp", None)
    use_amp = None if use_amp == "" else use_amp

    # Only the main process should log here
    if args["local_rank"] in [-1, 0]:
        ml_logger = StdoutLogger(tracking_uri=None)
        ml_logger.init_experiment(experiment_name="train_from_scratch", run_name="run")

    set_all_seeds(seed=39)
    device, n_gpu = initialize_device_settings(use_cuda=True,
                                               local_rank=args["local_rank"],
                                               use_amp=use_amp)

    effective_batch_size = int(args["per_gpu_batch_size"]) * int(
        args["gradient_accumulation_steps"]) * torch.distributed.get_world_size()
    logging.info(
        f'Training with effective batch size of {effective_batch_size} '
        f'(per_gpu_batch_size = {int(args["per_gpu_batch_size"])}, '
        f'gradient_accumulation_steps = {int(args["gradient_accumulation_steps"])}, '
        f'n_gpus = {torch.distributed.get_world_size()})')

    save_dir = Path("/opt/ml/model")
    data_dir = Path("/opt/ml/input/data/input_channel")

    # Split and shuffle the training data
    if args["local_rank"] in [-1, 0]:
        randomize_and_split_file(data_dir / args["train_file"], output_dir=data_dir / "split_files")
    # let the other processes wait for the split files from rank 0
    torch.distributed.barrier()
    args["train_file"] = data_dir / "split_files"

    # 1. Create a tokenizer
    tokenizer = BertTokenizer(data_dir / args["vocab_file"],
                              do_lower_case=bool(int(args["do_lower_case"])))

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = BertStyleLMProcessor(data_dir=data_dir,
                                     tokenizer=tokenizer,
                                     max_seq_len=int(args["max_seq_len"]),
                                     train_filename=args.get("train_file"),
                                     dev_filename=args.get("dev_file", None),
                                     test_filename=args.get("test_file", None),
                                     next_sent_pred_style=args.get("next_sent_pred_style", "bert-style"),
                                     max_docs=args.get("max_docs", None),
                                     next_sent_pred=next_sent_task)

    # 3. Create a DataSilo that loads several datasets (train/dev/test) and provides DataLoaders for them
    data_silo = StreamingDataSilo(processor=processor,
                                  batch_size=int(args["per_gpu_batch_size"]),
                                  dataloader_workers=int(args.get("data_loader_workers", 8)),
                                  distributed=distributed)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.from_scratch("bert", tokenizer.vocab_size)
    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead(768, tokenizer.vocab_size)
    if next_sent_task:
        next_sentence_head = NextSentenceHead(num_labels=2, task_name="nextsentence")
        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[lm_prediction_head, next_sentence_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token", "per_sequence"],
            device=device,
        )
    else:
        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[lm_prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=float(args["learning_rate"]),
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": float(args["warmup_proportion"])},
        n_batches=len(data_silo.get_data_loader("train")),
        n_epochs=int(args["n_epochs"]),
        device=device,
        grad_acc_steps=int(args["gradient_accumulation_steps"]),
        distributed=distributed,
        use_amp=use_amp,
        local_rank=args["local_rank"])

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    if args.get("checkpoint_every"):
        checkpoint_every = int(args["checkpoint_every"])
        checkpoint_root_dir = Path("/opt/ml/checkpoints/training")
    else:
        checkpoint_every = None
        checkpoint_root_dir = None

    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=int(args["n_epochs"]),
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=int(args["evaluate_every"]),
        log_loss_every=int(args.get("log_loss_every", 500)),
        log_learning_rate=bool(int(args.get("log_learning_rate", 0))),
        device=device,
        local_rank=args["local_rank"],
        grad_acc_steps=int(args["gradient_accumulation_steps"]),
        checkpoint_every=checkpoint_every,
        checkpoint_root_dir=checkpoint_root_dir,
        checkpoints_to_keep=int(args.get("checkpoints_to_keep", 10)),
        disable_tqdm=True,
        use_amp=use_amp,
    )

    # 7. Let it grow!
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
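# The `args` dict consumed above typically comes from an external config (e.g. SageMaker hyperparameters, which
# arrive as strings). The keys below are the ones the function actually reads; the concrete values are
# illustrative assumptions only, and `local_rank` is filled in from the command line at runtime.
example_args = {
    "train_file": "train.txt",
    "vocab_file": "bert-base-uncased-vocab.txt",
    "do_lower_case": "1",
    "max_seq_len": "128",
    "per_gpu_batch_size": "32",
    "gradient_accumulation_steps": "2",
    "learning_rate": "1e-4",
    "warmup_proportion": "0.05",
    "n_epochs": "3",
    "evaluate_every": "15000",
    "next_sent_task": "1",
    "use_amp": "O2",
    "checkpoint_every": "10000",
    "checkpoints_to_keep": "4",
}
# train_from_scratch(example_args)  # each worker would be started via torch.distributed.launch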
def lm_finetuning():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    set_all_seeds(seed=42)
    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_minimal_example_lm")

    ##########################
    ########## Settings
    ##########################
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 30
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = BertStyleLMProcessor(data_dir=Path("../data/lm_finetune_nips"),
                                     tokenizer=tokenizer,
                                     max_seq_len=128,
                                     max_docs=20)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_multiprocessing_chunksize=20)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant
    # and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-english-lm-tutorial")
    model.save(save_dir)
    processor.save(save_dir)
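# After fine-tuning, the saved model can be reused as a sentence/token encoder. The call below mirrors the
# Inferencer usage from the tests above and points at the save_dir of lm_finetuning(); the example texts are
# placeholders.
from pathlib import Path

from farm.infer import Inferencer

basic_texts = [
    {"text": "Farmer's life is great."},
    {"text": "It's nothing for big city kids though."},
]
inferencer = Inferencer.load(Path("saved_models/bert-english-lm-tutorial"), embedder_only=True)
vectors = inferencer.extract_vectors(dicts=basic_texts)
print(vectors[0]["vec"].shape)  # e.g. (768,) for bert-base models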
train_filename="train_small.txt", dev_filename="train_small.txt", test_filename=None) # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False) # 4. Create an AdaptiveModel # a) which consists of a pretrained language model as a basis language_model = LanguageModel.from_scratch("bert", vocab_size) # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning lm_prediction_head = BertLMHead(768, vocab_size) next_sentence_head = NextSentenceHead([768, 2], task_name="nextsentence") model = AdaptiveModel( language_model=language_model, prediction_heads=[lm_prediction_head, next_sentence_head], embeds_dropout_prob=0.1, lm_output_types=["per_token", "per_sequence"], device=device, ) # 5. Create an optimizer optimizer, warmup_linear = initialize_optimizer( model=model, learning_rate=learning_rate, warmup_proportion=0.1, n_batches=len(data_silo.loaders["train"]),
def train_from_scratch(args):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri=args.get("mlflow_tracking_uri", "file:/opt/ml/model/mlflow"))
    ml_logger.init_experiment(experiment_name="train_from_scratch", run_name="run")

    set_all_seeds(seed=39)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    evaluate_every = int(args["evaluate_every"])

    save_dir = Path("/opt/ml/model")
    data_dir = Path("/opt/ml/input/data/input_channel")

    # 1. Create a tokenizer
    tokenizer = BertTokenizer(data_dir / args["vocab_file"], do_lower_case=args["do_lower_case"])

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = BertStyleLMProcessor(
        data_dir=data_dir,
        tokenizer=tokenizer,
        max_seq_len=int(args["max_seq_len"]),
        train_filename=args["train_file"],
        dev_filename=args.get("dev_file", None),
        test_filename=args.get("test_file", None),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and
    # calculates a few descriptive statistics of our datasets
    stream_data_silo = StreamingDataSilo(processor=processor, batch_size=int(args["batch_size"]))

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.from_scratch("bert", tokenizer.vocab_size)
    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead(768, tokenizer.vocab_size)
    next_sentence_head = NextSentenceHead([768, 2], task_name="nextsentence")

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=float(args["learning_rate"]),
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": float(args["warmup_proportion"])},
        n_batches=len(stream_data_silo.get_data_loader("train")),
        n_epochs=int(args["n_epochs"]),
        device=device,
        grad_acc_steps=int(args["gradient_accumulation_steps"]),
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    if args.get("checkpoint_every"):
        checkpoint_every = int(args["checkpoint_every"])
        checkpoint_root_dir = Path("/opt/ml/checkpoints/training")
    else:
        checkpoint_every = None
        checkpoint_root_dir = None

    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=stream_data_silo,
        epochs=int(args["n_epochs"]),
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
        grad_acc_steps=int(args["gradient_accumulation_steps"]),
        checkpoint_every=checkpoint_every,
        checkpoint_root_dir=checkpoint_root_dir,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
def outcome_pretraining(task_config,
                        model_name,
                        cache_dir,
                        run_name="0",
                        lr=1e-05,
                        warmup_steps=5000,
                        embeds_dropout=0.1,
                        epochs=200,  # large because we use early stopping by default
                        batch_size=20,
                        grad_acc_steps=1,
                        early_stopping_metric="loss",
                        early_stopping_mode="min",
                        early_stopping_patience=10,
                        model_class="Bert",
                        tokenizer_class="BertTokenizer",
                        do_lower_case=True,
                        do_train=True,
                        do_eval=True,
                        do_hpo=False,
                        max_seq_len=512,
                        seed=11,
                        eval_every=500,
                        use_amp=False,
                        use_cuda=True,
                        ):
    # Load task config
    task_config = yaml.safe_load(open(task_config))

    data_dir = Path(task_config["data"]["data_dir"])

    # General settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name,
                               tokenizer_class=tokenizer_class,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = OutcomePretrainingProcessor(tokenizer=tokenizer,
                                            max_seq_len=max_seq_len,
                                            data_dir=data_dir,
                                            train_filename=task_config["data"]["train_filename"],
                                            dev_filename=task_config["data"]["dev_filename"],
                                            seed=seed,
                                            max_size_admission=50,
                                            max_size_discharge=50,
                                            cache_dir=cache_dir)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and
    # calculates a few descriptive statistics of our datasets
    data_silo = OutcomePretrainingDataSilo(processor=processor,
                                           caching=True,
                                           cache_dir=cache_dir,
                                           batch_size=batch_size,
                                           max_multiprocessing_chunksize=200)

    if do_train:
        # Set save dir for experiment output
        save_dir = Path(task_config["output_dir"]) / f'{task_config["experiment_name"]}_{run_name}'

        # Use HPO config args if a config is passed
        if do_hpo:
            save_dir = save_dir / tune.session.get_trial_name()
        else:
            exp_name = f"exp_{random.randint(100000, 999999)}"
            save_dir = save_dir / exp_name

        # Create save dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(experiment_name=task_config["experiment_name"],
                                  run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(model_name, language_model_class=model_class)

        # b) and a NextSentenceHead prediction head (or a TextClassificationHead if it's not a BERT model)
        if model_class == "Bert":
            next_sentence_head = NextSentenceHead.load(model_class)
        else:
            next_sentence_head = TextClassificationHead(num_labels=2)

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[next_sentence_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=["per_sequence"],
            device=device,
        )

        # 5. Create an optimizer
        schedule_opts = {"name": "LinearWarmup", "num_warmup_steps": warmup_steps}
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(
                mode=early_stopping_mode,
                min_delta=0.0001,
                save_dir=save_dir,
                metric=early_stopping_metric,
                patience=early_stopping_patience
            )

        # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful plant
        # and evaluates it from time to time
        trainer = ExtendedTrainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=eval_every,
            early_stopping=early_stopping,
            device=device,
            grad_acc_steps=grad_acc_steps,
            evaluator_test=do_eval
        )

        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save the model if it was not already saved by early stopping
        model.save(save_dir / "final_model")
        processor.save(save_dir / "final_model")

    if do_eval:
        # Load the newly trained model or an existing model
        if do_train:
            model_dir = save_dir
        else:
            model_dir = Path(model_name)

        logger.info("###### Eval on TEST SET #####")
        evaluator_test = Evaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device
        )

        # Load the trained model for evaluation
        model = AdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results,
                          dataset_name="test",
                          steps=len(evaluator_test.data_loader),
                          save_path=model_dir / "eval_results.txt")
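# `outcome_pretraining` reads everything else from the YAML file passed as `task_config`. The keys accessed in
# the function are data.data_dir, data.train_filename, data.dev_filename, output_dir, experiment_name and
# log_dir; the structure below and the commented call are illustrative assumptions only (file names and paths
# are placeholders).
example_task_config = {
    "experiment_name": "outcome_pretraining",
    "output_dir": "saved_models/outcome_pretraining",
    "log_dir": "logs",
    "data": {
        "data_dir": "data/outcome_pretraining",
        "train_filename": "train.jsonl",
        "dev_filename": "dev.jsonl",
    },
}
# Written to disk (e.g. with yaml.safe_dump) it could then be used like:
# outcome_pretraining(task_config="task_config.yml",
#                     model_name="bert-base-uncased",
#                     cache_dir="cache",
#                     do_train=True,
#                     do_eval=False)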