def evaluate_question_answering():
    """Evaluate a pretrained SQuAD2 QA model on the SQuAD 2.0 dev set and print F1 / EM / top-n accuracy."""
    ##########################
    ########## Settings
    ##########################
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = True
    data_dir = Path("../data/squad20")
    evaluation_filename = "dev-v2.0.json"
    batch_size = 50
    no_ans_boost = 0  # additive score for the "no answer" option; 0 keeps the model default
    accuracy_at = 3  # accuracy at n is useful for answers inside long documents

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
    #    Only the test split is configured — this pipeline evaluates, it does not train.
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    # 3. Create a DataSilo that loads the dataset, provides DataLoaders for it
    #    and calculates a few descriptive statistics
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an Evaluator over the test DataLoader
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    # 5. Load model from the transformers model hub and adapt it for FARM evaluation
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="question_answering")
    # use "load" if you want to use a local model that was trained with FARM
    #model = AdaptiveModel.load(lang_model, device=device)
    model.prediction_heads[0].no_ans_boost = no_ans_boost
    model.prediction_heads[0].n_best = accuracy_at  # number of candidate answers kept per question
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    # 6. Run the Evaluator and report the headline metrics
    results = evaluator.eval(model)
    f1_score = results[0]["f1"]
    em_score = results[0]["EM"]
    tnacc = results[0]["top_n_accuracy"]
    print("F1-Score:", f1_score)
    print("Exact Match Score:", em_score)
    print(f"top_{accuracy_at}_accuracy:", tnacc)
def evaluate_classification():
    """Evaluate a German sentiment classifier on the GermEval17 test split and print macro-F1."""
    # ---------------- Settings ----------------
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "deepset/bert-base-german-cased-sentiment-Germeval17"
    do_lower_case = False
    batch_size = 100
    data_dir = Path("../data/germeval17")
    evaluation_filename = "test_TIMESTAMP1.tsv"
    label_list = ["negative", "neutral", "positive"]
    metric = "f1_macro"

    # Tokenizer matching the pretrained checkpoint
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # Processor converts the raw GermEval17 TSV rows into a PyTorch dataset;
    # only the test split is configured since we evaluate, not train.
    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=384,
                                            label_list=label_list,
                                            metric=metric,
                                            train_filename=None,
                                            dev_filename=None,
                                            dev_split=0,
                                            test_filename=evaluation_filename,
                                            data_dir=data_dir)

    # Silo wraps the dataset and hands out DataLoaders
    silo = DataSilo(processor=processor, batch_size=batch_size)

    test_evaluator = Evaluator(data_loader=silo.get_data_loader("test"),
                               tasks=silo.processor.tasks,
                               device=device)

    # Pull the fine-tuned checkpoint straight from the transformers hub.
    # use "load" if you want to use a local model that was trained with FARM
    # model = AdaptiveModel.load(lang_model, device=device)
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="text_classification")
    model.connect_heads_with_processor(silo.processor.tasks, require_labels=True)

    # Run the evaluation and report the macro-averaged F1 score
    eval_results = test_evaluator.eval(model)
    macro_f1 = eval_results[0]["f1_macro"]
    print("Macro-averaged F1-Score:", macro_f1)
def build_data_silo(params: Params, splits, processor):
    """Build a DataSilo from pre-split FARM data dicts, bypassing file-based loading.

    Returns a tuple of (data_silo, farm_data) where farm_data maps split names
    ("train"/"dev"/"test") to lists of FARM-style dicts.
    """
    silo = DataSilo(
        processor=processor,
        batch_size=params.batch_size,
        automatic_loading=False,  # we inject the dicts ourselves below
        max_processes=4,
    )
    farm_data = build_farm_data_dicts(splits)
    # DataSilo._load_data expects keyword args named "<split>_dicts"
    load_kwargs = {f"{split_name}_dicts": split_dicts
                   for split_name, split_dicts in farm_data.items()}
    silo._load_data(**load_kwargs)
    return silo, farm_data
def eval_on_file(self, data_dir: str, test_filename: str, device: Optional[str] = None):
    """
    Performs evaluation on a SQuAD-formatted file.
    Returns a dict containing the following metrics:
        - "EM": exact match score
        - "f1": F1-Score
        - "top_n_accuracy": Proportion of predicted answers that overlap with correct answer

    :param data_dir: The directory in which the test set can be found
    :type data_dir: Path or str
    :param test_filename: The name of the file containing the test data in SQuAD format.
    :type test_filename: str
    :param device: The device on which the tensors should be processed.
                   Choose from "cpu" and "cuda" or use the Reader's device by default.
    :type device: str
    """
    if device is None:
        device = self.device
    # Re-use the inferencer's tokenizer/settings so evaluation matches how the
    # model was trained; only the test split is configured.
    eval_processor = SquadProcessor(
        tokenizer=self.inferencer.processor.tokenizer,
        max_seq_len=self.inferencer.processor.max_seq_len,
        label_list=self.inferencer.processor.tasks["question_answering"]["label_list"],
        metric=self.inferencer.processor.tasks["question_answering"]["metric"],
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=test_filename,
        data_dir=Path(data_dir),
    )

    # distributed=False: evaluation runs on a single process
    data_silo = DataSilo(processor=eval_processor,
                         batch_size=self.inferencer.batch_size,
                         distributed=False)
    data_loader = data_silo.get_data_loader("test")

    evaluator = Evaluator(data_loader=data_loader,
                          tasks=eval_processor.tasks,
                          device=device)

    # eval_results[0] holds the QA head's metrics
    eval_results = evaluator.eval(self.inferencer.model)
    results = {
        "EM": eval_results[0]["EM"],
        "f1": eval_results[0]["f1"],
        "top_n_accuracy": eval_results[0]["top_n_accuracy"]
    }
    return results
def test_prediction_head_load_save_class_weights(tmp_path, caplog=None):
    """This is a regression test for #428 and #422.

    Builds a classification model whose head carries computed class weights,
    saves it, and verifies it can be loaded back.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 1
    lang_model = "bert-base-german-cased"
    data_dir_path = "samples/doc_class"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    # Tiny max_seq_len and sample files keep the test fast
    tcp_params = dict(tokenizer=tokenizer,
                      max_seq_len=8,
                      data_dir=Path(data_dir_path),
                      train_filename="train-sample.tsv",
                      label_list=["OTHER", "OFFENSE"],
                      metric="f1_macro",
                      dev_filename="test-sample.tsv",
                      test_filename=None,
                      dev_split=0.0,
                      label_column_name="coarse_label")

    processor = TextClassificationProcessor(**tcp_params)
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(lang_model)

    # The class_weights tensor is the artifact whose save/load this test guards
    prediction_head = TextClassificationHead(
        num_labels=2,
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    # Round-trip: save to tmp dir, then load onto CPU
    model.save(tmp_path)
    model_loaded = AdaptiveModel.load(tmp_path, device='cpu')
    assert model_loaded is not None
def test_qa(caplog):
    """Smoke-test QA training on tiny bundled SQuAD samples, then save model and processor."""
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=base_LM_model, do_lower_case=False
    )
    label_list = ["start_token", "end_token"]
    # Tiny max_seq_len / doc_stride keep the test fast on the sample files
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir="samples/qa",
        label_list=label_list,
        metric="squad"
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(base_LM_model)
    prediction_head = QuestionAnsweringHead(layer_dims=[768, len(label_list)])
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device
    )
    # NOTE(review): legacy Trainer API here — the model is passed to train(),
    # not the constructor. Keep that ordering intact.
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device
    )
    model = trainer.train(model)
    save_dir = "testsave/qa"
    model.save(save_dir)
    processor.save(save_dir)
def distilbert_nq(caplog=None):
    """Train a tiny DistilBERT Natural Questions model on bundled samples.

    Returns the trained (model, processor) pair for downstream tests.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "distilbert-base-uncased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=base_LM_model, do_lower_case=True
    )
    # Tiny sequence settings keep the test fast on the sample files
    processor = NaturalQuestionsProcessor(
        tokenizer=tokenizer,
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train_sample.jsonl",
        dev_filename="dev_sample.jsonl",
        data_dir=Path("samples/nq")
    )
    # max_processes=1: skip multiprocessing overhead for this tiny dataset
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(base_LM_model)

    # NQ uses two heads: span extraction (per token) + answer-type classification (per sequence)
    qa_head = QuestionAnsweringHead()
    classification_head = TextClassificationHead(num_labels=len(processor.answer_type_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device
    )
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device
    )
    trainer.train()
    return model, processor
def test_lm_finetuning(caplog):
    """Smoke-test language-model finetuning (MLM + next-sentence heads) on tiny samples."""
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 5
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)
    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=64,
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    # Two pretraining heads: masked-LM (per token) and next-sentence prediction (per sequence)
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )
    # NOTE(review): legacy optimizer API returning (optimizer, warmup_linear);
    # the matching legacy Trainer below takes warmup_linear and receives the
    # model via train(model).
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )
    model = trainer.train(model)

    save_dir = "testsave/lm_finetuning"
    model.save(save_dir)
    processor.save(save_dir)
def distilbert_squad(request):
    """Pytest fixture helper: train a tiny DistilBERT SQuAD model and return (model, processor)."""
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "distilbert-base-uncased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=base_LM_model,
        do_lower_case=True,
        use_fast=True  # TODO parametrize this to test slow as well
    )
    label_list = ["start_token", "end_token"]
    # Tiny sequence settings keep the fixture fast on the bundled samples
    processor = SquadProcessor(tokenizer=tokenizer,
                               max_seq_len=20,
                               doc_stride=10,
                               max_query_length=6,
                               train_filename="train-sample.json",
                               dev_filename="dev-sample.json",
                               test_filename=None,
                               data_dir=Path("samples/qa"),
                               label_list=label_list,
                               metric="squad")

    # max_processes=1: skip multiprocessing overhead for this tiny dataset
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(base_LM_model)
    prediction_head = QuestionAnsweringHead()
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)
    trainer.train()
    return model, processor
def test_data_silo_for_holdout1():
    """Check that DataSiloForHoldout produces n_splits silos whose train/dev and
    train/test index sets are disjoint.

    Fixes vs. previous version: `set(xs)` instead of `set([i for i in xs])`,
    dropped the unused `enumerate` index, and used `isdisjoint` instead of
    `len(a.intersection(b)) == 0` for clarity.
    """
    n_splits = 5
    lang_model = "distilbert-base-german-cased"
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)
    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=64,
                                            data_dir=Path("data/germeval18"),
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_split=0.2,
                                            dev_stratification=False,
                                            label_column_name="coarse_label")
    data_silo = DataSilo(processor=processor, batch_size=32, max_processes=128)
    silos = DataSiloForHoldout.make(
        data_silo,
        sets=['train'],
        n_splits=n_splits,
        train_split=0.8,
    )
    assert len(silos) == n_splits

    for silo in silos:
        train_ds = silo.get_data_loader("train").dataset
        dev_ds = silo.get_data_loader("dev").dataset
        test_ds = silo.get_data_loader("test").dataset

        # train_ds.datasets[0] is a Subset carved out of the holdout-train Subset;
        # its .dataset attribute is that original holdout-train Subset.
        train_idx_set = set(train_ds.datasets[0].indices)
        dev_idx_set = set(dev_ds.indices)
        orig_train_idx_set = set(train_ds.datasets[0].dataset.indices)
        test_idx_set = set(test_ds.indices)

        # The holdout train pool must not leak into test, and the per-silo
        # train split must not leak into dev.
        assert orig_train_idx_set.isdisjoint(test_idx_set)
        assert train_idx_set.isdisjoint(dev_idx_set)

    # Free the heavyweight objects explicitly (mirrors the other silo tests)
    del tokenizer
    del processor
    del data_silo
    del silos
def doc_classifcation():
    """Train a GermEval18 document classifier on top of converted fastText embeddings."""
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification_fasttext")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 3
    batch_size = 32
    evaluate_every = 100
    # load fasttext from a local path:
    #fasttext_model = "../saved_models/fasttext-german-uncased"
    # or through s3
    fasttext_model = "fasttext-german-uncased"
    do_lower_case = True
    max_features = 10_000  # maximum number of unique words we will transform
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. To make fastText work within FARM and with advanced aggregation strategies,
    #    we need a fixed vocabulary and associated word embeddings
    ft_converter = Fasttext_converter(
        pretrained_model_name_or_path=fasttext_model,
        do_lower_case=do_lower_case,
        data_path=Path("../data/germeval18"),
        train_filename="train.tsv",
        output_path=Path("../saved_models/fasttext-german-uncased-converted"),
        language="German",
        max_features=max_features)
    # We convert the data to have a fixed-size vocab and embeddings
    vocab_counts = ft_converter.convert_on_data()

    # 2. Create a tokenizer (from the converted model, so vocabularies match)
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=ft_converter.output_path,
        do_lower_case=do_lower_case)

    # 3. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
    #    Here we load GermEval 2018 data automatically if it is not available.
    #    GermEval 2018 only has train.tsv and test.tsv datasets - no dev.tsv
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=ft_converter.data_path,
        label_list=label_list,
        train_filename=ft_converter.train_filename,
        dev_split=0,
        test_filename="test.tsv",
        metric=metric,
        label_column_name="coarse_label")

    # 4. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders
    #    for them and calculates a few descriptive statistics of our datasets.
    #    Multiprocessing with the word-embedding tokenizer is not optimal - so disable it.
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size,
        max_processes=1
    )

    # 5. Create an AdaptiveModel
    # a) which consists of the newly created embedding model as a basis.
    language_model = LanguageModel.load(ft_converter.output_path)
    # b) and a prediction head on top that is suited for our task => text classification.
    #    Since we do not have a powerful transformer-based language model, we need a
    #    slightly deeper NN for the classification.
    prediction_head = TextClassificationHead(
        layer_dims=[300, 600, len(label_list)],
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 6. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-3,
        device=device,
        n_batches=len(
            data_silo.get_data_loader("train")
        ),  # for streaming silos use len(data_silo.get_data_loader("train")) instead of data_silo.loaders["train"]
        n_epochs=n_epochs)

    # 7. Feed everything to the Trainer, which keeps care of growing our model
    #    into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 8. Let it grow
    trainer.train()
batch_size = 32 evaluate_every = 30 lang_model = "bert-base-cased" # 1.Create a tokenizer tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False) # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset processor = BertStyleLMProcessor(data_dir="../data/lm_finetune_nips", tokenizer=tokenizer, max_seq_len=128, max_docs=30) # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets data_silo = DataSilo(processor=processor, batch_size=batch_size, max_multiprocessing_chunksize=20) # 4. Create an AdaptiveModel # a) which consists of a pretrained language model as a basis language_model = LanguageModel.load(lang_model) # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning lm_prediction_head = BertLMHead.load(lang_model) next_sentence_head = NextSentenceHead.load(lang_model) model = AdaptiveModel( language_model=language_model, prediction_heads=[lm_prediction_head, next_sentence_head], embeds_dropout_prob=0.1, lm_output_types=["per_token", "per_sequence"], device=device,
def test_qa(caplog):
    """Smoke-test QA training (legacy API variant) on tiny SQuAD samples, then run inference."""
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "bert-base-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model,
                               do_lower_case=False)
    label_list = ["start_token", "end_token"]
    # Tiny max_seq_len / doc_stride keep the test fast on the sample files
    processor = SquadProcessor(tokenizer=tokenizer,
                               max_seq_len=20,
                               doc_stride=10,
                               max_query_length=6,
                               train_filename="train-sample.json",
                               dev_filename="dev-sample.json",
                               test_filename=None,
                               data_dir="samples/qa",
                               label_list=label_list,
                               metric="squad")
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(base_LM_model)
    prediction_head = QuestionAnsweringHead(layer_dims=[768, len(label_list)])
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # NOTE(review): legacy optimizer/Trainer API — (optimizer, warmup_linear)
    # pair and model passed to train(), not the Trainer constructor.
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        warmup_proportion=0.2,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )
    model = trainer.train(model)

    save_dir = "testsave/qa"
    model.save(save_dir)
    processor.save(save_dir)

    # Round-trip check: reload the saved model and verify inference output shape
    QA_input = [{
        "questions": ["In what country is Normandy"],
        "text": 'The Normans gave their name to Normandy, a region in France.',
    }]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=QA_input, use_multiprocessing=False)
    assert isinstance(
        result[0]["predictions"][0]["answers"][0]["offset_start"], int)
def finetune_sentence_level(args):
    """Finetune and evaluate a sentence-level (multitask) model driven by an args namespace.

    Runs k-fold cross-validation when args.folds > 1; otherwise trains on a single
    split, evaluates on the test loader, optionally saves predictions, and logs a
    small inference example.
    """
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        datefmt="%d-%m-%y %H:%M:%S",
        level=logging.INFO)
    args.logger = logging.getLogger(__name__)
    if args.do_logfile:
        filehandler = logging.FileHandler(
            os.path.join(args.log_dir, f"{args.run_name}.log"))
        args.logger.addHandler(filehandler)
    args.logger.info(vars(args))

    # Setup MLFlow
    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name=args.experiment_name,
                              run_name=args.run_name)

    set_all_seeds(seed=args.seed)
    args.device, args.n_gpu = initialize_device_settings(use_cuda=True)

    # Create a tokenizer; tok_class picks a specific tokenizer implementation
    # when a model class name is given
    tok_class = None if not args.model_class_name else f"{args.model_class_name}Tokenizer"
    tokenizer = CustomTokenizer.load(
        pretrained_model_name_or_path=args.model_name,
        do_lower_case=args.do_lower_case,
        tokenizer_class=tok_class)

    # Create a processor for the dataset
    processor = load_processor(args, tokenizer)

    # Create a DataSilo that loads several datasets (train/dev/test),
    # provides DataLoaders and calculates descriptive statistics
    data_silo = DataSilo(processor=processor, batch_size=args.batch_size)
    if args.do_feat_embeds:
        # feature size is only known after the processor has seen the data
        args.feat_size = processor.feat_size

    # We do cross-validation
    if args.folds > 1:
        evaluate_kfold(args, data_silo, processor)
    else:
        adapt_model = train_on_split(args, data_silo, processor)
        evaluator_test = MultitaskEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=args.device)
        result = evaluator_test.eval(adapt_model, return_preds_and_labels=True)
        evaluator_test.log_results(result, "Test",
                                   steps=len(data_silo.get_data_loader("test")))

        pred_tsv = pd.DataFrame()
        args.logger.info("Test results:")
        # result[0] is the aggregate entry; per-task results start at index 1
        for res in result[1:]:
            args.logger.info(f"__{res['task_name']}__")
            if args.train_mode == "classification":
                metrics = classification_metrics(res.get("preds"),
                                                 res.get("labels"))
                args.logger.info(metrics)
            else:
                metrics = regression_metrics(res.get("preds"),
                                             res.get("labels"))
                for metric in metrics.keys():
                    args.logger.info(f"{metric}: {metrics[metric]}")
            if args.save_predictions:
                pred_tsv[f"{res['task_name']}_preds"] = res.get("preds")[0]
                pred_tsv[f"{res['task_name']}_labels"] = res.get("labels")[0]
        if args.save_predictions:
            save_tsv(pred_tsv, os.path.join(args.out_dir,
                                            f"{args.run_name}.tsv"))

    # Load trained model and perform inference on two example sentences
    dicts = [
        {
            "text": "The intense interest aroused in the public has now somewhat subsided."
        },
        {
            "text": "The quick brown fox jumped over the lazy dog."
        },
    ]
    model = MultitaskInferencer.load(args.save_dir, gpu=True, level="sentence")
    result = model.inference_from_dicts(dicts=dicts)
    args.logger.info("Inference example:")
    args.logger.info(result)
evaluate_every = 30 lang_model = "bert-base-cased" # 1.Create a tokenizer tokenizer = Tokenizer.from_pretrained(pretrained_model_name_or_path=lang_model, do_lower_case=False) # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset # We do not have a sample dataset for regression yet, add your own dataset to run the example processor = RegressionProcessor(tokenizer=tokenizer, max_seq_len=128, data_dir="../data/<YOUR-DATASET>", label_column_name="label") # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets data_silo = DataSilo(processor=processor, batch_size=batch_size) # 4. Create an AdaptiveModel # a) which consists of a pretrained language model as a basis language_model = LanguageModel.load(lang_model) # b) and a prediction head on top that is suited for our task => Text regression prediction_head = RegressionHead(layer_dims=[768, 1]) model = AdaptiveModel(language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.1, lm_output_types=["per_sequence_continuous"], device=device) # 5. Create an optimizer optimizer, warmup_linear = initialize_optimizer(
def test_ner_amp(caplog):
    """Smoke-test NER training with optional AMP (O1) and verify inference output.

    Fix vs. previous version: the prediction head's num_labels was hard-coded to 13;
    it now derives from len(ner_labels) so the head stays consistent with the label list.
    """
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    # Use mixed precision only when apex/AMP is installed
    if AMP_AVAILABLE:
        use_amp = 'O1'
    else:
        use_amp = None

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    # Tiny max_seq_len keeps the test fast on the bundled sample files
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename=Path("train-sample.txt"),
                             dev_filename=Path("dev-sample.txt"),
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    # FIX: derive num_labels from the label list instead of hard-coding 13
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_token"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-05,
        schedule_opts=None,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    # Reload the trained model and check a known entity is recognized
    basic_texts = [
        {
            "text": "1980 kam der Crown von Toyota"
        },
    ]
    model = Inferencer.load(save_dir, gpu=True)
    result = model.inference_from_dicts(dicts=basic_texts, max_processes=1)
    #print(result)
    assert result[0]["predictions"][0]["context"] == "Crown"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
def train_evaluation_single(seed=42):
    """Train roberta-base on SQuAD 2.0, evaluate on the dev set, and assert the
    metrics (and runtime) match known-good gold values.

    Fixes vs. previous version:
      - `gold_elapsed` was commented out but still referenced by the final
        assert, so the function always ended in a NameError; it is restored.
      - the top-n-recall err_msg reported the EM delta (copy-paste bug); it now
        reports the actual recall delta.

    :param seed: random seed applied to all RNGs for reproducibility.
    """
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 32 * 4  # 4x V100
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval during training
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # Load model and train
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
    language_model = LanguageModel.load(lang_model)
    prediction_head = QuestionAnsweringHead()
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    # Time the full training run so speed regressions are caught below
    starttime = time()
    trainer.train()
    elapsed = time() - starttime

    save_dir = Path("testsave/roberta-qa-dev")
    model.save(save_dir)
    processor.save(save_dir)

    # Create Evaluator on the dev split
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("dev"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnrecall = results[0]["top_n_recall"] * 100
    print(results)
    print(elapsed)

    # Gold values from a known-good reference run
    gold_f1 = 82.155
    gold_EM = 77.714
    gold_tnrecall = 97.3721
    gold_elapsed = 1286.30  # FIX: was commented out but still used below (NameError)

    np.testing.assert_allclose(
        f1_score,
        gold_f1,
        rtol=0.01,
        err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
    np.testing.assert_allclose(
        em_score,
        gold_EM,
        rtol=0.01,
        err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
    np.testing.assert_allclose(
        tnrecall,
        gold_tnrecall,
        rtol=0.01,
        # FIX: previously reported the EM delta here (copy-paste bug)
        err_msg=
        f"FARM Training changed for top n recall by: {tnrecall - gold_tnrecall}")
    np.testing.assert_allclose(
        elapsed,
        gold_elapsed,
        rtol=0.1,
        err_msg=
        f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds"
    )
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset # Here we load GermEval 2018 Data. label_list = ["OTHER", "OFFENSE"] metric = "f1_macro" processor = TextClassificationProcessor(tokenizer=tokenizer, max_seq_len=128, data_dir="../data/germeval18", label_list=label_list, metric=metric, label_column_name="coarse_label") # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets data_silo = DataSilo(processor=processor, batch_size=batch_size) # 4. Create an AdaptiveModel # a) which consists of a pretrained language model as a basis language_model = LanguageModel.load(lang_model) # b) and a prediction head on top that is suited for our task => Text classification prediction_head = TextClassificationHead( layer_dims=[ 768, len(processor.tasks["text_classification"]["label_list"]) ], class_weights=data_silo.calculate_class_weights( task_name="text_classification")) model = AdaptiveModel(language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.1,
def question_answering_crossvalidation():
    """Run k-fold cross-validation of an extractive QA model on COVID-QA.

    For each fold a fresh model is fine-tuned on the fold's train split and
    evaluated on its test split; per-fold metrics are collected and a global
    squad score is computed over the predictions of all folds.
    """
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)
    #ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    #ml_logger.init_experiment(experiment_name="QA_X-Validation", run_name="Squad_Roberta_Base")

    ##########################
    ########## Settings
    ##########################
    save_per_fold_results = False  # unsupported for now
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False
    n_epochs = 2
    batch_size = 80
    learning_rate = 3e-5
    data_dir = Path("../data/covidqa")
    filename = "COVID-QA.json"
    xval_folds = 5
    dev_split = 0
    evaluate_every = 0
    no_ans_boost = -100  # use large negative values to disable giving "no answer" option
    accuracy_at = 3  # accuracy at n is useful for answers inside long documents
    use_amp = None

    ##########################
    ########## k fold Cross validation
    ##########################
    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=filename,
        dev_filename=None,
        dev_split=dev_split,
        test_filename=None,
        data_dir=data_dir,
        doc_stride=192,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold):
        # Fine-tune a fresh copy of the pre-trained model on this fold's
        # train split and return the trained model.
        logger.info(
            f"############ Crossvalidation: Fold {n_fold} ############")
        # fine-tune pre-trained question-answering model
        model = AdaptiveModel.convert_from_transformers(
            lang_model, device=device, task_type="question_answering")
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)
        # If positive, this will boost "No Answer" as prediction.
        # If negative, this will prevent the model from giving "No Answer" as prediction.
        model.prediction_heads[0].no_ans_boost = no_ans_boost
        # Number of predictions the model will make per Question.
        # The multiple predictions are used for evaluating top n recall.
        model.prediction_heads[0].n_best = accuracy_at

        # # or train question-answering models from scratch
        # # Create an AdaptiveModel
        # # a) which consists of a pretrained language model as a basis
        # language_model = LanguageModel.load(lang_model)
        # # b) and a prediction head on top that is suited for our task => Question-answering
        # prediction_head = QuestionAnsweringHead(no_ans_boost=no_ans_boost, n_best=accuracy_at)
        # model = AdaptiveModel(
        #     language_model=language_model,
        #     prediction_heads=[prediction_head],
        #     embeds_dropout_prob=0.1,
        #     lm_output_types=["per_token"],
        #     device=device,)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=learning_rate,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, then evaluate the model on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    all_results = []
    all_preds = []
    all_labels = []
    all_f1 = []
    all_em = []
    all_topnaccuracy = []

    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   logging=False,
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        all_results.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))
        all_f1.append(result[0]["f1"])
        all_em.append(result[0]["EM"])
        all_topnaccuracy.append(result[0]["top_n_accuracy"])

        # empty cache to avoid memory leak and cuda OOM across multiple folds
        model.cpu()
        torch.cuda.empty_cache()

    # Save the per-fold results to json for a separate, more detailed analysis
    # TODO currently not supported - adjust to QAPred and QACandidate objects
    # if save_per_fold_results:
    #     def convert_numpy_dtype(obj):
    #         if type(obj).__module__ == "numpy":
    #             return obj.item()
    #
    #         raise TypeError("Unknown type:", type(obj))
    #
    #     with open("qa_xval.results.json", "wt") as fp:
    #         json.dump(all_results, fp, default=convert_numpy_dtype)

    # calculate overall metrics across all folds
    xval_score = squad(preds=all_preds, labels=all_labels)

    logger.info(f"Single EM-Scores:   {all_em}")
    logger.info(f"Single F1-Scores:   {all_f1}")
    logger.info(
        f"Single top_{accuracy_at}_accuracy Scores: {all_topnaccuracy}")
    logger.info(f"XVAL EM:   {xval_score['EM']}")
    logger.info(f"XVAL f1:   {xval_score['f1']}")
    logger.info(
        f"XVAL top_{accuracy_at}_accuracy:   {xval_score['top_n_accuracy']}")
    ml_logger.log_metrics({"XVAL EM": xval_score["EM"]}, 0)
    ml_logger.log_metrics({"XVAL f1": xval_score["f1"]}, 0)
    ml_logger.log_metrics(
        {f"XVAL top_{accuracy_at}_accuracy": xval_score["top_n_accuracy"]}, 0)
def test_doc_regression(caplog):
    """Smoke-test document regression: train one epoch on a tiny sample set,
    save the model, reload it via the Inferencer and check the prediction
    value is a float32."""
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)

    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tok = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                         do_lower_case=False)

    proc = RegressionProcessor(tokenizer=tok,
                               max_seq_len=8,
                               data_dir=Path("samples/doc_regr"),
                               train_filename="train-sample.tsv",
                               dev_filename="test-sample.tsv",
                               test_filename=None,
                               label_column_name="label")

    silo = DataSilo(processor=proc, batch_size=batch_size)

    # Assemble the adaptive model: BERT encoder plus one regression head.
    model = AdaptiveModel(language_model=LanguageModel.load(lang_model),
                          prediction_heads=[RegressionHead()],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={
            'name': 'CosineWarmup',
            'warmup_proportion': 0.1
        })

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)
    trainer.train()

    save_dir = Path("testsave/doc_regr")
    model.save(save_dir)
    proc.save(save_dir)

    sample_docs = [
        {
            "text": "The dress is just fabulous and it totally fits my size. The fabric is of great quality and the seams are really well hidden. I am super happy with this purchase and I am looking forward to trying some more from the same brand."
        },
        {
            "text": "it just did not fit right. The top is very thin showing everything."
        },
    ]

    # Round-trip through disk: load the saved model and run inference.
    inferencer = Inferencer.load(save_dir)
    preds = inferencer.inference_from_dicts(dicts=sample_docs)
    assert isinstance(preds[0]["predictions"][0]["pred"], np.float32)
def test_evaluation():
    """Evaluate deepset/roberta-base-squad2 on the SQuAD 2.0 dev set twice:
    first with FARM's internal Evaluator, then with the official SQuAD
    evaluation script, asserting scores and runtime stay near gold values."""
    ##########################
    ########## Settings
    ##########################
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False
    test_assertions = True
    data_dir = Path("testsave/data/squad20")
    evaluation_filename = "dev-v2.0.json"

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # loading models and evals
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="question_answering")
    model.prediction_heads[0].no_ans_boost = 0
    model.prediction_heads[0].n_best = 1

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    starttime = time()

    data_silo = DataSilo(processor=processor, batch_size=50)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    # 1. Test FARM internal evaluation
    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnrecall = results[0]["top_n_recall"] * 100
    elapsed = time() - starttime
    print(results)
    print(elapsed)

    gold_EM = 77.7478
    gold_f1 = 82.1557
    gold_tnrecall = 84.0646  # top 1 recall
    gold_elapsed = 70  # 4x V100
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=f"FARM Eval changed for EM by: {em_score-gold_EM}")
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=f"FARM Eval changed for f1 score by: {f1_score-gold_f1}")
        # BUGFIX: this message previously reported the EM delta instead of
        # the recall delta.
        np.testing.assert_allclose(
            tnrecall,
            gold_tnrecall,
            rtol=0.001,
            err_msg=f"FARM Eval changed for top 1 recall by: {tnrecall-gold_tnrecall}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )

    # 2. Test FARM predictions with outside eval script
    starttime = time()
    model = Inferencer(model=model,
                       processor=processor,
                       task_type="question_answering",
                       batch_size=50,
                       gpu=device.type == "cuda")
    filename = data_dir / evaluation_filename
    result = model.inference_from_file(file=filename)
    elapsed = time() - starttime

    # BUGFIX: create the directory the predictions are actually written to;
    # it previously created "../testsave" while writing to "testsave/".
    os.makedirs("testsave", exist_ok=True)
    write_squad_predictions(predictions=result,
                            predictions_filename=filename,
                            out_filename="testsave/predictions.json")
    script_params = {
        "data_file": filename,
        "pred_file": "testsave/predictions.json",
        "na_prob_thresh": 1,
        "na_prob_file": False,
        "out_file": False
    }
    results_official = squad_evaluation.main(OPTS=DotMap(script_params))
    f1_score = results_official["f1"]
    em_score = results_official["exact"]

    gold_EM = 78.4890
    gold_f1 = 81.7104
    gold_elapsed = 66  # 4x V100
    print(elapsed)
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for EM by: {em_score - gold_EM}"
        )
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for f1 score by: {f1_score - gold_f1}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"Inference speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )
def test_doc_classification(caplog):
    """End-to-end smoke test for German document classification: train one
    epoch on a tiny sample set, save, then run inference and check the first
    prediction's label and probability."""
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)

    n_epochs = 1
    batch_size = 8
    evaluate_every = 30
    lang_model = "bert-base-german-cased"

    tok = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    proc = GermEval18CoarseProcessor(tokenizer=tok,
                                     max_seq_len=64,
                                     data_dir="samples/doc_class",
                                     train_filename="train-sample.tsv",
                                     test_filename=None)

    silo = DataSilo(processor=proc, batch_size=batch_size)

    # BERT encoder plus a single classification head over the coarse labels.
    head = TextClassificationHead(layer_dims=[768, len(proc.label_list)])
    model = AdaptiveModel(language_model=Bert.load(lang_model),
                          prediction_heads=[head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_examples=silo.n_samples("train"),
        batch_size=batch_size,
        n_epochs=1)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      warmup_linear=warmup_linear,
                      evaluate_every=evaluate_every,
                      device=device)
    model = trainer.train(model)

    save_dir = "testsave/doc_class"
    model.save(save_dir)
    proc.save(save_dir)

    sample_texts = [
        {
            "text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]

    # Reload from disk and verify deterministic predictions.
    inferencer = Inferencer.load(save_dir)
    preds = inferencer.run_inference(dicts=sample_texts)
    assert preds[0]["predictions"][0]["label"] == "OTHER"
    assert abs(preds[0]["predictions"][0]["probability"] - 0.5358161) <= 0.0001
def question_answering():
    """Train a RoBERTa extractive-QA model on SQuAD 2.0, store it, then run
    inference on one sample question and on the whole dev set."""
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_question_answering")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 2
    evaluate_every = 2000
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    label_list = ["start_token", "end_token"]
    metric = "squad"
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=label_list,
        metric=metric,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("../data/squad20"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    # NOTE: In FARM, the dev set metrics differ from test set metrics in that they are calculated on a token level instead of a word level
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=False)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Question Answering
    prediction_head = QuestionAnsweringHead()

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    # 6. Feed everything to the Trainer, which keeps care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/bert-english-qa-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    QA_input = [{
        "qas": ["Who counted the game among the best ever made?"],
        "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]

    model = QAInferencer.load(save_dir, batch_size=40, gpu=True)
    result = model.inference_from_dicts(dicts=QA_input)[0]
    pprint.pprint(result)

    # 10. Do Inference on whole SQuAD Dataset & write the predictions file to disk
    filename = os.path.join(processor.data_dir, processor.dev_filename)
    result = model.inference_from_file(file=filename, return_json=False)
    result_squad = [x.to_squad_eval() for x in result]

    write_squad_predictions(predictions=result_squad,
                            predictions_filename=filename,
                            out_filename="predictions.json")
def doc_classifcation():
    """Fine-tune a text classifier on GermEval 2018 with early stopping and
    return the best score seen on the evaluation metric."""
    # NOTE(review): the function name is misspelled ("classifcation") but
    # renaming would break existing callers - keep until call sites change.
    # NOTE(review): relies on names defined outside this chunk (use_amp,
    # lang_model, label_list, metric, batch_size, n_epochs, evaluate_every) -
    # presumably module-level settings; verify against the full file.
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    tokenizer = AutoTokenizer.from_pretrained(lang_model, strip_accents=False)
    #tokenizer = Tokenizer.load(
    #    pretrained_model_name_or_path=lang_model,
    #    do_lower_case=do_lower_case)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir=Path("./data/germeval18"),
                                            label_list=label_list,
                                            metric=metric,
                                            dev_filename="test.tsv",  # we want to evaluate against test
                                            label_column_name="coarse_label",
                                            )

    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    # Head is weighted by class frequencies computed from the training split.
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # Stop training once `metric` has not improved for `patience` evaluations;
    # `best_so_far` tracks the best score observed.
    earlystopping = EarlyStopping(
        metric=metric, mode="max",
        #save_dir=Path("./saved_models"),
        patience=3
    )

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        early_stopping=earlystopping,
        device=device)

    trainer.train()

    return earlystopping.best_so_far
def doc_classification_with_earlystopping():
    """Fine-tune a German BERT text classifier on GermEval 2018 with early
    stopping on a custom metric, then run inference with both the final and
    the best saved model."""
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
def mymetrics(preds, labels):
    """Custom dev-set metric: accuracy plus per-class, macro- and
    micro-averaged F1 for the OTHER/OFFENSE classification task.

    :param preds: predicted labels
    :param labels: gold labels
    :return: dict mapping metric names to their values
    """
    acc = simple_accuracy(preds, labels)
    f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
    f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
    f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
    # BUGFIX: f1_micro was computed with average="macro", so the reported
    # "f1_micro" value silently duplicated the macro score.
    f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
    return {
        "acc": acc,
        "f1_other": f1other,
        "f1_offense": f1offense,
        "f1_macro": f1macro,
        "f1_micro": f1micro
    }

# Register the custom metric under a string name so the processor can use it.
register_metrics('mymetrics', mymetrics)
metric = 'mymetrics'

processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=64,
    data_dir=Path("../data/germeval18"),
    label_list=label_list,
    metric=metric,
    label_column_name="coarse_label")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text classification
prediction_head = TextClassificationHead(
    num_labels=len(label_list),
    class_weights=data_silo.calculate_class_weights(
        task_name="text_classification"))

model = AdaptiveModel(language_model=language_model,
                      prediction_heads=[prediction_head],
                      embeds_dropout_prob=0.2,
                      lm_output_types=["per_sequence"],
                      device=device)

# 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=0.5e-5,
    device=device,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,
    use_amp=use_amp)

# 6.
# Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
# Also create an EarlyStopping instance and pass it on to the trainer

# An early stopping instance can be used to save the model that performs best on the dev set
# according to some metric and stop training when no improvement is happening for some iterations.
earlystopping = EarlyStopping(
    metric="f1_offense",
    mode="max",  # use the metric from our own metrics function instead of loss
    # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
    # metric="loss", mode="min",  # use loss from the dev evaluator of the trainer
    save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
    patience=5  # number of evaluations to wait for improvement before terminating the training
)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  data_silo=data_silo,
                  epochs=n_epochs,
                  n_gpu=n_gpu,
                  lr_schedule=lr_schedule,
                  evaluate_every=evaluate_every,
                  device=device,
                  early_stopping=earlystopping)

# 7. Let it grow
trainer.train()

# 8. Hooray! You have a model.
# NOTE: if early stopping is used, the best model has been stored already in the directory
# defined with the EarlyStopping instance
# The model we have at this moment is the model from the last training epoch that was carried
# out before early stopping terminated the training
save_dir = Path("saved_models/bert-german-doc-tutorial")
model.save(save_dir)
processor.save(save_dir)

# 9.
# Load it & harvest your fruits (Inference)
basic_texts = [
    {
        "text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
    },
    {
        "text": "Martin Müller spielt Handball in Berlin"
    },
]

# Load from the final epoch directory and apply
print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
model = Inferencer.load(save_dir)
result = model.inference_from_dicts(dicts=basic_texts)
print(result)
model.close_multiprocessing_pool()

# Load from saved best model
print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
model = Inferencer.load(earlystopping.save_dir)
result = model.inference_from_dicts(dicts=basic_texts)
print("APPLICATION ON BEST MODEL")
print(result)
model.close_multiprocessing_pool()
def question_answering():
    """Fine-tune a SQuAD2-pretrained RoBERTa on Natural Questions with two
    prediction heads (span extraction + answer-type classification), then run
    inference with a pretrained NQ model fetched from S3."""
    # NOTE(review): another function named `question_answering` is defined
    # earlier in this file; at module level the later definition shadows the
    # earlier one - confirm this duplication is intended.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_natural_questions")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 1
    evaluate_every = 500
    lang_model = "deepset/roberta-base-squad2"  # start with a model that can already extract answers
    do_lower_case = False  # roberta is a cased model
    train_filename = "train_medium.jsonl"
    dev_filename = "dev_medium.jsonl"
    keep_is_impossible = 0.15  # downsample negative examples after data conversion
    downsample_context_size = 300  # reduce length of wikipedia articles to relevant part around the answer

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
    )
    # Add HTML tag tokens to the tokenizer vocabulary, so they do not get split apart
    html_tags = [
        "<Th>", "</Th>",
        "<Td>", "</Td>",
        "<Tr>", "</Tr>",
        "<Li>", "</Li>",
        "<P>", "</P>",
        "<Ul>", "</Ul>",
        "<H1>", "</H1>",
        "<H2>", "</H2>",
        "<H3>", "</H3>",
        "<H4>", "</H4>",
        "<H5>", "</H5>",
        "<Td_colspan=",
    ]
    tokenizer.add_tokens(html_tags)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = NaturalQuestionsProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        train_filename=train_filename,
        dev_filename=dev_filename,
        keep_no_answer=keep_is_impossible,
        downsample_context_size=downsample_context_size,
        data_dir=Path("../data/natural_questions"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, caching=True)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    # (the embedding matrix must grow by the number of added HTML tokens)
    language_model = LanguageModel.load(lang_model, n_added_tokens=len(html_tags))
    # b) and in case of Natural Questions we need two Prediction Heads
    #    one for extractive Question Answering
    qa_head = QuestionAnsweringHead()
    #    another one for answering yes/no questions or deciding if the given text passage might contain an answer
    classification_head = TextClassificationHead(
        num_labels=len(processor.answer_type_list))  # answer_type_list = ["is_impossible", "span", "yes", "no"]
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device
    )
    # 6. Feed everything to the Trainer, which keeps care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/roberta-base-squad2-nq")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Since training on the whole NQ corpus requires substantial compute resources we trained and uploaded a model on s3
    fetch_archive_from_http("https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/models/roberta-base-squad2-nq.zip",
                            output_dir="../saved_models/farm")
    QA_input = [
        {
            "qas": ["Did GameTrailers rated Twilight Princess as one of the best games ever created?"],
            "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
        }
    ]

    model = QAInferencer.load(model_name_or_path="../saved_models/farm/roberta-base-squad2-nq",
                              batch_size=batch_size,
                              gpu=True)
    result = model.inference_from_dicts(dicts=QA_input, return_json=False)  # result is a list of QAPred objects

    print(f"\nQuestion: Did GameTrailers rated Twilight Princess as one of the best games ever created?"
          f"\nAnswer from model: {result[0].prediction[0].answer}")
    model.close_multiprocessing_pool()
def train(self,
          data_dir: str,
          train_filename: str,
          dev_filename: str = None,
          test_filename: str = None,
          batch_size: int = 2,
          embed_title: bool = True,
          num_hard_negatives: int = 1,
          num_negatives: int = 0,
          n_epochs: int = 3,
          evaluate_every: int = 1000,
          n_gpu: int = 1,
          learning_rate: float = 1e-5,
          epsilon: float = 1e-08,
          weight_decay: float = 0.0,
          num_warmup_steps: int = 100,
          grad_acc_steps: int = 1,
          optimizer_name: str = "TransformersAdamW",
          optimizer_correct_bias: bool = True,
          save_dir: str = "../saved_models/dpr-tutorial",
          query_encoder_save_dir: str = "lm1",
          passage_encoder_save_dir: str = "lm2"):
    """
    Train a DensePassageRetrieval model.

    :param data_dir: Directory where training file, dev file and test file are present
    :param train_filename: training filename
    :param dev_filename: development set filename, file to be used by model in eval step of training
    :param test_filename: test set filename, file to be used by model in test step after training
    :param batch_size: total number of samples in 1 batch of data
    :param embed_title: whether to concatenate passage title with each passage. The default setting in official DPR embeds passage title with the corresponding passage
    :param num_hard_negatives: number of hard negative passages (passages which are very similar (high score by BM25) to query but do not contain the answer)
    :param num_negatives: number of negative passages (any random passage from dataset which do not contain answer to query)
    :param n_epochs: number of epochs to train the model on
    :param evaluate_every: number of training steps after evaluation is run
    :param n_gpu: number of gpus to train on
    :param learning_rate: learning rate of optimizer
    :param epsilon: epsilon parameter of optimizer
    :param weight_decay: weight decay parameter of optimizer
    :param grad_acc_steps: number of steps to accumulate gradient over before back-propagation is done
    :param optimizer_name: what optimizer to use (default: TransformersAdamW)
    :param num_warmup_steps: number of warmup steps
    :param optimizer_correct_bias: Whether to correct bias in optimizer
    :param save_dir: directory where models are saved
    :param query_encoder_save_dir: directory inside save_dir where query_encoder model files are saved
    :param passage_encoder_save_dir: directory inside save_dir where passage_encoder model files are saved
    """
    self.embed_title = embed_title
    # Processor converts (query, positive/negative passage) pairs into
    # features for the bi-encoder; tokenizers come from the instance.
    self.processor = TextSimilarityProcessor(tokenizer=self.query_tokenizer,
                                             passage_tokenizer=self.passage_tokenizer,
                                             max_seq_len_passage=self.max_seq_len_passage,
                                             max_seq_len_query=self.max_seq_len_query,
                                             label_list=["hard_negative", "positive"],
                                             metric="text_similarity_metric",
                                             data_dir=data_dir,
                                             train_filename=train_filename,
                                             dev_filename=dev_filename,
                                             test_filename=test_filename,
                                             embed_title=self.embed_title,
                                             num_hard_negatives=num_hard_negatives,
                                             num_negatives=num_negatives)

    self.model.connect_heads_with_processor(self.processor.tasks,
                                            require_labels=True)

    data_silo = DataSilo(processor=self.processor,
                         batch_size=batch_size,
                         distributed=False)

    # 5. Create an optimizer
    self.model, optimizer, lr_schedule = initialize_optimizer(
        model=self.model,
        learning_rate=learning_rate,
        optimizer_opts={
            "name": optimizer_name,
            "correct_bias": optimizer_correct_bias,
            "weight_decay": weight_decay,
            "eps": epsilon
        },
        schedule_opts={
            "name": "LinearWarmup",
            "num_warmup_steps": num_warmup_steps
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        grad_acc_steps=grad_acc_steps,
        device=self.device)

    # 6. Feed everything to the Trainer, which keeps care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=self.model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=self.device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # Save both encoders into their sub-directories under save_dir.
    self.model.save(Path(save_dir),
                    lm1_name=query_encoder_save_dir,
                    lm2_name=passage_encoder_save_dir)
    self.processor.save(Path(save_dir))
def train(self, data_dir, train_filename, dev_filename=None, test_file_name=None,
          use_gpu=None, batch_size=10, n_epochs=2, learning_rate=1e-5,
          max_seq_len=None, warmup_proportion=0.2, dev_split=0.1,
          evaluate_every=300, save_dir=None):
    """
    Fine-tune a model on a SQuAD-style QA dataset.

    Works both for plain language models (e.g. `bert-base-cased`) that should be
    trained for QA from scratch, and for existing QA models
    (e.g. `deepset/bert-base-cased-squad2`) that should be adapted to your domain.

    :param data_dir: Directory holding the SQuAD-style training data.
    :param train_filename: Name of the training file inside `data_dir`.
    :param dev_filename: Name of the dev/eval file. If given, `dev_split` is ignored.
    :param test_file_name: Name of the test file.
    :param use_gpu: Whether to use a GPU (if available); defaults to the reader's setting.
    :param batch_size: Samples per training batch.
    :param n_epochs: Passes over the full training set.
    :param learning_rate: Optimizer learning rate.
    :param max_seq_len: Token limit per text; defaults to the reader's setting.
    :param warmup_proportion: Fraction of steps with linearly increasing LR
        before the linear decay starts.
    :param dev_split: Fraction of training data split off for eval when no
        `dev_filename` is supplied.
    :param evaluate_every: Evaluate on the hold-out set every X steps.
    :param save_dir: Where the final model is stored.
    :return: None
    """
    # An explicit dev file takes precedence over splitting the training data.
    if dev_filename:
        dev_split = None

    set_all_seeds(seed=42)

    # Fall back to the values chosen when the FARMReader was initialized,
    # unless the caller overrides them for training.
    use_gpu = self.use_gpu if use_gpu is None else use_gpu
    max_seq_len = self.max_seq_len if max_seq_len is None else max_seq_len

    device, n_gpu = initialize_device_settings(use_cuda=use_gpu)

    # Default save location is derived from the underlying language model's name.
    save_dir = Path(save_dir or f"../../saved_models/{self.inferencer.model.language_model.name}")

    # 1. Processor: converts the raw SQuAD-style files into pytorch Datasets.
    squad_processor = SquadProcessor(
        tokenizer=self.inferencer.processor.tokenizer,
        max_seq_len=max_seq_len,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        dev_split=dev_split,
        test_filename=test_file_name,
        data_dir=Path(data_dir),
    )

    # 2. DataSilo: loads train/dev/test and provides DataLoaders plus statistics.
    silo = DataSilo(processor=squad_processor, batch_size=batch_size, distributed=False)

    # 3. Optimizer and LR schedule wrapped around the already initialized model.
    model, optimizer, lr_schedule = initialize_optimizer(
        model=self.inferencer.model,
        learning_rate=learning_rate,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": warmup_proportion,
        },
        n_batches=len(silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 4. Trainer: runs the training loop and periodic evaluation.
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 5. Train, then adopt the trained model and persist everything.
    self.inferencer.model = trainer.train()
    self.save(save_dir)
def doc_classification(
        task_config,
        model_name_or_path,
        cache_dir,
        data_dir,
        save_dir,
        model_dir,
        run_name="0",
        lr=1e-05,
        warmup_steps=5000,
        balance_classes=True,
        embeds_dropout=0.1,
        epochs=200,  # large because we use early stopping by default
        batch_size=20,
        grad_acc_steps=1,
        early_stopping_metric="roc_auc",
        early_stopping_mode="max",
        early_stopping_patience=10,
        model_class="Bert",
        tokenizer_class="BertTokenizer",
        do_lower_case=False,
        do_train=True,
        do_eval=True,
        do_hpo=False,
        print_preds=False,
        print_dev_preds=False,
        max_seq_len=512,
        seed=11,
        eval_every=500,
        use_amp=False,
        use_cuda=True,
):
    """
    Train and/or evaluate a (multi-label) document classification model driven
    by a YAML task config.

    :param task_config: Path to a YAML file describing the task (data files,
        metric, multilabel flag, experiment name, log dir, ...).
    :param model_name_or_path: Pretrained language model name or local model dir.
    :param cache_dir: Directory for the DataSilo preprocessing cache.
    :param data_dir: Directory with the task data; when the label list is given
        as a file name it is looked up under `data_dir / "labels"`.
    :param save_dir: Currently unused — kept for interface compatibility;
        predictions and results are written under `model_dir`.
    :param model_dir: Directory for early-stopping checkpoints, the final model
        and the evaluation result files.
    :param run_name: Suffix appended to the MLFlow run name.
    :param do_train / do_eval / do_hpo: Toggle training, test-set evaluation and
        Ray-Tune score reporting respectively.
    :return: None
    """
    # Load task config. Use a context manager so the file handle is closed
    # (the previous `yaml.safe_load(open(...))` leaked it).
    with open(task_config) as task_config_file:
        task_config = yaml.safe_load(task_config_file)

    # Normalize path-like arguments once so every later join works whether the
    # caller passed str or Path. This also fixes a TypeError in the eval-only
    # branch, where `model_dir` became a Path and was then concatenated with
    # `+ "/eval_results.txt"` (Path + str raises).
    data_dir = Path(data_dir)
    model_dir = Path(model_dir)

    # Create label list from args list or (for large label lists) create from file by splitting by space
    if isinstance(task_config["data"]["label_list"], list):
        label_list = task_config["data"]["label_list"]
    else:
        with open(data_dir / 'labels' / task_config["data"]["label_list"]) as code_file:
            label_list = code_file.read().split(" ")

    # Register Outcome Metrics
    register_task_metrics(label_list)

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=model_name_or_path,
        tokenizer_class=tokenizer_class,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=data_dir,
        label_list=label_list,
        metric=task_config["metric"],
        multilabel=task_config["multilabel"],
        train_filename=task_config["data"]["train_filename"],
        dev_filename=task_config["data"]["dev_filename"],
        # .get() returns None when the key is absent — same behavior as the
        # previous `x if "dev_split" in ... else None`, just idiomatic.
        dev_split=task_config["data"].get("dev_split"),
        test_filename=task_config["data"]["test_filename"],
        delimiter=task_config["data"]["parsing"]["delimiter"],
        quote_char=task_config["data"]["parsing"]["quote_char"],
        label_column_name=task_config["data"]["parsing"]["label_column"])

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         caching=True,
                         cache_path=Path(cache_dir),
                         batch_size=batch_size)

    if do_train:
        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(
            experiment_name=task_config["experiment_name"],
            run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(model_name_or_path, language_model_class=model_class)

        # b) and a prediction head on top that is suited for our task.
        # Optionally weight classes to counter label imbalance.
        if balance_classes:
            class_weights = data_silo.calculate_class_weights(
                task_name=task_config["task_type"])
        else:
            class_weights = None

        # Create Multi- or Single-Label Classification Heads
        if task_config["multilabel"]:
            prediction_head = MultiLabelTextClassificationHead(
                class_weights=class_weights,
                num_labels=len(label_list))
        else:
            prediction_head = ExtendedTextClassificationHead(
                class_weights=class_weights,
                num_labels=len(label_list))

        model = ExtendedAdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=[task_config["output_type"]],
            device=device)

        # 5. Create an optimizer
        schedule_opts = {
            "name": "LinearWarmup",
            "num_warmup_steps": warmup_steps,
        }
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(
                mode=early_stopping_mode,
                min_delta=0.0001,
                save_dir=model_dir,
                metric=early_stopping_metric,
                patience=early_stopping_patience)

        # 7. Feed everything to the Trainer, which keeps care of growing our model
        # and evaluates it from time to time
        trainer = ExtendedTrainer(model=model,
                                  optimizer=optimizer,
                                  data_silo=data_silo,
                                  epochs=epochs,
                                  n_gpu=n_gpu,
                                  lr_schedule=lr_schedule,
                                  evaluate_every=eval_every,
                                  early_stopping=early_stopping,
                                  device=device,
                                  grad_acc_steps=grad_acc_steps,
                                  evaluator_test=do_eval)

        def score_callback(eval_score, train_loss):
            # Report intermediate scores to Ray Tune during hyper-parameter search.
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        model.save(model_dir / "final_model")
        processor.save(model_dir / "final_model")

    if do_eval:
        if not do_train:
            # No freshly trained model: evaluate the given pretrained/local one.
            model_dir = Path(model_name_or_path)

        logger.info("###### Eval on TEST SET #####")
        evaluator_test = ExtendedEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)

        # Load trained model for evaluation (best checkpoint written by early
        # stopping lives directly in model_dir)
        model = ExtendedAdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results,
                          dataset_name="test",
                          steps=len(evaluator_test.data_loader),
                          save_path=model_dir / "eval_results.txt")

        if print_preds:
            # Print model test predictions
            utils.save_predictions(results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"])

        if print_dev_preds:
            # Evaluate on dev set, e.g. for threshold tuning
            evaluator_dev = Evaluator(
                data_loader=data_silo.get_data_loader("dev"),
                tasks=data_silo.processor.tasks,
                device=device)
            dev_results = evaluator_dev.eval(model, return_preds_and_labels=True)
            utils.log_results(dev_results,
                              dataset_name="dev",
                              steps=len(evaluator_dev.data_loader),
                              save_path=model_dir / "eval_dev_results.txt")
            # Print model dev predictions
            utils.save_predictions(dev_results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"],
                                   dataset_name="dev")
def train(
    self,
    data_dir: str,
    train_filename: str,
    dev_filename: Optional[str] = None,
    test_filename: Optional[str] = None,
    use_gpu: Optional[bool] = None,
    batch_size: int = 10,
    n_epochs: int = 2,
    learning_rate: float = 1e-5,
    max_seq_len: Optional[int] = None,
    warmup_proportion: float = 0.2,
    dev_split: float = 0,
    evaluate_every: int = 300,
    save_dir: Optional[str] = None,
    num_processes: Optional[int] = None,
    use_amp: str = None,
):
    """
    Fine-tune a model on a QA dataset. Options:

    - Take a plain language model (e.g. `bert-base-cased`) and train it for QA (e.g. on SQuAD data)
    - Take a QA model (e.g. `deepset/bert-base-cased-squad2`) and fine-tune it for your domain
      (e.g. using your labels collected via the haystack annotation tool)

    :param data_dir: Path to directory containing your training data in SQuAD style
    :param train_filename: Filename of training data
    :param dev_filename: Filename of dev / eval data
    :param test_filename: Filename of test data
    :param dev_split: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here
                      that gets split off from training data for eval.
    :param use_gpu: Whether to use GPU (if available)
    :param batch_size: Number of samples the model receives in one batch for training
    :param n_epochs: Number of iterations on the whole training data set
    :param learning_rate: Learning rate of the optimizer
    :param max_seq_len: Maximum text length (in tokens). Everything longer gets cut down.
    :param warmup_proportion: Proportion of training steps until maximum learning rate is reached.
                              Until that point LR is increasing linearly. After that it's decreasing again linearly.
                              Options for different schedules are available in FARM.
    :param evaluate_every: Evaluate the model every X steps on the hold-out eval dataset
    :param save_dir: Path to store the final model
    :param num_processes: The number of processes for `multiprocessing.Pool` during preprocessing.
                          Set to value of 1 to disable multiprocessing. When set to 1, you cannot split away a dev set
                          from train set. Set to None to use all CPU cores minus one.
    :param use_amp: Optimization level of NVIDIA's automatic mixed precision (AMP). The higher the level, the faster
                    the model. Available options: None (Don't use AMP), "O0" (Normal FP32 training),
                    "O1" (Mixed Precision => Recommended), "O2" (Almost FP16), "O3" (Pure FP16).
                    See details on: https://nvidia.github.io/apex/amp.html
    :return: None
    """
    import tempfile  # local import: only needed for the DataParallel workaround below

    # An explicit dev file takes precedence over splitting the training data.
    if dev_filename:
        dev_split = 0

    if num_processes is None:
        # Keep one core free for the main process; `or 1` covers single-core machines.
        num_processes = multiprocessing.cpu_count() - 1 or 1

    set_all_seeds(seed=42)

    # For these variables, by default, we use the value set when initializing the FARMReader.
    # These can also be manually set when train() is called if you want a different value at train vs inference
    if use_gpu is None:
        use_gpu = self.use_gpu
    if max_seq_len is None:
        max_seq_len = self.max_seq_len

    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=use_amp)

    if not save_dir:
        save_dir = f"../../saved_models/{self.inferencer.model.language_model.name}"

    # 1. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    label_list = ["start_token", "end_token"]
    metric = "squad"
    processor = SquadProcessor(
        tokenizer=self.inferencer.processor.tokenizer,
        max_seq_len=max_seq_len,
        label_list=label_list,
        metric=metric,
        train_filename=train_filename,
        dev_filename=dev_filename,
        dev_split=dev_split,
        test_filename=test_filename,
        data_dir=Path(data_dir),
    )

    # 2. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False,
                         max_processes=num_processes)

    # Quick-fix until this is fixed upstream in FARM:
    # We must avoid applying DataParallel twice (once when loading the inferencer,
    # once when calling initialize_optimizer), so we round-trip the model through disk.
    # Fixed here: the previous hard-coded "tmp_model" directory in the CWD could collide
    # between concurrent runs and was leaked if load() raised before the rmtree call.
    # TemporaryDirectory gives a unique path and guarantees cleanup even on error.
    with tempfile.TemporaryDirectory() as tmp_model_dir:
        self.inferencer.model.save(tmp_model_dir)
        model = BaseAdaptiveModel.load(load_dir=tmp_model_dir, device=device, strict=True)

    # 3. Create an optimizer and pass the already initialized model
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=learning_rate,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": warmup_proportion,
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp,
    )

    # 4. Feed everything to the Trainer, which keeps care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
        use_amp=use_amp,
    )

    # 5. Let it grow!
    self.inferencer.model = trainer.train()
    self.save(Path(save_dir))