def test_processor_saving_loading(caplog):
    """Verify that a processor survives a save / load round trip unchanged.

    A ``TextClassificationProcessor`` converts the sample file into a dataset,
    is saved to disk, re-loaded from that directory and used to convert the
    same file again. Both conversions must yield identical tensor names and
    identical tensors.

    Args:
        caplog: pytest log-capture fixture; used to silence everything below
            CRITICAL while the test runs.
    """
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    lang_model = "bert-base-cased"
    sample_file = "samples/doc_class/train-sample.tsv"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
    )

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir="samples/doc_class",
        train_filename="train-sample.tsv",
        dev_filename=None,
        test_filename=None,
        dev_split=0.1,
        columns=["text", "label", "unused"],
        label_list=["OTHER", "OFFENSE"],
        metrics=["f1_macro"],
    )
    dicts = processor.file_to_dicts(file=sample_file)
    data, tensor_names = processor.dataset_from_dicts(dicts)

    save_dir = "testsave/processor"
    processor.save(save_dir)

    processor = processor.load_from_dir(save_dir)
    dicts = processor.file_to_dicts(file=sample_file)
    data_loaded, tensor_names_loaded = processor.dataset_from_dicts(dicts)

    assert tensor_names == tensor_names_loaded
    # Compare the tensor counts explicitly: iterating over only one side would
    # let additional tensors in the re-loaded dataset go unnoticed.
    assert len(data.tensors) == len(data_loaded.tensors)
    for original, reloaded in zip(data.tensors, data_loaded.tensors):
        # torch.eq broadcasts, so check the shapes before the element-wise
        # comparison to rule out a "match" that only holds after broadcasting.
        assert original.shape == reloaded.shape
        assert torch.all(torch.eq(original, reloaded))
optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device, ) # 7. Let it grow model = trainer.train(model) # 8. Hooray! You have a model. Store it: save_dir = "saved_models/bert-german-doc-tutorial" model.save(save_dir) processor.save(save_dir) # 9. Load it & harvest your fruits (Inference) basic_texts = [ { "text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei" }, { "text": "Martin Müller spielt Handball in Berlin" }, ] model = Inferencer.load(save_dir) result = model.inference_from_dicts(dicts=basic_texts) print(result) # fmt: on
def doc_classifcation():
    """Fine-tune German BERT for coarse offense detection and run inference.

    End-to-end tutorial on the GermEval 2018 data: build tokenizer, processor,
    data silo and adaptive model, train for one epoch, store model and
    processor, then load both through an ``Inferencer`` and print the
    predictions for two sample sentences.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_doc_classification",
    )

    # ------------------------------ Settings ------------------------------
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    # A local checkpoint works too, e.g. Path("../saved_models/farm-bert-base-cased").
    lang_model = "bert-base-german-cased"
    use_amp = None

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # Step 1: tokenizer matching the language model.
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
    )

    # Step 2: processor that turns raw GermEval 2018 text into a PyTorch dataset.
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label",
    )

    # Step 3: data silo holding the train/dev/test sets and their DataLoaders.
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Step 4: adaptive model = pretrained language model + classification head.
    language_model = LanguageModel.load(lang_model)
    class_weights = data_silo.calculate_class_weights(task_name="text_classification")
    prediction_head = TextClassificationHead(
        class_weights=class_weights,
        num_labels=len(label_list),
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # Step 5: optimizer and learning-rate schedule.
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
        use_amp=use_amp,
    )

    # Step 6: the trainer runs the training loop and evaluates periodically.
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # Step 7: train.
    trainer.train()

    # Step 8: persist model and processor side by side.
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # Step 9: reload everything for inference and print the predictions.
    sample_sentences = (
        "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei",
        "Martin Müller spielt Handball in Berlin",
    )
    basic_texts = [{"text": sentence} for sentence in sample_sentences]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
def test_doc_classification(caplog):
    """Train a small German BERT classifier and check its predictions.

    After one epoch on the sample data the first example must be labelled
    "OTHER" with a probability within 0.1 of 0.7. The processor is then
    re-loaded from disk and has to yield exactly the same predictions.

    Args:
        caplog: pytest log-capture fixture; used to silence everything below
            CRITICAL while the test runs.
    """
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 8
    evaluate_every = 5
    lang_model = "bert-base-german-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
    )

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir="samples/doc_class",
        train_filename="train-sample.tsv",
        label_list=["OTHER", "OFFENSE"],
        metric="f1_macro",
        dev_filename="test-sample.tsv",
        test_filename=None,
        dev_split=0.0,
        label_column_name="coarse_label",
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)

    # Output layer size follows the number of labels registered on the processor.
    n_labels = len(processor.tasks["text_classification"]["label_list"])
    prediction_head = TextClassificationHead(layer_dims=[768, n_labels])

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    save_dir = "testsave/doc_class"
    model.save(save_dir)
    processor.save(save_dir)

    sentences = (
        "Martin Müller spielt Handball in Berlin.",
        "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei.",
        "Franzosen verteidigen 2:1-Führung – Kritische Stimmen zu Schwedens Superstar",
        "Neues Video von Designern macht im Netz die Runde",
        "23-jähriger Brasilianer muss vier Spiele pausieren – Entscheidung kann noch angefochten werden",
        "Aufständische verwendeten Chemikalie bei Gefechten im August.",
        "Bewährungs- und Geldstrafe für 26-Jährigen wegen ausländerfeindlicher Äußerung",
        "ÖFB-Teamspieler nur sechs Minuten nach seinem Tor beim 1:1 gegen Sunderland verletzt ausgewechselt",
        "Ein 31-jähriger Polizist soll einer 42-Jährigen den Knöchel gebrochen haben",
        "18 Menschen verschleppt. Kabul – Nach einem Hubschrauber-Absturz im Norden Afghanistans haben Sicherheitskräfte am Mittwoch versucht",
    )
    basic_texts = [{"text": sentence} for sentence in sentences]

    # TODO enable Inferencer.load(save_dir) here again once the migration
    # towards "processor.tasks" is finished.
    inf = Inferencer(model=model, processor=processor)
    result = inf.run_inference(dicts=basic_texts)
    first_prediction = result[0]["predictions"][0]
    assert first_prediction["label"] == "OTHER"
    assert abs(first_prediction["probability"] - 0.7) <= 0.1

    # A processor restored from disk must behave exactly like the original one.
    loaded_processor = TextClassificationProcessor.load_from_dir(save_dir)
    inf2 = Inferencer(model=model, processor=loaded_processor)
    result_2 = inf2.run_inference(dicts=basic_texts)

    paired_results = list(zip(result, result_2))
    pprint(paired_results)
    for r1, r2 in paired_results:
        assert r1 == r2
def test_doc_classification(caplog=None):
    """Smoke-test document classification with German DistilBERT.

    Trains for one epoch on the sample data, saves model and processor,
    re-loads them through ``Inferencer.load`` and checks that the predicted
    probability is a ``np.float32``.

    Args:
        caplog: Optional pytest log-capture fixture. When given, everything
            below CRITICAL is silenced.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
    )

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir=Path("samples/doc_class"),
        train_filename=Path("train-sample.tsv"),
        label_list=["OTHER", "OFFENSE"],
        metric="f1_macro",
        dev_filename="test-sample.tsv",
        test_filename=None,
        dev_split=0.0,
        label_column_name="coarse_label",
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = DistilBert.load(lang_model)
    prediction_head = TextClassificationHead()
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        n_batches=n_train_batches,
        n_epochs=1,
        device=device,
        schedule_opts=None,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    model = trainer.train(model)

    save_dir = Path("testsave/doc_class")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Malte liebt Berlin."},
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."},
    ]

    inf = Inferencer.load(save_dir, batch_size=2)
    result = inf.inference_from_dicts(dicts=basic_texts)
    first_probability = result[0]["predictions"][0]["probability"]
    assert isinstance(first_probability, np.float32)
def doc_classification(args):
    """Train a GermEval 2018 offense classifier and store it in /opt/ml/model.

    Args:
        args: Object providing the run configuration. Attributes read here:
            ``base_lm_model``, ``max_seq_len``, ``batch_size``, ``n_epochs``
            and ``evaluate_every``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    tracker = MLFlowLogger(tracking_uri="")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_doc_classification",
    )

    set_all_seeds(seed=42)
    save_dir = Path("/opt/ml/model")
    use_amp = None

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # Step 1: tokenizer for the requested base language model.
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=args.base_lm_model,
        do_lower_case=False,
    )

    # Step 2: processor converting the raw GermEval 2018 text into a PyTorch dataset.
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=args.max_seq_len,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label",
    )

    # Step 3: data silo with train/dev/test sets and their DataLoaders.
    data_silo = DataSilo(processor=processor, batch_size=args.batch_size)

    # Step 4: adaptive model = pretrained language model + classification head.
    language_model = LanguageModel.load(args.base_lm_model)
    class_weights = data_silo.calculate_class_weights(task_name="text_classification")
    prediction_head = TextClassificationHead(
        class_weights=class_weights,
        num_labels=len(label_list),
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # Step 5: optimizer and learning-rate schedule.
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=n_train_batches,
        n_epochs=args.n_epochs,
        use_amp=use_amp,
    )

    # Step 6: the trainer runs the training loop and evaluates periodically.
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=args.n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=args.evaluate_every,
        device=device,
    )

    # Step 7: train.
    trainer.train()

    # Step 8: persist model and processor in the output directory.
    model.save(save_dir)
    processor.save(save_dir)
def doc_classification_cola():
    """Fine-tune bert-base-cased on the CoLA data and run a short inference demo.

    Trains for five epochs using the Matthews correlation coefficient ("mcc")
    as metric, stores model and processor, re-loads them through an
    ``Inferencer`` and prints the predictions for two sample sentences.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")

    # ------------------------------ Settings ------------------------------
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 5
    batch_size = 100
    evaluate_every = 20
    lang_model = "bert-base-cased"
    do_lower_case = False

    # Step 1: tokenizer matching the language model.
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case,
    )

    # Step 2: processor converting the raw CoLA text into a PyTorch dataset.
    label_list = ["0", "1"]
    metric = "mcc"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/cola"),
        dev_filename=Path("dev.tsv"),
        dev_split=None,
        test_filename=None,
        label_list=label_list,
        metric=metric,
        label_column_name="label",
    )

    # Step 3: data silo with the datasets and their DataLoaders.
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Step 4: adaptive model = pretrained language model + classification head.
    # (Roberta.load(lang_model) would be the alternative for a RoBERTa model.)
    language_model = LanguageModel.load(lang_model)
    class_weights = data_silo.calculate_class_weights(task_name="text_classification")
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=class_weights,
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # Step 5: optimizer and learning-rate schedule.
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
    )

    # Step 6: the trainer runs the training loop and evaluates periodically.
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # Step 7: train.
    trainer.train()

    # Step 8: persist model and processor side by side.
    save_dir = Path("saved_models/bert-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # Step 9: reload everything for inference and print the predictions.
    sample_sentences = (
        "The box contained the ball from the tree.",
        "I'll fix you a drink.",
    )
    basic_texts = [{"text": sentence} for sentence in sample_sentences]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
def main(args):
    """Train a FARM model for text classification or question answering.

    The processor is saved to the checkpoint directory before training, the
    data silo caches into that same directory, and training runs with early
    stopping (patience of 5 evaluations).

    Args:
        args: Object providing the run configuration. Attributes read here:
            ``ckpt_path``, ``run_name``, ``tracking_uri``,
            ``experiment_name``, ``pretrained_model_name_or_path``,
            ``task_name``, ``train_filename``, ``test_filename``,
            ``max_seq_len``, ``data_dir``, ``label_list``, ``metric``,
            ``label_column_name``, ``text_column_name``, ``batch_size``,
            ``eval_batch_size``, ``embeds_dropout_prob``, ``learning_rate``,
            ``n_epochs``, ``evaluate_every``, ``checkpoints_to_keep``,
            ``checkpoint_every`` and ``n_gpu``.

    Raises:
        ValueError: If ``args.task_name`` is neither "text_classification"
            nor "question_answering".
    """
    print(f"[INFO] PyTorch Version: {torch.__version__}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[INFO] Devices available: {device}")

    # Validate the task once, before any expensive setup (MLflow run,
    # tokenizer/model loading). Everything below can then rely on the task
    # being one of the two supported values.
    task_name = args.task_name
    if task_name not in ("text_classification", "question_answering"):
        raise ValueError("task name error")
    is_question_answering = task_name == "question_answering"

    checkpoint_path = Path(args.ckpt_path) / args.run_name

    ml_logger = MLFlowLogger(tracking_uri=args.tracking_uri)
    ml_logger.init_experiment(
        experiment_name=args.experiment_name,
        run_name=args.run_name,
    )

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
        do_lower_case=False,
    )

    # Processor
    if is_question_answering:
        # NOTE(review): dev and test both point at args.test_filename —
        # confirm this is intended.
        processor = SquadProcessor(
            tokenizer=tokenizer,
            train_filename=args.train_filename,
            dev_filename=args.test_filename,
            test_filename=args.test_filename,
            max_seq_len=args.max_seq_len,
            data_dir=args.data_dir,
            label_list=args.label_list,
            metric=args.metric,
            max_query_length=64,
            doc_stride=128,
            max_answers=1,
        )
    else:
        processor = TextClassificationProcessor(
            tokenizer=tokenizer,
            train_filename=args.train_filename,
            dev_filename=None,
            test_filename=args.test_filename,
            header=0,
            max_seq_len=args.max_seq_len,
            data_dir=args.data_dir,
            label_list=args.label_list,
            metric=args.metric,
            label_column_name=args.label_column_name,
            text_column_name=args.text_column_name,
        )
    processor.save(checkpoint_path)

    # DataSilo
    data_silo = DataSilo(
        processor=processor,
        batch_size=args.batch_size,
        eval_batch_size=args.eval_batch_size,
        caching=True,
        cache_path=checkpoint_path,
    )

    # LanguageModel: build the pretrained language model.
    # NOTE(review): the language is hard-coded to "korean".
    language_model = LanguageModel.load(
        args.pretrained_model_name_or_path,
        language="korean",
    )

    # PredictionHead and matching language-model output type
    if is_question_answering:
        prediction_head = QuestionAnsweringHead(
            layer_dims=[768, 2],
            task_name=task_name,
        )
        lm_output_types = ["per_token"]
    else:
        # Class weights change the loss function to down-weight frequent
        # classes, which helps with imbalanced data.
        prediction_head = TextClassificationHead(
            num_labels=len(args.label_list),
            class_weights=data_silo.calculate_class_weights(task_name=task_name),
        )
        lm_output_types = ["per_sequence"]

    # AdaptiveModel: combine language model and prediction head
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=args.embeds_dropout_prob,
        lm_output_types=lm_output_types,
        device=device,
    )

    # Optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        device=device,
        learning_rate=args.learning_rate,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=args.n_epochs,
    )

    # EarlyStopping: both supported tasks track a metric that should grow,
    # so the mode is always "max".
    earlystop = EarlyStopping(
        save_dir=checkpoint_path,
        metric="f1" if is_question_answering else "acc",
        mode="max",
        patience=5,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        lr_schedule=lr_schedule,
        data_silo=data_silo,
        early_stopping=earlystop,
        evaluate_every=args.evaluate_every,
        checkpoints_to_keep=args.checkpoints_to_keep,
        checkpoint_root_dir=checkpoint_path,
        checkpoint_every=args.checkpoint_every,
        epochs=args.n_epochs,
        n_gpu=args.n_gpu,
        device=device,
    )

    # Now train!
    trainer.train()
def doc_classification_multilabel():
    """Fine-tune bert-base-uncased for multi-label toxic comment classification.

    Trains for one epoch on the data in ``../data/toxic-comments``, stores
    model and processor, re-loads them through an ``Inferencer`` and prints
    the predictions for two sample comments.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_doc_classification",
    )

    # ------------------------------ Settings ------------------------------
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 500
    lang_model = "bert-base-uncased"
    do_lower_case = True

    # Step 1: tokenizer matching the language model.
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case,
    )

    # Step 2: processor converting the raw toxic-comments text into a PyTorch
    # dataset; every comment may carry several of the labels below.
    label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    metric = "acc"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/toxic-comments"),
        label_list=label_list,
        label_column_name="label",
        metric=metric,
        quote_char='"',
        multilabel=True,
        train_filename="train.tsv",
        dev_filename="val.tsv",
        test_filename=None,
        dev_split=0,
    )

    # Step 3: data silo with the datasets and their DataLoaders.
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Step 4: adaptive model = pretrained language model + multi-label head.
    language_model = LanguageModel.load(lang_model)
    prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list))
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # Step 5: optimizer and learning-rate schedule.
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
    )

    # Step 6: the trainer runs the training loop and evaluates periodically.
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # Step 7: train.
    trainer.train()

    # Step 8: persist model and processor side by side.
    save_dir = Path("../saved_models/bert-german-multi-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # Step 9: reload everything for inference and print the predictions.
    sample_comments = ("You f*****g bastards", "What a lovely world")
    basic_texts = [{"text": comment} for comment in sample_comments]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
def doc_classification_cola():
    """Fine-tune a Chinese BERT on a seven-class message classification set.

    Trains for three epochs on CPU with accuracy as metric, holding out 10%
    of the training data as dev set, then stores model and processor.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    tracker = MLFlowLogger(tracking_uri="")
    tracker.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")

    # ------------------------------ Settings ------------------------------
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 8
    evaluate_every = 450
    # Path to the Chinese BERT model. Download URL:
    # https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz
    lang_model = "/bert-base-chinese"
    do_lower_case = False

    # Step 1: tokenizer matching the language model.
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case,
    )

    # Step 2: processor converting the raw message text into a PyTorch dataset.
    label_list = ["城乡建设", "卫生计生", "商贸旅游", "劳动和社会保障", "教育文体", "交通运输", "环境保护"]
    metric = "acc"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=507,
        # Folder holding the text classification data. Format: first column is
        # the character-separated text, second column the label, separated by
        # a tab. The first row must contain "text" and "label".
        data_dir=Path("/BERT留言分类数据集"),
        # No separate dev file (formerly Path("dev.tsv")); a dev set is split
        # off the training data instead.
        dev_filename=None,
        dev_split=0.1,
        test_filename="/BERT留言分类数据集/test.tsv",
        label_list=label_list,
        metric=metric,
        label_column_name="label",
    )

    # Step 3: data silo with the datasets and their DataLoaders.
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Step 4: adaptive model = pretrained language model + classification head.
    # (Roberta.load(lang_model) would be the alternative for a RoBERTa model.)
    language_model = LanguageModel.load(lang_model)
    class_weights = data_silo.calculate_class_weights(task_name="text_classification")
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=class_weights,
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # Step 5: optimizer and learning-rate schedule.
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
    )

    # Step 6: the trainer runs the training loop and evaluates periodically.
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # Step 7: train.
    trainer.train()

    # Step 8: persist model and processor side by side.
    save_dir = Path("/BERT文本分类输出的模型")
    model.save(save_dir)
    processor.save(save_dir)
def test_doc_classification():
    """Smoke-test document classification with roberta-base.

    Trains for one epoch on the sample data, saves model and processor,
    re-loads them through ``Inferencer.load`` and checks that the predicted
    probability is a ``np.float32``.
    """
    # Log capturing is disabled here: caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "roberta-base"

    tokenizer = RobertaTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model,
    )

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir="samples/doc_class",
        train_filename="train-sample.tsv",
        label_list=["OTHER", "OFFENSE"],
        metric="f1_macro",
        dev_filename="test-sample.tsv",
        test_filename=None,
        dev_split=0.0,
        label_column_name="coarse_label",
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Roberta.load(lang_model)

    # Output layer size follows the number of labels registered on the processor.
    n_labels = len(processor.tasks["text_classification"]["label_list"])
    prediction_head = TextClassificationHead(layer_dims=[768, n_labels])

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # Explicit optimizer options such as {'name': 'AdamW', 'lr': 2E-05} could
    # be passed via optimizer_opts; the defaults are used here.
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        n_batches=n_train_batches,
        n_epochs=1,
        device=device,
        schedule_opts=None,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    model = trainer.train(model)

    save_dir = "testsave/doc_class_roberta"
    model.save(save_dir)
    processor.save(save_dir)

    sample_sentences = (
        "Martin Müller spielt Handball in Berlin.",
        "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei.",
    )
    basic_texts = [{"text": sentence} for sentence in sample_sentences]

    inf = Inferencer.load(save_dir, batch_size=2)
    result = inf.inference_from_dicts(dicts=basic_texts)
    first_probability = result[0]["predictions"][0]["probability"]
    assert isinstance(first_probability, np.float32)
def doc_classifcation():
    """GermEval 2018 tutorial using a custom optimizer and LR schedule.

    Same pipeline as the plain document classification example, except that
    ``initialize_optimizer`` receives explicit ``optimizer_opts`` (SGD) and
    ``schedule_opts`` (linear warmup). Trains for one epoch, stores model and
    processor, then prints the predictions for two sample sentences.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_doc_classification",
    )

    # ------------------------------ Settings ------------------------------
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    # A local checkpoint works too, e.g. Path("../saved_models/farm-bert-base-cased").
    lang_model = "bert-base-german-cased"
    use_amp = None

    # ------------------- Custom optimizer & LR schedule -------------------
    # Learning-rate schedules from transformers. Alternatives:
    #   {"name": "Constant"}
    #   {"name": "CosineWarmup", "warmup_proportion": 0.4}
    #   {"name": "CosineWarmupWithRestarts", "warmup_proportion": 0.4}
    # or from native pytorch (all options:
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html):
    #   {"name": "StepLR", "step_size": 30, "gamma": 0.1}
    #   {"name": "ReduceLROnPlateau", "mode": 'min', "factor": 0.1, "patience":10}
    schedule_opts = {"name": "LinearWarmup", "warmup_proportion": 0.4}

    # Optimizers from pytorch (all options: https://pytorch.org/docs/stable/optim.html).
    # Alternatives from apex
    # (https://github.com/NVIDIA/apex/tree/master/apex/optimizers):
    #   {"name": "FusedLAMB", "bias_correction": True}
    # or from transformers (the FARM default):
    #   {"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
    optimizer_opts = {"name": "SGD", "momentum": 0.0}

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # Step 1: tokenizer matching the language model.
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
    )

    # Step 2: processor that turns raw GermEval 2018 text into a PyTorch dataset.
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label",
    )

    # Step 3: data silo holding the train/dev/test sets and their DataLoaders.
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Step 4: adaptive model = pretrained language model + classification head.
    language_model = LanguageModel.load(lang_model)
    class_weights = data_silo.calculate_class_weights(task_name="text_classification")
    prediction_head = TextClassificationHead(
        class_weights=class_weights,
        num_labels=len(label_list),
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # Step 5: optimizer and schedule built from the custom options above.
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-3,
        optimizer_opts=optimizer_opts,
        schedule_opts=schedule_opts,
        device=device,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
        use_amp=use_amp,
    )

    # Step 6: the trainer runs the training loop and evaluates periodically.
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # Step 7: train.
    trainer.train()

    # Step 8: persist model and processor side by side.
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # Step 9: reload everything for inference and print the predictions.
    sample_sentences = (
        "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei",
        "Martin Müller spielt Handball in Berlin",
    )
    basic_texts = [{"text": sentence} for sentence in sample_sentences]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
def test_doc_classification(data_dir_path, text_column_name, caplog=None):
    """Smoke-test document classification with a configurable text column.

    Trains German BERT for one epoch, saves and re-loads model and processor,
    and checks the probability type, that repeated inference returns equal
    results, and that the trainer stored a test result.

    Args:
        data_dir_path: Directory containing "train-sample.tsv" and
            "test-sample.tsv".
        text_column_name: Name of the text column; when ``None`` the argument
            is not passed to the processor at all.
        caplog: Optional pytest log-capture fixture. When given, everything
            below CRITICAL is silenced.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-german-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
    )

    tcp_params = {
        "tokenizer": tokenizer,
        "max_seq_len": 8,
        "data_dir": Path(data_dir_path),
        "train_filename": "train-sample.tsv",
        "label_list": ["OTHER", "OFFENSE"],
        "metric": "f1_macro",
        "dev_filename": "test-sample.tsv",
        "test_filename": "test-sample.tsv",
        "dev_split": 0.0,
        "label_column_name": "coarse_label",
    }
    # Only override the text column when the caller asked for one.
    if text_column_name is not None:
        tcp_params["text_column_name"] = text_column_name

    processor = TextClassificationProcessor(**tcp_params)

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(num_labels=2)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # Explicit optimizer options such as {'name': 'AdamW', 'lr': 2E-05} could
    # be passed via optimizer_opts; the defaults are used here.
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        n_batches=n_train_batches,
        n_epochs=1,
        device=device,
        schedule_opts=None,
    )

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    trainer.train()

    save_dir = Path("testsave/doc_class")
    model.save(save_dir)
    processor.save(save_dir)

    sample_sentences = (
        "Martin Müller spielt Handball in Berlin.",
        "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei.",
    )
    basic_texts = [{"text": sentence} for sentence in sample_sentences]

    inf = Inferencer.load(save_dir, batch_size=2, num_processes=0)
    result = inf.inference_from_dicts(dicts=basic_texts)
    first_probability = result[0]["predictions"][0]["probability"]
    assert isinstance(first_probability, np.float32)

    result2 = inf.inference_from_dicts(dicts=basic_texts, return_json=True)
    assert result == result2

    # Is the test result stored?
    assert trainer.test_result is not None
def doc_classification_crossvalidation():
    """Cross-validate a multi-label document classifier and keep the best fold.

    One model is trained per fold, evaluated on that fold's test split, and its
    classification report is appended to ``BESTMODEL/classification_report.txt``.
    The per-fold results are dumped to JSON, the macro F1 over the predictions
    of all folds is logged, and the model of the fold with the highest
    ``f1_macro`` is reloaded, evaluated on the original test set and saved
    (together with the processor) to ``BESTMODEL``.

    All settings are read from module-level constants (RUNNAME, FOLDS,
    RUNLOCAL, NEPOCHS, BATCHSIZE, EVALEVERY, MODELTYPE, LABELS, MAXLEN,
    DATADIR, TRAIN, TEST, BESTMODEL).

    The code is partially taken from:
    https://github.com/deepset-ai/FARM/blob/master/examples/doc_classification_multilabel.py and
    https://github.com/deepset-ai/FARM/blob/master/examples/doc_classification_crossvalidation.py

    :return: the model restored from the best fold.
    :raises RuntimeError: if no fold produced an ``f1_macro`` better than the
        initial sentinel, so there is no best fold to restore.
    """
    # for local logging:
    ml_logger = MLFlowLogger(tracking_uri="")
    ml_logger.init_experiment(experiment_name="covid-document-classification", run_name=RUNNAME)

    # model settings
    xval_folds = FOLDS
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    if RUNLOCAL:
        device = "cpu"
    n_epochs = NEPOCHS
    batch_size = BATCHSIZE
    evaluate_every = EVALEVERY
    lang_model = MODELTYPE
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    metric = "f1_macro"

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # The processor wants to know the possible labels ...
    label_list = LABELS
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=MAXLEN,
        data_dir=DATADIR,
        train_filename=TRAIN,
        test_filename=TEST,
        dev_split=0.1,
        label_list=label_list,
        metric=metric,
        label_column_name="Categories",
        # confusing parameter name: it should be called multiCLASS
        # not multiLABEL
        multilabel=True,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders
    # for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # All classification reports (per fold and final) accumulate in this one file.
    report_path = BESTMODEL + "/classification_report.txt"

    def append_report(title, report):
        # Append one titled report; the `with` block closes the file.
        os.makedirs(os.path.dirname(report_path), exist_ok=True)
        with open(report_path, "a+") as report_file:
            report_file.write(title)
            report_file.write(report)
            report_file.write("\n\n")

    # the following steps should be run for each of the folds of the cross validation,
    # so we put them into a function
    def train_on_split(silo_to_use, n_fold, save_dir, dev):
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = MultiLabelTextClassificationHead(
            # there is still an error with class weights ...
            # class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.2,
            lm_output_types=["per_sequence"],
            device=dev)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=dev,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs)

        # Every fold is saved to its own directory: "<save_dir>-<n_fold>"
        fold_dir = Path(str(save_dir) + f"-{n_fold}")

        # NOTE: early stopping is deliberately not wired in -- it was found not
        # to work in this setup. The intended configuration was
        # metric="f1_macro", mode="max", patience=5, saving to the fold directory.
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=silo_to_use,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=dev,
            evaluator_test=False,
        )
        # train it
        trainer.train()
        trainer.model.save(fold_dir)
        return trainer.model

    # for each fold, run the whole training, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_macro = -1
    save_dir = Path("saved_models/covid-classification-v1")
    model = None

    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir, device)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=device,
        )
        result = evaluator_test.eval(model, return_preds_and_labels=True)

        append_report(
            "Evaluation on withheld split for numfold no. {} \n".format(num_fold),
            result[0]["report"],
        )

        evaluator_test.log_results(result, "Test", steps=len(silo.get_data_loader("test")), num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_macro = result[0]["f1_macro"]
        if f1_macro > bestf1_macro:
            bestf1_macro = f1_macro
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    results_path = "../data/predictions/covid-classification-xval.results.json"
    # Create the target directory first so a missing folder does not discard the run.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    with open(results_path, "wt") as fp:
        json.dump(allresults, fp, cls=NumpyArrayEncoder)

    # calculate overall f1 score across all folds
    xval_f1_macro = f1_score(all_labels, all_preds, average="macro")
    ml_logger.log_metrics({"f1 macro across all folds": xval_f1_macro}, step=None)

    # Without a best fold there is no saved model directory to restore from.
    if model is None or bestfold is None:
        raise RuntimeError("No cross-validation fold produced a usable f1_macro score")

    # test performance
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )

    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/covid-classification-v1-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    ml_logger.log_metrics({"f1 macro on final test set": result[0]["f1_macro"]}, step=None)

    append_report("Final result of the best model \n", result[0]["report"])

    ml_logger.log_artifacts(BESTMODEL + "/")

    # save model for later use
    processor.save(BESTMODEL)
    model.save(BESTMODEL)
    return model
def doc_classification_with_earlystopping():
    """Train a German BERT document classifier on GermEval 2018 with early stopping.

    The function registers a custom metrics function, trains with an
    ``EarlyStopping`` instance that tracks ``f1_offense`` on the dev set, saves
    the final model and processor, and then prints inference results for two
    sample texts from both the final model and the best (early-stopping) model.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    def mymetrics(preds, labels):
        """Return accuracy, per-class F1 and macro/micro F1 for the predictions."""
        acc = simple_accuracy(preds, labels)
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        # micro-averaged F1 must use average="micro"; "macro" here would
        # merely duplicate f1_macro under the wrong key
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders
    # for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.2,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time
    # Also create an EarlyStopping instance and pass it on to the trainer

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
        # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
        # metric="loss", mode="min",  # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
        early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]

    # Load from the final epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()

    # Load from saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
    model.close_multiprocessing_pool()
def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, evaluate_every,
                       use_cuda, max_seq_len, learning_rate, do_lower_case,
                       register_model, save_model=True, early_stopping=False):
    """Train a text classification model for ``task`` and optionally store/upload it.

    :param task: task identifier; looked up in ``cu.tasks`` and used to build
        the task's ``dt.Data`` object.
    :param model_type: passed to ``he.get_farm_model`` together with the
        configured language to resolve the language model.
    :param n_epochs: number of training epochs.
    :param batch_size: batch size used by the DataSilo.
    :param embeds_dropout: dropout probability passed to the AdaptiveModel.
    :param evaluate_every: evaluation interval passed to the Trainer.
    :param use_cuda: forwarded to ``initialize_device_settings``.
    :param max_seq_len: maximum sequence length for the processor.
    :param learning_rate: learning rate for the optimizer.
    :param do_lower_case: forwarded to the tokenizer.
    :param register_model: if truthy, upload the model directory afterwards.
    :param save_model: if truthy, save model and processor to the model directory.
    :param early_stopping: if truthy, train with early stopping on ``f1_macro``.
    :raises Exception: if the task is not configured as a classification task.
    """
    # Function-scope import keeps this block self-contained.
    import logging
    log = logging.getLogger(__name__)

    language = cu.params.get('language')

    # Check task
    if cu.tasks.get(str(task)).get('type') != 'classification':
        raise Exception('NOT A CLASSIFICATION TASK')

    # Data
    dt_task = dt.Data(task=task)
    ## Download training files
    if not os.path.isfile(dt_task.get_path('fn_train', dir='data_dir')):
        dt_task.download('data_dir', dir='data_dir', source='datastore')

    # Settings
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)
    lang_model = he.get_farm_model(model_type, language)
    save_dir = dt_task.get_path('model_dir')
    label_list = dt_task.load('fn_label', dir='data_dir', header=None)[0].to_list()

    # AML log
    # Best effort: any failure here (including an undefined `aml_run`) must not
    # abort training. `Exception` leaves KeyboardInterrupt/SystemExit untouched.
    try:
        aml_run.log('task', task)
        aml_run.log('language', language)
        aml_run.log('n_epochs', n_epochs)
        aml_run.log('batch_size', batch_size)
        aml_run.log('learning_rate', learning_rate)
        aml_run.log('embeds_dropout', embeds_dropout)
        aml_run.log('max_seq_len', max_seq_len)
        aml_run.log('lang_model', lang_model)
        aml_run.log_list('label_list', label_list)
    except Exception as err:
        log.debug("Skipping AML parameter logging: %s", err)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    def mymetrics(preds, labels):
        """Return accuracy and macro/micro F1; also report them to AML if possible."""
        acc = simple_accuracy(preds, labels)
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        # AML log (best effort, see above)
        try:
            aml_run.log('acc', acc.get('acc'))
            aml_run.log('f1macro', f1macro)
            aml_run.log('f1micro', f1micro)
        except Exception as err:
            log.debug("Skipping AML metric logging: %s", err)
        return {"acc": acc, "f1_macro": f1macro, "f1_micro": f1micro}

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=dt_task.data_dir,
        label_list=label_list,
        metric=metric,
        label_column_name="label",
        train_filename=dt_task.get_path('fn_train', dir='data_dir'),
        test_filename=dt_task.get_path('fn_test', dir='data_dir'))

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders
    # for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    ## Pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)

    ## Prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(processor.tasks["text_classification"]["label_list"]),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=embeds_dropout,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        learning_rate=learning_rate,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time
    # Also create an EarlyStopping instance and pass it on to the trainer

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    if early_stopping:
        earlystopping = EarlyStopping(
            metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
            # metric="loss", mode="min",  # use loss from the dev evaluator of the trainer
            save_dir=save_dir,  # where to save the best model
            patience=2  # number of evaluations to wait for improvement before terminating the training
        )
    else:
        earlystopping = None

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
        early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Store it:
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    if save_model:
        model.save(save_dir)
        processor.save(save_dir)

    if register_model:
        dt_task.upload('model_dir', destination='model')
def doc_classification(
        task_config,
        model_name_or_path,
        cache_dir,
        data_dir,
        save_dir,
        model_dir,
        run_name="0",
        lr=1e-05,
        warmup_steps=5000,
        balance_classes=True,
        embeds_dropout=0.1,
        epochs=200,  # large because we use early stopping by default
        batch_size=20,
        grad_acc_steps=1,
        early_stopping_metric="roc_auc",
        early_stopping_mode="max",
        early_stopping_patience=10,
        model_class="Bert",
        tokenizer_class="BertTokenizer",
        do_lower_case=False,
        do_train=True,
        do_eval=True,
        do_hpo=False,
        print_preds=False,
        print_dev_preds=False,
        max_seq_len=512,
        seed=11,
        eval_every=500,
        use_amp=False,
        use_cuda=True,
):
    """Train and/or evaluate a document classifier described by a YAML task config.

    :param task_config: path to a YAML file. Keys read here: ``data``
        (``label_list``, ``train_filename``, ``dev_filename``, optional
        ``dev_split``, ``test_filename``, ``parsing.delimiter``,
        ``parsing.quote_char``, ``parsing.label_column``), ``metric``,
        ``multilabel``, ``log_dir``, ``experiment_name``, ``task_type`` and
        ``output_type``.
    :param model_name_or_path: model to load the tokenizer and language model
        from; also the directory evaluated when ``do_train`` is false.
    :param cache_dir: cache directory for the DataSilo.
    :param data_dir: dataset directory; a label file, if used, is read from
        ``data_dir/labels/``.
    :param save_dir: accepted for interface compatibility; not used here.
    :param model_dir: directory for the early-stopping model, the final model
        (``final_model`` sub-directory) and the evaluation outputs.
    :param run_name: suffix for the MLflow run name.
    :param lr: learning rate.
    :param warmup_steps: number of warm-up steps of the LinearWarmup schedule.
    :param balance_classes: if true, class weights are computed from the data.
    :param embeds_dropout: dropout probability passed to the model.
    :param epochs: number of training epochs.
    :param batch_size: batch size used by the DataSilo.
    :param grad_acc_steps: gradient accumulation steps.
    :param early_stopping_metric: metric tracked by early stopping.
    :param early_stopping_mode: early stopping mode; ``"none"`` disables it.
    :param early_stopping_patience: patience for early stopping.
    :param model_class: language model class name.
    :param tokenizer_class: tokenizer class name.
    :param do_lower_case: forwarded to the tokenizer.
    :param do_train: run the training part.
    :param do_eval: run the evaluation part on the test set.
    :param do_hpo: if true, scores are reported through ``tune.report``.
    :param print_preds: save the test-set predictions.
    :param print_dev_preds: additionally evaluate on the dev set and save
        those predictions.
    :param max_seq_len: maximum sequence length for the processor.
    :param seed: random seed.
    :param eval_every: evaluation interval passed to the trainer.
    :param use_amp: forwarded to device and optimizer initialisation.
    :param use_cuda: forwarded to ``initialize_device_settings``.
    """
    # Load task config; the context manager closes the file handle.
    with open(task_config) as config_file:
        task_config = yaml.safe_load(config_file)

    data_config = task_config["data"]
    parsing_config = data_config["parsing"]

    # Create label list from args list or (for large label lists) create from file by splitting by space
    if isinstance(data_config["label_list"], list):
        label_list = data_config["label_list"]
    else:
        # Path() lets data_dir be given as a plain string as well
        with open(Path(data_dir) / 'labels' / data_config["label_list"]) as code_file:
            label_list = code_file.read().split(" ")

    # Register Outcome Metrics
    register_task_metrics(label_list)

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=model_name_or_path,
        tokenizer_class=tokenizer_class,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=data_dir,
        label_list=label_list,
        metric=task_config["metric"],
        multilabel=task_config["multilabel"],
        train_filename=data_config["train_filename"],
        dev_filename=data_config["dev_filename"],
        dev_split=data_config.get("dev_split"),  # None when not configured
        test_filename=data_config["test_filename"],
        delimiter=parsing_config["delimiter"],
        quote_char=parsing_config["quote_char"],
        label_column_name=parsing_config["label_column"])

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        caching=True,
        cache_path=Path(cache_dir),
        batch_size=batch_size)

    if do_train:
        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(
            experiment_name=task_config["experiment_name"],
            run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(model_name_or_path,
                                            language_model_class=model_class)

        # b) and a prediction head on top that is suited for our task
        # Define class weights
        if balance_classes:
            class_weights = data_silo.calculate_class_weights(
                task_name=task_config["task_type"])
        else:
            class_weights = None

        # Create Multi- or Single-Label Classification Heads
        if task_config["multilabel"]:
            prediction_head = MultiLabelTextClassificationHead(
                class_weights=class_weights,
                num_labels=len(label_list))
        else:
            prediction_head = ExtendedTextClassificationHead(
                class_weights=class_weights,
                num_labels=len(label_list))

        model = ExtendedAdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=[task_config["output_type"]],
            device=device)

        # 5. Create an optimizer
        schedule_opts = {
            "name": "LinearWarmup",
            "num_warmup_steps": warmup_steps
        }

        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(
                mode=early_stopping_mode,
                min_delta=0.0001,
                save_dir=model_dir,
                metric=early_stopping_metric,
                patience=early_stopping_patience)

        # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and
        #    evaluates it from time to time
        trainer = ExtendedTrainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=eval_every,
            early_stopping=early_stopping,
            device=device,
            grad_acc_steps=grad_acc_steps,
            evaluator_test=do_eval)

        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        # f-string formatting works for both str and Path values of model_dir
        final_model_dir = f"{model_dir}/final_model"
        model.save(final_model_dir)
        processor.save(final_model_dir)

    if do_eval:
        # Evaluate the newly trained model, or an existing model when no training was done
        if not do_train:
            model_dir = Path(model_name_or_path)

        logger.info("###### Eval on TEST SET #####")

        evaluator_test = ExtendedEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)

        # Load trained model for evaluation
        model = ExtendedAdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        # model_dir may be a Path here, so `model_dir + "/..."` would raise TypeError
        utils.log_results(
            results,
            dataset_name="test",
            steps=len(evaluator_test.data_loader),
            save_path=f"{model_dir}/eval_results.txt")

        if print_preds:
            # Print model test predictions
            utils.save_predictions(results, save_dir=model_dir, multilabel=task_config["multilabel"])

        if print_dev_preds:
            # Evaluate on dev set, e.g. for threshold tuning
            evaluator_dev = Evaluator(
                data_loader=data_silo.get_data_loader("dev"),
                tasks=data_silo.processor.tasks,
                device=device)
            dev_results = evaluator_dev.eval(model, return_preds_and_labels=True)

            utils.log_results(
                dev_results,
                dataset_name="dev",
                steps=len(evaluator_dev.data_loader),
                save_path=f"{model_dir}/eval_dev_results.txt")

            # Print model dev predictions
            utils.save_predictions(
                dev_results,
                save_dir=model_dir,
                multilabel=task_config["multilabel"],
                dataset_name="dev")
def test_doc_classification(caplog):
    """Train a small German BERT classifier on the sample data and check inference.

    The model is trained for one epoch on CPU with ``samples/doc_class``, saved
    to ``testsave/doc_class`` together with its processor, and reloaded through
    an ``Inferencer``. The first prediction for the first sample text must be
    "OTHER" with a probability within 0.1 of 0.7.

    :param caplog: log-capture fixture; its level is set to CRITICAL.
    """
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    epoch_count = 1
    pretrained_name = "bert-base-german-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=pretrained_name,
        do_lower_case=False,
    )

    # No dev/test files are given; dev_split is set to 0.1.
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir="samples/doc_class",
        train_filename="train-sample.tsv",
        dev_filename=None,
        test_filename=None,
        dev_split=0.1,
        columns=["text", "label", "unused"],
        label_list=["OTHER", "OFFENSE"],
        metrics=["f1_macro"],
    )

    data_silo = DataSilo(processor=processor, batch_size=8)

    # Language model first, then the head, then the combined model.
    classifier = AdaptiveModel(
        language_model=Bert.load(pretrained_name),
        prediction_heads=[
            TextClassificationHead(layer_dims=[768, len(processor.label_list)])
        ],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=classifier,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=epoch_count,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=epoch_count,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=30,
        device=device,
    )
    classifier = trainer.train(classifier)

    save_dir = "testsave/doc_class"
    classifier.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Martin Müller spielt Handball in Berlin."},
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."},
    ]

    inferencer = Inferencer.load(save_dir)
    result = inferencer.run_inference(dicts=basic_texts)

    # Check the first prediction of the first sample text.
    assert result[0]["predictions"][0]["label"] == "OTHER"
    assert abs(result[0]["predictions"][0]["probability"] - 0.7) <= 0.1