def test_processor_saving_loading(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    lang_model = "bert-base-cased"
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)
    processor = TextClassificationProcessor(
        tokenizer=tokenizer, max_seq_len=128, data_dir="samples/doc_class",
        train_filename="train-sample.tsv", dev_filename=None, test_filename=None,
        dev_split=0.1, columns=["text", "label", "unused"],
        label_list=["OTHER", "OFFENSE"], metrics=["f1_macro"])
    dicts = processor.file_to_dicts(file="samples/doc_class/train-sample.tsv")
    data, tensor_names = processor.dataset_from_dicts(dicts)

    save_dir = "testsave/processor"
    processor.save(save_dir)
    processor = processor.load_from_dir(save_dir)
    dicts = processor.file_to_dicts(file="samples/doc_class/train-sample.tsv")
    data_loaded, tensor_names_loaded = processor.dataset_from_dicts(dicts)

    assert tensor_names == tensor_names_loaded
    for i in range(len(data.tensors)):
        assert torch.all(torch.eq(data.tensors[i], data_loaded.tensors[i]))
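# A minimal, self-contained illustration (plain PyTorch only, toy tensors) of the
# equality check used in the test above: torch.all(torch.eq(a, b)) is True only if
# every element matches, which is why it is suitable for asserting that a
# saved-and-reloaded processor produces bit-identical datasets.
import torch

a = torch.tensor([[1, 2, 3], [4, 5, 6]])
b = a.clone()
assert torch.all(torch.eq(a, b))      # identical tensors pass
b[0, 0] = 99
assert not torch.all(torch.eq(a, b))  # a single differing element fails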
def evaluate_classification():
    ##########################
    ########## Settings
    ##########################
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "deepset/bert-base-german-cased-sentiment-Germeval17"
    do_lower_case = False
    batch_size = 100
    data_dir = Path("../data/germeval17")
    evaluation_filename = "test_TIMESTAMP1.tsv"
    label_list = ["negative", "neutral", "positive"]
    metric = "f1_macro"

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
    # Here we load the GermEval 2017 data automatically if it is not available.
    processor = TextClassificationProcessor(
        tokenizer=tokenizer, max_seq_len=384, label_list=label_list, metric=metric,
        train_filename=None, dev_filename=None, dev_split=0,
        test_filename=evaluation_filename, data_dir=data_dir)

    # 3. Create a DataSilo that loads the dataset, provides DataLoaders for it
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an Evaluator
    evaluator = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)

    # 5. Load the model
    model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="text_classification")
    # use "load" if you want to use a local model that was trained with FARM
    # model = AdaptiveModel.load(lang_model, device=device)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    # 6. Run the Evaluator
    results = evaluator.eval(model)
    f1_score = results[0]["f1_macro"]
    print("Macro-averaged F1-Score:", f1_score)
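# For reference, a small sketch of what the "f1_macro" metric used above reports:
# the unweighted mean of per-class F1 scores, so minority classes count as much
# as the majority class. scikit-learn only; the labels below are made up.
from sklearn.metrics import f1_score

y_true = ["negative", "neutral", "positive", "positive", "neutral"]
y_pred = ["negative", "positive", "positive", "positive", "neutral"]
print(f1_score(y_true, y_pred, average="macro"))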
def test_prediction_head_load_save_class_weights(tmp_path, caplog=None):
    """This is a regression test for #428 and #422."""
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 1
    lang_model = "bert-base-german-cased"
    data_dir_path = "samples/doc_class"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)
    tcp_params = dict(
        tokenizer=tokenizer, max_seq_len=8, data_dir=Path(data_dir_path),
        train_filename="train-sample.tsv", label_list=["OTHER", "OFFENSE"],
        metric="f1_macro", dev_filename="test-sample.tsv", test_filename=None,
        dev_split=0.0, label_column_name="coarse_label")
    processor = TextClassificationProcessor(**tcp_params)
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(
        num_labels=2,
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))
    model = AdaptiveModel(
        language_model=language_model, prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1, lm_output_types=["per_sequence"], device=device)

    model.save(tmp_path)
    model_loaded = AdaptiveModel.load(tmp_path, device='cpu')
    assert model_loaded is not None
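# calculate_class_weights is not reimplemented here; this is only a hedged sketch
# of the usual idea behind such weights (normalized inverse class frequency), so
# the regression test above is easier to follow. Pure Python, toy labels.
from collections import Counter

labels = ["OTHER", "OTHER", "OTHER", "OFFENSE"]
counts = Counter(labels)
weights = {label: len(labels) / (len(counts) * c) for label, c in counts.items()}
print(weights)  # rare classes get weights > 1, frequent classes < 1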
def test_data_silo_for_holdout1():
    n_splits = 5
    lang_model = "distilbert-base-german-cased"
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)
    processor = TextClassificationProcessor(
        tokenizer=tokenizer, max_seq_len=64, data_dir=Path("data/germeval18"),
        label_list=["OTHER", "OFFENSE"], metric="f1_macro",
        dev_split=0.2, dev_stratification=False, label_column_name="coarse_label")
    data_silo = DataSilo(processor=processor, batch_size=32, max_processes=128)
    silos = DataSiloForHoldout.make(data_silo, sets=['train'], n_splits=n_splits, train_split=0.8)
    assert len(silos) == n_splits

    for idx, silo in enumerate(silos):
        train_ds = silo.get_data_loader("train").dataset
        dev_ds = silo.get_data_loader("dev").dataset
        test_ds = silo.get_data_loader("test").dataset

        train_idx_set = set(train_ds.datasets[0].indices)
        dev_idx_set = set(dev_ds.indices)
        orig_train = train_ds.datasets[0].dataset
        orig_train_idx_set = set(orig_train.indices)
        test_idx_set = set(test_ds.indices)

        assert len(orig_train_idx_set.intersection(test_idx_set)) == 0
        assert len(train_idx_set.intersection(dev_idx_set)) == 0

    del tokenizer
    del processor
    del data_silo
    del silos
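# The disjointness assertions above can be read as checks on repeated random
# holdout splits. An analogy with plain scikit-learn (not DataSiloForHoldout's
# actual implementation): index sets from one ShuffleSplit split never overlap.
from sklearn.model_selection import ShuffleSplit

splitter = ShuffleSplit(n_splits=5, train_size=0.8, random_state=42)
for train_idx, test_idx in splitter.split(range(100)):
    assert set(train_idx).isdisjoint(test_idx)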
def test_doc_classification(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 8
    evaluate_every = 5
    lang_model = "bert-base-german-cased"

    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=lang_model, do_lower_case=False)
    processor = TextClassificationProcessor(
        tokenizer=tokenizer, max_seq_len=128, data_dir="samples/doc_class",
        train_filename="train-sample.tsv", label_list=["OTHER", "OFFENSE"],
        metric="f1_macro", dev_filename="test-sample.tsv", test_filename=None,
        dev_split=0.0, label_column_name="coarse_label")
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
    model = AdaptiveModel(
        language_model=language_model, prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1, lm_output_types=["per_sequence"], device=device)

    optimizer, warmup_linear = initialize_optimizer(
        model=model, learning_rate=2e-5, warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]), n_epochs=1)
    trainer = Trainer(
        optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu,
        warmup_linear=warmup_linear, evaluate_every=evaluate_every, device=device)
    model = trainer.train(model)

    save_dir = "testsave/doc_class"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Martin Müller spielt Handball in Berlin."},
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."},
        {"text": "Franzosen verteidigen 2:1-Führung – Kritische Stimmen zu Schwedens Superstar"},
        {"text": "Neues Video von Designern macht im Netz die Runde"},
        {"text": "23-jähriger Brasilianer muss vier Spiele pausieren – Entscheidung kann noch angefochten werden"},
        {"text": "Aufständische verwendeten Chemikalie bei Gefechten im August."},
        {"text": "Bewährungs- und Geldstrafe für 26-Jährigen wegen ausländerfeindlicher Äußerung"},
        {"text": "ÖFB-Teamspieler nur sechs Minuten nach seinem Tor beim 1:1 gegen Sunderland verletzt ausgewechselt"},
        {"text": "Ein 31-jähriger Polizist soll einer 42-Jährigen den Knöchel gebrochen haben"},
        {"text": "18 Menschen verschleppt. Kabul – Nach einem Hubschrauber-Absturz im Norden Afghanistans haben Sicherheitskräfte am Mittwoch versucht"},
    ]

    # TODO: enable loading here again after we have finished migration towards "processor.tasks"
    # inf = Inferencer.load(save_dir)
    inf = Inferencer(model=model, processor=processor)
    result = inf.run_inference(dicts=basic_texts)
    assert result[0]["predictions"][0]["label"] == "OTHER"
    assert abs(result[0]["predictions"][0]["probability"] - 0.7) <= 0.1

    loaded_processor = TextClassificationProcessor.load_from_dir(save_dir)
    inf2 = Inferencer(model=model, processor=loaded_processor)
    result_2 = inf2.run_inference(dicts=basic_texts)
    pprint(list(zip(result, result_2)))
    for r1, r2 in zip(result, result_2):
        assert r1 == r2


# if __name__ == "__main__":
#     test_doc_classification()
def test_data_silo_for_cross_val_nested():
    lang_model = "distilbert-base-german-cased"
    n_outer_splits = 3
    n_inner_splits = 3

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)
    processor = TextClassificationProcessor(
        tokenizer=tokenizer, max_seq_len=64, data_dir=Path("data/germeval18"),
        label_list=["OTHER", "OFFENSE"], metric="f1_macro",
        label_column_name="coarse_label")
    data_silo = DataSilo(processor=processor, batch_size=32)
    silos = DataSiloForCrossVal.make(
        data_silo, sets=['test', 'train'],
        n_splits=n_outer_splits, n_inner_splits=n_inner_splits)

    # check number of silos
    assert len(silos) == (n_outer_splits * n_inner_splits)

    # because the outer cross validation creates the test set, it must be the same
    # in silo 0 and silo 1
    data_loader_test_indices_0 = silos[0].get_data_loader('test').dataset.indices
    data_loader_test_indices_1 = silos[1].get_data_loader('test').dataset.indices
    assert data_loader_test_indices_0.size > 0
    assert data_loader_test_indices_1.size > 0
    assert data_loader_test_indices_0.ndim == 1
    assert data_loader_test_indices_1.ndim == 1
    assert np.array_equal(data_loader_test_indices_0, data_loader_test_indices_1)

    # because the inner cross validation creates the dev set, it must be different
    # in silo 0 and silo 1
    data_loader_dev_indices_0 = silos[0].get_data_loader('dev').dataset.indices
    data_loader_dev_indices_1 = silos[1].get_data_loader('dev').dataset.indices
    assert data_loader_dev_indices_0.size > 0
    assert data_loader_dev_indices_1.size > 0
    assert data_loader_dev_indices_0.ndim == 1
    assert data_loader_dev_indices_1.ndim == 1
    assert not np.array_equal(data_loader_dev_indices_0, data_loader_dev_indices_1)

    # extract and test the train sets of silo 0 and 1
    data_loader_train_indices_0 = silos[0].get_data_loader('train').dataset.indices
    data_loader_train_indices_1 = silos[1].get_data_loader('train').dataset.indices
    assert data_loader_train_indices_0.size > 0
    assert data_loader_train_indices_1.size > 0
    assert data_loader_train_indices_0.ndim == 1
    assert data_loader_train_indices_1.ndim == 1

    # the combined size of dev + train + test must be the same on all folds
    assert (data_loader_train_indices_0.size +
            data_loader_dev_indices_0.size +
            data_loader_test_indices_0.size) == \
           (data_loader_train_indices_1.size +
            data_loader_dev_indices_1.size +
            data_loader_test_indices_1.size)

    del tokenizer
    del processor
    del data_silo
    del silos
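# A compact sketch of nested cross-validation with plain scikit-learn KFold
# (an analogy, not DataSiloForCrossVal's implementation): the outer loop fixes
# the test fold, the inner loop re-splits the remainder into train/dev, which
# is why the test indices agree across inner folds while dev indices differ.
import numpy as np
from sklearn.model_selection import KFold

data = np.arange(30)
outer = KFold(n_splits=3, shuffle=True, random_state=42)
inner = KFold(n_splits=3, shuffle=True, random_state=42)
for rest_idx, test_idx in outer.split(data):
    for train_rel, dev_rel in inner.split(rest_idx):
        train_idx, dev_idx = rest_idx[train_rel], rest_idx[dev_rel]
        assert set(dev_idx).isdisjoint(test_idx)  # dev never leaks into test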
# 1. Create a tokenizer
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
# Here we load the CoLA data.
label_list = ["0", "1"]
metric = "mcc"
processor = TextClassificationProcessor(
    tokenizer=tokenizer, max_seq_len=64, data_dir="../data/cola",
    dev_filename="dev.tsv", dev_split=None, test_filename=None,
    label_list=label_list, metric=metric, label_column_name="label")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
# and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# language_model = Roberta.load(lang_model)
# b) and a prediction head on top that is suited for our task => text classification
prediction_head = TextClassificationHead(
def doc_classifcation():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    # or a local path:
    # lang_model = Path("../saved_models/farm-bert-base-cased")
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
    # Here we load the GermEval 2018 data.
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer, max_seq_len=128, data_dir=Path("../data/germeval18"),
        label_list=label_list, metric=metric, label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => text classification
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(
        language_model=language_model, prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1, lm_output_types=["per_sequence"], device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model, learning_rate=3e-5, device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs, use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time
    trainer = Trainer(
        model=model, optimizer=optimizer, data_silo=data_silo, epochs=n_epochs,
        n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
##########################
########## Settings
##########################
set_all_seeds(seed=42)
batch_size = 32
use_gpu = True
device, n_gpu = initialize_device_settings(use_cuda=use_gpu)
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=lang_model, do_lower_case=False)

# 2. Create a DataProcessor used only for inference; in this case we use the
# TextClassificationProcessor and data_dir can be empty
processor = TextClassificationProcessor(data_dir="", tokenizer=tokenizer, max_seq_len=128)

# 3. Create an AdaptiveModel with a pretrained language model as a basis
language_model = Bert.load(lang_model)
adaptive_model = AdaptiveModel(
    language_model=language_model, prediction_heads=[], embeds_dropout_prob=0,
    lm_output_types=["per_token", "per_sequence"], device=device)

# 4. Extract embeddings with the model in inference mode
basic_texts = [
def main(args):
    print(f"[INFO] PyTorch Version: {torch.__version__}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("[INFO] Devices available: {}".format(device))

    checkpoint_path = Path(args.ckpt_path) / args.run_name
    ml_logger = MLFlowLogger(tracking_uri=args.tracking_uri)
    ml_logger.init_experiment(experiment_name=args.experiment_name, run_name=args.run_name)

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
        do_lower_case=False)

    # Processor
    if args.task_name == "text_classification":
        processor = TextClassificationProcessor(
            tokenizer=tokenizer, train_filename=args.train_filename,
            dev_filename=None, test_filename=args.test_filename, header=0,
            max_seq_len=args.max_seq_len, data_dir=args.data_dir,
            label_list=args.label_list, metric=args.metric,
            label_column_name=args.label_column_name,
            text_column_name=args.text_column_name)
    elif args.task_name == "question_answering":
        processor = SquadProcessor(
            tokenizer=tokenizer, train_filename=args.train_filename,
            dev_filename=args.test_filename, test_filename=args.test_filename,
            max_seq_len=args.max_seq_len, data_dir=args.data_dir,
            label_list=args.label_list, metric=args.metric,
            max_query_length=64, doc_stride=128, max_answers=1)
    else:
        raise ValueError("task name error")
    processor.save(checkpoint_path)

    # DataSilo
    data_silo = DataSilo(
        processor=processor, batch_size=args.batch_size,
        eval_batch_size=args.eval_batch_size, caching=True,
        cache_path=checkpoint_path)

    # LanguageModel: Build pretrained language model
    language_model = LanguageModel.load(args.pretrained_model_name_or_path, language="korean")

    # PredictionHead: Build predictor layer
    if args.task_name == "text_classification":
        # If you do classification on imbalanced classes, consider using class weights.
        # They change the loss function to down-weight frequent classes.
        prediction_head = TextClassificationHead(
            num_labels=len(args.label_list),
            class_weights=data_silo.calculate_class_weights(task_name=args.task_name))
    elif args.task_name == "question_answering":
        prediction_head = QuestionAnsweringHead(
            layer_dims=[768, 2],
            task_name=args.task_name)
    else:
        raise ValueError("task name error")

    # AdaptiveModel: Combine all
    if args.task_name == "text_classification":
        lm_output_types = ["per_sequence"]
    elif args.task_name == "question_answering":
        lm_output_types = ["per_token"]
    else:
        raise ValueError("task name error")

    model = AdaptiveModel(
        language_model=language_model, prediction_heads=[prediction_head],
        embeds_dropout_prob=args.embeds_dropout_prob,
        lm_output_types=lm_output_types, device=device)

    # Initialize Optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model, device=device, learning_rate=args.learning_rate,
        n_batches=len(data_silo.loaders["train"]), n_epochs=args.n_epochs)

    # EarlyStopping
    earlymetric = "f1" if args.task_name == "question_answering" else "acc"
    mode = "max" if args.task_name in ["text_classification", "question_answering"] else "min"
    earlystop = EarlyStopping(
        save_dir=checkpoint_path, metric=earlymetric, mode=mode, patience=5)

    # Trainer
    trainer = Trainer(
        model=model, optimizer=optimizer, lr_schedule=lr_schedule,
        data_silo=data_silo, early_stopping=earlystop,
        evaluate_every=args.evaluate_every,
        checkpoints_to_keep=args.checkpoints_to_keep,
        checkpoint_root_dir=checkpoint_path,
        checkpoint_every=args.checkpoint_every,
        epochs=args.n_epochs, n_gpu=args.n_gpu, device=device)

    # now train!
    model = trainer.train()
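# main() above expects an `args` namespace; a hedged sketch of the argument wiring
# it implies. The flag names are inferred from the attributes accessed above,
# defaults are illustrative assumptions, and only a subset of the attributes is
# shown, so calling main() with this parser alone would need the remaining flags.
import argparse

def build_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--task_name", choices=["text_classification", "question_answering"],
                        default="text_classification")
    parser.add_argument("--pretrained_model_name_or_path", default="bert-base-multilingual-cased")
    parser.add_argument("--ckpt_path", default="checkpoints")
    parser.add_argument("--run_name", default="run1")
    parser.add_argument("--max_seq_len", type=int, default=128)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--learning_rate", type=float, default=3e-5)
    parser.add_argument("--n_epochs", type=int, default=1)
    parser.add_argument("--n_gpu", type=int, default=1)
    return parser

# if __name__ == "__main__":
#     main(build_parser().parse_args())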
    pretrained_model_name_or_path=lang_model,
    do_lower_case=True)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
# Here we load the Toxic Comments data (multilabel).
label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
metric = "acc"
processor = TextClassificationProcessor(
    tokenizer=tokenizer, max_seq_len=128, data_dir="../data/toxic-comments",
    label_list=label_list, label_column_name="label", metric=metric,
    quote_char='"', multilabel=True,
    train_filename="train.tsv", dev_filename="val.tsv",
    test_filename=None, dev_split=0)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
# and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = Bert.load(lang_model)
# b) and a prediction head on top that is suited for our task => multilabel text classification
prediction_head = MultiLabelTextClassificationHead(layer_dims=[
    768, len(processor.tasks["text_classification"]["label_list"])
def test_doc_classification():
    # caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "roberta-base"

    tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path=lang_model)
    processor = TextClassificationProcessor(
        tokenizer=tokenizer, max_seq_len=8, data_dir="samples/doc_class",
        train_filename="train-sample.tsv", label_list=["OTHER", "OFFENSE"],
        metric="f1_macro", dev_filename="test-sample.tsv", test_filename=None,
        dev_split=0.0, label_column_name="coarse_label")
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Roberta.load(lang_model)
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
    model = AdaptiveModel(
        language_model=language_model, prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1, lm_output_types=["per_sequence"], device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model, learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1, device=device, schedule_opts=None)

    trainer = Trainer(
        optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu,
        lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device)
    model = trainer.train(model)

    save_dir = "testsave/doc_class_roberta"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Martin Müller spielt Handball in Berlin."},
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."},
    ]
    inf = Inferencer.load(save_dir, batch_size=2)
    result = inf.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
lang_model = "xlnet-large-cased" do_lower_case = False tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) metric = "f1_macro" # desired metric for evaluation processor = TextClassificationProcessor( tokenizer=tokenizer, max_seq_len=20, # BERT can only handle sequence lengths of up to 512 label_list=label_list, data_dir='new/', label_column_name="class", # our labels are located in the "genre" column metric=metric, quote_char='"', multilabel=True, train_filename="train.tsv", dev_filename=None, test_filename="test.tsv", dev_split=0.1 # this will extract 10% of the train set to create a dev set ) data_silo = DataSilo(processor=processor, batch_size=batch_size) # loading the pretrained BERT base cased model language_model = LanguageModel.load(lang_model) # prediction head for our model that is suited for classifying news article genres prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list)) model = AdaptiveModel(language_model=language_model,
def load(cls, model_name_or_path, batch_size=4, gpu=False, task_type=None,
         return_class_probs=False, strict=True, max_seq_len=256, doc_stride=128,
         extraction_layer=None, extraction_strategy=None, s3e_stats=None,
         num_processes=None, disable_tqdm=False):
    """
    Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by

    1. specifying a public name from transformers' model hub (https://huggingface.co/models)
    2. or pointing to a local directory it is saved in.

    :param model_name_or_path: Local directory or public name of the model to load.
    :type model_name_or_path: str
    :param batch_size: Number of samples computed once per batch
    :type batch_size: int
    :param gpu: If GPU shall be used
    :type gpu: bool
    :param task_type: Type of task the model should be used for. Currently supporting:
                      "embeddings", "question_answering", "text_classification", "ner". More coming soon...
    :type task_type: str
    :param strict: Whether to strictly enforce that the keys loaded from the saved model match the ones in
                   the PredictionHead (see torch.nn.Module.load_state_dict()).
                   Set to `False` for backwards compatibility with PHs saved with older versions of FARM.
    :type strict: bool
    :param max_seq_len: Maximum length of one text sample
    :type max_seq_len: int
    :param doc_stride: Only QA: When input text is longer than max_seq_len it gets split into parts, strided by doc_stride
    :type doc_stride: int
    :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector),
                                'reduce_mean' (sentence vector), 'reduce_max' (sentence vector),
                                'per_token' (individual token vectors)
    :type extraction_strategy: str
    :param extraction_layer: Number of the layer from which the embeddings shall be extracted.
                             Default: -1 (very last layer).
    :type extraction_layer: int
    :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                      (only needed for task_type="embeddings" and extraction_strategy="s3e")
    :type s3e_stats: dict
    :param num_processes: The number of processes for `multiprocessing.Pool`. Set to 0 to disable
                          multiprocessing. Set to None to let the Inferencer use all CPU cores.
                          If you want to debug the Language Model, you might need to disable multiprocessing!
    :type num_processes: int
    :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
    :type disable_tqdm: bool
    :return: An instance of the Inferencer.
    """
    device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, use_amp=None)
    name = os.path.basename(model_name_or_path)

    # a) either from local dir
    if os.path.exists(model_name_or_path):
        model = BaseAdaptiveModel.load(load_dir=model_name_or_path, device=device, strict=strict)
        if task_type == "embeddings":
            processor = InferenceProcessor.load_from_dir(model_name_or_path)
        else:
            processor = Processor.load_from_dir(model_name_or_path)

        # override processor attributes loaded from the config file with inferencer params
        processor.max_seq_len = max_seq_len
        if hasattr(processor, "doc_stride"):
            processor.doc_stride = doc_stride

    # b) or from remote transformers model hub
    else:
        logger.info(f"Could not find `{model_name_or_path}` locally. Try to download from model hub ...")
        if not task_type:
            raise ValueError("Please specify the 'task_type' of the model you want to load from transformers. "
                             "Valid options for arg `task_type`: "
                             "'question_answering', 'embeddings', 'text_classification', 'ner'")

        model = AdaptiveModel.convert_from_transformers(model_name_or_path, device, task_type)
        config = AutoConfig.from_pretrained(model_name_or_path)
        tokenizer = Tokenizer.load(model_name_or_path)

        # TODO infer task_type automatically from config (if possible)
        if task_type == "question_answering":
            processor = SquadProcessor(
                tokenizer=tokenizer, max_seq_len=max_seq_len,
                label_list=["start_token", "end_token"], metric="squad",
                data_dir="data", doc_stride=doc_stride)
        elif task_type == "embeddings":
            processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=max_seq_len)
        elif task_type == "text_classification":
            label_list = list(config.id2label[id] for id in range(len(config.id2label)))
            processor = TextClassificationProcessor(
                tokenizer=tokenizer, max_seq_len=max_seq_len, data_dir="data",
                label_list=label_list, label_column_name="label",
                metric="acc", quote_char='"')
        elif task_type == "ner":
            label_list = list(config.label2id.keys())
            processor = NERProcessor(
                tokenizer=tokenizer, max_seq_len=max_seq_len, data_dir="data",
                metric="seq_f1", label_list=label_list)
        else:
            raise ValueError(f"`task_type` {task_type} is not supported yet. "
                             f"Valid options for arg `task_type`: 'question_answering', "
                             f"'embeddings', 'text_classification', 'ner'")

    return cls(model, processor, task_type=task_type, batch_size=batch_size, gpu=gpu,
               name=name, return_class_probs=return_class_probs,
               extraction_strategy=extraction_strategy, extraction_layer=extraction_layer,
               s3e_stats=s3e_stats, num_processes=num_processes, disable_tqdm=disable_tqdm)
def doc_classification_cola():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 8
    evaluate_every = 450
    lang_model = "/bert-base-chinese"  # path to the Chinese BERT model
    # model download: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
    # Here we load a Chinese message classification dataset.
    label_list = ["城乡建设", "卫生计生", "商贸旅游", "劳动和社会保障", "教育文体", "交通运输", "环境保护"]
    metric = "acc"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=507,
        # folder holding the text classification data; format: first column contains the
        # character-separated text, second column the label, separated by tabs. The first
        # row must contain the headers "text" and "label".
        data_dir=Path("/BERT留言分类数据集"),
        dev_filename=None,  # Path("dev.tsv"),
        dev_split=0.1,
        test_filename="/BERT留言分类数据集/test.tsv",
        label_list=label_list,
        metric=metric,
        label_column_name="label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # language_model = Roberta.load(lang_model)
    # b) and a prediction head on top that is suited for our task => text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(
        language_model=language_model, prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1, lm_output_types=["per_sequence"], device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model, learning_rate=2e-5, device=device,
        n_batches=len(data_silo.loaders["train"]), n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time
    trainer = Trainer(
        model=model, optimizer=optimizer, data_silo=data_silo, epochs=n_epochs,
        n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("/BERT文本分类输出的模型")  # output directory for the trained model
    model.save(save_dir)
    processor.save(save_dir)
def doc_classifcation():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    # or a local path:
    # lang_model = Path("../saved_models/farm-bert-base-cased")
    use_amp = None

    #############################################
    # CUSTOM OPTIMIZER & LR SCHEDULE
    #############################################
    # learning rate schedules from transformers
    schedule_opts = {"name": "LinearWarmup", "warmup_proportion": 0.4}
    # schedule_opts = {"name": "Constant"}
    # schedule_opts = {"name": "CosineWarmup", "warmup_proportion": 0.4}
    # schedule_opts = {"name": "CosineWarmupWithRestarts", "warmup_proportion": 0.4}

    # or from native pytorch (see https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html for all options)
    # schedule_opts = {"name": "StepLR", "step_size": 30, "gamma": 0.1}
    # schedule_opts = {"name": "ReduceLROnPlateau", "mode": 'min', "factor": 0.1, "patience": 10}

    # optimizers from pytorch (see https://pytorch.org/docs/stable/optim.html for all options)
    optimizer_opts = {"name": "SGD", "momentum": 0.0}

    # or from apex (see https://github.com/NVIDIA/apex/tree/master/apex/optimizers for all options)
    # optimizer_opts = {"name": "FusedLAMB", "bias_correction": True}

    # or from transformers (default in FARM)
    # optimizer_opts = {"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
    #############################################

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
    # Here we load the GermEval 2018 data.
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer, max_seq_len=128, data_dir=Path("../data/germeval18"),
        label_list=label_list, metric=metric, label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => text classification
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(
        language_model=language_model, prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1, lm_output_types=["per_sequence"], device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model, learning_rate=5e-3,
        optimizer_opts=optimizer_opts, schedule_opts=schedule_opts,
        device=device, n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs, use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time
    trainer = Trainer(
        model=model, optimizer=optimizer, data_silo=data_silo, epochs=n_epochs,
        n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
def test_doc_classification(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)
    processor = TextClassificationProcessor(
        tokenizer=tokenizer, max_seq_len=8, data_dir=Path("samples/doc_class"),
        train_filename=Path("train-sample.tsv"), label_list=["OTHER", "OFFENSE"],
        metric="f1_macro", dev_filename="test-sample.tsv", test_filename=None,
        dev_split=0.0, label_column_name="coarse_label")
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = DistilBert.load(lang_model)
    prediction_head = TextClassificationHead()
    model = AdaptiveModel(
        language_model=language_model, prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1, lm_output_types=["per_sequence"], device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model, learning_rate=2e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1, device=device, schedule_opts=None)
    trainer = Trainer(
        optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu,
        lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device)
    model = trainer.train(model)

    save_dir = Path("testsave/doc_class")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Malte liebt Berlin."},
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."},
    ]
    inf = Inferencer.load(save_dir, batch_size=2)
    result = inf.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
def doc_classification_multilabel():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 500
    lang_model = "bert-base-uncased"
    do_lower_case = True

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
    # Here we load the Toxic Comments data (multilabel).
    label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    metric = "acc"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer, max_seq_len=128, data_dir=Path("../data/toxic-comments"),
        label_list=label_list, label_column_name="label", metric=metric,
        quote_char='"', multilabel=True,
        train_filename="train.tsv", dev_filename="val.tsv",
        test_filename=None, dev_split=0)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => multilabel text classification
    prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list))

    model = AdaptiveModel(
        language_model=language_model, prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1, lm_output_types=["per_sequence"], device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model, learning_rate=3e-5, device=device,
        n_batches=len(data_silo.loaders["train"]), n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time
    trainer = Trainer(
        model=model, optimizer=optimizer, data_silo=data_silo, epochs=n_epochs,
        n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/bert-german-multi-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (inference)
    basic_texts = [
        {"text": "You f*****g bastards"},
        {"text": "What a lovely world"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
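# What happens behind a multilabel head, reduced to its core: each class gets an
# independent sigmoid score and a threshold decides membership. A hedged sketch
# with plain PyTorch; the 0.5 threshold is an illustrative assumption, not
# necessarily the head's configured default.
import torch

labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
logits = torch.tensor([[2.0, -1.0, -0.3, -3.0, 0.0, 1.5]])  # one example, six classes
probs = torch.sigmoid(logits)
predicted = [label for label, p in zip(labels, probs[0]) if p > 0.5]
print(predicted)  # ['toxic', 'identity_hate']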
def load(cls, model_name_or_path, batch_size=4, gpu=False, task_type=None,
         return_class_probs=False, strict=True, max_seq_len=256):
    """
    Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by

    1. specifying a public name from transformers' model hub (https://huggingface.co/models)
    2. or pointing to a local directory it is saved in.

    :param model_name_or_path: Local directory or public name of the model to load.
    :type model_name_or_path: str
    :param batch_size: Number of samples computed once per batch
    :type batch_size: int
    :param gpu: If GPU shall be used
    :type gpu: bool
    :param task_type: Type of task the model should be used for. Currently supporting:
                      "embeddings", "question_answering", "text_classification". More coming soon...
    :type task_type: str
    :param strict: Whether to strictly enforce that the keys loaded from the saved model match the ones in
                   the PredictionHead (see torch.nn.Module.load_state_dict()).
                   Set to `False` for backwards compatibility with PHs saved with older versions of FARM.
    :type strict: bool
    :return: An instance of the Inferencer.
    """
    device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, use_amp=None)
    name = os.path.basename(model_name_or_path)

    # a) either from local dir
    if os.path.exists(model_name_or_path):
        model = AdaptiveModel.load(model_name_or_path, device, strict=strict)
        if task_type == "embeddings":
            processor = InferenceProcessor.load_from_dir(model_name_or_path)
        else:
            processor = Processor.load_from_dir(model_name_or_path)

    # b) or from remote transformers model hub
    else:
        logger.info(f"Could not find `{model_name_or_path}` locally. Try to download from model hub ...")
        if not task_type:
            raise ValueError("Please specify the 'task_type' of the model you want to load from transformers. "
                             "Valid options for arg `task_type`: "
                             "'question_answering', 'embeddings', 'text_classification'")

        model = AdaptiveModel.convert_from_transformers(model_name_or_path, device, task_type)
        config = AutoConfig.from_pretrained(model_name_or_path)
        tokenizer = Tokenizer.load(model_name_or_path)

        # TODO infer task_type automatically from config (if possible)
        if task_type == "question_answering":
            processor = SquadProcessor(
                tokenizer=tokenizer, max_seq_len=max_seq_len,
                label_list=["start_token", "end_token"],
                metric="squad", data_dir=None)
        elif task_type == "embeddings":
            processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=max_seq_len)
        elif task_type == "text_classification":
            label_list = list(config.id2label[id] for id in range(len(config.id2label)))
            processor = TextClassificationProcessor(
                tokenizer=tokenizer, max_seq_len=max_seq_len, data_dir=None,
                label_list=label_list, label_column_name="label",
                metric="acc", quote_char='"')
        # elif task_type == "multilabel-classification":
        #     # label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
        #     label_list = list(config.label2id.keys())
        #     processor = TextClassificationProcessor(tokenizer=tokenizer,
        #                                             max_seq_len=max_seq_len,
        #                                             data_dir=None,
        #                                             label_list=label_list,
        #                                             label_column_name="label",
        #                                             metric="acc",
        #                                             quote_char='"',
        #                                             multilabel=True,
        #                                             )
        elif task_type == "ner":
            label_list = list(config.label2id.keys())
            processor = NERProcessor(
                tokenizer=tokenizer, max_seq_len=max_seq_len, data_dir=None,
                metric="seq_f1", label_list=label_list)
        else:
            raise ValueError(f"`task_type` {task_type} is not supported yet. "
                             f"Valid options for arg `task_type`: 'question_answering', "
                             f"'embeddings', 'text_classification'")

    return cls(model, processor, batch_size=batch_size, gpu=gpu, name=name,
               return_class_probs=return_class_probs)
def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout,
                       evaluate_every, use_cuda, max_seq_len, learning_rate,
                       do_lower_case, register_model, save_model=True,
                       early_stopping=False):
    language = cu.params.get('language')

    # Check task
    if cu.tasks.get(str(task)).get('type') != 'classification':
        raise Exception('NOT A CLASSIFICATION TASK')

    # Data
    dt_task = dt.Data(task=task)
    ## Download training files
    if not os.path.isfile(dt_task.get_path('fn_train', dir='data_dir')):
        dt_task.download('data_dir', dir='data_dir', source='datastore')

    # Settings
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)
    lang_model = he.get_farm_model(model_type, language)
    save_dir = dt_task.get_path('model_dir')
    label_list = dt_task.load('fn_label', dir='data_dir', header=None)[0].to_list()

    # AML log
    try:
        aml_run.log('task', task)
        aml_run.log('language', language)
        aml_run.log('n_epochs', n_epochs)
        aml_run.log('batch_size', batch_size)
        aml_run.log('learning_rate', learning_rate)
        aml_run.log('embeds_dropout', embeds_dropout)
        aml_run.log('max_seq_len', max_seq_len)
        aml_run.log('lang_model', lang_model)
        aml_run.log_list('label_list', label_list)
    except:
        pass

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # The evaluation on the dev set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must be registered under a string name, and that string name
    # must be used.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        # AML log
        try:
            aml_run.log('acc', acc.get('acc'))
            aml_run.log('f1macro', f1macro)
            aml_run.log('f1micro', f1micro)
        except:
            pass
        return {"acc": acc, "f1_macro": f1macro, "f1_micro": f1micro}

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer, max_seq_len=max_seq_len, data_dir=dt_task.data_dir,
        label_list=label_list, metric=metric, label_column_name="label",
        train_filename=dt_task.get_path('fn_train', dir='data_dir'),
        test_filename=dt_task.get_path('fn_test', dir='data_dir'))

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    ## Pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    ## Prediction head on top that is suited for our task => text classification
    prediction_head = TextClassificationHead(
        num_labels=len(processor.tasks["text_classification"]["label_list"]),
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(
        language_model=language_model, prediction_heads=[prediction_head],
        embeds_dropout_prob=embeds_dropout, lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model, n_batches=len(data_silo.loaders["train"]), n_epochs=n_epochs,
        device=device, learning_rate=learning_rate, use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time.
    # Also create an EarlyStopping instance and pass it on to the trainer.
    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    if early_stopping:
        earlystopping = EarlyStopping(
            metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
            # metric="loss", mode="min",    # or use loss from the dev evaluator of the trainer
            save_dir=save_dir,  # where to save the best model
            patience=2)  # number of evaluations to wait for improvement before terminating training
    else:
        earlystopping = None

    trainer = Trainer(
        model=model, optimizer=optimizer, data_silo=data_silo, epochs=n_epochs,
        n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every,
        device=device, early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Store it:
    # NOTE: if early stopping is used, the best model has already been stored in the directory
    # defined with the EarlyStopping instance. The model we have at this point is the model from
    # the last training epoch that was carried out before early stopping terminated the training.
    if save_model:
        model.save(save_dir)
        processor.save(save_dir)

    if register_model:
        dt_task.upload('model_dir', destination='model')
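# The early-stopping behaviour described in the comments above, reduced to its
# core for clarity (a hedged sketch, not FARM's EarlyStopping class): track the
# best dev score and stop once `patience` evaluations pass without improvement.
def should_stop(dev_scores, patience=2):
    best_idx = max(range(len(dev_scores)), key=dev_scores.__getitem__)
    return (len(dev_scores) - 1 - best_idx) >= patience

print(should_stop([0.70, 0.72, 0.71, 0.71], patience=2))  # True: no gain for 2 evals
print(should_stop([0.70, 0.72, 0.73], patience=2))        # False: still improving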
def doc_classification_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    xval_folds = 5
    xval_stratified = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    use_amp = None

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    # The evaluation on the dev set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must be registered under a string name, and that string name
    # must be used.
    # For xval we also store the actual predictions and labels in each result, so we can
    # calculate overall metrics over all folds later.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc,
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
    # Here we load the GermEval 2018 data.
    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(
        tokenizer=tokenizer, max_seq_len=64, data_dir=Path("../data/germeval18"),
        label_list=label_list, metric=metric, label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # The following steps should be run for each of the folds of the cross-validation,
    # so we put them into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(f"############ Crossvalidation: Fold {n_fold} ############")
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(
            language_model=language_model, prediction_heads=[prediction_head],
            embeds_dropout_prob=0.2, lm_output_types=["per_sequence"], device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model, learning_rate=0.5e-5, device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs, use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful
        # plant and evaluates it from time to time.
        # Also create an EarlyStopping instance and pass it on to the trainer.
        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold allows us to use the
        # n-folds best models in an ensemble afterwards!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=save_dir,  # where to save the best model
            patience=5)  # number of evaluations to wait for improvement before terminating training

        trainer = Trainer(
            model=model, optimizer=optimizer, data_silo=silo_to_use, epochs=n_epochs,
            n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every,
            device=device, early_stopping=earlystopping, evaluator_test=False)

        # train it
        trainer.train()
        return trainer.model

    # For each fold, run the whole training with early stopping to get a model, then evaluate
    # the model on the test set of each fold.
    # Remember all the results for overall metrics over all predictions of all folds
    # and for averaging.
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")

    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on the test set here (and not in the Trainer), so that we can easily store
        # the actual preds and labels for a "global" eval across all folds
        evaluator_test = Evaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result, "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of the best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_xval.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # calculate overall metrics across all folds
    xval_f1_micro = f1_score(all_labels, all_preds, labels=label_list, average="micro")
    xval_f1_macro = f1_score(all_labels, all_preds, labels=label_list, average="macro")
    xval_f1_offense = f1_score(all_labels, all_preds, labels=label_list, pos_label="OFFENSE")
    xval_f1_other = f1_score(all_labels, all_preds, labels=label_list, pos_label="OTHER")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    logger.info(f"XVAL F1 MICRO: {xval_f1_micro}")
    logger.info(f"XVAL F1 MACRO: {xval_f1_macro}")
    logger.info(f"XVAL F1 OFFENSE: {xval_f1_offense}")
    logger.info(f"XVAL F1 OTHER: {xval_f1_other}")
    logger.info(f"XVAL MCC: {xval_mcc}")

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval fold for evaluation on
    # the original (still unseen) test set.
    logger.info("###### Final eval on held-out test set using the best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore the model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info(f"TEST F1 MICRO: {result[0]['f1_micro']}")
    logger.info(f"TEST F1 MACRO: {result[0]['f1_macro']}")
    logger.info(f"TEST F1 OFFENSE: {result[0]['f1_offense']}")
    logger.info(f"TEST F1 OTHER: {result[0]['f1_other']}")
    logger.info(f"TEST MCC: {result[0]['mcc']}")
def doc_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification_fasttext")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 3
    batch_size = 32
    evaluate_every = 100
    # load fasttext from a local path:
    #fasttext_model = "../saved_models/fasttext-german-uncased"
    # or through s3
    fasttext_model = "fasttext-german-uncased"
    do_lower_case = True
    max_features = 10_000  # maximum number of unique words we will transform
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. To make fasttext work within FARM and with advanced aggregation strategies,
    # we need a fixed vocabulary and associated word embeddings
    ft_converter = Fasttext_converter(
        pretrained_model_name_or_path=fasttext_model,
        do_lower_case=do_lower_case,
        data_path=Path("../data/germeval18"),
        train_filename="train.tsv",
        output_path=Path("../saved_models/fasttext-german-uncased-converted"),
        language="German",
        max_features=max_features)
    # We convert the data to have a fixed-size vocab and embeddings
    vocab_counts = ft_converter.convert_on_data()

    # 2. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=ft_converter.output_path,
        do_lower_case=do_lower_case)

    # 3. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has a train.tsv and a test.tsv dataset - no dev.tsv
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=ft_converter.data_path,
        label_list=label_list,
        train_filename=ft_converter.train_filename,
        dev_split=0,
        test_filename="test.tsv",
        metric=metric,
        label_column_name="coarse_label")

    # 4. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets.
    # Multiprocessing with the WordembeddingTokenizer is not optimal - so disable it.
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    # 5. Create an AdaptiveModel
    # a) which consists of the newly created embedding model as a basis.
    language_model = LanguageModel.load(ft_converter.output_path)
    # b) and a prediction head on top that is suited for our task => text classification.
    # Since we do not have a powerful transformer-based language model, we need a slightly
    # deeper NN for the classification part.
    prediction_head = TextClassificationHead(
        layer_dims=[300, 600, len(label_list)],
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 6. Create an optimizer
    # (when streaming data, use len(data_silo.get_data_loader("train")) for n_batches)
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-3,
        device=device,
        n_batches=len(data_silo.get_data_loader("train")),
        n_epochs=n_epochs)

    # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 8. Let it grow
    trainer.train()
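# --- Illustration (not part of the tutorial above) ---
# The fasttext example trains but never persists or applies the model. A minimal
# follow-up sketch, continuing from the training function above and mirroring the
# save/Inferencer pattern used in the other tutorials here; the save_dir name is
# illustrative, not part of the original example.
from pathlib import Path
from farm.infer import Inferencer

save_dir = Path("../saved_models/fasttext-german-doc-tutorial")  # hypothetical path
model.save(save_dir)
processor.save(save_dir)

basic_texts = [{"text": "Martin Müller spielt Handball in Berlin"}]
inferencer = Inferencer.load(save_dir)
print(inferencer.inference_from_dicts(dicts=basic_texts))
inferencer.close_multiprocessing_pool()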
def doc_classification():
    # NOTE: this variant expects settings such as use_amp, lang_model, label_list, metric,
    # batch_size, n_epochs and evaluate_every to be defined at module level.
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    tokenizer = AutoTokenizer.from_pretrained(lang_model, strip_accents=False)
    #tokenizer = Tokenizer.load(
    #    pretrained_model_name_or_path=lang_model,
    #    do_lower_case=do_lower_case)

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("./data/germeval18"),
        label_list=label_list,
        metric=metric,
        dev_filename="test.tsv",  # we want to evaluate against the test set
        label_column_name="coarse_label",
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    earlystopping = EarlyStopping(
        metric=metric, mode="max",
        #save_dir=Path("./saved_models"),
        patience=3
    )

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        early_stopping=earlystopping,
        device=device)

    trainer.train()
    return earlystopping.best_so_far
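# --- Illustration (not part of the function above) ---
# Because this variant returns the best dev score seen by early stopping, it lends
# itself to small hyperparameter sweeps. A hedged sketch of how it might be driven;
# all values are illustrative, and the module-level settings it expects are defined
# here explicitly.
use_amp = None
lang_model = "bert-base-german-cased"
label_list = ["OTHER", "OFFENSE"]
metric = "f1_macro"
batch_size = 32
evaluate_every = 100

best_scores = {}
for n_epochs in (2, 3):  # toy sweep over a single hyperparameter
    best_scores[n_epochs] = doc_classification()
print(best_scores)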
def doc_classification_with_earlystopping():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has a train.tsv and a test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev set can be done with one of the predefined metrics or with a
    # custom metric: a function from (preds, labels) to a dict containing all metric values.
    # The function must be registered under a string name, and that name is then passed to the
    # processor as the metric.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time.
    # Also create an EarlyStopping instance and pass it on to the trainer.
    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric, and to stop training when no improvement has happened for some
    # number of evaluations.
    earlystopping = EarlyStopping(
        metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
        # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
        # metric="loss", mode="min",      # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has already been stored in the directory
    # defined by the EarlyStopping instance.
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training.
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]

    # Load from the final-epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()

    # Load from the saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
    model.close_multiprocessing_pool()
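# --- Illustration (not part of the tutorial above) ---
# A simplified sketch of what patience-based early stopping does conceptually; this
# is illustrative only, not FARM's EarlyStopping implementation: stop once `patience`
# consecutive dev evaluations pass without improvement, keeping the best checkpoint.
def should_stop(dev_scores, patience=5, mode="max"):
    best = None
    waited = 0
    for score in dev_scores:
        improved = best is None or (score > best if mode == "max" else score < best)
        if improved:
            best, waited = score, 0   # new best => this is where the checkpoint is saved
        else:
            waited += 1
            if waited >= patience:
                return True           # no improvement for `patience` evaluations
    return False

print(should_stop([0.60, 0.64, 0.63, 0.63, 0.64, 0.62, 0.63, 0.64], patience=5))  # True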
def doc_classification_cola():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 5
    batch_size = 100
    evaluate_every = 20
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the CoLA (Corpus of Linguistic Acceptability) data.
    label_list = ["0", "1"]
    metric = "mcc"

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=64,
                                            data_dir=Path("../data/cola"),
                                            dev_filename=Path("dev.tsv"),
                                            dev_split=None,
                                            test_filename=None,
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # language_model = Roberta.load(lang_model)
    # b) and a prediction head on top that is suited for our task => text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (inference)
    basic_texts = [
        {"text": "The box contained the ball from the tree."},
        {"text": "I'll fix you a drink."},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
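# --- Illustration (not part of the tutorial above) ---
# Why metric="mcc" for CoLA: Matthews correlation stays informative on skewed label
# distributions, where plain accuracy can look good for a model that learned nothing.
# A small self-contained example with sklearn:
from sklearn.metrics import matthews_corrcoef

labels = ["1", "1", "1", "1", "1", "1", "1", "1", "0", "0"]
always_acceptable = ["1"] * 10  # 80% accurate, but carries no signal
print(matthews_corrcoef(labels, always_acceptable))  # 0.0 - no correlation

decent = ["1", "1", "1", "1", "1", "1", "1", "0", "0", "1"]
print(matthews_corrcoef(labels, decent))  # 0.375 - some real signal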
    def train(
        self,
        data_dir: str,
        train_filename: str,
        label_list: List[str],
        delimiter: str,
        metric: str,
        dev_filename: Optional[str] = None,
        test_filename: Optional[str] = None,
        use_gpu: Optional[bool] = None,
        batch_size: int = 10,
        n_epochs: int = 2,
        learning_rate: float = 1e-5,
        max_seq_len: Optional[int] = None,
        warmup_proportion: float = 0.2,
        dev_split: float = 0,
        evaluate_every: int = 300,
        save_dir: Optional[str] = None,
        num_processes: Optional[int] = None,
        use_amp: Optional[str] = None,
    ):
        """
        Fine-tune a model on a TextClassification dataset.
        The dataset needs to be in tabular format (CSV, TSV, etc.), with columns called
        "label" and "text" in no specific order.

        Options:

        - Take a plain language model (e.g. `bert-base-cased`) and train it for TextClassification
        - Take a TextClassification model and fine-tune it for your domain

        :param data_dir: Path to directory containing your training data
        :param train_filename: Filename of training data
        :param label_list: List of labels in the training dataset, e.g., ["0", "1"]
        :param delimiter: Delimiter that separates columns in the training dataset, e.g., "\t"
        :param metric: Evaluation metric to be used while training, e.g., "f1_macro"
        :param dev_filename: Filename of dev / eval data
        :param test_filename: Filename of test data
        :param dev_split: Instead of specifying a dev_filename, you can also specify a ratio
                          (e.g. 0.1) here that gets split off from training data for eval.
        :param use_gpu: Whether to use GPU (if available)
        :param batch_size: Number of samples the model receives in one batch for training
        :param n_epochs: Number of iterations on the whole training data set
        :param learning_rate: Learning rate of the optimizer
        :param max_seq_len: Maximum text length (in tokens). Everything longer gets cut down.
        :param warmup_proportion: Proportion of training steps until maximum learning rate is
                                  reached. Until that point the LR increases linearly; after it,
                                  the LR decreases linearly again. Options for different schedules
                                  are available in FARM.
        :param evaluate_every: Evaluate the model every X steps on the hold-out eval dataset
        :param save_dir: Path to store the final model
        :param num_processes: The number of processes for `multiprocessing.Pool` during
                              preprocessing. Set to 1 to disable multiprocessing. When set to 1,
                              you cannot split away a dev set from the train set. Set to None to
                              use all CPU cores minus one.
        :param use_amp: Optimization level of NVIDIA's automatic mixed precision (AMP). The higher
                        the level, the faster the model. Available options:
                        None (don't use AMP), "O0" (normal FP32 training),
                        "O1" (mixed precision => recommended), "O2" (almost FP16),
                        "O3" (pure FP16). See details on: https://nvidia.github.io/apex/amp.html
        :return: None
        """
        if dev_filename:
            dev_split = 0

        if num_processes is None:
            num_processes = multiprocessing.cpu_count() - 1 or 1

        set_all_seeds(seed=42)

        # For these variables, by default, we use the value set when initializing the
        # FARMClassifier. These can also be set manually when train() is called if you want a
        # different value at train vs inference.
        if use_gpu is None:
            use_gpu = self.use_gpu
        if max_seq_len is None:
            max_seq_len = self.max_seq_len

        device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=use_amp)

        if not save_dir:
            save_dir = f"saved_models/{self.inferencer.model.language_model.name}"

        # 1. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        processor = TextClassificationProcessor(
            tokenizer=self.inferencer.processor.tokenizer,
            max_seq_len=max_seq_len,
            label_list=label_list,
            metric=metric,
            train_filename=train_filename,
            dev_filename=dev_filename,
            dev_split=dev_split,
            test_filename=test_filename,
            data_dir=Path(data_dir),
            delimiter=delimiter,
        )

        # 2. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders
        # for them and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor,
                             batch_size=batch_size,
                             distributed=False,
                             max_processes=num_processes)

        # 3. Create an optimizer and pass the already initialized model
        model, optimizer, lr_schedule = initialize_optimizer(
            model=self.inferencer.model,
            learning_rate=learning_rate,
            schedule_opts={"name": "LinearWarmup", "warmup_proportion": warmup_proportion},
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            device=device,
            use_amp=use_amp,
        )

        # 4. Feed everything to the Trainer, which takes care of growing our model and evaluating
        # it from time to time
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
            use_amp=use_amp,
            disable_tqdm=not self.progress_bar
        )

        # 5. Let it grow!
        self.inferencer.model = trainer.train()
        self.save(Path(save_dir))
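# --- Illustration (not part of the method above) ---
# A hypothetical usage sketch for the train() method above. The docstring mentions a
# FARMClassifier wrapper; the constructor arguments below are assumptions made for
# illustration, not a confirmed API.
classifier = FARMClassifier(model_name_or_path="bert-base-cased")  # assumed constructor
classifier.train(
    data_dir="data/my_task",        # hypothetical dataset location
    train_filename="train.tsv",
    dev_split=0.1,                  # carve a dev set out of train.tsv
    label_list=["0", "1"],
    delimiter="\t",
    metric="f1_macro",
    n_epochs=2,
    save_dir="saved_models/my_task",
)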
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data.
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir="../data/germeval18",
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => text classification
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])],
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))
def doc_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification_glove")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 3
    batch_size = 32
    evaluate_every = 100
    # load from a local path:
    lang_model = Path("../saved_models/glove-german-uncased")
    # or through s3
    #lang_model = "glove-german-uncased"
    do_lower_case = True
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has a train.tsv and a test.tsv dataset - no dev.tsv
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        dev_split=0,
        test_filename="test.tsv",
        train_filename="train.tsv",
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    # 4. Create an AdaptiveModel
    # a) which consists of an embedding model as a basis.
    # Word embedding models only convert words they have seen during training to embedding vectors.
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => text classification
    prediction_head = TextClassificationHead(
        layer_dims=[300, 600, len(label_list)],
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()
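# --- Illustration (not part of the tutorial above) ---
# How layer_dims appears to be used by the prediction heads in these examples:
# [300, 600, 2] describes a small feed-forward stack from the 300-dim embedding
# output to 2 classes, while [768, 2] (as in the BERT snippet above) is a single
# linear layer on BERT's 768-dim output. A minimal torch sketch of that reading,
# not FARM's actual head code:
import torch
from torch import nn

def feed_forward_from_dims(layer_dims):
    # one Linear layer per consecutive pair of dims
    layers = [nn.Linear(i, o) for i, o in zip(layer_dims[:-1], layer_dims[1:])]
    return nn.Sequential(*layers)

head = feed_forward_from_dims([300, 600, 2])
logits = head(torch.randn(4, 300))  # batch of 4 pooled embeddings -> (4, 2) logits
print(logits.shape)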
def doc_classification(
    task_config,
    model_name_or_path,
    cache_dir,
    data_dir,
    save_dir,
    model_dir,
    run_name="0",
    lr=1e-05,
    warmup_steps=5000,
    balance_classes=True,
    embeds_dropout=0.1,
    epochs=200,  # large because we use early stopping by default
    batch_size=20,
    grad_acc_steps=1,
    early_stopping_metric="roc_auc",
    early_stopping_mode="max",
    early_stopping_patience=10,
    model_class="Bert",
    tokenizer_class="BertTokenizer",
    do_lower_case=False,
    do_train=True,
    do_eval=True,
    do_hpo=False,
    print_preds=False,
    print_dev_preds=False,
    max_seq_len=512,
    seed=11,
    eval_every=500,
    use_amp=False,
    use_cuda=True,
):
    # Load task config
    with open(task_config) as f:
        task_config = yaml.safe_load(f)

    # Normalize the directory arguments to Path objects, since they are combined with
    # subpaths below
    data_dir = Path(data_dir)
    model_dir = Path(model_dir)

    # Create the label list from an args list or (for large label lists) from a file,
    # splitting on spaces
    if isinstance(task_config["data"]["label_list"], list):
        label_list = task_config["data"]["label_list"]
    else:
        with open(data_dir / 'labels' / task_config["data"]["label_list"]) as code_file:
            label_list = code_file.read().split(" ")

    # Register Outcome Metrics
    register_task_metrics(label_list)

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=model_name_or_path,
        tokenizer_class=tokenizer_class,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=data_dir,
        label_list=label_list,
        metric=task_config["metric"],
        multilabel=task_config["multilabel"],
        train_filename=task_config["data"]["train_filename"],
        dev_filename=task_config["data"]["dev_filename"],
        dev_split=task_config["data"].get("dev_split"),
        test_filename=task_config["data"]["test_filename"],
        delimiter=task_config["data"]["parsing"]["delimiter"],
        quote_char=task_config["data"]["parsing"]["quote_char"],
        label_column_name=task_config["data"]["parsing"]["label_column"])

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         caching=True,
                         cache_path=Path(cache_dir),
                         batch_size=batch_size)

    if do_train:
        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(
            experiment_name=task_config["experiment_name"],
            run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(model_name_or_path,
                                            language_model_class=model_class)

        # b) and a prediction head on top that is suited for our task

        # Define class weights
        if balance_classes:
            class_weights = data_silo.calculate_class_weights(
                task_name=task_config["task_type"])
        else:
            class_weights = None

        # Create multi- or single-label classification heads
        if task_config["multilabel"]:
            prediction_head = MultiLabelTextClassificationHead(
                class_weights=class_weights,
                num_labels=len(label_list))
        else:
            prediction_head = ExtendedTextClassificationHead(
                class_weights=class_weights,
                num_labels=len(label_list))

        model = ExtendedAdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=[task_config["output_type"]],
            device=device)

        # 5. Create an optimizer
        schedule_opts = {
            "name": "LinearWarmup",
            "num_warmup_steps": warmup_steps
        }
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(mode=early_stopping_mode,
                                           min_delta=0.0001,
                                           save_dir=model_dir,
                                           metric=early_stopping_metric,
                                           patience=early_stopping_patience)

        # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful
        # plant and evaluates it from time to time
        trainer = ExtendedTrainer(model=model,
                                  optimizer=optimizer,
                                  data_silo=data_silo,
                                  epochs=epochs,
                                  n_gpu=n_gpu,
                                  lr_schedule=lr_schedule,
                                  evaluate_every=eval_every,
                                  early_stopping=early_stopping,
                                  device=device,
                                  grad_acc_steps=grad_acc_steps,
                                  evaluator_test=do_eval)

        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save the model if it was not already saved by early stopping
        model.save(model_dir / "final_model")
        processor.save(model_dir / "final_model")

    if do_eval:
        # Evaluate the newly trained model or an existing model
        if not do_train:
            model_dir = Path(model_name_or_path)

        logger.info("###### Eval on TEST SET #####")
        evaluator_test = ExtendedEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)

        # Load trained model for evaluation
        model = ExtendedAdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results,
                          dataset_name="test",
                          steps=len(evaluator_test.data_loader),
                          save_path=model_dir / "eval_results.txt")

        if print_preds:
            # Print model test predictions
            utils.save_predictions(results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"])

        if print_dev_preds:
            # Evaluate on the dev set, e.g. for threshold tuning
            evaluator_dev = Evaluator(
                data_loader=data_silo.get_data_loader("dev"),
                tasks=data_silo.processor.tasks,
                device=device)
            dev_results = evaluator_dev.eval(model, return_preds_and_labels=True)
            utils.log_results(dev_results,
                              dataset_name="dev",
                              steps=len(evaluator_dev.data_loader),
                              save_path=model_dir / "eval_dev_results.txt")
            # Print model dev predictions
            utils.save_predictions(dev_results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"],
                                   dataset_name="dev")
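# --- Illustration (not part of the function above) ---
# The shape of the YAML task config this function expects can be read off the keys it
# accesses, reconstructed here as the dict that yaml.safe_load would return. All values
# are illustrative; only the key structure is implied by the code above.
example_task_config = {
    "experiment_name": "doc_cls_demo",
    "log_dir": "logs",                  # MLflow tracking URI
    "task_type": "text_classification",
    "output_type": "per_sequence",
    "metric": "roc_auc",
    "multilabel": False,
    "data": {
        "label_list": ["OTHER", "OFFENSE"],  # or a filename with space-separated labels
        "train_filename": "train.tsv",
        "dev_filename": "dev.tsv",
        "dev_split": None,
        "test_filename": "test.tsv",
        "parsing": {
            "delimiter": "\t",
            "quote_char": '"',
            "label_column": "coarse_label",
        },
    },
}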
def perform_fine_tuning(current_info_need,
                        bert_model,
                        label_list,
                        num_epochs,
                        condition,
                        folds=10,
                        stratified=True,
                        learning_rate=2e-5,
                        batch_size=32,
                        embeds_dropout_prob=.1):

    ## Define evaluation metrics ##
    def evaluation_metrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="Other")
        f1infoneed = f1_score(y_true=labels, y_pred=preds, pos_label=current_info_need)
        recall_infoneed = recall_score(y_true=labels, y_pred=preds, pos_label=current_info_need)
        precision_infoneed = precision_score(y_true=labels, y_pred=preds, pos_label=current_info_need)
        recall_other = recall_score(y_true=labels, y_pred=preds, pos_label="Other")
        precision_other = precision_score(y_true=labels, y_pred=preds, pos_label="Other")
        recall_macro = recall_score(y_true=labels, y_pred=preds, average="macro")
        precision_macro = precision_score(y_true=labels, y_pred=preds, average="macro")
        recall_micro = recall_score(y_true=labels, y_pred=preds, average="micro")
        precision_micro = precision_score(y_true=labels, y_pred=preds, average="micro")
        recall_weighted = recall_score(y_true=labels, y_pred=preds, average="weighted")
        precision_weighted = precision_score(y_true=labels, y_pred=preds, average="weighted")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        f1weighted = f1_score(y_true=labels, y_pred=preds, average="weighted")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "info_need": current_info_need,
            "model": bert_model,
            "num_epochs": num_epochs,
            "condition": condition,
            "acc": acc,
            "f1_other": f1other,
            "f1_infoneed": f1infoneed,
            "precision_infoneed": precision_infoneed,
            "recall_infoneed": recall_infoneed,
            "recall_other": recall_other,
            "precision_other": precision_other,
            "recall_macro": recall_macro,
            "precision_macro": precision_macro,
            "recall_micro": recall_micro,
            "precision_micro": precision_micro,
            "recall_weighted": recall_weighted,
            "precision_weighted": precision_weighted,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "f1_weighted": f1weighted,
            "mcc": mcc
        }

    register_metrics(
        f'eval_metrics_{current_info_need}_{bert_model}_{condition}__{num_epochs}_epochs',
        evaluation_metrics)
    metric = f'eval_metrics_{current_info_need}_{bert_model}_{condition}__{num_epochs}_epochs'

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    logger, ml_logger = init_logging()

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=bert_model,
                               do_lower_case=False)

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        train_filename=f"{current_info_need}_{condition}_{num_epochs}_epochs_train.csv",
        test_filename=f"{current_info_need}_{condition}_{num_epochs}_epochs_test.csv",
        data_dir="data/",
        label_list=label_list,
        metric=metric,
        text_column_name="utterance",
        label_column_name=level,
        delimiter=";")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    silos = DataSiloForCrossVal.make(data_silo, n_splits=folds, sets=['train', 'test'])

    # The following steps should be run for each fold of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(f"############ Crossvalidation: Fold {n_fold} ############")

        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(bert_model)
        # b) and a prediction head on top that is suited for our task => text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(language_model=language_model,
                              prediction_heads=[prediction_head],
                              embeds_dropout_prob=embeds_dropout_prob,
                              lm_output_types=["per_sequence"],
                              device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=learning_rate,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=num_epochs,
            use_amp=None)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful
        # plant and evaluates it from time to time.
        # Also create an EarlyStopping instance and pass it on to the trainer.
        # An early stopping instance can be used to save the model that performs best on the dev
        # set according to some metric, and to stop training when no improvement has happened for
        # some number of evaluations.
        # NOTE: Using a different save directory for each fold allows us to use the n best fold
        # models in an ensemble afterwards (see the sketch after this example)!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_infoneed",
            mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=num_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=100,
                          device=device,
                          early_stopping=earlystopping,
                          evaluator_test=False)

        # train it
        trainer.train()
        return trainer.model

    # For each fold, run the whole training with early stopping to get a model, then evaluate
    # that model on the fold's test set.
    # Remember all results, so we can compute overall metrics over the predictions of all folds
    # and average across folds.
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_info_need = -1
    language_model_name = bert_model
    if language_model_name.find("/") != -1:
        language_model_name = language_model_name.replace("/", "_")
    save_dir = Path(
        f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}"
    )
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # Do eval on the test set here (and not in the Trainer), so that we can easily store
        # the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)
        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_info_need = result[0]["f1_infoneed"]
        if f1_info_need > bestf1_info_need:
            bestf1_info_need = f1_info_need
            bestfold = num_fold

        # empty the cache to avoid a memory leak and CUDA OOM across multiple folds
        model.cpu()
        torch.cuda.empty_cache()

    # Save the per-fold results to json for a separate, more detailed analysis
    with open(
            f"classification_results/test/{current_info_need}-{language_model_name}-{condition}-{num_epochs}_epochs-{folds}-fold-cv.results.json",
            "wt") as fp:
        json.dump(allresults, fp)

    # calculate overall metrics across all folds
    xval_f1_other = f1_score(all_labels, all_preds, labels=label_list, pos_label="Other")
    xval_f1_info_need = f1_score(all_labels, all_preds, labels=label_list, pos_label=current_info_need)
    xval_f1_micro = f1_score(all_labels, all_preds, labels=label_list, average="micro")
    xval_f1_macro = f1_score(all_labels, all_preds, labels=label_list, average="macro")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    xval_overall_results = {
        "xval_f1_other": xval_f1_other,
        "xval_f1_infoneed": xval_f1_info_need,
        "xval_f1_micro": xval_f1_micro,
        "xval_f1_macro": xval_f1_macro,
        "xval_mcc": xval_mcc
    }

    logger.info(f"XVAL F1 MICRO: {xval_f1_micro}")
    logger.info(f"XVAL F1 MACRO: {xval_f1_macro}")
    logger.info(f"XVAL F1 OTHER: {xval_f1_other}")
    logger.info(f"XVAL F1 {current_info_need} {condition} {num_epochs} epochs: {xval_f1_info_need}")
    logger.info(f"XVAL MCC: {xval_mcc}")

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval fold for evaluation on
    # the original (still unseen) test set.
    logger.info("###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(
        f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}-{bestfold}"
    )
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info(f"TEST F1 MICRO: {result[0]['f1_micro']}")
    logger.info(f"TEST F1 MACRO: {result[0]['f1_macro']}")
    logger.info(f"TEST F1 OTHER: {result[0]['f1_other']}")
    logger.info(f"TEST F1 {current_info_need}: {result[0]['f1_infoneed']}")
    logger.info(f"TEST MCC: {result[0]['mcc']}")

    test_set_results = {
        "test_f1_other": result[0]["f1_other"],
        "test_f1_infoneed": result[0]["f1_infoneed"],
        "test_f1_micro": result[0]["f1_micro"],
        "test_f1_macro": result[0]["f1_macro"],
        "test_mcc": result[0]["mcc"]
    }
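# --- Illustration (not part of the function above) ---
# A hedged sketch of the fold ensemble hinted at in the NOTE inside train_on_split:
# load each fold's best model and majority-vote its predicted labels. The parsing of
# the Inferencer output (result[0]["predictions"][i]["label"]) is an assumption about
# the result format; adapt it to what your FARM version actually returns.
from collections import Counter
from pathlib import Path
from farm.infer import Inferencer

def ensemble_predict(texts, save_dir_prefix, n_folds):
    dicts = [{"text": t} for t in texts]
    votes = [[] for _ in texts]
    for fold in range(n_folds):
        inferencer = Inferencer.load(Path(f"{save_dir_prefix}-{fold}"))
        result = inferencer.inference_from_dicts(dicts=dicts)
        for i, pred in enumerate(result[0]["predictions"]):  # assumed result format
            votes[i].append(pred["label"])
        inferencer.close_multiprocessing_pool()
    # majority vote per text across all fold models
    return [Counter(v).most_common(1)[0][0] for v in votes]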