def build_task_data(params: Params, data_supplier) -> SeqTagTaskData:
    dataset_dict: Dict[str, List[TaggedSequence]] = data_supplier()
    ner_labels = ["[PAD]", NIT] + list(
        set(tag for taggedseqs in dataset_dict.values()
            for taggedseq in taggedseqs
            for tok, tag in taggedseq))

    ml_logger = MLFlowLogger(tracking_uri=os.environ["HOME"] + "/data/mlflow_experiments/mlruns")
    ml_logger.init_experiment(experiment_name="Sequence_Tagging", run_name="Run_ner")

    lang_model = "bert-base-cased"
    do_lower_case = False
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=None,  # noqa
        metric="seq_f1",
        label_list=ner_labels,
    )

    task_data = {
        "num_labels": len(ner_labels),
        "lang_model": lang_model,
        "ml_logger": ml_logger,
        "processor": processor,
        "params": params,
    }
    return SeqTagTaskData(data=dataset_dict, task_data=task_data)
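# A minimal sketch of how build_task_data might be called. The demo data supplier and the
# default-constructed Params() below are illustrative assumptions, not part of the original code:
# the supplier is expected to return a dict mapping split names to lists of tagged sequences,
# where each tagged sequence yields (token, tag) pairs.
def demo_data_supplier():
    train = [[("Berlin", "B-LOC"), ("ist", "O"), ("schön", "O")]]
    return {"train": train, "dev": train, "test": train}

task_data = build_task_data(params=Params(), data_supplier=demo_data_supplier)
# assuming SeqTagTaskData exposes its constructor fields as attributes
print(task_data.task_data["num_labels"])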
########## Settings ##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 4
batch_size = 32
evaluate_every = 50
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=lang_model, do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
processor = NERProcessor(tokenizer=tokenizer, max_seq_len=128, data_dir="../data/conll03-de")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
#    and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = Bert.load(lang_model)
# b) and a prediction head on top that is suited for our task => NER
prediction_head = TokenClassificationHead(layer_dims=[768, len(processor.label_list)])

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
def test_ner_amp(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    if AMP_AVAILABLE:
        use_amp = 'O1'
    else:
        use_amp = None

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename=Path("train-sample.txt"),
                             dev_filename=Path("dev-sample.txt"),
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_token"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-05,
        schedule_opts=None,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "1980 kam der Crown von Toyota"},
    ]
    model = Inferencer.load(save_dir, gpu=True)
    result = model.inference_from_dicts(dicts=basic_texts, max_processes=1)
    # print(result)
    assert result[0]["predictions"][0]["context"] == "Crown"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
evaluate_every = 100
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
              "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
processor = NERProcessor(tokenizer=tokenizer,
                         max_seq_len=128,
                         data_dir="../data/conll03-de",
                         metric="seq_f1",
                         label_list=ner_labels)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
#    and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => NER
prediction_head = TokenClassificationHead(
    task_name="ner",
    layer_dims=[768, len(processor.tasks["ner"]["label_list"])])

model = AdaptiveModel(
def ner():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_ner")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 4
    batch_size = 32
    evaluate_every = 400
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=128,
                             data_dir=Path("../data/conll03-de"),
                             delimiter=" ",
                             metric="seq_f1",
                             label_list=ner_labels)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = "saved_models/bert-german-ner-tutorial"
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
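# The NERProcessor above expects CoNLL-style text files: one "token<delimiter>label" pair per line,
# sentences separated by blank lines (here delimiter=" "). A minimal, illustrative sketch of such a
# file follows; the directory name and sample sentences are assumptions, not the official conll03-de
# download.
from pathlib import Path

sample_conll = """Schartau B-PER
sagte O
dem O
Tagesspiegel B-ORG

Martin B-PER
Müller I-PER
spielt O
Handball O
in O
Berlin B-LOC
"""
demo_dir = Path("demo_data/conll03-de-sample")  # hypothetical location used only for this sketch
demo_dir.mkdir(parents=True, exist_ok=True)
(demo_dir / "train.txt").write_text(sample_conll, encoding="utf-8")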
def test_ner(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir="samples/ner",
        train_filename="train-sample.txt",
        dev_filename="dev-sample.txt",
        test_filename=None,
        delimiter=" ",
        label_list=ner_labels,
        metric="seq_f1",
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(layer_dims=[768, len(ner_labels)])

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = "testsave/ner"
    model = trainer.train(model)
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    assert result[0]["predictions"][0]["context"] == "sagte"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
def load(cls, model_name_or_path, batch_size=4, gpu=False, task_type=None,
         return_class_probs=False, strict=True, max_seq_len=256):
    """
    Load an Inferencer incl. all relevant components (model, tokenizer, processor ...)
    either by

    1. specifying a public name from transformers' model hub (https://huggingface.co/models)
    2. or pointing to a local directory it is saved in.

    :param model_name_or_path: Local directory or public name of the model to load.
    :type model_name_or_path: str
    :param batch_size: Number of samples computed once per batch
    :type batch_size: int
    :param gpu: If GPU shall be used
    :type gpu: bool
    :param task_type: Type of task the model should be used for. Currently supporting:
                      "embeddings", "question_answering", "text_classification", "ner". More coming soon...
    :type task_type: str
    :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in
                   the PredictionHead (see torch.nn.module.load_state_dict()).
                   Set to `False` for backwards compatibility with PHs saved with older version of FARM.
    :type strict: bool
    :return: An instance of the Inferencer.
    """
    device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, use_amp=None)
    name = os.path.basename(model_name_or_path)

    # a) either from local dir
    if os.path.exists(model_name_or_path):
        model = AdaptiveModel.load(model_name_or_path, device, strict=strict)
        if task_type == "embeddings":
            processor = InferenceProcessor.load_from_dir(model_name_or_path)
        else:
            processor = Processor.load_from_dir(model_name_or_path)

    # b) or from remote transformers model hub
    else:
        logger.info(
            f"Could not find `{model_name_or_path}` locally. Try to download from model hub ..."
        )
        if not task_type:
            raise ValueError(
                "Please specify the 'task_type' of the model you want to load from transformers. "
                "Valid options for arg `task_type`: "
                "'question_answering', 'embeddings', 'text_classification', 'ner'"
            )

        model = AdaptiveModel.convert_from_transformers(model_name_or_path, device, task_type)
        config = AutoConfig.from_pretrained(model_name_or_path)
        tokenizer = Tokenizer.load(model_name_or_path)

        # TODO infer task_type automatically from config (if possible)
        if task_type == "question_answering":
            processor = SquadProcessor(
                tokenizer=tokenizer,
                max_seq_len=max_seq_len,
                label_list=["start_token", "end_token"],
                metric="squad",
                data_dir=None,
            )
        elif task_type == "embeddings":
            processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=max_seq_len)
        elif task_type == "text_classification":
            label_list = list(config.id2label[id] for id in range(len(config.id2label)))
            processor = TextClassificationProcessor(
                tokenizer=tokenizer,
                max_seq_len=max_seq_len,
                data_dir=None,
                label_list=label_list,
                label_column_name="label",
                metric="acc",
                quote_char='"',
            )
        # elif task_type == "multilabel-classification":
        #     # label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
        #     label_list = list(config.label2id.keys())
        #     processor = TextClassificationProcessor(tokenizer=tokenizer,
        #                                             max_seq_len=max_seq_len,
        #                                             data_dir=None,
        #                                             label_list=label_list,
        #                                             label_column_name="label",
        #                                             metric="acc",
        #                                             quote_char='"',
        #                                             multilabel=True,
        #                                             )
        elif task_type == "ner":
            label_list = list(config.label2id.keys())
            processor = NERProcessor(tokenizer=tokenizer,
                                     max_seq_len=max_seq_len,
                                     data_dir=None,
                                     metric="seq_f1",
                                     label_list=label_list)
        else:
            raise ValueError(
                f"`task_type` {task_type} is not supported yet. "
                f"Valid options for arg `task_type`: 'question_answering', "
                f"'embeddings', 'text_classification', 'ner'"
            )

    return cls(
        model,
        processor,
        batch_size=batch_size,
        gpu=gpu,
        name=name,
        return_class_probs=return_class_probs,
    )
def execML(self, job):
    start_time = time.time()

    if job.task == 'analyse':
        basic_texts = []
        # Will download and store dataset...
        sample = self.downloadAndConvertText(job, job.data_sample)
        for text in sample.encode('utf-8').splitlines():
            basic_texts.append({'text': text.decode('utf-8')})

        # Will download and store model...
        self.downloadAndStoreZIPModel(job, job.model)

        self.updateJobStatus(job, 'analysing')
        save_dir = 'tmp/' + job.model['id']
        model = Inferencer.load(save_dir)
        result = model.inference_from_dicts(dicts=basic_texts)
        self.persistResult(job, result)
        model.close_multiprocessing_pool()
        self.updateJobStatus(job, 'completed')

    elif job.task == 'train':
        self.updateJobStatus(job, 'training')
        # Will download and store dataset...
        self.downloadAndStoreZIPDataset(job, job.data_source)
        # Will download and store model...
        self.downloadAndStoreZIPModel(job, job.model)

        set_all_seeds(seed=42)
        device, n_gpu = initialize_device_settings(use_cuda=True)
        n_epochs = 4
        evaluate_every = 400
        do_lower_case = False
        batch_size = 32
        lang_model = os.path.join(Path.cwd(), 'tmp', job.model['id'])
        ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                      "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

        # 1. Create a tokenizer
        tokenizer = Tokenizer.load(
            pretrained_model_name_or_path=lang_model,
            do_lower_case=do_lower_case,
            tokenizer_class='BertTokenizer'
        )

        # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        processor = NERProcessor(
            tokenizer=tokenizer,
            max_seq_len=128,
            data_dir=str(os.path.join(Path.cwd(), 'tmp', job.data_source['id'])),
            delimiter=' ',
            metric='seq_f1',
            label_list=ner_labels
        )

        # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
        #    and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

        # 4. Create an AdaptiveModel
        # 4.1 which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # 4.2 and a prediction head on top that is suited for our task => NER
        prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=['per_token'],
            device=device,
        )

        # 5. Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=1e-5,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            device=device,
        )

        # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )

        # 7. Let it grow
        trainer.train()

        # 8. Hooray! You have a model. Store it:
        newModelId = str(uuid.uuid4())
        save_dir = 'tmp/' + newModelId
        model.save(save_dir)
        processor.save(save_dir)
        model.close_multiprocessing_pool()
        self.persistZIPModel(newModelId, job)
        self.updateJobStatus(job, 'completed')

    elapsed_time = time.time() - start_time
    print('Execution time max: ', elapsed_time, 'for job.id:', job.id, flush=True)
    return {'status': True, 'code': 'ok', 'msg': 'success'}
def test_ner(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'LinearWarmup', 'warmup_proportion': 0.1})

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Paris is a town in France."},
    ]

    model = Inferencer.load(model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english",
                            num_processes=0,
                            task_type="ner")
    # Labels aren't correctly inserted from transformers; they are converted to LABEL_1 ... LABEL_N.
    # For the inference result to contain predictions we need them in IOB NER format.
    model.processor.tasks["ner"]["label_list"][-1] = "B-LOC"

    result = model.inference_from_dicts(dicts=basic_texts)
    assert result[0]["predictions"][0]["context"] == "Paris"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
def test_ner(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 5
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'LinearWarmup', 'warmup_proportion': 0.1})

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Albrecht Lehman ist eine Person"},
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts, max_processes=1)
    # print(result)
    # assert result[0]["predictions"][0]["context"] == "sagte"
    # assert isinstance(result[0]["predictions"][0]["probability"], np.float32)

    result2 = model.inference_from_dicts(dicts=basic_texts, rest_api_schema=True)
    assert result == result2
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 1
batch_size = 32
evaluate_every = 100
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=lang_model, do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
processor = NERProcessor(tokenizer=tokenizer, max_seq_len=128, data_dir="../data/conll03-de")
ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
              "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
processor.add_task("ner", "seq_f1", ner_labels)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
#    and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = Bert.load(lang_model)
# b) and a prediction head on top that is suited for our task => NER
prediction_head = TokenClassificationHead(task_name="ner",
                                          layer_dims=[768, len(processor.tasks["ner"]["label_list"])])

model = AdaptiveModel(
def load(cls, model_name_or_path, batch_size=4, gpu=False, task_type=None,
         return_class_probs=False, strict=True, max_seq_len=256, doc_stride=128,
         extraction_layer=None, extraction_strategy=None, s3e_stats=None,
         num_processes=None, disable_tqdm=False):
    """
    Load an Inferencer incl. all relevant components (model, tokenizer, processor ...)
    either by

    1. specifying a public name from transformers' model hub (https://huggingface.co/models)
    2. or pointing to a local directory it is saved in.

    :param model_name_or_path: Local directory or public name of the model to load.
    :type model_name_or_path: str
    :param batch_size: Number of samples computed once per batch
    :type batch_size: int
    :param gpu: If GPU shall be used
    :type gpu: bool
    :param task_type: Type of task the model should be used for. Currently supporting:
                      "embeddings", "question_answering", "text_classification", "ner". More coming soon...
    :type task_type: str
    :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in
                   the PredictionHead (see torch.nn.module.load_state_dict()).
                   Set to `False` for backwards compatibility with PHs saved with older version of FARM.
    :type strict: bool
    :param max_seq_len: maximum length of one text sample
    :type max_seq_len: int
    :param doc_stride: Only QA: When input text is longer than max_seq_len it gets split into parts, strided by doc_stride
    :type doc_stride: int
    :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector),
                                'reduce_mean' (sentence vector), 'reduce_max' (sentence vector),
                                'per_token' (individual token vectors)
    :type extraction_strategy: str
    :param extraction_layer: number of layer from which the embeddings shall be extracted. Default: -1 (very last layer).
    :type extraction_layer: int
    :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                      (only needed for task_type="embeddings" and extraction_strategy="s3e")
    :type s3e_stats: dict
    :param num_processes: the number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                          multiprocessing. Set to None to let Inferencer use all CPU cores. If you want to
                          debug the Language Model, you might need to disable multiprocessing!
    :type num_processes: int
    :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
    :type disable_tqdm: bool
    :return: An instance of the Inferencer.
    """
    device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, use_amp=None)
    name = os.path.basename(model_name_or_path)

    # a) either from local dir
    if os.path.exists(model_name_or_path):
        model = BaseAdaptiveModel.load(load_dir=model_name_or_path, device=device, strict=strict)
        if task_type == "embeddings":
            processor = InferenceProcessor.load_from_dir(model_name_or_path)
        else:
            processor = Processor.load_from_dir(model_name_or_path)

        # override processor attributes loaded from config file with inferencer params
        processor.max_seq_len = max_seq_len
        if hasattr(processor, "doc_stride"):
            processor.doc_stride = doc_stride

    # b) or from remote transformers model hub
    else:
        logger.info(
            f"Could not find `{model_name_or_path}` locally. Try to download from model hub ..."
        )
        if not task_type:
            raise ValueError(
                "Please specify the 'task_type' of the model you want to load from transformers. "
                "Valid options for arg `task_type`: "
                "'question_answering', 'embeddings', 'text_classification', 'ner'"
            )

        model = AdaptiveModel.convert_from_transformers(model_name_or_path, device, task_type)
        config = AutoConfig.from_pretrained(model_name_or_path)
        tokenizer = Tokenizer.load(model_name_or_path)

        # TODO infer task_type automatically from config (if possible)
        if task_type == "question_answering":
            processor = SquadProcessor(
                tokenizer=tokenizer,
                max_seq_len=max_seq_len,
                label_list=["start_token", "end_token"],
                metric="squad",
                data_dir="data",
                doc_stride=doc_stride)
        elif task_type == "embeddings":
            processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=max_seq_len)
        elif task_type == "text_classification":
            label_list = list(config.id2label[id] for id in range(len(config.id2label)))
            processor = TextClassificationProcessor(
                tokenizer=tokenizer,
                max_seq_len=max_seq_len,
                data_dir="data",
                label_list=label_list,
                label_column_name="label",
                metric="acc",
                quote_char='"',
            )
        elif task_type == "ner":
            label_list = list(config.label2id.keys())
            processor = NERProcessor(tokenizer=tokenizer,
                                     max_seq_len=max_seq_len,
                                     data_dir="data",
                                     metric="seq_f1",
                                     label_list=label_list)
        else:
            raise ValueError(
                f"`task_type` {task_type} is not supported yet. "
                f"Valid options for arg `task_type`: 'question_answering', "
                f"'embeddings', 'text_classification', 'ner'")

    return cls(model, processor, task_type=task_type, batch_size=batch_size, gpu=gpu,
               name=name, return_class_probs=return_class_probs,
               extraction_strategy=extraction_strategy, extraction_layer=extraction_layer,
               s3e_stats=s3e_stats, num_processes=num_processes, disable_tqdm=disable_tqdm)
def ner(self, task, model_type, n_epochs, batch_size, evaluate_every, use_cuda):
    aml_run = he.get_context()

    # Check task
    if cu.tasks.get(str(task)).get('type') != 'ner':
        raise Exception('NOT A NER TASK')
    language = cu.params.get('language')

    # Data
    dt_task = dt.Data(task=task)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda)
    lang_model = he.get_farm_model(model_type, language)
    save_dir = dt_task.get_path('model_dir')
    # ner_labels = dt_task.load('fn_label', header=None)[0].to_list()
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    # n_epochs = 4
    # batch_size = 32
    # evaluate_every = 750
    # lang_model = "xlm-roberta-large"

    # AML log
    try:
        aml_run.log('task', task)
        aml_run.log('language', language)
        aml_run.log('n_epochs', n_epochs)
        aml_run.log('batch_size', batch_size)
        aml_run.log('lang_model', lang_model)
        aml_run.log_list('label_list', ner_labels)
    except:
        pass

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=128,
                             data_dir=dt_task.data_dir,
                             metric="seq_f1",
                             label_list=ner_labels)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
def ner(task: str, lm: str):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42, deterministic_cudnn=use_cuda)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)
    n_epochs = 10
    batch_size = 32
    evaluate_every = 1000

    model_dir = MODEL_DIR
    if lm == 'bert-hgcrw':
        lang_model = "redewiedergabe/bert-base-historical-german-rw-cased"
        model_dir += '_bert-hgcrw'
    elif lm == 'lmgot01':
        lang_model = Path("/home/stud/wangsadirdja/pyfarmbert/models/lm/lmgot_01")
        model_dir += '_lmgot01'
    elif lm == 'lmgot02':
        lang_model = Path("/home/stud/wangsadirdja/pyfarmbert/models/lm/lmgot_02")
        model_dir += '_lmgot02'
    else:
        lang_model = "bert-base-german-cased"
    if task != 'all':
        model_dir += '_' + task
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    if task == 'direct':
        ner_labels = ["[PAD]", "X", "O", "B-DIR", "I-DIR"]
    elif task == 'indirect':
        ner_labels = ["[PAD]", "X", "O", "B-IND", "I-IND"]
    elif task == 'reported':
        ner_labels = ["[PAD]", "X", "O", "B-REP", "I-REP"]
    else:
        ner_labels = ["[PAD]", "X", "O", "B-DIR", "I-DIR", "B-IND", "I-IND", "B-REP", "I-REP"]

    data_dir = DATA_DIR
    if task != 'all':
        data_dir += task + '/'
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=64,
                             data_dir=Path(data_dir),
                             delimiter="\t",
                             metric="seq_f1",
                             label_list=ner_labels)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_loader_worker = 1
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=data_loader_worker)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = model_dir
    model.save(save_dir)
    processor.save(save_dir)
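# Illustrative invocation of the training function above; the chosen task and language-model keys
# are examples of the values handled by the if/elif branches, not a prescribed configuration.
if __name__ == "__main__":
    ner(task="direct", lm="bert-hgcrw")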
def load(
    cls,
    model_name_or_path,
    batch_size=4,
    gpu=False,
    task_type=None,
    return_class_probs=False,
    strict=True,
    max_seq_len=256,
    doc_stride=128,
    extraction_layer=None,
    extraction_strategy=None,
    s3e_stats=None,
    num_processes=None,
    disable_tqdm=False,
    tokenizer_class=None,
    use_fast=False,
    tokenizer_args=None,
    dummy_ph=False,
    benchmarking=False,
):
    """
    Load an Inferencer incl. all relevant components (model, tokenizer, processor ...)
    either by

    1. specifying a public name from transformers' model hub (https://huggingface.co/models)
    2. or pointing to a local directory it is saved in.

    :param model_name_or_path: Local directory or public name of the model to load.
    :type model_name_or_path: str
    :param batch_size: Number of samples computed once per batch
    :type batch_size: int
    :param gpu: If GPU shall be used
    :type gpu: bool
    :param task_type: Type of task the model should be used for. Currently supporting:
                      "embeddings", "question_answering", "text_classification", "ner". More coming soon...
    :type task_type: str
    :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in
                   the PredictionHead (see torch.nn.module.load_state_dict()).
                   Set to `False` for backwards compatibility with PHs saved with older version of FARM.
    :type strict: bool
    :param max_seq_len: maximum length of one text sample
    :type max_seq_len: int
    :param doc_stride: Only QA: When input text is longer than max_seq_len it gets split into parts, strided by doc_stride
    :type doc_stride: int
    :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector),
                                'reduce_mean' (sentence vector), 'reduce_max' (sentence vector),
                                'per_token' (individual token vectors)
    :type extraction_strategy: str
    :param extraction_layer: number of layer from which the embeddings shall be extracted. Default: -1 (very last layer).
    :type extraction_layer: int
    :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                      (only needed for task_type="embeddings" and extraction_strategy="s3e")
    :type s3e_stats: dict
    :param num_processes: the number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                          multiprocessing. Set to None to let Inferencer use all CPU cores minus one. If you
                          want to debug the Language Model, you might need to disable multiprocessing!
                          **Warning!** If you use multiprocessing you have to close the `multiprocessing.Pool` again!
                          To do so call :func:`~farm.infer.Inferencer.close_multiprocessing_pool` after you are
                          done using this class. The garbage collector will not do this for you!
    :type num_processes: int
    :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
    :type disable_tqdm: bool
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the
                     tokenizer (True) or use the Python one (False).
    :type use_fast: bool
    :param tokenizer_args: (Optional) Will be passed to the Tokenizer ``__init__`` method.
                           See https://huggingface.co/transformers/main_classes/tokenizer.html and detailed
                           tokenizer documentation on `Hugging Face Transformers <https://huggingface.co/transformers/>`_.
    :type tokenizer_args: dict
    :param dummy_ph: If True, methods of the prediction head will be replaced with a dummy method.
                     This is used to isolate lm run time from ph run time.
    :type dummy_ph: bool
    :param benchmarking: If True, a benchmarking object will be initialised within the class and certain parts
                         of the code will be timed for benchmarking. Should be kept False if not benchmarking since
                         these timing checkpoints require synchronization of the asynchronous Pytorch operations and
                         may slow down the model.
    :type benchmarking: bool
    :return: An instance of the Inferencer.
    """
    if tokenizer_args is None:
        tokenizer_args = {}

    device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, use_amp=None)
    name = os.path.basename(model_name_or_path)

    # a) either from local dir
    if os.path.exists(model_name_or_path):
        model = BaseAdaptiveModel.load(load_dir=model_name_or_path, device=device, strict=strict)
        if task_type == "embeddings":
            processor = InferenceProcessor.load_from_dir(model_name_or_path)
        else:
            processor = Processor.load_from_dir(model_name_or_path)

        # override processor attributes loaded from config file with inferencer params
        processor.max_seq_len = max_seq_len
        if hasattr(processor, "doc_stride"):
            assert doc_stride < max_seq_len, \
                "doc_stride is longer than max_seq_len. This means that there will be gaps " \
                "as the passage windows slide, causing the model to skip over parts of the document. " \
                "Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384) "
            processor.doc_stride = doc_stride

    # b) or from remote transformers model hub
    else:
        logger.info(
            f"Could not find `{model_name_or_path}` locally. Try to download from model hub ..."
        )
        if not task_type:
            raise ValueError(
                "Please specify the 'task_type' of the model you want to load from transformers. "
                "Valid options for arg `task_type`: "
                "'question_answering', 'embeddings', 'text_classification', 'ner'"
            )

        model = AdaptiveModel.convert_from_transformers(model_name_or_path, device, task_type)
        config = AutoConfig.from_pretrained(model_name_or_path)
        tokenizer = Tokenizer.load(
            model_name_or_path,
            tokenizer_class=tokenizer_class,
            use_fast=use_fast,
            **tokenizer_args,
        )

        # TODO infer task_type automatically from config (if possible)
        if task_type == "question_answering":
            processor = SquadProcessor(
                tokenizer=tokenizer,
                max_seq_len=max_seq_len,
                label_list=["start_token", "end_token"],
                metric="squad",
                data_dir="data",
                doc_stride=doc_stride)
        elif task_type == "embeddings":
            processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=max_seq_len)
        elif task_type == "text_classification":
            label_list = list(config.id2label[id] for id in range(len(config.id2label)))
            processor = TextClassificationProcessor(
                tokenizer=tokenizer,
                max_seq_len=max_seq_len,
                data_dir="data",
                label_list=label_list,
                label_column_name="label",
                metric="acc",
                quote_char='"',
            )
        elif task_type == "ner":
            label_list = list(config.label2id.keys())
            processor = NERProcessor(tokenizer=tokenizer,
                                     max_seq_len=max_seq_len,
                                     data_dir="data",
                                     metric="seq_f1",
                                     label_list=label_list)
        else:
            raise ValueError(
                f"`task_type` {task_type} is not supported yet. "
                f"Valid options for arg `task_type`: 'question_answering', "
                f"'embeddings', 'text_classification', 'ner'")

    if not isinstance(model, ONNXAdaptiveModel):
        model, _ = optimize_model(model=model, device=device, local_rank=-1, optimizer=None)

    return cls(model, processor, task_type=task_type, batch_size=batch_size, gpu=gpu,
               name=name, return_class_probs=return_class_probs,
               extraction_strategy=extraction_strategy, extraction_layer=extraction_layer,
               s3e_stats=s3e_stats, num_processes=num_processes, disable_tqdm=disable_tqdm,
               benchmarking=benchmarking, dummy_ph=dummy_ph)
def ner():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42, deterministic_cudnn=True)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)
    n_epochs = 4
    batch_size = 32
    evaluate_every = 400
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path(DATA_DIR),
        delimiter=" ",
        metric="seq_f1",
        label_list=ner_labels
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_loader_worker = 15
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=data_loader_worker)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = MODEL_DIR
    model.save(save_dir)
    processor.save(save_dir)
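# A minimal sketch of reloading the model stored above for inference, assuming MODEL_DIR points at
# the save directory written by this script; the example text is illustrative.
from farm.infer import Inferencer

inferencer = Inferencer.load(MODEL_DIR)
predictions = inferencer.inference_from_dicts(dicts=[{"text": "Angela Merkel visited Paris."}])
print(predictions)
inferencer.close_multiprocessing_pool()  # close the pool when multiprocessing is used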