def test_nq(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "distilbert-base-uncased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model,
                               do_lower_case=True)
    processor = NaturalQuestionsProcessor(tokenizer=tokenizer,
                                          max_seq_len=20,
                                          doc_stride=10,
                                          max_query_length=6,
                                          train_filename="train_sample.jsonl",
                                          dev_filename="dev_sample.jsonl",
                                          data_dir=Path("samples/nq"))
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    language_model = LanguageModel.load(base_LM_model)
    qa_head = QuestionAnsweringHead()
    classification_head = TextClassificationHead(
        num_labels=len(processor.answer_type_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)
    trainer.train()

    save_dir = Path("testsave/nq")
    model.save(save_dir)
    processor.save(save_dir)

    inferencer = Inferencer.load(save_dir, batch_size=2, gpu=False, num_processes=0)

    qa_format_1 = [{
        "questions": ["Who counted the game among the best ever made?"],
        "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]
    qa_format_2 = [{
        "qas": ["Who counted the game among the best ever made?"],
        "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
    }]

    result1 = inferencer.inference_from_dicts(dicts=qa_format_1)
    result2 = inferencer.inference_from_dicts(dicts=qa_format_2)
    assert result1 == result2
def test_doc_classification(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=8,
                                            data_dir=Path("samples/doc_class"),
                                            train_filename="train-sample.tsv",
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_filename="test-sample.tsv",
                                            test_filename=None,
                                            dev_split=0.0,
                                            label_column_name="coarse_label")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(num_labels=2)
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts=None)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)
    trainer.train()

    save_dir = Path("testsave/doc_class")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [{
        "text": "Martin Müller spielt Handball in Berlin."
    }, {
        "text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."
    }]

    inf = Inferencer.load(save_dir, batch_size=2)
    result = inf.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)

    result2 = inf.inference_from_dicts(dicts=basic_texts, rest_api_schema=True)
    assert result == result2
def test_s3e_fit():
    # small test data
    language_model = Path("samples/s3e/tiny_fasttext_model")
    corpus_path = Path("samples/s3e/tiny_corpus.txt")
    save_dir = Path("testsave/fitted_s3e/")
    do_lower_case = False
    batch_size = 2
    use_gpu = False

    # Fit S3E on a corpus
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model,
                               do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(language_model)
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[],
                          embeds_dropout_prob=0.1,
                          lm_output_types=[],
                          device=device)

    model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor,
                                                    model=model,
                                                    corpus=corpus_path,
                                                    n_clusters=3,
                                                    pca_n_components=30,
                                                    svd_postprocessing=True,
                                                    min_token_occurrences=1)

    # save everything to allow inference without fitting everything again
    model.save(save_dir)
    processor.save(save_dir)
    with open(save_dir / "s3e_stats.pkl", "wb") as f:
        pickle.dump(s3e_stats, f)

    # Load model, tokenizer and processor directly into Inferencer
    inferencer = Inferencer(model=model,
                            processor=processor,
                            task_type="embeddings",
                            gpu=use_gpu,
                            batch_size=batch_size,
                            extraction_strategy="s3e",
                            extraction_layer=-1,
                            s3e_stats=s3e_stats,
                            num_processes=0)

    # Input
    basic_texts = [
        {"text": "a man is walking on the street."},
        {"text": "a woman is walking on the street."},
    ]

    # Get embeddings for input text (you can vary the strategy and layer)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    assert result[0]["context"] == ['a', 'man', 'is', 'walking', 'on', 'the', 'street', '.']
    # compare against expected values with a tolerance; abs() is needed so the
    # check also fails when the value undershoots the reference
    assert abs(result[0]["vec"][0] - 0.00527727306941057) < 1e-6
    assert abs(result[0]["vec"][-2] - 0.06285100416478565) < 1e-6
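# A minimal sketch (not part of the original test) of how the artifacts saved above
# could be reloaded later for inference without re-fitting. It mirrors FARM's S3E
# embeddings example; the exact `Inferencer.load` kwargs are assumptions and may
# differ between library versions.
def load_fitted_s3e(load_dir=Path("testsave/fitted_s3e/")):
    with open(load_dir / "s3e_stats.pkl", "rb") as f:
        s3e_stats = pickle.load(f)
    return Inferencer.load(load_dir,
                           task_type="embeddings",
                           gpu=False,
                           batch_size=2,
                           extraction_strategy="s3e",
                           extraction_layer=-1,
                           s3e_stats=s3e_stats,
                           num_processes=0)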
def doc_classification_multilabel():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 500
    lang_model = "bert-base-uncased"
    do_lower_case = True

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the Toxic Comments data.
    label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    metric = "acc"

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir=Path("../data/toxic-comments"),
                                            label_list=label_list,
                                            label_column_name="label",
                                            metric=metric,
                                            quote_char='"',
                                            multilabel=True,
                                            train_filename="train.tsv",
                                            dev_filename="val.tsv",
                                            test_filename=None,
                                            dev_split=0,
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/bert-german-multi-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "You f*****g bastards"},
        {"text": "What a lovely world"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
def doc_classification_cola():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 5
    batch_size = 100
    evaluate_every = 20
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the CoLA (Corpus of Linguistic Acceptability) data.
    label_list = ["0", "1"]
    metric = "mcc"

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=64,
                                            data_dir=Path("../data/cola"),
                                            dev_filename=Path("dev.tsv"),
                                            dev_split=None,
                                            test_filename=None,
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # language_model = Roberta.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "The box contained the ball from the tree."},
        {"text": "I'll fix you a drink."},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    label_list=label_list,
    data_dir='new/',
    label_column_name="class",  # our labels are located in the "class" column
    metric=metric,
    quote_char='"',
    multilabel=True,
    train_filename="train.tsv",
    dev_filename=None,
    test_filename="test.tsv",
    dev_split=0.1  # this will extract 10% of the train set to create a dev set
)

data_silo = DataSilo(processor=processor, batch_size=batch_size)

# loading the pretrained BERT base cased model
language_model = LanguageModel.load(lang_model)
# prediction head for our model that is suited for classifying news article genres
prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list))

model = AdaptiveModel(language_model=language_model,
                      prediction_heads=[prediction_head],
                      embeds_dropout_prob=0.1,
                      lm_output_types=["per_sequence"],
                      device=device)

model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    device=device,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs)
def __init__(self,
             document_store: BaseDocumentStore,
             query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base",
             passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base",
             model_version: Optional[str] = None,
             max_seq_len_query: int = 64,
             max_seq_len_passage: int = 256,
             use_gpu: bool = True,
             batch_size: int = 16,
             embed_title: bool = True,
             use_fast_tokenizers: bool = True,
             similarity_function: str = "dot_product",
             progress_bar: bool = True
             ):
    """
    Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
    The checkpoint format matches huggingface transformers' model format

    **Example:**

    ```python
    |    # remote model from FAIR
    |    DensePassageRetriever(document_store=your_doc_store,
    |                          query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    |                          passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base")
    |    # or from local path
    |    DensePassageRetriever(document_store=your_doc_store,
    |                          query_embedding_model="model_directory/question-encoder",
    |                          passage_embedding_model="model_directory/context-encoder")
    ```

    :param document_store: An instance of DocumentStore from which to retrieve documents.
    :param query_embedding_model: Local path or remote name of question encoder checkpoint. The format equals the
                                  one used by hugging-face transformers' modelhub models.
                                  Currently available remote names: ``"facebook/dpr-question_encoder-single-nq-base"``
    :param passage_embedding_model: Local path or remote name of passage encoder checkpoint. The format equals the
                                    one used by hugging-face transformers' modelhub models.
                                    Currently available remote names: ``"facebook/dpr-ctx_encoder-single-nq-base"``
    :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
    :param max_seq_len_query: Longest length of each query sequence. Maximum number of tokens for the query text. Longer ones will be cut down.
    :param max_seq_len_passage: Longest length of each passage/context sequence. Maximum number of tokens for the passage text. Longer ones will be cut down.
    :param use_gpu: Whether to use gpu or not
    :param batch_size: Number of questions or passages to encode at once
    :param embed_title: Whether to concatenate title and passage to a text pair that is then used to create the embedding.
                        This is the approach used in the original paper and is likely to improve performance if your
                        titles contain meaningful information for retrieval (topic, entities etc.).
                        The title is expected to be present in doc.meta["name"] and can be supplied in the documents
                        before writing them to the DocumentStore like this:
                        {"text": "my text", "meta": {"name": "my title"}}.
    :param progress_bar: Whether to show a tqdm progress bar or not.
                         Can be helpful to disable in production deployments to keep the logs clean.
    """
    self.document_store = document_store
    self.batch_size = batch_size
    self.max_seq_len_passage = max_seq_len_passage
    self.max_seq_len_query = max_seq_len_query
    self.progress_bar = progress_bar

    if document_store is None:
        logger.warning("DensePassageRetriever initialized without a document store. "
                       "This is fine if you are performing DPR training. "
                       "Otherwise, please provide a document store in the constructor.")
    elif document_store.similarity != "dot_product":
        logger.warning(f"You are using a Dense Passage Retriever model with the {document_store.similarity} function. "
                       "We recommend you use dot_product instead. "
                       "This can be set when initializing the DocumentStore")

    if use_gpu and torch.cuda.is_available():
        self.device = torch.device("cuda")
    else:
        self.device = torch.device("cpu")

    self.embed_title = embed_title

    # Init & Load Encoders
    self.query_tokenizer = Tokenizer.load(pretrained_model_name_or_path=query_embedding_model,
                                          revision=model_version,
                                          do_lower_case=True,
                                          use_fast=use_fast_tokenizers,
                                          tokenizer_class="DPRQuestionEncoderTokenizer")
    self.query_encoder = LanguageModel.load(pretrained_model_name_or_path=query_embedding_model,
                                            revision=model_version,
                                            language_model_class="DPRQuestionEncoder")

    self.passage_tokenizer = Tokenizer.load(pretrained_model_name_or_path=passage_embedding_model,
                                            revision=model_version,
                                            do_lower_case=True,
                                            use_fast=use_fast_tokenizers,
                                            tokenizer_class="DPRContextEncoderTokenizer")
    self.passage_encoder = LanguageModel.load(pretrained_model_name_or_path=passage_embedding_model,
                                              revision=model_version,
                                              language_model_class="DPRContextEncoder")

    self.processor = TextSimilarityProcessor(tokenizer=self.query_tokenizer,
                                             passage_tokenizer=self.passage_tokenizer,
                                             max_seq_len_passage=self.max_seq_len_passage,
                                             max_seq_len_query=self.max_seq_len_query,
                                             label_list=["hard_negative", "positive"],
                                             metric="text_similarity_metric",
                                             embed_title=self.embed_title,
                                             num_hard_negatives=0,
                                             num_positives=1)

    prediction_head = TextSimilarityHead(similarity_function=similarity_function)
    self.model = BiAdaptiveModel(
        language_model1=self.query_encoder,
        language_model2=self.passage_encoder,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=self.device,
    )

    self.model.connect_heads_with_processor(self.processor.tasks, require_labels=False)
def fit(language_model, corpus_path, save_dir, do_lower_case, batch_size=4, use_gpu=False):
    # Fit S3E on a corpus
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model,
                               do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(language_model)
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor,
                                                    model=model,
                                                    corpus=corpus_path,
                                                    n_clusters=10,
                                                    pca_n_components=300,
                                                    svd_postprocessing=True,
                                                    min_token_occurrences=1)

    # save everything to allow inference without fitting everything again
    model.save(save_dir)
    processor.save(save_dir)
    with open(save_dir / "s3e_stats.pkl", "wb") as f:
        pickle.dump(s3e_stats, f)

    # Load model, tokenizer and processor directly into Inferencer
    inferencer = Inferencer(model=model,
                            processor=processor,
                            task_type="embeddings",
                            gpu=use_gpu,
                            batch_size=batch_size,
                            extraction_strategy="s3e",
                            extraction_layer=-1,
                            s3e_stats=s3e_stats)

    # Input
    basic_texts = [
        {"text": "a man is walking on the street."},
        {"text": "a woman is walking on the street."},
    ]

    # Get embeddings for input text (you can vary the strategy and layer)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    print(result)
    inferencer.close_multiprocessing_pool()
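# A minimal usage sketch for `fit` above. The model directory and corpus file are
# hypothetical placeholders, not files shipped alongside this snippet.
if __name__ == "__main__":
    fit(language_model=Path("saved_models/glove-english-uncased-6B"),  # hypothetical model dir
        corpus_path=Path("data/embedding_corpus.txt"),                 # hypothetical corpus file
        save_dir=Path("saved_models/fitted_s3e/"),
        do_lower_case=True,
        use_gpu=False)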
def convert_from_transformers(cls, model_name_or_path, device, task_type, processor=None):
    """
    Load a (downstream) model from huggingface's transformers format. Use cases:
     - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
     - compare models without switching frameworks
     - use model directly for inference

    :param model_name_or_path: local path of a saved model or name of a public one.
                               Exemplary public names:
                               - distilbert-base-uncased-distilled-squad
                               - deepset/bert-large-uncased-whole-word-masking-squad2
                               See https://huggingface.co/models for full list
    :param device: "cpu" or "cuda"
    :param task_type: One of :
                      - 'question_answering'
                      - 'text_classification'
                      - 'ner'
                      - 'embeddings'
                      More tasks coming soon ...
    :param processor: populates prediction head with information coming from tasks
    :type processor: Processor
    :return: AdaptiveModel
    """
    lm = LanguageModel.load(model_name_or_path)
    # TODO Infer type of head automatically from config
    if task_type == "question_answering":
        ph = QuestionAnsweringHead.load(model_name_or_path)
        adaptive_model = cls(language_model=lm,
                             prediction_heads=[ph],
                             embeds_dropout_prob=0.1,
                             lm_output_types="per_token",
                             device=device)
    elif task_type == "text_classification":
        if "roberta" in model_name_or_path:
            # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
            # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
            logger.error(
                "Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment."
            )
            raise NotImplementedError
        ph = TextClassificationHead.load(model_name_or_path)
        adaptive_model = cls(language_model=lm,
                             prediction_heads=[ph],
                             embeds_dropout_prob=0.1,
                             lm_output_types="per_sequence",
                             device=device)
    elif task_type == "ner":
        ph = TokenClassificationHead.load(model_name_or_path)
        adaptive_model = cls(language_model=lm,
                             prediction_heads=[ph],
                             embeds_dropout_prob=0.1,
                             lm_output_types="per_token",
                             device=device)
    elif task_type == "embeddings":
        adaptive_model = cls(language_model=lm,
                             prediction_heads=[],
                             embeds_dropout_prob=0.1,
                             lm_output_types=["per_token", "per_sequence"],
                             device=device)
    else:
        raise NotImplementedError(
            f"Huggingface's transformer models of type {task_type} are not supported yet"
        )

    if processor:
        adaptive_model.connect_heads_with_processor(processor.tasks)

    return adaptive_model
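# Hedged usage sketch: since the converter above is a classmethod (note the `cls`
# parameter), it would typically be invoked on the owning class — assumed here to be
# AdaptiveModel, matching the return type stated in the docstring. The model name is
# one of the public examples from the docstring.
model = AdaptiveModel.convert_from_transformers(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    device="cpu",
    task_type="question_answering")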
def convert_from_transformers(model_name_or_path, device, revision=None, task_type=None, processor=None, **kwargs):
    """
    Load a (downstream) model from huggingface's transformers format. Use cases:
     - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
     - compare models without switching frameworks
     - use model directly for inference

    :param model_name_or_path: local path of a saved model or name of a public one.
                               Exemplary public names:
                               - distilbert-base-uncased-distilled-squad
                               - deepset/bert-large-uncased-whole-word-masking-squad2
                               See https://huggingface.co/models for full list
    :param device: "cpu" or "cuda"
    :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
    :type revision: str
    :param task_type: One of :
                      - 'lm'
                      - 'question_answering'
                      - 'regression'
                      - 'text_classification'
                      - 'ner'
                      - 'embeddings'
                      If None, the task type is inferred from the model config.
                      More tasks coming soon ...
    :param processor: populates prediction head with information coming from tasks
    :type processor: Processor
    :return: AdaptiveModel
    """
    lm = LanguageModel.load(model_name_or_path, revision=revision, **kwargs)

    if task_type is None:
        # Infer task type from config
        architecture = lm.model.config.architectures[0]
        if "MaskedLM" in architecture:
            task_type = "lm"
        elif "QuestionAnswering" in architecture:
            task_type = "question_answering"
        elif "SequenceClassification" in architecture:
            if lm.model.config.num_labels == 1:
                task_type = "regression"
            else:
                task_type = "text_classification"
        elif "TokenClassification" in architecture:
            task_type = "ner"
        else:
            logger.error("Could not infer task type from model config. Please provide task type manually. "
                         "('lm', 'question_answering', 'regression', 'text_classification', 'ner' or 'embeddings')")

    if task_type == "lm":
        ph = BertLMHead.load(model_name_or_path, revision=revision, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_token",
                                          device=device)

    elif task_type == "question_answering":
        ph = QuestionAnsweringHead.load(model_name_or_path, revision=revision, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_token",
                                          device=device)

    elif task_type == "regression":
        if "roberta" in model_name_or_path:
            # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
            # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
            logger.error("Conversion for Regression with Roberta or XLMRoberta not possible at the moment.")
            raise NotImplementedError
        ph = RegressionHead.load(model_name_or_path, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_sequence",
                                          device=device)

    elif task_type == "text_classification":
        if "roberta" in model_name_or_path:
            # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
            # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
            logger.error("Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment.")
            raise NotImplementedError
        ph = TextClassificationHead.load(model_name_or_path, revision=revision, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_sequence",
                                          device=device)

    elif task_type == "ner":
        ph = TokenClassificationHead.load(model_name_or_path, revision=revision, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_token",
                                          device=device)

    elif task_type == "embeddings":
        adaptive_model = am.AdaptiveModel(
            language_model=lm,
            prediction_heads=[],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token", "per_sequence"],
            device=device)

    if processor:
        adaptive_model.connect_heads_with_processor(processor.tasks)

    return adaptive_model
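# A short, hedged usage sketch for the converter above: pull a public QA checkpoint
# (one of the docstring's examples) from the model hub and convert it into a FARM
# AdaptiveModel. Leaving task_type as None exercises the config-inference branch.
qa_model = convert_from_transformers(
    model_name_or_path="deepset/bert-large-uncased-whole-word-masking-squad2",
    device="cpu")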
def test_lm_finetuning_custom_vocab(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)
    tokenizer.add_tokens(["aaaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbbbbbbbb", "ccccccccccccccccccccccc"])

    processor = BertStyleLMProcessor(data_dir=Path("samples/lm_finetuning"),
                                     train_filename="train-sample.txt",
                                     test_filename="test-sample.txt",
                                     dev_filename=None,
                                     tokenizer=tokenizer,
                                     max_seq_len=12,
                                     next_sent_pred=True)
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    language_model = LanguageModel.load(lang_model,
                                        n_added_tokens=len(tokenizer.added_tokens_decoder))
    lm_prediction_head = BertLMHead.load(lang_model,
                                         n_added_tokens=len(tokenizer.added_tokens_decoder))
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'CosineWarmup', 'warmup_proportion': 0.1})

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    trainer.train()

    # LM embeddings and weight of decoder in head are shared and should therefore be equal
    assert torch.all(
        torch.eq(model.language_model.model.embeddings.word_embeddings.weight,
                 model.prediction_heads[0].decoder.weight))

    save_dir = Path("testsave/lm_finetuning")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Farmer's life is great."},
        {"text": "It's nothing for big city kids though."},
    ]
    model = Inferencer.load(save_dir, task_type="embeddings")
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == ['Farmer', "'", 's', 'life', 'is', 'great', '.']
    assert result[0]["vec"].shape == (768,)
    # TODO check why results vary across runs with same seed
    assert isinstance(result[0]["vec"][0], np.float32)
def doc_classifcation():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    # or a local path:
    # lang_model = Path("../saved_models/farm-bert-base-cased")
    use_amp = None

    #############################################
    # CUSTOM OPTIMIZER & LR SCHEDULE
    #############################################
    # learning rate schedules from transformers
    schedule_opts = {"name": "LinearWarmup", "warmup_proportion": 0.4}
    # schedule_opts = {"name": "Constant"}
    # schedule_opts = {"name": "CosineWarmup", "warmup_proportion": 0.4}
    # schedule_opts = {"name": "CosineWarmupWithRestarts", "warmup_proportion": 0.4}

    # or from native pytorch (see https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html for all options)
    # schedule_opts = {"name": "StepLR", "step_size": 30, "gamma": 0.1}
    # schedule_opts = {"name": "ReduceLROnPlateau", "mode": 'min', "factor": 0.1, "patience": 10}

    # optimizers from pytorch (see https://pytorch.org/docs/stable/optim.html for all options)
    optimizer_opts = {"name": "SGD", "momentum": 0.0}
    # or from apex (see https://github.com/NVIDIA/apex/tree/master/apex/optimizers for all options)
    # optimizer_opts = {"name": "FusedLAMB", "bias_correction": True}
    # or from transformers (default in FARM)
    # optimizer_opts = {"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
    #############################################

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data.
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-3,
        optimizer_opts=optimizer_opts,
        schedule_opts=schedule_opts,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
        label_list=label_list,
        metric=metric,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir="../data/squad20",
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(base_LM_model)
    # b) and a prediction head on top that is suited for our task => Question Answering
    prediction_head = QuestionAnsweringHead(layer_dims=[768, len(label_list)])

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
def test_ner(caplog, use_fast):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
        use_fast=use_fast,
    )

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1",
                             multithreading_rust=False)

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'LinearWarmup', 'warmup_proportion': 0.1})

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)
    del model
    del processor
    del optimizer
    del data_silo
    del trainer

    basic_texts = [
        {"text": "Paris is a town in France."},
    ]
    model = Inferencer.load(model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english",
                            num_processes=0,
                            task_type="ner",
                            use_fast=use_fast)
    # Labels aren't correctly inserted from transformers:
    # they are converted to LABEL_1 ... LABEL_N.
    # For the inference result to contain predictions we need them in IOB NER format.
    model.processor.tasks["ner"]["label_list"][-1] = "B-LOC"
    result = model.inference_from_dicts(dicts=basic_texts)
    assert result[0]["predictions"][0]["context"] == "Paris"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
def ner():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_ner")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 4
    batch_size = 32
    evaluate_every = 400
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
    )

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/conll03-de"),
        delimiter=" ",
        metric="seq_f1",
        label_list=ner_labels
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = "saved_models/bert-german-ner-tutorial"
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()
def ner(self, task, model_type, n_epochs, batch_size, evaluate_every, use_cuda):
    # NOTE: this parameter was originally misspelled `use_cude` and never used;
    # renamed to `use_cuda` and passed through to the device setup below
    aml_run = he.get_context()
    # Check task
    if cu.tasks.get(str(task)).get('type') != 'ner':
        raise Exception('NOT A NER TASK')
    language = cu.params.get('language')

    # Data
    dt_task = dt.Data(task=task)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda)
    lang_model = he.get_farm_model(model_type, language)
    save_dir = dt_task.get_path('model_dir')
    # ner_labels = dt_task.load('fn_label', header=None)[0].to_list()
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    # n_epochs = 4
    # batch_size = 32
    # evaluate_every = 750
    # lang_model = "xlm-roberta-large"

    # AML log
    try:
        aml_run.log('task', task)
        aml_run.log('language', language)
        aml_run.log('n_epochs', n_epochs)
        aml_run.log('batch_size', batch_size)
        aml_run.log('lang_model', lang_model)
        aml_run.log_list('label_list', ner_labels)  # was `label_list`, which is undefined here
    except Exception:
        pass

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=128,
                             data_dir=dt_task.data_dir,
                             metric="seq_f1",
                             label_list=ner_labels)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
def doc_classifcation():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    # or a local path:
    # lang_model = Path("../saved_models/farm-bert-base-cased")
    use_amp = None

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
def __init__(self,
             document_store: BaseDocumentStore,
             query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base",
             passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base",
             model_version: Optional[str] = None,
             max_seq_len_query: int = 64,
             max_seq_len_passage: int = 256,
             top_k: int = 10,
             use_gpu: bool = True,
             batch_size: int = 16,
             embed_title: bool = True,
             use_fast_tokenizers: bool = True,
             infer_tokenizer_classes: bool = False,
             similarity_function: str = "dot_product",
             global_loss_buffer_size: int = 150000,
             progress_bar: bool = True):
    """
    Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
    The checkpoint format matches huggingface transformers' model format

    **Example:**

    ```python
    |    # remote model from FAIR
    |    DensePassageRetriever(document_store=your_doc_store,
    |                          query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    |                          passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base")
    |    # or from local path
    |    DensePassageRetriever(document_store=your_doc_store,
    |                          query_embedding_model="model_directory/question-encoder",
    |                          passage_embedding_model="model_directory/context-encoder")
    ```

    :param document_store: An instance of DocumentStore from which to retrieve documents.
    :param query_embedding_model: Local path or remote name of question encoder checkpoint. The format equals the
                                  one used by hugging-face transformers' modelhub models.
                                  Currently available remote names: ``"facebook/dpr-question_encoder-single-nq-base"``
    :param passage_embedding_model: Local path or remote name of passage encoder checkpoint. The format equals the
                                    one used by hugging-face transformers' modelhub models.
                                    Currently available remote names: ``"facebook/dpr-ctx_encoder-single-nq-base"``
    :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
    :param max_seq_len_query: Longest length of each query sequence. Maximum number of tokens for the query text. Longer ones will be cut down.
    :param max_seq_len_passage: Longest length of each passage/context sequence. Maximum number of tokens for the passage text. Longer ones will be cut down.
    :param top_k: How many documents to return per query.
    :param use_gpu: Whether to use gpu or not
    :param batch_size: Number of questions or passages to encode at once
    :param embed_title: Whether to concatenate title and passage to a text pair that is then used to create the embedding.
                        This is the approach used in the original paper and is likely to improve performance if your
                        titles contain meaningful information for retrieval (topic, entities etc.).
                        The title is expected to be present in doc.meta["name"] and can be supplied in the documents
                        before writing them to the DocumentStore like this:
                        {"text": "my text", "meta": {"name": "my title"}}.
    :param use_fast_tokenizers: Whether to use fast Rust tokenizers
    :param infer_tokenizer_classes: Whether to infer tokenizer class from the model config / name.
                                    If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`.
    :param similarity_function: Which function to apply for calculating the similarity of query and passage embeddings during training.
                                Options: `dot_product` (Default) or `cosine`
    :param global_loss_buffer_size: Buffer size for all_gather() in DDP.
                                    Increase if errors like "encoded data exceeds max_size ..." come up
    :param progress_bar: Whether to show a tqdm progress bar or not.
                         Can be helpful to disable in production deployments to keep the logs clean.
    """
    # save init parameters to enable export of component config as YAML
    self.set_config(
        document_store=document_store,
        query_embedding_model=query_embedding_model,
        passage_embedding_model=passage_embedding_model,
        model_version=model_version,
        max_seq_len_query=max_seq_len_query,
        max_seq_len_passage=max_seq_len_passage,
        top_k=top_k,
        use_gpu=use_gpu,
        batch_size=batch_size,
        embed_title=embed_title,
        use_fast_tokenizers=use_fast_tokenizers,
        infer_tokenizer_classes=infer_tokenizer_classes,
        similarity_function=similarity_function,
        progress_bar=progress_bar,
    )

    self.document_store = document_store
    self.batch_size = batch_size
    self.progress_bar = progress_bar
    self.top_k = top_k

    if document_store is None:
        logger.warning("DensePassageRetriever initialized without a document store. "
                       "This is fine if you are performing DPR training. "
                       "Otherwise, please provide a document store in the constructor.")
    elif document_store.similarity != "dot_product":
        logger.warning(f"You are using a Dense Passage Retriever model with the {document_store.similarity} function. "
                       "We recommend you use dot_product instead. "
                       "This can be set when initializing the DocumentStore")

    self.device, _ = initialize_device_settings(use_cuda=use_gpu)

    self.infer_tokenizer_classes = infer_tokenizer_classes
    tokenizers_default_classes = {
        "query": "DPRQuestionEncoderTokenizer",
        "passage": "DPRContextEncoderTokenizer"
    }
    if self.infer_tokenizer_classes:
        tokenizers_default_classes["query"] = None  # type: ignore
        tokenizers_default_classes["passage"] = None  # type: ignore

    # Init & Load Encoders
    self.query_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=query_embedding_model,
        revision=model_version,
        do_lower_case=True,
        use_fast=use_fast_tokenizers,
        tokenizer_class=tokenizers_default_classes["query"])
    self.query_encoder = LanguageModel.load(
        pretrained_model_name_or_path=query_embedding_model,
        revision=model_version,
        language_model_class="DPRQuestionEncoder")

    self.passage_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=passage_embedding_model,
        revision=model_version,
        do_lower_case=True,
        use_fast=use_fast_tokenizers,
        tokenizer_class=tokenizers_default_classes["passage"])
    self.passage_encoder = LanguageModel.load(
        pretrained_model_name_or_path=passage_embedding_model,
        revision=model_version,
        language_model_class="DPRContextEncoder")

    self.processor = TextSimilarityProcessor(
        query_tokenizer=self.query_tokenizer,
        passage_tokenizer=self.passage_tokenizer,
        max_seq_len_passage=max_seq_len_passage,
        max_seq_len_query=max_seq_len_query,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        embed_title=embed_title,
        num_hard_negatives=0,
        num_positives=1)

    prediction_head = TextSimilarityHead(
        similarity_function=similarity_function,
        global_loss_buffer_size=global_loss_buffer_size)

    self.model = BiAdaptiveModel(
        language_model1=self.query_encoder,
        language_model2=self.passage_encoder,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=self.device,
    )

    self.model.connect_heads_with_processor(self.processor.tasks, require_labels=False)
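# Hedged usage sketch for the retriever above: wire it to a document store and run a
# query. The InMemoryDocumentStore import path and the `retrieve()` call follow the
# haystack-style API this class belongs to, but both may differ across library versions.
from haystack.document_store.memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore(similarity="dot_product")
retriever = DensePassageRetriever(document_store=document_store)
docs = retriever.retrieve(query="Who counted the game among the best ever made?", top_k=5)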
def question_answering():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_natural_questions")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 1
    evaluate_every = 500
    lang_model = "deepset/roberta-base-squad2"  # start with a model that can already extract answers
    do_lower_case = False  # roberta is a cased model
    train_filename = "train_medium.jsonl"
    dev_filename = "dev_medium.jsonl"
    keep_is_impossible = 0.15  # downsample negative examples after data conversion
    downsample_context_size = 300  # reduce length of wikipedia articles to relevant part around the answer

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
    )
    # Add HTML tag tokens to the tokenizer vocabulary, so they do not get split apart
    html_tags = [
        "<Th>", "</Th>",
        "<Td>", "</Td>",
        "<Tr>", "</Tr>",
        "<Li>", "</Li>",
        "<P>", "</P>",
        "<Ul>", "</Ul>",
        "<H1>", "</H1>",
        "<H2>", "</H2>",
        "<H3>", "</H3>",
        "<H4>", "</H4>",
        "<H5>", "</H5>",
        "<Td_colspan=",
    ]
    tokenizer.add_tokens(html_tags)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = NaturalQuestionsProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        train_filename=train_filename,
        dev_filename=dev_filename,
        keep_no_answer=keep_is_impossible,
        downsample_context_size=downsample_context_size,
        data_dir=Path("../data/natural_questions"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, caching=True)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model, n_added_tokens=len(html_tags))
    # b) and in case of Natural Questions we need two Prediction Heads
    #    one for extractive Question Answering
    qa_head = QuestionAnsweringHead()
    #    another one for answering yes/no questions or deciding if the given text passage might contain an answer
    classification_head = TextClassificationHead(
        num_labels=len(processor.answer_type_list))  # answer_type_list = ["is_impossible", "span", "yes", "no"]

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/roberta-base-squad2-nq")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Since training on the whole NQ corpus requires substantial compute resources we trained and uploaded a model on s3
    fetch_archive_from_http("https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/models/roberta-base-squad2-nq.zip",
                            output_dir="../saved_models/farm")

    QA_input = [
        {
            "qas": ["Did GameTrailers rated Twilight Princess as one of the best games ever created?"],
            "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
        }
    ]

    model = QAInferencer.load(model_name_or_path="../saved_models/farm/roberta-base-squad2-nq",
                              batch_size=batch_size,
                              gpu=True)
    result = model.inference_from_dicts(dicts=QA_input, return_json=False)  # result is a list of QAPred objects

    print(f"\nQuestion: Did GameTrailers rated Twilight Princess as one of the best games ever created?"
          f"\nAnswer from model: {result[0].prediction[0].answer}")
    model.close_multiprocessing_pool()
def doc_classification_cola():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 8
    evaluate_every = 450
    lang_model = "/bert-base-chinese"  # path to the Chinese BERT model
    # model download: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load a Chinese citizen-message classification dataset.
    label_list = ["城乡建设", "卫生计生", "商贸旅游", "劳动和社会保障", "教育文体", "交通运输", "环境保护"]
    metric = "acc"

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=507,
                                            # folder holding the classification data; format: first column the
                                            # character-separated text, second column the label, tab-separated;
                                            # the first row must contain the headers "text" and "label"
                                            data_dir=Path("/BERT留言分类数据集"),
                                            dev_filename=None,  # Path("dev.tsv")
                                            dev_split=0.1,
                                            test_filename="/BERT留言分类数据集/test.tsv",
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="label"
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # language_model = Roberta.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("/BERT文本分类输出的模型")  # output directory for the trained model
    model.save(save_dir)
    processor.save(save_dir)
def main(args):
    print(f"[INFO] PyTorch Version: {torch.__version__}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("[INFO] Devices available: {}".format(device))

    checkpoint_path = Path(args.ckpt_path) / args.run_name
    ml_logger = MLFlowLogger(tracking_uri=args.tracking_uri)
    ml_logger.init_experiment(experiment_name=args.experiment_name, run_name=args.run_name)

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
        do_lower_case=False)

    # Processor
    if args.task_name == "text_classification":
        processor = TextClassificationProcessor(
            tokenizer=tokenizer,
            train_filename=args.train_filename,
            dev_filename=None,
            test_filename=args.test_filename,
            header=0,
            max_seq_len=args.max_seq_len,
            data_dir=args.data_dir,
            label_list=args.label_list,
            metric=args.metric,
            label_column_name=args.label_column_name,
            text_column_name=args.text_column_name)
    elif args.task_name == "question_answering":
        processor = SquadProcessor(
            tokenizer=tokenizer,
            train_filename=args.train_filename,
            dev_filename=args.test_filename,
            test_filename=args.test_filename,
            max_seq_len=args.max_seq_len,
            data_dir=args.data_dir,
            label_list=args.label_list,
            metric=args.metric,
            max_query_length=64,
            doc_stride=128,
            max_answers=1)
    else:
        raise ValueError(f"Unsupported task name: {args.task_name}")
    processor.save(checkpoint_path)

    # DataSilo
    data_silo = DataSilo(processor=processor,
                         batch_size=args.batch_size,
                         eval_batch_size=args.eval_batch_size,
                         caching=True,
                         cache_path=checkpoint_path)

    # LanguageModel: Build pretrained language model
    language_model = LanguageModel.load(args.pretrained_model_name_or_path, language="korean")

    # PredictionHead: Build predictor layer
    if args.task_name == "text_classification":
        # If you do classification on imbalanced classes, consider using class weights.
        # They change the loss function to down-weight frequent classes.
        prediction_head = TextClassificationHead(
            num_labels=len(args.label_list),
            class_weights=data_silo.calculate_class_weights(task_name=args.task_name))
    elif args.task_name == "question_answering":
        prediction_head = QuestionAnsweringHead(
            layer_dims=[768, 2],
            task_name=args.task_name,
        )
    else:
        raise ValueError(f"Unsupported task name: {args.task_name}")

    # AdaptiveModel: Combine all
    if args.task_name == "text_classification":
        lm_output_types = ["per_sequence"]
    elif args.task_name == "question_answering":
        lm_output_types = ["per_token"]
    else:
        raise ValueError(f"Unsupported task name: {args.task_name}")

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=args.embeds_dropout_prob,
                          lm_output_types=lm_output_types,
                          device=device)

    # Initialize Optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        device=device,
        learning_rate=args.learning_rate,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=args.n_epochs)

    # EarlyStopping
    earlymetric = "f1" if args.task_name == "question_answering" else "acc"
    mode = "max" if args.task_name in ["text_classification", "question_answering"] else "min"
    earlystop = EarlyStopping(save_dir=checkpoint_path,
                              metric=earlymetric,
                              mode=mode,
                              patience=5)

    # Trainer
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        lr_schedule=lr_schedule,
        data_silo=data_silo,
        early_stopping=earlystop,
        evaluate_every=args.evaluate_every,
        checkpoints_to_keep=args.checkpoints_to_keep,
        checkpoint_root_dir=checkpoint_path,
        checkpoint_every=args.checkpoint_every,
        epochs=args.n_epochs,
        n_gpu=args.n_gpu,
        device=device,
    )

    # now train!
    model = trainer.train()
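# A hedged sketch of the CLI wiring main() expects. The flag names mirror the
# attributes accessed above (args.task_name, args.data_dir, ...) but the parser
# itself is an assumption, not the original script's.
import argparse
from pathlib import Path

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--task_name", choices=["text_classification", "question_answering"], required=True)
    parser.add_argument("--pretrained_model_name_or_path", required=True)
    parser.add_argument("--run_name", required=True)
    parser.add_argument("--ckpt_path", required=True)
    parser.add_argument("--experiment_name", default="korean-lm-finetuning")  # hypothetical default
    parser.add_argument("--tracking_uri", default="")
    parser.add_argument("--data_dir", type=Path, required=True)
    parser.add_argument("--train_filename", required=True)
    parser.add_argument("--test_filename", required=True)
    parser.add_argument("--label_list", nargs="*", default=None)
    parser.add_argument("--label_column_name", default="label")
    parser.add_argument("--text_column_name", default="text")
    parser.add_argument("--metric", default="acc")
    parser.add_argument("--max_seq_len", type=int, default=128)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--eval_batch_size", type=int, default=32)
    parser.add_argument("--embeds_dropout_prob", type=float, default=0.1)
    parser.add_argument("--learning_rate", type=float, default=2e-5)
    parser.add_argument("--n_epochs", type=int, default=3)
    parser.add_argument("--n_gpu", type=int, default=1)
    parser.add_argument("--evaluate_every", type=int, default=500)
    parser.add_argument("--checkpoint_every", type=int, default=1000)
    parser.add_argument("--checkpoints_to_keep", type=int, default=3)
    main(parser.parse_args())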
def question_answering():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_question_answering")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 2
    evaluate_every = 2000
    base_LM_model = "roberta-base"
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model, do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    label_list = ["start_token", "end_token"]
    metric = "squad"
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=label_list,
        metric=metric,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("../data/squad20"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    # NOTE: In FARM, the dev set metrics differ from test set metrics in that they are calculated
    #       on a token level instead of a word level
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(base_LM_model)
    # b) and a prediction head on top that is suited for our task => Question Answering
    prediction_head = QuestionAnsweringHead()

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/bert-english-qa-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    QA_input = [{
        "qas": ["Who counted the game among the best ever made?"],
        "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]

    model = Inferencer.load(save_dir, batch_size=40, gpu=True)
    result = model.inference_from_dicts(dicts=QA_input)
    pprint.pprint(result)

    # 10. Do inference on the whole SQuAD dataset & write the predictions file to disk
    filename = os.path.join(processor.data_dir, processor.dev_filename)
    result = model.inference_from_file(file=filename)

    write_squad_predictions(predictions=result,
                            predictions_filename=filename,
                            out_filename="predictions.json")
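# A quick, hedged sanity check on the predictions file written above. SQuAD-style
# prediction files map question ids to answer strings.
import json

with open("predictions.json") as f:
    preds = json.load(f)
for qid, answer in list(preds.items())[:5]:
    print(qid, "->", answer)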
def doc_classification(task,
                       model_type,
                       n_epochs,
                       batch_size,
                       embeds_dropout,
                       evaluate_every,
                       use_cuda,
                       max_seq_len,
                       learning_rate,
                       do_lower_case,
                       register_model,
                       save_model=True,
                       early_stopping=False):

    language = cu.params.get('language')

    # Check task
    if cu.tasks.get(str(task)).get('type') != 'classification':
        raise Exception('NOT A CLASSIFICATION TASK')

    # Data
    dt_task = dt.Data(task=task)
    ## Download training files
    if not os.path.isfile(dt_task.get_path('fn_train', dir='data_dir')):
        dt_task.download('data_dir', dir='data_dir', source='datastore')

    # Settings
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)
    lang_model = he.get_farm_model(model_type, language)
    save_dir = dt_task.get_path('model_dir')
    label_list = dt_task.load('fn_label', dir='data_dir', header=None)[0].to_list()

    # AML log
    try:
        aml_run.log('task', task)
        aml_run.log('language', language)
        aml_run.log('n_epochs', n_epochs)
        aml_run.log('batch_size', batch_size)
        aml_run.log('learning_rate', learning_rate)
        aml_run.log('embeds_dropout', embeds_dropout)
        aml_run.log('max_seq_len', max_seq_len)
        aml_run.log('lang_model', lang_model)
        aml_run.log_list('label_list', label_list)
    except Exception:
        pass

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name
    # must be used.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        # AML log
        try:
            aml_run.log('acc', acc.get('acc'))
            aml_run.log('f1macro', f1macro)
            aml_run.log('f1micro', f1micro)
        except Exception:
            pass
        return {"acc": acc, "f1_macro": f1macro, "f1_micro": f1micro}

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=dt_task.data_dir,
        label_list=label_list,
        metric=metric,
        label_column_name="label",
        train_filename=dt_task.get_path('fn_train', dir='data_dir'),
        test_filename=dt_task.get_path('fn_test', dir='data_dir'))

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    ## Pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    ## Prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(processor.tasks["text_classification"]["label_list"]),
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=embeds_dropout,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        learning_rate=learning_rate,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant
    #    and evaluates it from time to time.
    # Also create an EarlyStopping instance and pass it on to the trainer.
    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    if early_stopping:
        earlystopping = EarlyStopping(
            metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
            # metric="loss", mode="min",    # use loss from the dev evaluator of the trainer
            save_dir=save_dir,              # where to save the best model
            patience=2                      # number of evaluations to wait for improvement before terminating the training
        )
    else:
        earlystopping = None

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Store it:
    # NOTE: if early stopping is used, the best model has already been stored in the directory
    # defined with the EarlyStopping instance. The model we have at this moment is the model
    # from the last training epoch that was carried out before early stopping terminated
    # the training.
    if save_model:
        model.save(save_dir)
        processor.save(save_dir)

    if register_model:
        dt_task.upload('model_dir', destination='model')
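# Not part of the original function: a minimal, hedged sketch of using the model
# that early stopping (or the final save) wrote to disk. The path is hypothetical;
# point it at the save_dir used above.
from farm.infer import Inferencer

inf = Inferencer.load("saved_models/doc_class", batch_size=8, gpu=False)
print(inf.inference_from_dicts(dicts=[{"text": "sample text to classify"}]))
inf.close_multiprocessing_pool()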
def test_qa(caplog):
    caplog.set_level(logging.CRITICAL)

    # NOTE: this test targets an older FARM API in which initialize_optimizer returned
    # (optimizer, warmup_linear), the Trainer took a warmup_linear argument, and
    # trainer.train() received the model explicitly.
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "bert-base-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model, do_lower_case=False)
    label_list = ["start_token", "end_token"]
    processor = SquadProcessor(tokenizer=tokenizer,
                               max_seq_len=16,
                               max_query_length=4,
                               train_filename="train-sample.json",
                               dev_filename="dev-sample.json",
                               test_filename=None,
                               data_dir="samples/qa",
                               label_list=label_list,
                               metric="squad")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(base_LM_model)
    prediction_head = QuestionAnsweringHead(layer_dims=[768, len(label_list)])
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        warmup_proportion=0.2,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )
    model = trainer.train(model)
    save_dir = "testsave/qa"
    model.save(save_dir)
    processor.save(save_dir)

    QA_input = [{
        "questions": ["In what country is Normandy located?"],
        "text": 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
    }]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=QA_input)
    assert isinstance(result[0]["predictions"][0]["end"], int)
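# For contrast, a hedged sketch of the newer FARM optimizer API used by the other
# snippets in this document, which returns (model, optimizer, lr_schedule) and
# expresses warmup via schedule_opts. It reuses the names (model, data_silo,
# n_epochs, device) defined inside the test body above.
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=1e-5,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,
    device=device,
    schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2})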
def test_doc_regression(data_dir_path, text_column_name, caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    rp_params = dict(tokenizer=tokenizer,
                     max_seq_len=8,
                     data_dir=Path(data_dir_path),
                     train_filename="train-sample.tsv",
                     dev_filename="test-sample.tsv",
                     test_filename=None,
                     label_column_name="label")
    if text_column_name is not None:
        rp_params["text_column_name"] = text_column_name

    processor = RegressionProcessor(**rp_params)

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = RegressionHead()
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'CosineWarmup', 'warmup_proportion': 0.1})

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    trainer.train()

    save_dir = Path("testsave/doc_regr")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "The dress is just fabulous and it totally fits my size. The fabric is of great quality and the seams are really well hidden. I am super happy with this purchase and I am looking forward to trying some more from the same brand."
        },
        {
            "text": "it just did not fit right. The top is very thin showing everything."
        },
    ]

    model = Inferencer.load(save_dir, num_processes=0)
    result = model.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["pred"], np.float32)
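# np.float32 regression outputs (asserted above) are not JSON-serializable; a small,
# hedged conversion sketch before returning predictions from an API. The payload
# below is hypothetical and only mirrors the structure asserted in the test.
import json
import numpy as np

predictions = [{"context": "it just did not fit right.", "pred": np.float32(3.7)}]
serializable = [dict(p, pred=float(p["pred"])) for p in predictions]
print(json.dumps(serializable))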
def test_ner(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 5
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'LinearWarmup', 'warmup_proportion': 0.1})

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Albrecht Lehman ist eine Person"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts, max_processes=1)
    print(result)
    # assert result[0]["predictions"][0]["context"] == "sagte"
    # assert isinstance(result[0]["predictions"][0]["probability"], np.float32)

    result2 = model.inference_from_dicts(dicts=basic_texts, rest_api_schema=True)
    assert result == result2
@classmethod
def load(cls, load_dir, device, strict=False, lm1_name="query", lm2_name="passage", processor=None):
    """
    Loads a BiAdaptiveModel from a directory. The directory must contain:

    * a directory named after lm1_name, containing:
        -> language_model.bin
        -> language_model_config.json
    * a directory named after lm2_name, containing:
        -> language_model.bin
        -> language_model_config.json
    * prediction_head_X.bin (multiple prediction heads possible)
    * prediction_head_X_config.json
    * processor_config.json (config for transforming input)
    * vocab.txt (vocab file for the language model, turning text into WordPiece tokens)
    * special_tokens_map.json

    :param load_dir: location where the adaptive model is stored
    :type load_dir: Path
    :param device: device to which we want to send the model, either cpu or cuda
    :type device: torch.device
    :param lm1_name: the name to assign to the first loaded language model (for encoding queries)
    :type lm1_name: str
    :param lm2_name: the name to assign to the second loaded language model (for encoding context/passages)
    :type lm2_name: str
    :param strict: whether to strictly enforce that the keys loaded from the saved model match the ones
                   in the PredictionHead (see torch.nn.Module.load_state_dict()). Set to `False` for
                   backwards compatibility with PHs saved with an older version of FARM.
    :type strict: bool
    :param processor: populates prediction head with information coming from tasks
    :type processor: Processor
    """
    # Language models
    if lm1_name:
        language_model1 = LanguageModel.load(os.path.join(load_dir, lm1_name))
    else:
        language_model1 = LanguageModel.load(load_dir)
    if lm2_name:
        language_model2 = LanguageModel.load(os.path.join(load_dir, lm2_name))
    else:
        language_model2 = LanguageModel.load(load_dir)

    # Prediction heads
    ph_config_files = cls._get_prediction_head_files(load_dir)
    prediction_heads = []
    ph_output_type = []
    for config_file in ph_config_files:
        head = PredictionHead.load(config_file, strict=False, load_weights=False)
        prediction_heads.append(head)
        ph_output_type.append(head.ph_output_type)

    model = cls(language_model1, language_model2, prediction_heads, 0.1, device)
    if processor:
        model.connect_heads_with_processor(processor.tasks)

    return model
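# A hedged usage sketch for the classmethod above, assuming it lives on FARM's
# BiAdaptiveModel and that load_dir follows the directory layout described in
# the docstring. The save directory path is hypothetical.
import torch
from pathlib import Path

from farm.modeling.biadaptive_model import BiAdaptiveModel

device = torch.device("cpu")
model = BiAdaptiveModel.load(Path("saved_models/dpr"),
                             device=device,
                             lm1_name="query",
                             lm2_name="passage")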
def test_ner_amp(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    if AMP_AVAILABLE:
        use_amp = 'O1'
    else:
        use_amp = None

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename=Path("train-sample.txt"),
                             dev_filename=Path("dev-sample.txt"),
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_token"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-05,
        schedule_opts=None,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "1980 kam der Crown von Toyota"},
    ]
    model = Inferencer.load(save_dir, num_processes=0)
    result = model.inference_from_dicts(dicts=basic_texts)

    assert result[0]["predictions"][0][0]["context"] == "1980"
    assert isinstance(result[0]["predictions"][0][0]["probability"], np.float32)
    assert np.isclose(result[0]["predictions"][0][0]["probability"], 0.161, rtol=0.05)
    assert result[0]["predictions"][0][0]["label"] == "LOC"
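# AMP_AVAILABLE is assumed to be defined at module level; a common, hedged
# pattern for detecting NVIDIA apex looks like this:
try:
    from apex import amp  # noqa: F401
    AMP_AVAILABLE = True
except ImportError:
    AMP_AVAILABLE = False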
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_text_pair_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]
    train_filename = "train.tsv"
    dev_filename = "dev_200k.tsv"

    # The source data can be found here: https://github.com/microsoft/MSMARCO-Passage-Ranking
    generate_data = False
    data_dir = Path("../data/msmarco_passage")
    predictions_raw_filename = "predictions_raw.txt"
    predictions_filename = "predictions.txt"
    train_source_filename = "triples.train.1m.tsv"
    qrels_filename = "qrels.dev.tsv"
    queries_filename = "queries.dev.tsv"
    passages_filename = "collection.tsv"
    top1000_filename = "top1000.dev"

    # 0. Preprocess and save MSMarco data in a format that can be ingested by FARM models. Only needs to be done once!
    #    The final format is a tsv file with 3 columns (text, text_b and label)
    if generate_data:
        reformat_msmarco_train(data_dir / train_source_filename,
                               data_dir / train_filename)
        reformat_msmarco_dev(data_dir / queries_filename,
                             data_dir / passages_filename,
                             data_dir / qrels_filename,
                             data_dir / top1000_filename,
                             data_dir / dev_filename)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
    #    Evaluation during training will be performed on a slice of the train set.
    #    We will be using the msmarco dev set as our final evaluation set.
    processor = TextPairClassificationProcessor(tokenizer=tokenizer,
                                                label_list=label_list,
                                                train_filename=train_filename,
                                                test_filename=None,
                                                dev_split=0.001,
                                                max_seq_len=128,
                                                data_dir=data_dir,
                                                delimiter="\t")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
    )

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant
    #    and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/passage_ranking_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    model = Inferencer.load(save_dir, gpu=True, max_seq_len=128, batch_size=128)
    result = model.inference_from_file(data_dir / dev_filename)

    write_msmarco_results(result, save_dir / predictions_raw_filename)

    msmarco_evaluation(preds_file=save_dir / predictions_raw_filename,
                       dev_file=data_dir / dev_filename,
                       qrels_file=data_dir / qrels_filename,
                       output_file=save_dir / predictions_filename)

    model.close_multiprocessing_pool()
def get_surprisals(args):
    set_seed(args.seed, cuda=args.cuda)
    logger.info("Importing tokenizer and pre-trained model")
    tok_class = None if not args.model_class_name else f"{args.model_class_name}Tokenizer"
    ref = args.reference_hf_model if args.reference_hf_model is not None else args.model_name_or_path
    model = AutoModelWithLMHead.from_pretrained(ref)

    # Loading a local model: we need to replace the AutoModel with the local model
    if args.reference_hf_model is not None:
        farm_lm = LanguageModel.load(args.model_name_or_path,
                                     language_model_class=args.model_class_name)
        # Set the underlying model to the custom loaded model.
        # The LM head used for surprisal is the original pretrained head.
        logger.info(f"Setting model.{model.base_model_prefix} attribute with model: {args.model_name_or_path}")
        setattr(model, model.base_model_prefix, farm_lm.model)
        tokenizer = CustomTokenizer.load(
            pretrained_model_name_or_path=args.model_name_or_path,
            do_lower_case=args.do_lower_case,
            tokenizer_class=tok_class,
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(ref)

    device = torch.device("cuda" if args.cuda else "cpu")
    model.to(device)
    model.eval()

    logger.info(f"Reading sentences from {args.inputf}")
    if args.inputf.endswith(".tsv"):  # lingcomp tsv format
        df = read_tsv(args.inputf)
        sentences = list(df["text"])
    elif args.inputf.endswith(".json"):  # syntaxgym test suite format
        sentences = get_sentences_from_json(args.inputf)
    elif args.inputf.endswith(".txt"):  # one sentence per line
        with open(args.inputf, "r") as f:
            sentences = f.read().split("\n")
    else:
        raise AttributeError("Only .tsv, .json and .txt input files are supported.")

    dict_list = []
    for i, sentence in tqdm(enumerate(sentences)):
        surprisals = get_surprisal_scores(sentence, tokenizer, model, device)
        if args.mode in ["token", "sentence"]:
            for token, token_idx, surprisal, _, _ in surprisals:
                dict_list.append({
                    "sentence_id": i + 1,
                    "token_id": token_idx,
                    "token": token,
                    "surprisal": surprisal
                })
        elif args.mode == "word":
            words, word_surps, word_spans = aggregate_word_level(sentence, surprisals)
            for j, word in enumerate(words):
                dict_list.append({
                    "start": word_spans[j]["start"],
                    "end": word_spans[j]["end"],
                    "context": word,
                    "surprisal": word_surps[j],
                    "sentence_id": i + 1,
                    "token_id": j + 1,
                })

    out = pd.DataFrame(dict_list)
    if args.mode == "sentence":
        # sum token-level surprisals per sentence
        surprisals = list(out.groupby("sentence_id", sort=False).sum()["surprisal"])
        assert len(surprisals) == len(sentences), "Sentence-surprisal number mismatch"
        dict_list = []
        for k, sent in enumerate(sentences):
            dict_list.append({
                "sentence_id": k + 1,
                "sentence": sent,
                "surprisal": surprisals[k]
            })
        out = pd.DataFrame(dict_list)

    save_tsv(out, args.outputf)
    logger.info(f"Surprisal values at {args.mode}-level were saved to {args.outputf}")
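# A hedged CLI wiring for get_surprisals(); the flag names mirror the attributes
# read above (args.inputf, args.mode, ...) but the parser itself is an assumption,
# not the original script's.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name_or_path", required=True)
    parser.add_argument("--model_class_name", default=None)
    parser.add_argument("--reference_hf_model", default=None)
    parser.add_argument("--inputf", required=True, help=".tsv, .json or .txt input file")
    parser.add_argument("--outputf", required=True, help="where to write the surprisal tsv")
    parser.add_argument("--mode", choices=["token", "word", "sentence"], default="token")
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--cuda", action="store_true")
    parser.add_argument("--seed", type=int, default=42)
    get_surprisals(parser.parse_args())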