def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / "multi_class"
    )
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings = DocumentRNNEmbeddings(
        embeddings=[word_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False,
    )

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        mini_batch_size=1,
        max_epochs=100,
        test_mode=True,
        checkpoint=False,
    )

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):
        assert "apple" in sentence.get_label_names()
        assert "tv" in sentence.get_label_names()

        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def calc_metrics_with_sklearn(clf: TextClassifier, sentences: List[Sentence]):
    # collect the gold labels before predict() overwrites them
    targets = get_labels(sentences)
    clf.predict(sentences)
    prediction = get_labels(sentences)
    report = metrics.classification_report(
        y_true=targets, y_pred=prediction, digits=3, output_dict=True
    )
    return report
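# Usage sketch (not part of the original source): assumes a trained
# TextClassifier `clf`, a corpus with a labelled test split, and the same
# `get_labels` helper used above. With output_dict=True, sklearn returns a
# nested dict, e.g. report["micro avg"]["f1-score"] on older sklearn versions.
report = calc_metrics_with_sklearn(clf, list(corpus.test))
print(report)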
def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / 'multi_class')
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings = DocumentLSTMEmbeddings(
        embeddings=[glove_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False)

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, EvaluationMetric.MICRO_F1_SCORE,
                  max_epochs=100, test_mode=True, checkpoint=False)

    sentence = Sentence('apple tv')

    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):
        assert 'apple' in sentence.get_label_names()
        assert 'tv' in sentence.get_label_names()

        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def classify(data, labels, test, train, validation):
    # split the raw data into train / test / validation portions
    train_data = [k for k in data.keys() if k in train]
    train_labels = [labels[k] for k in train_data]
    train_data = [data[k] for k in train_data]

    test_data = [k for k in data.keys() if k in test]
    test_labels = [labels[k] for k in test_data]
    test_data = [data[k] for k in test_data]

    validation_data = [k for k in data.keys() if k in validation]
    validation_labels = [labels[k] for k in validation_data]
    validation_data = [data[k] for k in validation_data]

    save_training_files(train_data, train_labels, test_data, test_labels,
                        validation_data, validation_labels)

    corpus = NLPTaskDataFetcher.load_classification_corpus(
        Path('./'), test_file='test.txt', dev_file='dev.txt', train_file='train.txt')
    word_embeddings = [
        WordEmbeddings('pl'),
        FlairEmbeddings('polish-forward'),
        FlairEmbeddings('polish-backward')
    ]
    doc_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                           hidden_size=512,
                                           reproject_words=True,
                                           reproject_words_dimension=256)
    classifier = TextClassifier(doc_embeddings,
                                label_dictionary=corpus.make_label_dictionary(),
                                multi_label=False)
    trainer = ModelTrainer(classifier, corpus)
    trainer.train('./', max_epochs=25)

    classifier = TextClassifier.load_from_file('./best-model.pt')
    validation_data = [Sentence(x) for x in validation_data]
    for x in validation_data:
        classifier.predict(x)
    predicted = [int(x.labels[0].value) for x in validation_data]

    remove_training_files()

    # precision_recall_fscore_support returns precision first, not accuracy
    precision, recall, f1, _ = precision_recall_fscore_support(
        validation_labels, predicted, average='binary')
    return {
        'precision': round(precision, 3),
        'recall': round(recall, 3),
        'f1': round(f1, 3)
    }
def test_train_charlm_nocache_load_use_classifier(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.IMDB, base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embedding: TokenEmbeddings = FlairEmbeddings('news-forward-fast', use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_charlm_load_use_classifier():
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    embedding: TokenEmbeddings = CharLMEmbeddings('news-forward-fast')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file('./results/final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree('./results')
def predict_instance(classifier: TextClassifier, example: Sentence) -> Dict:
    flair_result = classifier.predict(example)[0]
    result = {
        "text": " ".join([token.text for token in flair_result.tokens]),
        "prediction": int(flair_result.labels[0].value)
    }
    return result
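# Usage sketch (not part of the original source): assumes a trained binary
# TextClassifier saved under ./results whose label values are integer strings
# such as "0"/"1", matching the int() cast in predict_instance above.
classifier = TextClassifier.load_from_file('./results/final-model.pt')
print(predict_instance(classifier, Sentence("Berlin is a really nice city.")))
# e.g. {'text': 'Berlin is a really nice city .', 'prediction': 1}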
def test_train_charlm_nocache_load_use_classifier(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    embedding: TokenEmbeddings = FlairEmbeddings("news-forward-fast", use_cache=False)
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [embedding], 128, 1, False, 64, False, False
    )

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_load_use_classifier_with_prob(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embedding], 128, 1, False, 64, False, False
    )

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence, multi_class_prob=True):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence, multi_class_prob=True)
    loaded_model.predict([sentence, sentence_empty], multi_class_prob=True)
    loaded_model.predict([sentence_empty], multi_class_prob=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embedding = FlairEmbeddings('news-forward-fast')
    document_embeddings = DocumentLSTMEmbeddings(
        [embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, EvaluationMetric.MACRO_F1_SCORE,
                  max_epochs=2, test_mode=True)

    sentence = Sentence('Berlin is a really nice city.')

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus("imdb", base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embedding: TokenEmbeddings = FlairEmbeddings("news-forward-fast")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [embedding], 128, 1, False, 64, False, False
    )

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path, EvaluationMetric.MACRO_F1_SCORE, max_epochs=2, test_mode=True
    )

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / 'multi_class')
    label_dict = corpus.make_label_dictionary()

    word_embedding = WordEmbeddings('turian')
    document_embeddings = DocumentRNNEmbeddings(
        embeddings=[word_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False)

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, mini_batch_size=1, max_epochs=100,
                  shuffle=False, checkpoint=False)

    sentence = Sentence('apple tv')

    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    sentence = Sentence('apple tv')

    for s in model.predict(sentence):
        assert 'apple' in sentence.get_label_names()
        assert 'tv' in sentence.get_label_names()

        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def batched_predict_instances(classifier: TextClassifier,
                              examples: List[Sentence],
                              batch_size: int = 16) -> List[Dict[str, Any]]:
    results = []  # type: List[Dict[str, Any]]
    for i in range(0, len(examples), batch_size):
        batch_examples = examples[i:i + batch_size]
        flair_batch_results = classifier.predict(batch_examples,
                                                 mini_batch_size=batch_size)
        batch_results = [{
            "text": " ".join([token.text for token in r.tokens]),
            "prediction": int(r.labels[0].value)
        } for r in flair_batch_results]
        results.extend(batch_results)
    return results
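# Usage sketch (not part of the original source): batching keeps memory
# bounded when scoring a large list of sentences; the model path and batch
# size below are placeholders.
classifier = TextClassifier.load_from_file('./results/final-model.pt')
examples = [Sentence(text) for text in ["great movie", "terrible plot", "just ok"]]
predictions = batched_predict_instances(classifier, examples, batch_size=2)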
def test_train_classifier_with_sampler(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / 'imdb')
    label_dict = corpus.make_label_dictionary()

    word_embedding = WordEmbeddings('turian')
    document_embeddings = DocumentRNNEmbeddings(
        [word_embedding], 32, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False,
                  sampler=ImbalancedClassificationDatasetSampler)

    sentence = Sentence('Berlin is a really nice city.')

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / 'final-model.pt')

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / 'imdb')
    label_dict = corpus.make_label_dictionary()

    embedding = FlairEmbeddings('news-forward-fast')
    document_embeddings = DocumentRNNEmbeddings(
        [embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    sentence = Sentence('Berlin is a really nice city.')

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_load_use_classifier(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings('turian')
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, EvaluationMetric.MICRO_F1_SCORE,
                  max_epochs=2, test_mode=True)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
classifier = TextClassifier.load('./' + prefix_model_output_dir + '_' + sLang
                                 + prefix + '/best-model.pt')

dev_data = read_file(data_path + sLang + "/intertass_" + sLang + "_dev.xml")

print("Writing " + output_dir + sLang + "_dev_" + prefix_model_output_dir + prefix + ".tsv")
with open(data_path + sLang + '/' + dev_filename) as f_in, \
        open(output_dir + sLang + "_dev_" + prefix_model_output_dir + prefix + ".tsv",
             'w', newline='') as out_file, \
        open(output_dir + sLang + "_dev_" + prefix_model_output_dir + prefix + ".csv",
             'w', newline='') as out_csv_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    csv_writer = csv.writer(out_csv_file)
    csv_writer.writerow(labels)
    for i, line in enumerate(f_in):
        aLine = line.split('\t')
        txt = aLine[1]
        prediction = classifier.predict(Sentence(txt), multi_class_prob=True)

        max_score = 0.0
        row_values = dict()
        for lbl in prediction[0].labels:
            label_to_write = lbl.value
            if bReduceLabels and lbl.value == 'O':
                label_to_write = replace_reducedLabel_perLanguage[sLang]
            row_values[label_to_write] = lbl.score
            if lbl.score > max_score:
                max_score = lbl.score
                max_arg = label_to_write

        values = [str(row_values[column]) for column in labels if column != 'ID']
        label_to_write = max_arg
    reproject_words_dimension=256,
)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

# 7. start the training
trainer.train('/home/anna/Desktop/markup/8',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)

# 8. plot weight traces (optional)
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_weights('/home/anna/Desktop/markup/8/weights.txt')

classifier = TextClassifier.load('/home/anna/Desktop/markup/8/final-model.pt')

# create example sentence
sentence = Sentence('France is the current world cup winner.')

# predict class and print
classifier.predict(sentence)
print(sentence.labels)
# Create model
from flair.models import TextClassifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# Create model trainer
from flair.trainers import ModelTrainer
trainer = ModelTrainer(classifier, corpus)

# Train the model
trainer.train('model-saves',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=8,
              max_epochs=200)

# Load the model and make predictions
from flair.data import Sentence
classifier = TextClassifier.load('model-saves/final-model.pt')

pos_sentence = Sentence(preprocess('I love Python!'))
neg_sentence = Sentence(preprocess('Python is the worst!'))

classifier.predict(pos_sentence)
classifier.predict(neg_sentence)

print(pos_sentence.labels, neg_sentence.labels)
        sentence_id_list.append(str(sentence_id))
    return article_id_list, sentence_id_list, sentence_list

# reading data from the development set
dev_article_id_list, dev_sentence_id_list, dev_sentence_list = read_articles_from_file_list(dev_folder)
print(len(dev_article_id_list))

# start prediction
prd = []
for x in range(len(dev_sentence_list)):
    print(x)
    prd.append(classifier.predict(Sentence(dev_sentence_list[x])))
preds = np.array([(int(y[0].labels[0].value) if len(y[0].labels) > 0 else 0) for y in prd])

# computing the predictions on the development set
label_inverse_mapping = {1: 'propaganda', 0: 'non-propaganda'}  # label encoding
predictions = np.array([label_inverse_mapping[x] for x in preds])
print(np.unique(predictions, return_counts=True), np.unique(preds, return_counts=True))

task_SLC_output_file = "/content/drive/My Drive/emnlp/submissions/finSub.txt"
class TARSClassifier(FewshotClassifier):
    """
    TARS model for text classification. In the backend, the model uses a BERT based
    binary text classifier which, given a <label, text> pair, predicts the probability
    of two classes "True" and "False". The input data is a usual Sentence object which
    is inflated by the model internally before pushing it through the transformer
    stack of BERT.
    """

    static_label_type = "tars_label"
    LABEL_MATCH = "YES"
    LABEL_NO_MATCH = "NO"

    def __init__(
            self,
            task_name: Optional[str] = None,
            label_dictionary: Optional[Dictionary] = None,
            label_type: Optional[str] = None,
            embeddings: str = 'bert-base-uncased',
            num_negative_labels_to_sample: int = 2,
            prefix: bool = True,
            **tagger_args,
    ):
        """
        Initializes a TARSClassifier
        :param task_name: a string depicting the name of the task
        :param label_dictionary: dictionary of labels you want to predict
        :param label_type: label type of the task
        :param embeddings: name of the pre-trained transformer model e.g.,
        'bert-base-uncased' etc
        :param num_negative_labels_to_sample: number of negative labels to sample for
        each positive label against a sentence during training. Defaults to 2 negative
        labels for each positive label. The model would sample all the negative labels
        if None is passed. That slows down the training considerably.
        :param prefix: if True, the label is prepended to the sentence; otherwise it
        is appended
        :param multi_label: auto-detected by default, but you can set this to True to
        force multi-label prediction or False to force single-label prediction
        :param multi_label_threshold: If multi-label you can set the threshold to make
        predictions
        :param beta: Parameter for F-beta score for evaluation and training annealing
        """
        super(TARSClassifier, self).__init__()

        from flair.embeddings import TransformerDocumentEmbeddings

        if not isinstance(embeddings, TransformerDocumentEmbeddings):
            embeddings = TransformerDocumentEmbeddings(
                model=embeddings,
                fine_tune=True,
                layers='-1',
                layer_mean=False,
            )

        # prepare TARS dictionary
        tars_dictionary = Dictionary(add_unk=False)
        tars_dictionary.add_item(self.LABEL_NO_MATCH)
        tars_dictionary.add_item(self.LABEL_MATCH)

        # initialize the internal binary YES/NO text classifier
        self.tars_model = TextClassifier(
            document_embeddings=embeddings,
            label_dictionary=tars_dictionary,
            label_type=self.static_label_type,
            **tagger_args,
        )

        # transformer separator
        self.separator = str(self.tars_embeddings.tokenizer.sep_token)
        if self.tars_embeddings.tokenizer._bos_token:
            self.separator += str(self.tars_embeddings.tokenizer.bos_token)

        self.prefix = prefix
        self.num_negative_labels_to_sample = num_negative_labels_to_sample

        if task_name and label_dictionary and label_type:
            # Store task specific labels since TARS can handle multiple tasks
            self.add_and_switch_to_new_task(task_name, label_dictionary, label_type)
        else:
            log.info(
                "TARS initialized without a task. You need to call "
                ".add_and_switch_to_new_task() before training this model")

        self.clean_up_labels = True

    def _clean(self, label_value: str) -> str:
        if self.clean_up_labels:
            return label_value.replace("_", " ")
        else:
            return label_value

    def _get_tars_formatted_sentence(self, label, sentence):

        label = self._clean(label)

        original_text = sentence.to_tokenized_string()

        label_text_pair = f"{label} {self.separator} {original_text}" if self.prefix \
            else f"{original_text} {self.separator} {label}"

        sentence_labels = [
            self._clean(label.value)
            for label in sentence.get_labels(self.get_current_label_type())
        ]

        tars_label = self.LABEL_MATCH if label in sentence_labels else self.LABEL_NO_MATCH

        tars_sentence = Sentence(label_text_pair, use_tokenizer=False).add_label(
            self.static_label_type, tars_label)

        return tars_sentence

    def _get_state_dict(self):
        model_state = {
            "state_dict": self.state_dict(),
            "current_task": self._current_task,
            "label_type": self.get_current_label_type(),
            "label_dictionary": self.get_current_label_dictionary(),
            "tars_model": self.tars_model,
            "num_negative_labels_to_sample": self.num_negative_labels_to_sample,
            "task_specific_attributes": self._task_specific_attributes,
        }
        return model_state

    @staticmethod
    def _init_model_with_state_dict(state):
        # init new TARS classifier
        label_dictionary = state["label_dictionary"]
        label_type = "default_label" if not state["label_type"] else state["label_type"]

        model: TARSClassifier = TARSClassifier(
            task_name=state["current_task"],
            label_dictionary=label_dictionary,
            label_type=label_type,
            embeddings=state["tars_model"].document_embeddings,
            num_negative_labels_to_sample=state["num_negative_labels_to_sample"],
        )

        # set all task information
        model._task_specific_attributes = state["task_specific_attributes"]

        # linear layers of internal classifier
        model.load_state_dict(state["state_dict"])

        return model

    @staticmethod
    def _fetch_model(model_name) -> str:

        model_map = {}
        hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"

        model_map["tars-base"] = "/".join([hu_path, "tars-base", "tars-base-v8.pt"])

        cache_dir = Path("models")
        if model_name in model_map:
            model_name = cached_path(model_map[model_name], cache_dir=cache_dir)

        return model_name

    @property
    def tars_embeddings(self):
        return self.tars_model.document_embeddings

    def predict(
            self,
            sentences: Union[List[Sentence], Sentence],
            mini_batch_size=32,
            verbose: bool = False,
            label_name: Optional[str] = None,
            return_loss=False,
            embedding_storage_mode="none",
            label_threshold: float = 0.5,
            multi_label: Optional[bool] = None,
    ):
        """
        Predicts the class labels for the given sentence(s).
        :param sentences: a Sentence or a List of Sentence
        :param mini_batch_size: size of the minibatch, usually bigger is more rapid
        but consumes more memory, up to a point when it has no more effect.
        :param verbose: set to True to display a progress bar
        :param return_loss: set to True to return loss
        :param label_name: set this to change the name of the label type that is
        predicted
        :param embedding_storage_mode: default is 'none' which is always best. Only
        set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the
        generated embeddings in CPU or GPU memory respectively.
        :param label_threshold: confidence threshold above which a label is predicted
        :param multi_label: set to True for multi-label prediction; defaults to the
        setting of the current task
""" if not label_name: label_name = self.get_current_label_type() if multi_label is None: multi_label = self.is_current_task_multi_label() # with torch.no_grad(): if not sentences: return sentences if isinstance(sentences, Sentence): sentences = [sentences] # set context if not set already previous_sentence = None for sentence in sentences: if sentence.is_context_set(): continue sentence._previous_sentence = previous_sentence sentence._next_sentence = None if previous_sentence: previous_sentence._next_sentence = sentence previous_sentence = sentence # reverse sort all sequences by their length rev_order_len_index = sorted(range(len(sentences)), key=lambda k: len(sentences[k]), reverse=True) reordered_sentences: List[Union[Sentence, str]] = [ sentences[index] for index in rev_order_len_index ] dataloader = DataLoader(dataset=SentenceDataset(reordered_sentences), batch_size=mini_batch_size) # progress bar for verbosity if verbose: dataloader = tqdm(dataloader) overall_loss = 0 overall_count = 0 batch_no = 0 with torch.no_grad(): for batch in dataloader: batch_no += 1 if verbose: dataloader.set_description( f"Inferencing on batch {batch_no}") batch = self._filter_empty_sentences(batch) # stop if all sentences are empty if not batch: continue # go through each sentence in the batch for sentence in batch: # always remove tags first sentence.remove_labels(label_name) all_labels = [ label.decode("utf-8") for label in self.get_current_label_dictionary().idx2item ] best_label = None for label in all_labels: tars_sentence = self._get_tars_formatted_sentence( label, sentence) loss_and_count = self.tars_model.predict( tars_sentence, label_name=label_name, return_loss=True, return_probabilities_for_all_classes=True if label_threshold < 0.5 else False, ) overall_loss += loss_and_count[0].item() overall_count += loss_and_count[1] # add all labels that according to TARS match the text and are above threshold for predicted_tars_label in tars_sentence.get_labels( label_name): if predicted_tars_label.value == self.LABEL_MATCH \ and predicted_tars_label.score > label_threshold: # do not add labels below confidence threshold sentence.add_label(label_name, label, predicted_tars_label.score) # only use label with highest confidence if enforcing single-label predictions if not multi_label: if len(sentence.get_labels(label_name)) > 0: # get all label scores and do an argmax to get the best label label_scores = torch.tensor([ label.score for label in sentence.get_labels(label_name) ], dtype=torch.float) best_label = sentence.get_labels(label_name)[ torch.argmax(label_scores)] # remove previously added labels and only add the best label sentence.remove_labels(label_name) sentence.add_label(typename=label_name, value=best_label.value, score=best_label.score) # clearing token embeddings to save memory store_embeddings(batch, storage_mode=embedding_storage_mode) if return_loss: return overall_loss, overall_count