def run_zero_shot(train_tweets, train_y, val_tweets, val_y):
    """
    Performs the training of the zero-shot learning model.
    @param train_tweets: the tweets that will be used for training
    @param train_y: the training labels
    @param val_tweets: the tweets that will be used for validation
    @param val_y: the validation labels
    @return: None
    """
    # 1. Load our pre-trained TARS model for English.
    # Download https://nlp.informatik.hu-berlin.de/resources/models/tars-base/tars-base.pt
    print("Zero shot")
    tars = TARSClassifier.load(
        os.path.join(os.path.dirname(__file__), "..", "..", "saved_models", "tars-base.pt"))

    # 2. Build train and validation corpora of labeled flair Sentences.
    train_tweets["output"] = train_y.iloc[:]
    train = train_tweets.apply(create_sentences, axis=1).tolist()
    train = SentenceDataset(train)

    val_tweets["output"] = val_y.iloc[:]
    val = val_tweets.apply(create_sentences, axis=1).tolist()
    val = SentenceDataset(val)

    corpus = Corpus(train=train, test=val)

    # 3. Make the model aware of the new label set.
    tars.add_and_switch_to_new_task(
        "POSITIVE_NEGATIVE", label_dictionary=corpus.make_label_dictionary())

    trainer = ModelTrainer(tars, corpus)

    # 4. Train the model.
    trainer.train(
        base_path='../../data/zero_shot',  # path to store the model artifacts
        learning_rate=0.02,                # use very small learning rate
        mini_batch_size=16,                # small mini-batch size since corpus is tiny
        max_epochs=10,                     # terminate after 10 epochs
    )
    print("DONE TRAINING")

    # Reload the final model from the training base_path.
    tars = TARSClassifier.load('../../data/zero_shot/final-model.pt')

    # Predict on the validation tweets and write a submission file.
    val_tweets["pred"] = val_tweets.apply(predict_few_shot, args=(tars, ), axis=1)
    val_tweets["pred"] = val_tweets["pred"].apply(lambda x: 1 if x == "positive" else -1)
    pred = pd.DataFrame(list(val_tweets["pred"]), columns=['Prediction'])
    pred.index += 1
    pred.insert(0, 'Id', pred.index)
    pred.to_csv("../../predictions/zero_shot_pred.csv", index=False)
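# A minimal sketch of the two helpers referenced above, which are defined
# elsewhere in this codebase. The tweet column name ("tweet") and the label
# strings ("positive"/"negative") are assumptions inferred from the lambda
# above; the real implementations may differ.
def create_sentences(row):
    # hypothetical: wrap a tweet in a flair Sentence and attach its gold label
    sentence = Sentence(row["tweet"])  # column name is an assumption
    sentence.add_label("POSITIVE_NEGATIVE",
                       "positive" if row["output"] == 1 else "negative")
    return sentence


def predict_few_shot(row, tars):
    # hypothetical: predict with the fine-tuned TARS model, return the label value
    sentence = Sentence(row["tweet"])  # column name is an assumption
    tars.predict(sentence)
    labels = sentence.get_labels()
    return labels[0].value if labels else "negative"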
def to_corpus(self, cache=False) -> Corpus:
    data_file = (TRANSFORMED_ROOT / self.filename).with_suffix(".pickle.xz")
    if data_file.exists():
        with lzma.open(data_file) as fd:
            return pickle.load(fd)

    dataset = Tox21().to_df()

    def plain_tokenizer(text: str) -> Iterable[Token]:
        return [Token(tok) for tok in text.split()]

    def iterate_dataframe(dataset: pd.DataFrame) -> Iterable[Sentence]:
        for _, row in dataset.iterrows():
            res = encoder(row.smiles)
            if not res:
                continue
            res = res.replace("]", "] ").replace(".", "DOT ")
            sent = Sentence(res.strip(), use_tokenizer=plain_tokenizer)
            for col, val in row.items():
                if isinstance(val, float):
                    if val == 1.0:
                        sent.add_label(None, col.replace(" ", "_") + "_P ")
                    if val == 0.0:
                        sent.add_label(None, col.replace(" ", "_") + "_N ")
            yield sent

    # roughly 70/10/20 train/dev/test split with a fixed seed
    train = dataset.sample(frac=0.7, random_state=18)
    dataset = dataset.drop(train.index)
    dev = dataset.sample(frac=0.333334, random_state=18)
    test = dataset.drop(dev.index)

    train = SentenceDataset(list(iterate_dataframe(train)))
    dev = SentenceDataset(list(iterate_dataframe(dev)))
    test = SentenceDataset(list(iterate_dataframe(test)))
    corpus = Corpus(train, dev, test, "Molecules")

    if cache:
        TRANSFORMED_ROOT.mkdir(parents=True, exist_ok=True)
        with lzma.open(data_file, "wb") as fd:
            pickle.dump(corpus, fd)
    return corpus
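# Usage sketch: build the corpus once and cache it; subsequent calls load the
# compressed pickle. `encoder` above is assumed to map a SMILES string to a
# tokenizable text form; its definition lives elsewhere in this codebase, as
# does the class that owns to_corpus (name below is hypothetical).
# molecules = MoleculeDataset()
# corpus = molecules.to_corpus(cache=True)
# print(corpus)  # reports train/dev/test sizes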
def _predict(self, sentences, tagger):
    tokenizer = SegtokTokenizer()
    dataset = SentenceDataset(
        [Sentence(text, tokenizer) for text in sentences])
    tagger.predict(dataset,
                   mini_batch_size=self.mini_batch_size,
                   embedding_storage_mode=self.embedding_storage_mode,
                   verbose=self.verbose)
    return list(dataset)
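# Usage sketch (the tagger is loaded with a stock flair model here; the
# wrapper's mini_batch_size etc. are assumed set in its constructor):
# from flair.models import SequenceTagger
# tagger = SequenceTagger.load('ner')
# tagged = wrapper._predict(["George Washington went to Washington."], tagger)
# print(tagged[0].to_tagged_string())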
def spelling_aug(corpus):
    aug = naw.SpellingAug()
    augmented_sentences = []
    # go through all train sentences
    for sentence in corpus.train:
        # nlpaug works on plain text, so augment the tokenized string ...
        augmented_texts = aug.augment(sentence.to_tokenized_string(), n=3)
        # ... and wrap each augmented text back into a flair Sentence
        # (note: gold annotations are not transferred here; see ocr_aug below
        # for a variant that copies them token by token)
        for augmented_text in augmented_texts:
            augmented_sentences.append(Sentence(augmented_text))
    # make a new corpus with the augmented sentences as the train split
    corpus = Corpus(train=SentenceDataset(augmented_sentences),
                    dev=corpus.dev, test=corpus.test)
    return corpus
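# What SpellingAug does on raw text, for intuition: nlpaug substitutes words
# with common human misspellings drawn from a built-in dictionary.
import nlpaug.augmenter.word as naw

aug = naw.SpellingAug()
print(aug.augment("The quick brown fox jumps over the lazy dog", n=3))
# e.g. ['The quik brown fox jumps over the lasy dog', ...]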
def train(self):
    from flair.data import Corpus
    from flair.datasets import SentenceDataset
    from flair.data import Sentence

    self.classes = utils.read_class_titles(settings.CAT_DEPTH)
    self.classes['NOCAT'] = 'NOCAT'

    train = SentenceDataset([
        Sentence(row['titlen']).add_label('law_topic', self.classes[row['cat1']])
        for _, row in self.df_train.iterrows()
    ])

    # make a corpus with identical train and dev splits
    self.corpus = Corpus(train=train, dev=train)

    # 1. load base TARS
    tars = self._load_pretained_model()

    # 2. make the model aware of the desired set of labels from the new corpus
    tars.add_and_switch_to_new_task(
        "LAW_TOPIC", label_dictionary=self.corpus.make_label_dictionary())

    # 3. initialize the text classifier trainer with your corpus
    from flair.trainers import ModelTrainer
    trainer = ModelTrainer(tars, self.corpus)

    # 4. train model
    path = settings.WORKING_DIR
    trainer.train(
        base_path=path,                 # path to store the model artifacts
        learning_rate=5e-2,             # 5ep, 0.2 bad; 5ep with 0.1 looks ok.
        mini_batch_size=settings.MINIBATCH,
        mini_batch_chunk_size=4,        # optionally set this if transformer is too much for your machine
        max_epochs=settings.EPOCHS,
        train_with_dev=False,
        save_final_model=False,
        param_selection_mode=True,      # True to avoid model saves
        shuffle=False,                  # Already done
    )
    self.model = tars
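# Usage sketch, assuming the surrounding classifier object (hypothetical name)
# has df_train loaded and settings configured before training:
# clf = LawTopicClassifier()
# clf.train()
# sentence = Sentence('an act to regulate railway pensions')
# clf.model.predict(sentence)
# print(sentence.labels)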
def to_corpus(self) -> Corpus:
    dataset = Tox21().to_df()

    def plain_tokenizer(text: str) -> Iterable[Token]:
        return [Token(tok) for tok in text.split()]

    def iterate_dataframe(dataset: pd.DataFrame) -> Iterable[Sentence]:
        for _, row in dataset.iterrows():
            res = encoder(row.smiles)
            if not res:
                continue
            res = res.replace("]", "] ").replace(".", "DOT ")
            sent = Sentence(res.strip(), use_tokenizer=plain_tokenizer)
            for col, val in row.items():
                if isinstance(val, float):
                    if val == 1.0:
                        sent.add_label(None, col.replace(" ", "_") + "_P ")
                    if val == 0.0:
                        sent.add_label(None, col.replace(" ", "_") + "_N ")
            yield sent

    # roughly 70/10/20 train/dev/test split with a fixed seed
    train = dataset.sample(frac=0.7, random_state=18)
    dataset = dataset.drop(train.index)
    dev = dataset.sample(frac=0.333334, random_state=18)
    test = dataset.drop(dev.index)

    train = SentenceDataset(list(iterate_dataframe(train)))
    dev = SentenceDataset(list(iterate_dataframe(dev)))
    test = SentenceDataset(list(iterate_dataframe(test)))
    corpus = Corpus(train, dev, test, "Molecules")
    return corpus
def capitalization_aug(corpus):
    augmented_sentences = []
    # go through all train sentences
    for sentence in corpus.train:
        augmented_sentence: Sentence = Sentence()
        for token in sentence:
            # lowercase the token in place; reusing the token object keeps
            # its annotations attached
            token.text = token.text.lower()
            augmented_sentence.add_token(token)
        # append to augmented sentences
        if len(augmented_sentence) > 0:
            augmented_sentences.append(augmented_sentence)
    # make a new corpus with the augmented sentences
    corpus = Corpus(train=SentenceDataset(augmented_sentences),
                    dev=corpus.dev, test=corpus.test)
    return corpus
def punctuation_aug(corpus):
    augmented_sentences = []
    # go through all train sentences
    for sentence in corpus.train:
        punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
        augmented_sentence: Sentence = Sentence()
        for token in sentence:
            # drop punctuation tokens, keep everything else
            if token.text not in punc:
                augmented_sentence.add_token(token)
        # append to augmented sentences
        if len(augmented_sentence) > 0:
            augmented_sentences.append(augmented_sentence)
    # make a new corpus with the augmented sentences
    corpus = Corpus(train=SentenceDataset(augmented_sentences),
                    dev=corpus.dev, test=corpus.test)
    return corpus
def predict(self,
            sentences: Union[List[Sentence], Sentence],
            mini_batch_size: int = 32,
            num_workers: int = 8,
            print_tree: bool = False,
            embedding_storage_mode="none",
            ) -> None:
    """
    Predict arcs and tags for Dependency Parser task
    :param sentences: a Sentence or a List of Sentence
    :param mini_batch_size: mini batch size to use
    :param num_workers: number of workers for the data loader
    :param print_tree: set to True to print the dependency parse of each sentence as a tree
    :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
    you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
    """
    sentences = SentenceDataset(sentences)
    data_loader = DataLoader(sentences, batch_size=mini_batch_size, num_workers=num_workers)

    for batch in data_loader:
        with torch.no_grad():
            score_arc, score_rel = self.forward(batch)
            arc_prediction, relation_prediction = self._obtain_labels_(score_arc, score_rel)

        for sentence_index, (sentence, sent_tags, sent_arcs) in enumerate(
                zip(batch, relation_prediction, arc_prediction)):
            for token_index, (token, tag, head_id) in enumerate(
                    zip(sentence.tokens, sent_tags, sent_arcs)):
                token.add_tag(self.tag_type, tag,
                              score_rel[sentence_index][token_index])
                token.head_id = int(head_id)

            if print_tree:
                tree_printer(sentence, self.tag_type)
                print("-" * 50)
        store_embeddings(batch, storage_mode=embedding_storage_mode)
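# Usage sketch for the parser's predict (parser construction not shown here):
# sentences = [Sentence("I love Berlin"), Sentence("The weather is good")]
# parser.predict(sentences, print_tree=True)
# for token in sentences[0]:
#     print(token.text, token.head_id, token.get_tag(parser.tag_type).value)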
def ocr_aug(corpus):
    aug = nac.OcrAug(tokenizer=whitespace_tokenizer)
    # go through all train sentences
    augmented_sentences = []
    for sentence in corpus.train:
        augmented_texts = aug.augment(sentence.to_tokenized_string(), n=3)
        for augmented_text in augmented_texts:
            augmented_sentence: Sentence = Sentence()
            augmented_token_texts = augmented_text.split(" ")
            for augmented_token_text, original_token in zip(augmented_token_texts, sentence):
                # make a new token
                augmented_token = Token(augmented_token_text)
                # transfer annotations over to augmented token
                augmented_token.annotation_layers = original_token.annotation_layers
                # add augmented token to augmented sentence
                augmented_sentence.add_token(augmented_token)
            # add augmented sentence to list of all augmented sentences
            augmented_sentences.append(augmented_sentence)
    corpus = Corpus(train=SentenceDataset(augmented_sentences),
                    dev=corpus.dev, test=corpus.test)
    return corpus
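# The augmenters in this module compose naturally, since each returns a new
# Corpus with only the train split replaced (a sketch; the order is a free choice):
# corpus = ocr_aug(corpus)
# corpus = punctuation_aug(corpus)
# corpus = capitalization_aug(corpus)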
def evaluate(self,
             sentences: Union[List[DataPoint], Dataset],
             out_path: Union[str, Path] = None,
             embedding_storage_mode: str = "none",
             mini_batch_size: int = 32,
             num_workers: int = 8,
             main_score_type: Tuple[str, str] = ("micro avg", 'f1-score'),
             return_predictions: bool = False) -> (Result, float):

    # read Dataset into data loader (if list of sentences passed, make Dataset first)
    if not isinstance(sentences, Dataset):
        sentences = SentenceDataset(sentences)
    data_loader = DataLoader(sentences, batch_size=mini_batch_size, num_workers=num_workers)

    # use scikit-learn to evaluate
    y_true = []
    y_pred = []

    with torch.no_grad():
        eval_loss = 0
        lines: List[str] = []
        batch_count: int = 0

        for batch in data_loader:
            batch_count += 1

            # remove previously predicted labels
            [sentence.remove_labels('predicted') for sentence in batch]

            # get the gold labels
            true_values_for_batch = [sentence.get_labels(self.label_type) for sentence in batch]

            # predict for batch
            loss = self.predict(batch,
                                embedding_storage_mode=embedding_storage_mode,
                                mini_batch_size=mini_batch_size,
                                label_name='predicted',
                                return_loss=True)
            eval_loss += loss

            sentences_for_batch = [sent.to_plain_string() for sent in batch]

            # get the predicted labels
            predictions = [sentence.get_labels('predicted') for sentence in batch]

            for sentence, prediction, true_value in zip(
                    sentences_for_batch, predictions, true_values_for_batch):
                eval_line = "{}\t{}\t{}\n".format(sentence, true_value, prediction)
                lines.append(eval_line)

            for predictions_for_sentence, true_values_for_sentence in zip(
                    predictions, true_values_for_batch):
                true_values_for_sentence = [label.value for label in true_values_for_sentence]
                predictions_for_sentence = [label.value for label in predictions_for_sentence]

                y_true_instance = np.zeros(len(self.label_dictionary), dtype=int)
                for i in range(len(self.label_dictionary)):
                    if self.label_dictionary.get_item_for_index(i) in true_values_for_sentence:
                        y_true_instance[i] = 1
                y_true.append(y_true_instance.tolist())

                y_pred_instance = np.zeros(len(self.label_dictionary), dtype=int)
                for i in range(len(self.label_dictionary)):
                    if self.label_dictionary.get_item_for_index(i) in predictions_for_sentence:
                        y_pred_instance[i] = 1
                y_pred.append(y_pred_instance.tolist())

            store_embeddings(batch, embedding_storage_mode)

        # Remove predicted labels if return_predictions is False.
        # Problem here: the predictions are only contained in the sentences if
        # memory_mode="full" was chosen when the ClassificationDataset was created
        # during ClassificationCorpus creation. With memory mode "partial", the
        # predicted labels are never contained in the sentences, so the optional
        # removal below has no effect and predictions are not accessible outside
        # this eval routine regardless of return_predictions.
        # TODO: fix this
        if not return_predictions:
            for sentence in sentences:
                sentence.annotation_layers['predicted'] = []

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        # make "classification report"
        target_names = []
        for i in range(len(self.label_dictionary)):
            target_names.append(self.label_dictionary.get_item_for_index(i))
        classification_report = metrics.classification_report(
            y_true, y_pred, digits=4, target_names=target_names, zero_division=0)
        classification_report_dict = metrics.classification_report(
            y_true, y_pred, digits=4, target_names=target_names, zero_division=0,
            output_dict=True)

        # get scores
        micro_f_score = round(
            metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro', zero_division=0), 4)
        accuracy_score = round(metrics.accuracy_score(y_true, y_pred), 4)
        macro_f_score = round(
            metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro', zero_division=0), 4)
        precision_score = round(
            metrics.precision_score(y_true, y_pred, average='macro', zero_division=0), 4)
        recall_score = round(
            metrics.recall_score(y_true, y_pred, average='macro', zero_division=0), 4)

        detailed_result = ("\nResults:"
                           f"\n- F-score (micro) {micro_f_score}"
                           f"\n- F-score (macro) {macro_f_score}"
                           f"\n- Accuracy {accuracy_score}"
                           '\n\nBy class:\n' + classification_report)

        # line for log file
        if not self.multi_label:
            log_header = "ACCURACY"
            log_line = f"\t{accuracy_score}"
        else:
            log_header = "PRECISION\tRECALL\tF1\tACCURACY"
            log_line = f"{precision_score}\t" \
                       f"{recall_score}\t" \
                       f"{macro_f_score}\t" \
                       f"{accuracy_score}"

        result = Result(main_score=classification_report_dict[main_score_type[0]][main_score_type[1]],
                        log_line=log_line,
                        log_header=log_header,
                        detailed_results=detailed_result,
                        classification_report=classification_report_dict)

        eval_loss /= batch_count
        return result, eval_loss
def evaluate(
    self,
    sentences: Union[List[DataPoint], Dataset],
    out_path: Union[str, Path] = None,
    embedding_storage_mode: str = "none",
    mini_batch_size: int = 32,
    num_workers: int = 8,
) -> (Result, float):

    # read Dataset into data loader (if list of sentences passed, make Dataset first)
    if not isinstance(sentences, Dataset):
        sentences = SentenceDataset(sentences)
    data_loader = DataLoader(sentences, batch_size=mini_batch_size, num_workers=num_workers)

    # use scikit-learn to evaluate
    y_true = []
    y_pred = []

    with torch.no_grad():
        eval_loss = 0
        lines: List[str] = []
        batch_count: int = 0

        for batch in data_loader:
            batch_count += 1

            # remove previously predicted labels
            [sentence.remove_labels('predicted') for sentence in batch]

            # get the gold labels
            true_values_for_batch = [sentence.get_labels(self.label_type) for sentence in batch]

            # predict for batch
            loss = self.predict(batch,
                                embedding_storage_mode=embedding_storage_mode,
                                mini_batch_size=mini_batch_size,
                                label_name='predicted',
                                return_loss=True)
            eval_loss += loss

            sentences_for_batch = [sent.to_plain_string() for sent in batch]

            # get the predicted labels
            predictions = [sentence.get_labels('predicted') for sentence in batch]

            for sentence, prediction, true_value in zip(
                    sentences_for_batch, predictions, true_values_for_batch):
                eval_line = "{}\t{}\t{}\n".format(sentence, true_value, prediction)
                lines.append(eval_line)

            for predictions_for_sentence, true_values_for_sentence in zip(
                    predictions, true_values_for_batch):
                true_values_for_sentence = [label.value for label in true_values_for_sentence]
                predictions_for_sentence = [label.value for label in predictions_for_sentence]

                y_true_instance = np.zeros(len(self.label_dictionary), dtype=int)
                for i in range(len(self.label_dictionary)):
                    if self.label_dictionary.get_item_for_index(i) in true_values_for_sentence:
                        y_true_instance[i] = 1
                y_true.append(y_true_instance.tolist())

                y_pred_instance = np.zeros(len(self.label_dictionary), dtype=int)
                for i in range(len(self.label_dictionary)):
                    if self.label_dictionary.get_item_for_index(i) in predictions_for_sentence:
                        y_pred_instance[i] = 1
                y_pred.append(y_pred_instance.tolist())

            store_embeddings(batch, embedding_storage_mode)

        # remove predicted labels
        for sentence in sentences:
            sentence.annotation_layers['predicted'] = []

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        # make "classification report"
        target_names = []
        for i in range(len(self.label_dictionary)):
            target_names.append(self.label_dictionary.get_item_for_index(i))
        classification_report = metrics.classification_report(
            y_true, y_pred, digits=4, target_names=target_names, zero_division=0)

        # get scores
        micro_f_score = round(
            metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro', zero_division=0), 4)
        accuracy_score = round(metrics.accuracy_score(y_true, y_pred), 4)
        macro_f_score = round(
            metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro', zero_division=0), 4)
        precision_score = round(
            metrics.precision_score(y_true, y_pred, average='macro', zero_division=0), 4)
        recall_score = round(
            metrics.recall_score(y_true, y_pred, average='macro', zero_division=0), 4)

        detailed_result = ("\nResults:"
                           f"\n- F-score (micro) {micro_f_score}"
                           f"\n- F-score (macro) {macro_f_score}"
                           f"\n- Accuracy {accuracy_score}"
                           '\n\nBy class:\n' + classification_report)

        # line for log file
        if not self.multi_label:
            log_header = "ACCURACY"
            log_line = f"\t{accuracy_score}"
        else:
            log_header = "PRECISION\tRECALL\tF1\tACCURACY"
            log_line = f"{precision_score}\t" \
                       f"{recall_score}\t" \
                       f"{macro_f_score}\t" \
                       f"{accuracy_score}"

        result = Result(
            main_score=micro_f_score,
            log_line=log_line,
            log_header=log_header,
            detailed_results=detailed_result,
        )

        eval_loss /= batch_count
        return result, eval_loss
            tag = tkn.get_tag("ner").value
            pref, tag_no_pref = _split_tag(tag)
            if tag_no_pref is None:
                break
            tag_no_pref_encoded = tag_no_pref.encode("utf-8")
            if tag_no_pref_encoded in tag_dictionary_no_prefix.idx2item and \
                    tag_countdown[tag_dictionary_no_prefix.item2idx[tag_no_pref_encoded]] > 0:
                corpus_sents.append(sent)
                tag_countdown[tag_dictionary_no_prefix.item2idx[tag_no_pref_encoded]] -= 1
                sent_picked = True

print("sents for training: " + str(len(corpus_sents)))
print("amount of items in dict: " + str(len(tag_dictionary.item2idx)))

training_dataset = SentenceDataset(corpus_sents)
training_corpus = Corpus(train=training_dataset,
                         dev=corpus_small.dev,
                         test=corpus_small.test,
                         sample_missing_splits=False)

trainer = ModelTrainer(tagger, training_corpus, optimizer=torch.optim.AdamW)

tag_dictionary = training_corpus.make_label_dictionary(tag_type)
tagger.add_and_switch_to_new_task("fewshot-moviecomplex-simple-to-conll3",
                                  tag_dictionary=tag_dictionary,
                                  tag_type=tag_type)

trainer.train(
    base_path='resources/v3/fewshot-moviecomplex-simple-to-conll3-k' + str(k),
    learning_rate=5.0e-5,
    mini_batch_size=32,
    mini_batch_chunk_size=None,
    max_epochs=10,
def evaluate(self,
             sentences: Union[List[DataPoint], Dataset],
             out_path: Union[str, Path] = None,
             embedding_storage_mode: str = "none",
             mini_batch_size: int = 32,
             num_workers: int = 8,
             **kwargs) -> Result:

    # read Dataset into data loader (if list of sentences passed, make Dataset first)
    if not isinstance(sentences, Dataset):
        sentences = SentenceDataset(sentences)
    data_loader = DataLoader(sentences, batch_size=mini_batch_size, num_workers=num_workers)

    with torch.no_grad():
        eval_loss = 0

        metric = MetricRegression("Evaluation")

        lines: List[str] = []
        total_count = 0
        for batch_nr, batch in enumerate(data_loader):

            if isinstance(batch, Sentence):
                batch = [batch]

            scores, loss = self.forward_labels_and_loss(batch)

            true_values = []
            for sentence in batch:
                total_count += 1
                for label in sentence.labels:
                    true_values.append(float(label.value))

            results = []
            for score in scores:
                if type(score[0]) is Label:
                    results.append(float(score[0].score))
                else:
                    results.append(float(score[0]))

            eval_loss += loss

            metric.true.extend(true_values)
            metric.pred.extend(results)

            for sentence, prediction, true_value in zip(batch, results, true_values):
                eval_line = "{}\t{}\t{}\n".format(
                    sentence.to_original_text(), true_value, prediction)
                lines.append(eval_line)

            store_embeddings(batch, embedding_storage_mode)

        eval_loss /= total_count

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        log_line = f"{metric.mean_squared_error()}\t{metric.spearmanr()}\t{metric.pearsonr()}"
        log_header = "MSE\tSPEARMAN\tPEARSON"

        detailed_result = (
            f"AVG: mse: {metric.mean_squared_error():.4f} - "
            f"mae: {metric.mean_absolute_error():.4f} - "
            f"pearson: {metric.pearsonr():.4f} - "
            f"spearman: {metric.spearmanr():.4f}")

        result: Result = Result(
            main_score=metric.pearsonr(),
            loss=eval_loss,
            log_header=log_header,
            log_line=log_line,
            detailed_results=detailed_result,
        )

        return result
def predict(
    self,
    sentences: Union[List[Sentence], Sentence],
    mini_batch_size: int = 32,
    multi_class_prob: bool = False,
    verbose: bool = False,
    label_name: Optional[str] = None,
    return_loss=False,
    embedding_storage_mode="none",
):
    """
    Predicts the class labels for the given sentences. The labels are directly added to the sentences.
    :param sentences: list of sentences
    :param mini_batch_size: mini batch size to use
    :param multi_class_prob: return probability for all classes for multiclass
    :param verbose: set to True to display a progress bar
    :param return_loss: set to True to return loss
    :param label_name: set this to change the name of the label type that is predicted
    :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
    you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
    """
    if label_name is None:
        label_name = self.label_type if self.label_type is not None else 'label'

    with torch.no_grad():
        if not sentences:
            return sentences

        if isinstance(sentences, DataPoint):
            sentences = [sentences]

        # filter empty sentences
        if isinstance(sentences[0], Sentence):
            sentences = [sentence for sentence in sentences if len(sentence) > 0]
        if len(sentences) == 0:
            return sentences

        # reverse sort all sequences by their length
        rev_order_len_index = sorted(range(len(sentences)),
                                     key=lambda k: len(sentences[k]),
                                     reverse=True)
        reordered_sentences: List[Union[DataPoint, str]] = [
            sentences[index] for index in rev_order_len_index
        ]

        dataloader = DataLoader(dataset=SentenceDataset(reordered_sentences),
                                batch_size=mini_batch_size)

        # progress bar for verbosity
        if verbose:
            dataloader = tqdm(dataloader)

        overall_loss = 0
        batch_no = 0
        for batch in dataloader:
            batch_no += 1

            if verbose:
                dataloader.set_description(f"Inferencing on batch {batch_no}")

            # stop if all sentences are empty
            if not batch:
                continue

            scores = self.forward(batch)

            if return_loss:
                overall_loss += self._calculate_loss(scores, batch)

            predicted_labels = self._obtain_labels(scores, predict_prob=multi_class_prob)

            for (sentence, labels) in zip(batch, predicted_labels):
                for label in labels:
                    if self.multi_label or multi_class_prob:
                        sentence.add_label(label_name, label.value, label.score)
                    else:
                        sentence.set_label(label_name, label.value, label.score)

            # clearing token embeddings to save memory
            store_embeddings(batch, storage_mode=embedding_storage_mode)

        if return_loss:
            return overall_loss / batch_no
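# Usage sketch with a stock flair classifier (any model exposing this
# predict() signature works the same way):
# from flair.models import TextClassifier
# classifier = TextClassifier.load('sentiment')
# sentence = Sentence('Flair is pretty neat!')
# classifier.predict(sentence)
# print(sentence.labels)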
def predict(
    self,
    sentences: Union[List[Sentence], Sentence],
    mini_batch_size=32,
    verbose: bool = False,
    label_name: Optional[str] = None,
    return_loss=False,
    embedding_storage_mode="none",
    label_threshold: float = 0.5,
    multi_label: Optional[bool] = None,
):
    """
    Predict class labels for the given sentences with the current TARS task.
    :param sentences: a Sentence or a List of Sentence
    :param mini_batch_size: size of the minibatch; usually bigger is faster but consumes more memory,
    up to a point where it has no more effect
    :param verbose: set to True to display a progress bar
    :param return_loss: set to True to return loss
    :param label_name: set this to change the name of the label type that is predicted
    :param label_threshold: only labels scoring above this confidence threshold are added
    :param multi_label: set to False to enforce a single-label prediction per sentence
    :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
    you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
    """
    if not label_name:
        label_name = self.get_current_label_type()

    if multi_label is None:
        multi_label = self.is_current_task_multi_label()

    if not sentences:
        return sentences

    if isinstance(sentences, Sentence):
        sentences = [sentences]

    # set context if not set already
    previous_sentence = None
    for sentence in sentences:
        if sentence.is_context_set():
            continue
        sentence._previous_sentence = previous_sentence
        sentence._next_sentence = None
        if previous_sentence:
            previous_sentence._next_sentence = sentence
        previous_sentence = sentence

    # reverse sort all sequences by their length
    rev_order_len_index = sorted(range(len(sentences)),
                                 key=lambda k: len(sentences[k]),
                                 reverse=True)
    reordered_sentences: List[Union[Sentence, str]] = [
        sentences[index] for index in rev_order_len_index
    ]

    dataloader = DataLoader(dataset=SentenceDataset(reordered_sentences),
                            batch_size=mini_batch_size)

    # progress bar for verbosity
    if verbose:
        dataloader = tqdm(dataloader)

    overall_loss = 0
    overall_count = 0
    batch_no = 0
    with torch.no_grad():
        for batch in dataloader:
            batch_no += 1

            if verbose:
                dataloader.set_description(f"Inferencing on batch {batch_no}")

            batch = self._filter_empty_sentences(batch)

            # stop if all sentences are empty
            if not batch:
                continue

            # go through each sentence in the batch
            for sentence in batch:

                # always remove tags first
                sentence.remove_labels(label_name)

                all_labels = [label.decode("utf-8")
                              for label in self.get_current_label_dictionary().idx2item]

                for label in all_labels:
                    tars_sentence = self._get_tars_formatted_sentence(label, sentence)

                    loss_and_count = self.tars_model.predict(
                        tars_sentence,
                        label_name=label_name,
                        return_loss=True,
                        return_probabilities_for_all_classes=True if label_threshold < 0.5 else False,
                    )

                    overall_loss += loss_and_count[0].item()
                    overall_count += loss_and_count[1]

                    # add all labels that according to TARS match the text and are above threshold
                    for predicted_tars_label in tars_sentence.get_labels(label_name):
                        if predicted_tars_label.value == self.LABEL_MATCH \
                                and predicted_tars_label.score > label_threshold:
                            # do not add labels below confidence threshold
                            sentence.add_label(label_name, label, predicted_tars_label.score)

                # only use the label with highest confidence if enforcing single-label predictions
                if not multi_label:
                    if len(sentence.get_labels(label_name)) > 0:
                        # get all label scores and do an argmax to get the best label
                        label_scores = torch.tensor(
                            [label.score for label in sentence.get_labels(label_name)],
                            dtype=torch.float)
                        best_label = sentence.get_labels(label_name)[torch.argmax(label_scores)]

                        # remove previously added labels and only add the best label
                        sentence.remove_labels(label_name)
                        sentence.add_label(typename=label_name,
                                           value=best_label.value,
                                           score=best_label.score)

            # clearing token embeddings to save memory
            store_embeddings(batch, storage_mode=embedding_storage_mode)

    if return_loss:
        return overall_loss, overall_count
def evaluate(
    self,
    sentences: Union[List[Sentence], Dataset],
    out_path: Union[str, Path] = None,
    embedding_storage_mode: str = "none",
    mini_batch_size: int = 32,
    num_workers: int = 8,
) -> (Result, float):

    # read Dataset into data loader (if list of sentences passed, make Dataset first)
    if not isinstance(sentences, Dataset):
        sentences = SentenceDataset(sentences)
    data_loader = DataLoader(sentences, batch_size=mini_batch_size, num_workers=num_workers)

    # if span F1 needs to be used, use separate eval method
    if self._requires_span_F1_evaluation():
        return self._evaluate_with_span_F1(data_loader, embedding_storage_mode,
                                           mini_batch_size, out_path)

    # else, use scikit-learn to evaluate
    y_true = []
    y_pred = []
    labels = Dictionary(add_unk=False)

    eval_loss = 0
    batch_no: int = 0

    lines: List[str] = []

    for batch in data_loader:

        # predict for batch
        loss = self.predict(batch,
                            embedding_storage_mode=embedding_storage_mode,
                            mini_batch_size=mini_batch_size,
                            label_name='predicted',
                            return_loss=True)
        eval_loss += loss
        batch_no += 1

        for sentence in batch:
            for token in sentence:
                # add gold tag
                gold_tag = token.get_tag(self.tag_type).value
                y_true.append(labels.add_item(gold_tag))

                # add predicted tag
                predicted_tag = token.get_tag('predicted').value
                y_pred.append(labels.add_item(predicted_tag))

                # for file output
                lines.append(f'{token.text} {gold_tag} {predicted_tag}\n')

            lines.append('\n')

    if out_path:
        with open(Path(out_path), "w", encoding="utf-8") as outfile:
            outfile.write("".join(lines))

    eval_loss /= batch_no

    # use sklearn
    from sklearn import metrics

    # make "classification report"
    target_names = []
    for i in range(len(labels)):
        target_names.append(labels.get_item_for_index(i))
    classification_report = metrics.classification_report(
        y_true, y_pred, digits=4, target_names=target_names, zero_division=1)

    # get scores
    micro_f_score = round(
        metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro'), 4)
    macro_f_score = round(
        metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro'), 4)
    accuracy_score = round(metrics.accuracy_score(y_true, y_pred), 4)

    detailed_result = (
        "\nResults:"
        f"\n- F-score (micro) {micro_f_score}"
        f"\n- F-score (macro) {macro_f_score}"
        f"\n- Accuracy {accuracy_score}"
        '\n\nBy class:\n' + classification_report
    )

    # line for log file
    log_header = "ACCURACY"
    log_line = f"\t{accuracy_score}"

    result = Result(
        main_score=micro_f_score,
        log_line=log_line,
        log_header=log_header,
        detailed_results=detailed_result,
    )
    return result, eval_loss
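# A quick check of the micro/macro distinction used above: micro-averaging
# pools counts globally across classes, macro-averaging takes an unweighted
# mean of per-class scores.
from sklearn import metrics

y_true = [0, 0, 0, 1, 1, 2]
y_pred = [0, 0, 1, 1, 1, 1]
print(metrics.fbeta_score(y_true, y_pred, beta=1, average='micro'))  # pooled counts
print(metrics.fbeta_score(y_true, y_pred, beta=1, average='macro'))  # per-class mean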
def predict(
    self,
    sentences: Union[List[Sentence], Sentence],
    mini_batch_size: int = 32,
    return_probabilities_for_all_classes: bool = False,
    verbose: bool = False,
    label_name: Optional[str] = None,
    return_loss=False,
    embedding_storage_mode="none",
):
    """
    Predicts the class labels for the given sentences. The labels are directly added to the sentences.
    :param sentences: list of sentences
    :param mini_batch_size: mini batch size to use
    :param return_probabilities_for_all_classes: return probabilities for all classes instead of only best predicted
    :param verbose: set to True to display a progress bar
    :param return_loss: set to True to return loss
    :param label_name: set this to change the name of the label type that is predicted
    :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
    you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
    """
    if label_name is None:
        label_name = self.label_type if self.label_type is not None else "label"

    with torch.no_grad():
        if not sentences:
            return sentences

        if isinstance(sentences, DataPoint):
            sentences = [sentences]

        # filter empty sentences
        if isinstance(sentences[0], DataPoint):
            sentences = [sentence for sentence in sentences if len(sentence) > 0]
        if len(sentences) == 0:
            return sentences

        # reverse sort all sequences by their length
        rev_order_len_index = sorted(range(len(sentences)),
                                     key=lambda k: len(sentences[k]),
                                     reverse=True)
        reordered_sentences: List[Union[DataPoint, str]] = [
            sentences[index] for index in rev_order_len_index
        ]

        dataloader = DataLoader(dataset=SentenceDataset(reordered_sentences),
                                batch_size=mini_batch_size)

        # progress bar for verbosity
        if verbose:
            dataloader = tqdm(dataloader)

        overall_loss = 0
        batch_no = 0
        label_count = 0
        for batch in dataloader:
            batch_no += 1

            if verbose:
                dataloader.set_description(f"Inferencing on batch {batch_no}")

            # stop if all sentences are empty
            if not batch:
                continue

            scores, gold_labels, data_points, label_candidates = self.forward_pass(
                batch, return_label_candidates=True)

            # remove previously predicted labels of this type
            for sentence in data_points:
                sentence.remove_labels(label_name)

            if return_loss:
                overall_loss += self._calculate_loss(scores, gold_labels)[0]
                label_count += len(label_candidates)

            # if anything could possibly be predicted
            if len(label_candidates) > 0:
                if self.multi_label:
                    sigmoided = torch.sigmoid(scores)  # size: (n_sentences, n_classes)
                    n_labels = sigmoided.size(1)
                    for s_idx, (data_point, label_candidate) in enumerate(
                            zip(data_points, label_candidates)):
                        for l_idx in range(n_labels):
                            label_value = self.label_dictionary.get_item_for_index(l_idx)
                            if label_value == 'O':
                                continue
                            label_threshold = self._get_label_threshold(label_value)
                            label_score = sigmoided[s_idx, l_idx].item()
                            if label_score > label_threshold or return_probabilities_for_all_classes:
                                label = label_candidate.spawn(value=label_value, score=label_score)
                                data_point.add_complex_label(label_name, label)
                else:
                    softmax = torch.nn.functional.softmax(scores, dim=-1)

                    if return_probabilities_for_all_classes:
                        n_labels = softmax.size(1)
                        for s_idx, (data_point, label_candidate) in enumerate(
                                zip(data_points, label_candidates)):
                            for l_idx in range(n_labels):
                                label_value = self.label_dictionary.get_item_for_index(l_idx)
                                if label_value == 'O':
                                    continue
                                label_score = softmax[s_idx, l_idx].item()
                                label = label_candidate.spawn(value=label_value, score=label_score)
                                data_point.add_complex_label(label_name, label)
                    else:
                        conf, idx = torch.max(softmax, dim=-1)
                        for data_point, label_candidate, c, i in zip(
                                data_points, label_candidates, conf, idx):
                            label_value = self.label_dictionary.get_item_for_index(i.item())
                            if label_value == 'O':
                                continue
                            label = label_candidate.spawn(value=label_value, score=c.item())
                            data_point.add_complex_label(label_name, label)

            store_embeddings(batch, storage_mode=embedding_storage_mode)

        if return_loss:
            return overall_loss, label_count
def predict(
    self,
    sentences: Union[List[Sentence], Sentence, List[str], str],
    mini_batch_size: int = 32,
    embedding_storage_mode="none",
    multi_class_prob: bool = False,
    verbose: bool = False,
    use_tokenizer: Union[bool, Callable[[str], List[Token]]] = space_tokenizer,
) -> List[Sentence]:
    """
    Predicts the class labels for the given sentences. The labels are directly added to the sentences.
    :param sentences: list of sentences
    :param mini_batch_size: mini batch size to use
    :param embedding_storage_mode: 'none' for the minimum memory footprint, 'cpu' to store embeddings
    in RAM, 'gpu' to store embeddings in GPU memory
    :param multi_class_prob: return probability for all classes for multiclass
    :param verbose: set to True to display a progress bar
    :param use_tokenizer: a custom tokenizer when strings are provided (default is space-based tokenizer)
    :return: the list of sentences containing the labels
    """
    with torch.no_grad():
        if not sentences:
            return sentences

        if isinstance(sentences, Sentence) or isinstance(sentences, str):
            sentences = [sentences]

        if (flair.device.type == "cuda") and embedding_storage_mode == "cpu":
            log.warning(
                "You are inferring on GPU with parameter 'embedding_storage_mode' set to 'cpu'. "
                "This option will slow down your inference, usually 'none' (default value) "
                "is a better choice.")

        # reverse sort all sequences by their length
        rev_order_len_index = sorted(
            range(len(sentences)), key=lambda k: len(sentences[k]), reverse=True)
        original_order_index = sorted(
            range(len(rev_order_len_index)), key=lambda k: rev_order_len_index[k])

        reordered_sentences: List[Union[Sentence, str]] = [
            sentences[index] for index in rev_order_len_index
        ]

        if isinstance(sentences[0], Sentence):
            # remove previous embeddings
            store_embeddings(reordered_sentences, "none")
            dataset = SentenceDataset(reordered_sentences)
        else:
            dataset = StringDataset(reordered_sentences, use_tokenizer=use_tokenizer)
        dataloader = DataLoader(
            dataset=dataset, batch_size=mini_batch_size, collate_fn=lambda x: x)

        # progress bar for verbosity
        if verbose:
            dataloader = tqdm(dataloader)

        results: List[Sentence] = []
        for i, batch in enumerate(dataloader):

            if verbose:
                dataloader.set_description(f"Inferencing on batch {i}")

            results += batch
            batch = self._filter_empty_sentences(batch)

            # stop if all sentences are empty
            if not batch:
                continue

            scores = self.forward(batch)
            predicted_labels = self._obtain_labels(scores, predict_prob=multi_class_prob)

            for (sentence, labels) in zip(batch, predicted_labels):
                sentence.labels = labels

            # clearing token embeddings to save memory
            store_embeddings(batch, storage_mode=embedding_storage_mode)

        # restore the original input order
        results: List[Union[Sentence, str]] = [
            results[index] for index in original_order_index
        ]
        assert len(sentences) == len(results)
        return results
money = "money" tech = "tech" # training dataset consisting of four sentences (2 labeled as "food" and 2 labeled as "drink") train = SentenceDataset([ Sentence('Are You Trading or Gambling?').add_label(label_name, finance), Sentence('Amazon capitalization reached trillion dollars').add_label( label_name, finance), Sentence('Finance dictionary: SPACs and IPOs').add_label( label_name, finance), Sentence('Developer salaries development').add_label(label_name, money), Sentence('My annual income as developer since 2008').add_label( label_name, money), Sentence('How to maximize your income as a developer').add_label( label_name, money), Sentence('Levels.fyi salary information in tech').add_label( label_name, money), Sentence('New version of ruby').add_label(label_name, tech), Sentence('Python: 30 years in').add_label(label_name, tech), Sentence( 'Things I learned developing D3 library for visualization').add_label( label_name, tech), Sentence('Bitcoin price most volatile since 2019').add_label( label_name, crypto), Sentence('Cryptocurrency mining consumes as much energy as some countries' ).add_label(label_name, crypto), Sentence('Bitcoin is a scam').add_label(label_name, crypto), ]) # test dataset consisting of two sentences (1 labeled as "food" and 1 labeled as "drink") test = SentenceDataset([ Sentence('Coinbase S-1 filing').add_label(label_name, finance),
def evaluate(
    self,
    sentences: Union[List[Sentence], Dataset],
    out_path: Union[str, Path] = None,
    embedding_storage_mode: str = "none",
    mini_batch_size: int = 32,
    num_workers: int = 8,
    wsd_evaluation: bool = False,
    **kwargs,
) -> (Result, float):

    # read Dataset into data loader (if list of sentences passed, make Dataset first)
    if not isinstance(sentences, Dataset):
        sentences = SentenceDataset(sentences)
    data_loader = DataLoader(sentences, batch_size=mini_batch_size, num_workers=num_workers)

    eval_loss = 0
    eval_count = 0

    batch_no: int = 0

    metric = Metric("Evaluation", beta=self.beta)

    lines: List[str] = []

    y_true = []
    y_pred = []

    for batch in data_loader:

        # predict for batch
        loss_and_count = self.predict(batch,
                                      embedding_storage_mode=embedding_storage_mode,
                                      mini_batch_size=mini_batch_size,
                                      label_name='predicted',
                                      return_loss=True)
        eval_loss += loss_and_count[0]
        eval_count += loss_and_count[1]
        batch_no += 1

        for sentence in batch:
            # make list of gold tags
            gold_spans = sentence.get_spans(self.get_current_tag_type())
            gold_tags = [(span.tag, repr(span)) for span in gold_spans]

            # make list of predicted tags
            predicted_spans = sentence.get_spans("predicted")
            predicted_tags = [(span.tag, repr(span)) for span in predicted_spans]

            # check for true positives, false positives and false negatives
            for tag, prediction in predicted_tags:
                if (tag, prediction) in gold_tags:
                    metric.add_tp(tag)
                else:
                    metric.add_fp(tag)

            for tag, gold in gold_tags:
                if (tag, gold) not in predicted_tags:
                    metric.add_fn(tag)

            tags_gold = []
            tags_pred = []

            # also write to file in BIO format to use old conlleval script
            if out_path:
                for token in sentence:
                    # check if in gold spans
                    gold_tag = 'O'
                    for span in gold_spans:
                        if token in span:
                            gold_tag = 'B-' + span.tag if token == span[0] else 'I-' + span.tag
                    tags_gold.append(gold_tag)

                    # check if in predicted spans
                    predicted_tag = 'O'
                    for span in predicted_spans:
                        if token in span:
                            predicted_tag = 'B-' + span.tag if token == span[0] else 'I-' + span.tag
                    tags_pred.append(predicted_tag)

                    lines.append(f'{token.text} {gold_tag} {predicted_tag}\n')
                lines.append('\n')

            y_true.append(tags_gold)
            y_pred.append(tags_pred)

    if out_path:
        with open(Path(out_path), "w", encoding="utf-8") as outfile:
            outfile.write("".join(lines))

    detailed_result = (
        "\nResults:"
        f"\n- F1-score (micro) {metric.micro_avg_f_score():.4f}"
        f"\n- F1-score (macro) {metric.macro_avg_f_score():.4f}"
        '\n\nBy class:')

    for class_name in metric.get_classes():
        detailed_result += (
            f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
            f"fn: {metric.get_fn(class_name)} - precision: "
            f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
            f"f1-score: "
            f"{metric.f_score(class_name):.4f}")

    result = Result(
        main_score=metric.micro_avg_f_score(),
        log_line=f"{metric.precision():.4f}\t{metric.recall():.4f}\t{metric.micro_avg_f_score():.4f}",
        log_header="PRECISION\tRECALL\tF1",
        detailed_results=detailed_result,
    )

    return result, eval_loss / eval_count
def predict(
    self,
    sentences: Union[List[Sentence], Sentence],
    mini_batch_size=32,
    verbose: bool = False,
    label_name: Optional[str] = None,
    return_loss=False,
    embedding_storage_mode="none",
):
    """
    Predict sequence tags for the current TARS task.
    :param sentences: a Sentence or a List of Sentence
    :param mini_batch_size: size of the minibatch; usually bigger is faster but consumes more memory,
    up to a point where it has no more effect
    :param verbose: set to True to display a progress bar
    :param return_loss: set to True to return loss
    :param label_name: set this to change the name of the label type that is predicted
    :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
    you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
    """
    if label_name is None:
        label_name = self.get_current_tag_type()

    if not sentences:
        return sentences

    if isinstance(sentences, Sentence):
        sentences = [sentences]

    # set context if not set already
    previous_sentence = None
    for sentence in sentences:
        if sentence.is_context_set():
            continue
        sentence._previous_sentence = previous_sentence
        sentence._next_sentence = None
        if previous_sentence:
            previous_sentence._next_sentence = sentence
        previous_sentence = sentence

    # reverse sort all sequences by their length
    rev_order_len_index = sorted(range(len(sentences)),
                                 key=lambda k: len(sentences[k]),
                                 reverse=True)
    reordered_sentences: List[Union[Sentence, str]] = [
        sentences[index] for index in rev_order_len_index
    ]

    dataloader = DataLoader(dataset=SentenceDataset(reordered_sentences),
                            batch_size=mini_batch_size)

    # progress bar for verbosity
    if verbose:
        dataloader = tqdm(dataloader)

    overall_loss = 0
    overall_count = 0
    batch_no = 0
    with torch.no_grad():
        for batch in dataloader:
            batch_no += 1

            if verbose:
                dataloader.set_description(f"Inferencing on batch {batch_no}")

            batch = self._filter_empty_sentences(batch)

            # stop if all sentences are empty
            if not batch:
                continue

            # go through each sentence in the batch
            for sentence in batch:

                # always remove tags first
                for token in sentence:
                    token.remove_labels(label_name)

                all_labels = [label.decode("utf-8")
                              for label in self.get_current_tag_dictionary().idx2item]

                all_detected = {}
                for label in all_labels:
                    tars_sentence = self._get_tars_formatted_sentence(label, sentence)

                    label_length = 0 if not self.prefix else \
                        len(label.split(" ")) + len(self.separator.split(" "))

                    loss_and_count = self.tars_model.predict(tars_sentence,
                                                             label_name=label_name,
                                                             all_tag_prob=True,
                                                             return_loss=True)
                    overall_loss += loss_and_count[0].item()
                    overall_count += loss_and_count[1]

                    for span in tars_sentence.get_spans(label_name):
                        span.set_label('tars_temp_label', label)
                        all_detected[span] = span.score

                    for span in tars_sentence.get_spans(label_name):
                        for token in span:
                            corresponding_token = sentence.get_token(token.idx - label_length)
                            if corresponding_token is None:
                                continue
                            if corresponding_token.get_tag(label_name).value != '' and \
                                    corresponding_token.get_tag(label_name).score > \
                                    token.get_tag(label_name).score:
                                continue
                            corresponding_token.add_tag(
                                label_name,
                                token.get_tag(label_name).value + label,
                                token.get_tag(label_name).score,
                            )

            # clearing token embeddings to save memory
            store_embeddings(batch, storage_mode=embedding_storage_mode)

    if return_loss:
        return overall_loss, overall_count
def evaluate(
    self,
    data_points: Union[List[DataPoint], Dataset],
    gold_label_type: str,
    out_path: Union[str, Path] = None,
    embedding_storage_mode: str = "none",
    mini_batch_size: int = 32,
    num_workers: int = 8,
    main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"),
    gold_label_dictionary: Optional[Dictionary] = None,
) -> Result:

    if not isinstance(data_points, Dataset):
        data_points = SentenceDataset(data_points)
    data_loader = DataLoader(data_points, batch_size=mini_batch_size, num_workers=num_workers)

    lines: List[str] = ["token gold_tag gold_arc predicted_tag predicted_arc\n"]

    average_over = 0
    eval_loss_arc = 0
    eval_loss_rel = 0

    y_true = []
    y_pred = []

    parsing_metric = ParsingMetric()

    for batch in data_loader:
        average_over += 1
        with torch.no_grad():
            score_arc, score_rel = self.forward(batch)
            loss_arc, loss_rel = self._calculate_loss(score_arc, score_rel, batch)
            arc_prediction, relation_prediction = self._obtain_labels_(score_arc, score_rel)

        parsing_metric(arc_prediction, relation_prediction, batch, gold_label_type)

        eval_loss_arc += loss_arc
        eval_loss_rel += loss_rel

        for (sentence, arcs, sent_tags) in zip(batch, arc_prediction, relation_prediction):
            for (token, arc, tag) in zip(sentence.tokens, arcs, sent_tags):
                token: Token = token
                token.add_tag_label("predicted", Label(tag))
                token.add_tag_label("predicted_head_id", Label(str(int(arc))))

                # append both to file for evaluation
                eval_line = "{} {} {} {} {}\n".format(token.text,
                                                      token.get_tag(gold_label_type).value,
                                                      str(token.head_id),
                                                      tag,
                                                      str(int(arc)))
                lines.append(eval_line)
            lines.append("\n")

        for sentence in batch:
            gold_tags = [token.get_tag(gold_label_type).value for token in sentence.tokens]
            predicted_tags = [tag.tag for tag in sentence.get_spans("predicted")]

            y_pred += [self.relations_dictionary.get_idx_for_item(tag) for tag in predicted_tags]
            y_true += [self.relations_dictionary.get_idx_for_item(tag) for tag in gold_tags]

        store_embeddings(batch, embedding_storage_mode)

    eval_loss_arc /= average_over
    eval_loss_rel /= average_over

    if out_path is not None:
        with open(out_path, "w", encoding="utf-8") as outfile:
            outfile.write("".join(lines))

    classification_report_dict = sklearn.metrics.classification_report(
        y_true, y_pred,
        target_names=self.relations_dictionary.idx2item,
        zero_division=0,
        output_dict=True,
        labels=range(len(self.relations_dictionary)))

    accuracy_score = round(sklearn.metrics.accuracy_score(y_true, y_pred), 4)
    precision_score = round(classification_report_dict["micro avg"]["precision"], 4)
    recall_score = round(classification_report_dict["micro avg"]["recall"], 4)
    micro_f_score = round(classification_report_dict["micro avg"]["f1-score"], 4)
    macro_f_score = round(classification_report_dict["macro avg"]["f1-score"], 4)

    main_score = classification_report_dict[main_evaluation_metric[0]][main_evaluation_metric[1]]

    detailed_result = (
        f"\nUAS : {parsing_metric.get_uas():.4f} - LAS : {parsing_metric.get_las():.4f}"
        f"\neval loss rel : {eval_loss_rel:.4f} - eval loss arc : {eval_loss_arc:.4f}"
        f"\nF-Score: micro : {micro_f_score} - macro : {macro_f_score}"
        f"\n Accuracy: {accuracy_score} - Precision {precision_score} - Recall {recall_score}"
    )

    log_header = "PRECISION\tRECALL\tF1\tACCURACY"
    log_line = f"{precision_score}\t{recall_score}\t{micro_f_score}\t{accuracy_score}"

    result = Result(
        main_score=main_score,
        log_line=log_line,
        log_header=log_header,
        detailed_results=detailed_result,
        classification_report=classification_report_dict,
        loss=eval_loss_rel + eval_loss_arc,
    )
    return result
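# Toy re-implementation of the two parsing scores reported in detailed_result,
# for intuition only (ParsingMetric computes them over the real batches):
def uas_las(gold_heads, gold_rels, pred_heads, pred_rels):
    n = len(gold_heads)
    # UAS: fraction of tokens whose predicted head is correct
    uas = sum(g == p for g, p in zip(gold_heads, pred_heads)) / n
    # LAS: fraction of tokens whose head AND relation label are both correct
    las = sum(gh == ph and gr == pr for gh, gr, ph, pr
              in zip(gold_heads, gold_rels, pred_heads, pred_rels)) / n
    return uas, las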
train = SentenceDataset([
    Sentence('email').add_label('contact_type', 'email'),
    Sentence('21 Jan: email client about signing them up for phrase 2 of Project Alpha').add_label('contact_type', 'email'),
    Sentence('Project Alpha: email client about signing them up for phase 2').add_label('contact_type', 'email'),
    Sentence('emailed client').add_label('contact_type', 'email'),
    Sentence('sent an email to the client about the project').add_label('contact_type', 'email'),
    Sentence('sent email to the client').add_label('contact_type', 'email'),
    Sentence('e-mailed to the client').add_label('contact_type', 'email'),
    Sentence('e-mailing to the client').add_label('contact_type', 'email'),
    Sentence('e-mail to the client').add_label('contact_type', 'email'),
    Sentence('sent email to the client about the new offer').add_label('contact_type', 'email'),
    Sentence('as I planned yesterday I emailed client').add_label('contact_type', 'email'),
    Sentence('emailing recent discussions').add_label('contact_type', 'email'),
    Sentence('today(project alpha) emailing recent discussions').add_label('contact_type', 'email'),
    Sentence('emailed client to schedule a meeting next week').add_label('contact_type', 'email'),
    Sentence('sent an email to client to schedule a meeting next week').add_label('contact_type', 'email'),
    Sentence('sent an email to client to set up a skype call').add_label('contact_type', 'email'),
    Sentence('21 Jan: call with client about signing them up for phase 2 of Project').add_label('contact_type', 'call'),
    Sentence('21 Jan: phone call with client about signing them up for phase 2 of Project').add_label('contact_type', 'call'),
    Sentence('21 Jan: skype with client about signing them up for phase 2 of Project').add_label('contact_type', 'call'),
    Sentence('Project Alpha: call with client about signing them up for phase 2').add_label('contact_type', 'call'),
    Sentence('Project Alpha: phone call with client about signing them up for phase 2').add_label('contact_type', 'call'),
    Sentence('Project Alpha: skype with client about signing them up for phase 2').add_label('contact_type', 'call'),
    Sentence('Phoned client about the phase 2 of the Project').add_label('contact_type', 'call'),
    Sentence('about the phase 2 of the Project, I called client').add_label('contact_type', 'call'),
    Sentence('phoning client about the project').add_label('contact_type', 'call'),
    Sentence('call with client').add_label('contact_type', 'call'),
    Sentence('calling').add_label('contact_type', 'call'),
    Sentence('skype call').add_label('contact_type', 'call'),
    Sentence('skype video call with Mary to discuss').add_label('contact_type', 'call'),
    Sentence('skyped client').add_label('contact_type', 'call'),
    Sentence('called client to inform them').add_label('contact_type', 'call'),
    Sentence('have a call about the project').add_label('contact_type', 'call'),
    Sentence('give a call').add_label('contact_type', 'call'),
    Sentence('telephone call with client about the project').add_label('contact_type', 'call'),
    Sentence('today: telephone call with client about the project').add_label('contact_type', 'call'),
    Sentence('18 December: telephone call with client about the project').add_label('contact_type', 'call'),
    Sentence('December 18th: telephone call with client about the project').add_label('contact_type', 'call'),
    Sentence('called client to schedule a meeting next week').add_label('contact_type', 'call'),
    Sentence('called client to set up a meeting next week').add_label('contact_type', 'call'),
    Sentence('21 Jan: meeting with client about signing them up for phrase 2 of Project Alpha').add_label('contact_type', 'meeting'),
    Sentence('21 Jan: meet with client about signing them up for phrase 2 of Project Alpha').add_label('contact_type', 'meeting'),
    Sentence('Project Alpha: meeting with client about signing them up for phase 2').add_label('contact_type', 'meeting'),
    Sentence('Project Alpha: meeting with client about signing them up for phase 2').add_label('contact_type', 'meeting'),
    Sentence('meet up with them').add_label('contact_type', 'meeting'),
    Sentence('meeting with client').add_label('contact_type', 'meeting'),
    Sentence('met client to discuss project').add_label('contact_type', 'meeting'),
    Sentence('meet with client at their office to review project').add_label('contact_type', 'meeting'),
    Sentence('meet').add_label('contact_type', 'meeting'),
    Sentence('set up a meeting').add_label('contact_type', 'meeting'),
    Sentence('joined a meeting').add_label('contact_type', 'meeting'),
    Sentence('participate in a meeting').add_label('contact_type', 'meeting'),
    Sentence('represent client at a meeting').add_label('contact_type', 'meeting'),
    Sentence('10 October: represent client at a meeting').add_label('contact_type', 'meeting'),
    Sentence('October 10th: represent client at a meeting').add_label('contact_type', 'meeting'),
    Sentence('met with client and decided to discuss it later over a call').add_label('contact_type', 'meeting'),
    Sentence('met with client and agreed to continue over email').add_label('contact_type', 'meeting'),
])
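# A sketch of how this small dataset would feed TARS few-shot training,
# following the pattern used elsewhere in this section (paths are hypothetical,
# and a test split built the same way is assumed):
# corpus = Corpus(train=train, test=test)
# tars = TARSClassifier.load('tars-base')
# tars.add_and_switch_to_new_task('contact_type',
#                                 label_dictionary=corpus.make_label_dictionary())
# ModelTrainer(tars, corpus).train(base_path='resources/contact_type',
#                                  learning_rate=0.02, mini_batch_size=16,
#                                  max_epochs=10)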
def evaluate(
        self,
        data_points: Union[List[DataPoint], Dataset],
        gold_label_type: str,
        out_path: Optional[Union[str, Path]] = None,
        embedding_storage_mode: str = "none",
        mini_batch_size: int = 32,
        num_workers: int = 8,
        main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"),
        exclude_labels: List[str] = [],
        gold_label_dictionary: Optional[Dictionary] = None,
) -> Result:
    import numpy as np
    import sklearn

    # read Dataset into data loader (if a list of sentences is passed, make a Dataset first)
    if not isinstance(data_points, Dataset):
        data_points = SentenceDataset(data_points)
    data_loader = DataLoader(data_points, batch_size=mini_batch_size, num_workers=num_workers)

    with torch.no_grad():

        # loss calculation
        eval_loss = 0
        average_over = 0

        # variables for printing
        lines: List[str] = []

        # variables for computing scores
        all_spans: List[str] = []
        all_true_values = {}
        all_predicted_values = {}

        sentence_id = 0
        for batch in data_loader:

            # remove any previously predicted labels
            for datapoint in batch:
                datapoint.remove_labels('predicted')

            # predict for batch
            loss_and_count = self.predict(batch,
                                          embedding_storage_mode=embedding_storage_mode,
                                          mini_batch_size=mini_batch_size,
                                          label_name='predicted',
                                          return_loss=True)

            if isinstance(loss_and_count, tuple):
                average_over += loss_and_count[1]
                eval_loss += loss_and_count[0]
            else:
                eval_loss += loss_and_count

            # get the gold labels
            for datapoint in batch:

                for gold_label in datapoint.get_labels(gold_label_type):
                    representation = str(sentence_id) + ': ' + gold_label.identifier

                    value = gold_label.value
                    if gold_label_dictionary and gold_label_dictionary.get_idx_for_item(value) == 0:
                        value = '<unk>'

                    if representation not in all_true_values:
                        all_true_values[representation] = [value]
                    else:
                        all_true_values[representation].append(value)

                    if representation not in all_spans:
                        all_spans.append(representation)

                for predicted_span in datapoint.get_labels("predicted"):
                    representation = str(sentence_id) + ': ' + predicted_span.identifier

                    # add to all_predicted_values
                    if representation not in all_predicted_values:
                        all_predicted_values[representation] = [predicted_span.value]
                    else:
                        all_predicted_values[representation].append(predicted_span.value)

                    if representation not in all_spans:
                        all_spans.append(representation)

                sentence_id += 1

            store_embeddings(batch, embedding_storage_mode)

            # make printout lines
            if out_path:
                lines.extend(self._print_predictions(batch, gold_label_type))

        # write all predictions to out_file if set
        if out_path:
            with open(Path(out_path), "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        # make the evaluation dictionary
        evaluation_label_dictionary = Dictionary(add_unk=False)
        evaluation_label_dictionary.add_item("O")
        for true_values in all_true_values.values():
            for label in true_values:
                evaluation_label_dictionary.add_item(label)
        for predicted_values in all_predicted_values.values():
            for label in predicted_values:
                evaluation_label_dictionary.add_item(label)

        # finally, compute numbers: one multi-hot vector per span
        y_true = []
        y_pred = []

        for span in all_spans:

            true_values = all_true_values[span] if span in all_true_values else ['O']
            predicted_values = all_predicted_values[span] if span in all_predicted_values else ['O']

            y_true_instance = np.zeros(len(evaluation_label_dictionary), dtype=int)
            for true_value in true_values:
                y_true_instance[evaluation_label_dictionary.get_idx_for_item(true_value)] = 1
            y_true.append(y_true_instance.tolist())

            y_pred_instance = np.zeros(len(evaluation_label_dictionary), dtype=int)
            for predicted_value in predicted_values:
                y_pred_instance[evaluation_label_dictionary.get_idx_for_item(predicted_value)] = 1
            y_pred.append(y_pred_instance.tolist())

        # now, calculate evaluation numbers
        target_names = []
        labels = []

        counter = Counter()
        counter.update(list(itertools.chain.from_iterable(all_true_values.values())))
        counter.update(list(itertools.chain.from_iterable(all_predicted_values.values())))

        for label_name, count in counter.most_common():
            if label_name == 'O':
                continue
            if label_name in exclude_labels:
                continue
            target_names.append(label_name)
            labels.append(evaluation_label_dictionary.get_idx_for_item(label_name))

        # there is at least one gold label or one prediction (default)
        if len(all_true_values) + len(all_predicted_values) > 1:
            classification_report = sklearn.metrics.classification_report(
                y_true, y_pred, digits=4, target_names=target_names, zero_division=0, labels=labels,
            )

            classification_report_dict = sklearn.metrics.classification_report(
                y_true, y_pred, target_names=target_names, zero_division=0, output_dict=True, labels=labels,
            )

            accuracy_score = round(sklearn.metrics.accuracy_score(y_true, y_pred), 4)

            precision_score = round(classification_report_dict["micro avg"]["precision"], 4)
            recall_score = round(classification_report_dict["micro avg"]["recall"], 4)
            micro_f_score = round(classification_report_dict["micro avg"]["f1-score"], 4)
            macro_f_score = round(classification_report_dict["macro avg"]["f1-score"], 4)

            main_score = classification_report_dict[main_evaluation_metric[0]][main_evaluation_metric[1]]

        else:
            # issue error and default all evaluation numbers to 0.
            log.error(
                "ATTENTION! No gold labels and no predictions found! This could be an error in your corpus "
                "or in how you initialize the trainer!")
            accuracy_score = precision_score = recall_score = micro_f_score = macro_f_score = main_score = 0.
            classification_report = ""
            classification_report_dict = {}

        detailed_result = (
                "\nResults:"
                f"\n- F-score (micro) {micro_f_score}"
                f"\n- F-score (macro) {macro_f_score}"
                f"\n- Accuracy {accuracy_score}"
                "\n\nBy class:\n" + classification_report
        )

        # line for log file
        log_header = "PRECISION\tRECALL\tF1\tACCURACY"
        log_line = f"{precision_score}\t{recall_score}\t{micro_f_score}\t{accuracy_score}"

        if average_over > 0:
            eval_loss /= average_over

        result = Result(
            main_score=main_score,
            log_line=log_line,
            log_header=log_header,
            detailed_results=detailed_result,
            classification_report=classification_report_dict,
            loss=eval_loss,
        )

        return result
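# Hedged usage sketch for evaluate() above: `tagger` and the gold label type
# 'ner' are assumptions; any trained flair model exposing this method and a
# Corpus with a test split would do.
result = tagger.evaluate(
    corpus.test,
    gold_label_type='ner',
    out_path='predictions.txt',
    mini_batch_size=32,
)
print(result.log_header)
print(result.log_line)
print(result.detailed_results)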
def predict(
        self,
        sentences: Union[List[Sentence], Sentence],
        mini_batch_size=32,
        all_tag_prob: bool = False,
        verbose: bool = False,
        label_name: Optional[str] = None,
        return_loss=False,
        embedding_storage_mode="none",
):
    """
    Predict sequence tags for Named Entity Recognition task
    :param sentences: a Sentence or a List of Sentence
    :param mini_batch_size: size of the minibatch; usually bigger is faster but consumes more memory,
        up to a point where it has no further effect.
    :param all_tag_prob: True to compute the score for each tag on each token,
        otherwise only the score of the best tag is returned
    :param verbose: set to True to display a progress bar
    :param return_loss: set to True to return loss
    :param label_name: set this to change the name of the label type that is predicted
    :param embedding_storage_mode: default is 'none', which is always best. Only set to 'cpu' or 'gpu'
        if you wish to not only predict but also keep the generated embeddings in CPU or GPU memory, respectively.
    """
    if label_name is None:
        label_name = self.tag_type

    with torch.no_grad():
        if not sentences:
            return sentences

        if isinstance(sentences, Sentence):
            sentences = [sentences]

        # reverse sort all sequences by their length
        rev_order_len_index = sorted(
            range(len(sentences)), key=lambda k: len(sentences[k]), reverse=True
        )

        reordered_sentences: List[Sentence] = [
            sentences[index] for index in rev_order_len_index
        ]

        dataloader = DataLoader(
            dataset=SentenceDataset(reordered_sentences), batch_size=mini_batch_size
        )

        if self.use_crf:
            transitions = self.transitions.detach().cpu().numpy()
        else:
            transitions = None

        # progress bar for verbosity
        if verbose:
            dataloader = tqdm(dataloader)

        overall_loss = 0
        batch_no = 0
        for batch in dataloader:

            batch_no += 1

            if verbose:
                dataloader.set_description(f"Inferencing on batch {batch_no}")

            batch = self._filter_empty_sentences(batch)
            # stop if all sentences are empty
            if not batch:
                continue

            feature = self.forward(batch)

            if return_loss:
                overall_loss += self._calculate_loss(feature, batch)

            tags, all_tags = self._obtain_labels(
                feature=feature,
                batch_sentences=batch,
                transitions=transitions,
                get_all_tags=all_tag_prob,
            )

            for (sentence, sent_tags) in zip(batch, tags):
                for (token, tag) in zip(sentence.tokens, sent_tags):
                    token.add_tag_label(label_name, tag)

            # all_tags will be empty if all_tag_prob is set to False, so the for loop will be skipped
            for (sentence, sent_all_tags) in zip(batch, all_tags):
                for (token, token_all_tags) in zip(sentence.tokens, sent_all_tags):
                    token.add_tags_proba_dist(label_name, token_all_tags)

            # clearing token embeddings to save memory
            store_embeddings(batch, storage_mode=embedding_storage_mode)

        if return_loss:
            return overall_loss / batch_no
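# Hedged usage sketch for the predict() variant above: assumes `tagger` is an
# instance of the defining class (e.g. a flair SequenceTagger loaded via
# SequenceTagger.load('ner')). Sentences are annotated in place; label_name
# defaults to the model's tag type.
sentence = Sentence('George Washington went to Washington.')
tagger.predict(sentence, mini_batch_size=32, verbose=True)
print(sentence.to_tagged_string())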
def predict(
        self,
        sentences: Union[List[Sentence], Sentence, List[str], str],
        mini_batch_size=32,
        embedding_storage_mode="none",
        all_tag_prob: bool = False,
        verbose: bool = False,
        use_tokenizer: Union[bool, Callable[[str], List[Token]]] = space_tokenizer,
) -> List[Sentence]:
    """
    Predict sequence tags for Named Entity Recognition task
    :param sentences: a Sentence or a string, or a List of Sentence or a List of string.
    :param mini_batch_size: size of the minibatch; usually bigger is faster but consumes more memory,
        up to a point where it has no further effect.
    :param embedding_storage_mode: 'none' for the minimum memory footprint, 'cpu' to store embeddings in RAM,
        'gpu' to store embeddings in GPU memory.
    :param all_tag_prob: True to compute the score for each tag on each token,
        otherwise only the score of the best tag is returned
    :param verbose: set to True to display a progress bar
    :param use_tokenizer: a custom tokenizer used when strings are provided (default is a space-based tokenizer).
    :return: List of Sentence enriched by the predicted tags
    """
    with torch.no_grad():
        if not sentences:
            return sentences

        if isinstance(sentences, Sentence) or isinstance(sentences, str):
            sentences = [sentences]

        if (flair.device.type == "cuda") and embedding_storage_mode == "cpu":
            log.warning(
                "You are inferring on GPU with parameter 'embedding_storage_mode' set to 'cpu'. "
                "This option will slow down your inference; usually 'none' (the default) "
                "is a better choice."
            )

        # reverse sort all sequences by their length
        rev_order_len_index = sorted(
            range(len(sentences)), key=lambda k: len(sentences[k]), reverse=True
        )
        original_order_index = sorted(
            range(len(rev_order_len_index)), key=lambda k: rev_order_len_index[k]
        )

        reordered_sentences: List[Union[Sentence, str]] = [
            sentences[index] for index in rev_order_len_index
        ]

        if isinstance(sentences[0], Sentence):
            # remove previous embeddings
            store_embeddings(reordered_sentences, "none")
            dataset = SentenceDataset(reordered_sentences)
        else:
            dataset = StringDataset(
                reordered_sentences, use_tokenizer=use_tokenizer
            )
        dataloader = DataLoader(
            dataset=dataset, batch_size=mini_batch_size, collate_fn=lambda x: x
        )

        if self.use_crf:
            transitions = self.transitions.detach().cpu().numpy()
        else:
            transitions = None

        # progress bar for verbosity
        if verbose:
            dataloader = tqdm(dataloader)

        results: List[Sentence] = []
        for i, batch in enumerate(dataloader):

            if verbose:
                dataloader.set_description(f"Inferencing on batch {i}")

            results += batch
            batch = self._filter_empty_sentences(batch)
            # stop if all sentences are empty
            if not batch:
                continue

            feature: torch.Tensor = self.forward(batch)
            tags, all_tags = self._obtain_labels(
                feature=feature,
                batch_sentences=batch,
                transitions=transitions,
                get_all_tags=all_tag_prob,
            )

            for (sentence, sent_tags) in zip(batch, tags):
                for (token, tag) in zip(sentence.tokens, sent_tags):
                    token.add_tag_label(self.tag_type, tag)

            # all_tags will be empty if all_tag_prob is set to False, so the for loop will be skipped
            for (sentence, sent_all_tags) in zip(batch, all_tags):
                for (token, token_all_tags) in zip(sentence.tokens, sent_all_tags):
                    token.add_tags_proba_dist(self.tag_type, token_all_tags)

            # clearing token embeddings to save memory
            store_embeddings(batch, storage_mode=embedding_storage_mode)

        # restore the original input order before returning
        results: List[Union[Sentence, str]] = [
            results[index] for index in original_order_index
        ]
        assert len(sentences) == len(results)
        return results
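# Hedged usage sketch for the string-accepting predict() variant above: raw
# strings are tokenized (space-based by default) before tagging, and results
# come back in the original input order. `tagger` is again an assumption for
# an instance of the defining class.
texts = ['George Washington went to Washington.', 'Berlin is in Germany.']
tagged_sentences = tagger.predict(texts, mini_batch_size=16)
for tagged in tagged_sentences:
    print(tagged.to_tagged_string())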
def predict(self, sentences: Union[List[Sentence], Sentence],
            label_name='predicted',
            mini_batch_size: int = 16,
            embedding_storage_mode="none",
            return_loss=False,
            print_prediction=False,
            ):
    '''
    Predict lemmas of words for a given (list of) sentence(s).
    :param sentences: sentences to predict
    :param label_name: label name used for predicted lemmas
    :param mini_batch_size: number of tokens that are sent through the RNN simultaneously,
        assuming batching_in_rnn is set to True
    :param embedding_storage_mode: default is 'none', which is always best. Only set to 'cpu' or 'gpu'
        if you wish to not only predict but also keep the generated embeddings in CPU or GPU memory, respectively.
    :param return_loss: whether to compute and return loss. Setting it to True only makes sense if labels are provided.
    :param print_prediction: if True, lemmatized sentences are printed to the console.
    '''
    if isinstance(sentences, Sentence):
        sentences = [sentences]

    # filter empty sentences
    sentences = [sentence for sentence in sentences if len(sentence) > 0]
    if len(sentences) == 0:
        return sentences

    # max length of the predicted sequences
    if not self.dependent_on_input:
        max_length = self.max_sequence_length
    else:
        max_length = max([len(token.text) + 1 for sentence in sentences for token in sentence])

    # for printing
    line_to_print = ''

    overall_loss = 0
    number_tokens_in_total = 0

    with torch.no_grad():

        dataloader = DataLoader(dataset=SentenceDataset(sentences), batch_size=mini_batch_size)

        for batch in dataloader:
            # stop if all sentences are empty
            if not batch:
                continue

            # remove previously predicted labels of this type
            for sentence in batch:
                for token in sentence:
                    token.remove_labels(label_name)

            # create list of tokens in batch
            tokens_in_batch = [token for sentence in batch for token in sentence]
            number_tokens = len(tokens_in_batch)
            number_tokens_in_total += number_tokens

            # encode inputs
            hidden, all_encoder_outputs = self.encode(batch)

            # create input for first pass (batch_size, 1, input_size); the first letter is the special character <S>
            # sequence length is always set to one in prediction
            input_indices = self.start_index * torch.ones(number_tokens, dtype=torch.long,
                                                          device=flair.device).unsqueeze(1)

            # option 1: greedy decoding
            if self.beam_size == 1:

                # predictions
                predicted = [[] for _ in range(number_tokens)]

                for decode_step in range(max_length):
                    # decode next character
                    output_vectors, hidden = self.decode(input_indices, hidden, all_encoder_outputs)

                    log_softmax_probs = torch.nn.functional.log_softmax(output_vectors, dim=2)
                    # greedily pick the output with the highest probability
                    input_indices = log_softmax_probs.argmax(dim=2)

                    for i in range(number_tokens):
                        if len(predicted[i]) > 0 and predicted[i][-1] == self.end_index:
                            continue
                        predicted[i].append(input_indices[i].item())

                for t_id, token in enumerate(tokens_in_batch):
                    predicted_lemma = ''.join(
                        self.char_dictionary.get_item_for_index(idx) if idx != self.end_index else ""
                        for idx in predicted[t_id])
                    token.set_label(typename=label_name, value=predicted_lemma)

            # option 2: beam search
            else:
                output_vectors, hidden = self.decode(input_indices, hidden, all_encoder_outputs)

                log_softmax_probs = torch.nn.functional.log_softmax(output_vectors, dim=2).squeeze(1)
                # make sure no dummy symbol <> or start symbol <S> is predicted
                log_softmax_probs[:, self.dummy_index] = -inf
                log_softmax_probs[:, self.start_index] = -inf

                # pick the beam_size many outputs with the highest probabilities
                log_probabilities, leading_indices = log_softmax_probs.topk(self.beam_size, 1)
                # leading_indices and log_probabilities have size (batch_size, beam_size)

                # keep scores of beam_size many hypotheses for each token in the batch
                scores = log_probabilities.view(-1, 1)

                # stack all leading indices of all hypotheses and corresponding hidden states in two tensors;
                # this vector goes through the RNN in each iteration
                leading_indices = leading_indices.view(-1, 1)
                hidden_states_beam = torch.stack(self.beam_size * [hidden], dim=2).view(
                    self.rnn_layers, -1, self.rnn_hidden_size)

                # save sequences so far
                sequences = torch.tensor([[i.item()] for i in leading_indices], device=flair.device)

                # keep track of how many hypotheses were completed for each token (kept on CPU)
                n_completed = [0 for _ in range(number_tokens)]
                final_candidates = [[] for _ in range(number_tokens)]

                # if attention is used, expand the encoder outputs to beam size; otherwise keep this as None
                batched_encoding_output = torch.stack(self.beam_size * [all_encoder_outputs], dim=1).view(
                    self.beam_size * number_tokens, -1, self.rnn_hidden_size) if self.use_attention else None

                for j in range(1, max_length):

                    output_vectors, hidden_states_beam = self.decode(leading_indices, hidden_states_beam,
                                                                     batched_encoding_output)

                    # decode with log softmax
                    out_log_probs = torch.nn.functional.log_softmax(output_vectors, dim=2)
                    # make sure no dummy symbol <> or start symbol <S> is predicted
                    out_log_probs[:, 0, self.dummy_index] = -inf
                    out_log_probs[:, 0, self.start_index] = -inf
                    log_probabilities, index_candidates = out_log_probs.topk(self.beam_size, 2)
                    log_probabilities.squeeze_(1)
                    index_candidates.squeeze_(1)

                    # check if an end symbol <E> has been predicted and, in that case, set the hypothesis aside
                    end_symbols = (index_candidates == self.end_index).nonzero(as_tuple=False)
                    for tup in end_symbols:

                        # if the sequence has already ended, do not record it as a candidate
                        if sequences[tup[0], -1].item() == self.end_index:
                            continue

                        # index of token in the list tokens_in_batch
                        token_number = torch.div(tup[0], self.beam_size, rounding_mode='trunc')
                        seq = sequences[tup[0], :]  # hypothesis sequence
                        # hypothesis score, length-normalized
                        score = (scores[tup[0]] + log_probabilities[tup[0], tup[1]]) / (len(seq) + 1)

                        final_candidates[token_number].append((seq, score))
                        # TODO: remove token if the number of completed hypotheses exceeds a given value
                        n_completed[token_number] += 1

                        # set the score of the corresponding entry to -inf so it will not be expanded
                        log_probabilities[tup[0], tup[1]] = -inf

                    # get leading_indices for the next expansion:
                    # find the highest-scoring hypotheses among the beam_size*beam_size possible ones per token

                    # take beam_size many copies of the scores vector and add the scores of possible new extensions;
                    # size (beam_size*batch_size, beam_size)
                    hypothesis_scores = torch.cat(self.beam_size * [scores], dim=1) + log_probabilities

                    # reshape to size (batch_size, beam_size*beam_size); each row contains the
                    # beam_size*beam_size scores of the new possible hypotheses
                    hypothesis_scores_per_token = hypothesis_scores.view(number_tokens, self.beam_size ** 2)

                    # choose the beam_size best for each token - size (batch_size, beam_size)
                    best_scores, indices_per_token = hypothesis_scores_per_token.topk(self.beam_size, 1)

                    # from indices_per_token we now recompute the original indices of the hypotheses in a list of
                    # length beam_size*batch_size, where the first beam_size indices belong to the first token,
                    # the next beam_size to the second token, and so on
                    beam_numbers = []
                    seq_numbers = []

                    for i, row in enumerate(indices_per_token):
                        beam_numbers.extend(i * self.beam_size + index.item() // self.beam_size for index in row)
                        seq_numbers.extend(index.item() % self.beam_size for index in row)

                    # with these indices we can compute the tensors for the next iteration:
                    # expand sequences with the corresponding index
                    sequences = torch.cat(
                        (sequences[beam_numbers], index_candidates[beam_numbers, seq_numbers].unsqueeze(1)), dim=1)

                    # add log-probabilities to the scores
                    scores = scores[beam_numbers] + log_probabilities[beam_numbers, seq_numbers].unsqueeze(1)

                    # save new leading indices
                    leading_indices = index_candidates[beam_numbers, seq_numbers].unsqueeze(1)

                    # save corresponding hidden states
                    hidden_states_beam = hidden_states_beam[:, beam_numbers, :]

                # it may happen that no end symbol <E> is predicted for a token in all of the max_length iterations;
                # in that case we append one of the final sequences without an end symbol to final_candidates
                best_scores, indices = scores.view(number_tokens, -1).topk(1, 1)

                for j, (score, index) in enumerate(zip(best_scores.squeeze(1), indices.squeeze(1))):
                    if len(final_candidates[j]) == 0:
                        beam = j * self.beam_size + index.item()
                        final_candidates[j].append((sequences[beam, :], score / max_length))

                # get the best final hypothesis for each token
                output_sequences = []
                for l in final_candidates:
                    l_ordered = sorted(l, key=lambda tup: tup[1], reverse=True)
                    output_sequences.append(l_ordered[0])

                # get characters from the index sequences and add the predicted label to the token
                for i, seq in enumerate(output_sequences):
                    predicted_lemma = ''
                    for idx in seq[0]:
                        predicted_lemma += self.char_dictionary.get_item_for_index(idx)
                    line_to_print += predicted_lemma
                    line_to_print += ' '
                    tokens_in_batch[i].add_tag(tag_type=label_name, tag_value=predicted_lemma)

            if return_loss:
                overall_loss += self.forward_loss(batch)[0].item()

            store_embeddings(batch, storage_mode=embedding_storage_mode)

        if print_prediction:
            print(line_to_print)

        if return_loss:
            return overall_loss, number_tokens_in_total
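# Hedged usage sketch for the lemmatizer predict() above: `lemmatizer` is an
# assumption for an instance of the defining model; whether greedy or beam
# decoding runs is controlled by the model's beam_size attribute, not by an
# argument to predict().
sentence = Sentence('The quick brown foxes were jumping over the lazy dogs')
lemmatizer.predict(sentence, label_name='lemma', print_prediction=True)
for token in sentence:
    print(token.text, '->', token.get_labels('lemma')[0].value)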