def test_tagged_corpus_get_all_sentences():
    train_sentence = Sentence("I'm used in training.")
    dev_sentence = Sentence("I'm a dev sentence.")
    test_sentence = Sentence("I will be only used for testing.")

    corpus: Corpus = Corpus(
        FlairDatapointDataset([train_sentence]),
        FlairDatapointDataset([dev_sentence]),
        FlairDatapointDataset([test_sentence]),
    )

    all_sentences = corpus.get_all_sentences()

    assert 3 == len(all_sentences)
def test_tagged_corpus_downsample():
    sentence = Sentence("I love Berlin.", use_tokenizer=True).add_label("label", "class_1")

    corpus: Corpus = Corpus(
        FlairDatapointDataset([
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
        ]),
        sample_missing_splits=False,
    )

    assert 10 == len(corpus.train)

    corpus.downsample(percentage=0.3, downsample_dev=False, downsample_test=False)

    assert 3 == len(corpus.train)
def test_tagged_corpus_make_label_dictionary():
    sentence_1 = Sentence("sentence 1").add_label("label", "class_1")
    sentence_2 = Sentence("sentence 2").add_label("label", "class_2")
    sentence_3 = Sentence("sentence 3").add_label("label", "class_1")

    corpus: Corpus = Corpus(
        FlairDatapointDataset([sentence_1, sentence_2, sentence_3]),
        FlairDatapointDataset([]),
        FlairDatapointDataset([]),
    )

    label_dict = corpus.make_label_dictionary("label")

    assert 3 == len(label_dict)
    assert "<unk>" in label_dict.get_items()
    assert "class_1" in label_dict.get_items()
    assert "class_2" in label_dict.get_items()
def predict(
    self,
    sentences: Union[Sentence, List[Sentence]],
    mini_batch_size: int = 32,
    verbose: bool = False,
    label_name: Optional[str] = None,
    embedding_storage_mode="none",
) -> List[Sentence]:

    if label_name is None:
        label_name = self.label_name if self.label_name is not None else "label"

    with torch.no_grad():
        if not isinstance(sentences, list):
            sentences = [sentences]

        if not sentences:
            return sentences

        reordered_sentences = sorted(sentences, key=lambda s: len(s), reverse=True)

        if len(reordered_sentences) == 0:
            return sentences

        dataloader = DataLoader(
            dataset=FlairDatapointDataset(reordered_sentences),
            batch_size=mini_batch_size,
        )

        # progress bar for verbosity
        if verbose:
            progress_bar = tqdm(dataloader)
            progress_bar.set_description("Batch inference")
            dataloader = progress_bar

        for batch in dataloader:
            # stop if all sentences are empty
            if not batch:
                continue

            scores = self.forward(batch)

            for (sentence, score) in zip(batch, scores.tolist()):
                sentence.set_label(label_name, value=str(score[0]))

            # clearing token embeddings to save memory
            store_embeddings(batch, storage_mode=embedding_storage_mode)

        return sentences
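# Hedged usage sketch (added for illustration, not part of the original source): `regressor` is
# assumed to be an already trained model exposing the predict() method above. Note that the
# predicted score is stored as the string value of a label, under "label" unless a different
# label_name is configured.
def _example_regressor_predict_usage(regressor):
    sentence = Sentence("I love Berlin.")
    regressor.predict(sentence, mini_batch_size=32)
    for label in sentence.get_labels("label"):
        print(sentence.to_original_text(), label.value)
    return sentence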
def predict(
    self,
    sentences: Union[List[Sentence], Sentence],
    mini_batch_size: int = 32,
    num_workers: int = 8,
    print_tree: bool = False,
    embedding_storage_mode="none",
) -> None:
    """
    Predicts arcs and relation tags for the Dependency Parser task.
    :param sentences: a Sentence or a List of Sentences
    :param mini_batch_size: mini batch size to use
    :param num_workers: number of workers for the DataLoader
    :param print_tree: set to True to print the dependency tree of each sentence
    :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu'
        if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory
        respectively.
    """
    if not isinstance(sentences, list):
        sentences = [sentences]

    sentence_dataset = FlairDatapointDataset(sentences)
    data_loader = DataLoader(sentence_dataset, batch_size=mini_batch_size, num_workers=num_workers)

    for batch in data_loader:
        with torch.no_grad():
            score_arc, score_rel = self.forward(batch)
            arc_prediction, relation_prediction = self._obtain_labels_(score_arc, score_rel)

        for sentence_index, (sentence, sent_tags, sent_arcs) in enumerate(
            zip(batch, relation_prediction, arc_prediction)
        ):
            for token_index, (token, tag, head_id) in enumerate(
                zip(sentence.tokens, sent_tags, sent_arcs)
            ):
                token.add_tag(self.tag_type, tag, score_rel[sentence_index][token_index])
                token.head_id = int(head_id)

            if print_tree:
                tree_printer(sentence, self.tag_type)
                print("-" * 50)

        store_embeddings(batch, storage_mode=embedding_storage_mode)
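# Hedged usage sketch (added for illustration, not part of the original source): `parser` is
# assumed to be a trained dependency parser exposing the predict() method above. After predict(),
# each token carries a relation tag of type `parser.tag_type` and a `head_id` pointing to its head.
def _example_parser_predict_usage(parser):
    sentence = Sentence("I love Berlin.")
    parser.predict(sentence, print_tree=True)
    for token in sentence:
        print(token.text, token.head_id, token.get_tag(parser.tag_type).value)
    return sentence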
def test_tagged_corpus_make_vocab_dictionary():
    train_sentence = Sentence("used in training. training is cool.")

    corpus: Corpus = Corpus(FlairDatapointDataset([train_sentence]), sample_missing_splits=False)

    vocab = corpus.make_vocab_dictionary(max_tokens=2, min_freq=-1)

    assert 3 == len(vocab)
    assert "<unk>" in vocab.get_items()
    assert "training" in vocab.get_items()
    assert "." in vocab.get_items()

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=-1)

    assert 7 == len(vocab)

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=2)

    assert 3 == len(vocab)
    assert "<unk>" in vocab.get_items()
    assert "training" in vocab.get_items()
    assert "." in vocab.get_items()
def predict(
    self,
    sentences: Union[List[Sentence], Sentence],
    mini_batch_size: int = 32,
    return_probabilities_for_all_classes: bool = False,
    verbose: bool = False,
    label_name: Optional[str] = None,
    return_loss=False,
    embedding_storage_mode="none",
):
    """
    Predicts labels for the current batch with CRF or softmax.
    :param sentences: list of sentences in batch
    :param mini_batch_size: batch size for test data
    :param return_probabilities_for_all_classes: whether to return probabilities for all classes
    :param verbose: whether to use a progress bar
    :param label_name: which label to predict
    :param return_loss: whether to return the loss value
    :param embedding_storage_mode: determines where to store embeddings - can be "gpu", "cpu" or None.
    """
    if label_name is None:
        label_name = self.tag_type

    with torch.no_grad():
        if not sentences:
            return sentences

        # make sure it is a list
        if not isinstance(sentences, list) and not isinstance(sentences, flair.data.Dataset):
            sentences = [sentences]

        # filter empty sentences
        sentences = [sentence for sentence in sentences if len(sentence) > 0]

        # reverse sort all sequences by their length
        reordered_sentences = sorted(sentences, key=lambda s: len(s), reverse=True)

        if len(reordered_sentences) == 0:
            return sentences

        dataloader = DataLoader(
            dataset=FlairDatapointDataset(reordered_sentences),
            batch_size=mini_batch_size,
        )

        # progress bar for verbosity
        if verbose:
            dataloader = tqdm(dataloader, desc="Batch inference")

        overall_loss = torch.zeros(1, device=flair.device)
        batch_no = 0
        label_count = 0
        for batch in dataloader:
            batch_no += 1

            # stop if all sentences are empty
            if not batch:
                continue

            # get features from forward propagation
            features, gold_labels = self.forward(batch)

            # remove previously predicted labels of this type
            for sentence in batch:
                sentence.remove_labels(label_name)

            # if return_loss, get loss value
            if return_loss:
                loss = self._calculate_loss(features, gold_labels)
                overall_loss += loss[0]
                label_count += loss[1]

            # sort batch in the same way as forward propagation
            lengths = torch.LongTensor([len(sentence) for sentence in batch])
            _, sort_indices = lengths.sort(dim=0, descending=True)
            batch = [batch[i] for i in sort_indices]

            # make predictions
            if self.use_crf:
                predictions, all_tags = self.viterbi_decoder.decode(
                    features, return_probabilities_for_all_classes
                )
            else:
                predictions, all_tags = self._standard_inference(
                    features, batch, return_probabilities_for_all_classes
                )

            # add predictions to Sentence
            for sentence, sentence_predictions in zip(batch, predictions):

                # BIOES-labels need to be converted to spans
                if self.predict_spans:
                    sentence_tags = [label[0] for label in sentence_predictions]
                    sentence_scores = [label[1] for label in sentence_predictions]
                    predicted_spans = get_spans_from_bio(sentence_tags, sentence_scores)
                    for predicted_span in predicted_spans:
                        span: Span = sentence[predicted_span[0][0]: predicted_span[0][-1] + 1]
                        span.add_label(label_name, value=predicted_span[2], score=predicted_span[1])

                # token-labels can be added directly
                else:
                    for token, label in zip(sentence.tokens, sentence_predictions):
                        token.add_label(typename=label_name, value=label[0], score=label[1])

            # all_tags will be empty if all_tag_prob is set to False, so the for loop will be avoided
            for (sentence, sent_all_tags) in zip(batch, all_tags):
                for (token, token_all_tags) in zip(sentence.tokens, sent_all_tags):
                    token.add_tags_proba_dist(label_name, token_all_tags)

            store_embeddings(sentences, storage_mode=embedding_storage_mode)

        if return_loss:
            return overall_loss, label_count
def predict(
    self,
    sentences: Union[List[DT], DT],
    mini_batch_size: int = 32,
    return_probabilities_for_all_classes: bool = False,
    verbose: bool = False,
    label_name: Optional[str] = None,
    return_loss=False,
    embedding_storage_mode="none",
):
    """
    Predicts the class labels for the given sentences. The labels are directly added to the sentences.  # noqa: E501
    :param sentences: list of sentences
    :param mini_batch_size: mini batch size to use
    :param return_probabilities_for_all_classes: return probabilities for all classes instead of only best predicted  # noqa: E501
    :param verbose: set to True to display a progress bar
    :param return_loss: set to True to return loss
    :param label_name: set this to change the name of the label type that is predicted  # noqa: E501
    :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if you
        wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.  # noqa: E501
    """
    if label_name is None:
        label_name = self.label_type if self.label_type is not None else "label"

    with torch.no_grad():
        if not sentences:
            return sentences

        if not isinstance(sentences, list):
            sentences = [sentences]

        reordered_sentences = self._sort_data(sentences)

        if len(reordered_sentences) == 0:
            return sentences

        dataloader = DataLoader(
            dataset=FlairDatapointDataset(reordered_sentences),
            batch_size=mini_batch_size,
        )

        # progress bar for verbosity
        if verbose:
            progress_bar = tqdm(dataloader)
            progress_bar.set_description("Batch inference")
            dataloader = progress_bar

        overall_loss = 0
        label_count = 0
        for batch in dataloader:

            # stop if all sentences are empty
            if not batch:
                continue

            scores, gold_labels, data_points, label_candidates = self.forward_pass(
                batch, return_label_candidates=True  # type: ignore
            )

            # remove previously predicted labels of this type
            for sentence in data_points:
                sentence.remove_labels(label_name)

            if return_loss:
                overall_loss += self._calculate_loss(scores, gold_labels)[0]
                label_count += len(label_candidates)

            # if anything could possibly be predicted
            if len(label_candidates) > 0:

                if self.multi_label:
                    sigmoided = torch.sigmoid(scores)  # size: (n_sentences, n_classes)
                    n_labels = sigmoided.size(1)
                    for s_idx, (data_point, label_candidate) in enumerate(zip(data_points, label_candidates)):
                        for l_idx in range(n_labels):
                            label_value = self.label_dictionary.get_item_for_index(l_idx)
                            if label_value == "O":
                                continue
                            label_threshold = self._get_label_threshold(label_value)
                            label_score = sigmoided[s_idx, l_idx].item()
                            if label_score > label_threshold or return_probabilities_for_all_classes:
                                label = label_candidate.spawn(value=label_value, score=label_score)
                                data_point.add_complex_label(label_name, label)

                else:
                    softmax = torch.nn.functional.softmax(scores, dim=-1)

                    if return_probabilities_for_all_classes:
                        n_labels = softmax.size(1)
                        for s_idx, (data_point, label_candidate) in enumerate(zip(data_points, label_candidates)):
                            for l_idx in range(n_labels):
                                label_value = self.label_dictionary.get_item_for_index(l_idx)
                                if label_value == "O":
                                    continue
                                label_score = softmax[s_idx, l_idx].item()
                                label = label_candidate.spawn(value=label_value, score=label_score)
                                data_point.add_complex_label(label_name, label)
                    else:
                        conf, idx = torch.max(softmax, dim=-1)
                        for data_point, label_candidate, c, i in zip(data_points, label_candidates, conf, idx):
                            label_value = self.label_dictionary.get_item_for_index(i.item())
                            if label_value == "O":
                                continue
                            label = label_candidate.spawn(value=label_value, score=c.item())
                            data_point.add_complex_label(label_name, label)

            store_embeddings(batch, storage_mode=embedding_storage_mode)

        if return_loss:
            return overall_loss, label_count
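# Hedged usage sketch (added for illustration, not part of the original source): `classifier` is
# assumed to be a trained classifier exposing the predict() method above. Predicted labels are
# attached to the data points and can be read back via get_labels() under the model's label type.
def _example_classifier_predict_usage(classifier):
    sentences = [Sentence("I love Berlin."), Sentence("This is terrible.")]
    classifier.predict(sentences, mini_batch_size=32)
    for sentence in sentences:
        for label in sentence.get_labels(classifier.label_type):
            print(sentence.to_original_text(), label.value, label.score)
    return sentences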
def evaluate(
    self,
    data_points: Union[List[DT], Dataset],
    gold_label_type: str,
    out_path: Union[str, Path] = None,
    embedding_storage_mode: str = "none",
    mini_batch_size: int = 32,
    num_workers: Optional[int] = 8,
    main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"),
    exclude_labels: List[str] = [],
    gold_label_dictionary: Optional[Dictionary] = None,
    **kwargs,
) -> Result:
    import numpy as np
    import sklearn

    # read Dataset into data loader, if list of sentences passed, make Dataset first
    if not isinstance(data_points, Dataset):
        data_points = FlairDatapointDataset(data_points)
    data_loader = DataLoader(data_points, batch_size=mini_batch_size, num_workers=num_workers)

    with torch.no_grad():

        # loss calculation
        eval_loss = torch.zeros(1, device=flair.device)
        average_over = 0

        # variables for printing
        lines: List[str] = []

        # variables for computing scores
        all_spans: List[str] = []
        all_true_values = {}
        all_predicted_values = {}

        sentence_id = 0
        for batch in data_loader:

            # remove any previously predicted labels
            for datapoint in batch:
                datapoint.remove_labels("predicted")

            # predict for batch
            loss_and_count = self.predict(
                batch,
                embedding_storage_mode=embedding_storage_mode,
                mini_batch_size=mini_batch_size,
                label_name="predicted",
                return_loss=True,
            )

            if isinstance(loss_and_count, tuple):
                average_over += loss_and_count[1]
                eval_loss += loss_and_count[0]
            else:
                eval_loss += loss_and_count

            # get the gold labels
            for datapoint in batch:

                for gold_label in datapoint.get_labels(gold_label_type):
                    representation = str(sentence_id) + ": " + gold_label.identifier

                    value = gold_label.value
                    if gold_label_dictionary and gold_label_dictionary.get_idx_for_item(value) == 0:
                        value = "<unk>"

                    if representation not in all_true_values:
                        all_true_values[representation] = [value]
                    else:
                        all_true_values[representation].append(value)

                    if representation not in all_spans:
                        all_spans.append(representation)

                for predicted_span in datapoint.get_labels("predicted"):
                    representation = str(sentence_id) + ": " + predicted_span.identifier

                    # add to all_predicted_values
                    if representation not in all_predicted_values:
                        all_predicted_values[representation] = [predicted_span.value]
                    else:
                        all_predicted_values[representation].append(predicted_span.value)

                    if representation not in all_spans:
                        all_spans.append(representation)

                sentence_id += 1

            store_embeddings(batch, embedding_storage_mode)

            # make printout lines
            if out_path:
                lines.extend(self._print_predictions(batch, gold_label_type))

        # write all_predicted_values to out_file if set
        if out_path:
            with open(Path(out_path), "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        # make the evaluation dictionary
        evaluation_label_dictionary = Dictionary(add_unk=False)
        evaluation_label_dictionary.add_item("O")
        for true_values in all_true_values.values():
            for label in true_values:
                evaluation_label_dictionary.add_item(label)
        for predicted_values in all_predicted_values.values():
            for label in predicted_values:
                evaluation_label_dictionary.add_item(label)

        # finally, compute numbers
        y_true = []
        y_pred = []

        for span in all_spans:

            true_values = all_true_values[span] if span in all_true_values else ["O"]
            predicted_values = all_predicted_values[span] if span in all_predicted_values else ["O"]

            y_true_instance = np.zeros(len(evaluation_label_dictionary), dtype=int)
            for true_value in true_values:
                y_true_instance[evaluation_label_dictionary.get_idx_for_item(true_value)] = 1
            y_true.append(y_true_instance.tolist())

            y_pred_instance = np.zeros(len(evaluation_label_dictionary), dtype=int)
            for predicted_value in predicted_values:
                y_pred_instance[evaluation_label_dictionary.get_idx_for_item(predicted_value)] = 1
            y_pred.append(y_pred_instance.tolist())

        # now, calculate evaluation numbers
        target_names = []
        labels = []

        counter = Counter(itertools.chain.from_iterable(all_true_values.values()))
        counter.update(list(itertools.chain.from_iterable(all_predicted_values.values())))

        for label_name, count in counter.most_common():
            if label_name == "O":
                continue
            if label_name in exclude_labels:
                continue
            target_names.append(label_name)
            labels.append(evaluation_label_dictionary.get_idx_for_item(label_name))

        # there is at least one gold label or one prediction (default)
        if len(all_true_values) + len(all_predicted_values) > 1:
            classification_report = sklearn.metrics.classification_report(
                y_true,
                y_pred,
                digits=4,
                target_names=target_names,
                zero_division=0,
                labels=labels,
            )

            classification_report_dict = sklearn.metrics.classification_report(
                y_true,
                y_pred,
                target_names=target_names,
                zero_division=0,
                output_dict=True,
                labels=labels,
            )

            accuracy_score = round(sklearn.metrics.accuracy_score(y_true, y_pred), 4)

            precision_score = round(classification_report_dict["micro avg"]["precision"], 4)
            recall_score = round(classification_report_dict["micro avg"]["recall"], 4)
            micro_f_score = round(classification_report_dict["micro avg"]["f1-score"], 4)
            macro_f_score = round(classification_report_dict["macro avg"]["f1-score"], 4)

            main_score = classification_report_dict[main_evaluation_metric[0]][main_evaluation_metric[1]]

        else:
            # issue error and default all evaluation numbers to 0.
            log.error(
                "ACHTUNG! No gold labels and no all_predicted_values found! "
                "Could be an error in your corpus or how you "
                "initialize the trainer!"
            )
            accuracy_score = precision_score = recall_score = micro_f_score = macro_f_score = main_score = 0.0
            classification_report = ""
            classification_report_dict = {}

        detailed_result = (
            "\nResults:"
            f"\n- F-score (micro) {micro_f_score}"
            f"\n- F-score (macro) {macro_f_score}"
            f"\n- Accuracy {accuracy_score}"
            "\n\nBy class:\n" + classification_report
        )

        # line for log file
        log_header = "PRECISION\tRECALL\tF1\tACCURACY"
        log_line = f"{precision_score}\t{recall_score}\t{micro_f_score}\t{accuracy_score}"

        if average_over > 0:
            eval_loss /= average_over

        result = Result(
            main_score=main_score,
            log_line=log_line,
            log_header=log_header,
            detailed_results=detailed_result,
            classification_report=classification_report_dict,
            loss=eval_loss.item(),
        )

        return result
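# Hedged usage sketch (added for illustration, not part of the original source): `classifier` is
# assumed to be a trained model exposing the evaluate() method above and `corpus` a flair Corpus
# whose test split is annotated with the (placeholder) gold label type "label".
def _example_evaluate_usage(classifier, corpus):
    result = classifier.evaluate(
        corpus.test,
        gold_label_type="label",
        mini_batch_size=32,
    )
    print(result.main_score)        # value of the main evaluation metric (micro avg f1-score by default)
    print(result.detailed_results)  # per-class classification report
    print(result.loss)              # average evaluation loss
    return result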
def evaluate(
    self,
    data_points: Union[List[Sentence], Dataset],
    gold_label_type: str,
    out_path: Union[str, Path] = None,
    embedding_storage_mode: str = "none",
    mini_batch_size: int = 32,
    num_workers: Optional[int] = 8,
    **kwargs,
) -> Result:

    # read Dataset into data loader, if list of sentences passed, make Dataset first
    if not isinstance(data_points, Dataset):
        data_points = FlairDatapointDataset(data_points)
    data_loader = DataLoader(data_points, batch_size=mini_batch_size, num_workers=num_workers)

    with torch.no_grad():
        eval_loss = torch.zeros(1, device=flair.device)

        metric = MetricRegression("Evaluation")

        lines: List[str] = []
        total_count = 0
        for batch_nr, batch in enumerate(data_loader):

            if isinstance(batch, Sentence):
                batch = [batch]

            scores, loss = self.forward_labels_and_loss(batch)

            true_values = []
            for sentence in batch:
                total_count += 1
                for label in sentence.get_labels(gold_label_type):
                    true_values.append(float(label.value))

            results = []
            for score in scores:
                results.append(score[0])

            eval_loss += loss

            metric.true.extend(true_values)
            metric.pred.extend(results)

            for sentence, prediction, true_value in zip(batch, results, true_values):
                eval_line = "{}\t{}\t{}\n".format(sentence.to_original_text(), true_value, prediction)
                lines.append(eval_line)

            store_embeddings(batch, embedding_storage_mode)

        eval_loss /= total_count

        # TODO: not saving lines yet
        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        log_line = f"{metric.mean_squared_error()}\t{metric.spearmanr()}\t{metric.pearsonr()}"
        log_header = "MSE\tSPEARMAN\tPEARSON"

        detailed_result = (
            f"AVG: mse: {metric.mean_squared_error():.4f} - "
            f"mae: {metric.mean_absolute_error():.4f} - "
            f"pearson: {metric.pearsonr():.4f} - "
            f"spearman: {metric.spearmanr():.4f}"
        )

        result: Result = Result(
            main_score=metric.pearsonr(),
            loss=eval_loss.item(),
            log_header=log_header,
            log_line=log_line,
            detailed_results=detailed_result,
        )

        return result
def evaluate(
    self,
    data_points: Union[List[DataPoint], Dataset],
    gold_label_type: str,
    out_path: Union[str, Path] = None,
    embedding_storage_mode: str = "none",
    mini_batch_size: int = 32,
    num_workers: Optional[int] = 8,
    main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"),
    exclude_labels: List[str] = [],
    gold_label_dictionary: Optional[Dictionary] = None,
    **kwargs,
) -> Result:

    if not isinstance(data_points, Dataset):
        data_points = FlairDatapointDataset(data_points)
    data_loader = DataLoader(data_points, batch_size=mini_batch_size, num_workers=num_workers)

    lines: List[str] = ["token gold_tag gold_arc predicted_tag predicted_arc\n"]

    average_over = 0
    eval_loss_arc = 0.0
    eval_loss_rel = 0.0

    y_true = []
    y_pred = []

    parsing_metric = ParsingMetric()

    for batch in data_loader:
        average_over += 1
        with torch.no_grad():
            score_arc, score_rel = self.forward(batch)
            loss_arc, loss_rel = self._calculate_loss(score_arc, score_rel, batch)
            arc_prediction, relation_prediction = self._obtain_labels_(score_arc, score_rel)

        parsing_metric(arc_prediction, relation_prediction, batch, gold_label_type)

        eval_loss_arc += loss_arc.item()
        eval_loss_rel += loss_rel.item()

        for (sentence, arcs, sent_tags) in zip(batch, arc_prediction, relation_prediction):
            for (token, arc, tag) in zip(sentence.tokens, arcs, sent_tags):
                token.add_tag_label("predicted", Label(tag))
                token.add_tag_label("predicted_head_id", Label(str(int(arc))))

                # append both to file for evaluation
                eval_line = "{} {} {} {} {}\n".format(
                    token.text,
                    token.get_tag(gold_label_type).value,
                    str(token.head_id),
                    tag,
                    str(int(arc)),
                )
                lines.append(eval_line)
            lines.append("\n")

        for sentence in batch:
            gold_tags = [token.get_tag(gold_label_type).value for token in sentence.tokens]
            predicted_tags = [tag.tag for tag in sentence.get_spans("predicted")]

            y_pred += [self.relations_dictionary.get_idx_for_item(tag) for tag in predicted_tags]
            y_true += [self.relations_dictionary.get_idx_for_item(tag) for tag in gold_tags]

        store_embeddings(batch, embedding_storage_mode)

    eval_loss_arc /= average_over
    eval_loss_rel /= average_over

    if out_path is not None:
        with open(out_path, "w", encoding="utf-8") as outfile:
            outfile.write("".join(lines))

    classification_report_dict = sklearn.metrics.classification_report(
        y_true,
        y_pred,
        target_names=self.relations_dictionary.idx2item,
        zero_division=0,
        output_dict=True,
        labels=range(len(self.relations_dictionary)),
    )

    accuracy_score = round(sklearn.metrics.accuracy_score(y_true, y_pred), 4)
    precision_score = round(classification_report_dict["micro avg"]["precision"], 4)
    recall_score = round(classification_report_dict["micro avg"]["recall"], 4)
    micro_f_score = round(classification_report_dict["micro avg"]["f1-score"], 4)
    macro_f_score = round(classification_report_dict["macro avg"]["f1-score"], 4)

    main_score = classification_report_dict[main_evaluation_metric[0]][main_evaluation_metric[1]]

    detailed_result = (
        f"\nUAS : {parsing_metric.get_uas():.4f} - LAS : {parsing_metric.get_las():.4f}"
        f"\neval loss rel : {eval_loss_rel:.4f} - eval loss arc : {eval_loss_arc:.4f}"
        f"\nF-Score: micro : {micro_f_score} - macro : {macro_f_score}"
        f"\nAccuracy: {accuracy_score} - Precision {precision_score} - Recall {recall_score}"
    )

    log_header = "PRECISION\tRECALL\tF1\tACCURACY"
    log_line = f"{precision_score}\t{recall_score}\t{micro_f_score}\t{accuracy_score}"

    result = Result(
        main_score=main_score,
        log_line=log_line,
        log_header=log_header,
        detailed_results=detailed_result,
        classification_report=classification_report_dict,
        loss=(eval_loss_rel + eval_loss_arc),
    )

    return result
def predict(
    self,
    sentences: Union[List[Sentence], Sentence],
    mini_batch_size: int = 16,
    return_probabilities_for_all_classes: bool = False,
    verbose: bool = False,
    label_name="predicted",
    return_loss=False,
    embedding_storage_mode="none",
):
    """
    Predicts lemmas of words for a given (list of) sentence(s).
    :param sentences: sentences to predict
    :param label_name: label name used for predicted lemmas
    :param mini_batch_size: number of tokens that are sent through the RNN simultaneously,
        assuming batching_in_rnn is set to True
    :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu'
        if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory
        respectively.
    :param return_loss: whether or not to compute and return loss. Setting it to True only makes sense
        if labels are provided
    :param verbose: if True, lemmatized sentences will be printed in the console
    """
    if isinstance(sentences, Sentence):
        sentences = [sentences]

    # filter empty sentences
    sentences = [sentence for sentence in sentences if len(sentence) > 0]
    if len(sentences) == 0:
        return sentences

    # max length of the predicted sequences
    if not self.dependent_on_input:
        max_length = self.max_sequence_length
    else:
        max_length = max([len(token.text) + 1 for sentence in sentences for token in sentence])

    # for printing
    line_to_print = ""

    overall_loss = 0.0
    number_tokens_in_total = 0

    with torch.no_grad():

        dataloader = DataLoader(dataset=FlairDatapointDataset(sentences), batch_size=mini_batch_size)

        for batch in dataloader:
            # stop if all sentences are empty
            if not batch:
                continue

            # remove previously predicted labels of this type
            for sentence in batch:
                for token in sentence:
                    token.remove_labels(label_name)

            # create list of tokens in batch
            tokens_in_batch = [token for sentence in batch for token in sentence]
            number_tokens = len(tokens_in_batch)
            number_tokens_in_total += number_tokens

            # encode inputs
            hidden, all_encoder_outputs = self.encode(batch)

            # create input for first pass (batch_size, 1, input_size), first letter is special character <S>
            # sequence length is always set to one in prediction
            input_indices = self.start_index * torch.ones(
                number_tokens, dtype=torch.long, device=flair.device
            ).unsqueeze(1)

            # option 1: greedy decoding
            if self.beam_size == 1:

                # predictions
                predicted: List[List[int]] = [[] for _ in range(number_tokens)]

                for decode_step in range(max_length):

                    # decode next character
                    output_vectors, hidden = self.decode(input_indices, hidden, all_encoder_outputs)

                    log_softmax_probs = torch.nn.functional.log_softmax(output_vectors, dim=2)

                    # greedily pick the output with the highest probability
                    input_indices = log_softmax_probs.argmax(dim=2)

                    for i in range(number_tokens):
                        if len(predicted[i]) > 0 and predicted[i][-1] == self.end_index:
                            continue
                        predicted[i].append(input_indices[i].item())

                for t_id, token in enumerate(tokens_in_batch):
                    predicted_lemma = "".join(
                        self.char_dictionary.get_item_for_index(idx) if idx != self.end_index else ""
                        for idx in predicted[t_id]
                    )
                    token.set_label(typename=label_name, value=predicted_lemma)

            # option 2: beam search
            else:
                output_vectors, hidden = self.decode(input_indices, hidden, all_encoder_outputs)

                # out_probs = self.softmax(output_vectors).squeeze(1)
                log_softmax_probs = torch.nn.functional.log_softmax(output_vectors, dim=2).squeeze(1)

                # make sure no dummy symbol <> or start symbol <S> is predicted
                log_softmax_probs[:, self.dummy_index] = -inf
                log_softmax_probs[:, self.start_index] = -inf

                # pick top beam size many outputs with highest probabilities
                # probabilities, leading_indices = out_probs.topk(self.beam_size, 1)  # max prob along dimension 1
                log_probabilities, leading_indices = log_softmax_probs.topk(self.beam_size, 1)
                # leading_indices and probabilities have size (batch_size, beam_size)

                # keep scores of beam_size many hypothesis for each token in the batch
                scores = log_probabilities.view(-1, 1)

                # stack all leading indices of all hypothesis and corresponding hidden states in two tensors
                leading_indices = leading_indices.view(-1, 1)  # this vector goes through the RNN in each iteration
                hidden_states_beam = torch.stack(self.beam_size * [hidden], dim=2).view(
                    self.rnn_layers, -1, self.rnn_hidden_size
                )

                # save sequences so far
                sequences = torch.tensor([[i.item()] for i in leading_indices], device=flair.device)

                # keep track of how many hypothesis were completed for each token
                n_completed = [0 for _ in range(number_tokens)]  # cpu
                final_candidates: List[List[Tuple[torch.Tensor, float]]] = [
                    [] for _ in range(number_tokens)
                ]  # cpu

                # if all_encoder_outputs returned, expand them to beam size (otherwise keep this as None)
                batched_encoding_output = (
                    torch.stack(self.beam_size * [all_encoder_outputs], dim=1).view(
                        self.beam_size * number_tokens, -1, self.rnn_hidden_size
                    )
                    if self.use_attention
                    else None
                )

                for j in range(1, max_length):

                    output_vectors, hidden_states_beam = self.decode(
                        leading_indices, hidden_states_beam, batched_encoding_output
                    )

                    # decode with log softmax
                    out_log_probs = torch.nn.functional.log_softmax(output_vectors, dim=2)
                    # make sure no dummy symbol <> or start symbol <S> is predicted
                    out_log_probs[:, 0, self.dummy_index] = -inf
                    out_log_probs[:, 0, self.start_index] = -inf
                    log_probabilities, index_candidates = out_log_probs.topk(self.beam_size, 2)
                    log_probabilities.squeeze_(1)
                    index_candidates.squeeze_(1)

                    # check if an end symbol <E> has been predicted and, in that case, set hypothesis aside
                    end_symbols = (index_candidates == self.end_index).nonzero(as_tuple=False)
                    for tuple in end_symbols:

                        # if the sequence is already ended, do not record as candidate
                        if sequences[tuple[0], -1].item() == self.end_index:
                            continue

                        # index of token in the list tokens_in_batch
                        token_number = torch.div(tuple[0], self.beam_size, rounding_mode="trunc")
                        # print(token_number)
                        seq = sequences[tuple[0], :]  # hypothesis sequence
                        # hypothesis score
                        score = (scores[tuple[0]] + log_probabilities[tuple[0], tuple[1]]) / (len(seq) + 1)

                        final_candidates[token_number].append((seq, score.item()))
                        # TODO: remove token if number of completed hypothesis exceeds given value
                        n_completed[token_number] += 1

                        # set score of corresponding entry to -inf so it will not be expanded
                        log_probabilities[tuple[0], tuple[1]] = -inf

                    # get leading_indices for next expansion
                    # find highest scoring hypothesis among beam_size*beam_size possible ones for each token

                    # take beam_size many copies of scores vector and add scores of possible new extensions
                    # size (beam_size*batch_size, beam_size)
                    hypothesis_scores = torch.cat(self.beam_size * [scores], dim=1) + log_probabilities
                    # print(hypothesis_scores)

                    # reshape to vector of size (batch_size, beam_size*beam_size),
                    # each row contains beam_size*beam_size scores of the new possible hypothesis
                    hypothesis_scores_per_token = hypothesis_scores.view(number_tokens, self.beam_size**2)
                    # print(hypothesis_scores_per_token)

                    # choose beam_size best for each token - size (batch_size, beam_size)
                    (
                        best_scores,
                        indices_per_token,
                    ) = hypothesis_scores_per_token.topk(self.beam_size, 1)

                    # out of indices_per_token we now need to recompute the original indices of the hypothesis in
                    # a list of length beam_size*batch_size
                    # where the first three indices belong to the first token, the next three to the second token,
                    # and so on
                    beam_numbers: List[int] = []
                    seq_numbers: List[int] = []

                    for i, row in enumerate(indices_per_token):
                        beam_numbers.extend(i * self.beam_size + index.item() // self.beam_size for index in row)
                        seq_numbers.extend(index.item() % self.beam_size for index in row)

                    # with these indices we can compute the tensors for the next iteration

                    # expand sequences with corresponding index
                    sequences = torch.cat(
                        (
                            sequences[beam_numbers],
                            index_candidates[beam_numbers, seq_numbers].unsqueeze(1),
                        ),
                        dim=1,
                    )

                    # add log-probabilities to the scores
                    scores = scores[beam_numbers] + log_probabilities[beam_numbers, seq_numbers].unsqueeze(1)

                    # save new leading indices
                    leading_indices = index_candidates[beam_numbers, seq_numbers].unsqueeze(1)

                    # save corresponding hidden states
                    hidden_states_beam = hidden_states_beam[:, beam_numbers, :]

                # it may happen that no end symbol <E> is predicted for a token in all of the max_length iterations
                # in that case we append one of the final sequences without end symbol to the final_candidates
                best_scores, indices = scores.view(number_tokens, -1).topk(1, 1)

                for j, (score, index) in enumerate(zip(best_scores.squeeze(1), indices.squeeze(1))):
                    if len(final_candidates[j]) == 0:
                        beam = j * self.beam_size + index.item()
                        final_candidates[j].append((sequences[beam, :], score.item() / max_length))

                # get best final hypothesis for each token
                output_sequences = []
                for candidate in final_candidates:
                    l_ordered = sorted(candidate, key=lambda tup: tup[1], reverse=True)
                    output_sequences.append(l_ordered[0])

                # get characters from index sequences and add predicted label to token
                for i, out_seq in enumerate(output_sequences):
                    predicted_lemma = ""
                    for idx in out_seq[0]:
                        predicted_lemma += self.char_dictionary.get_item_for_index(idx)
                    line_to_print += predicted_lemma
                    line_to_print += " "
                    tokens_in_batch[i].add_tag(tag_type=label_name, tag_value=predicted_lemma)

            if return_loss:
                overall_loss += self.forward_loss(batch)[0].item()

            store_embeddings(batch, storage_mode=embedding_storage_mode)

        if verbose:
            log.info(line_to_print)

        if return_loss:
            return overall_loss, number_tokens_in_total
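# Hedged usage sketch (added for illustration, not part of the original source): `lemmatizer` is
# assumed to be a trained model exposing the predict() method above. Predicted lemmas are written
# as token-level annotations under the given label_name; the exact accessor (get_labels vs. get_tag)
# may depend on the flair version.
def _example_lemmatizer_predict_usage(lemmatizer):
    sentence = Sentence("She was reading the books.")
    lemmatizer.predict(sentence, label_name="lemma", verbose=True)
    for token in sentence:
        for label in token.get_labels("lemma"):
            print(token.text, "->", label.value)
    return sentence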
def evaluate(
    self,
    data_points: Union[List[DataPair[DT, DT2]], Dataset],
    gold_label_type: str,
    out_path: Union[str, Path] = None,
    embedding_storage_mode="none",
    mini_batch_size=32,
    num_workers: Optional[int] = 8,
    **kwargs,
) -> Result:
    # assumes that for each data pair there's at least one embedding per modality

    if not isinstance(data_points, Dataset):
        data_points = FlairDatapointDataset(data_points)
    data_loader = DataLoader(data_points, batch_size=mini_batch_size, num_workers=num_workers)

    with torch.no_grad():
        # pre-compute embeddings for all targets in evaluation dataset
        target_index: Dict[str, int] = {}
        all_target_embeddings_list = []
        for batch in data_loader:
            target_inputs = []
            for data_point in batch:
                if str(data_point.second) not in target_index:
                    target_index[str(data_point.second)] = len(target_index)
                    target_inputs.append(data_point)
            if target_inputs:
                all_target_embeddings_list.append(self._embed_target(target_inputs).to(self.eval_device))
            store_embeddings(data_points, embedding_storage_mode)
        all_target_embeddings = torch.cat(all_target_embeddings_list, dim=0)  # [n0, d0]
        assert len(target_index) == all_target_embeddings.shape[0]

        ranks = []
        for batch in data_loader:
            batch_embeddings = self._embed_source(batch)

            batch_source_embeddings = batch_embeddings.to(self.eval_device)

            # compute the similarity
            batch_similarity_matrix = self.similarity_measure.forward(
                [batch_source_embeddings, all_target_embeddings]
            )

            # sort the similarity matrix across modality 1
            batch_modality_1_argsort = torch.argsort(batch_similarity_matrix, descending=True, dim=1)

            # get the ranks, so +1 to start counting ranks from 1
            batch_modality_1_ranks = torch.argsort(batch_modality_1_argsort, dim=1) + 1

            batch_target_indices = [target_index[str(data_point.second)] for data_point in batch]

            batch_gt_ranks = batch_modality_1_ranks[
                torch.arange(batch_similarity_matrix.shape[0]),
                torch.tensor(batch_target_indices),
            ]
            ranks.extend(batch_gt_ranks.tolist())

            store_embeddings(data_points, embedding_storage_mode)

    ranks_arr = np.array(ranks)
    median_rank = np.median(ranks_arr)
    recall_at = {k: np.mean(ranks_arr <= k) for k in self.recall_at_points}

    results_header = ["Median rank"] + ["Recall@top" + str(r) for r in self.recall_at_points]
    results_header_str = "\t".join(results_header)
    epoch_results = [str(median_rank)] + [str(recall_at[k]) for k in self.recall_at_points]
    epoch_results_str = "\t".join(epoch_results)
    detailed_results = ", ".join([f"{h}={v}" for h, v in zip(results_header, epoch_results)])

    validated_measure = sum(
        [recall_at[r] * w for r, w in zip(self.recall_at_points, self.recall_at_points_weights)]
    )

    return Result(
        validated_measure,
        results_header_str,
        epoch_results_str,
        detailed_results,
        loss=0.0,
    )
def predict(
    self,
    sentences: Union[List[Sentence], Sentence],
    mini_batch_size=32,
    return_probabilities_for_all_classes: bool = False,
    verbose: bool = False,
    label_name: Optional[str] = None,
    return_loss=False,
    embedding_storage_mode="none",
    label_threshold: float = 0.5,
    multi_label: Optional[bool] = None,
):
    """
    Predicts labels for the given sentence(s) using the current TARS task.
    :param sentences: a Sentence or a List of Sentences
    :param mini_batch_size: size of the minibatch; a bigger size is usually faster but consumes more memory,
        up to a point where it has no further effect
    :param return_probabilities_for_all_classes: whether to return probabilities for all classes
    :param verbose: set to True to display a progress bar
    :param return_loss: set to True to return loss
    :param label_name: set this to change the name of the label type that is predicted
    :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
        you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
    :param label_threshold: confidence threshold above which a matched label is added to the sentence
    :param multi_label: set to True to allow multiple labels per sentence; defaults to the setting of the
        current task
    """
    if label_name is None:
        label_name = self.get_current_label_type()

    if multi_label is None:
        multi_label = self.is_current_task_multi_label()

    # with torch.no_grad():
    if not sentences:
        return sentences

    if isinstance(sentences, Sentence):
        sentences = [sentences]

    # set context if not set already
    previous_sentence = None
    for sentence in sentences:
        if sentence.is_context_set():
            continue
        sentence._previous_sentence = previous_sentence
        sentence._next_sentence = None
        if previous_sentence:
            previous_sentence._next_sentence = sentence
        previous_sentence = sentence

    reordered_sentences = sorted(sentences, key=lambda s: len(s), reverse=True)

    dataloader = DataLoader(
        dataset=FlairDatapointDataset(reordered_sentences),
        batch_size=mini_batch_size,
    )

    # progress bar for verbosity
    if verbose:
        progressbar = tqdm(dataloader)
        progressbar.set_description("Batch inference")
        dataloader = progressbar

    overall_loss = 0
    overall_count = 0
    batch_no = 0
    with torch.no_grad():
        for batch in dataloader:
            batch_no += 1

            batch = self._filter_empty_sentences(batch)
            # stop if all sentences are empty
            if not batch:
                continue

            # go through each sentence in the batch
            for sentence in batch:

                # always remove tags first
                sentence.remove_labels(label_name)

                all_labels = [
                    label.decode("utf-8")
                    for label in self.get_current_label_dictionary().idx2item
                ]

                best_label = None
                for label in all_labels:
                    tars_sentence = self._get_tars_formatted_sentence(label, sentence)

                    loss_and_count = self.tars_model.predict(
                        tars_sentence,
                        label_name=label_name,
                        return_loss=True,
                        return_probabilities_for_all_classes=True if label_threshold < 0.5 else False,
                    )

                    overall_loss += loss_and_count[0].item()
                    overall_count += loss_and_count[1]

                    # add all labels that according to TARS match the text and are above threshold
                    for predicted_tars_label in tars_sentence.get_labels(label_name):
                        if (
                            predicted_tars_label.value == self.LABEL_MATCH
                            and predicted_tars_label.score > label_threshold
                        ):
                            # do not add labels below confidence threshold
                            sentence.add_label(label_name, label, predicted_tars_label.score)

                # only use label with highest confidence if enforcing single-label predictions
                if not multi_label:
                    if len(sentence.get_labels(label_name)) > 0:
                        # get all label scores and do an argmax to get the best label
                        label_scores = torch.tensor(
                            [label.score for label in sentence.get_labels(label_name)],
                            dtype=torch.float,
                        )
                        best_label = sentence.get_labels(label_name)[torch.argmax(label_scores)]

                        # remove previously added labels and only add the best label
                        sentence.remove_labels(label_name)
                        sentence.add_label(
                            typename=label_name,
                            value=best_label.value,
                            score=best_label.score,
                        )

            # clearing token embeddings to save memory
            store_embeddings(batch, storage_mode=embedding_storage_mode)

    if return_loss:
        return overall_loss, overall_count
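# Hedged usage sketch (added for illustration, not part of the original source): `tars` is assumed
# to be a TARS-style classifier exposing the predict() method above, with a zero-shot task and its
# candidate labels already configured elsewhere.
def _example_tars_classification_usage(tars):
    sentence = Sentence("The food was cold and the service was slow.")
    tars.predict(sentence, label_threshold=0.5, multi_label=False)
    for label in sentence.get_labels(tars.get_current_label_type()):
        print(label.value, label.score)
    return sentence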
def predict(
    self,
    sentences: Union[List[Sentence], Sentence],
    mini_batch_size=32,
    return_probabilities_for_all_classes: bool = False,
    verbose: bool = False,
    label_name: Optional[str] = None,
    return_loss=False,
    embedding_storage_mode="none",
    most_probable_first: bool = True,
):
    """
    Predicts sequence tags for the Named Entity Recognition task.
    :param sentences: a Sentence or a List of Sentences
    :param mini_batch_size: size of the minibatch; a bigger size is usually faster but consumes more memory,
        up to a point where it has no further effect
    :param return_probabilities_for_all_classes: whether to return probabilities for all classes
    :param verbose: set to True to display a progress bar
    :param return_loss: set to True to return loss
    :param label_name: set this to change the name of the label type that is predicted
    :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
        you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
    :param most_probable_first: if True, higher-scoring spans are annotated first and take precedence over
        overlapping lower-scoring spans
    """
    if label_name is None:
        label_name = self.get_current_label_type()

    # with torch.no_grad():
    if not sentences:
        return sentences

    if not isinstance(sentences, list):
        sentences = [sentences]

    reordered_sentences = sorted(sentences, key=lambda s: len(s), reverse=True)

    dataloader = DataLoader(
        dataset=FlairDatapointDataset(reordered_sentences),
        batch_size=mini_batch_size,
    )

    # progress bar for verbosity
    if verbose:
        dataloader = tqdm(dataloader)

    overall_loss = 0
    overall_count = 0
    with torch.no_grad():
        for batch in dataloader:

            batch = self._filter_empty_sentences(batch)
            # stop if all sentences are empty
            if not batch:
                continue

            # go through each sentence in the batch
            for sentence in batch:

                # always remove tags first
                for token in sentence:
                    token.remove_labels(label_name)

                all_labels = [
                    label.decode("utf-8")
                    for label in self.get_current_label_dictionary().idx2item
                ]

                all_detected = {}
                for label in all_labels:
                    tars_sentence = self._get_tars_formatted_sentence(label, sentence)

                    label_length = 0 if not self.prefix else len(label.split(" ")) + len(self.separator.split(" "))

                    loss_and_count = self.tars_model.predict(
                        tars_sentence,
                        label_name=label_name,
                        return_loss=True,
                    )
                    overall_loss += loss_and_count[0].item()
                    overall_count += loss_and_count[1]

                    for span in tars_sentence.get_spans(label_name):
                        span.set_label("tars_temp_label", label)
                        all_detected[span] = span.score

                    if not most_probable_first:
                        for span in tars_sentence.get_spans(label_name):
                            for token in span:
                                corresponding_token = sentence.get_token(token.idx - label_length)
                                if corresponding_token is None:
                                    continue
                                if (
                                    corresponding_token.get_tag(label_name).value != ""
                                    and corresponding_token.get_tag(label_name).score
                                    > token.get_tag(label_name).score
                                ):
                                    continue
                                corresponding_token.add_tag(
                                    label_name,
                                    token.get_tag(label_name).value + label,
                                    token.get_tag(label_name).score,
                                )

                if most_probable_first:
                    import operator

                    sorted_x = sorted(all_detected.items(), key=operator.itemgetter(1))
                    sorted_x.reverse()
                    for tuple in sorted_x:
                        # get the span and its label
                        span = tuple[0]
                        label = span.get_labels("tars_temp_label")[0].value
                        label_length = 0 if not self.prefix else len(label.split(" ")) + len(self.separator.split(" "))

                        # determine whether tokens in this span already have a label
                        tag_this = True
                        for token in span:
                            corresponding_token = sentence.get_token(token.idx - label_length)
                            if corresponding_token is None:
                                tag_this = False
                                continue
                            if (
                                corresponding_token.get_tag(label_name).value != ""
                                and corresponding_token.get_tag(label_name).score
                                > token.get_tag(label_name).score
                            ):
                                tag_this = False
                                continue

                        # only add if all tokens have no label
                        if tag_this:
                            for token in span:
                                corresponding_token = sentence.get_token(token.idx - label_length)
                                corresponding_token.add_tag(
                                    label_name,
                                    token.get_tag(label_name).value + label,
                                    token.get_tag(label_name).score,
                                )

            # clearing token embeddings to save memory
            store_embeddings(batch, storage_mode=embedding_storage_mode)

    if return_loss:
        return overall_loss, overall_count
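# Hedged usage sketch (added for illustration, not part of the original source): `tars_tagger` is
# assumed to be a TARS-style sequence tagger exposing the predict() method above, with a zero-shot
# NER task already set. With most_probable_first=True, higher-scoring spans are written first.
def _example_tars_tagging_usage(tars_tagger):
    sentence = Sentence("George Washington went to Washington.")
    tars_tagger.predict(sentence, most_probable_first=True)
    for token in sentence:
        tag = token.get_tag(tars_tagger.get_current_label_type())
        if tag.value != "":
            print(token.text, tag.value, tag.score)
    return sentence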