# Imports assumed from the surrounding module (flair 0.3-era layout).
import os
import re
from subprocess import run, PIPE
from typing import List, Tuple

import torch

from flair.data import Label, Sentence, Token
from flair.file_utils import cached_path
from flair.training_utils import Metric, clear_embeddings


def evaluate(self,
             sentences: List[Sentence],
             eval_class_metrics: bool = False,
             mini_batch_size: int = 32,
             embeddings_in_memory: bool = False,
             metric_name: str = 'MICRO_AVG') -> Tuple[Metric, float]:
    """
    Evaluates the model with the given list of sentences.
    :param sentences: the list of sentences
    :param eval_class_metrics: whether to additionally track per-class metrics
    :param mini_batch_size: the mini batch size to use
    :param embeddings_in_memory: whether to keep embeddings in memory between batches
    :param metric_name: the name of the metric to compute
    :return: the evaluation metric and the average loss per sentence
    """
    with torch.no_grad():
        eval_loss = 0

        batches = [sentences[x:x + mini_batch_size]
                   for x in range(0, len(sentences), mini_batch_size)]

        metric = Metric(metric_name)

        for batch in batches:
            scores = self.model.forward(batch)
            labels = self.model.obtain_labels(scores)
            loss = self.model.calculate_loss(scores, batch)

            clear_embeddings(batch, also_clear_word_embeddings=not embeddings_in_memory)

            eval_loss += loss.item()  # accumulate the loss as a Python float

            # compare predicted label strings against the gold label names
            for predictions, true_values in zip(
                    [[label.value for label in sent_labels] for sent_labels in labels],
                    [sentence.get_label_names() for sentence in batch]):
                for prediction in predictions:
                    if prediction in true_values:
                        metric.tp()
                        if eval_class_metrics:
                            metric.tp(prediction)
                    else:
                        metric.fp()
                        if eval_class_metrics:
                            metric.fp(prediction)

                for true_value in true_values:
                    if true_value not in predictions:
                        metric.fn()
                        if eval_class_metrics:
                            metric.fn(true_value)
                    else:
                        metric.tn()
                        if eval_class_metrics:
                            metric.tn(true_value)

        eval_loss /= len(sentences)

        return metric, eval_loss
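# A minimal, self-contained sketch of the micro-average tp/fp/fn tally that
# evaluate() applies above, run on toy multi-label data. The Counter-based
# tally and all names here are illustrative; this is not flair's Metric class.
from collections import Counter

def micro_counts(predicted: list, gold: list) -> Counter:
    """Tally tp/fp/fn per example, mirroring the loops in evaluate()."""
    counts = Counter()
    for preds, truths in zip(predicted, gold):
        for p in preds:
            counts['tp' if p in truths else 'fp'] += 1
        for t in truths:
            if t not in preds:
                counts['fn'] += 1
    return counts

# toy data: two "sentences" with multi-label predictions
c = micro_counts([['sports', 'news'], ['weather']],
                 [['sports'], ['weather', 'news']])
precision = c['tp'] / (c['tp'] + c['fp'])  # 2 / 3
recall = c['tp'] / (c['tp'] + c['fn'])     # 2 / 3
print(c, precision, recall)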
def evaluate(self, evaluation: List[Sentence], out_path=None,
             evaluation_method: str = 'F1',
             embeddings_in_memory: bool = True):
    """
    Evaluates a sequence tagger sentence by sentence, optionally writing a
    test.tsv with gold and predicted tags, and returns a score, the false
    positive count and a printable result string.
    """
    batch_no: int = 0
    mini_batch_size = 32
    batches = [evaluation[x:x + mini_batch_size]
               for x in range(0, len(evaluation), mini_batch_size)]

    metric = Metric('')

    lines: List[str] = []

    for batch in batches:
        batch_no += 1

        self.model.embeddings.embed(batch)

        for sentence in batch:
            sentence: Sentence = sentence

            # run the forward pass to get scores and the predicted tag sequence
            score, tag_seq = self.model.predict_scores(sentence)

            # compute predictions token by token
            predicted_id = tag_seq
            for (token, pred_id) in zip(sentence.tokens, predicted_id):
                token: Token = token

                # get the predicted tag
                predicted_tag = self.model.tag_dictionary.get_item_for_index(pred_id)
                token.add_tag('predicted', predicted_tag)

                # get the gold tag
                gold_tag = token.get_tag(self.model.tag_type)

                # append both to file for evaluation
                eval_line = token.text + ' ' + gold_tag + ' ' + predicted_tag + "\n"

                # positives
                if predicted_tag != '':
                    # true positive
                    if predicted_tag == gold_tag:
                        metric.tp()
                    # false positive
                    if predicted_tag != gold_tag:
                        metric.fp()

                # negatives
                if predicted_tag == '':
                    # true negative
                    if predicted_tag == gold_tag:
                        metric.tn()
                    # false negative
                    if predicted_tag != gold_tag:
                        metric.fn()

                lines.append(eval_line)
            lines.append('\n')

        if not embeddings_in_memory:
            self.clear_embeddings_in_batch(batch)

    if out_path is not None:
        test_tsv = os.path.join(out_path, "test.tsv")
        with open(test_tsv, "w", encoding='utf-8') as outfile:
            outfile.write(''.join(lines))

    if evaluation_method == 'span-F1':
        # fetch the official CoNLL-03 evaluation script and make it executable
        eval_script = cached_path(
            'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/scripts/conll03_eval_script.pl',
            cache_dir='scripts')
        os.chmod(eval_script, 0o777)

        # pipe the gold/predicted lines through the script
        eval_data = ''.join(lines)
        p = run(eval_script, stdout=PIPE, input=eval_data, encoding='utf-8')
        main_result = p.stdout
        print(main_result)

        main_result = main_result.split('\n')[1]

        # abbreviate the summary line and pull out the final F-score
        main_result = re.sub(';', ' ', main_result)
        main_result = re.sub('precision', 'p', main_result)
        main_result = re.sub('recall', 'r', main_result)
        main_result = re.sub('accuracy', 'acc', main_result)

        f_score = float(re.findall(r'\d+\.\d+$', main_result)[0])
        return f_score, metric._fp, main_result

    if evaluation_method == 'accuracy':
        score = metric.accuracy()
        return score, metric._fp, str(score)

    if evaluation_method == 'F1':
        score = metric.f_score()
        return score, metric._fp, str(metric)
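# Standalone sketch of the span-F1 result parsing performed above. The sample
# conlleval-style output line below is illustrative toy data; only the regex
# handling mirrors the method.
import re

raw = 'accuracy:  97.32%; precision:  91.15%; recall:  90.80%; FB1:  90.97'
line = re.sub(';', ' ', raw)
line = re.sub('precision', 'p', line)
line = re.sub('recall', 'r', line)
line = re.sub('accuracy', 'acc', line)
f_score = float(re.findall(r'\d+\.\d+$', line)[0])
print(line)      # 'acc:  97.32%  p:  91.15%  r:  90.80%  FB1:  90.97'
print(f_score)   # 90.97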
def evaluate(self, evaluation: List[Sentence], out_path=None,
             evaluation_method: str = 'F1',
             eval_batch_size: int = 32,
             embeddings_in_memory: bool = True):
    """
    Evaluates a sequence tagger with batched prediction and span-level
    counting, optionally writing a test.tsv, and returns a score plus the
    full metric object.
    """
    with torch.no_grad():
        batch_no: int = 0
        batches = [evaluation[x:x + eval_batch_size]
                   for x in range(0, len(evaluation), eval_batch_size)]

        metric = Metric('')

        lines: List[str] = []

        for batch in batches:
            batch_no += 1

            # predict scores and tag ids for the whole batch at once
            scores, tag_seq = self.model._predict_scores_batch(batch)
            predicted_ids = tag_seq
            all_tokens = []
            for sentence in batch:
                all_tokens.extend(sentence.tokens)

            for (token, score, predicted_id) in zip(all_tokens, scores, predicted_ids):
                token: Token = token
                # get the predicted tag
                predicted_value = self.model.tag_dictionary.get_item_for_index(predicted_id)
                token.add_tag('predicted', predicted_value, score)

            for sentence in batch:
                # append gold and predicted tags to file for evaluation
                for token in sentence.tokens:
                    predicted_tag: Label = token.get_tag('predicted')
                    eval_line = '{} {} {}\n'.format(token.text,
                                                    token.get_tag(self.model.tag_type).value,
                                                    predicted_tag.value)
                    lines.append(eval_line)
                lines.append('\n')

                # make list of gold tags
                gold_tags = [(tag.tag, str(tag))
                             for tag in sentence.get_spans(self.model.tag_type)]

                # make list of predicted tags
                predicted_tags = [(tag.tag, str(tag))
                                  for tag in sentence.get_spans('predicted')]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.tp()
                        metric.tp(tag)
                    else:
                        metric.fp()
                        metric.fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.fn()
                        metric.fn(tag)
                    else:
                        metric.tn()
                        metric.tn(tag)

            if not embeddings_in_memory:
                self.clear_embeddings_in_batch(batch)

        if out_path is not None:
            test_tsv = os.path.join(out_path, "test.tsv")
            with open(test_tsv, "w", encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        if evaluation_method == 'accuracy':
            score = metric.accuracy()
            return score, metric

        if evaluation_method == 'F1':
            score = metric.f_score()
            return score, metric
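# Standalone sketch of the span-level comparison above: spans are compared as
# (tag, span-identity) pairs, so a prediction only counts as a true positive
# when both the label and the exact span match. The tuples below are toy data,
# not output of flair's get_spans().
gold_tags = [('PER', 'PER [1,2]'), ('LOC', 'LOC [5]')]
predicted_tags = [('PER', 'PER [1,2]'), ('LOC', 'LOC [4,5]')]

tp = sum(1 for span in predicted_tags if span in gold_tags)       # 1
fp = sum(1 for span in predicted_tags if span not in gold_tags)  # 1
fn = sum(1 for span in gold_tags if span not in predicted_tags)  # 1
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print(tp, fp, fn, round(f1, 2))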