def evaluate(
    self,
    data_loader: DataLoader,
    out_path: Path = None,
    embeddings_storage_mode: str = "cpu",
    prediction_mode: bool = False,
) -> (Result, float):
    data_loader.assign_embeddings()
    with torch.no_grad():
        if self.binary:
            eval_loss = 0
            batch_no: int = 0
            lines: List[str] = []
            # unlabeled (u*) and labeled (l*) true-positive / false-positive / false-negative counts
            utp = 0
            ufp = 0
            ufn = 0
            ltp = 0
            lfp = 0
            lfn = 0
            for batch in data_loader:
                batch_no += 1
                arc_scores, rel_scores = self.forward(batch)
                # mask computed during the forward pass; zero out the artificial root as a dependent
                mask = self.mask
                root_mask = mask.clone()
                root_mask[:, 0] = 0
                binary_mask = root_mask.unsqueeze(-1) * mask.unsqueeze(-2)
                arc_predictions = (arc_scores.sigmoid() > 0.5) * binary_mask
                rel_predictions = (rel_scores.softmax(-1) * binary_mask.unsqueeze(-1)).argmax(-1)

                if not prediction_mode:
                    arc_mat = torch.stack(
                        [getattr(sentence, self.tag_type + '_arc_tags').to(flair.device) for sentence in batch], 0
                    ).float()
                    rel_mat = torch.stack(
                        [getattr(sentence, self.tag_type + '_rel_tags').to(flair.device) for sentence in batch], 0
                    ).long()
                    loss = self._calculate_loss(arc_scores, rel_scores, batch, mask)
                    if self.is_srl:
                        # keep the head selection fixed to the gold predicate only
                        binary_mask[:, :, 0] = arc_mat[:, :, 0]
                        arc_predictions = (arc_scores.sigmoid() > 0.5) * binary_mask

                    # UF1: unlabeled arc F1
                    true_positives = arc_predictions * arc_mat
                    # (n x m x m) -> ()
                    n_predictions = arc_predictions.sum()
                    n_unlabeled_predictions = n_predictions
                    n_targets = arc_mat.sum()
                    n_unlabeled_targets = n_targets
                    n_true_positives = true_positives.sum()
                    # () - () -> ()
                    n_false_positives = n_predictions - n_true_positives
                    n_false_negatives = n_targets - n_true_positives
                    # (n x m x m) -> (n)
                    n_targets_per_sequence = arc_mat.sum([1, 2])
                    n_true_positives_per_sequence = true_positives.sum([1, 2])
                    # (n) x 2 -> ()
                    n_correct_sequences = (n_true_positives_per_sequence == n_targets_per_sequence).sum()
                    utp += n_true_positives
                    ufp += n_false_positives
                    ufn += n_false_negatives

                    # LF1: labeled arc F1 (arc and relation label must both match)
                    # (n x m x m) (*) (n x m x m) -> (n x m x m)
                    true_positives = (rel_predictions == rel_mat) * arc_predictions
                    correct_label_tokens = (rel_predictions == rel_mat) * arc_mat
                    # (n x m x m) -> ()
                    n_true_positives = true_positives.sum()
                    n_correct_label_tokens = correct_label_tokens.sum()
                    # () - () -> ()
                    n_false_positives = n_unlabeled_predictions - n_true_positives
                    n_false_negatives = n_unlabeled_targets - n_true_positives
                    # (n x m x m) -> (n)
                    n_targets_per_sequence = arc_mat.sum([1, 2])
                    n_true_positives_per_sequence = true_positives.sum([1, 2])
                    n_correct_label_tokens_per_sequence = correct_label_tokens.sum([1, 2])
                    # (n) x 2 -> ()
                    n_correct_sequences = (n_true_positives_per_sequence == n_targets_per_sequence).sum()
                    n_correct_label_sequences = (n_correct_label_tokens_per_sequence == n_targets_per_sequence).sum()
                    ltp += n_true_positives
                    lfp += n_false_positives
                    lfn += n_false_negatives

                    eval_loss += loss

                if out_path is not None:
                    masked_arc_scores = arc_scores.masked_fill(~binary_mask.bool(), float(-1e9))
                    if not self.is_mst:
                        temp_preds = eisner(arc_scores, root_mask.bool())
                    for sent_idx, sentence in enumerate(batch):
                        if self.is_mst:
                            preds = MST_inference(
                                torch.softmax(masked_arc_scores[sent_idx], -1).cpu().numpy(),
                                len(sentence),
                                binary_mask[sent_idx].cpu().numpy(),
                            )
                        else:
                            preds = temp_preds[sent_idx]
                        for token_idx, token in enumerate(sentence):
                            if token_idx == 0:
                                continue
                            # collect predicted heads; always keep the tree-decoded head as a fallback
                            arc_heads = torch.where(arc_predictions[sent_idx, token_idx] > 0)[0]
                            if preds[token_idx] not in arc_heads:
                                val = torch.zeros(1).type_as(arc_heads)
                                val[0] = preds[token_idx].item()
                                arc_heads = torch.cat([arc_heads, val], 0)
                            if len(arc_heads) == 0:
                                arc_heads = masked_arc_scores[sent_idx, token_idx].argmax().unsqueeze(0)
                            rel_index = rel_predictions[sent_idx, token_idx, arc_heads]
                            rel_labels = [self.tag_dictionary.get_item_for_index(x) for x in rel_index]
                            arc_list = []
                            for i, label in enumerate(rel_labels):
                                if '+' in label:
                                    labels = label.split('+')
                                    for temp_label in labels:
                                        arc_list.append(str(arc_heads[i].item()) + ':' + temp_label)
                                else:
                                    arc_list.append(str(arc_heads[i].item()) + ':' + label)
                            # write a CoNLL-U style line; unused columns are filled with 'X'
                            eval_line = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                                token_idx,
                                token.text,
                                'X',
                                'X',
                                'X',
                                'X=X',
                                str(token_idx - 1),
                                'root' if token_idx - 1 == 0 else 'det',
                                '|'.join(arc_list),
                                'X',
                            )
                            lines.append(eval_line)
                        lines.append("\n")

            # average the loss and compute F1 once all batches have been processed
            eval_loss /= batch_no
            UF1 = self.compute_F1(utp, ufp, ufn)
            LF1 = self.compute_F1(ltp, lfp, lfn)

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))
            if prediction_mode:
                return None, None

            result = Result(
                main_score=LF1,
                log_line=f"\nUF1: {UF1} - LF1 {LF1}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=f"\nUF1: {UF1} - LF1 {LF1}",
            )
        else:
            if prediction_mode:
                eval_loss, metric = self.dependency_evaluate(
                    data_loader, out_path=out_path, prediction_mode=prediction_mode
                )
                return eval_loss, metric
            else:
                eval_loss, metric = self.dependency_evaluate(data_loader, out_path=out_path)
            UAS = metric.uas
            LAS = metric.las
            result = Result(
                main_score=LAS,
                log_line=f"\nUAS: {UAS} - LAS {LAS}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=f"\nUAS: {UAS} - LAS {LAS}",
            )
        return result, eval_loss
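
# The binary branch above relies on self.compute_F1, which is not shown in this
# excerpt. The sketch below is an assumption about its behaviour -- standard
# precision / recall / F1 from true-positive, false-positive and false-negative
# counts -- and is not necessarily the repository's actual implementation.
def compute_F1(self, tp, fp, fn):
    # small epsilon guards against division by zero when no arcs are predicted
    precision = tp / (tp + fp + 1e-12)
    recall = tp / (tp + fn + 1e-12)
    return 2 * precision * recall / (precision + recall + 1e-12)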
def evaluate(
    self,
    data_loader: DataLoader,
    out_path: Path = None,
    embeddings_storage_mode: str = "cpu",
    prediction_mode: bool = False,
) -> (Result, float):
    eval_loss = 0
    batch_no = 0
    data_loader.assign_embeddings()
    if out_path is not None:
        outfile = open(out_path, "w", encoding="utf-8")
    if not self.binary:
        metric = Metric("Evaluation")
    with torch.no_grad():
        for batch in data_loader:
            batch_no += 1
            scores = self.forward(batch, prediction_mode=prediction_mode)
            loss = self._calculate_loss(scores, batch, self.mask)
            eval_loss += loss
            if self.binary:
                # binary (semantic dependency) evaluation is not handled in this path;
                # the earlier debugging stub (pdb.set_trace followed by a Result built
                # from undefined UF1/LF1) is replaced by an explicit error
                raise NotImplementedError(
                    "binary evaluation is not supported in this evaluate() method"
                )
            else:
                tags, _ = self._obtain_labels(scores, batch)
                for sentence, sent_tags in zip(batch, tags):
                    for token, tag in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag_label("predicted", tag)
                        # write gold and predicted tag to file for evaluation
                        eval_line = "{} {} {} {}\n".format(
                            token.text,
                            token.get_tag(self.tag_type).value,
                            tag.value,
                            tag.score,
                        )
                        if out_path is not None:
                            outfile.write(eval_line)
                    if out_path is not None:
                        outfile.write("\n")
                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [
                        (tag.tag, str(tag)) for tag in sentence.get_spans(self.tag_type)
                    ]
                    # make list of predicted tags
                    predicted_tags = [
                        (tag.tag, str(tag)) for tag in sentence.get_spans("predicted")
                    ]
                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)
                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

        eval_loss /= batch_no
        if out_path is not None:
            outfile.close()

        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}"
            )

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )
        return result, eval_loss
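
# Hypothetical usage sketch (illustration only). The only requirements visible in this
# excerpt are that the loader supports assign_embeddings() and yields batches of
# sentences; the names `tagger` and `loader` below are assumptions, not this
# repository's documented API:
#
#   result, eval_loss = tagger.evaluate(
#       loader,
#       out_path=Path("predictions.tsv"),   # per-token "text gold pred score" lines
#       prediction_mode=False,
#   )
#   print(result.log_line)                  # tab-separated precision / recall / micro-F1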