def evaluate(
		self,
		data_loader: DataLoader,
		out_path: Path = None,
		embeddings_storage_mode: str = "cpu",
		prediction_mode: bool = False,
	) -> Tuple[Result, float]:
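		"""Evaluate the parser on ``data_loader``.

		In the binary (semantic dependency / SRL) setting, unlabeled and labeled F1
		(UF1 / LF1) are accumulated over arc and relation predictions; otherwise
		evaluation is delegated to ``dependency_evaluate`` and UAS / LAS are reported.
		If ``out_path`` is given, predictions are written out in a CoNLL-U-style
		format. With ``prediction_mode`` set, gold annotations are not required.
		"""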
		data_loader.assign_embeddings()
		with torch.no_grad():
			if self.binary:
				eval_loss = 0

				batch_no: int = 0

				# metric = Metric("Evaluation")
				# sentence_writer = open('temps/'+str(uid)+'_eval'+'.conllu','w')
				lines: List[str] = []
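				# running totals of unlabeled / labeled true positives, false positives
				# and false negatives, accumulated over batches for UF1 / LF1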
				utp = 0
				ufp = 0
				ufn = 0
				ltp = 0
				lfp = 0
				lfn = 0
				for batch in data_loader:
					batch_no += 1
					
					arc_scores, rel_scores = self.forward(batch)
					mask = self.mask
					# exclude the artificial root position as a dependent and build a
					# (batch x dependent x head) mask of valid arc positions
					root_mask = mask.clone()
					root_mask[:, 0] = 0
					binary_mask = root_mask.unsqueeze(-1) * mask.unsqueeze(-2)

					# arcs are independent sigmoid decisions; relation labels are the argmax
					# over the label distribution at each masked position
					arc_predictions = (arc_scores.sigmoid() > 0.5) * binary_mask
					rel_predictions = (rel_scores.softmax(-1) * binary_mask.unsqueeze(-1)).argmax(-1)
					if not prediction_mode:
						arc_mat = torch.stack(
							[getattr(sentence, self.tag_type + '_arc_tags').to(flair.device) for sentence in batch], 0
						).float()
						rel_mat = torch.stack(
							[getattr(sentence, self.tag_type + '_rel_tags').to(flair.device) for sentence in batch], 0
						).long()
						loss = self._calculate_loss(arc_scores, rel_scores, batch, mask)
						if self.is_srl:
							# fix the root-attachment (predicate identification) column to the
							# gold predicates, so only the arguments are freely predicted
							binary_mask[:, :, 0] = arc_mat[:, :, 0]
							arc_predictions = (arc_scores.sigmoid() > 0.5) * binary_mask

						# UF1
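						# an arc is an unlabeled true positive if it is both predicted and gold,
						# regardless of the relation label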
						true_positives = arc_predictions * arc_mat
						# (n x m x m) -> ()
						n_predictions = arc_predictions.sum()
						n_unlabeled_predictions = n_predictions
						n_targets = arc_mat.sum()
						n_unlabeled_targets = n_targets
						n_true_positives = true_positives.sum()
						# () - () -> ()
						n_false_positives = n_predictions - n_true_positives
						n_false_negatives = n_targets - n_true_positives
						# (n x m x m) -> (n)
						n_targets_per_sequence = arc_mat.sum([1,2])
						n_true_positives_per_sequence = true_positives.sum([1,2])
						# (n) x 2 -> ()
						n_correct_sequences = (n_true_positives_per_sequence==n_targets_per_sequence).sum()
						utp += n_true_positives
						ufp += n_false_positives
						ufn += n_false_negatives

						# LF1
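						# a labeled true positive additionally requires the predicted relation
						# label to match the gold label on a predicted arc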
						# (n x m x m) (*) (n x m x m) -> (n x m x m)
						true_positives = (rel_predictions == rel_mat) * arc_predictions
						correct_label_tokens = (rel_predictions == rel_mat) * arc_mat
						# (n x m x m) -> ()
						# n_unlabeled_predictions = tf.reduce_sum(unlabeled_predictions)
						# n_unlabeled_targets = tf.reduce_sum(unlabeled_targets)
						n_true_positives = true_positives.sum()
						n_correct_label_tokens = correct_label_tokens.sum()
						# () - () -> ()
						n_false_positives = n_unlabeled_predictions - n_true_positives
						n_false_negatives = n_unlabeled_targets - n_true_positives
						# (n x m x m) -> (n)
						n_targets_per_sequence = arc_mat.sum([1,2])
						n_true_positives_per_sequence = true_positives.sum([1,2])
						n_correct_label_tokens_per_sequence = correct_label_tokens.sum([1,2])
						# (n) x 2 -> ()
						n_correct_sequences = (n_true_positives_per_sequence == n_targets_per_sequence).sum()
						n_correct_label_sequences = ((n_correct_label_tokens_per_sequence == n_targets_per_sequence)).sum()
						ltp += n_true_positives
						lfp += n_false_positives
						lfn += n_false_negatives

						# accumulate the loss; it is averaged over batches after the loop
						eval_loss += loss

					if out_path is not None:
						masked_arc_scores = arc_scores.masked_fill(~binary_mask.bool(), float(-1e9))
						# decode a full head structure for the batch: projective (Eisner)
						# decoding here, or per-sentence MST inference below
						if not self.is_mst:
							temp_preds = eisner(arc_scores, root_mask.bool())
						for (sent_idx, sentence) in enumerate(batch):
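							# pick one best head per token: MST inference over the masked score
							# matrix for non-projective decoding, otherwise the Eisner predictions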
							if self.is_mst:
								preds=MST_inference(torch.softmax(masked_arc_scores[sent_idx],-1).cpu().numpy(), len(sentence), binary_mask[sent_idx].cpu().numpy())
							else:
								preds=temp_preds[sent_idx]

							for token_idx, token in enumerate(sentence):
								if token_idx == 0:
									continue

								# collect every predicted head for this token; make sure the head
								# chosen by the tree decoder is included and, as a safety net, fall
								# back to the highest-scoring head if no arc was predicted
								arc_heads = torch.where(arc_predictions[sent_idx, token_idx] > 0)[0]
								if preds[token_idx] not in arc_heads:
									val = torch.zeros(1).type_as(arc_heads)
									val[0] = preds[token_idx].item()
									arc_heads = torch.cat([arc_heads, val], 0)
								if len(arc_heads) == 0:
									arc_heads = masked_arc_scores[sent_idx, token_idx].argmax().unsqueeze(0)
								rel_index = rel_predictions[sent_idx, token_idx, arc_heads]
								rel_labels = [self.tag_dictionary.get_item_for_index(x) for x in rel_index]
								arc_list = []
								for i, label in enumerate(rel_labels):
									if '+' in label:
										# composite labels are split into one arc per sub-label
										for temp_label in label.split('+'):
											arc_list.append(str(arc_heads[i].item()) + ':' + temp_label)
									else:
										arc_list.append(str(arc_heads[i].item()) + ':' + label)
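								# CoNLL-U-style output line: placeholder LEMMA/UPOS/XPOS/FEATS/MISC
								# columns ('X'), a dummy HEAD/DEPREL, and the predicted arcs in DEPS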
								eval_line = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
									token_idx,
									token.text,
									'X',
									'X',
									'X',
									'X=X',
									str(token_idx-1),
									'root' if token_idx-1==0 else 'det',
									'|'.join(arc_list),
									'X',
								)
								lines.append(eval_line)
							lines.append("\n")

				# average the loss and compute corpus-level F1 once, after all batches
				if not prediction_mode:
					eval_loss /= batch_no
					UF1 = self.compute_F1(utp, ufp, ufn)
					LF1 = self.compute_F1(ltp, lfp, lfn)

				if out_path is not None:
					with open(out_path, "w", encoding="utf-8") as outfile:
						outfile.write("".join(lines))
				if prediction_mode:
					return None, None

				result = Result(
					main_score=LF1,
					log_line=f"\nUF1: {UF1} - LF1 {LF1}",
					log_header="PRECISION\tRECALL\tF1",
					detailed_results=f"\nUF1: {UF1} - LF1 {LF1}",
				)
			else:
				if prediction_mode:
					eval_loss, metric = self.dependency_evaluate(
						data_loader, out_path=out_path, prediction_mode=prediction_mode
					)
					return eval_loss, metric
				else:
					eval_loss, metric = self.dependency_evaluate(data_loader, out_path=out_path)

				UAS = metric.uas
				LAS = metric.las
				result = Result(
					main_score=LAS,
					log_line=f"\nUAS: {UAS} - LAS {LAS}",
					log_header="PRECISION\tRECALL\tF1",
					detailed_results=f"\nUAS: {UAS} - LAS {LAS}",
				)
			return result, eval_loss
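
# `compute_F1` is called above but not shown in this listing. A minimal sketch,
# assuming the standard F1 computed from the accumulated true positives (tp),
# false positives (fp) and false negatives (fn); the class's actual helper may
# differ, e.g. in how the zero-denominator case is handled:
def compute_F1(self, tp, fp, fn):
	precision = tp / (tp + fp + 1e-12)
	recall = tp / (tp + fn + 1e-12)
	return 2 * precision * recall / (precision + recall + 1e-12)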
Example #2
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embeddings_storage_mode: str = "cpu",
        prediction_mode: bool = False,
    ) -> Tuple[Result, float]:
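        """Evaluate the tagger on ``data_loader``.

        Span-level true/false positives and false negatives are accumulated per
        tag type in a ``Metric``; if ``out_path`` is given, one
        "<token> <gold tag> <predicted tag> <score>" line is written per token.
        Returns a ``Result`` with the micro-averaged F1 as main score, together
        with the average evaluation loss.
        """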
        eval_loss = 0
        batch_no = 0
        data_loader.assign_embeddings()
        if out_path is not None:
            outfile = open(out_path, "w", encoding="utf-8")
        if not self.binary:
            metric = Metric("Evaluation")
        with torch.no_grad():
            for batch in data_loader:
                batch_no += 1
                scores = self.forward(batch, prediction_mode=prediction_mode)
                loss = self._calculate_loss(scores, batch, self.mask)
                eval_loss += loss
                if self.binary:
                    # the binary (UF1/LF1) branch is not implemented in this variant;
                    # see the first evaluate() example above for a complete version
                    raise NotImplementedError(
                        "binary evaluation is not supported by this evaluate() variant"
                    )
                else:
                    # if prediction_mode:
                    #   eval_loss, metric=self.dependency_evaluate(data_loader,out_path=out_path,prediction_mode=prediction_mode)
                    #   return eval_loss, metric
                    # else:

                    # decode the predicted tags and attach them to each token under the
                    # "predicted" tag type, so they can be compared with the gold spans below
                    tags, _ = self._obtain_labels(scores, batch)
                    for (sentence, sent_tags) in zip(batch, tags):
                        for (token, tag) in zip(sentence.tokens, sent_tags):
                            token: Token = token
                            token.add_tag_label("predicted", tag)

                            # append both to file for evaluation
                            eval_line = "{} {} {} {}\n".format(
                                token.text,
                                token.get_tag(self.tag_type).value,
                                tag.value,
                                tag.score,
                            )
                            # lines.append(eval_line)
                            if out_path is not None:
                                outfile.write(eval_line)
                        # lines.append("\n")
                        if out_path is not None:
                            outfile.write("\n")
                    for sentence in batch:
                        # make list of gold tags
                        gold_tags = [
                            (tag.tag, str(tag))
                            for tag in sentence.get_spans(self.tag_type)
                        ]
                        # make list of predicted tags
                        predicted_tags = [
                            (tag.tag, str(tag))
                            for tag in sentence.get_spans("predicted")
                        ]

                        # check for true positives, false positives and false negatives
                        for tag, prediction in predicted_tags:
                            if (tag, prediction) in gold_tags:
                                metric.add_tp(tag)
                            else:
                                metric.add_fp(tag)

                        for tag, gold in gold_tags:
                            if (tag, gold) not in predicted_tags:
                                metric.add_fn(tag)
                            else:
                                metric.add_tn(tag)
        eval_loss /= batch_no
        if out_path is not None:
            outfile.close()
        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )
        return result, eval_loss
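
# Usage sketch (hypothetical names): both evaluate() variants are methods of a
# flair-style model class, and the loader is assumed to be an object that
# exposes assign_embeddings(), as used above. Something along these lines:
#
#     loader = build_test_loader(corpus.test, batch_size=32)   # hypothetical helper
#     result, eval_loss = model.evaluate(loader, out_path=Path("predictions.tsv"))
#     print(result.log_line)
#     print(result.detailed_results)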