import logging

import numpy as np
import torch

from emmental.utils.utils import array_to_numpy  # assumed import path


def test_array_to_numpy(caplog):
    """Unit test of array_to_numpy."""
    caplog.set_level(logging.INFO)

    # Nested lists are converted to an equivalent numpy array.
    assert np.array_equal(
        array_to_numpy([[1, 2], [3, 4]]), np.array([[1, 2], [3, 4]])
    )
    # Torch tensors are converted to numpy as well.
    assert np.array_equal(
        array_to_numpy(torch.tensor([[1, 2], [3, 4]])), np.array([[1, 2], [3, 4]])
    )
    # flatten=True collapses the result to one dimension.
    assert np.array_equal(
        array_to_numpy([[1, 2], [3, 4]], flatten=True), np.array([1, 2, 3, 4])
    )
import logging

import numpy as np
import pytest
import torch

from emmental.utils.utils import array_to_numpy  # assumed import path


def test_array_to_numpy(caplog):
    """Unit test of array_to_numpy."""
    caplog.set_level(logging.INFO)

    assert np.array_equal(
        array_to_numpy([[1, 2], [3, 4]]), np.array([[1, 2], [3, 4]])
    )
    assert np.array_equal(
        array_to_numpy(torch.tensor([[1, 2], [3, 4]])), np.array([[1, 2], [3, 4]])
    )
    assert np.array_equal(
        array_to_numpy([[1, 2], [3, 4]], flatten=True), np.array([1, 2, 3, 4])
    )

    # A scalar is not a valid array and should raise.
    with pytest.raises(ValueError):
        array_to_numpy(1.23)
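The tests above pin down the conversion contract: lists and torch tensors become numpy arrays, `flatten=True` collapses dimensions, and unsupported input raises `ValueError`. A minimal sketch of a converter satisfying that contract (a hypothetical re-implementation, not Emmental's actual code) could look like this:

import numpy as np
import torch


def array_to_numpy_sketch(array, flatten=False):
    """Convert a list, numpy array, or torch tensor to a numpy array.

    Hypothetical sketch matching the behavior exercised by the tests above.
    """
    if isinstance(array, np.ndarray):
        arr = array
    elif isinstance(array, list):
        arr = np.array(array)
    elif isinstance(array, torch.Tensor):
        arr = array.detach().cpu().numpy()
    else:
        raise ValueError(f"Unrecognized type {type(array)} to convert to ndarray")
    return arr.flatten() if flatten else arr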
def score(
    self,
    golds: Union[ndarray, List[ndarray]],
    preds: Union[ndarray, List[ndarray]],
    probs: Union[ndarray, List[ndarray]],
    uids: List[str] = None,
) -> Dict[str, float]:
    """Calculate the score.

    Args:
        golds: Ground truth values.
        preds: Predicted values.
        probs: Predicted probabilities.
        uids: Unique ids, defaults to None.

    Returns:
        Score dict.
    """
    metric_dict = dict()

    for metric_name, metric in self.metrics.items():
        # Handle the no-example case.
        if len(golds) == 0:
            metric_dict[metric_name] = float("nan")
            continue

        # Conversion may fail for ragged inputs; fall back to the raw values.
        try:
            golds = array_to_numpy(golds)
        except ValueError:
            pass

        try:
            probs = array_to_numpy(probs) if probs is not None else probs
        except ValueError:
            pass

        try:
            preds = array_to_numpy(preds) if preds is not None else preds
        except ValueError:
            pass

        res = metric(golds, preds, probs, uids)

        if isinstance(res, dict):
            metric_dict.update(res)
        else:
            metric_dict[metric_name] = res

    return metric_dict
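As a usage sketch, a scorer built around this method could be driven as follows. The constructor call and import path are assumptions based on Emmental's conventions, not confirmed by this snippet:

import numpy as np

from emmental.scorer import Scorer  # assumed import path

scorer = Scorer(metrics=["accuracy"])  # assumed constructor signature

golds = np.array([1, 0, 1, 1])
preds = np.array([1, 0, 0, 1])
probs = np.array([[0.2, 0.8], [0.9, 0.1], [0.7, 0.3], [0.4, 0.6]])

# Three of four predictions match the gold labels,
# so this would print something like {"accuracy": 0.75}.
print(scorer.score(golds, preds, probs))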
def collect_result(uid_d, gold_d, pred_d, prob_d, out_d, cur_sentidx_nummen):
    """Merge results for the sentences where all mentions have been evaluated.

    Note: ``sent_idx2num_mens`` is assumed to be available in the enclosing scope.
    """
    final_uid_d = defaultdict(list)
    final_prob_d = defaultdict(list)
    final_pred_d = defaultdict(list)
    final_gold_d = defaultdict(list)
    final_out_d = defaultdict(lambda: defaultdict(list))
    sentidxs_finalized = []

    for sent_idx, cur_mention_set in cur_sentidx_nummen.items():
        assert (
            len(cur_mention_set) <= sent_idx2num_mens[str(sent_idx)]
        ), f"Too many mentions for {sent_idx}: {cur_mention_set} VS {sent_idx2num_mens[str(sent_idx)]}"
        # A sentence is finalized once all of its mentions have been seen.
        if len(cur_mention_set) == sent_idx2num_mens[str(sent_idx)]:
            sentidxs_finalized.append(sent_idx)
            for task_name in uid_d:
                final_uid_d[task_name].extend(uid_d[task_name][sent_idx])
                final_prob_d[task_name].extend(prob_d[task_name][sent_idx])
                final_pred_d[task_name].extend(pred_d[task_name][sent_idx])
                final_gold_d[task_name].extend(gold_d[task_name][sent_idx])
                if task_name in out_d.keys():
                    for action_name in out_d[task_name].keys():
                        final_out_d[task_name][action_name].extend(
                            out_d[task_name][action_name][sent_idx]
                        )

    # With a batch size and accumulation step close to 1,
    # we may reach this point with no complete sentences.
    if len(sentidxs_finalized) == 0:
        return {}, sentidxs_finalized

    res = {
        "uids": final_uid_d,
        "golds": final_gold_d,
    }

    for task_name in final_prob_d.keys():
        final_prob_d[task_name] = array_to_numpy(final_prob_d[task_name])
    res["probs"] = final_prob_d

    for task_name in final_pred_d.keys():
        final_pred_d[task_name] = array_to_numpy(final_pred_d[task_name])
    res["preds"] = final_pred_d

    res["outputs"] = final_out_d

    return res, sentidxs_finalized
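The finalization test compares the mentions seen so far against each sentence's total mention count. A toy illustration with hypothetical data shows when a sentence counts as complete:

# Toy illustration of the finalization condition (hypothetical data).
sent_idx2num_mens = {"0": 2, "1": 3}  # total mentions per sentence

# Mention indices seen so far, keyed by sentence index.
cur_sentidx_nummen = {0: {0, 1}, 1: {0}}

for sent_idx, seen in cur_sentidx_nummen.items():
    total = sent_idx2num_mens[str(sent_idx)]
    status = "finalized" if len(seen) == total else "pending"
    print(f"sentence {sent_idx}: {len(seen)}/{total} mentions -> {status}")
# sentence 0: 2/2 mentions -> finalized
# sentence 1: 1/3 mentions -> pending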
def score(self, golds, preds, probs, uids=None):
    """Calculate the score for each metric."""
    metric_dict = dict()

    for metric_name, metric in self.metrics.items():
        # Handle the no-example case.
        if len(golds) == 0:
            metric_dict[metric_name] = float("nan")
            continue

        golds = array_to_numpy(golds)
        preds = array_to_numpy(preds)
        probs = array_to_numpy(probs)

        res = metric(golds, preds, probs, uids)

        if isinstance(res, dict):
            metric_dict.update(res)
        else:
            metric_dict[metric_name] = res

    return metric_dict
def score(
    self,
    golds: ndarray,
    preds: ndarray,
    probs: ndarray,
    uids: List[str] = None,
) -> Dict[str, float]:
    """Calculate the score.

    Args:
        golds(ndarray): Ground truth values.
        preds(ndarray): Predicted values.
        probs(ndarray): Predicted probabilities.
        uids(list, optional): Unique ids, defaults to None.

    Returns:
        dict: Score dict.
    """
    metric_dict = dict()

    for metric_name, metric in self.metrics.items():
        # Handle the no-example case.
        if len(golds) == 0:
            metric_dict[metric_name] = float("nan")
            continue

        golds = array_to_numpy(golds)
        preds = array_to_numpy(preds)
        probs = array_to_numpy(probs)

        res = metric(golds, preds, probs, uids)

        if isinstance(res, dict):
            metric_dict.update(res)
        else:
            metric_dict[metric_name] = res

    return metric_dict
def predict(
    self,
    dataloader: EmmentalDataLoader,
    return_probs: bool = True,
    return_preds: bool = False,
    return_action_outputs: bool = False,
) -> Dict[str, Any]:
    """Predict from dataloader.

    Args:
        dataloader: The dataloader to predict on.
        return_probs: Whether to return probabilities or not, defaults to True.
        return_preds: Whether to return predictions or not, defaults to False.
        return_action_outputs: Whether to return action outputs or not,
            defaults to False.

    Returns:
        The result dict.
    """
    self.eval()

    uid_dict: Dict[str, List[str]] = defaultdict(list)
    prob_dict: Dict[str, Union[ndarray, List[ndarray]]] = defaultdict(list)
    pred_dict: Dict[str, Union[ndarray, List[ndarray]]] = defaultdict(list)
    gold_dict: Dict[str, List[Union[ndarray, int, float]]] = defaultdict(list)
    out_dict: Dict[str, Dict[str, List[Union[ndarray, int, float]]]] = defaultdict(
        lambda: defaultdict(list)
    )
    loss_dict: Dict[str, Union[ndarray, float]] = defaultdict(list)  # type: ignore

    if not dataloader.is_learnable:
        gold_dict = None
        loss_dict = None

    # Collect dataloader information
    task_to_label_dict = dataloader.task_to_label_dict
    uid = dataloader.uid

    with torch.no_grad():
        for batch_num, bdict in tqdm(
            enumerate(dataloader),
            total=len(dataloader),
            desc=f"Evaluating {dataloader.data_name} ({dataloader.split})",
        ):
            if isinstance(bdict, dict):
                X_bdict = bdict
                Y_bdict = None
            else:
                X_bdict, Y_bdict = bdict
                if not dataloader.is_learnable:
                    Y_bdict = None

            if return_action_outputs:
                (
                    uid_bdict,
                    loss_bdict,
                    prob_bdict,
                    gold_bdict,
                    out_bdict,
                ) = self.forward(  # type: ignore
                    X_bdict[uid],
                    X_bdict,
                    Y_bdict,
                    task_to_label_dict,
                    return_action_outputs=return_action_outputs,
                    return_probs=return_probs or return_preds,
                )
            else:
                (
                    uid_bdict,
                    loss_bdict,
                    prob_bdict,
                    gold_bdict,
                ) = self.forward(  # type: ignore
                    X_bdict[uid],
                    X_bdict,
                    Y_bdict,
                    task_to_label_dict,
                    return_action_outputs=return_action_outputs,
                    return_probs=return_probs or return_preds,
                )
                out_bdict = None

            for task_name in uid_bdict.keys():
                uid_dict[task_name].extend(uid_bdict[task_name])
                if return_probs:
                    prob_dict[task_name].extend(  # type: ignore
                        prob_bdict[task_name]
                    )
                if return_preds:
                    pred_dict[task_name].extend(  # type: ignore
                        prob_to_pred(prob_bdict[task_name])
                    )
                if dataloader.is_learnable:
                    gold_dict[task_name].extend(gold_bdict[task_name])
                    if len(loss_bdict[task_name].size()) == 0:
                        # Scalar loss: accumulate a weighted sum for averaging later.
                        if loss_dict[task_name] == []:
                            loss_dict[task_name] = 0
                        loss_dict[task_name] += loss_bdict[task_name].item() * len(
                            uid_bdict[task_name]
                        )
                    else:
                        loss_dict[task_name].extend(  # type: ignore
                            loss_bdict[task_name].cpu().numpy()
                        )

            if return_action_outputs and out_bdict:
                for task_name in out_bdict.keys():
                    for action_name in out_bdict[task_name].keys():
                        out_dict[task_name][action_name].extend(
                            out_bdict[task_name][action_name]
                        )

    # Calculate average loss
    if dataloader.is_learnable:
        for task_name in uid_dict.keys():
            if not isinstance(loss_dict[task_name], list):
                loss_dict[task_name] /= len(uid_dict[task_name])

    res = {
        "uids": uid_dict,
        "golds": gold_dict,
        "losses": loss_dict,
    }

    if return_probs:
        for task_name in prob_dict.keys():
            prob_dict[task_name] = array_to_numpy(prob_dict[task_name])
        res["probs"] = prob_dict

    if return_preds:
        for task_name in pred_dict.keys():
            pred_dict[task_name] = array_to_numpy(pred_dict[task_name])
        res["preds"] = pred_dict

    if return_action_outputs:
        res["outputs"] = out_dict

    return res
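A brief usage sketch, assuming a trained EmmentalModel `model` and an EmmentalDataLoader `dataloader` are already set up (neither is constructed here):

# Hypothetical usage; probs/preds in the result have already been
# converted to numpy arrays via array_to_numpy.
res = model.predict(dataloader, return_probs=True, return_preds=True)

for task_name, preds in res["preds"].items():
    print(task_name, preds.shape, res["probs"][task_name].shape)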