def test_prob_to_pred(caplog):
    """Unit test of prob_to_pred."""
    caplog.set_level(logging.INFO)

    # One-hot rows collapse to their argmax class index.
    one_hot = np.array([[0, 1], [1, 0]])
    assert np.array_equal(prob_to_pred(one_hot), np.array([1, 0])) is True

    # Soft probability rows also collapse to the per-row argmax.
    soft = np.array([[0.4, 0.5], [0.2, 0.8], [0.9, 0.1]])
    assert np.array_equal(prob_to_pred(soft), np.array([1, 1, 0])) is True
def precision_scorer(
    golds: ndarray,
    probs: Optional[ndarray],
    preds: ndarray,
    uids: Optional[List[str]] = None,
    pos_label: int = 1,
) -> Dict[str, float]:
    """Precision.

    Args:
      golds: Ground truth values.
      probs: Predicted probabilities (not used).
      preds: Predicted values.
      uids: Unique ids, defaults to None.
      pos_label: The positive class label, defaults to 1.

    Returns:
      Precision score in a dict keyed by ``"precision"``.
    """
    # Collapse probabilistic labels into hard labels first.
    if len(golds.shape) > 1:
        golds = prob_to_pred(golds)

    predicted_positive = preds == pos_label
    actual_positive = golds == pos_label

    true_pos = np.sum(predicted_positive & actual_positive)
    false_pos = np.sum(predicted_positive & ~actual_positive)

    # Guard against division by zero when nothing was predicted positive.
    denom = true_pos + false_pos
    return {"precision": true_pos / denom if denom > 0 else 0.0}
def recall_scorer(
    golds: ndarray,
    probs: Optional[ndarray],
    preds: ndarray,
    uids: Optional[List[str]] = None,
    pos_label: int = 1,
) -> Dict[str, float]:
    """Recall.

    Args:
      golds: Ground truth values.
      probs: Predicted probabilities (not used).
      preds: Predicted values.
      uids: Unique ids, defaults to None.
      pos_label: The positive class label, defaults to 1.

    Returns:
      Recall score in a dict keyed by ``"recall"``.
    """
    # Convert probabilistic label to hard label
    if len(golds.shape) == 2:
        golds = prob_to_pred(golds)

    predicted_positive = preds == pos_label
    actual_positive = golds == pos_label

    true_pos = np.sum(predicted_positive & actual_positive)
    false_neg = np.sum(~predicted_positive & actual_positive)

    # Guard against division by zero when there are no actual positives.
    denom = true_pos + false_neg
    return {"recall": true_pos / denom if denom > 0 else 0.0}
def fbeta_scorer(
    golds: ndarray,
    probs: Optional[ndarray],
    preds: ndarray,
    uids: Optional[List[str]] = None,
    pos_label: int = 1,
    beta: int = 1,
) -> Dict[str, float]:
    """F-beta score is the weighted harmonic mean of precision and recall.

    Args:
      golds: Ground truth values.
      probs: Predicted probabilities.
      preds: Predicted values.
      uids: Unique ids, defaults to None.
      pos_label: The positive class label, defaults to 1.
      beta: Weight of precision in harmonic mean, defaults to 1.

    Returns:
      F-beta score in a dict keyed by ``f"f{beta}"``.
    """
    # Convert probabilistic label to hard label
    if len(golds.shape) == 2:
        golds = prob_to_pred(golds)

    # Delegate to the sibling scorers so all three stay consistent.
    p = precision_scorer(golds, probs, preds, uids, pos_label)["precision"]
    r = recall_scorer(golds, probs, preds, uids, pos_label)["recall"]

    denom = beta**2 * p + r
    score = (1 + beta**2) * p * r / denom if denom > 0 else 0.0

    return {f"f{beta}": score}
def roc_auc_scorer(
    golds: ndarray,
    probs: ndarray,
    preds: Optional[ndarray],
    uids: Optional[List[str]] = None,
) -> Dict[str, float]:
    """ROC AUC.

    Args:
      golds(ndarray): Ground truth values.
      probs(ndarray): Predicted probabilities.
      preds(ndarray or None): Predicted values (not used).
      uids(list, optional): Unique ids, defaults to None.

    Returns:
      dict: ROC AUC score (NaN when golds contain only one class).
    """
    # Squeeze a trailing singleton dimension so (n, 1) behaves like (n,).
    if len(probs.shape) == 2 and probs.shape[1] == 1:
        probs = probs.reshape(probs.shape[0])
    if len(golds.shape) == 2 and golds.shape[1] == 1:
        golds = golds.reshape(golds.shape[0])

    if len(probs.shape) > 1:
        # Multi-class probs: golds must be one-hot with matching n_classes.
        if len(golds.shape) > 1:
            golds = pred_to_prob(prob_to_pred(golds), n_classes=probs.shape[1])
        else:
            golds = pred_to_prob(golds, n_classes=probs.shape[1])
    else:
        # Binary probs: golds must be hard labels.
        if len(golds.shape) > 1:
            golds = prob_to_pred(golds)

    try:
        roc_auc = roc_auc_score(golds, probs)
    except ValueError:
        # Fix: the two sentences were previously concatenated with no space.
        logger.warning(
            "Only one class present in golds. "
            "ROC AUC score is not defined in that case, set as nan instead."
        )
        roc_auc = float("nan")

    return {"roc_auc": roc_auc}
def predict(
    self, dataloader: EmmentalDataLoader, return_preds: bool = False
) -> Dict[str, Any]:
    r"""Predict from dataloader.

    Args:
      dataloader(EmmentalDataLoader): The dataloader to predict.
      return_preds(bool): Whether return predictions or not,
        defaults to False.

    Returns:
      dict: The result dict with keys "uids", "golds", "probs", "losses"
        (plus "preds" when ``return_preds`` is True), each mapping task
        name to accumulated values.
    """
    self.eval()

    uid_dict: Dict[str, List[str]] = defaultdict(list)
    gold_dict: Dict[str, List[Union[ndarray, int, float]]] = defaultdict(list)
    prob_dict: Dict[str, List[Union[ndarray, int, float]]] = defaultdict(list)
    pred_dict: Dict[str, List[ndarray]] = defaultdict(list)
    loss_dict: Dict[str, Union[ndarray, float]] = defaultdict(float)

    # Collect dataloader information
    task_to_label_dict = dataloader.task_to_label_dict
    uid = dataloader.uid

    # Fix: disable gradient tracking during inference (consistent with the
    # other predict implementation in this codebase); avoids building
    # autograd graphs and saves memory.
    with torch.no_grad():
        for batch_num, (X_bdict, Y_bdict) in tqdm(
            enumerate(dataloader), total=len(dataloader)
        ):
            uid_bdict, loss_bdict, prob_bdict, gold_bdict = self.forward(
                X_bdict[uid], X_bdict, Y_bdict, task_to_label_dict
            )
            for task_name in uid_bdict.keys():
                uid_dict[task_name].extend(uid_bdict[task_name])
                prob_dict[task_name].extend(prob_bdict[task_name])
                gold_dict[task_name].extend(gold_bdict[task_name])
                # Weight each batch loss by its example count so the final
                # division yields a per-example average.
                loss_dict[task_name] += loss_bdict[task_name].item() * len(
                    uid_bdict[task_name]
                )

    # Calculate average loss
    for task_name in uid_dict.keys():
        loss_dict[task_name] /= len(uid_dict[task_name])

    res = {
        "uids": uid_dict,
        "golds": gold_dict,
        "probs": prob_dict,
        "losses": loss_dict,
    }

    if return_preds:
        for task_name, prob in prob_dict.items():
            pred_dict[task_name] = prob_to_pred(prob)
        res["preds"] = pred_dict

    return res
def _aggregate_running_metrics(self, model):
    """Calculate the running overall and task specific metrics."""
    metric_dict = dict()

    # Per-identifier average loss over the examples seen so far.
    total_count = 0
    for identifier, seen_uids in self.running_uids.items():
        count = len(seen_uids)
        if count > 0:
            metric_dict[identifier + "/loss"] = (
                self.running_losses[identifier] / count
            )
            total_count += count

    # Micro-averaged loss across every identifier.
    if total_count > 0:
        metric_dict["model/all/train/loss"] = (
            sum(self.running_losses.values()) / total_count
        )

    micro_score_dict = defaultdict(list)
    macro_score_dict = defaultdict(list)

    # Task-specific training metrics via each task's scorer.
    for identifier in self.running_uids:
        task_name, data_name, split = identifier.split("/")
        metric_score = model.scorers[task_name].score(
            self.running_golds[identifier],
            self.running_probs[identifier],
            prob_to_pred(self.running_probs[identifier]),
            self.running_uids[identifier],
        )
        for metric_name, metric_value in metric_score.items():
            metric_dict[f"{identifier}/{metric_name}"] = metric_value

        # Mean of this task's metrics, plus split-level collections.
        avg_identifier = construct_identifier(task_name, data_name, split, "average")
        metric_dict[avg_identifier] = np.mean(list(metric_score.values()))

        micro_score_dict[split].extend(list(metric_score.values()))
        macro_score_dict[split].append(metric_dict[avg_identifier])

    # Split-wise micro/macro averages.
    for split in micro_score_dict:
        micro_id = construct_identifier("model", "all", split, "micro_average")
        metric_dict[micro_id] = np.mean(micro_score_dict[split])
        macro_id = construct_identifier("model", "all", split, "macro_average")
        metric_dict[macro_id] = np.mean(macro_score_dict[split])

    # Current learning rate for logging.
    metric_dict["model/all/train/lr"] = self.optimizer.param_groups[0]["lr"]

    return metric_dict
def accuracy_scorer(
    golds: ndarray,
    probs: Optional[ndarray],
    preds: Optional[ndarray],
    uids: Optional[List[str]] = None,
    normalize: bool = True,
    topk: int = 1,
) -> Dict[str, Union[float, int]]:
    r"""Accuracy classification score.

    Args:
      golds(ndarray): Ground truth values.
      probs(ndarray or None): Predicted probabilities (required for topk > 1).
      preds(ndarray or None): Predicted values.
      uids(list, optional): Unique ids, defaults to None.
      normalize(bool, optional): Normalize the results or not, defaults to True.
      topk(int, optional): Top K accuracy, defaults to 1.

    Returns:
      dict: Accuracy, if normalize is True, return the fraction of correctly
      predicted samples (float), else returns the number of correctly
      predicted samples (int).
    """
    # Convert probabilistic label to hard label
    if len(golds.shape) == 2:
        golds = prob_to_pred(golds)

    if topk == 1 and preds is not None:
        # Plain accuracy: count exact matches.
        n_matches = np.count_nonzero(golds == preds)
    else:
        # Top-k: a sample counts if its gold label is among the k classes
        # with the highest probability.
        top_candidates = np.argsort(probs, axis=1)[:, -topk:]
        hits = (top_candidates == golds.reshape(-1, 1)).any(axis=1)
        n_matches = hits.sum()

    key = "accuracy" if topk == 1 else f"accuracy@{topk}"
    if normalize:
        return {key: n_matches / golds.shape[0]}
    return {key: n_matches}
def predict(self, dataloader, return_preds=False):
    """Predict from dataloader, accumulating per-task uids/golds/probs/losses."""
    self.eval()

    uid_dict = defaultdict(list)
    gold_dict = defaultdict(list)
    prob_dict = defaultdict(list)
    pred_dict = defaultdict(list)
    loss_dict = defaultdict(float)

    # Dataloader metadata used for every batch.
    task_to_label_dict = dataloader.task_to_label_dict
    uid = dataloader.uid

    for batch_num, (X_bdict, Y_bdict) in enumerate(dataloader):
        uid_bdict, loss_bdict, prob_bdict, gold_bdict = self.forward(
            X_bdict[uid], X_bdict, Y_bdict, task_to_label_dict
        )
        for task_name, batch_uids in uid_bdict.items():
            uid_dict[task_name].extend(batch_uids)
            prob_dict[task_name].extend(prob_bdict[task_name])
            gold_dict[task_name].extend(gold_bdict[task_name])
            # Weight batch loss by example count for a correct average later.
            loss_dict[task_name] += loss_bdict[task_name].item() * len(batch_uids)

    # Turn summed losses into per-example averages.
    for task_name, all_uids in uid_dict.items():
        loss_dict[task_name] /= len(all_uids)

    res = {
        "uids": uid_dict,
        "golds": gold_dict,
        "probs": prob_dict,
        "losses": loss_dict,
    }

    if return_preds:
        for task_name, prob in prob_dict.items():
            pred_dict[task_name] = prob_to_pred(prob)
        res["preds"] = pred_dict

    return res
def matthews_correlation_coefficient_scorer(
    golds: ndarray,
    probs: Optional[ndarray],
    preds: ndarray,
    uids: Optional[List[str]] = None,
) -> Dict[str, float]:
    """Matthews correlation coefficient (MCC).

    Args:
      golds: Ground truth values.
      probs: Predicted probabilities (not used).
      preds: Predicted values.
      uids: Unique ids, defaults to None.

    Returns:
      Matthews correlation coefficient score.
    """
    # Collapse probabilistic labels into hard labels first.
    if len(golds.shape) == 2:
        golds = prob_to_pred(golds)

    mcc = matthews_corrcoef(golds, preds)
    return {"matthews_corrcoef": mcc}
def recall_scorer(golds, probs, preds, uids=None, pos_label=1):
    """Recall.

    :param golds: Ground truth (correct) target values.
    :type golds: 1-d np.array
    :param probs: Predicted target probabilities. (Not used!)
    :type probs: k-d np.array
    :param preds: Predicted target values.
    :type preds: 1-d np.array
    :param uids: Unique ids.
    :type uids: list
    :param pos_label: The positive class label.
    :type pos_label: int
    :return: Recall.
    :rtype: dict
    """
    # Collapse probabilistic labels into hard labels first.
    if len(golds.shape) > 1:
        golds = prob_to_pred(golds)

    predicted_positive = preds == pos_label
    actual_positive = golds == pos_label

    true_pos = np.sum(predicted_positive & actual_positive)
    false_neg = np.sum(~predicted_positive & actual_positive)

    # Guard against division by zero when there are no actual positives.
    denom = true_pos + false_neg
    return {"recall": true_pos / denom if denom > 0 else 0.0}
def predict(
    self,
    dataloader: EmmentalDataLoader,
    return_probs: bool = True,
    return_preds: bool = False,
    return_action_outputs: bool = False,
) -> Dict[str, Any]:
    """Predict from dataloader.

    Args:
        dataloader: The dataloader to predict.
        return_probs: Whether return probs or not, defaults to True.
        return_preds: Whether return predictions or not, defaults to False.
        return_action_outputs: Whether return action_outputs or not,
          defaults to False.

    Returns:
        The result dict with keys "uids", "golds", "losses" (plus "probs",
        "preds", "outputs" depending on the flags).
    """
    self.eval()

    uid_dict: Dict[str, List[str]] = defaultdict(list)
    prob_dict: Dict[str, Union[ndarray, List[ndarray]]] = defaultdict(list)
    pred_dict: Dict[str, Union[ndarray, List[ndarray]]] = defaultdict(list)
    gold_dict: Dict[str, List[Union[ndarray, int, float]]] = defaultdict(list)
    out_dict: Dict[str, Dict[str, List[Union[ndarray, int, float]]]] = defaultdict(
        lambda: defaultdict(list)
    )
    loss_dict: Dict[str, Union[ndarray, float]] = defaultdict(list)  # type: ignore

    # No labels available -> no golds/losses to collect.
    if not dataloader.is_learnable:
        gold_dict = None
        loss_dict = None

    # Collect dataloader information
    task_to_label_dict = dataloader.task_to_label_dict
    uid = dataloader.uid

    with torch.no_grad():
        for batch_num, bdict in tqdm(
            enumerate(dataloader),
            total=len(dataloader),
            desc=f"Evaluating {dataloader.data_name} ({dataloader.split})",
        ):
            # Fix: `isinstance(...) == 1` -> plain isinstance (idiomatic,
            # behavior-identical). A bare dict means the batch has no labels.
            if isinstance(bdict, dict):
                X_bdict = bdict
                Y_bdict = None
            else:
                X_bdict, Y_bdict = bdict
                if not dataloader.is_learnable:
                    Y_bdict = None

            if return_action_outputs:
                (
                    uid_bdict,
                    loss_bdict,
                    prob_bdict,
                    gold_bdict,
                    out_bdict,
                ) = self.forward(  # type: ignore
                    X_bdict[uid],
                    X_bdict,
                    Y_bdict,
                    task_to_label_dict,
                    return_action_outputs=return_action_outputs,
                    return_probs=return_probs or return_preds,
                )
            else:
                (
                    uid_bdict,
                    loss_bdict,
                    prob_bdict,
                    gold_bdict,
                ) = self.forward(  # type: ignore
                    X_bdict[uid],
                    X_bdict,
                    Y_bdict,
                    task_to_label_dict,
                    return_action_outputs=return_action_outputs,
                    return_probs=return_probs or return_preds,
                )
                out_bdict = None

            for task_name in uid_bdict.keys():
                uid_dict[task_name].extend(uid_bdict[task_name])
                if return_probs:
                    prob_dict[task_name].extend(  # type: ignore
                        prob_bdict[task_name]
                    )
                if return_preds:
                    pred_dict[task_name].extend(  # type: ignore
                        prob_to_pred(prob_bdict[task_name])
                    )
                if dataloader.is_learnable:
                    gold_dict[task_name].extend(gold_bdict[task_name])
                    if len(loss_bdict[task_name].size()) == 0:
                        # Scalar loss: accumulate a weighted running sum
                        # (lazily switch the default list to a number).
                        if loss_dict[task_name] == []:
                            loss_dict[task_name] = 0
                        loss_dict[task_name] += loss_bdict[task_name].item() * len(
                            uid_bdict[task_name]
                        )
                    else:
                        # Per-example losses: keep all of them.
                        loss_dict[task_name].extend(  # type: ignore
                            loss_bdict[task_name].cpu().numpy()
                        )

            if return_action_outputs and out_bdict:
                for task_name in out_bdict.keys():
                    for action_name in out_bdict[task_name].keys():
                        out_dict[task_name][action_name].extend(
                            out_bdict[task_name][action_name]
                        )

    # Calculate average loss (only the scalar, summed case needs dividing).
    if dataloader.is_learnable:
        for task_name in uid_dict.keys():
            if not isinstance(loss_dict[task_name], list):
                loss_dict[task_name] /= len(uid_dict[task_name])

    res = {
        "uids": uid_dict,
        "golds": gold_dict,
        "losses": loss_dict,
    }

    if return_probs:
        for task_name in prob_dict.keys():
            prob_dict[task_name] = array_to_numpy(prob_dict[task_name])
        res["probs"] = prob_dict

    if return_preds:
        for task_name in pred_dict.keys():
            pred_dict[task_name] = array_to_numpy(pred_dict[task_name])
        res["preds"] = pred_dict

    if return_action_outputs:
        res["outputs"] = out_dict

    return res
def _aggregate_running_metrics(
        self, model: EmmentalModel,
        calc_running_scores: bool = False) -> Dict[str, float]:
    """Calculate the running overall and task specific metrics.

    Args:
      model: The model to evaluate.
      calc_running_scores: Whether to calc running scores

    Returns:
      The score dict.
    """
    metric_dict: Dict[str, float] = dict()

    total_count = 0
    # Log task specific loss (per-identifier average over examples seen so far)
    for identifier in self.running_uids.keys():
        count = len(self.running_uids[identifier])
        if count > 0:
            metric_dict[identifier + "/loss"] = float(
                self.running_losses[identifier] / count)
            total_count += count

    # Calculate average micro loss (sum of all losses over all examples)
    if total_count > 0:
        total_loss = sum(self.running_losses.values())
        metric_dict["model/all/train/loss"] = float(total_loss / total_count)

    if calc_running_scores:
        micro_score_dict: Dict[str, List[float]] = defaultdict(list)
        macro_score_dict: Dict[str, List[float]] = defaultdict(list)

        # Calculate training metric
        for identifier in self.running_uids.keys():
            task_name, data_name, split = identifier.split("/")
            # Only score tasks that have a scorer and accumulated golds/probs.
            if (model.scorers[task_name] and self.running_golds[identifier]
                    and self.running_probs[identifier]):
                metric_score = model.scorers[task_name].score(
                    self.running_golds[identifier],
                    self.running_probs[identifier],
                    prob_to_pred(self.running_probs[identifier]),
                    self.running_uids[identifier],
                )
                for metric_name, metric_value in metric_score.items():
                    metric_dict[
                        f"{identifier}/{metric_name}"] = metric_value

                # Collect average score
                # NOTE: `identifier` is deliberately rebound here; the loop
                # variable is reassigned on the next iteration, so this is safe.
                identifier = construct_identifier(task_name, data_name,
                                                  split, "average")
                metric_dict[identifier] = np.mean(
                    list(metric_score.values()))

                micro_score_dict[split].extend(
                    list(metric_score.values())  # type: ignore
                )
                macro_score_dict[split].append(metric_dict[identifier])

        # Collect split-wise micro/macro average score
        for split in micro_score_dict.keys():
            identifier = construct_identifier("model", "all", split,
                                              "micro_average")
            metric_dict[identifier] = np.mean(
                micro_score_dict[split]  # type: ignore
            )
            identifier = construct_identifier("model",
                                              "all", split, "macro_average")
            metric_dict[identifier] = np.mean(
                macro_score_dict[split]  # type: ignore
            )

    # Log the learning rate
    metric_dict["model/all/train/lr"] = self.optimizer.param_groups[0][
        "lr"]

    return metric_dict
def batched_pred_iter(
    model,
    dataloader,
    eval_accumulation_steps,
    sent_idx2num_mens,
):
    """Predict from dataloader taking into account eval accumulation steps.

    Will yield a new prediction set after each set accumulation steps for
    writing out. If a sentence or batch doesn't have any mentions, it will
    not be returned by this method.

    Recall that we split up sentences that are too long to feed to the model.
    We use the sent_idx2num_mens dict to ensure we have full sentences
    evaluated before returning, otherwise we'll have incomplete sentences to
    merge together when dumping.

    Args:
        model: model
        dataloader: The dataloader to predict
        eval_accumulation_steps: Number of eval steps to run before returning
        sent_idx2num_mens: list of sent index to number of mentions

    Returns:
        Iterator over result dict.
    """

    def collect_result(uid_d, gold_d, pred_d, prob_d, out_d, cur_sentidx_nummen):
        """Merges results for the sentences where all mentions have been evaluated."""
        final_uid_d = defaultdict(list)
        final_prob_d = defaultdict(list)
        final_pred_d = defaultdict(list)
        final_gold_d = defaultdict(list)
        final_out_d = defaultdict(lambda: defaultdict(list))
        sentidxs_finalized = []
        # print("FINALIZE", cur_sentidx_nummen, [sent_idx2num_mens[str(k)] for k in cur_sentidx_nummen])
        for sent_idx, cur_mention_set in cur_sentidx_nummen.items():
            assert (
                len(cur_mention_set) <= sent_idx2num_mens[str(sent_idx)]
            ), f"Too many mentions for {sent_idx}: {cur_mention_set} VS {sent_idx2num_mens[str(sent_idx)]}"
            # A sentence is complete once every one of its mentions was seen.
            if len(cur_mention_set) == sent_idx2num_mens[str(sent_idx)]:
                sentidxs_finalized.append(sent_idx)
                for task_name in uid_d:
                    final_uid_d[task_name].extend(uid_d[task_name][sent_idx])
                    final_prob_d[task_name].extend(prob_d[task_name][sent_idx])
                    final_pred_d[task_name].extend(pred_d[task_name][sent_idx])
                    final_gold_d[task_name].extend(gold_d[task_name][sent_idx])
                    if task_name in out_d.keys():
                        for action_name in out_d[task_name].keys():
                            final_out_d[task_name][action_name].extend(
                                out_d[task_name][action_name][sent_idx])
        # If batch size is close to 1 and accumulation step was close to 1,
        # we may get to where there are no complete sentences
        if len(sentidxs_finalized) == 0:
            return {}, sentidxs_finalized
        res = {
            "uids": final_uid_d,
            "golds": final_gold_d,
        }
        for task_name in final_prob_d.keys():
            final_prob_d[task_name] = array_to_numpy(final_prob_d[task_name])
        res["probs"] = final_prob_d
        for task_name in final_pred_d.keys():
            final_pred_d[task_name] = array_to_numpy(final_pred_d[task_name])
        res["preds"] = final_pred_d
        res["outputs"] = final_out_d
        return res, sentidxs_finalized

    model.eval()
    # Will store sent_idx -> task_name -> list output
    uid_dict = defaultdict(lambda: defaultdict(list))
    prob_dict = defaultdict(lambda: defaultdict(list))
    pred_dict = defaultdict(lambda: defaultdict(list))
    gold_dict = defaultdict(lambda: defaultdict(list))
    # Will store sent_idx -> task_name -> output key -> list output
    out_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    # list of all finalized and yielded sentences
    all_finalized_sentences = []
    # Storing currently stored sent idx -> unique mentions seen (for sentences
    # that aren't complete, we'll hold until they are)
    cur_sentidx2_nummentions = dict()
    num_eval_steps = 0
    # Collect dataloader information
    task_to_label_dict = dataloader.task_to_label_dict
    uid = dataloader.uid

    with torch.no_grad():
        for batch_num, bdict in tqdm(
            enumerate(dataloader),
            total=len(dataloader),
            desc=f"Evaluating {dataloader.data_name} ({dataloader.split})",
        ):
            num_eval_steps += 1
            X_bdict, Y_bdict = bdict
            (
                uid_bdict,
                loss_bdict,
                prob_bdict,
                gold_bdict,
                out_bdict,
            ) = model.forward(  # type: ignore
                X_bdict[uid],
                X_bdict,
                Y_bdict,
                task_to_label_dict,
                return_action_outputs=True,
                return_probs=True,
            )
            assert (
                NED_TASK in uid_bdict
            ), f"{NED_TASK} task needs to be in returned in uid to get number of mentions"
            for task_name in uid_bdict.keys():
                for ex_idx in range(len(uid_bdict[task_name])):
                    # Recall that our uid is
                    # ============================
                    # guid_dtype = np.dtype(
                    #     [
                    #         ("sent_idx", "i8", 1),
                    #         ("subsent_idx", "i8", 1),
                    #         ("alias_orig_list_pos", "i8", (data_config.max_aliases,)),
                    #     ]
                    # )
                    # ============================
                    # Index 0 -> sent_idx, Index 1 -> subsent_idx,
                    # Index 2 -> List of aliases positions
                    # (-1 means no mention in train example)
                    sent_idx = uid_bdict[task_name][ex_idx][0]
                    # Only increment for NED TASK
                    if task_name == NED_TASK:
                        # alias_pos_for_eval gives which mentions are meant to
                        # be evaluated in this batch (-1 means skip) for
                        # scoring. This will be different than the mentions
                        # seen by the model as we window sentences and a
                        # mention may be seen multiple times but only scored
                        # once. This includes for True and False anchors - we
                        # dump all anchors for analysis
                        alias_pos_for_eval = out_bdict[task_name][
                            "_input__for_dump_gold_cand_K_idx_train"][ex_idx]
                        assert len(alias_pos_for_eval) == len(
                            uid_bdict[task_name][ex_idx][2])
                        if sent_idx not in cur_sentidx2_nummentions:
                            cur_sentidx2_nummentions[sent_idx] = set()
                        # Index 2 is index of alias positions in original list
                        # (-1 means no mention)
                        cur_sentidx2_nummentions[sent_idx].update(
                            set([
                                i for j, i in enumerate(uid_bdict[task_name]
                                                        [ex_idx][2])
                                if alias_pos_for_eval[j] != -1
                            ]))
                    # Store each example keyed by its sentence index so a
                    # sentence can be assembled across sub-sentence windows.
                    uid_dict[task_name][sent_idx].extend(
                        uid_bdict[task_name][ex_idx:ex_idx + 1])
                    prob_dict[task_name][sent_idx].extend(
                        prob_bdict[task_name][ex_idx:ex_idx + 1])  # type: ignore
                    pred_dict[task_name][sent_idx].extend(  # type: ignore
                        prob_to_pred(prob_bdict[task_name][ex_idx:ex_idx + 1]))
                    gold_dict[task_name][sent_idx].extend(
                        gold_bdict[task_name][ex_idx:ex_idx + 1])
                    if task_name in out_bdict.keys():
                        for action_name in out_bdict[task_name].keys():
                            out_dict[task_name][action_name][sent_idx].extend(
                                out_bdict[task_name][action_name]
                                [ex_idx:ex_idx + 1])
            if num_eval_steps >= eval_accumulation_steps:
                # Collect the sentences that have all mentions collected
                res, finalized_sent_idxs = collect_result(
                    uid_dict,
                    gold_dict,
                    pred_dict,
                    prob_dict,
                    out_dict,
                    cur_sentidx2_nummentions,
                )
                all_finalized_sentences.extend(
                    [str(s) for s in finalized_sent_idxs])
                num_eval_steps = 0
                # Drop finalized sentences from the accumulators to bound memory.
                for final_sent_i in finalized_sent_idxs:
                    assert final_sent_i in cur_sentidx2_nummentions
                    del cur_sentidx2_nummentions[final_sent_i]
                    for task_name in uid_dict.keys():
                        del uid_dict[task_name][final_sent_i]
                        del prob_dict[task_name][final_sent_i]
                        del pred_dict[task_name][final_sent_i]
                        del gold_dict[task_name][final_sent_i]
                        if task_name in out_dict.keys():
                            for action_name in out_dict[task_name].keys():
                                del out_dict[task_name][action_name][
                                    final_sent_i]
                if len(res) > 0:
                    # print("FINALIZED", finalized_sent_idxs)
                    yield res

    # Flush whatever complete sentences remain after the last batch.
    res, finalized_sent_idxs = collect_result(uid_dict, gold_dict, pred_dict,
                                              prob_dict, out_dict,
                                              cur_sentidx2_nummentions)
    all_finalized_sentences.extend([str(s) for s in finalized_sent_idxs])
    for final_sent_i in finalized_sent_idxs:
        del cur_sentidx2_nummentions[final_sent_i]
    if len(res) > 0:
        # print("FINALIZED", finalized_sent_idxs)
        yield res
    assert (
        len(cur_sentidx2_nummentions) == 0
    ), f"After eval, some sentences had left over mentions {cur_sentidx2_nummentions}"
    assert set(all_finalized_sentences).intersection(sent_idx2num_mens.keys(
    )) == set([k for k, v in sent_idx2num_mens.items() if v > 0]), (
        f"Some sentences are left over "
        f"{[s for s in sent_idx2num_mens if s not in set(all_finalized_sentences) and sent_idx2num_mens[s] > 0]}"
    )
    return None
def predict(
    self, dataloader: EmmentalDataLoader, return_preds: bool = False
) -> Dict[str, Any]:
    """Predict from dataloader.

    Args:
        dataloader: The dataloader to predict.
        return_preds: Whether return predictions or not, defaults to False.

    Returns:
        The result dict.
    """
    self.eval()

    uid_dict: Dict[str, List[str]] = defaultdict(list)
    gold_dict: Dict[str, List[Union[ndarray, int, float]]] = defaultdict(list)
    prob_dict: Dict[str, List[Union[ndarray, int, float]]] = defaultdict(list)
    pred_dict: Dict[str, List[ndarray]] = defaultdict(list)
    # Fix it later
    loss_dict: Dict[str, Union[ndarray, float]] = defaultdict(list)  # type: ignore

    # Dataloader metadata used for every batch.
    task_to_label_dict = dataloader.task_to_label_dict
    uid = dataloader.uid

    for batch_num, (X_bdict, Y_bdict) in enumerate(dataloader):
        uid_bdict, loss_bdict, prob_bdict, gold_bdict = self.forward(
            X_bdict[uid], X_bdict, Y_bdict, task_to_label_dict
        )
        for task_name, batch_uids in uid_bdict.items():
            uid_dict[task_name].extend(batch_uids)
            prob_dict[task_name].extend(prob_bdict[task_name])
            gold_dict[task_name].extend(gold_bdict[task_name])
            batch_loss = loss_bdict[task_name]
            if len(batch_loss.size()) == 0:
                # Scalar loss: accumulate a weighted running sum
                # (lazily switch the default list to a number).
                if loss_dict[task_name] == []:
                    loss_dict[task_name] = 0
                loss_dict[task_name] += batch_loss.item() * len(batch_uids)
            else:
                # Per-example losses: keep all of them.
                loss_dict[task_name].extend(  # type: ignore
                    batch_loss.cpu().numpy()
                )

    # Average only the scalar (summed) losses.
    for task_name, all_uids in uid_dict.items():
        if not isinstance(loss_dict[task_name], list):
            loss_dict[task_name] /= len(all_uids)

    res = {
        "uids": uid_dict,
        "golds": gold_dict,
        "probs": prob_dict,
        "losses": loss_dict,
    }

    if return_preds:
        for task_name, prob in prob_dict.items():
            pred_dict[task_name] = prob_to_pred(prob)
        res["preds"] = pred_dict

    return res
def predict(self, dataloader, return_preds=False, return_uids=False):
    """Predict from dataloader, filtering out examples whose gold label
    matches the configured ignore_index.

    Args:
        dataloader: The dataloader to predict.
        return_preds: Whether to include hard predictions in the result.
        return_uids: Whether to include unique ids in the result (disabled
            automatically when the dataset has no uid field).

    Returns:
        dict with "golds" and "probs" (plus "preds"/"uids" per the flags).
    """
    self.eval()

    uid_key = dataloader.dataset.uid

    # Check uid exists
    if return_uids and uid_key is None:
        return_uids = False
        logger.info("No uid exist, skip it...")

    uid_dict = defaultdict(list)
    gold_dict = defaultdict(list)
    prob_dict = defaultdict(list)

    for batch_num, (X_batch_dict, Y_batch_dict) in enumerate(dataloader):
        prob_batch_dict = self._calculate_probs(
            X_batch_dict, dataloader.task_to_label_dict.keys()
        )
        for task_name in dataloader.task_to_label_dict.keys():
            if return_uids:
                uid_dict[task_name].extend(X_batch_dict[uid_key])
            prob_dict[task_name].extend(prob_batch_dict[task_name])
            gold_dict[task_name].extend(
                Y_batch_dict[dataloader.task_to_label_dict[task_name]].cpu().numpy()
            )

    for task_name in gold_dict:
        gold_dict[task_name] = np.array(gold_dict[task_name])
        prob_dict[task_name] = np.array(prob_dict[task_name])
        if len(gold_dict[task_name].shape) == 1:
            # 1-d golds: keep rows whose label is NOT ignore_index.
            active = (
                gold_dict[task_name] != Meta.config["learner_config"]["ignore_index"]
            ).reshape(-1)
        else:
            # NOTE(review): this 2-d branch marks rows CONTAINING
            # ignore_index as active, the opposite of the 1-d branch —
            # looks like it may be inverted; confirm intended semantics.
            active = (
                np.sum(
                    gold_dict[task_name]
                    == Meta.config["learner_config"]["ignore_index"],
                    axis=1,
                )
                > 0
            )
        # `0 in active` is True when at least one row is inactive (False);
        # only then do we need to filter.
        if 0 in active:
            gold_dict[task_name] = gold_dict[task_name][active]
            prob_dict[task_name] = prob_dict[task_name][active]
            if return_uids:
                uid_dict[task_name] = [
                    uid_dict[task_name][i]
                    for i, value in enumerate(active)
                    if value
                ]

    if return_preds:
        pred_dict = defaultdict(list)
        for task_name, prob in prob_dict.items():
            pred_dict[task_name] = prob_to_pred(prob)

    res = {"golds": gold_dict, "probs": prob_dict}

    if return_preds:
        res["preds"] = pred_dict
    if return_uids:
        res["uids"] = uid_dict

    return res