Example #1
def test_prob_to_pred(caplog):
    """Unit test of prob_to_pred."""
    caplog.set_level(logging.INFO)

    assert np.array_equal(prob_to_pred(np.array([[0, 1], [1, 0]])),
                          np.array([1, 0]))
    assert np.array_equal(
        prob_to_pred(np.array([[0.4, 0.5], [0.2, 0.8], [0.9, 0.1]])),
        np.array([1, 1, 0]),
    )
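For reference, prob_to_pred maps an (n_samples, n_classes) probability array to hard labels via the per-row argmax; a minimal sketch of that behavior (the real implementation lives in the library's utils, this stand-in only mirrors it):

import numpy as np

def prob_to_pred_sketch(probs: np.ndarray) -> np.ndarray:
    # Hard label = index of the highest probability in each row.
    return np.argmax(probs, axis=-1)

# Matches the expectations asserted in the test above.
assert np.array_equal(prob_to_pred_sketch(np.array([[0, 1], [1, 0]])), np.array([1, 0]))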
Example #2
def precision_scorer(
    golds: ndarray,
    probs: Optional[ndarray],
    preds: ndarray,
    uids: Optional[List[str]] = None,
    pos_label: int = 1,
) -> Dict[str, float]:
    """Precision.

    Args:
      golds(ndarray): Ground truth values.
      probs(ndarray or None): Predicted probabilities.
      preds(ndarray): Predicted values.
      uids(list, optional): Unique ids, defaults to None.
      pos_label(int, optional): The positive class label, defaults to 1.

    Returns:
      dict: Precision.

    """
    if len(golds.shape) > 1:
        golds = prob_to_pred(golds)
    pred_pos = np.where(preds == pos_label, True, False)
    gt_pos = np.where(golds == pos_label, True, False)
    TP = np.sum(pred_pos * gt_pos)
    FP = np.sum(pred_pos * np.logical_not(gt_pos))

    precision = TP / (TP + FP) if TP + FP > 0 else 0.0

    return {"precision": precision}
Example #3
def recall_scorer(
    golds: ndarray,
    probs: Optional[ndarray],
    preds: ndarray,
    uids: Optional[List[str]] = None,
    pos_label: int = 1,
) -> Dict[str, float]:
    """Recall.

    Args:
      golds: Ground truth values.
      probs: Predicted probabilities.
      preds: Predicted values.
      uids: Unique ids, defaults to None.
      pos_label: The positive class label, defaults to 1.

    Returns:
      Recall.
    """
    # Convert probabilistic label to hard label
    if len(golds.shape) == 2:
        golds = prob_to_pred(golds)

    pred_pos = np.where(preds == pos_label, True, False)
    gt_pos = np.where(golds == pos_label, True, False)
    TP = np.sum(pred_pos * gt_pos)
    FN = np.sum(np.logical_not(pred_pos) * gt_pos)

    recall = TP / (TP + FN) if TP + FN > 0 else 0.0

    return {"recall": recall}
Example #4
def fbeta_scorer(
    golds: ndarray,
    probs: Optional[ndarray],
    preds: ndarray,
    uids: Optional[List[str]] = None,
    pos_label: int = 1,
    beta: int = 1,
) -> Dict[str, float]:
    """F-beta score is the weighted harmonic mean of precision and recall.

    Args:
      golds: Ground truth values.
      probs: Predicted probabilities.
      preds: Predicted values.
      uids: Unique ids, defaults to None.
      pos_label: The positive class label, defaults to 1.
      beta: Weight of precision in harmonic mean, defaults to 1.

    Returns:
      F-beta score.
    """
    # Convert probabilistic label to hard label
    if len(golds.shape) == 2:
        golds = prob_to_pred(golds)

    precision = precision_scorer(golds, probs, preds, uids,
                                 pos_label)["precision"]
    recall = recall_scorer(golds, probs, preds, uids, pos_label)["recall"]

    fbeta = ((1 + beta**2) * (precision * recall) /
             ((beta**2 * precision) + recall) if
             (beta**2 * precision) + recall > 0 else 0.0)

    return {f"f{beta}": fbeta}
Example #5
def roc_auc_scorer(
    golds: ndarray,
    probs: ndarray,
    preds: Optional[ndarray],
    uids: Optional[List[str]] = None,
) -> Dict[str, float]:
    """ROC AUC.

    Args:
      golds(ndarray): Ground truth values.
      probs(ndarray): Predicted probabilities.
      preds(ndarray or None): Predicted values.
      uids(list, optional): Unique ids, defaults to None.

    Returns:
      dict: ROC AUC score.

    """

    if len(probs.shape) == 2 and probs.shape[1] == 1:
        probs = probs.reshape(probs.shape[0])

    if len(golds.shape) == 2 and golds.shape[1] == 1:
        golds = golds.reshape(golds.shape[0])

    if len(probs.shape) > 1:
        if len(golds.shape) > 1:
            golds = pred_to_prob(prob_to_pred(golds), n_classes=probs.shape[1])
        else:
            golds = pred_to_prob(golds, n_classes=probs.shape[1])
    else:
        if len(golds.shape) > 1:
            golds = prob_to_pred(golds)

    try:
        roc_auc = roc_auc_score(golds, probs)
    except ValueError:
        logger.warning(
            "Only one class present in golds."
            "ROC AUC score is not defined in that case, set as nan instead."
        )
        roc_auc = float("nan")

    return {"roc_auc": roc_auc}
Example #6
    def predict(self,
                dataloader: EmmentalDataLoader,
                return_preds: bool = False) -> Dict[str, Any]:
        r"""Predict from dataloader.

        Args:
          dataloader(EmmentalDataLoader): The dataloader to predict.
          return_preds(bool): Whether to return predictions or not, defaults to False.

        Returns:
          dict: The result dict.

        """

        self.eval()

        uid_dict: Dict[str, List[str]] = defaultdict(list)
        gold_dict: Dict[str, List[Union[ndarray, int,
                                        float]]] = defaultdict(list)
        prob_dict: Dict[str, List[Union[ndarray, int,
                                        float]]] = defaultdict(list)
        pred_dict: Dict[str, List[ndarray]] = defaultdict(list)
        loss_dict: Dict[str, Union[ndarray, float]] = defaultdict(float)

        # Collect dataloader information
        task_to_label_dict = dataloader.task_to_label_dict
        uid = dataloader.uid

        for batch_num, (X_bdict, Y_bdict) in tqdm(enumerate(dataloader),
                                                  total=len(dataloader)):
            uid_bdict, loss_bdict, prob_bdict, gold_bdict = self.forward(
                X_bdict[uid], X_bdict, Y_bdict, task_to_label_dict)
            for task_name in uid_bdict.keys():
                uid_dict[task_name].extend(uid_bdict[task_name])
                prob_dict[task_name].extend(prob_bdict[task_name])
                gold_dict[task_name].extend(gold_bdict[task_name])
                loss_dict[task_name] += loss_bdict[task_name].item() * len(
                    uid_bdict[task_name])

        # Calculate average loss
        for task_name in uid_dict.keys():
            loss_dict[task_name] /= len(uid_dict[task_name])

        res = {
            "uids": uid_dict,
            "golds": gold_dict,
            "probs": prob_dict,
            "losses": loss_dict,
        }

        if return_preds:
            for task_name, prob in prob_dict.items():
                pred_dict[task_name] = prob_to_pred(prob)
            res["preds"] = pred_dict

        return res
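The running loss above is accumulated per example and only divided by the number of uids at the end; a tiny sketch of that size-weighted average (numbers are invented):

batch_losses = [0.50, 0.70]  # mean loss reported for each batch of one task
batch_sizes = [32, 16]
running = sum(loss * n for loss, n in zip(batch_losses, batch_sizes))
avg_loss = running / sum(batch_sizes)  # 0.5666..., weighted by examples rather than by batches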
Example #7
    def _aggregate_running_metrics(self, model):
        """Calculate the running overall and task specific metrics."""

        metric_dict = dict()

        total_count = 0
        # Log task specific loss
        for identifier in self.running_uids.keys():
            count = len(self.running_uids[identifier])
            if count > 0:
                metric_dict[identifier + "/loss"] = (
                    self.running_losses[identifier] / count
                )
            total_count += count

        # Calculate average micro loss
        if total_count > 0:
            total_loss = sum(self.running_losses.values())
            metric_dict["model/all/train/loss"] = total_loss / total_count

        micro_score_dict = defaultdict(list)
        macro_score_dict = defaultdict(list)

        # Calculate training metric
        for identifier in self.running_uids.keys():
            task_name, data_name, split = identifier.split("/")

            metric_score = model.scorers[task_name].score(
                self.running_golds[identifier],
                self.running_probs[identifier],
                prob_to_pred(self.running_probs[identifier]),
                self.running_uids[identifier],
            )
            for metric_name, metric_value in metric_score.items():
                metric_dict[f"{identifier}/{metric_name}"] = metric_value

            # Collect average score
            identifier = construct_identifier(task_name, data_name, split, "average")

            metric_dict[identifier] = np.mean(list(metric_score.values()))

            micro_score_dict[split].extend(list(metric_score.values()))
            macro_score_dict[split].append(metric_dict[identifier])

        # Collect split-wise micro/macro average score
        for split in micro_score_dict.keys():
            identifier = construct_identifier("model", "all", split, "micro_average")
            metric_dict[identifier] = np.mean(micro_score_dict[split])
            identifier = construct_identifier("model", "all", split, "macro_average")
            metric_dict[identifier] = np.mean(macro_score_dict[split])

        # Log the learning rate
        metric_dict["model/all/train/lr"] = self.optimizer.param_groups[0]["lr"]

        return metric_dict
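A toy illustration of the micro vs. macro averages collected above (identifiers and scores are invented):

import numpy as np

task_scores = {"task1/data/train": [0.80, 0.60], "task2/data/train": [0.90]}
# Micro average pools every metric value; macro average averages the per-task averages.
micro = np.mean([v for scores in task_scores.values() for v in scores])  # 0.7666...
macro = np.mean([np.mean(scores) for scores in task_scores.values()])    # 0.80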
Example #8
def accuracy_scorer(
    golds: ndarray,
    probs: Optional[ndarray],
    preds: Optional[ndarray],
    uids: Optional[List[str]] = None,
    normalize: bool = True,
    topk: int = 1,
) -> Dict[str, Union[float, int]]:
    r"""Accuracy classification score.

    Args:
      golds(ndarray): Ground truth values.
      probs(ndarray or None): Predicted probabilities.
      preds(ndarray or None): Predicted values.
      uids(list, optional): Unique ids, defaults to None.
      normalize(bool, optional): Normalize the results or not, defaults to True.
      topk(int, optional): Top K accuracy, defaults to 1.

    Returns:
      dict: Accuracy. If normalize is True, returns the fraction of correctly
      predicted samples (float); otherwise, returns the number of correctly
      predicted samples (int).

    """

    # Convert probabilistic label to hard label
    if len(golds.shape) == 2:
        golds = prob_to_pred(golds)

    if topk == 1 and preds is not None:
        n_matches = np.where(golds == preds)[0].shape[0]
    else:
        topk_preds = probs.argsort(axis=1)[:, -topk:][:, ::-1]
        n_matches = np.logical_or.reduce(topk_preds == golds.reshape(-1, 1),
                                         axis=1).sum()

    if normalize:
        return {
            "accuracy" if topk == 1 else f"accuracy@{topk}":
            n_matches / golds.shape[0]
        }
    else:
        return {"accuracy" if topk == 1 else f"accuracy@{topk}": n_matches}
Example #9
    def predict(self, dataloader, return_preds=False):

        self.eval()

        uid_dict = defaultdict(list)
        gold_dict = defaultdict(list)
        prob_dict = defaultdict(list)
        pred_dict = defaultdict(list)
        loss_dict = defaultdict(float)

        # Collect dataloader information
        task_to_label_dict = dataloader.task_to_label_dict
        uid = dataloader.uid

        for batch_num, (X_bdict, Y_bdict) in enumerate(dataloader):
            uid_bdict, loss_bdict, prob_bdict, gold_bdict = self.forward(
                X_bdict[uid], X_bdict, Y_bdict, task_to_label_dict)
            for task_name in uid_bdict.keys():
                uid_dict[task_name].extend(uid_bdict[task_name])
                prob_dict[task_name].extend(prob_bdict[task_name])
                gold_dict[task_name].extend(gold_bdict[task_name])
                loss_dict[task_name] += loss_bdict[task_name].item() * len(
                    uid_bdict[task_name])

        # Calculate average loss
        for task_name in uid_dict.keys():
            loss_dict[task_name] /= len(uid_dict[task_name])

        res = {
            "uids": uid_dict,
            "golds": gold_dict,
            "probs": prob_dict,
            "losses": loss_dict,
        }

        if return_preds:
            for task_name, prob in prob_dict.items():
                pred_dict[task_name] = prob_to_pred(prob)
            res["preds"] = pred_dict

        return res
Example #10
def matthews_correlation_coefficient_scorer(
    golds: ndarray,
    probs: Optional[ndarray],
    preds: ndarray,
    uids: Optional[List[str]] = None,
) -> Dict[str, float]:
    """Matthews correlation coefficient (MCC).

    Args:
      golds: Ground truth values.
      probs: Predicted probabilities.
      preds: Predicted values.
      uids: Unique ids, defaults to None.

    Returns:
      Matthews correlation coefficient score.
    """
    # Convert probabilistic label to hard label
    if len(golds.shape) == 2:
        golds = prob_to_pred(golds)

    return {"matthews_corrcoef": matthews_corrcoef(golds, preds)}
Example #11
def recall_scorer(golds, probs, preds, uids=None, pos_label=1):
    """Recall.

    :param golds: Ground truth (correct) target values.
    :type golds: 1-d np.array
    :param probs: Predicted target probabilities. (Not used!)
    :type probs: k-d np.array
    :param preds: Predicted target values.
    :type preds: 1-d np.array
    :param uids: Unique ids.
    :type uids: list
    :param pos_label: The positive class label, defaults to 1.
    :type pos_label: int
    :return: Recall.
    :rtype: dict
    """
    if len(golds.shape) > 1:
        golds = prob_to_pred(golds)
    pred_pos = np.where(preds == pos_label, True, False)
    gt_pos = np.where(golds == pos_label, True, False)
    TP = np.sum(pred_pos * gt_pos)
    FN = np.sum(np.logical_not(pred_pos) * gt_pos)

    recall = TP / (TP + FN) if TP + FN > 0 else 0.0

    return {"recall": recall}
Example #12
    def predict(
        self,
        dataloader: EmmentalDataLoader,
        return_probs: bool = True,
        return_preds: bool = False,
        return_action_outputs: bool = False,
    ) -> Dict[str, Any]:
        """Predict from dataloader.

        Args:
          dataloader: The dataloader to predict.
          return_probs: Whether to return probabilities or not, defaults to True.
          return_preds: Whether to return predictions or not, defaults to False.
          return_action_outputs: Whether to return action_outputs or not,
            defaults to False.

        Returns:
          The result dict.
        """
        self.eval()

        uid_dict: Dict[str, List[str]] = defaultdict(list)
        prob_dict: Dict[str, Union[ndarray, List[ndarray]]] = defaultdict(list)
        pred_dict: Dict[str, Union[ndarray, List[ndarray]]] = defaultdict(list)
        gold_dict: Dict[str, List[Union[ndarray, int, float]]] = defaultdict(list)
        out_dict: Dict[str, Dict[str, List[Union[ndarray, int, float]]]] = defaultdict(
            lambda: defaultdict(list)
        )
        loss_dict: Dict[str, Union[ndarray, float]] = defaultdict(list)  # type: ignore

        if not dataloader.is_learnable:
            gold_dict = None
            loss_dict = None

        # Collect dataloader information
        task_to_label_dict = dataloader.task_to_label_dict
        uid = dataloader.uid

        with torch.no_grad():
            for batch_num, bdict in tqdm(
                enumerate(dataloader),
                total=len(dataloader),
                desc=f"Evaluating {dataloader.data_name} ({dataloader.split})",
            ):
                if isinstance(bdict, dict):
                    X_bdict = bdict
                    Y_bdict = None
                else:
                    X_bdict, Y_bdict = bdict
                    if not dataloader.is_learnable:
                        Y_bdict = None

                if return_action_outputs:
                    (
                        uid_bdict,
                        loss_bdict,
                        prob_bdict,
                        gold_bdict,
                        out_bdict,
                    ) = self.forward(  # type: ignore
                        X_bdict[uid],
                        X_bdict,
                        Y_bdict,
                        task_to_label_dict,
                        return_action_outputs=return_action_outputs,
                        return_probs=return_probs or return_preds,
                    )
                else:
                    (
                        uid_bdict,
                        loss_bdict,
                        prob_bdict,
                        gold_bdict,
                    ) = self.forward(  # type: ignore
                        X_bdict[uid],
                        X_bdict,
                        Y_bdict,
                        task_to_label_dict,
                        return_action_outputs=return_action_outputs,
                        return_probs=return_probs or return_preds,
                    )
                    out_bdict = None
                for task_name in uid_bdict.keys():
                    uid_dict[task_name].extend(uid_bdict[task_name])
                    if return_probs:
                        prob_dict[task_name].extend(  # type: ignore
                            prob_bdict[task_name]
                        )
                    if return_preds:
                        pred_dict[task_name].extend(  # type: ignore
                            prob_to_pred(prob_bdict[task_name])
                        )
                    if dataloader.is_learnable:
                        gold_dict[task_name].extend(gold_bdict[task_name])
                        if len(loss_bdict[task_name].size()) == 0:
                            if loss_dict[task_name] == []:
                                loss_dict[task_name] = 0
                            loss_dict[task_name] += loss_bdict[task_name].item() * len(
                                uid_bdict[task_name]
                            )
                        else:
                            loss_dict[task_name].extend(  # type: ignore
                                loss_bdict[task_name].cpu().numpy()
                            )
                if return_action_outputs and out_bdict:
                    for task_name in out_bdict.keys():
                        for action_name in out_bdict[task_name].keys():
                            out_dict[task_name][action_name].extend(
                                out_bdict[task_name][action_name]
                            )

        # Calculate average loss
        if dataloader.is_learnable:
            for task_name in uid_dict.keys():
                if not isinstance(loss_dict[task_name], list):
                    loss_dict[task_name] /= len(uid_dict[task_name])

        res = {
            "uids": uid_dict,
            "golds": gold_dict,
            "losses": loss_dict,
        }

        if return_probs:
            for task_name in prob_dict.keys():
                prob_dict[task_name] = array_to_numpy(prob_dict[task_name])
            res["probs"] = prob_dict

        if return_preds:
            for task_name in pred_dict.keys():
                pred_dict[task_name] = array_to_numpy(pred_dict[task_name])
            res["preds"] = pred_dict

        if return_action_outputs:
            res["outputs"] = out_dict

        return res
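Downstream, the dict returned by predict is typically handed to the per-task scorers shown earlier in this listing; a hedged, self-contained sketch of that hand-off (the task name and values are invented):

import numpy as np

res = {  # shape of the dict returned above, with made-up values
    "uids": {"my_task": ["a", "b", "c"]},
    "golds": {"my_task": np.array([1, 0, 1])},
    "probs": {"my_task": np.array([[0.2, 0.8], [0.7, 0.3], [0.4, 0.6]])},
}
preds = np.argmax(res["probs"]["my_task"], axis=1)  # mirrors prob_to_pred
print(accuracy_scorer(res["golds"]["my_task"], res["probs"]["my_task"], preds))
# {'accuracy': 1.0}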
Example #13
    def _aggregate_running_metrics(
            self,
            model: EmmentalModel,
            calc_running_scores: bool = False) -> Dict[str, float]:
        """Calculate the running overall and task specific metrics.

        Args:
          model: The model to evaluate.
          calc_running_scores: Whether to calculate running scores, defaults to False.

        Returns:
          The score dict.
        """
        metric_dict: Dict[str, float] = dict()

        total_count = 0
        # Log task specific loss
        for identifier in self.running_uids.keys():
            count = len(self.running_uids[identifier])
            if count > 0:
                metric_dict[identifier + "/loss"] = float(
                    self.running_losses[identifier] / count)
            total_count += count

        # Calculate average micro loss
        if total_count > 0:
            total_loss = sum(self.running_losses.values())
            metric_dict["model/all/train/loss"] = float(total_loss /
                                                        total_count)

        if calc_running_scores:
            micro_score_dict: Dict[str, List[float]] = defaultdict(list)
            macro_score_dict: Dict[str, List[float]] = defaultdict(list)

            # Calculate training metric
            for identifier in self.running_uids.keys():
                task_name, data_name, split = identifier.split("/")

                if (model.scorers[task_name] and self.running_golds[identifier]
                        and self.running_probs[identifier]):
                    metric_score = model.scorers[task_name].score(
                        self.running_golds[identifier],
                        self.running_probs[identifier],
                        prob_to_pred(self.running_probs[identifier]),
                        self.running_uids[identifier],
                    )
                    for metric_name, metric_value in metric_score.items():
                        metric_dict[
                            f"{identifier}/{metric_name}"] = metric_value

                    # Collect average score
                    identifier = construct_identifier(task_name, data_name,
                                                      split, "average")
                    metric_dict[identifier] = np.mean(
                        list(metric_score.values()))
                    micro_score_dict[split].extend(
                        list(metric_score.values())  # type: ignore
                    )
                    macro_score_dict[split].append(metric_dict[identifier])

            # Collect split-wise micro/macro average score
            for split in micro_score_dict.keys():
                identifier = construct_identifier("model", "all", split,
                                                  "micro_average")
                metric_dict[identifier] = np.mean(
                    micro_score_dict[split]  # type: ignore
                )
                identifier = construct_identifier("model", "all", split,
                                                  "macro_average")
                metric_dict[identifier] = np.mean(
                    macro_score_dict[split]  # type: ignore
                )

        # Log the learning rate
        metric_dict["model/all/train/lr"] = self.optimizer.param_groups[0][
            "lr"]

        return metric_dict
Example #14
def batched_pred_iter(
    model,
    dataloader,
    eval_accumulation_steps,
    sent_idx2num_mens,
):
    """Predict from dataloader taking into account eval accumulation steps.
    Will yield a new prediction set after each set of accumulation steps for
    writing out.

    If a sentence or batch doesn't have any mentions, it will not be returned by this method.

    Recall that we split up sentences that are too long to feed to the model.
    We use the sent_idx2num_mens dict to ensure full sentences have been evaluated before
    returning; otherwise we would have incomplete sentences to merge together when dumping.

    Args:
      model: model
      dataloader: The dataloader to predict
      eval_accumulation_steps: Number of eval steps to run before returning
      sent_idx2num_mens: list of sent index to number of mentions

    Returns:
      Iterator over result dict.
    """
    def collect_result(uid_d, gold_d, pred_d, prob_d, out_d,
                       cur_sentidx_nummen):
        """Merges results for the sentences where all mentions have been
        evaluated."""
        final_uid_d = defaultdict(list)
        final_prob_d = defaultdict(list)
        final_pred_d = defaultdict(list)
        final_gold_d = defaultdict(list)
        final_out_d = defaultdict(lambda: defaultdict(list))
        sentidxs_finalized = []
        # print("FINALIZE", cur_sentidx_nummen, [sent_idx2num_mens[str(k)] for k in cur_sentidx_nummen])
        for sent_idx, cur_mention_set in cur_sentidx_nummen.items():
            assert (
                len(cur_mention_set) <= sent_idx2num_mens[str(sent_idx)]
            ), f"Too many mentions for {sent_idx}: {cur_mention_set} VS {sent_idx2num_mens[str(sent_idx)]}"
            if len(cur_mention_set) == sent_idx2num_mens[str(sent_idx)]:
                sentidxs_finalized.append(sent_idx)
                for task_name in uid_d:
                    final_uid_d[task_name].extend(uid_d[task_name][sent_idx])
                    final_prob_d[task_name].extend(prob_d[task_name][sent_idx])
                    final_pred_d[task_name].extend(pred_d[task_name][sent_idx])
                    final_gold_d[task_name].extend(gold_d[task_name][sent_idx])
                    if task_name in out_d.keys():
                        for action_name in out_d[task_name].keys():
                            final_out_d[task_name][action_name].extend(
                                out_d[task_name][action_name][sent_idx])
        # If batch size is close to 1 and accumulation step was close to 1,
        # we may get to where there are no complete sentences
        if len(sentidxs_finalized) == 0:
            return {}, sentidxs_finalized
        res = {
            "uids": final_uid_d,
            "golds": final_gold_d,
        }
        for task_name in final_prob_d.keys():
            final_prob_d[task_name] = array_to_numpy(final_prob_d[task_name])
        res["probs"] = final_prob_d
        for task_name in final_pred_d.keys():
            final_pred_d[task_name] = array_to_numpy(final_pred_d[task_name])
        res["preds"] = final_pred_d
        res["outputs"] = final_out_d
        return res, sentidxs_finalized

    model.eval()

    # Will store sent_idx -> task_name -> list output
    uid_dict = defaultdict(lambda: defaultdict(list))
    prob_dict = defaultdict(lambda: defaultdict(list))
    pred_dict = defaultdict(lambda: defaultdict(list))
    gold_dict = defaultdict(lambda: defaultdict(list))
    # Will store sent_idx -> task_name -> output key -> list output
    out_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    # list of all finalized and yielded sentences
    all_finalized_sentences = []
    # Maps currently stored sent idx -> unique mentions seen (for sentences that aren't complete,
    # we'll hold them until they are)
    cur_sentidx2_nummentions = dict()
    num_eval_steps = 0

    # Collect dataloader information
    task_to_label_dict = dataloader.task_to_label_dict
    uid = dataloader.uid

    with torch.no_grad():
        for batch_num, bdict in tqdm(
                enumerate(dataloader),
                total=len(dataloader),
                desc=f"Evaluating {dataloader.data_name} ({dataloader.split})",
        ):
            num_eval_steps += 1
            X_bdict, Y_bdict = bdict
            (
                uid_bdict,
                loss_bdict,
                prob_bdict,
                gold_bdict,
                out_bdict,
            ) = model.forward(  # type: ignore
                X_bdict[uid],
                X_bdict,
                Y_bdict,
                task_to_label_dict,
                return_action_outputs=True,
                return_probs=True,
            )
            assert (
                NED_TASK in uid_bdict
            ), f"{NED_TASK} task needs to be in returned in uid to get number of mentions"
            for task_name in uid_bdict.keys():
                for ex_idx in range(len(uid_bdict[task_name])):
                    # Recall that our uid is
                    # ============================
                    # guid_dtype = np.dtype(
                    #     [
                    #         ("sent_idx", "i8", 1),
                    #         ("subsent_idx", "i8", 1),
                    #         ("alias_orig_list_pos", "i8", (data_config.max_aliases,)),
                    #     ]
                    # )
                    # ============================
                    # Index 0 -> sent_idx, Index 1 -> subsent_idx, Index 2 -> List of aliases positions
                    # (-1 means no mention in train example)
                    sent_idx = uid_bdict[task_name][ex_idx][0]
                    # Only increment for NED_TASK
                    if task_name == NED_TASK:
                        # alias_pos_for_eval gives which mentions are meant to be evaluated in this batch (-1 means
                        # skip) for scoring. This will be different than the mentions seen by the model as we window
                        # sentences and a mention may be seen multiple times but only scored once. This includes for
                        # True and False anchors - we dump all anchors for analysis
                        alias_pos_for_eval = out_bdict[task_name][
                            "_input__for_dump_gold_cand_K_idx_train"][ex_idx]
                        assert len(alias_pos_for_eval) == len(
                            uid_bdict[task_name][ex_idx][2])
                        if sent_idx not in cur_sentidx2_nummentions:
                            cur_sentidx2_nummentions[sent_idx] = set()
                        # Index 2 is index of alias positions in original list (-1 means no mention)
                        cur_sentidx2_nummentions[sent_idx].update(
                            set([
                                i for j, i in enumerate(uid_bdict[task_name]
                                                        [ex_idx][2])
                                if alias_pos_for_eval[j] != -1
                            ]))
                    uid_dict[task_name][sent_idx].extend(
                        uid_bdict[task_name][ex_idx:ex_idx + 1])
                    prob_dict[task_name][sent_idx].extend(
                        prob_bdict[task_name][ex_idx:ex_idx +
                                              1])  # type: ignore
                    pred_dict[task_name][sent_idx].extend(  # type: ignore
                        prob_to_pred(prob_bdict[task_name][ex_idx:ex_idx + 1]))
                    gold_dict[task_name][sent_idx].extend(
                        gold_bdict[task_name][ex_idx:ex_idx + 1])
                    if task_name in out_bdict.keys():
                        for action_name in out_bdict[task_name].keys():
                            out_dict[task_name][action_name][sent_idx].extend(
                                out_bdict[task_name][action_name]
                                [ex_idx:ex_idx + 1])
            if num_eval_steps >= eval_accumulation_steps:
                # Collect the sentences that have all mentions collected
                res, finalized_sent_idxs = collect_result(
                    uid_dict,
                    gold_dict,
                    pred_dict,
                    prob_dict,
                    out_dict,
                    cur_sentidx2_nummentions,
                )
                all_finalized_sentences.extend(
                    [str(s) for s in finalized_sent_idxs])
                num_eval_steps = 0
                for final_sent_i in finalized_sent_idxs:
                    assert final_sent_i in cur_sentidx2_nummentions
                    del cur_sentidx2_nummentions[final_sent_i]
                    for task_name in uid_dict.keys():
                        del uid_dict[task_name][final_sent_i]
                        del prob_dict[task_name][final_sent_i]
                        del pred_dict[task_name][final_sent_i]
                        del gold_dict[task_name][final_sent_i]
                        if task_name in out_dict.keys():
                            for action_name in out_dict[task_name].keys():
                                del out_dict[task_name][action_name][
                                    final_sent_i]
                if len(res) > 0:
                    # print("FINALIZED", finalized_sent_idxs)
                    yield res
    res, finalized_sent_idxs = collect_result(uid_dict, gold_dict, pred_dict,
                                              prob_dict, out_dict,
                                              cur_sentidx2_nummentions)
    all_finalized_sentences.extend([str(s) for s in finalized_sent_idxs])
    for final_sent_i in finalized_sent_idxs:
        del cur_sentidx2_nummentions[final_sent_i]
    if len(res) > 0:
        # print("FINALIZED", finalized_sent_idxs)
        yield res
    assert (
        len(cur_sentidx2_nummentions) == 0
    ), f"After eval, some sentences had left over mentions {cur_sentidx2_nummentions}"
    assert set(all_finalized_sentences).intersection(sent_idx2num_mens.keys(
    )) == set([k for k, v in sent_idx2num_mens.items() if v > 0]), (
        f"Some sentences are left over "
        f"{[s for s in sent_idx2num_mens if s not in set(all_finalized_sentences) and sent_idx2num_mens[s] > 0]}"
    )
    return None
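A toy sketch of the "hold a sentence until all of its mentions are seen" bookkeeping that gates collect_result above (sentence ids and mention counts are invented):

sent_idx2num_mens = {"0": 2, "1": 1}         # expected mentions per sentence
cur_sentidx2_nummentions = {0: {0}, 1: {0}}  # mention positions seen so far
finalized = [
    s for s, seen in cur_sentidx2_nummentions.items()
    if len(seen) == sent_idx2num_mens[str(s)]
]
print(finalized)  # [1] -> only sentence 1 is complete and can be yielded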
Example #15
    def predict(self,
                dataloader: EmmentalDataLoader,
                return_preds: bool = False) -> Dict[str, Any]:
        """Predict from dataloader.

        Args:
          dataloader: The dataloader to predict.
          return_preds: Whether to return predictions or not, defaults to False.

        Returns:
          The result dict.
        """
        self.eval()

        uid_dict: Dict[str, List[str]] = defaultdict(list)
        gold_dict: Dict[str, List[Union[ndarray, int,
                                        float]]] = defaultdict(list)
        prob_dict: Dict[str, List[Union[ndarray, int,
                                        float]]] = defaultdict(list)
        pred_dict: Dict[str, List[ndarray]] = defaultdict(list)
        # Fix it later
        loss_dict: Dict[str, Union[ndarray,
                                   float]] = defaultdict(list)  # type: ignore

        # Collect dataloader information
        task_to_label_dict = dataloader.task_to_label_dict
        uid = dataloader.uid

        for batch_num, (X_bdict, Y_bdict) in enumerate(dataloader):
            uid_bdict, loss_bdict, prob_bdict, gold_bdict = self.forward(
                X_bdict[uid], X_bdict, Y_bdict, task_to_label_dict)
            for task_name in uid_bdict.keys():
                uid_dict[task_name].extend(uid_bdict[task_name])
                prob_dict[task_name].extend(prob_bdict[task_name])
                gold_dict[task_name].extend(gold_bdict[task_name])
                if len(loss_bdict[task_name].size()) == 0:
                    if loss_dict[task_name] == []:
                        loss_dict[task_name] = 0
                    loss_dict[task_name] += loss_bdict[task_name].item() * len(
                        uid_bdict[task_name])
                else:
                    loss_dict[task_name].extend(  # type: ignore
                        loss_bdict[task_name].cpu().numpy())

        # Calculate average loss
        for task_name in uid_dict.keys():
            if not isinstance(loss_dict[task_name], list):
                loss_dict[task_name] /= len(uid_dict[task_name])

        res = {
            "uids": uid_dict,
            "golds": gold_dict,
            "probs": prob_dict,
            "losses": loss_dict,
        }

        if return_preds:
            for task_name, prob in prob_dict.items():
                pred_dict[task_name] = prob_to_pred(prob)
            res["preds"] = pred_dict

        return res
Example #16
    def predict(self, dataloader, return_preds=False, return_uids=False):

        self.eval()

        uid_key = dataloader.dataset.uid

        # Check uid exists
        if return_uids and uid_key is None:
            return_uids = False
            logger.info("No uid exist, skip it...")

        uid_dict = defaultdict(list)
        gold_dict = defaultdict(list)
        prob_dict = defaultdict(list)

        for batch_num, (X_batch_dict, Y_batch_dict) in enumerate(dataloader):
            prob_batch_dict = self._calculate_probs(
                X_batch_dict, dataloader.task_to_label_dict.keys()
            )
            for task_name in dataloader.task_to_label_dict.keys():
                if return_uids:
                    uid_dict[task_name].extend(X_batch_dict[uid_key])
                prob_dict[task_name].extend(prob_batch_dict[task_name])
                gold_dict[task_name].extend(
                    Y_batch_dict[dataloader.task_to_label_dict[task_name]].cpu().numpy()
                )
        for task_name in gold_dict:
            gold_dict[task_name] = np.array(gold_dict[task_name])
            prob_dict[task_name] = np.array(prob_dict[task_name])
            if len(gold_dict[task_name].shape) == 1:
                active = (
                    gold_dict[task_name]
                    != Meta.config["learner_config"]["ignore_index"]
                ).reshape(-1)
            else:
                active = (
                    np.sum(
                        gold_dict[task_name]
                        == Meta.config["learner_config"]["ignore_index"],
                        axis=1,
                    )
                    > 0
                )

            if not active.all():
                gold_dict[task_name] = gold_dict[task_name][active]
                prob_dict[task_name] = prob_dict[task_name][active]
                if return_uids:
                    uid_dict[task_name] = [
                        uid_dict[task_name][i]
                        for i, value in enumerate(active)
                        if value
                    ]

        if return_preds:
            pred_dict = defaultdict(list)
            for task_name, prob in prob_dict.items():
                pred_dict[task_name] = prob_to_pred(prob)

        res = {"golds": gold_dict, "probs": prob_dict}

        if return_preds:
            res["preds"] = pred_dict

        if return_uids:
            res["uids"] = uid_dict

        return res
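A small sketch of the ignore_index masking in the 1-D branch above (the ignore value here is an assumption; the real one comes from Meta.config["learner_config"]["ignore_index"]):

import numpy as np

ignore_index = -100  # placeholder value, assumed for illustration
golds = np.array([1, ignore_index, 0, 1])
probs = np.array([[0.2, 0.8], [0.5, 0.5], [0.9, 0.1], [0.3, 0.7]])
active = (golds != ignore_index).reshape(-1)
golds, probs = golds[active], probs[active]  # drops the ignored example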