def confusion_matrix(
    gold,
    pred,
    null_pred=False,
    null_gold=False,
    normalize=False,
    pretty_print=True,
):
    """Build, optionally normalize, and optionally display a confusion matrix.

    Convenience wrapper that feeds all labels into a ConfusionMatrix in one
    call and returns the compiled result.

    Args:
        gold: an array-like of gold labels (ints)
        pred: an array-like of predictions (ints)
        null_pred: forwarded to ConfusionMatrix; if True, include the row
            corresponding to null predictions
        null_gold: forwarded to ConfusionMatrix; if True, include the col
            corresponding to null gold labels
        normalize: if True, divide the compiled counts by the number of gold
            labels supplied
        pretty_print: if True, display the matrix before returning

    Returns:
        The compiled matrix, divided by len(gold) when normalize is True.
    """
    matrix_builder = ConfusionMatrix(null_pred=null_pred, null_gold=null_gold)
    gold_labels = arraylike_to_numpy(gold)
    pred_labels = arraylike_to_numpy(pred)
    matrix_builder.add(gold_labels, pred_labels)

    compiled = matrix_builder.compile()
    result = compiled / len(gold_labels) if normalize else compiled

    if pretty_print:
        # Display is delegated to the builder, which does its own normalizing.
        matrix_builder.display(normalize=normalize)

    return result
def roc_auc_score(gold, probs, ignore_in_gold=None, ignore_in_pred=None):
    """Compute the ROC AUC score, given the gold labels and predicted probs.

    Args:
        gold: A 1d array-like of gold labels
        probs: A 2d array-like of predicted probabilities, one row per item.
            The number of columns is used as the number of classes k. Lists,
            np.ndarrays and torch.Tensors are all accepted.
        ignore_in_gold: A list of labels for which elements having that gold
            label will be ignored. None (the default) ignores nothing.
        ignore_in_pred: Not supported for this metric; must be None or empty.

    Returns:
        roc_auc_score: The (float) roc_auc score

    Raises:
        ValueError: If ignore_in_pred is non-empty.
    """
    # None defaults avoid sharing one mutable list object across calls.
    ignore_in_gold = [] if ignore_in_gold is None else ignore_in_gold
    ignore_in_pred = [] if ignore_in_pred is None else ignore_in_pred

    gold = arraylike_to_numpy(gold)

    # Filter out the ignore_in_gold (but not ignore_in_pred)
    # Note the current sub-functions (below) do not handle this...
    if len(ignore_in_pred) > 0:
        raise ValueError("ignore_in_pred not defined for ROC-AUC score.")

    # probs is documented as array-like, but the row/column indexing below
    # needs a real 2d array; a nested list would raise a TypeError otherwise.
    if isinstance(probs, torch.Tensor):
        probs = probs.detach().cpu().numpy()
    else:
        probs = np.asarray(probs)

    # An explicit boolean dtype keeps this a mask even when gold is empty.
    keep = np.array([x not in ignore_in_gold for x in gold], dtype=bool)
    gold = gold[keep]
    probs = probs[keep, :]

    # Convert gold to one-hot indicator format, using the k inferred from probs
    gold_s = hard_to_soft(torch.from_numpy(gold), k=probs.shape[1]).numpy()
    return skm.roc_auc_score(gold_s, probs)
def single_lf_summary(Y_p, Y=None):
    """Summarize a single LF by treating its labels as a one-column matrix.

    The predicted labels are reshaped into an [n, 1] column, wrapped in a
    scipy.sparse CSR matrix, and handed to lf_summary, which produces the
    summary (coverage, overlap, conflicts, and accuracy).

    Args:
        Y_p: a np.array or torch.Tensor of predicted labels
        Y: a np.array or torch.Tensor of true labels (if known)

    Returns:
        Whatever lf_summary returns for the one-column label matrix.
    """
    label_column = arraylike_to_numpy(Y_p).reshape(-1, 1)
    label_matrix = sparse.csr_matrix(label_column)
    return lf_summary(label_matrix, Y)
def lf_empirical_accuracies(L, Y):
    """Return the **empirical accuracy** of each LF against gold labels Y.

    Each non-zero entry of L scores +1 when it matches the gold label and -1
    when it does not; zero entries score 0 and are excluded from the
    denominator. The per-LF mean score in [-1, 1] is then rescaled to [0, 1].

    Args:
        L: an n x m scipy.sparse matrix where L_{i,j} is the label given by
            the jth LF to the ith candidate
        Y: an [n] or [n, 1] np.ndarray of gold labels (e.g. a dev set)

    Returns:
        A length-m array of accuracies, one per LF. An LF with no non-zero
        entries has a zero denominator here.
    """
    # Assume labeled set is small, work with dense matrices
    gold = arraylike_to_numpy(Y)
    dense = L.toarray()
    num_lfs = dense.shape[1]

    # Repeat the gold labels as columns so they line up with the n x m matrix.
    gold_grid = np.column_stack([gold] * num_lfs)

    labeled = dense != 0
    agreement = np.where(dense == gold_grid, 1, -1)
    signed_votes = np.where(labeled, agreement, 0)

    mean_score = signed_votes.sum(axis=0) / labeled.sum(axis=0)
    return 0.5 * (mean_score + 1)
def error_buckets(gold, pred, X=None):
    """Group items by error buckets

    Args:
        gold: an array-like of gold labels (ints)
        pred: an array-like of predictions (ints)
        X: an iterable of items
    Returns:
        buckets: A dict of items where buckets[i,j] is a list of items with
            predicted label i and true label j. If X is None, return indices
            instead.

    For a binary problem with (1=positive, 2=negative):
        buckets[1,1] = true positives
        buckets[1,2] = false positives
        buckets[2,1] = false negatives
        buckets[2,2] = true negatives
    """
    buckets = defaultdict(list)
    gold = arraylike_to_numpy(gold)
    pred = arraylike_to_numpy(pred)
    # Key order is (predicted, true), as documented above. Keying by
    # (true, predicted) would swap false positives and false negatives.
    for i, (predicted, actual) in enumerate(zip(pred, gold)):
        buckets[predicted, actual].append(X[i] if X is not None else i)
    return buckets
def _preprocess(gold, pred, ignore_in_gold, ignore_in_pred):
    """Convert gold and pred to numpy arrays and drop ignored items.

    Args:
        gold: an array-like of gold labels
        pred: an array-like of predictions
        ignore_in_gold: passed to _drop_ignored; filtering is skipped when
            both ignore arguments are falsy (e.g. empty lists)
        ignore_in_pred: passed to _drop_ignored; see ignore_in_gold

    Returns:
        A (gold, pred) tuple of converted, and possibly filtered, arrays.
    """
    gold_arr = arraylike_to_numpy(gold)
    pred_arr = arraylike_to_numpy(pred)

    # Nothing to filter: hand back the converted arrays untouched.
    if not ignore_in_gold and not ignore_in_pred:
        return gold_arr, pred_arr

    gold_arr, pred_arr = _drop_ignored(
        gold_arr, pred_arr, ignore_in_gold, ignore_in_pred
    )
    return gold_arr, pred_arr