Example #1
def test_label_binarizer_errors():
    # Check that invalid arguments yield ValueError
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)

    multi_label = [(2, 3), (0,), (0, 2)]
    with pytest.raises(ValueError):
        lb.transform(multi_label)

    lb = LabelBinarizer()
    with pytest.raises(ValueError):
        lb.transform([])
    with pytest.raises(ValueError):
        lb.inverse_transform([])

    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=2, pos_label=2)

    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)

    # Fail on y_type
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )

    # Sequence of seq type should raise ValueError
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    with pytest.raises(ValueError):
        LabelBinarizer().fit_transform(y_seq_of_seqs)

    # Fail on the number of classes
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on the dimension of 'binary'
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on multioutput data
    with pytest.raises(ValueError):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError):
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
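
For contrast with the error cases exercised above, a minimal sketch of the normal LabelBinarizer round trip (standard public API only: fit_transform, classes_, inverse_transform):

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
coded = lb.fit_transform([1, 2, 6, 4, 2])
print(lb.classes_)                  # [1 2 4 6]
print(coded.shape)                  # (5, 4): one indicator column per class
print(lb.inverse_transform(coded))  # [1 2 6 4 2]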
Example #2
def test_label_binarize_multilabel():
    y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    expected = pos_label * y_ind
    y_sparse = [
        sparse_matrix(y_ind) for sparse_matrix in [
            coo_matrix,
            csc_matrix,
            csr_matrix,
            dok_matrix,
            lil_matrix,
        ]
    ]

    for y in [y_ind] + y_sparse:
        check_binarized_results(y, classes, pos_label, neg_label, expected)

    with pytest.raises(ValueError):
        label_binarize(y,
                       classes=classes,
                       neg_label=-1,
                       pos_label=pos_label,
                       sparse_output=True)
Example #3
def test_label_binarize_multiclass():
    y = [0, 1, 2]
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    expected = 2 * np.eye(3)

    check_binarized_results(y, classes, pos_label, neg_label, expected)

    with pytest.raises(ValueError):
        label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label,
                       sparse_output=True)
Example #4
def test_label_binarize_with_class_order():
    out = label_binarize([1, 6], classes=[1, 2, 4, 6])
    expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]])
    assert_array_equal(out, expected)

    # Modified class order
    out = label_binarize([1, 6], classes=[1, 6, 4, 2])
    expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]])
    assert_array_equal(out, expected)

    out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1])
    expected = np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]])
    assert_array_equal(out, expected)
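
A small follow-on sketch (assuming only numpy and label_binarize, as in the test above): the output columns follow the order given in classes, so for dense indicator output the multiclass inverse is essentially an argmax lookup into that same classes list.

import numpy as np
from sklearn.preprocessing import label_binarize

classes = [1, 6, 4, 2]
coded = label_binarize([1, 6, 4], classes=classes)
print(coded)                                      # columns in the order 1, 6, 4, 2
print(np.asarray(classes)[coded.argmax(axis=1)])  # [1 6 4]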
Example #5
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    for sparse_output in [True, False]:
        if (pos_label == 0 or neg_label != 0) and sparse_output:
            with pytest.raises(ValueError):
                label_binarize(
                    y,
                    classes=classes,
                    neg_label=neg_label,
                    pos_label=pos_label,
                    sparse_output=sparse_output,
                )
            continue

        # check label_binarize
        binarized = label_binarize(
            y,
            classes=classes,
            neg_label=neg_label,
            pos_label=pos_label,
            sparse_output=sparse_output,
        )
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output

        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)

        else:
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.0),
            )

        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(
            neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output
        )
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert issparse(inverse_output) == issparse(y)
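
A minimal sketch of the round trip this helper checks, using only the public LabelBinarizer API; with neg_label=-1 and pos_label=1, the inverse threshold used above, (neg_label + pos_label) / 2, is 0:

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer(neg_label=-1, pos_label=1)
coded = lb.fit_transform([0, 1, 2, 1])
print(coded)                        # entries are -1 / +1, columns follow lb.classes_
print(lb.inverse_transform(coded))  # [0 1 2 1]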
Example #6
def pr_auc(y_true, predict_probas, labels=None):
    if labels is None:
        labels = np.unique(y_true)
    y_true = label_binarize(y_true, classes=labels)[:, 0]

    precision, recall, _ = precision_recall_curve(y_true, predict_probas)
    pr_auc = sklearn.metrics.auc(recall, precision)
    return pr_auc
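
A toy call, assuming pr_auc from the snippet above is in scope; the imports it relies on are repeated here:

import numpy as np
import sklearn.metrics
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import label_binarize

y_true = [0, 1, 1, 0, 1]
probas = np.array([0.2, 0.9, 0.65, 0.35, 0.8])
print(pr_auc(y_true, probas))  # 1.0: the scores separate the two classes perfectly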
Example #7
def test_invalid_input_label_binarize():
    with pytest.raises(ValueError):
        label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)
    with pytest.raises(ValueError, match="continuous target data is not "):
        label_binarize([1.2, 2.7], classes=[0, 1])
    with pytest.raises(ValueError, match="mismatch with the labels"):
        label_binarize([[1, 3]], classes=[1, 2, 3])
Example #8
def lift(y_true, predict_probas, pct=0.05, labels=None):

    if labels is None:
        labels = np.unique(y_true)
    y_true = label_binarize(y_true, classes=labels)[:, 0]

    num_records = len(predict_probas)
    prediction = pd.DataFrame(data=predict_probas,
                              columns=['prediction_proba'])
    prediction['label'] = y_true
    top_pct = math.floor(num_records * pct)
    top = prediction.nlargest(top_pct, ['prediction_proba'])
    failures = len(y_true[y_true == 1])  # total failures; or 5% if failure rate > 5%
    num_failures_detected_in_top = len(top[top['label'] == 1])
    lift = num_failures_detected_in_top / failures
    return lift
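
A toy call, assuming lift from the snippet above is in scope together with the imports it relies on (math, numpy as np, pandas as pd, label_binarize). With pct=0.2 the top 2 of 10 records are inspected:

import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import label_binarize

probas = np.array([0.95, 0.9, 0.8, 0.6, 0.55, 0.4, 0.35, 0.3, 0.2, 0.1])
y = np.array([1, 1, 0, 1, 0, 0, 0, 1, 0, 0])
print(lift(y, probas, pct=0.2))  # the top 2 scores contain 2 of the 4 failures -> 0.5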
Example #9
def _hosmer_lemeshow(y_true, predict_probas, num_groups=10, labels=None):

    df = pd.DataFrame(data=predict_probas, columns=['prediction_proba'])

    if labels is None:
        labels = np.unique(y_true)

    y_true = label_binarize(y_true, classes=labels)[:, 0]

    df['label'] = y_true
    df['quantile_rank'] = pd.qcut(df['prediction_proba'],
                                  num_groups,
                                  labels=False,
                                  duplicates='drop')
    h = 0
    results = pd.DataFrame(columns=[
        'decile', 'lower_bound', 'upper_bound', 'num_observations',
        'num_failures', 'predicted_failures'
    ])
    for i in range(num_groups):
        pcat_predictions = df[df['quantile_rank'] == i]
        num_observations = len(pcat_predictions)
        if num_observations == 0:
            continue
        obs1 = len(pcat_predictions[pcat_predictions['label'] ==
                                    1])  # how many were in category 1
        exp1 = pcat_predictions['prediction_proba'].mean() * num_observations
        lower_bound = pcat_predictions['prediction_proba'].min()
        upper_bound = pcat_predictions['prediction_proba'].max()
        obs0 = num_observations - obs1
        exp0 = num_observations - exp1
        h += ((obs1 - exp1)**2) / exp1 + ((obs0 - exp0)**2) / exp0
        results = pd.concat(
            [
                results,
                pd.DataFrame([{
                    'decile': i + 1,
                    'lower_bound': lower_bound,
                    'upper_bound': upper_bound,
                    'num_observations': num_observations,
                    'num_failures': obs1,
                    'predicted_failures': exp1
                }]),
            ],
            ignore_index=True)  # DataFrame.append was removed in pandas 2.0

    p = chi2.sf(h, num_groups - 2)
    return h, p, results
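
A toy call, assuming _hosmer_lemeshow from the snippet above is in scope together with the imports it relies on (numpy as np, pandas as pd, label_binarize, scipy.stats.chi2). num_groups is reduced because the simulated sample is small, and the labels are drawn from the predicted probabilities, so the test should not reject calibration:

import numpy as np
import pandas as pd
from scipy.stats import chi2
from sklearn.preprocessing import label_binarize

rng = np.random.default_rng(42)
probas = rng.uniform(0.05, 0.95, size=200)
y = (rng.uniform(size=200) < probas).astype(int)  # labels consistent with the probabilities

h, p, table = _hosmer_lemeshow(y, probas, num_groups=5)
print(h, p)      # small h / large p indicates no evidence of miscalibration
print(table)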
Example #10
def calculate_metrics(metrics, y_true, y_pred, labels=None):
    output = {}

    for metric_name in metrics:
        score_function = get_standard_metric(metric_name)

        if score_function is None:
            score_function = get_custom_metric(metric_name)
        if score_function is None:
            continue  # log

        if metric_name in class_prediction_metrics:
            if labels is None:
                labels = np.unique(y_true)

            _y_true = label_binarize(y_true, classes=labels)[:, 0]
            score = score_function(_y_true, np.argmax(y_pred, axis=1))
        else:
            score = score_function(y_true, y_pred[:, 1])

        output[metric_name] = score

    return output
Example #11
def test_invalid_input_label_binarize():
    with pytest.raises(ValueError):
        label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)
Example #12
def test_label_binarizer_errors():
    # Check that invalid arguments yield ValueError
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)

    multi_label = [(2, 3), (0, ), (0, 2)]
    err_msg = "You appear to be using a legacy multi-label data representation."
    with pytest.raises(ValueError, match=err_msg):
        lb.transform(multi_label)

    lb = LabelBinarizer()
    err_msg = "This LabelBinarizer instance is not fitted yet"
    with pytest.raises(ValueError, match=err_msg):
        lb.transform([])
    with pytest.raises(ValueError, match=err_msg):
        lb.inverse_transform([])

    input_labels = [0, 1, 0, 1]
    err_msg = "neg_label=2 must be strictly less than pos_label=1."
    lb = LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)
    err_msg = "neg_label=2 must be strictly less than pos_label=2."
    lb = LabelBinarizer(neg_label=2, pos_label=2)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)
    err_msg = (
        "Sparse binarization is only supported with non zero pos_label and zero "
        "neg_label, got pos_label=2 and neg_label=1")
    lb = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)

    # Fail on y_type
    err_msg = "foo format is not supported"
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )

    # Sequence of seq type should raise ValueError
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    err_msg = "You appear to be using a legacy multi-label data representation"
    with pytest.raises(ValueError, match=err_msg):
        LabelBinarizer().fit_transform(y_seq_of_seqs)

    # Fail on the number of classes
    err_msg = "The number of class is not equal to the number of dimension of y."
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on the dimension of 'binary'
    err_msg = "output_type='binary', but y.shape"
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on multioutput data
    err_msg = "Multioutput target data is not supported with label binarization"
    with pytest.raises(ValueError, match=err_msg):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError, match=err_msg):
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
Example #13
    def visualize_testing_result(cls, truth_labels, pred_class_ids):
        '''
        Use to visualize results from using a 
        saved model on a set of test-set samples.
        
        Draws a PR curve, and adds a table with 
        the average precision (AP) of each class.
        '''
        # Find number of classes involved:
        all_class_ids = set(truth_labels)
        num_classes = len(all_class_ids)

        # Will alternately treat each class
        # prediction as a one-vs-all binary
        # classification. For each class ID (cid<n>),
        # get 0/1 guess separately for each sample:
        #
        #                 cid0      cid1
        #   pred_sample0   1          0
        #   pred_sample1   0          0
        #   pred_sample2   0          1
        #             ...
        # Same with labels:
        #                 cid0      cid1
        #   labl_sample0   1          0
        #   labl_sample1   0          0
        #   labl_sample2   0          1
        #             ...

        bin_labels = label_binarize(truth_labels,
                                    classes=list(range(num_classes)))

        # Make tensors just for manipulation
        # convenience:

        bin_labels_tn = torch.tensor(bin_labels)
        preds_tn = torch.tensor(pred_class_ids)

        precisions = dict()
        recalls = dict()
        average_precisions = dict()

        # Go through each column, i.e. the
        # 1/0 labels/preds for one class at
        # a time, and get the prec/rec numbers.
        # (sklearn's precision_recall_curve would return a
        # triplet for binary classification: precisions,
        # recalls, and the matching thresholds, with a
        # final precision=1/recall=0 point appended that
        # has no threshold of its own.)

        for i in range(num_classes):

            bin_labels_arr = bin_labels_tn[:, i].tolist()
            preds_arr = preds_tn.tolist()

            # Get precision and recall at each
            # of the default thresholds:
            precs, recs = \
                cls.compute_binary_pr_curve(bin_labels_arr,
                                            preds_arr
                                            )
            precisions[i] = precs
            recalls[i] = recs

            # Avg prec is:
            #
            #      AP = SUM_over_n((R_n - R_{n-1}) * P_n)
            #
            # I.e. the increase in recall times the current
            # precision as each pred/sample pair is
            # processed:

            average_precisions[i] = \
                average_precision_score(bin_labels_arr,
                                        preds_arr,
                                        average='macro',
                                        )

        mAP = np.mean(list(average_precisions.values()))

        return (mAP, precisions, recalls, average_precisions)
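
The per-class AP / mAP bookkeeping above can also be sketched with sklearn alone (this does not use the class's own compute_binary_pr_curve helper, which is not shown here); binarized labels and per-class scores are evaluated one column at a time:

import numpy as np
from sklearn.preprocessing import label_binarize
from sklearn.metrics import average_precision_score

truth = [0, 1, 2, 1, 0]
scores = np.array([[0.7, 0.2, 0.1],
                   [0.1, 0.8, 0.1],
                   [0.2, 0.2, 0.6],
                   [0.3, 0.5, 0.2],
                   [0.6, 0.3, 0.1]])
bin_labels = label_binarize(truth, classes=[0, 1, 2])
aps = [average_precision_score(bin_labels[:, i], scores[:, i]) for i in range(3)]
print(np.mean(aps))  # mAP over the three one-vs-rest problems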
Example #14
    def compute_multiclass_pr_curves(cls,
                                     truth_labels,
                                     raw_preds,
                                     thresholds=[0.2, 0.4, 0.6, 0.8]):
        '''
        Computes the data needed to draw
        a family of PR curves for the output
        of a multiclass classifier.
        
        Returns a dict of the constituent 
        single-class curve specs, and a
        mean average precision (mAP) score
        for all curves combined.
        
        Each result dict maps a class ID
        to all info needed for one of the
        curves:

          1:
              {'best_op_pt' : best_operating_pt,
               'precisions' : precisions,
               'recalls'    : recalls,
               'thresholds' : thresholds,
               'avg_prec'   : avg_precision
               }
          2:
              {'best_op_pt' : best_operating_pt,
               'precisions' : precisions,
               'recalls'    : recalls,
               'thresholds' : thresholds,
               'avg_prec'   : avg_precision
               }

        where best_op_pt is:

               {'threshold' : <optimal decision probability value>
                'f1'        : <f1 at the optimal threshold>
                'prec'      : <precision at the optimal threshold>
                'thresholds' : thresholds,
                'rec'       : <recall at the optimal threshold>
                }

        Each avg_prec is the average of
        precisions across the samples of one
        class (AP), i.e. there will be as many
        elements in average_precisions as there
        are classes.
        
        The Mean Average Precision (mAP) is 
        the mean of the average_precision values.
        This measure summarizes the family of PR curves.
        It is comparable to AUC ROC.
        
        The precisions and recalls returned are dicts.
        The keys are class IDs, and the values are the
        precisions/recalls for that class. They are the
        quantities from which the average_precision
        values are computed.
        
        Summary: 
            o precisions/recalls are the lowest granularity
              of information: the per class precs and recs
              at different thresholds.
              
              There are as many entries in these dicts as
              there are classes, and each prec/rec value pair
              in the precisions and recalls dicts is the
              result of one threshold.

               TODO: o finish this sentence by running and
                       seeing what's what
                     o A unit test for this method
                     o Finally: the actual drawing of the 
                         curves with pyplot
                         
            o average_precision aggregates the precisions
              of one class across multiple thresholds. There 
              will be as many entries in this dict as there 
              are classes.
              
            o mAP aggregates the average_precision values
              across all classes. This is one number.

        :param truth_labels: all truth labels shaped
            torch.Size([num-batches, batch-size])
        :type truth_labels: Tensor
        :param raw_preds: the logits for each class for
            each sample as 
            torch.Shape([num-batches, batch-size, num-classes])
        :type raw_preds: Tensor
        :return: (all_curves_info, mAP), where all_curves_info maps each
            class ID to the curve information for that class
        :rtype: ({int : CurveSpecification}, float)
        '''

        (num_batches, batch_size, num_classes) = raw_preds.shape

        num_samples = num_batches * batch_size

        # Will alternately treat each class
        # prediction as a one-vs-all binary
        # classification.
        #
        # Ex. let labels = [1,0,0,1,2]
        #      and preds = [0.3,0.6,0.1,0.7,0.9]
        #
        # Convert the labels to a one-hot vector;
        # the width of the binarized labels is
        # num_classes:
        #
        #       L A B E L S               P R E D S
        #       ------------              ----------
        #     [1,         [[0, 1, 0],       [0.3,
        #      0,          [1, 0, 0],        0.6,
        #      0,   ==>    [1, 0, 0],        0.1,
        #      1,          [0, 1, 0],        0.7,
        #      2]          [0, 0, 1]]        0.9]
        #
        # Then evaluate each label column vector
        # separately.

        bin_labels = label_binarize(truth_labels.flatten(),
                                    classes=list(range(num_classes)))

        assert (bin_labels.shape == torch.Size([num_samples, num_classes]))
        assert(raw_preds.shape == \
               torch.Size([num_batches, batch_size, num_classes])
               )

        # Want straight down: logits for each class, for
        # each sample ('lst' for 'list'):

        raw_preds_lst = raw_preds.reshape([num_samples, num_classes])

        assert (raw_preds_lst.shape == bin_labels.shape)

        # Turn logits into probs, rowise:
        preds = torch.softmax(raw_preds_lst, dim=1)

        # Place to hold the result dicts
        # from compute_binary_pr_curve()
        # for each of the classes. This
        # will be class-name : binary-result-dict

        all_curves_info = {}

        # Go through each column, class_id i.e. the
        # 1/0-vector label columns and preds
        # columns for one class at
        # a time, and get the prec/rec numbers.

        for col_idx in range(num_classes):
            bin_label_col = torch.tensor(bin_labels[:, col_idx])
            preds_col = preds[:, col_idx]

            # Get all info for this single, binary
            # classification: list of 1/0 labels, and
            # list of floats, which are the preds for
            # the current class:

            #**************
            # # Using sklearn's precision_recall_curve,
            # # which determines thresholds by its own
            # # algorithm:
            #
            # from sklearn.metrics import precision_recall_curve
            # sklearn_precs,\
            # sklearn_recs,\
            # sklearn_thresholds = \
            #     precision_recall_curve(bin_label_col, preds_col)
            #**************

            # Obtain the information needed to
            # draw one PR curve: a CurveSpecification
            # instance:
            one_class_curve = cls.compute_binary_pr_curve(
                bin_label_col,
                preds_col,
                col_idx,  # class_id
                thresholds)

            # Accumulate the curve indices
            # in a dict, keyed by class ID:
            all_curves_info[col_idx] = one_class_curve

        avg_precs = [
            binary_curve_info['avg_prec']
            for binary_curve_info in all_curves_info.values()
        ]
        mAP = np.mean(np.array(avg_precs)).tolist()

        return (all_curves_info, mAP)