Exemple #1
0
def com(im):
    """
    Return the center of mass of the image ``im`` as ``[y, x]``.

    ``im`` is first passed through np.nan_to_num. Coordinates are measured
    at pixel centers (index + 0.5), and the weighted coordinate sums are
    divided by the total mass with utils.np_safe_divide.

    The image is expected to be leveled (zero-centered): a pure-noise image
    should have zero mean. Squaring first, ie com(im**2), sometimes gives a
    better estimate.

    Returns:
        y, x in array form.
    """
    im = np.nan_to_num(im)
    n_rows, n_cols = im.shape[0], im.shape[1]

    # +0.5 so that the mass of a pixel sits at the center of that pixel
    row_centers = np.arange(n_rows) + 0.5
    col_centers = np.arange(n_cols) + 0.5

    # Collapse the image onto each axis, then weight by the coordinate
    mass_per_row = np.sum(im, axis=1)
    mass_per_col = np.sum(im, axis=0)
    y_moment = np.sum(row_centers * mass_per_row)
    x_moment = np.sum(col_centers * mass_per_col)

    total_mass = np.sum(im)
    return utils.np_safe_divide(np.array([y_moment, x_moment]), total_mass)
Exemple #2
0
def zest_pr_curve_no_tied_scores_mean_recall():
    """
    PR-curve fixtures with a mix of right and wrong calls and no tied scores.

    Each peptide is given a different train-recall factor so that those
    factors can be checked in the recall calculations.
    """

    # Index 0 of train_recalls is the null peptide
    sim_stub = Munch(train_recalls=np.array([-1.0, 0.1, 0.2, 0.3]))
    prep_stub = Munch(n_peps=4)

    true_pep_iz = np.array([1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3])
    pred_pep_iz = np.array([1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1])
    scores = np.array(
        [0.8, 0.9, 0.7, 0.6, 0.85, 0.53, 0.54, 0.55, 0.75, 0.4, 0.3, 0.35]
    )
    n_calls = len(pred_pep_iz)

    cb = CallBag(
        sim_result=sim_stub,
        prep_result=prep_stub,
        true_pep_iz=true_pep_iz,
        pred_pep_iz=pred_pep_iz,
        scores=scores,
    )

    # Running totals of correct calls and of all calls
    cum_sum_correct = np.array(
        [0.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 5.0, 5.0, 6.0]
    )
    cum_sum_count = np.arange(1, n_calls + 1)

    # Precision at each score threshold, with a leading 1.0 entry
    prec_at_thresh = np.append(
        [1.0],
        utils.np_safe_divide(cum_sum_correct, cum_sum_count, default=0),
    )

    # Recall at each score threshold, with a leading 0.0 entry
    recall_at_thresh = np.append([0.0], cum_sum_correct / n_calls)

    zest()
Exemple #3
0
 def pr(cm, p_i):
     """
     Return (precision, recall) for index p_i of the matrix cm.

     Both use the diagonal entry cm[p_i, p_i] as numerator. Precision
     divides it by the sum of row p_i, recall by the sum of column p_i.
     Both divisions go through np_safe_divide.
     """
     diagonal = cm[p_i, p_i]
     row_total = np.sum(cm[p_i, :])
     col_total = np.sum(cm[:, p_i])
     return np_safe_divide(diagonal, row_total), np_safe_divide(diagonal, col_total)
Exemple #4
0
    def pr_curve_either(
        self,
        level,
        true,
        pred,
        scores,
        true_in_subset_mask,
        pred_in_subset_mask,
        level_iz_subset,
        n_steps=50,
    ):
        """
        Build a precision/recall curve for a subset of calls by stepping a
        score threshold linearly from (1 - 1/n_steps) down to 0.

        Arguments:
            level: "pep" or "pro". Selects how the training recalls used to
                scale recall are gathered. Any other value (or a missing
                sim_result) leaves recall unscaled.
            true, pred, scores: Per-call arrays; all three are indexed with the
                same masks and the same sort order.
                (presumably parallel 1-D arrays -- TODO confirm against callers)
            true_in_subset_mask: Mask of calls whose true value is in the subset.
            pred_in_subset_mask: Mask of calls whose prediction is in the subset.
            level_iz_subset: Indices (peptide or protein, per level) of the
                subset; used to select the training recalls to average.
            n_steps: Number of score thresholds to evaluate.

        Returns:
            A tuple (precision, recall, score_thresholds, auc) of arrays.
            Leading rows with zero recall are dropped, so the arrays may have
            fewer than n_steps entries.
        """
        # Throughout this function, "level" (e.g. level_iz_subset) refers to "peptide or protein" (or
        # potentially, in the future, PTM or whatever else we might make a PR curve of)

        # At this point, true_ and pred_in_subset_mask are masks on the original set.
        # We now reduce to the set of interest so that we sort a smaller set

        true_or_pred_subset_mask = true_in_subset_mask | pred_in_subset_mask
        true = true[true_or_pred_subset_mask]
        pred = pred[true_or_pred_subset_mask]
        scores = scores[true_or_pred_subset_mask]
        true_in_subset_mask = true_in_subset_mask[true_or_pred_subset_mask]
        pred_in_subset_mask = pred_in_subset_mask[true_or_pred_subset_mask]

        # Now sort on a smaller set (descending by score)
        sorted_iz = np.argsort(scores)[::-1]
        true = true[sorted_iz]
        pred = pred[sorted_iz]
        scores = scores[sorted_iz]

        true_in_subset_mask = true_in_subset_mask[sorted_iz]
        pred_in_subset_mask = pred_in_subset_mask[sorted_iz]

        # How many true are in the subset? This will be
        # used as the denominator of recall.
        n_true_in_subset = true_in_subset_mask.sum()

        # WALK through scores linearly from high to low, starting
        # at (1.0 - step_size) so that the first group has contents.
        step_size = 1.0 / n_steps

        # prsa stands for "Precision Recall Score Area_under_curve"
        # One row per threshold step; the columns are indexed by the names below.
        prsa = np.zeros((n_steps, 4))
        precision_column = 0
        recall_column = 1
        score_thresh_column = 2
        auc_column = 3

        for prsa_i, score_thresh in enumerate(
                np.linspace(1 - step_size, 0, n_steps)):
            # i is the index where *ALL* scores before this point are greater
            # than or equal to the score_thresh. Note that because many calls
            # may have *tied* scores, we use np_arg_last_where to pick the
            # *last* position (ie lowest score) where the statement is true.
            i = utils.np_arg_last_where(scores >= score_thresh)
            if i is None:
                # No call reaches this threshold: an all-zero row at this threshold.
                prsa[prsa_i] = (0.0, 0.0, score_thresh, 0.0)
            else:
                correct_at_i_mask = true[0:i + 1] == pred[0:i + 1]
                pred_at_i_mask = pred_in_subset_mask[0:i + 1]

                # At i, count:
                #  * How many of the subset of interest have been predicted?
                #    This will be used as the denominator of precision.
                #  * How many correct calls of the subset have been made?
                #    This is the numerator of precision and recall.
                #  Note that for the correct, the masking doesn't matter if
                #  we choose the true_mask or pred_mask because they are the same
                #  in the case of a correct call.
                n_pred_at_i = pred_at_i_mask.sum()
                n_correct_and_in_subset_at_i = (
                    correct_at_i_mask  # & pred_at_i_mask
                ).sum()

                prsa[prsa_i] = (
                    # Precision: Fraction of those that were called apples at i that were in fact apples
                    utils.np_safe_divide(n_correct_and_in_subset_at_i,
                                         n_pred_at_i),
                    # Recall: Fraction of all apples that were called apples at i
                    utils.np_safe_divide(n_correct_and_in_subset_at_i,
                                         n_true_in_subset),
                    # Score threshold is stepping down linearly
                    score_thresh,
                    0.0,
                )
                # The Area under the curve up to this point (requires two points)
                prsa[prsa_i, auc_column] = self._auc(
                    prsa[0:prsa_i + 1, recall_column],
                    prsa[0:prsa_i + 1, precision_column],
                )

        # CORRECT for the prior-recall.
        # During simulation some rows may be all-dark.
        # Those are accounted for here by scaling down the recall by
        # the fraction of non-dark rows / all rows.
        # This is done as MEAN of all recalls over the set of interest.

        # EXTRACT training recalls from the subset of peps or pros.
        # This will leave NANs for all those that are not in the subset.
        if self._sim_result is not None and level == "pep":
            filtered_level_recalls = np.full_like(
                self._sim_result.train_pep_recalls, np.nan)
            filtered_level_recalls[
                level_iz_subset] = self._sim_result.train_pep_recalls[
                    level_iz_subset]
        elif self._sim_result is not None and level == "pro":
            pro_pep_df = self._prep_result.pros__peps()
            n_pro = pro_pep_df.pro_id.nunique()
            filtered_level_recalls = np.full((n_pro, ), np.nan)
            # Start subset proteins at 0.0 so that max() below has a non-NaN base.
            filtered_level_recalls[level_iz_subset] = 0.0
            for protein_index in level_iz_subset:
                for peptide_index in pro_pep_df.loc[pro_pep_df["pro_i"] ==
                                                    protein_index].pep_i:
                    # NOTE: we are assuming here that the recall of the best performing peptide of a given
                    #      protein is a good approximation of the protein as a whole.  Some initial literature
                    #      research indicates this is a decent starting approximation, but it may need to
                    #      be revisited in the future.
                    filtered_level_recalls[protein_index] = max(
                        filtered_level_recalls[protein_index],
                        self._sim_result.train_pep_recalls[peptide_index],
                    )
        else:
            # No sim_result (or an unrecognized level): mean recall of 1.0, ie no scaling.
            filtered_level_recalls = np.full((prsa.shape[0], ), 1.0)

        # Use nanmean to ignore all those nans (the peps not in the subset)
        # And then use np.nan_to_num in case the subset was empty, we want 0 not nan
        mean_recall = np.nan_to_num(np.nanmean(filtered_level_recalls))
        assert 0.0 <= mean_recall <= 1.0

        # SCALE-DOWN all recall
        prsa[:, recall_column] *= mean_recall

        # SKIP all initial rows where the recall is zero, these clutter up the graph
        # The return may thus have fewer than n_steps rows.
        first_non_zero_i = utils.np_arg_first_where(
            prsa[:, recall_column] > 0.0)

        # NOTE(review): if np_arg_first_where returns None when nothing matches
        # (as np_arg_last_where does above), this slice keeps every row rather
        # than none -- confirm that is the intended result for an all-zero curve.
        filtered_prsa = prsa[first_non_zero_i:]

        # Score thresholds (column 2) must be non-increasing down the rows.
        assert np.all(np.diff(filtered_prsa[:, 2]) <= 0.0)

        return (
            filtered_prsa[:, 0],  # Precision
            filtered_prsa[:, 1],  # Recall
            filtered_prsa[:, 2],  # Score thresholds
            filtered_prsa[:, 3],  # AUC
        )
Exemple #5
0
    def fdr_explore_df(self, with_flu_info=False, pred_pep_i=None):
        """
        Exploratory, for use with notebooks etc.

        If pred_pep_i is None, fdr_decoy and fdr_truth both refer to composite
        FDR based on decoy counting and known truth (for test data only).

        If pred_pep_i is not None, fdr_decoy is still based on composite (all peptide calls)
        decoy counting, as must always be the case for real data.  But fdr_truth
        will be reported for the specific peptide.  This allows comparing
        predicted real-data decoy-based FDR to known test-data per-peptide FDR.

        see additional notes in fdr_from_decoys_df()

        Arguments:
            with_flu_info: When exactly True (and truth is available), merge the
                "pep_i", "flustr" and "flu_count" columns of the sim result's
                flus() onto the rows, keyed on the true peptide.
            pred_pep_i: Optional peptide index; when given, only rows predicted
                as that peptide are kept.

        Returns:
            A DataFrame with various useful information related to the FDR for this
            CallBag.  Note that if this CallBag refers to real sigproc data, we have
            at most the decoy rate to deal with.  If instead we are working with test
            data, the "true FDR" and other statistics will be computed for comparison.
            Rows are sorted by "scores" descending with a fresh index.
        """

        df = self.fdr_from_decoys_df()
        if pred_pep_i is not None:
            # Take an explicit copy of the filtered rows. Columns are assigned
            # onto df below, and assigning onto a boolean-filtered frame is the
            # chained-assignment pattern that triggers SettingWithCopyWarning.
            df = df[df.pred_pep_iz == pred_pep_i].copy()

        # If this is test data, we can add a lot more information since we know truth.

        if "true_pep_iz" in self.df.columns:
            # add column for convenience that says if prediction was wrong
            df["pred_wrong"] = df.true_pep_iz != df.pred_pep_iz

            # compute P(wciad), the "probability that a wrong call is a decoy"
            # note for abund, this overcounts since we really should divide by n_samples, but this divides out.
            df["p_wciad"] = utils.np_safe_divide(
                np.cumsum(df.pred_is_decoy.values),
                np.cumsum(df.pred_wrong.values),
            )
            df["p_wciad_abund"] = utils.np_safe_divide(
                np.cumsum(df.pred_is_decoy.values * df.abundance),
                np.cumsum(df.pred_wrong.values * df.abundance),
            )

            # then we can compute fdr based on truth in addition to the one based on decoy assignment we already have
            # Note that fdr_truth is truth even for a specific peptide, if one has been specified.
            df["fdr_truth"] = utils.np_safe_divide(
                np.cumsum(df.pred_wrong.values),
                np.arange(1, len(df.index) + 1),
            )

            # factor abundance into fdr_truth
            # this overcounts since we really should divide by n_samples, but this divides out.
            df["fdr_t_abund"] = utils.np_safe_divide(
                np.cumsum(df.pred_wrong.values * df.abundance),
                np.cumsum(df.abundance),
            )

            # add the flu for the true peptide if desired
            if with_flu_info is True:
                flus = self._sim_result.flus()[["pep_i", "flustr", "flu_count"]]
                df = df.merge(flus, left_on="true_pep_iz", right_on="pep_i")

        # sort on scores descending
        return df.sort_values(by="scores", ascending=False).reset_index(drop=True)
Exemple #6
0
def _do_nn_and_gmm(
    unit_radrow,
    dyerow,
    dt_mat,
    dt_inv_var_mat,
    dt_weights,
    flann,
    n_neighbors,
    dt_score_mode,
    dt_filter_threshold,
    dt_score_metric,
    dt_score_bias,
    penalty_coefs,
    rare_penalty,
    radius,
):
    """
    The workhorse of the Gaussian Mixture Model called from a parallel map below.

    Arguments:
        unit_radrow: One row of the unit_X matrix, shape (n_channels, n_cycles).
        dyerow: The true dyerow used for debugging
        dt_mat: The Dyetrack object used for neighbor lookup,
            shape (n_dts, n_channels, n_cycles).
        dt_inv_var_mat: The inverse variance of each pattern (based on the VPD for each count)
        dt_weights: The weight of the pattern, shape (n_dts,)
        flann: The neighbor lookup
        The remainder are temporary parameters until I find the best solution:
        n_neighbors: Neighbor count passed to _get_neighbor_iz.
        dt_score_mode: Name of the scoring formula; see the if/elif chain below.
        dt_filter_threshold: Neighbors whose weight is below this are dropped.
        dt_score_metric: Metric name passed to scipy's cdist (cdist_* modes).
        dt_score_bias: Added to the distance in the cdist_* score denominators.
        penalty_coefs: None, or (slope, intercept) of a linear function of total
            brightness that is clamped to [0, 1] and multiplied into the scores.
        rare_penalty: None, or a coefficient; scores are multiplied by
            1 - exp(-rare_penalty * weight).
        radius: Search radius passed to _get_neighbor_iz.

    Returns:
        A tuple of four one-element arrays:
        (true_dt_i, pred_dt_i, pred_dt_score, vdist).
        When no neighbor survives the filters the last three hold zeros.
        vdist is the variance-corrected squared distance of the chosen neighbor
        for the gmm_* modes, and zero for the cdist_* modes (which do not compute it).

    Raises:
        NotImplementedError: dt_score_mode is not one of the handled names.

    Outline:
        unit_radmat_row is one row of uX. We ask the PYFLANN index to tell us the
        n nearest neighbors of all the dt_ann to this row and then for each
        of those neighbors we compute a variance-corrected distance.

    Notes:
        After some testing to compare to cdist "mahalanobis" I realized that actually
        the cdist function was not operating as expected. Specifically, it was taking the
        first element of the VI as THE ONLY covariance matrix, not one-per-dt! This means
        that all previous run results were wrong so I'm concerned how that was getting
        results that were as good as RF?
    """
    check.array_t(dt_mat, ndim=3)
    n_dts, n_channels, n_cycles = dt_mat.shape
    check.array_t(dt_inv_var_mat, shape=dt_mat.shape)
    check.array_t(unit_radrow, shape=(n_channels, n_cycles))
    check.array_t(dt_weights, shape=(n_dts, ))

    n_cols = n_channels * n_cycles
    unit_radmat_row_flat = unit_radrow.reshape((n_cols, ))

    # true_dt_i is found by using the "dyerow" which is really a cheat
    # since dyerow is only something we know in simulated data.
    # There is no guarantee that the training set has every row
    # that is represented by the test set. Thus the "true_dt_i"
    # might not be found. We find it by using the _get_neighbor_iz
    # with a tiny radius because if the test dyetrack is found
    # among the targets it should be right on top of its target.
    true_dt_i = _get_neighbor_iz(
        flann,
        dyerow.reshape((n_cols, )),
        n_neighbors=n_neighbors,
        radius=0.01,
        default=0,
    )[0]

    nn_iz = _get_neighbor_iz(flann,
                             unit_radmat_row_flat,
                             n_neighbors=n_neighbors,
                             radius=radius,
                             default=0)
    # Entries equal to the default (0) are dropped here.
    nn_iz = nn_iz[nn_iz > 0]

    # FILTER any low-weight dyetracks
    sufficient_weight_mask = dt_weights[nn_iz] >= dt_filter_threshold
    nn_iz = nn_iz[sufficient_weight_mask]
    n_neigh_found = np.sum(nn_iz > 0)

    if n_neigh_found > 0:
        assert 1 <= n_neigh_found <= n_neighbors
        neigh_dt_mat = dt_mat[nn_iz].reshape((n_neigh_found, n_cols))
        neigh_dt_inv_var = dt_inv_var_mat[nn_iz].reshape(
            (n_neigh_found, n_cols))
        neigh_weights = dt_weights[nn_iz]

        # One distance per neighbor. The gmm_* modes overwrite this with a
        # 1-D array; the cdist_* modes leave the zeros in place. It must be
        # 1-D (not shaped like neigh_dt_inv_var) so that vdist[best_arg] is a
        # scalar and the returned array has the same (1,) shape in every mode.
        vdist = np.zeros((n_neigh_found, ), dtype=neigh_dt_inv_var.dtype)

        def cd():
            # Distance from this row to every neighbor under dt_score_metric.
            return distance.cdist(
                unit_radmat_row_flat.reshape((1, n_cols)),
                neigh_dt_mat,
                metric=dt_score_metric,
            )[0]

        def penalty():
            # Multiplicative factor applied to the scores: a scalar, or one
            # value per neighbor when rare_penalty is active.
            p = 1.0
            if rare_penalty is not None:
                p *= 1.0 - np.exp(-rare_penalty * neigh_weights)

            if penalty_coefs is not None:
                # Experimental: reduce score by total est. dye count
                total_brightness = unit_radrow.sum()
                # From fitting on RF I see that p_correct is correlated to
                # total brightness. A linear function m=0.054, b=0.216
                # So I'm going to try reducing score by this factor
                correction = max(
                    0.0,
                    min(1.0, (total_brightness * penalty_coefs[0] +
                              penalty_coefs[1])),
                )
                assert 0 <= correction <= 1.0
                p *= correction

            return p

        if dt_score_mode == "gmm_normalized_wpdf":
            delta = unit_radmat_row_flat - neigh_dt_mat
            vdist = np.sum(delta * delta * neigh_dt_inv_var, axis=1)
            pdf = np.exp(-vdist)
            weighted_pdf = neigh_weights * pdf
            scores = utils.np_safe_divide(weighted_pdf, weighted_pdf.sum())

        elif dt_score_mode == "gmm_normalized_wpdf_dist_sigma":
            # Diagonal-covariance multivariate normal, evaluated for every
            # neighbor at once.
            # https://en.wikipedia.org/wiki/Multivariate_normal_distribution
            #
            # For covariance Sigma, mean mu and dimensionality k the density is:
            #     (2.0 * np.pi)**(-k / 2.0)
            #       * np.linalg.det(Sigma)**(-1.0 / 2.0)
            #       * np.exp(-1.0 / 2.0 * ((x-mu).T @ np.linalg.inv(Sigma) @ (x-mu)))
            #
            # Simplifications used here:
            #   1. We have n_rows (number of neighbors) and n_cols (number of
            #      feature dimensions).
            #   2. (x-mu) is pre-computed as "delta", an (n_rows, n_cols) matrix.
            #   3. We don't actually have Sigma; we have the "inverse variance"
            #      neigh_dt_inv_var stored in vector form, so
            #          Sigma = 1.0 / neigh_dt_inv_var     # (n_rows, n_cols)
            #   4. Sigma is diagonal, so its determinant is the product of the
            #      diagonal elements, which are stored as the rows:
            #          np.linalg.det(Sigma) == np.prod(Sigma, axis=1)
            #   5. Sigma being diagonal, its inverse is elementwise:
            #          np.linalg.inv(Sigma) == 1.0 / Sigma
            #   6. The quadratic form (x-mu).T @ inv(Sigma) @ (x-mu) therefore
            #      collapses to a per-row sum:
            #          np.sum(delta * (neigh_dt_inv_var * delta), axis=1)
            #   7. Everything is now in vector form, so all neighbors are
            #      handled simultaneously.
            #   8. The (2.0 * np.pi)**(-k / 2.0) constant is omitted because it
            #      factors out when the scores are normalized.
            delta = unit_radmat_row_flat - neigh_dt_mat
            vdist = np.sum(delta * neigh_dt_inv_var * delta, axis=1)
            sigma = 1.0 / neigh_dt_inv_var
            determinant_of_sigma = np.prod(sigma, axis=1)
            pdf = determinant_of_sigma**(-1 / 2) * np.exp(-vdist / 2)
            weighted_pdf = neigh_weights * pdf
            scores = utils.np_safe_divide(weighted_pdf, weighted_pdf.sum())

        elif dt_score_mode == "gmm_normalized_wpdf_no_inv_var":
            delta = unit_radmat_row_flat - neigh_dt_mat
            vdist = np.sum(delta * delta, axis=1)
            pdf = np.exp(-vdist)
            weighted_pdf = neigh_weights * pdf
            scores = utils.np_safe_divide(weighted_pdf, weighted_pdf.sum())

        elif dt_score_mode == "cdist_normalized":
            d = cd()
            scores = 1.0 / (dt_score_bias + d)
            scores = utils.np_safe_divide(scores, scores.sum())

        elif dt_score_mode == "cdist_weighted_sqrt":
            d = cd()
            scores = np.sqrt(neigh_weights) / (dt_score_bias + d)

        elif dt_score_mode == "cdist_weighted_log":
            d = cd()
            scores = np.log(neigh_weights) / (dt_score_bias + d)

        elif dt_score_mode == "cdist_weighted_normalized":
            d = cd()
            scores = neigh_weights / (dt_score_bias + d)
            scores = utils.np_safe_divide(scores, scores.sum())

        elif dt_score_mode == "cdist_weighted_normalized_sqrt":
            d = cd()
            scores = np.sqrt(neigh_weights) / (dt_score_bias + d)
            scores = utils.np_safe_divide(scores, scores.sum())

        elif dt_score_mode == "cdist_weighted_normalized_log":
            d = cd()
            scores = np.log(neigh_weights) / (dt_score_bias + d)
            scores = utils.np_safe_divide(scores, scores.sum())

        else:
            raise NotImplementedError()

        # PICK highest score
        scores *= penalty()
        arg_sort = np.argsort(scores)[::-1]
        best_arg = arg_sort[0].astype(int)
        pred_dt_i = int(nn_iz[best_arg])
        pred_dt_score = scores[best_arg]
        vdist = vdist[best_arg]

    else:
        # No usable neighbor: report the zero dyetrack with zero score.
        pred_dt_i = 0
        pred_dt_score = 0.0
        vdist = 0.0

    return (
        np.array([true_dt_i]),
        np.array([pred_dt_i]),
        np.array([pred_dt_score]),
        np.array([vdist]),
    )
Exemple #7
0
 def it_safe_divides_to_other_default():
     # The zero-denominator slot should come back as the supplied default.
     numerator = np.array([1, 2])
     denominator = np.array([0, 2])
     expected = [10.0, 1.0]
     val = utils.np_safe_divide(numerator, denominator, default=10.0)
     assert np.all(val == expected)
Exemple #8
0
 def it_safe_divides_to_zero():
     # With no default given, the zero-denominator slot should come back as 0.0.
     numerator = np.array([1, 2])
     denominator = np.array([0, 2])
     expected = [0.0, 1.0]
     val = utils.np_safe_divide(numerator, denominator)
     assert np.all(val == expected)
Exemple #9
0
 def snr(self, **kwargs):
     """
     Return signal_radmat(**kwargs) divided by noise_radmat(**kwargs),
     using utils.np_safe_divide for the division.
     """
     signal = self.signal_radmat(**kwargs)
     noise = self.noise_radmat(**kwargs)
     return utils.np_safe_divide(signal, noise)
Exemple #10
0
 def snr_for_field(self, field_i, **kwargs):
     """
     Return signal_radmat_for_field divided by noise_radmat_for_field for
     field_i, using utils.np_safe_divide for the division. Extra keyword
     arguments are forwarded to both radmat accessors.
     """
     signal = self.signal_radmat_for_field(field_i, **kwargs)
     noise = self.noise_radmat_for_field(field_i, **kwargs)
     return utils.np_safe_divide(signal, noise)
Exemple #11
0
def _do_nn(
        i: int,  # Row index to analyze
        nn_params: TestNNParams,
        radmat,  # Classify the [i] row of this (NOT normalized!)
        dt_mat,  # Targets
        dt_inv_var_mat,  # Inv variance of each target
        dt_weights,  # Weight of each target
        flann,  # Neighbor lookup index
        channel_i_to_gain_inv,  # Normalization term for each channel (radmat->unit_radmat)
        score_normalization,  # Normalization term to scale score by
        # dt_pep_sources_df,  # (dye_i, pep_i, n_rows). How often did pep_i express dye_i
    dye_to_best_pep_df,
        output_pred_dt_scores,
        output_pred_scores,
        output_pred_pep_iz,
        output_pred_dt_iz,
        output_true_dt_iz,
        true_dyemat=None,  # For debugging
):
    """
    Arguments:
        i: Index of the row
        radrow: One row of the radmat matrix -- not this is NOT normalized!
        dyerow: The true dyerow used for debugging
        dt_mat: The Dyetrack object used for neighbor lookup.
        dt_inv_var_mat: The inverse variance of each pattern (based on the VPD for each count)
        dt_weights: The weight of the pattern
        flann: The neighbor lookup
        The remainder are temporary parameters until I find the best solution
    """
    radrow = radmat[i]
    unit_radrow = radrow * channel_i_to_gain_inv[:, None]

    check.array_t(dt_mat, ndim=3)
    n_dts, n_channels, n_cycles = dt_mat.shape
    check.array_t(dt_inv_var_mat, shape=dt_mat.shape)
    check.array_t(radrow, shape=(n_channels, n_cycles))
    check.array_t(dt_weights, shape=(n_dts, ))

    n_cols = n_channels * n_cycles
    unit_radrow_flat = unit_radrow.reshape((n_cols, ))
    if true_dyemat is not None:
        dyerow = true_dyemat[i].reshape((n_cols, ))
        # If true_dyemat is available then we can "cheat" and find the
        # true_dt_i for debugging purposes only.
        # Note, there is no guarantee that the training set has every row
        # that is represented by the test set. Thus the "true_dt_i"
        # might not be found. We find it by using the _get_neighbor_iz
        # with a tiny radius because if the test dyetrack is found
        # among the targets it should be right on top of its target.
        output_true_dt_iz[i] = _get_neighbor_iz(
            flann,
            radrow=dyerow.astype(RadType),
            n_neighbors=nn_params.n_neighbors,
            radius=0.01,
            default=0,
        )[0]

    # # FIND the neighbors in dyetrack target space encoded in the FLANN index
    nn_iz = _get_neighbor_iz(
        flann,
        unit_radrow_flat,
        n_neighbors=nn_params.n_neighbors,
        radius=nn_params.radius,
        default=-1,
    )

    # REMOVE -1 = unfound then convert to unsigned
    nn_iz = nn_iz[nn_iz >= 0].astype(IndexType)

    # FILTER any low-weight dyetracks
    sufficient_weight_mask = dt_weights[nn_iz] >= nn_params.dt_filter_threshold
    nn_iz = nn_iz[sufficient_weight_mask]
    n_neigh_found = np.sum(nn_iz >= 0)

    composite_score = ScoreType(0.0)
    pred_pep_i = IndexType(0)
    pred_dt_i = IndexType(0)
    pred_dt_score = ScoreType(0.0)

    if n_neigh_found > 0:
        assert 1 <= n_neigh_found <= nn_params.n_neighbors
        neigh_dt_mat = dt_mat[nn_iz].reshape((n_neigh_found, n_cols))
        neigh_dt_inv_var = dt_inv_var_mat[nn_iz].reshape(
            (n_neigh_found, n_cols))
        neigh_weights = dt_weights[nn_iz]

        def cd():
            return distance.cdist(
                unit_radrow_flat.reshape((1, n_cols)),
                neigh_dt_mat,
                metric=nn_params.dt_score_metric,
            )[0]

        def penalty():
            p = 1.0
            if nn_params.rare_penalty is not None:
                p *= 1.0 - np.exp(-nn_params.rare_penalty * neigh_weights)

            if nn_params.penalty_coefs is not None:
                # Experimental: reduce score by total est. dye count
                total_brightness = unit_radrow.sum()
                # From fitting on RF I see that p_correct is correlated to
                # total brightness. A linear function m=0.054, b=0.216
                # So I'm going to try reducing score by this factor
                correction = max(
                    0.0,
                    min(
                        1.0,
                        (total_brightness * nn_params.penalty_coefs[0] +
                         nn_params.penalty_coefs[1]),
                    ),
                )
                assert 0 <= correction <= 1.0
                p *= correction

            return p

        # if nn_params.dt_score_mode == "gmm_normalized_wpdf":
        #     delta = unit_radrow_flat - neigh_dt_mat
        #     vdist = np.sum(delta * delta * neigh_dt_inv_var, axis=1)
        #     pdf = np.exp(-vdist)
        #     weighted_pdf = neigh_weights * pdf
        #     scores = utils.np_safe_divide(weighted_pdf, weighted_pdf.sum())

        if nn_params.dt_score_mode == "gmm_normalized_wpdf_dist_sigma":
            """
            https://en.wikipedia.org/wiki/Multivariate_normal_distribution
            Given:
                Sigma: covariance matrix
                mu: mean
                k: Dimensionality of the Gaussian
            The multivariate normal has the form:
                (
                    (2.0 * np.pi)**(-k / 2.0)
                ) * (
                    np.linalg.det(Sigma)**(-1.0 / 2.0)
                ) * np.exp(
                    -1.0 / 2.0 * ((x-mu).T @ np.linalg.inv(Sigma) @ (x-mu))
                )

            We can make some simplifications here:

            1.  We have n_rows (number of neighbors) and n_cols (number of feature dimensions)

            2.  We have pre-computed (x-mu) and call it "delta"
                This is a (n_rows, n_cols) matrix

            3.  We don't actually have Sigma, rather we have
                the "inverse variance" which we call: "neigh_dt_inv_var"
                and which we store in vector form!
                Therefore:
                   Sigma = 1.0 / neigh_dt_inv_var
                This is a (n_rows, n_cols) matrix

            4.  Our covariance matrix (Sigma) is diagonal and therefore
                its determinant is the product of the diagonal elements
                which, again, is stored as the rows.
                   np.linalg.det(Sigma) == np.prod(Sigma, axis=1)

            5.  Following from Sigma being diagonal, its inverse is:
                (1.0 / its elements). Therefore:
                   np.linalg.inv(Sigma) == 1.0 / Sigma

            6.  Furthermore, the term:
                   (x-mu).T @ np.linalg.inv(Sigma) @ (x-mu)
                is typically a full matrix expression.
                But Sigma is diagonal and stored as a vector, therefore:
                    delta.T @ np.sum((1.0 / Sigma) * delta, axis=1) ==
                    np.sum(delta * ((1.0 / Sigma) * delta), axis=1)

            7.  Because the whole equation converts to vector form
                all of the row operations on each neighbor can
                be done simultaneously.

            8.  The  (2.0 * np.pi)**(-k / 2.0) is omitted as it will get
                factored out when scores are computed.

            Therefore:
                n_rows = n_neigh_found = # Number of rows of neigh_dt_mat
                n_cols = # Number of columns of neigh_dt_mat
                delta = unit_radrow_flat - neigh_dt_mat  # np.array(size=(n_rows, n_cols,))
                Sigma = 1.0 / neigh_dt_inv_var  # np.array(size=(n_rows, n_cols,))

                pdf_at_x = (
                (2.0 * np.pi)**(-n_cols / 2.0)  # A constant (we can factor this out)
                ) * (
                np.prod(Sigma, axis=1)**(-1.0 / 2.0)  # np.array(size=(n_rows,))
                ) * np.exp(
                -1.0 / 2.0 * np.sum(delta * (neigh_dt_inv_var * delta), axis=1)
                )  # np.array(size=(n_rows,))
            """
            delta = unit_radrow_flat - neigh_dt_mat
            vdist = np.sum(delta * neigh_dt_inv_var * delta, axis=1)
            sigma = 1.0 / neigh_dt_inv_var
            determinant_of_sigma = np.prod(sigma, axis=1)
            pdf = determinant_of_sigma**(-1 / 2) * np.exp(-vdist / 2)
            weighted_pdf = neigh_weights * pdf
            scores = utils.np_safe_divide(weighted_pdf, weighted_pdf.sum())

        # elif nn_params.dt_score_mode == "gmm_normalized_wpdf_no_inv_var":
        #     delta = unit_radrow_flat - neigh_dt_mat
        #     vdist = np.sum(delta * delta, axis=1)
        #     pdf = np.exp(-vdist)
        #     weighted_pdf = neigh_weights * pdf
        #     scores = utils.np_safe_divide(weighted_pdf, weighted_pdf.sum())
        #
        # elif nn_params.dt_score_mode == "cdist_normalized":
        #     d = cd()
        #     scores = 1.0 / (nn_params.dt_score_bias + d)
        #     scores = utils.np_safe_divide(scores, scores.sum())
        #
        # elif nn_params.dt_score_mode == "cdist_weighted_sqrt":
        #     d = cd()
        #     scores = np.sqrt(neigh_weights) / (nn_params.dt_score_bias + d)
        #
        # elif nn_params.dt_score_mode == "cdist_weighted_log":
        #     d = cd()
        #     scores = np.log(neigh_weights) / (nn_params.dt_score_bias + d)
        #
        # elif nn_params.dt_score_mode == "cdist_weighted_normalized":
        #     d = cd()
        #     scores = neigh_weights / (nn_params.dt_score_bias + d)
        #     scores = utils.np_safe_divide(scores, scores.sum())
        #
        # elif nn_params.dt_score_mode == "cdist_weighted_normalized_sqrt":
        #     d = cd()
        #     scores = np.sqrt(neigh_weights) / (nn_params.dt_score_bias + d)
        #     scores = utils.np_safe_divide(scores, scores.sum())
        #
        # elif nn_params.dt_score_mode == "cdist_weighted_normalized_log":
        #     d = cd()
        #     scores = np.log(neigh_weights) / (nn_params.dt_score_bias + d)
        #     scores = utils.np_safe_divide(scores, scores.sum())

        else:
            raise NotImplementedError()

        # PICK highest score
        scores *= penalty()
        scores = scores.astype(ScoreType)
        arg_sort = np.argsort(scores)[::-1]
        best_arg = arg_sort[0].astype(int)
        pred_dt_i = nn_iz[best_arg]
        pred_dt_score = scores[best_arg]

        assert type(pred_dt_i) is IndexType

        assert 0 <= pred_dt_score <= 1.0

        # At this point we have a dyetrack prediction and
        # we do a simple-minded call of the most likely
        # peptide for that dyetrack and the composite

        # But I have a problem -- before I was normalizing the dt_scores
        # before I multiplied by the pep prob.
        # But I think that's still okay.

        # USE the dt prediction to find a peptide prediction
        # df = dt_pep_sources_df
        # dt_pep_mat = df[df.dye_i == pred_dt_i].values
        # # dt_pep_mat has three columns:
        # #     0:dye_i (a constance == pred_dt_i because of previ lin filter)
        # #     1:pep_i (the peptide this dyetrack camer from)
        # #     2:n_rows (number of times that pep_i generated this dyetrack)
        #
        # sorted_pep_mat_args = np.argsort(dt_pep_mat[:, 2])[::-1]
        # top_pep_row_i = sorted_pep_mat_args[0]
        # pred_pep_i = dt_pep_mat[top_pep_row_i, 1]
        # pred_pep_score = dt_pep_mat[top_pep_row_i, 2] / dt_pep_mat[:, 2].sum()

        best_pep_row = dye_to_best_pep_df.loc[pred_dt_i]
        pred_pep_i = best_pep_row.pep_i
        composite_score = min(1.0, (pred_dt_score * best_pep_row.score) /
                              score_normalization)
        assert 0 <= composite_score <= 1.0

    output_pred_dt_scores[i] = ScoreType(pred_dt_score)
    output_pred_scores[i] = ScoreType(composite_score)
    output_pred_pep_iz[i] = pred_pep_i.astype(IndexType)
    output_pred_dt_iz[i] = pred_dt_i.astype(IndexType)
Exemple #12
0
    def false_calls(self, elem_i, n_false):
        """
        Report the top-ranked off-diagonal entries in the row and in the
        column of elem_i.

        For a nice viz of the confusion matrix, see here:
        https://stackoverflow.com/a/50671617
        (Except that viz is transposed compared to our mats.)

        With respect to any element "A" there are two kinds of
        off-diagonal failures:
            * FALSE-POSITIVES ("im-POS-ters"): Elements that are called A
              but are not A. These are read from row elem_i.
            * FALSE-NEGATIVES ("thieves"): Elements that are not A but that
              steal calls from true A's. These are read from column elem_i.

        The relationship is symmetric: if B is an imposter of A then A is
        a thief of B. True negatives wrt "A" are all of the elements
        outside the row and col of A.

        Arguments:
            elem_i: Index of the element of interest (0 <= elem_i < n_dim).
            n_false: How many of the top-ranked positions to examine in
                each of the row and the column.

        Returns:
            None when n_false >= n_dim. Otherwise a list of tuples
            (label, index, fraction), false-positives first and then
            false-negatives, where:
                label: "FP{rank}" or "FN{rank}"
                index: position of the other element
                fraction: the off-diagonal value divided safely
                    (default 0.0) by the sum of the whole row (FP) or
                    the whole column (FN), diagonal included.
            A ranked position is skipped unless its index is > 0, so
            some ranks may be absent from the result.
        """

        n_dim = self.shape[0]

        # Only square matrices are supported
        assert self.shape[1] == n_dim

        if n_false >= n_dim:
            return None

        check.affirm(0 <= elem_i < n_dim, "elem_i out of range")

        # The denominators include the diagonal so they are taken from
        # self, BEFORE the diagonal is removed.
        row_total = self[elem_i, :].sum()
        col_total = self[:, elem_i].sum()

        # RANK on a copy with the diagonal zeroed to avoid self-collision
        off_diag = np.copy(self)
        np.fill_diagonal(off_diag, 0)

        def _top_falses(tag, vals, total):
            # Walk the n_false largest entries of vals from high to low
            ranked_iz = np.argsort(vals)[::-1]
            found = []
            for rank in range(n_false):
                other_i = ranked_iz[rank]
                if other_i > 0:
                    frac = utils.np_safe_divide(vals[other_i], total, default=0.0)
                    found.append((f"{tag}{rank}", other_i, float(frac)))
            return found

        imposters = _top_falses("FP", off_diag[elem_i, :], row_total)
        thieves = _top_falses("FN", off_diag[:, elem_i], col_total)
        return imposters + thieves
Exemple #13
0
    def hist(
        self, data=None, **kws,
    ):
        """
        Draw a histogram of data as filled bars or as a step line.

        If data is a str it is used as a key into the "source" keyword
        argument, which must then be given and is removed from kws.
        Otherwise data is copied with nan converted to zeros.

        Options are read from _u_stack() updated with kws:
            _bins: Passed as "bins" to np.histogram(). A tuple or list is
                converted to a linspace from _bins[0] to _bins[1] with
                _bins[2] points if it has 3 elements, otherwise 50.
                Default=(min, max, 50) of data, or (0, 0, 50) when
                data.shape[0] is 0.
            _density: Passed as "density" to np.histogram(). Default=False
            _normalizer: Value to divide through by safely. Default=1.0
            _step: If True draws step lines instead of filled bars
            _subsample: If not None, data is replaced by
                subsample(data, _subsample)
        """

        if not isinstance(data, str):
            data = np.nan_to_num(data, copy=True)
        else:
            assert kws.get("source") is not None
            data = kws.pop("source")[data]

        options = self._u_stack()
        options.update(kws)
        _bins = options.get("_bins")
        _density = options.get("_density", False)
        _normalizer = options.get("_normalizer", 1.0)
        _step = options.get("_step", False)
        _subsample = options.get("_subsample")

        if _subsample is not None:
            data = subsample(data, _subsample)

        # RESOLVE the bins into something np.histogram accepts
        if _bins is None:
            if data.shape[0] > 0:
                lo, hi = np.min(data), np.max(data)
            else:
                lo, hi = 0, 0
            _bins = np.linspace(lo, hi, 50)
        elif isinstance(_bins, (tuple, list)):
            n_points = _bins[2] if len(_bins) == 3 else 50
            _bins = np.linspace(_bins[0], _bins[1], n_points)

        counts, edges = np.histogram(data, bins=_bins, density=_density)
        heights = utils.np_safe_divide(
            counts.astype(float), np.array(_normalizer), default=0.0
        )

        columns = dict(
            left=edges[:-1],
            right=edges[1:],
            top=heights,
            bottom=np.zeros_like(heights),
            _label="",
        )
        fig = self._begin(kws, columns)

        if not _step:
            fig.quad(
                top="top",
                bottom="bottom",
                left="left",
                right="right",
                line_color=None,
                **self._p_stack(),
            )
        else:
            # Note, hovers do not work for steps as of Bokeh 1.4.0
            # https://github.com/bokeh/bokeh/issues/7419
            fig.step(x="left", y="top", mode="center", **self._p_stack())

        self._end()
Exemple #14
0
    def pr_curve_new(self, pep_iz_subset=None, n_steps=50):
        """
        Compute a precision-recall curve sampled at n_steps score thresholds.

        See: https://docs.google.com/document/d/1MW92KNTaNtuL1bR_p0U1FwfjaiomHD3fRldiSVF74pY/edit#bookmark=id.4nqatzscuyw7

        Unlike sklearn's implementation, this one samples scores
        uniformly to prevent returning gigantic arrays.

        Arguments:
            pep_iz_subset: The peptide indices of interest. If None, the
                unique values found in true_pep_iz (minus its first row)
                and pred_pep_iz are used.
            n_steps: Number of score thresholds, stepping linearly down
                from (1 - 1/n_steps) to 0.

        Returns:
            A tuple of arrays; each row of the arrays is one score
            threshold, in non-increasing threshold order. The arrays are:
                * precision, recall, score_thresh, area_under_curve
            Initial rows where the recall is zero are skipped so there
            may be fewer than n_steps rows.
        """

        # FETCH the calls: true, pred, score. These cover the WHOLE SET
        # of all peptides and are not yet sorted.
        true = self.df["true_pep_iz"].values
        pred = self.df["pred_pep_iz"].values
        scores = self.df["scores"].values

        # If a subset is not requested then assume ALL are wanted
        if pep_iz_subset is None:
            # NOTE(review): [1:] drops the first ROW of true_pep_iz. The
            # stated intent was to not include the null peptide class
            # from true -- confirm that this slice does that.
            all_called = np.concatenate(
                (self.df.true_pep_iz[1:], self.df.pred_pep_iz)
            )
            pep_iz_subset = np.unique(all_called)

        # MASK calls in the subset; these masks index the whole set
        true_in_subset_mask = np.isin(true, pep_iz_subset)
        pred_in_subset_mask = np.isin(pred, pep_iz_subset)

        # REDUCE to the calls of interest so that we sort a smaller set
        keep_mask = true_in_subset_mask | pred_in_subset_mask
        true, pred, scores, true_in_subset_mask, pred_in_subset_mask = [
            arr[keep_mask]
            for arr in (
                true, pred, scores, true_in_subset_mask, pred_in_subset_mask
            )
        ]

        # SORT the reduced set from the highest score to the lowest
        descending_iz = np.argsort(scores)[::-1]
        true, pred, scores, true_in_subset_mask, pred_in_subset_mask = [
            arr[descending_iz]
            for arr in (
                true, pred, scores, true_in_subset_mask, pred_in_subset_mask
            )
        ]

        # The count of true calls in the subset is the denominator of recall
        n_true_in_subset = true_in_subset_mask.sum()

        # prsa stands for "Precision Recall Score Area_under_curve"
        precision_column, recall_column, score_thresh_column, auc_column = (
            0, 1, 2, 3
        )
        prsa = np.zeros((n_steps, 4))

        # WALK through scores linearly from high to low, starting
        # at (1.0 - step_size) so that the first group has contents.
        step_size = 1.0 / n_steps
        thresholds = np.linspace(1 - step_size, 0, n_steps)

        for row_i, score_thresh in enumerate(thresholds):
            # last_i is the index where *ALL* scores up to and including
            # it are >= score_thresh. Because many calls may have *tied*
            # scores, np_arg_last_where is used to pick the *last*
            # position (ie lowest score) where the statement is true.
            last_i = utils.np_arg_last_where(scores >= score_thresh)
            if last_i is None:
                prsa[row_i] = (0.0, 0.0, score_thresh, 0.0)
                continue

            upto = slice(0, last_i + 1)

            # How many of the subset of interest have been predicted so
            # far? This is the denominator of precision.
            n_pred_at_i = pred_in_subset_mask[upto].sum()

            # How many correct calls have been made so far? This is the
            # numerator of both precision and recall. No subset masking
            # is applied: for a correct call true == pred so the true
            # mask and the pred mask agree.
            n_correct_at_i = (true[upto] == pred[upto]).sum()

            # Precision: Fraction of those that were called apples at i
            # that were in fact apples
            precision = utils.np_safe_divide(n_correct_at_i, n_pred_at_i)

            # Recall: Fraction of all apples that were called apples at i
            recall = utils.np_safe_divide(n_correct_at_i, n_true_in_subset)

            prsa[row_i] = (precision, recall, score_thresh, 0.0)

            # The area under the curve over all rows up to this one
            prsa[row_i, auc_column] = self._auc(
                prsa[0:row_i + 1, recall_column],
                prsa[0:row_i + 1, precision_column],
            )

        # CORRECT for the prior-recall.
        # During simulation some rows may be all-dark.
        # Those are accounted for here by scaling down the recall by
        # the MEAN of the training recalls over the set of interest.
        sim_result = self._sim_result
        if sim_result is None:
            filtered_pep_recalls = np.full((prsa.shape[0], ), 1.0)
        else:
            # Peps that are not in the subset are left as NAN
            filtered_pep_recalls = np.full_like(
                sim_result.train_recalls, np.nan
            )
            filtered_pep_recalls[pep_iz_subset] = (
                sim_result.train_recalls[pep_iz_subset]
            )

        # nanmean ignores the peps not in the subset; nan_to_num turns
        # the nan of an empty subset into 0.
        mean_recall = np.nan_to_num(np.nanmean(filtered_pep_recalls))
        assert 0.0 <= mean_recall <= 1.0

        # SCALE-DOWN all recall
        prsa[:, recall_column] *= mean_recall

        # SKIP all initial rows where the recall is zero, these clutter
        # up the graph. The return may thus have fewer than n_steps rows.
        first_non_zero_i = utils.np_arg_first_where(
            prsa[:, recall_column] > 0.0
        )
        kept = prsa[first_non_zero_i:]

        # Thresholds must never increase from one row to the next
        assert np.all(np.diff(kept[:, score_thresh_column]) <= 0.0)

        return (
            kept[:, precision_column],
            kept[:, recall_column],
            kept[:, score_thresh_column],
            kept[:, auc_column],
        )
Exemple #15
0
 def snr(self, field):
     """
     Signal-to-noise for a field: the signal radmat divided safely by
     the noise radmat, passed through np.nan_to_num so that no nan
     remains in the result.
     """
     signal = self.signal_radmat_for_field(field)
     noise = self.noise_radmat_for_field(field)
     ratio = utils.np_safe_divide(signal, noise)
     return np.nan_to_num(ratio)