Exemple #1
0
    def test_get_ind_mat_uniqueness(self):
        """
        Check get_ind_mat_label_uniqueness against the indicator matrix from the book example.

        The mean of the strictly positive entries is compared, with a 1e-4 tolerance,
        for each of the first three entries of the result and then for the whole result.
        """

        # Build the 6x3 indicator matrix one column at a time, then keep only the raw values
        indicator = pd.DataFrame(index=range(0, 6), columns=range(0, 3))
        column_flags = (
            [1, 1, 1, 0, 0, 0],
            [0, 0, 1, 1, 0, 0],
            [0, 0, 0, 0, 1, 1],
        )
        for column, flags in enumerate(column_flags):
            indicator.loc[:, column] = flags
        indicator = indicator.values

        uniqueness = get_ind_mat_label_uniqueness(indicator)
        # Pull out the three samples up front, before any assertion runs
        samples = [uniqueness[position] for position in range(3)]

        def positive_mean(values):
            # Average over the strictly positive entries only
            return values[values > 0].mean()

        expected_means = (0.8333, 0.75, 1.0)
        for sample, expected in zip(samples, expected_means):
            self.assertTrue(abs(positive_mean(sample) - expected) <= 1e-4)

        # Test matrix av.uniqueness
        self.assertTrue(abs(positive_mean(uniqueness) - 0.8571) <= 1e-4)
def _get_synthetic_samples(ind_mat, good_samples_thresh, bad_samples_thresh):
    """
    Get samples with uniqueness either > good_samples_thresh or uniqueness < bad_samples_thresh.

    The uniqueness of a label is measured as the mean of its strictly positive
    entries in the result of ``get_ind_mat_label_uniqueness(ind_mat)``.

    :param ind_mat: indicator matrix, passed unchanged to ``get_ind_mat_label_uniqueness``
    :param good_samples_thresh: labels with mean uniqueness strictly above this value are kept
    :param bad_samples_thresh: labels with mean uniqueness strictly below this value are kept
    :return: list of positions (iteration order of the uniqueness result) of the selected labels
    """
    # Get mix of samples where some of them are extremely non-overlapping, the other one are highly overlapping
    unique_samples = []
    for position, label in enumerate(get_ind_mat_label_uniqueness(ind_mat)):
        positive_values = label[label > 0]
        if positive_values.size == 0:
            # The mean of an empty selection is NaN (numpy also emits a RuntimeWarning),
            # and NaN satisfies neither threshold, so such a label is never selected.
            continue
        mean_uniqueness = np.mean(positive_values)
        if mean_uniqueness > good_samples_thresh or mean_uniqueness < bad_samples_thresh:
            unique_samples.append(position)
    return unique_samples