def test_get_ind_mat_uniqueness(self):
    """
    Tests get_ind_mat_label_uniqueness using the indicator matrix from the book example.

    The indicator matrix has 6 rows and 3 label columns. For each of the first
    three entries of the returned uniqueness array, the mean of its strictly
    positive values is compared with the expected figure; the same check is
    then applied to the returned array as a whole.
    """
    tolerance = 1e-4

    # Built from integer lists so the matrix has an integer dtype from the start,
    # rather than filling an empty frame column by column.
    ind_mat = pd.DataFrame({
        0: [1, 1, 1, 0, 0, 0],
        1: [0, 0, 1, 1, 0, 0],
        2: [0, 0, 0, 0, 1, 1],
    }).values

    labels_av_uniqueness = get_ind_mat_label_uniqueness(ind_mat)

    first_sample_unq = labels_av_uniqueness[0]
    second_sample_unq = labels_av_uniqueness[1]
    third_sample_unq = labels_av_uniqueness[2]

    # assertAlmostEqual with delta passes when abs(first - second) <= delta and
    # reports both values on failure, unlike a bare assertTrue.
    self.assertAlmostEqual(
        first_sample_unq[first_sample_unq > 0].mean(), 0.8333, delta=tolerance)
    self.assertAlmostEqual(
        second_sample_unq[second_sample_unq > 0].mean(), 0.75, delta=tolerance)
    self.assertAlmostEqual(
        third_sample_unq[third_sample_unq > 0].mean(), 1.0, delta=tolerance)

    # Test matrix av.uniqueness
    self.assertAlmostEqual(
        labels_av_uniqueness[labels_av_uniqueness > 0].mean(), 0.8571, delta=tolerance)
def _get_synthetic_samples(ind_mat, good_samples_thresh, bad_samples_thresh):
    """
    Get samples with uniqueness either > good_samples_thresh or uniqueness < bad_samples_thresh

    Each item produced by iterating get_ind_mat_label_uniqueness(ind_mat) is
    treated as one label's uniqueness vector. Its strictly positive entries are
    averaged, and the label's position is kept when that average lies strictly
    above good_samples_thresh or strictly below bad_samples_thresh.

    :param ind_mat: indicator matrix passed through to get_ind_mat_label_uniqueness
    :param good_samples_thresh: lower bound (exclusive) for "highly unique" labels
    :param bad_samples_thresh: upper bound (exclusive) for "highly overlapping" labels
    :return: list of integer positions of the selected labels, in iteration order
    """
    # Get mix of samples where some of them are extremely non-overlapping,
    # the other ones are highly overlapping
    unique_samples = []
    for position, label in enumerate(get_ind_mat_label_uniqueness(ind_mat)):
        # Average only over positive entries; computed once and reused for both bounds.
        avg_uniqueness = np.mean(label[label > 0])
        if avg_uniqueness > good_samples_thresh or avg_uniqueness < bad_samples_thresh:
            unique_samples.append(position)
    return unique_samples