def test_tsne_plot_abclabs(self):
    """Draw a t-SNE plot with letter labels coloured by feature column 5.

    Builds an 8 x 10 matrix of seeded random values, sorts its rows by
    column 5 and passes that column as the gradient to ``tsne_plot``.

    Returns:
        Whatever ``tsne_plot`` returns (presumably a figure consumed by an
        image-comparison harness -- TODO confirm against the test runner).
    """
    sids = list(range(8))
    fids = [str(i) for i in range(10)]
    labs = list(range(8))
    # Seed once; a second identical seed call would be a no-op.
    np.random.seed(123)
    x = np.random.ranf(80).reshape(8, -1)
    # Order the samples by the value of feature column 5.
    x_sorted = x[np.argsort(x[:, 5])]
    g = x_sorted[:, 5]
    slab_csamples = eda.SingleLabelClassifiedSamples(
        x_sorted, labs, sids=sids, fids=fids)
    return slab_csamples.tsne_plot(g, labels=list('abcdefgh'),
                                   figsize=(10, 10), s=50)
def test_swarm_a(self):
    """Swarm-plot feature 'a' for a subset of labels with a shifted value.

    Returns:
        Whatever ``feature_swarm_plot`` returns.
    """
    # The data matrix is:
    # array([[0, 1],
    #        [2, 3],
    #        [4, 5],
    #        [6, 7],
    #        [8, 9]])
    data = np.arange(10).reshape(5, 2)
    sample_labs = [0, 0, 1, 2, 3]
    sample_ids = ['1', '2', '3', '4', '5']
    feature_ids = ['a', 'z']
    tslcs = eda.SingleLabelClassifiedSamples(
        data, sample_labs, sample_ids, feature_ids)
    plot_kwargs = {
        'transform': lambda x: x + 200,
        'selected_labels': [0, 2, 3],
        'title': 'test',
        'xlab': 'x',
        'ylab': 'y',
    }
    return tslcs.feature_swarm_plot('a', **plot_kwargs)
def test_lab_sorted_sids(self):
    """Check ``lab_sorted_sids`` with an explicit reference order and without."""
    qsids = [0, 1, 5, 3, 2, 4]
    qlabs = [0, 0, 2, 1, 1, 1]
    rsids = [3, 4, 2, 5, 1, 0]
    x = np.random.ranf(60).reshape(6, -1)
    slab_csamples = eda.SingleLabelClassifiedSamples(x, qlabs, qsids)

    # With a reference sid order supplied.
    sorted_sids, sorted_labs = slab_csamples.lab_sorted_sids(rsids)
    np.testing.assert_equal(sorted_sids, np.array([3, 4, 2, 5, 1, 0]))
    np.testing.assert_equal(sorted_labs, np.array([1, 1, 1, 2, 0, 0]))

    # With the default arguments.
    sorted_sids, sorted_labs = slab_csamples.lab_sorted_sids()
    np.testing.assert_equal(sorted_sids, np.array([0, 1, 3, 2, 4, 5]))
    np.testing.assert_equal(sorted_labs, np.array([0, 0, 1, 1, 1, 2]))
def test_feature_importance_across_labs(self):
    """Exercise ``feature_importance_across_labs`` in several configurations.

    Covers the binary case, the three-class case, user-supplied xgboost
    parameters, feature shuffling, and bootstrapping. The data are three
    Gaussian clusters (500, 200 and 300 samples, 8 features) around fixed
    centers, seeded for reproducibility.
    """
    # Generate a simple dataset with gaussian noise.
    x_centers = np.array([[0, 0, 1, 1, 5, 50, 10, 37],
                          [0, 0, 1.5, 5, 5, 50, 10, 35],
                          [0, 0, 10, 10, 5, 50, 10, 33]])
    np.random.seed(1920)
    c1x = np.array(x_centers[0]) + np.random.normal(size=(500, 8))
    c2x = np.array(x_centers[1]) + np.random.normal(size=(200, 8))
    c3x = np.array(x_centers[2]) + np.random.normal(size=(300, 8))
    x = np.vstack((c1x, c2x, c3x))
    labs = [0] * 500 + [1] * 200 + [2] * 300
    slcs = eda.SingleLabelClassifiedSamples(x, labs=labs)

    # Two labels selected (binary case).
    f_importance_list, _ = slcs.feature_importance_across_labs(
        [0, 1], silent=0)
    assert f_importance_list[0][0] == 3

    # Three labels selected (multi-class case).
    f_importance_list2, _ = slcs.feature_importance_across_labs(
        [0, 1, 2], random_state=123, silent=1)
    assert f_importance_list2[0][0] == 3
    assert f_importance_list2 != f_importance_list

    # Multi-class with explicitly provided parameters; the result is
    # expected to match the previous multi-class run.
    xgb_params = {
        'eta': 0.3,
        'max_depth': 6,
        'silent': 0,
        'nthread': 1,
        'alpha': 1,
        'lambda': 0,
        'seed': 0,
        'objective': 'multi:softmax',
        'eval_metric': 'merror',
        'num_class': 3,
    }
    f_importance_list3, _ = slcs.feature_importance_across_labs(
        [0, 1, 2], random_state=123, xgb_params=xgb_params)
    assert f_importance_list3 == f_importance_list2

    # Shuffle features. The assertion previously re-checked
    # f_importance_list2, leaving the shuffled result unverified.
    f_importance_list4, _ = slcs.feature_importance_across_labs(
        [0, 1], random_state=123, shuffle_features=True)
    assert f_importance_list4[0][0] == 3

    # Bootstrapping: identical arguments must give identical results.
    f_importance_list5, _ = slcs.feature_importance_across_labs(
        [0, 1], random_state=123, shuffle_features=True,
        num_bootstrap_round=10)
    f_importance_list6, _ = slcs.feature_importance_across_labs(
        [0, 1], random_state=123, shuffle_features=True,
        num_bootstrap_round=10)
    assert f_importance_list5 == f_importance_list6
    assert f_importance_list5[0][0] == 3
def test_feature_importance_across_labs_bootstrap_resample(self):
    """Smoke-test bootstrapping when one selected label has a single sample.

    Cluster sizes are 500, 1 and 30; the test only checks that the call
    completes and returns a two-element result.
    """
    centers = np.array([[0, 0, 1, 1, 5, 50, 10, 37],
                        [0, 0, 1.5, 5, 5, 50, 10, 35],
                        [0, 0, 10, 10, 5, 50, 10, 33]])
    cluster_sizes = (500, 1, 30)
    np.random.seed(1920)
    # Draw noise cluster by cluster, in order, so the random stream is
    # consumed in the same sequence for every run.
    clusters = []
    for center, size in zip(centers, cluster_sizes):
        clusters.append(np.array(center) + np.random.normal(size=(size, 8)))
    x = np.vstack(tuple(clusters))
    labs = [0] * 500 + [1] * 1 + [2] * 30
    slcs = eda.SingleLabelClassifiedSamples(x, labs=labs)
    # Bootstrapping.
    f_importance_list, bst = slcs.feature_importance_across_labs(
        [0, 1], random_state=123, shuffle_features=True,
        num_bootstrap_round=10)
def test_lab_x_empty(self):
    """Selecting an empty label list yields an empty sample set.

    The feature axis is expected to be kept (10 features) while every
    sample-indexed attribute becomes empty.
    """
    sids = list('abcdef')
    fids = list(range(10, 20))
    labs = [0, 0, 0, 1, 2, 2]
    slcs = eda.SingleLabelClassifiedSamples(
        np.random.ranf(60).reshape(6, -1), labs=labs, sids=sids, fids=fids)
    # Select with no labels at all.
    empty_s = slcs.lab_x([])
    assert empty_s._x.shape == (0, 10)
    assert empty_s._d.shape == (0, 0)
    assert empty_s._sids.shape == (0,)
    assert empty_s._labs.shape == (0,)
    assert empty_s._fids.shape == (10,)
def test_feature_importance_distintuishing_labs(self):
    """Check the top feature reported for labels 0 and 1.

    Uses three seeded Gaussian clusters (500, 200 and 300 samples,
    8 features) and expects feature index 2 to be ranked first.
    """
    # Simple dataset: fixed centers plus gaussian noise.
    centers = np.array([[0, 0, 1, 1, 5, 50, 10, 37],
                        [0, 0, 1.5, 5, 5, 50, 10, 35],
                        [0, 0, 10, 10, 5, 50, 10, 33]])
    cluster_sizes = (500, 200, 300)
    np.random.seed(1920)
    # Noise is drawn in cluster order to keep the random stream stable.
    clusters = []
    for center, size in zip(centers, cluster_sizes):
        clusters.append(np.array(center) + np.random.normal(size=(size, 8)))
    x = np.vstack(tuple(clusters))
    labs = [0] * 500 + [1] * 200 + [2] * 300
    slcs = eda.SingleLabelClassifiedSamples(x, labs=labs)
    # Two labels selected.
    f_importance_list, bst = slcs.feature_importance_distintuishing_labs(
        [0, 1], silent=0)
    top_feature = f_importance_list[0][0]
    assert top_feature == 2
def test_getters(self):
    """Public getters return the stored values but not the stored objects."""
    data = np.arange(10).reshape(5, 2)
    sample_labs = [0, 0, 1, 2, 3]
    sample_ids = ['a', 'b', 'c', '1', '2']
    feature_ids = ['a', 'z']
    tslcs = eda.SingleLabelClassifiedSamples(
        data, sample_labs, sample_ids, feature_ids)

    # Values exposed by the getters.
    expected_x = np.array(np.arange(10).reshape(5, 2), dtype='float64')
    np.testing.assert_equal(tslcs.x, expected_x)
    np.testing.assert_equal(
        tslcs.sids, np.array(['a', 'b', 'c', '1', '2']))
    np.testing.assert_equal(tslcs.fids, np.array(['a', 'z']))
    np.testing.assert_equal(tslcs.labs, np.array([0, 0, 1, 2, 3]))

    # Getters must not hand out the private attributes themselves.
    assert tslcs.x is not tslcs._x
    assert tslcs.sids is not tslcs._sids
    assert tslcs.fids is not tslcs._fids
    assert tslcs.labs is not tslcs._labs
def test_feature_importance_each_lab(self):
    """Check the last-ranked entry of the per-label importance results.

    Uses three seeded Gaussian clusters (500, 200 and 300 samples,
    8 features) and calls ``feature_importance_each_lab`` with defaults.
    """
    # Simple dataset: fixed centers plus gaussian noise.
    centers = np.array([[0, 0, 1, 1, 5, 50, 10, 37],
                        [0, 0, 1.5, 5, 5, 50, 10, 35],
                        [0, 0, 10, 10, 5, 50, 10, 33]])
    cluster_sizes = (500, 200, 300)
    np.random.seed(1920)
    # Noise is drawn in cluster order to keep the random stream stable.
    clusters = []
    for center, size in zip(centers, cluster_sizes):
        clusters.append(np.array(center) + np.random.normal(size=(size, 8)))
    x = np.vstack(tuple(clusters))
    labs = [0] * 500 + [1] * 200 + [2] * 300
    slcs = eda.SingleLabelClassifiedSamples(x, labs=labs)
    # Importance computed for each label with default arguments.
    ulab_fi_lut = slcs.feature_importance_each_lab()
    assert ulab_fi_lut[0][-1][0] == 3
    # Printed so the full result is visible if the next assertion fails.
    print(ulab_fi_lut)
    assert ulab_fi_lut[1][-1][0] == 2
def test_merge_labels(self):
    """Merging labels 1, 2 and 3 into 5 updates labels and lookups in place."""
    sids = list('abcdef')
    fids = list(range(10, 20))
    labs = [0, 0, 1, 1, 2, 3]
    x = np.random.ranf(60).reshape(6, -1)
    slcs = eda.SingleLabelClassifiedSamples(
        x, labs=labs, sids=sids, fids=fids)

    slcs.merge_labels([1, 2, 3], 5)
    merged_labs = [0, 0, 5, 5, 5, 5]

    # Labels change; sample and feature ids stay as they were.
    assert slcs.labs == merged_labs
    assert slcs.sids == sids
    assert slcs.fids == fids
    # Lookups in both directions reflect the merged label.
    assert slcs.labs_to_sids([5]) == (('c', 'd', 'e', 'f'),)
    assert slcs.sids_to_labs(sids).tolist() == merged_labs
    # Unique-label bookkeeping is refreshed as well.
    assert slcs._uniq_labs.tolist() == [0, 5]
    assert slcs._uniq_lab_cnts.tolist() == [2, 4]
def test_filter_min_class_n(self):
    """Filtering with a minimum class size of 2 drops the singleton label."""
    sids = [0, 1, 2, 3, 4, 5]
    labs = [0, 0, 0, 1, 2, 2]
    x = np.random.ranf(60).reshape(6, -1)
    slab_csamples = eda.SingleLabelClassifiedSamples(x, labs, sids, None)
    min_cl_n = 2
    filtered = slab_csamples.filter_min_class_n(min_cl_n)

    # Sample 3 carries the only occurrence of label 1 and is removed.
    kept_inds = np.array([0, 1, 2, 4, 5])
    np.testing.assert_equal(filtered.sids, np.array([0, 1, 2, 4, 5]))
    np.testing.assert_equal(filtered.labs, np.array([0, 0, 0, 2, 2]))
    np.testing.assert_equal(filtered._x.shape, (5, 10))
    np.testing.assert_equal(filtered.fids, slab_csamples.fids)
    np.testing.assert_equal(filtered._x, slab_csamples._x[kept_inds])
    # The distance matrix is subset on both axes.
    np.testing.assert_equal(filtered._d,
                            slab_csamples._d[kept_inds][:, kept_inds])
def test_relabel(self):
    """``relabel`` returns a new object with new labels and copied data."""
    sids = list('abcdef')
    fids = list(range(10, 20))
    labs = [0, 0, 0, 1, 2, 2]
    x = np.random.ranf(60).reshape(6, -1)
    slcs = eda.SingleLabelClassifiedSamples(
        x, labs=labs, sids=sids, fids=fids)
    new_labs = ['a', 'b', 'c', 'd', 'e', 'f']
    slcs_rl = slcs.relabel(new_labs)
    assert slcs_rl.labs == new_labs

    # Underlying arrays must be distinct objects ...
    assert slcs_rl._x is not slcs._x
    assert slcs_rl._d is not slcs._d
    assert slcs_rl._sids is not slcs._sids
    assert slcs_rl._fids is not slcs._fids
    # ... that nevertheless hold equal values.
    for attr in ('_x', '_d', '_sids', '_fids'):
        np.testing.assert_equal(getattr(slcs_rl, attr), getattr(slcs, attr))
def test_merge_labels_wrong_args(self):
    """``merge_labels`` raises ``ValueError`` for each kind of bad argument."""
    sids = list('abcdef')
    fids = list(range(10, 20))
    labs = [0, 0, 1, 1, 2, 3]
    x = np.random.ranf(60).reshape(6, -1)
    slcs = eda.SingleLabelClassifiedSamples(
        x, labs=labs, sids=sids, fids=fids)

    # (labels to merge, new label) pairs, checked in this order.
    bad_calls = (
        ([1, 2, 3], [5]),    # wrong new lab type
        ([[], [1]], 1),      # wrong m lab type
        ([1, 1, 2], 1),      # duplicated m labs
        ([0, 1, 5], 1),      # m lab not in original lab
    )
    for m_labs, new_lab in bad_calls:
        with pytest.raises(ValueError):
            slcs.merge_labels(m_labs, new_lab)
def test_tsne_feature_gradient_plot_wrong_args(self):
    """``tsne_feature_gradient_plot`` rejects invalid feature arguments.

    Feature ids are the strings '0'..'9', so a list, integers, and the
    string '123' are all expected to raise ``ValueError``.
    """
    sids = list(range(8))
    fids = [str(i) for i in range(10)]
    labs = list(range(8))
    np.random.seed(123)
    x = np.random.ranf(80).reshape(8, -1)
    slab_csamples = eda.SingleLabelClassifiedSamples(
        x, labs, sids=sids, fids=fids)

    # Each distinct bad argument is checked exactly once.
    bad_fids = ([0, 1], 11, -1, 5, '123')
    for bad_fid in bad_fids:
        with pytest.raises(ValueError):
            slab_csamples.tsne_feature_gradient_plot(bad_fid)
def test_lab_x(self):
    """``lab_x`` selects samples by label list, single label, or ``None``."""
    sids = list('abcdef')
    fids = list(range(10, 20))
    labs = [0, 0, 0, 1, 2, 2]
    x = np.random.ranf(60).reshape(6, -1)
    slcs = eda.SingleLabelClassifiedSamples(
        x, labs=labs, sids=sids, fids=fids)

    # Select two labels given as a list.
    picked = slcs.lab_x([0, 2])
    kept = [0, 1, 2, 4, 5]
    assert picked._x.shape == (5, 10)
    assert picked.sids == ['a', 'b', 'c', 'e', 'f']
    assert picked.labs == [0, 0, 0, 2, 2]
    assert picked.fids == list(range(10, 20))
    np.testing.assert_equal(picked.d, slcs._d[np.ix_(kept, kept)])

    # Select a single label given as a scalar.
    picked = slcs.lab_x(0)
    kept = [0, 1, 2]
    assert picked._x.shape == (3, 10)
    assert picked.sids == ['a', 'b', 'c']
    assert picked.labs == [0, 0, 0]
    assert picked.fids == list(range(10, 20))
    np.testing.assert_equal(picked.d, slcs._d[np.ix_(kept, kept)])

    # Select with None: everything is kept.
    slcs_n = slcs.lab_x(None)
    for attr in ('_x', '_d', '_sids', '_fids', '_labs'):
        np.testing.assert_equal(getattr(slcs_n, attr), getattr(slcs, attr))

    # Select non-existent labs.
    for missing in ([-1], [0, 3], [0, -3]):
        with pytest.raises(ValueError):
            slcs.lab_x(missing)
def test_id_x(self):
    """``id_x`` selects by sample and feature ids, defaulting to everything."""
    sids = list('abcdef')
    fids = list(range(10, 20))
    labs = [0, 0, 0, 1, 2, 2]
    x = np.random.ranf(60).reshape(6, -1)
    slcs = eda.SingleLabelClassifiedSamples(
        x, labs=labs, sids=sids, fids=fids)

    # Select two samples and the first five features.
    subset = slcs.id_x(['a', 'f'], list(range(10, 15)))
    assert subset._x.shape == (2, 5)
    assert subset.sids == ['a', 'f']
    assert subset.labs == [0, 2]
    assert subset.fids == list(range(10, 15))
    np.testing.assert_equal(
        subset.d, slcs._d[np.ix_((0, 5), (0, 5))])

    # Select with defaults.
    subset = slcs.id_x()
    assert subset._x.shape == (6, 10)
    assert subset.sids == list('abcdef')
    assert subset.labs == labs
    assert subset.fids == list(range(10, 20))
    np.testing.assert_equal(subset.d, slcs._d)

    # Select with explicit None for both arguments.
    subset = slcs.id_x(None, None)
    assert subset._x.shape == (6, 10)
    assert subset.sids == list('abcdef')
    assert subset.labs == labs
    assert subset.fids == list(range(10, 20))
    np.testing.assert_equal(subset.d, slcs._d)

    # Select non-existent ids: the id lookup raises ValueError.
    with pytest.raises(ValueError):
        slcs.id_x([6])
    with pytest.raises(ValueError):
        slcs.id_x(None, ['a'])
def test_labs_to_cmap():
    """``labs_to_cmap`` builds a colormap with one hls colour per unique label.

    Checked both with the lookup tables returned and without them.
    """
    sids = [0, 1, 2, 3, 4, 5, 6, 7]
    labs = list(map(str, [3, 0, 1, 0, 0, 1, 2, 2]))
    x = np.random.ranf(80).reshape(8, -1)
    slab_csamples = eda.SingleLabelClassifiedSamples(x, labs, sids)

    # Full return value, including the lookup tables.
    with_lut = eda.plot.labs_to_cmap(slab_csamples.labs, return_lut=True)
    lab_cmap, lab_norm, lab_ind_arr, lab_col_lut, uniq_lab_lut = with_lut

    n_uniq_labs = len(set(labs))
    assert lab_cmap.N == n_uniq_labs
    assert lab_cmap.colors == sns.hls_palette(n_uniq_labs)
    np.testing.assert_equal(lab_ind_arr, np.array([3, 0, 1, 0, 0, 1, 2, 2]))
    # Index array and unique-label table must reproduce the input labels.
    assert labs == [uniq_lab_lut[x] for x in lab_ind_arr]
    assert len(uniq_lab_lut) == n_uniq_labs
    assert len(lab_col_lut) == n_uniq_labs
    colours_by_index = [lab_col_lut[uniq_lab_lut[i]]
                        for i in range(n_uniq_labs)]
    assert colours_by_index == sns.hls_palette(n_uniq_labs)

    # Short return value must agree with the full one.
    lab_cmap2, lab_norm2 = eda.plot.labs_to_cmap(slab_csamples.labs,
                                                 return_lut=False)
    assert lab_cmap2.N == n_uniq_labs
    assert lab_cmap2.colors == lab_cmap.colors
    np.testing.assert_equal(lab_norm2.boundaries, lab_norm.boundaries)
def test_swarm_minimal_z(self):
    """Swarm-plot feature 'z' using only the required argument.

    Returns:
        Whatever ``feature_swarm_plot`` returns.
    """
    data = np.arange(10).reshape(5, 2)
    sample_labs = [0, 0, 1, 2, 3]
    sample_ids = ['1', '2', '3', '4', '5']
    feature_ids = ['a', 'z']
    tslcs = eda.SingleLabelClassifiedSamples(
        data, sample_labs, sample_ids, feature_ids)
    return tslcs.feature_swarm_plot('z')
def test_dmat_heatmap(self):
    """Heatmap the distance matrix for labels 0 and 1 with shifted values.

    Returns:
        Whatever ``dmat_heatmap`` returns.
    """
    points = [[0, 0], [1, 1], [2, 2],
              [10, 10], [12, 12], [11, 11],
              [100, 100]]
    point_labs = [0, 0, 0, 1, 1, 1, 2]
    tslcs = eda.SingleLabelClassifiedSamples(
        points, point_labs, metric='euclidean')
    return tslcs.dmat_heatmap(selected_labels=[0, 1],
                              transform=lambda d: d + 100)
def test_init_wrong_lab_len(self):
    """Construction fails when the label list has only two entries.

    ``self.sfm3x3_arr`` is defined outside this block; presumably a 3 x 3
    matrix, so two labels would be one too few -- TODO confirm.
    """
    mismatched_labs = [0, 1]
    with pytest.raises(Exception):
        eda.SingleLabelClassifiedSamples(
            self.sfm3x3_arr, mismatched_labs, None, None)