def test_maui_merges_latent_factors():
    """Identical latent-factor columns are merged under the euclidean metric.

    In the fixture LF0, LF1 and LF2 hold the same values for every sample, and
    so do LF3 and LF7.  The nine input factors are therefore expected to come
    back as six columns, two of which are named ``0_1_2`` and ``3_7``.
    """
    sample_ids = [f"sample {i}" for i in range(11)]
    factor_ids = [f"LF{i}" for i in range(9)]
    latent_values = [
        [1, 1, 1, 0, 0, 0, 1, 0, 0],
        [1, 1, 1, 1, 0, 1, 1, 1, 0],
        [1, 1, 1, 1, 0, 1, 1, 1, 0],
        [1, 1, 1, 1, 0, 1, 1, 1, 0],
        [1, 1, 1, 1, 0, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 0, 0, 1, 0],
        [0, 0, 0, 1, 0, 0, 1, 1, 0],
        [0, 0, 0, 1, 0, 0, 1, 1, 0],
        [0, 0, 0, 1, 0, 0, 1, 1, 0],
        [0, 0, 0, 1, 0, 0, 1, 1, 0],
        [0, 0, 0, 1, 0, 1, 1, 1, 0],
    ]

    maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)
    # Inject the latent factors directly instead of fitting the model.
    maui_model.z_ = pd.DataFrame(
        latent_values, index=sample_ids, columns=factor_ids, dtype=float
    )

    z_merged = maui_model.merge_similar_latent_factors(distance_metric="euclidean")

    merged_names = z_merged.columns
    assert z_merged.shape[1] == 6
    assert "0_1_2" in merged_names
    assert "3_7" in merged_names
def test_select_clinical_factors():
    """A factor that tracks survival is selected; an unrelated one is not.

    LF0-LF2 are 1 for samples 0-5 and 0 for samples 6-10, which lines up with
    the short (1-6) versus long (1000-5000) durations below.  The test checks
    that LF0 is selected and that LF5 is not.
    """
    sample_ids = [f"sample {i}" for i in range(11)]
    latent_values = [
        [1, 1, 1, 0, 0, 0, 1, 0, 1],
        [1, 1, 1, 1, 0, 1, 1, 1, 0],
        [1, 1, 1, 1, 0, 1, 1, 1, 0],
        [1, 1, 1, 1, 0, 1, 1, 1, 0],
        [1, 1, 1, 1, 0, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 0, 0, 1, 0],
        [0, 0, 0, 1, 0, 0, 1, 1, 0],
        [0, 0, 0, 1, 0, 0, 1, 1, 0],
        [0, 0, 0, 1, 0, 0, 1, 1, 0],
        [0, 0, 0, 1, 0, 0, 1, 1, 0],
        [0, 0, 0, 1, 0, 1, 1, 1, 1],
    ]

    maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)
    maui_model.z_ = pd.DataFrame(
        latent_values,
        index=sample_ids,
        columns=[f"LF{i}" for i in range(9)],
    )

    # Six short durations followed by five long ones; every event is observed.
    survival = pd.DataFrame(
        {
            "duration": [1, 2, 3, 4, 5, 6, 1000, 2000, 3000, 4000, 5000],
            "observed": [True] * 11,
        },
        index=sample_ids,
    )

    z_clin = maui_model.select_clinical_factors(survival, cox_penalizer=1)

    selected = z_clin.columns
    assert "LF0" in selected
    assert "LF5" not in selected
def test_maui_computes_harrells_c():
    """``c_index`` on the toy survival data yields 0.8 for both returned scores.

    The latent factors LF0-LF2 are 1 for samples 0-5 (durations 1-6) and 0 for
    samples 6-10 (durations 1000-5000).  With ``cv_folds=2`` the result is
    compared against ``[0.8, 0.8]``.
    """
    sample_ids = [f"sample {i}" for i in range(11)]
    short_lived = [1, 1, 1, 1, 0, 1, 1, 1, 0]
    long_lived = [0, 0, 0, 1, 0, 0, 1, 1, 0]
    latent_values = (
        [[1, 1, 1, 0, 0, 0, 1, 0, 1]]
        + [short_lived] * 4
        + [[1, 1, 1, 1, 1, 0, 0, 1, 0]]
        + [long_lived] * 4
        + [[0, 0, 0, 1, 0, 1, 1, 1, 1]]
    )

    maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)
    maui_model.z_ = pd.DataFrame(
        latent_values,
        index=sample_ids,
        columns=[f"LF{i}" for i in range(9)],
    )

    # Six short durations, five long ones, all events observed.
    survival = pd.DataFrame(
        {
            "duration": [1, 2, 3, 4, 5, 6, 1000, 2000, 3000, 4000, 5000],
            "observed": [True] * 11,
        },
        index=sample_ids,
    )

    cs = maui_model.c_index(
        survival,
        clinical_only=True,
        duration_column="duration",
        observed_column="observed",
        cox_penalties=[0.1, 1, 10, 100, 1000, 10000],
        cv_folds=2,
        sel_clin_alpha=0.05,
        sel_clin_penalty=1,
    )
    print(cs)

    expected_scores = [0.8, 0.8]
    assert np.allclose(cs, expected_scores)
def test_maui_merge_latent_factors_complains_if_unknown_merge_by():
    """``merge_similar_latent_factors`` must raise for ``distance_in="xxx"``.

    The latent-factor values themselves are irrelevant here; they only give
    the model something to operate on before the call is expected to fail.
    """
    maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)
    maui_model.z_ = pd.DataFrame(
        [
            [1, 1, 1, 0, 0, 0, 1, 0, 0],
            [1, 1, 1, 1, 0, 1, 1, 1, 0],
            [1, 1, 1, 1, 0, 1, 1, 1, 0],
            [1, 1, 1, 1, 0, 1, 1, 1, 0],
            [1, 1, 1, 1, 0, 1, 1, 1, 0],
            [1, 1, 1, 1, 1, 0, 0, 1, 0],
            [0, 0, 0, 1, 0, 0, 1, 1, 0],
            [0, 0, 0, 1, 0, 0, 1, 1, 0],
            [0, 0, 0, 1, 0, 0, 1, 1, 0],
            [0, 0, 0, 1, 0, 0, 1, 1, 0],
            [0, 0, 0, 1, 0, 1, 1, 1, 0],
        ],
        index=[f"sample {i}" for i in range(11)],
        columns=[f"LF{i}" for i in range(9)],
        dtype=float,
    )

    # The call is expected to raise, so its return value is deliberately not
    # bound to a name (it could never be used).
    # NOTE(review): Exception is very broad; narrow it once the concrete
    # exception type raised by the model is confirmed.
    with pytest.raises(Exception):
        maui_model.merge_similar_latent_factors(
            distance_in="xxx", distance_metric="euclidean"
        )
def test_maui_clusters_picks_optimal_k_by_ami():
    """``cluster`` stores the k whose (mocked) AMI score is highest.

    ``sklearn.metrics.adjusted_mutual_info_score`` is replaced by a mock that
    returns 2, 3 and 1 on successive calls, so the best score belongs to the
    second candidate in ``optimal_k_range`` (k=2).
    """
    sample_ids = [f"sample {i}" for i in range(10)]
    # Scores handed out per call: the maximum (3) comes on the second trial.
    ami_mock = mock.Mock(side_effect=[2, 3, 1])

    with mock.patch("sklearn.metrics.adjusted_mutual_info_score", ami_mock):
        maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)
        maui_model.z_ = pd.DataFrame(
            np.random.randn(10, 2), index=sample_ids, columns=["LF1", "LF2"]
        )
        maui_model.x_ = pd.DataFrame(
            np.random.randn(20, 10),
            index=[f"feature {i}" for i in range(20)],
            columns=[f"sample {i}" for i in range(10)],
        )

        the_y = pd.Series(np.arange(10), index=maui_model.z_.index)
        candidate_ks = [1, 2, 3]
        maui_model.cluster(ami_y=the_y, optimal_k_range=candidate_ks)

    assert maui_model.optimal_k_ == 2
def test_maui_merges_latent_factors_by_w():
    """Merging with ``distance_in="w"`` groups the nine factors into four.

    The expected merged column names are ``0_1_2``, ``3_6_7`` and ``4_5``.
    """
    # NOTE(review): a function with this exact name is defined again further
    # down in this file; within one module the later definition replaces this
    # one - confirm which copy should be kept.
    sample_ids = [f"sample {i}" for i in range(11)]
    latent_values = [
        [1, 1, 1, 0, 0, 0, 1, 0, 0],
        [1, 1, 1, 1, 0, 1, 1, 1, 0],
        [1, 1, 1, 1, 0, 1, 1, 1, 0],
        [1, 1, 1, 1, 0, 1, 1, 1, 0],
        [1, 1, 1, 1, 0, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 0, 0, 1, 0],
        [0, 0, 0, 1, 0, 0, 1, 1, 0],
        [0, 0, 0, 1, 0, 0, 1, 1, 0],
        [0, 0, 0, 1, 0, 0, 1, 1, 0],
        [0, 0, 0, 1, 0, 0, 1, 1, 0],
        [0, 0, 0, 1, 0, 1, 1, 1, 0],
    ]
    # Single input feature: 1 for the first six samples, 0 for the last five.
    feature_values = [[1], [1], [1], [1], [1], [1], [0], [0], [0], [0], [0]]

    maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)
    maui_model.z_ = pd.DataFrame(
        latent_values,
        index=sample_ids,
        columns=[f"LF{i}" for i in range(9)],
        dtype=float,
    )
    maui_model.x_ = pd.DataFrame(
        feature_values,
        index=[f"sample {i}" for i in range(11)],
        columns=["Feature 1"],
        dtype=float,
    )

    z_merged = maui_model.merge_similar_latent_factors(
        distance_in="w", distance_metric="euclidean"
    )

    assert z_merged.shape[1] == 4
    for merged_name in ("0_1_2", "3_6_7", "4_5"):
        assert merged_name in z_merged.columns
def test_maui_drops_unexplanatody_factors_by_r2():
    """``drop_unexplanatory_factors`` keeps eight of the nine latent factors.

    LF8 is zero for every sample in the fixture; the test only asserts that
    exactly eight columns remain after filtering.
    """
    sample_ids = [f"sample {i}" for i in range(11)]
    factor_ids = [f"LF{i}" for i in range(9)]

    maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)
    maui_model.z_ = pd.DataFrame(
        [
            [1, 1, 1, 0, 0, 0, 1, 0, 0],
            [1, 1, 1, 1, 0, 1, 1, 1, 0],
            [1, 1, 1, 1, 0, 1, 1, 1, 0],
            [1, 1, 1, 1, 0, 1, 1, 1, 0],
            [1, 1, 1, 1, 0, 1, 1, 1, 0],
            [1, 1, 1, 1, 1, 0, 0, 1, 0],
            [0, 0, 0, 1, 0, 0, 1, 1, 0],
            [0, 0, 0, 1, 0, 0, 1, 1, 0],
            [0, 0, 0, 1, 0, 0, 1, 1, 0],
            [0, 0, 0, 1, 0, 0, 1, 1, 0],
            [0, 0, 0, 1, 0, 1, 1, 1, 0],
        ],
        index=sample_ids,
        columns=factor_ids,
        dtype=float,
    )
    # One feature that is 1 for the first six samples and 0 for the rest.
    maui_model.x_ = pd.DataFrame(
        [[1]] * 6 + [[0]] * 5,
        index=[f"sample {i}" for i in range(11)],
        columns=["Feature 1"],
        dtype=float,
    )

    z_filt = maui_model.drop_unexplanatory_factors()

    n_kept = z_filt.shape[1]
    assert n_kept == 8
def test_maui_clusters_with_single_k():
    """``cluster(5)`` returns one label per sample (shape ``(10,)``)."""
    n_samples = 10
    sample_ids = [f"sample {i}" for i in range(n_samples)]

    maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)
    # z_ is drawn before x_ so the random stream is consumed in the same order.
    latent = np.random.randn(10, 2)
    maui_model.z_ = pd.DataFrame(latent, index=sample_ids, columns=["LF1", "LF2"])
    features = np.random.randn(20, 10)
    maui_model.x_ = pd.DataFrame(
        features,
        index=[f"feature {i}" for i in range(20)],
        columns=[f"sample {i}" for i in range(10)],
    )

    yhat = maui_model.cluster(5)

    assert yhat.shape == (10,)
def test_maui_clusters_picks_optimal_k_by_silhouette():
    """``cluster`` stores the k whose (mocked) silhouette score is highest.

    ``sklearn.metrics.silhouette_score`` is patched to return 2, 3 and 1 on
    successive calls; the maximum is on the second trial, i.e. k=2.
    """
    sample_ids = [f"sample {i}" for i in range(10)]
    feature_ids = [f"feature {i}" for i in range(20)]
    silhouette_mock = mock.Mock(side_effect=[2, 3, 1])

    with mock.patch("sklearn.metrics.silhouette_score", silhouette_mock):
        maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)
        maui_model.z_ = pd.DataFrame(
            np.random.randn(10, 2), index=sample_ids, columns=["LF1", "LF2"]
        )
        maui_model.x_ = pd.DataFrame(
            np.random.randn(20, 10),
            index=feature_ids,
            columns=[f"sample {i}" for i in range(10)],
        )
        maui_model.cluster(optimal_k_method="silhouette", optimal_k_range=[1, 2, 3])

    assert maui_model.optimal_k_ == 2
def test_maui_clusters_only_samples_in_y_index_when_optimizing():
    """Clustering against ``ami_y`` only returns labels for the samples in ``y``.

    The model holds ten samples but ``y`` labels only the first six; the
    returned assignment must be indexed by exactly those six samples.
    """
    # NOTE(review): a function with this exact name is defined again further
    # down in this file; within one module the later definition replaces this
    # one - confirm which copy should be kept.

    # Seed the global NumPy RNG so the random fixture (and therefore any
    # failure) is reproducible from run to run.
    np.random.seed(0)

    maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)
    maui_model.z_ = pd.DataFrame(
        np.random.randn(10, 2),
        index=[f"sample {i}" for i in range(10)],
        columns=["LF1", "LF2"],
    )
    maui_model.x_ = pd.DataFrame(
        np.random.randn(20, 10),
        index=[f"feature {i}" for i in range(20)],
        columns=[f"sample {i}" for i in range(10)],
    )

    # Labels exist for samples 0-5 only.
    y = pd.Series(
        ["a", "a", "a", "b", "b", "b"],
        index=[f"sample {i}" for i in range(6)],
    )

    yhat = maui_model.cluster(ami_y=y, optimal_k_range=[1, 2, 3])

    assert set(yhat.index) == set(y.index)
def test_maui_clusters_picks_optimal_k_with_custom_scoring():
    """``cluster`` accepts a callable scorer and keeps the best-scoring k.

    The scorer is a mock returning 2, 3 and 1 on successive calls, so the
    highest score is produced on the second trial (k=2).
    """
    scorer = mock.Mock(side_effect=[2, 3, 1])
    # Mock objects have no __name__ by default; it is set explicitly here.
    scorer.__name__ = "mock_scorer"

    sample_ids = [f"sample {i}" for i in range(10)]
    maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)
    latent = np.random.randn(10, 2)
    maui_model.z_ = pd.DataFrame(latent, index=sample_ids, columns=["LF1", "LF2"])
    features = np.random.randn(20, 10)
    maui_model.x_ = pd.DataFrame(
        features,
        index=[f"feature {i}" for i in range(20)],
        columns=[f"sample {i}" for i in range(10)],
    )

    maui_model.cluster(optimal_k_method=scorer, optimal_k_range=[1, 2, 3])

    assert maui_model.optimal_k_ == 2
def test_maui_merges_latent_factors_by_w():
    """With ``distance_in='w'`` the nine latent factors merge down to four.

    Expected groups, by factor number: 0/1/2, 3/6/7 and 4/5.
    """
    # NOTE(review): a function with this exact name is also defined earlier in
    # this file; within one module this later definition replaces the earlier
    # one - confirm which copy should be kept.
    first_group_row = [1, 1, 1, 1, 0, 1, 1, 1, 0]
    second_group_row = [0, 0, 0, 1, 0, 0, 1, 1, 0]
    latent_values = (
        [[1, 1, 1, 0, 0, 0, 1, 0, 0]]
        + [first_group_row] * 4
        + [[1, 1, 1, 1, 1, 0, 0, 1, 0]]
        + [second_group_row] * 4
        + [[0, 0, 0, 1, 0, 1, 1, 1, 0]]
    )
    sample_ids = [f"sample {i}" for i in range(11)]

    maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)
    maui_model.z_ = pd.DataFrame(
        latent_values,
        index=sample_ids,
        columns=[f"LF{i}" for i in range(9)],
        dtype=float,
    )
    # One feature: 1 for the first six samples, 0 for the remaining five.
    maui_model.x_ = pd.DataFrame(
        [[1]] * 6 + [[0]] * 5,
        index=[f"sample {i}" for i in range(11)],
        columns=["Feature 1"],
        dtype=float,
    )

    z_merged = maui_model.merge_similar_latent_factors(
        distance_in="w", distance_metric="euclidean"
    )

    merged_names = z_merged.columns
    assert z_merged.shape[1] == 4
    assert "0_1_2" in merged_names
    assert "3_6_7" in merged_names
    assert "4_5" in merged_names
def test_maui_clusters_only_samples_in_y_index_when_optimizing():
    """Cluster labels come back only for the samples present in ``y``.

    Ten samples are loaded into the model, but the reference labels cover
    just samples 0-5; the result's index must equal that of ``y``.
    """
    # NOTE(review): a function with this exact name is also defined earlier in
    # this file; within one module this later definition replaces the earlier
    # one - confirm which copy should be kept.
    np.random.seed(0)  # fixed seed: the random fixture below is reproducible

    all_samples = [f"sample {i}" for i in range(10)]
    labelled_samples = [f"sample {i}" for i in range(6)]

    maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)
    latent = np.random.randn(10, 2)
    maui_model.z_ = pd.DataFrame(latent, index=all_samples, columns=["LF1", "LF2"])
    features = np.random.randn(20, 10)
    maui_model.x_ = pd.DataFrame(
        features,
        index=[f"feature {i}" for i in range(20)],
        columns=[f"sample {i}" for i in range(10)],
    )

    y = pd.Series(["a", "a", "a", "b", "b", "b"], index=labelled_samples)
    yhat = maui_model.cluster(ami_y=y, optimal_k_range=[1, 2, 3])

    assert set(yhat.index) == set(y.index)
def test_maui_computes_roc_and_auc():
    """``compute_roc`` / ``compute_auc`` return what they store on the model.

    The ROC result must equal ``roc_curves_`` and contain an entry for each of
    the three classes plus ``"mean"``; the AUC result must equal ``aucs_``.
    """
    # NOTE(review): a function with this exact name is defined again further
    # down in this file; within one module the later definition replaces this
    # one - confirm which copy should be kept.
    latent_values = [
        [0, 1, 1, 1, 0, 1, 1, 0, 0],
        [1, 0, 0, 0, 0, 0, 1, 1, 0],
        [1, 0, 1, 0, 0, 0, 1, 1, 0],
        [1, 0, 0, 1, 0, 0, 1, 1, 0],
        [1, 0, 0, 0, 1, 1, 1, 1, 0],
        [1, 1, 1, 0, 0, 0, 1, 1, 1],
    ]
    sample_ids = [f"sample {i}" for i in range(6)]
    factor_ids = [f"LF{i}" for i in range(9)]

    maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)
    maui_model.z_ = pd.DataFrame(latent_values, index=sample_ids, columns=factor_ids)

    # Three classes, two samples each.
    y = pd.Series(["a", "b", "a", "c", "b", "c"], index=maui_model.z_.index)

    rocs = maui_model.compute_roc(y, cv_folds=2)
    assert rocs == maui_model.roc_curves_
    for label in ("a", "b", "c", "mean"):
        assert label in rocs

    aucs = maui_model.compute_auc(y, cv_folds=2)
    assert aucs == maui_model.aucs_
def test_maui_computes_roc_and_auc():
    """ROC curves and AUCs are returned and also cached on the model.

    Checks that ``compute_roc`` returns the same object contents as
    ``roc_curves_`` with keys for classes a, b, c and ``"mean"``, and that
    ``compute_auc`` returns the same contents as ``aucs_``.
    """
    # NOTE(review): a function with this exact name is also defined earlier in
    # this file; within one module this later definition replaces the earlier
    # one - confirm which copy should be kept.
    maui_model = Maui(n_hidden=[10], n_latent=2, epochs=1)

    latent_frame = pd.DataFrame(
        [
            [0, 1, 1, 1, 0, 1, 1, 0, 0],
            [1, 0, 0, 0, 0, 0, 1, 1, 0],
            [1, 0, 1, 0, 0, 0, 1, 1, 0],
            [1, 0, 0, 1, 0, 0, 1, 1, 0],
            [1, 0, 0, 0, 1, 1, 1, 1, 0],
            [1, 1, 1, 0, 0, 0, 1, 1, 1],
        ],
        index=[f"sample {i}" for i in range(6)],
        columns=[f"LF{i}" for i in range(9)],
    )
    maui_model.z_ = latent_frame

    class_labels = ["a", "b", "a", "c", "b", "c"]
    y = pd.Series(class_labels, index=maui_model.z_.index)

    rocs = maui_model.compute_roc(y, cv_folds=2)
    assert rocs == maui_model.roc_curves_
    expected_keys = ["a", "b", "c", "mean"]
    assert all(key in rocs for key in expected_keys)

    aucs = maui_model.compute_auc(y, cv_folds=2)
    assert aucs == maui_model.aucs_