def test_group_pca_matches_sklearn_iris_results(n_components, exclude_groups):
    """GroupPCA on three identical copies of iris matches per-group sklearn PCA.

    Each group's projection should equal an independent ``PCA`` fit on the
    original iris data; groups named in ``exclude_groups`` must instead be
    passed through untransformed.

    Parameters
    ----------
    n_components : forwarded to both ``PCA`` and ``GroupPCA``.
    exclude_groups : None, int, or sequence of int; groups to leave untouched.
    """
    X = iris.data
    pca = PCA(n_components=n_components)
    # Three identical copies of the iris features, one copy per group.
    X_2 = np.hstack([X, X, X])
    groups = [
        np.arange(0, X.shape[1]),
        np.arange(X.shape[1], 2 * X.shape[1]),
        np.arange(2 * X.shape[1], 3 * X.shape[1]),
    ]
    gpca = GroupPCA(
        n_components=n_components, groups=groups, exclude_groups=exclude_groups
    )
    X_r = pca.fit_transform(X)
    X_r2 = gpca.fit_transform(X_2)
    # Expected output: the sklearn projection repeated for each group...
    hstack_list = [X_r, X_r, X_r]
    if exclude_groups is not None:
        # ...except excluded groups, which GroupPCA passes through unchanged.
        if isinstance(exclude_groups, int):
            hstack_list[exclude_groups] = X
        else:
            for idx in exclude_groups:
                hstack_list[idx] = X
    assert_allclose(X_r2, np.hstack(hstack_list))
def test_theta_equals_zero_equals_unsupervised(binarize):
    """SupervisedGroupPCA with ``theta=0.0`` must reduce to plain GroupPCA.

    Parameters
    ----------
    binarize : bool; if True, test with a binary classification target
        instead of the raw regression target.
    """
    n_components = 3
    diabetes = load_diabetes()
    features, target = diabetes.data, diabetes.target
    if binarize:
        # Turn the regression target into a balanced binary label.
        target = (target > np.median(target)).astype(int)
    n_feat = features.shape[1]
    groups = [np.arange(0, n_feat), np.arange(n_feat, 2 * n_feat)]
    # Two identical feature groups built from the same data.
    stacked = np.hstack([features, features])
    unsupervised = GroupPCA(n_components=n_components, groups=groups)
    supervised = SupervisedGroupPCA(
        n_components=n_components, groups=groups, theta=0.0
    )
    assert_allclose(
        unsupervised.fit_transform(stacked, target),
        supervised.fit_transform(stacked, target),
    )
def test_group_pca_inverse_matches_input():
    """Round-tripping transform/inverse_transform recovers the input.

    Also checks the generated feature names and that a string
    ``exclude_groups`` raises ``TypeError`` at fit time.
    """
    data = iris.data
    n_feat = data.shape[1]
    doubled = np.hstack([data, data])
    groups = [np.arange(0, n_feat), np.arange(n_feat, 2 * n_feat)]
    gpca = GroupPCA(groups=groups, group_names=["one", "two"]).fit(doubled)
    transformed = gpca.transform(doubled)
    # Inverting the projection must reproduce the original features.
    assert_allclose(gpca.inverse_transform(transformed), doubled)
    # Feature names follow the "<group>_pc<idx>" pattern, half per group.
    half = transformed.shape[1] // 2
    expected_names = [f"one_pc{idx}" for idx in range(half)]
    expected_names += [f"two_pc{idx}" for idx in range(half)]
    assert gpca.get_feature_names() == expected_names
    # A string exclude_groups is neither an int nor a sequence of ints.
    with pytest.raises(TypeError):
        GroupPCA(groups=groups, exclude_groups="error").fit(doubled)
# Unpack metadata from the AFQ dataset object.
feature_names = afqdata.feature_names
group_names = afqdata.group_names
subjects = afqdata.subjects

# Here we reduce computation time by taking the first 10 principal components
# of each feature group and performing SGL logistic regression on those
# components. If you want to train an SGL model without group PCA, set
# ``do_group_pca = False``. This will increase the number of features by an
# order of magnitude and slow down execution time.
do_group_pca = True

if do_group_pca:
    n_components = 10

    # The next three lines retrieve the group structure of the group-wise PCA
    # and store it in ``groups_pca``. We do not use the imputer or GroupPCA
    # transformer for anything else.
    imputer = SimpleImputer(strategy="median")
    gpca = GroupPCA(n_components=n_components, groups=groups)
    groups_pca = gpca.fit(imputer.fit_transform(X)).groups_out_

    transformer = GroupPCA
    transformer_kwargs = {"groups": groups, "n_components": n_components}
else:
    # No feature transformation: SGL operates on the raw feature groups.
    transformer = False
    transformer_kwargs = None

# NOTE(review): the call below is truncated in this view — the
# ``groups=...`` conditional expression is cut off mid-statement.
pipe = make_afq_classifier_pipeline(
    imputer_kwargs={"strategy": "median"},  # Use median imputation
    use_cv_estimator=True,  # Automatically determine the best hyperparameters
    feature_transformer=transformer,  # See note above about group PCA
    feature_transformer_kwargs=transformer_kwargs,
    scaler="standard",  # Standard scale the features before regression
    groups=groups_pca if do_group_pca else