Ejemplo n.º 1
0
    def test_gowl_vs_glasso_duality_gap_3(self):
        """
        Duality Gap goes negative in this case. Should that happen?
        """
        np.random.seed(680)
        p = 10
        blocks = [
            Block(dim=p,
                  idx=0,
                  block_min_size=2,
                  block_max_size=6,
                  block_value=0.9),
            Block(dim=p,
                  idx=1,
                  block_min_size=2,
                  block_max_size=6,
                  block_value=-0.9),
            Block(dim=p,
                  idx=3,
                  block_min_size=2,
                  block_max_size=6,
                  block_value=-0.5),
        ]
        theta_star, blocks, theta_blocks = generate_theta_star_gowl(p=p,
                                                                    alpha=0.5,
                                                                    noise=0.1,
                                                                    blocks=blocks)
        lam1 = 0.001  # controls sparsity
        lam2 = 0.01  # encourages equality of coefficients
        rho = oscar_weights(lam1, lam2, (p ** 2 - p) / 2)

        theta_star = theta_star[0]
        sigma = np.linalg.inv(theta_star)
        n = 100
        X = np.random.multivariate_normal(np.zeros(p), sigma, n)
        X = standardize(X)
        S = np.cov(X.T)

        theta_0 = np.linalg.inv(S)
        model = GOWLModel(X, S, theta_0, lam1, lam2, 'backtracking', max_iters=100000)
        model.fit()
        theta_gowl = model.theta_hat

        gl = GraphicalLasso(max_iter=200)
        gl.fit(S)
        theta_glasso = gl.get_precision()

        print('Non zero entries in precision matrix {}'.format(np.count_nonzero(theta_gowl)))
        plot_multiple_theta_matrices_2d([theta_blocks, theta_star, theta_glasso, theta_gowl],
                                        [f"Blocks: {len(blocks)}", 'True Theta', 'GLASSO', 'GOWL'])

        _fit_evaluations(theta_star, theta_glasso, 3, 'GLASSO')
        _fit_evaluations(theta_star, theta_gowl, 3, 'GOWL')

        y_hat_gowl = spectral_clustering(theta=theta_gowl, K=4)
        y_hat_glasso = spectral_clustering(theta=theta_glasso, K=4)
        y_true = spectral_clustering(theta=theta_blocks, K=4).flatten()
        _cluster_evaluations(y_true, y_hat_gowl, 'GOWL')
        _cluster_evaluations(y_true, y_hat_glasso, 'GLASSO')
Ejemplo n.º 2
0
    def test_ccgowl_vs_grab_1(self):
        np.random.seed(680)
        p = 10
        n_blocks = 1
        theta_star, blocks, theta_blocks = generate_theta_star_gowl(p=p,
                                                                    alpha=0.5,
                                                                    noise=0.1,
                                                                    n_blocks=n_blocks,
                                                                    block_min_size=2,
                                                                    block_max_size=6)
        theta_star = theta_star[0]
        sigma = np.linalg.inv(theta_star)
        n = 100
        X = np.random.multivariate_normal(np.zeros(p), sigma, n)
        X = standardize(X)
        S = np.cov(X.T)

        lam1 = 0.05263158
        lam2 = 0.05263158

        theta_owl = np.zeros((p, p))
        model = CCGOWLModel(X, lam1, lam2)
        model.fit()
        theta_ccgowl = model.theta_hat

        lmbda = .2
        K = 10
        o_size = .3  # The size of overlap, as an input parameter
        max_iter = 20
        tol = 1e-4
        dual_max_iter = 600
        dual_tol = 1e-4
        theta_grab, blocks = grab.BCD(S,
                                      lmbda=lmbda,
                                      K=K,
                                      o_size=o_size,
                                      max_iter=max_iter,
                                      tol=tol,
                                      dual_max_iter=dual_max_iter,
                                      dual_tol=dual_tol)

        theta_grab = np.asarray(theta_grab)

        print('Non zero entries in precision matrix {}'.format(np.count_nonzero(theta_owl)))
        plot_multiple_theta_matrices_2d([S, theta_blocks, theta_star, theta_grab, theta_ccgowl],
                                        ['Sample Covariance', f"Blocks: {len(blocks)}", 'True Theta', 'GRAB', 'CCGOWL'])

        _fit_evaluations(theta_star, theta_grab, 1, 'GRAB')
        _fit_evaluations(theta_star, theta_owl, 1, 'GOWL')

        y_hat_gowl = spectral_clustering(theta=theta_owl, K=2)
        y_hat_grab = spectral_clustering(theta=theta_grab, K=2)
        y_true = spectral_clustering(theta=theta_blocks, K=2).flatten()
        _cluster_evaluations(y_true, y_hat_gowl, 'CCGOWL')
        _cluster_evaluations(y_true, y_hat_grab, 'GRAB')
Ejemplo n.º 3
0
    def test_gowl_vs_grab_1(self):
        np.random.seed(680)
        p = 10
        n_blocks = 1
        theta_star, blocks, theta_blocks = generate_theta_star_gowl(p=p,
                                                                    alpha=0.5,
                                                                    noise=0.1,
                                                                    n_blocks=n_blocks,
                                                                    block_min_size=2,
                                                                    block_max_size=6)
        theta_star = theta_star[0]
        sigma = np.linalg.inv(theta_star)
        n = 100
        X = np.random.multivariate_normal(np.zeros(p), sigma, n)
        X = standardize(X)
        S = np.cov(X.T)

        lam1 = 0.001  # controls sparsity
        lam2 = 0.01  # encourages equality of coefficients

        lmbda = .2
        K = 10
        o_size = .3  # The size of overlap, as an input parameter
        max_iter = 20
        tol = 1e-4
        dual_max_iter = 600
        dual_tol = 1e-4
        theta_grab, blocks = grab.BCD(S,
                                      lmbda=lmbda,
                                      K=K,
                                      o_size=o_size,
                                      max_iter=max_iter,
                                      tol=tol,
                                      dual_max_iter=dual_max_iter,
                                      dual_tol=dual_tol)

        theta_grab = np.asarray(theta_grab)

        model = GOWLModel(X, S, lam1, lam2, 'backtracking', max_iters=100000)
        model.fit()
        theta_gowl = model.theta_hat

        print('Non zero entries in precision matrix {}'.format(np.count_nonzero(theta_gowl)))
        plot_multiple_theta_matrices_2d([theta_blocks, theta_star, theta_grab, theta_gowl],
                                        [f"1 Block of Size 2", 'True Theta', 'GRAB', 'GOWL'])

        _fit_evaluations(theta_star, theta_grab, 1, 'GRAB')
        _fit_evaluations(theta_star, theta_gowl, 1, 'GOWL')

        y_hat_gowl = spectral_clustering(theta=theta_gowl, K=2)
        y_hat_grab = spectral_clustering(theta=theta_grab, K=2)
        y_true = spectral_clustering(theta=theta_blocks, K=2).flatten()
        _cluster_evaluations(y_true, y_hat_gowl, 'GOWL')
        _cluster_evaluations(y_true, y_hat_grab, 'GRAB')
Ejemplo n.º 4
0
def read_stock_data():
    proj_root_path = pathlib.Path.cwd().parent.parent
    data_path = 'data/raw/s_&_p/data.csv'
    sector_path = 'data/raw/s_&_p/info.csv'
    data_full_path = proj_root_path / data_path
    sectors_full_path = proj_root_path / sector_path
    data = pd.read_csv(data_full_path, index_col=0)
    info = pd.read_csv(sectors_full_path)
    data = standardize(data)
    stock_names = info['V1']
    sectors = info['V2']
    data.columns = [f'{comp}/{gic}' for comp, gic in zip(stock_names, sectors)]

    return data, data.columns
Ejemplo n.º 5
0
def load_gene_subset():
    proj_root_path = pathlib.Path.cwd().parent.parent
    data_path = 'data/processed/golub_et_al/AML_ALL_reduced_50.csv'
    reactome_path = 'data/processed/golub_et_al/reactome.csv'
    data_full_path = proj_root_path / data_path
    reactome_full_path = proj_root_path / reactome_path
    df = pd.read_csv(data_full_path)
    react_df = pd.read_csv(reactome_full_path)
    labels = {
        k: f"{k}/{label}"
        for k, label in zip(react_df['gene'].to_list(), react_df['pathway'].to_list())
    }
    X = df.drop([0, 1], axis=0)
    X = X.drop(['Unnamed: 0'], axis=1)
    X.index = pd.to_numeric(X.index)
    X.sort_index(inplace=True)
    X = X.astype(float)
    X = standardize(X)

    return X, labels