def test_ridge_reducer_fit(spark):
    """Check RidgeReducer.fit against a closed-form NumPy ridge solution.

    For one (header block, sample block) pair, the expected coefficients are
    computed as ``inv(X'X + alpha * I) @ X'Y`` for every value in ``alphas``,
    using only the rows whose ids are *not* listed for that sample block.
    They are then compared with the ``coefficients`` column of the fitted
    model DataFrame.

    ``data_root``, ``X0``, ``labeldf``, ``alphas``, ``RidgeReducer`` and
    ``__get_sample_blocks`` are defined outside this function.

    Args:
        spark: Spark session used to read the parquet test data.
    """
    indexdf = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    blockdf = spark.read.parquet(f'{data_root}/blockedGT.snappy.parquet')
    test_group = '0'
    test_block = 'chr_1_block_0'
    # The same predicate selects the input headers and the fitted model rows.
    block_filter = f'header_block = "{test_block}" AND sample_block = {test_group}'

    ids = indexdf.filter(f'sample_block = {test_group}').select(
        'sample_ids').head().sample_ids
    headers = [
        r.header for r in blockdf.filter(block_filter).orderBy('sort_key').
        select('header').collect()
    ]

    # Drop the sample block's rows once and reuse the dense arrays below.
    X_out = X0[headers].drop(ids, axis='rows').to_numpy()
    Y_out = labeldf.drop(ids, axis='rows').to_numpy()

    XtX_out = X_out.T @ X_out
    XtY_out = X_out.T @ Y_out
    identity = np.identity(XtX_out.shape[1])
    # One group of coefficient columns per alpha, stacked side by side.
    B = np.column_stack([
        np.linalg.inv(XtX_out + identity * a) @ XtY_out for a in alphas
    ])

    stack = RidgeReducer(alphas)
    modeldf = stack.fit(blockdf, labeldf, __get_sample_blocks(indexdf))

    columns = ['coefficients']
    rows = modeldf.filter(block_filter).select(*columns).collect()
    outdf = pd.DataFrame(rows, columns=columns)

    # np.vstack replaces np.row_stack, a deprecated alias as of NumPy 2.0.
    B_stack = np.vstack(outdf['coefficients'].to_numpy())

    assert np.allclose(B_stack, B)
def test_reducer_transform_validates_inputs(spark):
    """Exercise the input checks of RidgeReducer.transform.

    Expects ``ValueError`` when the labels or the covariates come from the
    ``*_with_missing`` fixtures, and a ``UserWarning`` when either frame has
    been shifted by 0.5 or scaled by 1.5. A final call with the untouched
    frames is made without any expectation wrapper.

    Args:
        spark: Spark session used to read the parquet test data.
    """
    sample_index = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    blocks = spark.read.parquet(
        f'{data_root}/blockedGT.snappy.parquet').limit(5)

    sample_blocks = __get_sample_blocks(sample_index)
    reducer = RidgeReducer(alphas)
    fitted_model = reducer.fit(blocks, labeldf, sample_blocks)

    def run_transform(labels, covariates):
        # Only the label and covariate frames vary between the calls below.
        return reducer.transform(blocks, labels, sample_blocks, fitted_model,
                                 covariates)

    # Frames from the *_with_missing fixtures must be rejected.
    with pytest.raises(ValueError):
        run_transform(label_with_missing, covdf)
    with pytest.raises(ValueError):
        run_transform(labeldf, covdf_with_missing)

    # Shifted or scaled frames must trigger a warning.
    with pytest.warns(UserWarning):
        run_transform(labeldf + 0.5, covdf)
    with pytest.warns(UserWarning):
        run_transform(labeldf * 1.5, covdf)
    with pytest.warns(UserWarning):
        run_transform(labeldf, covdf + 0.5)
    with pytest.warns(UserWarning):
        run_transform(labeldf, covdf * 1.5)

    # Should issue no warnings
    run_transform(labeldf, covdf)
# Example #3 (0)
def test_reducer_generate_alphas(spark):
    """Compare a reducer built without alphas to one given generated alphas.

    One ``RidgeReducer`` is constructed with no arguments, the other with the
    sorted values returned by ``generate_alphas(blockdf)``. Both the fitted
    models and the transformed outputs are required to be equal.

    Args:
        spark: Spark session used to read the parquet test data.
    """
    sample_index = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    blocks = spark.read.parquet(f'{data_root}/blockedGT.snappy.parquet').limit(5)
    sample_blocks = __get_sample_blocks(sample_index)

    implicit_reducer = RidgeReducer()
    generated_values = generate_alphas(blocks).values()
    explicit_reducer = RidgeReducer(np.array(sorted(generated_values)))

    implicit_model = implicit_reducer.fit(blocks, labeldf, sample_blocks)
    explicit_model = explicit_reducer.fit(blocks, labeldf, sample_blocks)
    __assert_dataframes_equal(implicit_model, explicit_model)

    # Both transforms are driven by the model fitted with explicit alphas.
    implicit_level1 = implicit_reducer.transform(blocks, labeldf, sample_blocks, explicit_model)
    explicit_level1 = explicit_reducer.transform(blocks, labeldf, sample_blocks, explicit_model)
    __assert_dataframes_equal(implicit_level1, explicit_level1)
# Example #4 (0)
def test_reducer_missing_alphas(spark):
    """Expect a failure when transforming with a reducer that never got alphas.

    The model is fitted by one alpha-less ``RidgeReducer`` and then passed to
    ``transform`` on a second, separately constructed alpha-less instance.
    Collecting that result must raise.

    Args:
        spark: Spark session used to read the parquet test data.
    """
    sample_index = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    blocks = spark.read.parquet(f'{data_root}/blockedGT.snappy.parquet').limit(5)
    sample_blocks = __get_sample_blocks(sample_index)

    fitting_reducer = RidgeReducer()
    transforming_reducer = RidgeReducer()

    fitted_model = fitting_reducer.fit(blocks, labeldf, sample_blocks)
    transformed = transforming_reducer.transform(blocks, labeldf, sample_blocks, fitted_model)

    # The error is expected when the result is collected.
    with pytest.raises(Exception):
        transformed.collect()
# Example #5 (0)
def test_reducer_fit_transform(spark):
    """Check that fit_transform equals fit followed by transform.

    Args:
        spark: Spark session used to read the parquet test data.
    """
    sample_index = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    blocks = spark.read.parquet(f'{data_root}/blockedGT.snappy.parquet').limit(5)
    sample_blocks = __get_sample_blocks(sample_index)

    reducer = RidgeReducer(alphas)

    # Two-step path: fit a model, then apply it.
    fitted_model = reducer.fit(blocks, labeldf, sample_blocks)
    two_step = reducer.transform(blocks, labeldf, sample_blocks, fitted_model)

    # One-step path on the same reducer instance.
    one_step = reducer.fit_transform(blocks, labeldf, sample_blocks)

    __assert_dataframes_equal(one_step, two_step)
def test_ridge_reducer_transform_with_cov(spark):
    """Check RidgeReducer.transform with covariates against a NumPy reference.

    The reference prepends the covariate columns to the genotype columns of
    one header block. It solves ``inv(X'X + diag(d)) @ X'Y`` on the rows
    outside the sample block, where ``d`` is 1 for the first ``n_cov`` entries
    and ``alpha`` for the rest. The resulting coefficients are then applied to
    the rows inside the sample block and compared with the ``values`` column
    of the transformed DataFrame.

    Args:
        spark: Spark session used to read the parquet test data.
    """
    indexdf = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    blockdf = spark.read.parquet(f'{data_root}/blockedGT.snappy.parquet')
    group = '0'
    block = 'chr_1_block_0'

    block_ids = indexdf.filter(f'sample_block = {group}') \
        .select('sample_ids') \
        .head() \
        .sample_ids
    sample_blocks = __get_sample_blocks(indexdf)

    header_rows = blockdf.filter(f'header_block = "{block}" AND sample_block= {group}') \
        .orderBy('sort_key') \
        .select('header') \
        .collect()
    headers = [row.header for row in header_rows]

    # Rows inside the sample block: covariates first, then genotypes.
    cov_inside = covdf.loc[block_ids, :].values
    geno_inside = X0[headers].loc[block_ids, :].values
    design_inside = np.column_stack([cov_inside, geno_inside])

    # Rows outside the sample block, used to solve for the coefficients.
    cov_outside = covdf.drop(block_ids, axis='rows').values
    geno_outside = X0[headers].drop(block_ids, axis='rows').values
    design_outside = np.column_stack((cov_outside, geno_outside))
    labels_outside = labeldf.drop(block_ids, axis='rows').values

    gram = design_outside.T @ design_outside
    cross = design_outside.T @ labels_outside

    # Penalty diagonal: 1 for each covariate column, alpha for the others.
    n_penalised = gram.shape[1] - n_cov
    coefficient_blocks = []
    for a in alphas:
        penalty = np.concatenate([np.ones(n_cov), np.ones(n_penalised) * a])
        coefficient_blocks.append(np.linalg.inv(gram + np.diag(penalty)) @ cross)
    coefficients = np.column_stack(coefficient_blocks)
    expected = design_inside @ coefficients

    stack = RidgeReducer(alphas)
    modeldf_cov = stack.fit(blockdf, labeldf, sample_blocks, covdf)
    level1df_cov = stack.transform(blockdf, labeldf, sample_blocks, modeldf_cov, covdf)

    columns = ['alpha', 'label', 'values']
    selected = level1df_cov.filter(f'header LIKE "%{block}%" AND sample_block= {group}')
    collected = selected.select(*columns).collect()
    outdf_cov = pd.DataFrame(collected, columns=columns)
    actual = np.column_stack(outdf_cov['values'])

    assert np.allclose(actual, expected)
def test_one_level_regression(spark):
    """Compare a one-level RidgeReducer + RidgeRegression run with a reference.

    The reference alpha, r2 and predictions for label ``sim100`` come from
    ``__calculate_y_hat``. They are compared with the ``alpha`` / ``r2_mean``
    read from the cross-validation DataFrame and with the predicted column
    for that label.

    Args:
        spark: Spark session used to read the parquet test data.
    """
    sample_index = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet')
    blocks = spark.read.parquet(f'{data_root}/blockedGT.snappy.parquet')
    label_name = 'sim100'

    sample_blocks = __get_sample_blocks(sample_index)
    expected_alpha, expected_r2, expected_y_hat = __calculate_y_hat(X1, sample_blocks, label_name)

    # Level 0: reduce the blocked genotypes.
    reducer = RidgeReducer(alphas)
    reducer_model = reducer.fit(blocks, labeldf, sample_blocks)
    reduced = reducer.transform(blocks, labeldf, sample_blocks, reducer_model)

    # Level 1: regress on the reduced features.
    regressor = RidgeRegression(alphas)
    regression_model, cv_results = regressor.fit(reduced, labeldf, sample_blocks)
    predictions = regressor.transform(reduced, labeldf, sample_blocks, regression_model,
                                      cv_results)

    best = cv_results.filter(f'label = "{label_name}"').select('alpha', 'r2_mean').head()
    actual_alpha = best.alpha
    actual_r2 = best.r2_mean
    actual_y_hat = np.array(predictions[label_name])

    # Checked in the same order as a single short-circuiting conjunction.
    assert actual_alpha == expected_alpha
    assert np.isclose(actual_r2, expected_r2)
    assert np.allclose(actual_y_hat, np.array(expected_y_hat))