Example #1
0
def compare_linreg_to_regenie(spark, output_prefix, missing=None, single_chr=True):
    """Run ``lr.linear_regression`` on the fixture inputs and compare it to regenie.

    The four input frames come from ``fx.get_input_dfs`` (requested with
    ``binary=False``); the regression result is converted to pandas and handed
    to ``fx.compare_to_regenie`` together with ``output_prefix``.

    Args:
        spark: Session object forwarded to ``fx.get_input_dfs``.
        output_prefix: Forwarded unchanged to ``fx.compare_to_regenie``
            (presumably identifies the regenie reference output -- confirm
            against the fixture module).
        missing: Forwarded to ``fx.get_input_dfs``. When omitted, a new empty
            list is created for each call.
        single_chr: Forwarded to ``fx.get_input_dfs``.
    """
    # A literal ``[]`` default is created once and shared by every call, so any
    # in-place change downstream would leak into later calls. Build it per call.
    if missing is None:
        missing = []

    (genotype_df, phenotype_df, covariate_df, offset_df) = fx.get_input_dfs(spark,
                                                                            binary=False,
                                                                            missing=missing,
                                                                            single_chr=single_chr)
    glowgr_df = lr.linear_regression(genotype_df,
                                     phenotype_df,
                                     covariate_df,
                                     offset_df,
                                     values_column='values').toPandas()
    fx.compare_to_regenie(output_prefix, glowgr_df)
Example #2
0
def test_values_expr(spark, rg):
    """Check that ``values_column`` may be a column expression instead of a name.

    The genotype frame carries only a dummy ``genotypes`` column; the values fed
    to the regression come from an ``array(...)`` expression built from literals
    ``0 .. num_samples - 1``. The result must match the statsmodels baseline
    computed from the same values, and ``genotypes`` must not be a result column.
    """
    from pyspark.sql.functions import array, lit

    num_samples = 5
    genotype_df = spark.range(1).withColumn('genotypes', lit(42))

    # Keep the draw order (phenotypes first, then covariates) so the random
    # generator produces the same data as before.
    phenotype_df = pd.DataFrame(rg.random((num_samples, 5)))
    covariate_df = pd.DataFrame(rg.random((num_samples, 2)))

    values_expr = array(*(lit(sample_idx) for sample_idx in range(num_samples)))
    results = lr.linear_regression(genotype_df,
                                   phenotype_df,
                                   covariate_df,
                                   values_column=values_expr)

    # The baseline sees the same values as a plain pandas column.
    expected_genotypes = pd.DataFrame({'values': range(num_samples)})
    baseline = statsmodels_baseline(expected_genotypes, phenotype_df, covariate_df)

    observed = results.drop('id').toPandas()
    assert regression_results_equal(baseline, observed)

    result_columns = [field.name for field in results.schema]
    assert 'genotypes' not in result_columns
Example #3
0
def run_linear_regression_spark(spark,
                                genotype_df,
                                phenotype_df,
                                covariate_df=None,
                                extra_cols=None,
                                offset_df=None,
                                values_column='values',
                                **kwargs):
    """Run ``lr.linear_regression`` through Spark on pandas inputs.

    Each column of ``genotype_df`` becomes one row of the Spark input frame,
    holding that column's values as a list under ``values_column``. An ``idx``
    column records the original row order so the output can be sorted
    deterministically.

    Args:
        spark: Object whose ``createDataFrame`` builds the Spark input frame.
        genotype_df: pandas frame; its transposed values populate ``values_column``.
        phenotype_df: Forwarded to ``lr.linear_regression``.
        covariate_df: Forwarded to ``lr.linear_regression``; an empty frame when omitted.
        extra_cols: Optional pandas frame concatenated column-wise onto the
            input frame when it is not empty.
        offset_df: Forwarded to ``lr.linear_regression``; an empty frame when omitted.
        values_column: Name of the column holding the per-row value lists.
        **kwargs: Forwarded to ``lr.linear_regression``.

    Returns:
        pandas DataFrame of results sorted by ``phenotype`` then ``idx``, with
        the ``idx`` column removed.
    """
    # Defaults used to be ``pd.DataFrame({})`` objects created once at definition
    # time and shared by every call; create fresh empty frames per call instead.
    if covariate_df is None:
        covariate_df = pd.DataFrame({})
    if extra_cols is None:
        extra_cols = pd.DataFrame({})
    if offset_df is None:
        offset_df = pd.DataFrame({})

    # Transpose so that every genotype column turns into one list-valued row.
    pdf = pd.DataFrame({values_column: genotype_df.to_numpy().T.tolist()})
    if not extra_cols.empty:
        pdf = pd.concat([pdf, extra_cols], axis=1)
    pdf['idx'] = pdf.index

    spark_results = lr.linear_regression(spark.createDataFrame(pdf),
                                         phenotype_df,
                                         covariate_df,
                                         offset_df,
                                         values_column=values_column,
                                         **kwargs)
    # Sort on (phenotype, idx) for a stable order, then drop the helper column.
    results = spark_results.toPandas().sort_values(['phenotype', 'idx']).drop('idx', axis=1)
    return results