def compare_linreg_to_regenie(spark, output_prefix, missing=None, single_chr=True):
    """Run ``lr.linear_regression`` on fixture inputs and compare with regenie.

    The input DataFrames are obtained from ``fx.get_input_dfs`` (with
    ``binary=False``), the regression result is collected to pandas, and the
    comparison itself is delegated to ``fx.compare_to_regenie``.

    Args:
        spark: Spark session forwarded to ``fx.get_input_dfs``.
        output_prefix: Forwarded unchanged to ``fx.compare_to_regenie``.
        missing: Forwarded to ``fx.get_input_dfs`` as ``missing``. ``None``
            (the default) is treated as an empty list.
        single_chr: Forwarded to ``fx.get_input_dfs`` as ``single_chr``.
    """
    # Build a fresh list on every call so that no default list object is
    # shared between invocations (or mutated by a callee on our behalf).
    if missing is None:
        missing = []

    genotype_df, phenotype_df, covariate_df, offset_df = fx.get_input_dfs(
        spark, binary=False, missing=missing, single_chr=single_chr
    )
    glowgr_df = lr.linear_regression(
        genotype_df,
        phenotype_df,
        covariate_df,
        offset_df,
        values_column='values',
    ).toPandas()
    fx.compare_to_regenie(output_prefix, glowgr_df)
def test_values_expr(spark, rg):
    """Check that ``values_column`` may be a Column expression, not just a name.

    A one-row Spark DataFrame carrying an unrelated ``genotypes`` column is
    regressed using a literal array expression ``[0, 1, ..., n - 1]`` as the
    values. The output must match the statsmodels baseline computed on the
    same values, and the ``genotypes`` column must not appear in the result.
    """
    from pyspark.sql.functions import array, lit

    num_samples = 5
    spark_genotypes = spark.range(1).withColumn('genotypes', lit(42))

    # Draw phenotypes before covariates: both come from the same generator,
    # so the order of the draws determines the data.
    phenotypes = pd.DataFrame(rg.random((num_samples, 5)))
    covariates = pd.DataFrame(rg.random((num_samples, 2)))

    values_expr = array(*(lit(sample) for sample in range(num_samples)))
    results = lr.linear_regression(
        spark_genotypes,
        phenotypes,
        covariates,
        values_column=values_expr,
    )

    baseline_genotypes = pd.DataFrame({'values': range(num_samples)})
    baseline = statsmodels_baseline(baseline_genotypes, phenotypes, covariates)
    assert regression_results_equal(baseline, results.drop('id').toPandas())

    output_columns = [field.name for field in results.schema]
    assert 'genotypes' not in output_columns
def run_linear_regression_spark(spark,
                                genotype_df,
                                phenotype_df,
                                covariate_df=None,
                                extra_cols=None,
                                offset_df=None,
                                values_column='values',
                                **kwargs):
    """Run ``lr.linear_regression`` through Spark on a pandas genotype matrix.

    Each column of ``genotype_df`` becomes one row of the Spark input, stored
    as a list in the ``values_column`` column. A temporary ``idx`` column
    records the original row position so the collected output can be sorted
    deterministically; it is dropped before returning.

    Args:
        spark: Spark session used to create the input DataFrame.
        genotype_df: pandas DataFrame; it is transposed so that each of its
            columns becomes one list-valued row.
        phenotype_df: Forwarded to ``lr.linear_regression``.
        covariate_df: Forwarded to ``lr.linear_regression``. ``None`` (the
            default) means an empty DataFrame.
        extra_cols: Optional pandas DataFrame concatenated column-wise onto
            the genotype rows. ``None`` (the default) or an empty frame adds
            nothing.
        offset_df: Forwarded to ``lr.linear_regression``. ``None`` (the
            default) means an empty DataFrame.
        values_column: Name of the column holding the genotype lists; also
            forwarded as ``values_column``.
        **kwargs: Extra keyword arguments forwarded to
            ``lr.linear_regression``.

    Returns:
        pandas DataFrame of results sorted by ``['phenotype', 'idx']`` with
        the ``idx`` column removed.
    """
    # Create fresh empty frames per call rather than sharing default
    # DataFrame instances that were built once at definition time.
    if covariate_df is None:
        covariate_df = pd.DataFrame({})
    if extra_cols is None:
        extra_cols = pd.DataFrame({})
    if offset_df is None:
        offset_df = pd.DataFrame({})

    pdf = pd.DataFrame({values_column: genotype_df.to_numpy().T.tolist()})
    if not extra_cols.empty:
        pdf = pd.concat([pdf, extra_cols], axis=1)
    # Remember the input order; the collected Spark result is re-sorted by it.
    pdf['idx'] = pdf.index

    results = lr.linear_regression(
        spark.createDataFrame(pdf),
        phenotype_df,
        covariate_df,
        offset_df,
        values_column=values_column,
        **kwargs,
    ).toPandas()
    return results.sort_values(['phenotype', 'idx']).drop('idx', axis=1)