def add_glow(doctest_namespace, spark):
    # Register Glow on the shared session (new_session=False) and expose
    # common names to doctests; `functions` is pyspark.sql.functions.
    glow.register(spark, new_session=False)
    doctest_namespace['Row'] = Row
    doctest_namespace['spark'] = spark
    doctest_namespace['lit'] = functions.lit
    doctest_namespace['col'] = functions.col
    doctest_namespace['glow'] = glow
def add_spark(doctest_namespace, spark):
    # Same as add_glow, but lets glow.register use its default new_session behavior.
    glow.register(spark)
    doctest_namespace['Row'] = Row
    doctest_namespace['spark'] = spark
    doctest_namespace['lit'] = functions.lit
    doctest_namespace['col'] = functions.col
    doctest_namespace['glow'] = glow
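# A hedged sketch of how these fixtures are consumed (hypothetical module code,
# not from the source): with the names above injected through pytest's
# doctest_namespace, a collected doctest can use `spark` and `Row` directly
# without importing them.
def count_rows(df):
    """Counts the rows of a DataFrame.

    >>> count_rows(spark.createDataFrame([Row(id=1), Row(id=2)]))
    2
    """
    return df.count()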
def test_register(spark):
    glow.register(spark)
    df = spark.read.format("vcf") \
        .load("test-data/1kg_sample.vcf")
    stats = df.selectExpr("expand_struct(dp_summary_stats(genotypes))") \
        .select("min", "max") \
        .head()
    assert stats.asDict() == Row(min=1.0, max=23).asDict()
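# A hedged aside (not from the source): the same aggregation via Glow's Python
# function wrappers instead of SQL strings, assuming glow.expand_struct and
# glow.dp_summary_stats as exposed by the glow package.
def dp_stats(df):
    # Expand the depth summary struct (min/max/mean/stdDev) into top-level columns
    return df.select(glow.expand_struct(glow.dp_summary_stats('genotypes')))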
def test_register(spark):
    glow.register(spark)
    row_one = Row(Row(str_col='foo', int_col=1, bool_col=True))
    row_two = Row(Row(str_col='bar', int_col=2, bool_col=False))
    df = spark.createDataFrame([row_one, row_two], schema=['base_col'])
    added_col_row = df.selectExpr(
        "add_struct_fields(base_col, 'float_col', 3.14, "
        "'rev_str_col', reverse(base_col.str_col)) as added_col") \
        .filter("added_col.str_col = 'foo'") \
        .head()
    assert added_col_row.added_col.rev_str_col == 'oof'
# MAGIC #!/usr/bin/env bash
# MAGIC rm -r /opt/liftover
# MAGIC mkdir /opt/liftover
# MAGIC curl https://raw.githubusercontent.com/broadinstitute/gatk/master/scripts/funcotator/data_sources/gnomAD/b37ToHg38.over.chain --output /opt/liftover/b37ToHg38.over.chain
# MAGIC ```
# MAGIC In this demo, we perform coordinate and variant liftover from b37 to hg38.
# MAGIC
# MAGIC To perform variant liftover, you must download a reference file to each node of the cluster. Here, we use the FUSE mount to access the reference genome at
# MAGIC ```/dbfs/databricks-datasets/genomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa```

# COMMAND ----------

# DBTITLE 1,Import glow and define path variables
import glow
spark = glow.register(spark)

chain_file = '/opt/liftover/b37ToHg38.over.chain'
reference_file = '/dbfs/databricks-datasets/genomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa'
vcf_file = 'dbfs:/databricks-datasets/genomics/1kg-vcfs/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'

# COMMAND ----------

# DBTITLE 1,First, read in a VCF from a flat file or Delta Lake table.
input_df = (spark.read
            .format("vcf")
            .load(vcf_file)
            .limit(1)
            .cache())

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC Now apply the `lift_over_coordinates` UDF, with the parameters as follows:
# MAGIC - chromosome (`string`)
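# COMMAND ----------

# DBTITLE 1,A hedged liftover sketch (not part of the original notebook)
# One plausible way to apply the lift_over_coordinates expression to input_df.
# The contigName/start/end columns follow Glow's VCF schema; the 0.95 minimum
# match ratio is an illustrative assumption, and the exact signature may differ
# across Glow versions.
lifted_df = input_df.selectExpr(
    "*",
    f"lift_over_coordinates(contigName, start, end, '{chain_file}', 0.95) as lifted")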
import sys

import glow
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, DoubleType

# Command-line arguments
root = sys.argv[1]
freeze = sys.argv[2][1:-1] + "/"  # drop the enclosing characters and ensure a trailing slash
pheno = sys.argv[3]
covar = sys.argv[4]
split = sys.argv[5]
offsets = sys.argv[6]
jobname = sys.argv[7]
splitctg = sys.argv[8]
repart = sys.argv[9]

spark = SparkSession \
    .builder \
    .appName(jobname) \
    .getOrCreate()
glow.register(spark, False)
spark.udf.registerJavaFunction("chartodoublearray",
                               "org.gorpipe.spark.udfs.CharToDoubleArray",
                               ArrayType(DoubleType()))

rootfreeze = root + freeze

label_df = pd.read_csv(root + pheno, sep='\t', index_col=0)

covariate_df = None
if len(covar) > 0:
    covariates = pd.read_csv(root + covar, sep='\t', index_col=0)
    # Mean-impute missing values, then standardize each covariate
    covariate_df = covariates.fillna(covariates.mean())
    covariate_df = (covariate_df - covariate_df.mean()) / covariate_df.std()
covariate_df
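# A hedged usage sketch (illustration only, not from the original script): the
# registered Java UDF is callable from SQL expressions. The toy input below is
# hypothetical; the real encoding is whatever CharToDoubleArray expects, and the
# UDF jar must already be on the cluster classpath.
demo_df = spark.createDataFrame([("abc",)], ["gt_chars"])
dosage_df = demo_df.selectExpr("chartodoublearray(gt_chars) AS dosages")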
def register_glow(spark):
    glow.register(spark, new_session=False)
spark = (
    SparkSession.builder
    .appName('desmi_inject_gnomad')
    .config("spark.jars.packages", ",".join([
        "io.projectglow:glow-spark3_2.12:1.0.0",
    ]))
    .config("spark.local.dir", os.environ.get("TMP"))
    .config("spark.master", f"local[{N_CPU},{MAX_FAILURES}]")
    .config("spark.sql.shuffle.partitions", "2001")
    .config("spark.sql.execution.arrow.enabled", "true")
    .config("spark.driver.maxResultSize", "48G")
    .config("spark.task.maxFailures", MAX_FAILURES)
    .getOrCreate()
)
glow.register(spark)
spark
# -

INPUT_VCF = snakemake.input["vcf"]
INPUT_VCF

OUTPUT_PQ = snakemake.output["vep"]
OUTPUT_PQ

# +
FASTA = snakemake.input["fasta"]
GTF = snakemake.input["gtf"]
HUMAN_GENOME_VERSION = snakemake.params["human_genome_version"]
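# +
# A hedged sketch of one plausible next step (not from the original source):
# load the VCF with Glow's reader and persist it to the Parquet path above.
vcf_df = spark.read.format("vcf").load(INPUT_VCF)
vcf_df.write.mode("overwrite").parquet(OUTPUT_PQ)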
def spark_session():
    spark = (SparkSession.builder
             .config("spark.jars.packages", "io.projectglow:glow_2.11:0.5.0")
             .config("spark.sql.execution.arrow.pyspark.enabled", "true")
             .getOrCreate())
    glow.register(spark)
    return spark
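# A hedged usage sketch (assumes spark_session is exposed as a pytest fixture;
# the VCF path is hypothetical): a test consuming the Glow-enabled session.
def test_read_vcf(spark_session):
    df = spark_session.read.format("vcf").load("test-data/sample.vcf")
    assert "genotypes" in df.columns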
def test_new_session(spark):
    # Registering with new_session=False should return the same underlying JVM session
    sess = glow.register(spark, new_session=False)
    assert sess._jsparkSession.equals(spark._jsparkSession)

    # Registering with new_session=True should hand back a distinct session
    sess = glow.register(spark, new_session=True)
    assert not sess._jsparkSession.equals(spark._jsparkSession)