def test_transform(spark):
    df = spark.read.format("vcf") \
        .load("test-data/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf")
    converted = glow.transform("pipe",
                               df,
                               input_formatter="vcf",
                               output_formatter="vcf",
                               cmd='["cat"]',
                               in_vcf_header="infer")
    assert converted.count() == 1075
def test_arg_map(spark):
    df = spark.read.format("vcf") \
        .load("test-data/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf")
    args = {
        "inputFormatter": "vcf",
        "outputFormatter": "vcf",
        "cmd": '["cat"]',
        "inVcfHeader": "infer"
    }
    converted = glow.transform("pipe", df, args)
    assert converted.count() == 1075
def _transform(
    self,
    input_df: pyspark.sql.DataFrame,
    contig: pyspark.sql.column.Column,
    start: pyspark.sql.column.Column,
    end: pyspark.sql.column.Column,
    ref: pyspark.sql.column.Column,
    alt: pyspark.sql.column.Column,
    id: pyspark.sql.column.Column,
):
    """
    Runs Ensembl VEP on a Spark DataFrame via the pipe transformer.

    The DataFrame needs to provide the following fields:
    - "contigName"
    - "start"
    - "end"
    - "referenceAllele"
    - "alternateAlleles"

    Args:
        input_df: Spark DataFrame with contigName, start, end, ref and alt
        contig: contig name column
        start: variant start position column
        end: variant end position column
        ref: reference allele column
        alt: array of alternate alleles column
        id: array of variant IDs column

    Returns:
        Spark DataFrame with a single column `text` that contains
        JSON-formatted VEP output as a string
    """
    import glow

    input_df = input_df.select([
        contig,
        start,
        end,
        id,
        ref,
        alt,
    ])

    vep_transformed_df = glow.transform(
        "pipe",
        input_df,
        cmd=json.dumps(self.call_args),
        inputFormatter='vcf',
        inVcfHeader='infer',
        outputFormatter='text',
    )
    return vep_transformed_df
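# A hedged usage sketch of consuming the `text` column returned above: each row
# holds one JSON document, so it can be queried with get_json_object. The
# `most_severe_consequence` and `input` field names are assumptions about VEP's
# JSON output, not something confirmed by this module.
from pyspark.sql import functions as f

parsed_df = vep_transformed_df.select(
    f.get_json_object('text', '$.most_severe_consequence').alias('most_severe_consequence'),
    f.get_json_object('text', '$.input').alias('input'))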
# DBTITLE 1,Filter rows for which liftover succeeded and see which rows changed
changed_with_lifted_df = input_with_lifted_df.filter(
    "lifted is not null").filter("start != lifted.start")
display(changed_with_lifted_df)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC Now apply the `lift_over_variants` transformer, with the following options.
# MAGIC - `chain_file`: `string`
# MAGIC - `reference_file`: `string`
# MAGIC - `min_match_ratio`: `double` (optional, defaults to `.95`; see the sketch at the end of this section)

# COMMAND ----------

output_df = glow.transform('lift_over_variants',
                           input_df,
                           chain_file=chain_file,
                           reference_file=reference_file)

# COMMAND ----------

# DBTITLE 1,View the rows for which liftover succeeded
lifted_df = output_df.filter('liftOverStatus.success = true').drop(
    'liftOverStatus')
display(
    lifted_df.select('contigName', 'start', 'end', 'referenceAllele',
                     'alternateAlleles', 'INFO_AC', 'INFO_SwappedAlleles',
                     'INFO_ReverseComplementedAlleles'))
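# COMMAND ----------

# MAGIC %md
# MAGIC As a sketch, `min_match_ratio` can also be passed explicitly when the default `.95` is too permissive; the `0.99` below is an arbitrary illustrative value.

# COMMAND ----------

strict_output_df = glow.transform('lift_over_variants',
                                  input_df,
                                  chain_file=chain_file,
                                  reference_file=reference_file,
                                  min_match_ratio=0.99)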
# Databricks notebook source
import pyspark.sql.functions as fx
from pyspark.sql.types import *
import glow
spark = glow.register(spark)
import json

# COMMAND ----------

# DBTITLE 1,Use the text input and output formatters
df = spark.createDataFrame([["foo"], ["bar"], ["baz"]], ["text"])
display(
    glow.transform('pipe',
                   df,
                   cmd=['rev'],
                   input_formatter='text',
                   output_formatter='text'))

# COMMAND ----------

# DBTITLE 1,Read 1kg chr22
df = spark.read.format("vcf").option(
    "flattenInfoFields",
    False).load("/databricks-datasets/genomics/1kg-vcfs/*.vcf.gz")
# Sample down to 1000 rows; spark.createDataFrame replaces the legacy sqlContext
df = spark.createDataFrame(df.take(1000), df.schema).cache()

# COMMAND ----------

# DBTITLE 1,Use grep to drop INFO lines from VCF header
transformed_df = glow.transform('pipe',
                                df,
                                cmd=['grep', '-v', '##INFO'],  # drop ##INFO header lines, per the cell title
                                input_formatter='vcf',
                                in_vcf_header='infer',
                                output_formatter='vcf')
lmm_udf = fx.pandas_udf(lmm, returnType=DoubleType())

# COMMAND ----------

# DBTITLE 1,Prepare the input DataFrame
"""
Read in 1000 Genomes phase 3 chr22 and split multiallelic sites to biallelic.
Add the phenotypes by cross joining with the genomic DataFrame.
The input to the LMM is the genotype represented as the number of alt alleles (0, 1, or 2).
In this example, we remove all sites where some samples are missing (as represented by -1).
"""
df = glow.transform( \
    "split_multiallelics", \
    spark.read.format("vcf").load("/databricks-datasets/genomics/1kg-vcfs/*chr22*.vcf.gz") \
  ) \
  .crossJoin(spark.read.format("parquet").load("/databricks-datasets/genomics/1000G/phenotypes.normalized/")) \
  .withColumn('genotype_states', fx.expr("genotype_states(genotypes)")) \
  .where(~fx.array_contains(fx.col('genotype_states'), -1))

# COMMAND ----------

# DBTITLE 1,Run the UDF and display results
by_pvalue = df.limit(1000).select("contigName", "start", "names",
                                  lmm_udf(df['genotype_states'], df['values']).alias("pValue"))\
    .na.drop(subset=["pValue"])\
    .orderBy("pValue", ascending=True)
display(by_pvalue)
sample_ids = glow.wgr.get_sample_ids(base_variant_df)

# COMMAND ----------

# MAGIC %md
# MAGIC To prepare the data for analysis, we perform the following transformations:
# MAGIC - Split multiallelic variants with the `split_multiallelics` transformer.
# MAGIC - Calculate the number of alternate alleles for biallelic variants with `genotype_states`.
# MAGIC - Replace any missing values with the mean of the non-missing values using `mean_substitute`.
# MAGIC - Filter out monomorphic sites, i.e. variants whose genotype states are identical across all samples.

# COMMAND ----------

variant_df = (glow.transform(
    'split_multiallelics', base_variant_df).withColumn(
        'values',
        glow.mean_substitute(glow.genotype_states('genotypes'))).filter(
            fx.size(fx.array_distinct('values')) > 1))

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC Create the beginning block genotype matrix and sample block ID mapping with `glow.wgr.block_variants_and_samples`.
# MAGIC
# MAGIC Write the block matrix to Delta and the sample blocks to a JSON file so that we can reuse them for multiple phenotype batches (a sketch of the writes follows below).

# COMMAND ----------

block_df, sample_blocks = glow.wgr.block_variants_and_samples(
    variant_df, sample_ids, variants_per_block, sample_block_count)
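# COMMAND ----------

# MAGIC %md
# MAGIC A minimal sketch of the writes described above; `block_matrix_path` and `sample_blocks_path` are hypothetical output locations.

# COMMAND ----------

import json

# persist the block genotype matrix to Delta and the sample block mapping to JSON
block_df.write.format('delta').mode('overwrite').save(block_matrix_path)  # hypothetical path
with open(sample_blocks_path, 'w') as out:  # hypothetical path
    json.dump(sample_blocks, out)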
# Databricks notebook source
# DBTITLE 1,Define path variables
import glow
spark = glow.register(spark)

vcf_path = '/databricks-datasets/genomics/variant-splitting/01_IN_altered_multiallelic.vcf'

# COMMAND ----------

# DBTITLE 1,Load a VCF into a DataFrame
original_variants_df = (spark.read.format("vcf").option(
    "includeSampleIds", False).option("flattenInfoFields", True).load(vcf_path))

# COMMAND ----------

# DBTITLE 1,Display
display(original_variants_df)

# COMMAND ----------

# DBTITLE 1,Split multi-allelic variants
spark.conf.set(
    "spark.sql.codegen.wholeStage", False
)  # turn off Spark SQL whole-stage code generation; the transformer runs faster without it

split_variants_df = glow.transform("split_multiallelics", original_variants_df)
display(split_variants_df)
# COMMAND ----------

vcf.printSchema()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### Split multiallelic variants

# COMMAND ----------

spark.conf.set(
    "spark.sql.codegen.wholeStage", False
)  # turn off Spark SQL whole-stage code generation; the transformer runs faster without it

split_vcf = glow.transform("split_multiallelics", vcf)

# COMMAND ----------

split_vcf.show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### Normalize variants
# MAGIC
# MAGIC This is an important quality control / sanity check when ingesting VCFs,
# MAGIC and it is always necessary after multiallelic variants are split to biallelics.
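# COMMAND ----------

# MAGIC %md
# MAGIC A minimal sketch of the normalization step described above; `ref_genome_path` is an assumed path to the reference genome FASTA.

# COMMAND ----------

normalized_vcf = glow.transform('normalize_variants',
                                split_vcf,
                                reference_genome_path=ref_genome_path)
normalized_vcf.show()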
# --plugin dbscSNV
# --custom ${GNOMAD_VCF},Gnomad_2.1.1,vcf,overlap
# --plugin AncestralAllele
print(vep_cmd)

# -

df = (
    spark
    .read
    .option("flattenInfoFields", False)
    .format('vcf')
    .load(INPUT_VCF)
)
df = glow.transform("split_multiallelics", df)
df.printSchema()

# Build a `names` column of the form contig:pos:ref>alt. VCF positions are
# 1-based while Glow's `start` is 0-based, hence the +1.
df = df.withColumn("names", f.array([f.concat(
    f.col('contigName'),
    f.lit(":"),
    f.col('start') + 1,
    f.lit(":"),
    f.col('referenceAllele'),
    f.lit(">"),
    f.col('alternateAlleles')[0]
)]))
df.limit(10).toPandas()
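# -

# A hedged sketch of piping the prepared DataFrame through VEP with Glow's pipe
# transformer, mirroring the VCF-in / text-out configuration used elsewhere in
# this section; it assumes `vep_cmd` (printed above) is the VEP command as a
# list of strings.
import json

annotated_df = glow.transform(
    "pipe",
    df,
    cmd=json.dumps(vep_cmd),
    input_formatter="vcf",
    in_vcf_header="infer",
    output_formatter="text",
)
annotated_df.limit(10).toPandas()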
def test_no_transform(spark):
    df = spark.read.format("vcf") \
        .load("test-data/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf")
    with pytest.raises(IllegalArgumentException):
        glow.transform("dne", df)
original_variants_df = (spark.read
    .format("vcf")
    .option("includeSampleIds", False)
    .load(vcf_path))

# COMMAND ----------

# DBTITLE 1,Display
display(original_variants_df)

# COMMAND ----------

# DBTITLE 1,Normalize variants using normalize_variants transformer with column replacement
normalized_variants_df = glow.transform(
    "normalize_variants",
    original_variants_df,
    reference_genome_path=ref_genome_path
)
display(normalized_variants_df)

# COMMAND ----------

# DBTITLE 1,Normalize variants using normalize_variants transformer without column replacement
normalized_variants_df = glow.transform(
    "normalize_variants",
    original_variants_df,
    reference_genome_path=ref_genome_path,
    replace_columns="False"
)
# MAGIC write genotype data into Delta Lake, a high-performance big data store with ACID semantics.
# MAGIC Delta Lake organizes, indexes and compresses data, allowing for performant and reliable computation on genomics data as it grows over time (a write sketch follows at the end of this section).

# COMMAND ----------

vcf_view_unsplit = (spark.read.format("vcf")
    .option("flattenInfoFields", "false")
    .load(vcf_path))

# COMMAND ----------

# MAGIC %md Split multiallelic variants to biallelics

# COMMAND ----------

vcf_view = glow.transform("split_multiallelics", vcf_view_unsplit)

# COMMAND ----------

display(vcf_view.withColumn("genotypes", fx.col("genotypes")[0]))

# COMMAND ----------

# MAGIC %md
# MAGIC ##### Note: we compute variant-wise summary stats and Hardy-Weinberg equilibrium P values using `call_summary_stats` & `hardy_weinberg`, which are built into Glow

# COMMAND ----------

(vcf_view
 .select(
     fx.expr("*"),
     fx.expr("expand_struct(call_summary_stats(genotypes))"),  # per-variant summary stats
     fx.expr("expand_struct(hardy_weinberg(genotypes))")))  # HWE P values
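# COMMAND ----------

# MAGIC %md
# MAGIC A minimal sketch of the Delta Lake write described at the top of this notebook; `delta_path` is a hypothetical output location.

# COMMAND ----------

(vcf_view
 .write
 .format("delta")
 .mode("overwrite")
 .save(delta_path))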