Example #1
0
def test_exclude_sample_ids(spark, tmp_path):
    """Dropping sample IDs must change the schema, while the adapter stays lossless."""
    vcf = 'test-data/NA12878_21_10002403.vcf'
    no_ids_df = functions.from_matrix_table(hl.import_vcf(vcf),
                                            include_sample_ids=False)
    with_ids_df = functions.from_matrix_table(hl.import_vcf(vcf))
    # The two schemas differ once sample IDs are excluded, so the comparison
    # helper is expected to fail.
    with pytest.raises(AssertionError):
        _compare_struct_types(no_ids_df.schema, with_ids_df.schema)
    # Round trip through the VCF reader with sample IDs disabled on read too.
    _assert_lossless_adapter(spark,
                             tmp_path,
                             no_ids_df,
                             vcf,
                             'vcf',
                             'vcf',
                             reader_options={'includeSampleIds': 'false'})
Example #2
0
def test_plink(spark):
    """Hail's PLINK import and the Glow PLINK reader agree after normalizing genotypes."""
    base = 'test-data/plink/five-samples-five-variants/bed-bim-fam/test'
    # Do not recode contigs (eg. 23 -> X)
    mt = hl.import_plink(bed=base + '.bed',
                         bim=base + '.bim',
                         fam=base + '.fam',
                         reference_genome=None,
                         contig_recoding={})
    hail_df = functions.from_matrix_table(mt)

    # Hail does not set the genotype if it is missing; the Glow PLINK reader sets the calls to (-1, -1)
    # Hail sets the genotype phased=False when reading from PLINK if the genotype is present;
    # the Glow PLINK reader does not as it is always false
    glow_df = (spark.read.format('plink')
               .option('mergeFidIid', 'false')
               .load(base + '.bed'))
    _compare_struct_types(hail_df.schema,
                          glow_df.schema,
                          ignore_fields=['phased'])

    # Rewrite Glow genotypes into Hail's representation so the row sets can be compared.
    matching_glow_df = glow_df.withColumn(
        'genotypes',
        fx.expr(
            "transform(genotypes, gt -> named_struct('sampleId', gt.sampleId, 'calls', ifnull(gt.calls, array(-1,-1)), 'phased', if(gt.calls = array(-1, -1), null, false)))"
        ))
    matching_hail_df = hail_df.select(*glow_df.schema.names)
    # An empty symmetric difference means the two DataFrames hold identical rows.
    assert matching_hail_df.subtract(matching_glow_df).count() == 0
    assert matching_glow_df.subtract(matching_hail_df).count() == 0
Example #3
0
def test_annotated_sites_only_vcf(spark, tmp_path):
    """Round-trip a VEP-annotated sites-only VCF, supplying the original header on write."""
    # The Hail DataFrame will not have the split CSQ/ANN fields, as it does not have
    # the VCF header metadata; we include the header when writing the round-trip VCF.
    vcf = 'test-data/vcf/vep.vcf'
    df = functions.from_matrix_table(hl.import_vcf(vcf))
    _assert_lossless_adapter(spark,
                             tmp_path,
                             df,
                             vcf,
                             'vcf',
                             'vcf',
                             writer_options={'vcfHeader': vcf})
Example #4
0
def test_unphased_bgen(spark, tmp_path):
    """Round-trip an unphased 8-bit BGEN through the Hail adapter and the bigbgen writer."""
    # Broadcast joins are disabled so the comparison exercises the general join path.
    spark.conf.set('spark.sql.autoBroadcastJoinThreshold', '-1')
    bgen = 'test-data/bgen/example.8bits.bgen'
    # Hail requires an index before import_bgen can be called.
    hl.index_bgen(bgen, reference_genome=None)
    mt = hl.import_bgen(bgen, entry_fields=['GP'])
    df = functions.from_matrix_table(mt)
    _assert_lossless_adapter(spark,
                             tmp_path,
                             df,
                             bgen,
                             'bgen',
                             'bigbgen',
                             writer_options={'bitsPerProbability': '8'})
Example #5
0
def test_gvcfs(spark, tmp_path):
    """Round-trip a combined GVCF imported via Hail's partitioned GVCF reader."""
    # GVCF MatrixTables are not keyed by locus and alleles, just by locus
    gvcf = 'test-data/tabix-test-vcf/combined.chr20_18210071_18210093.g.vcf.gz'
    # A single interval covering the region of interest on chr20.
    start = hl.Locus("chr20", 1, reference_genome='GRCh38')
    end = hl.Locus("chr20", 20000000, reference_genome='GRCh38')
    partitions = [hl.Interval(start, end, includes_end=True)]
    mts = hl.import_gvcfs([gvcf],
                          partitions,
                          force_bgz=True,
                          reference_genome='GRCh38')
    df = functions.from_matrix_table(mts[0])
    _assert_lossless_adapter(spark, tmp_path, df, gvcf, 'vcf', 'bigvcf')
# Import the VCF into a Hail MatrixTable and preview it.
# NOTE(review): vcf_path appears to be defined in an earlier notebook cell
# not visible here -- confirm.
vcf_mt = hl.import_vcf(vcf_path)
vcf_mt.show()

# COMMAND ----------

vcf_mt.count()

# COMMAND ----------

# MAGIC %md
# MAGIC ##### convert to spark dataframe with glow schema

# COMMAND ----------

# Convert the MatrixTable to a Spark DataFrame in the Glow schema,
# keeping the per-sample IDs in the genotypes column.
df = functions.from_matrix_table(vcf_mt, include_sample_ids=True)

# COMMAND ----------

df.printSchema()

# COMMAND ----------

# Persist the DataFrame in Delta format.
# NOTE(review): out_path presumably comes from an earlier cell -- confirm.
df.write.format("delta").save(out_path)

# COMMAND ----------

# MAGIC %md
# MAGIC ##### read back in, view and count dataframe

# COMMAND ----------
Example #7
0
def test_gvcf(spark, tmp_path):
    """Round-trip a single-sample GVCF through the Hail adapter and the VCF writer."""
    gvcf = 'test-data/NA12878_21_10002403.g.vcf'
    df = functions.from_matrix_table(hl.import_vcf(gvcf))
    _assert_lossless_adapter(spark, tmp_path, df, gvcf, 'vcf', 'vcf')
Example #8
0
def test_vcf(spark, tmp_path):
    """Round-trip a multi-sample VCF through the Hail adapter and the VCF writer."""
    vcf = 'test-data/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf'
    df = functions.from_matrix_table(hl.import_vcf(vcf))
    _assert_lossless_adapter(spark, tmp_path, df, vcf, 'vcf', 'vcf')
Example #9
0
def test_missing_alleles():
    """from_matrix_table rejects a MatrixTable whose row key lacks an alleles field."""
    mt = hl.import_vcf('test-data/1kg_sample.vcf')
    # Re-key by locus only, then drop alleles entirely.
    stripped = mt.key_rows_by('locus').drop('alleles')
    with pytest.raises(ValueError):
        functions.from_matrix_table(stripped)