Example #1
0
 def test_summarize_variants_ti_tv(self):
     """Check the summary that ``hl.summarize_variants`` reports for ``sample.vcf``."""
     dataset = hl.import_vcf(resource('sample.vcf'))

     # Smoke test of the printing control flow: the handler discards whatever
     # it is given, so this call only has to complete without raising.
     hl.summarize_variants(dataset, handler=lambda s: ())

     # With show=False the summary is returned and can be inspected directly.
     summary = hl.summarize_variants(dataset, show=False)

     expected_allele_types = {'Deletion': 27, 'Insertion': 18, 'SNP': 301}
     expected_contigs = {'20': 346}
     expected_allele_counts = {2: 346}

     assert summary['allele_types'] == expected_allele_types
     assert summary['contigs'] == expected_contigs
     assert summary['n_variants'] == 346
     assert summary['r_ti_tv'] == 2.5
     assert summary['allele_counts'] == expected_allele_counts
Example #2
0
 def test_summarize_variants(self):
     """Summarize three hand-built variants and verify each reported tally.

     A 3x3 range matrix table gets ``locus`` and ``alleles`` row fields from
     a literal keyed by ``row_idx``, is re-keyed by those fields, and is then
     summarized with ``show=False`` so the result can be asserted on.
     """
     mt = hl.utils.range_matrix_table(3, 3)

     # One entry per row index; each struct supplies that row's locus/alleles.
     row_fields_by_idx = hl.literal({
         0: hl.Struct(locus=hl.Locus('1', 1), alleles=['A', 'T', 'C']),
         1: hl.Struct(locus=hl.Locus('2', 1), alleles=['A', 'AT', '@']),
         2: hl.Struct(locus=hl.Locus('2', 1), alleles=['AC', 'GT']),
     })
     row_fields = row_fields_by_idx[mt.row_idx]
     mt = mt.annotate_rows(**row_fields)
     mt = mt.key_rows_by('locus', 'alleles')

     summary = hl.summarize_variants(mt, show=False)

     expected_contigs = {'1': 1, '2': 2}
     expected_allele_types = {'SNP': 2, 'MNP': 1, 'Unknown': 1, 'Insertion': 1}
     # Two rows carry three alleles and one row carries two.
     expected_allele_counts = {2: 1, 3: 2}

     self.assertEqual(summary.n_variants, 3)
     self.assertEqual(summary.contigs, expected_contigs)
     self.assertEqual(summary.allele_types, expected_allele_types)
     self.assertEqual(summary.allele_counts, expected_allele_counts)
Example #3
0
 def test_summarize_variants(self):
     """Verify ``hl.summarize_variants`` on a three-row synthetic dataset.

     The rows of a 3x3 range matrix table are annotated with a ``locus`` and
     an ``alleles`` list looked up by ``row_idx``; the table is then keyed by
     those two fields and summarized with ``show=False``.
     """
     mt = hl.utils.range_matrix_table(3, 3)

     # Row payloads, in row_idx order. The expected tallies asserted below
     # are consistent with: row 0 -> two SNP alts, row 1 -> one insertion and
     # one unrecognised alt ('@'), row 2 -> one MNP.
     row_zero = hl.Struct(locus=hl.Locus('1', 1), alleles=['A', 'T', 'C'])
     row_one = hl.Struct(locus=hl.Locus('2', 1), alleles=['A', 'AT', '@'])
     row_two = hl.Struct(locus=hl.Locus('2', 1), alleles=['AC', 'GT'])
     lookup = hl.literal({0: row_zero, 1: row_one, 2: row_two})

     annotated = mt.annotate_rows(**lookup[mt.row_idx])
     mt = annotated.key_rows_by('locus', 'alleles')

     result = hl.summarize_variants(mt, show=False)

     self.assertEqual(result.n_variants, 3)
     self.assertEqual(result.contigs, {'1': 1, '2': 2})
     self.assertEqual(
         result.allele_types,
         {'SNP': 2, 'MNP': 1, 'Unknown': 1, 'Insertion': 1},
     )
     self.assertEqual(result.allele_counts, {2: 1, 3: 2})
def summarize_variants(t: Union[hl.MatrixTable, hl.Table]) -> hl.Struct:
    """
    Get summary of variants in a MatrixTable or Table.

    Log the number of samples (MatrixTable input only), the number of variants
    and their distribution across contigs, and emit a warning for every contig
    whose reported variant count is zero.

    :param t: Input MatrixTable or Table to be checked.
    :return: Struct of variant summary
    """
    if isinstance(t, hl.MatrixTable):
        logger.info("Dataset has %d samples.", t.count_cols())

    # show=False: the summary is returned for inspection instead of displayed.
    var_summary = hl.summarize_variants(t, show=False)
    logger.info(
        "Dataset has %d variants distributed across the following contigs: %s",
        var_summary.n_variants,
        var_summary.contigs,
    )

    for contig, n_variants in var_summary.contigs.items():
        if n_variants == 0:
            # Name the offending contig; logging the whole contig -> count
            # mapping here would not tell the reader which contig is empty.
            logger.warning("%s has no variants called", contig)

    return var_summary
Example #5
0
                                  (6 * stats_het_hom_var.stdev)))
# Lower bound on the het/hom-var ratio: mean - 6 standard deviations
het_hom_var_floor = stats_het_hom_var.mean - (6 * stats_het_hom_var.stdev)
mt = mt.filter_cols(mt.sample_qc.r_het_hom_var > het_hom_var_floor)

# Number of heterozygous calls: bounded by mean +/- 6 standard deviations
n_het_ceiling = stats_het.mean + (6 * stats_het.stdev)
mt = mt.filter_cols(mt.sample_qc.n_het < n_het_ceiling)
n_het_floor = stats_het.mean - (6 * stats_het.stdev)
mt = mt.filter_cols(mt.sample_qc.n_het > n_het_floor)

######## 3.4 Remove non-autosomes(X, Y and MT DNA)
on_autosome = mt.locus.in_autosome()
mt = mt.filter_rows(on_autosome)

######## 4. BASELINE CHARACTERISTICS QC-FILTERED DATA
# Summary on number of SNPs, indels and variants per chromosomes
hl.summarize_variants(mt)

# Partition data into cases (mt_case) and controls (mt_ctrl)
is_case = mt.Affection == 'Case'
mt_case = mt.filter_cols(is_case)
is_control = mt.Affection == 'Control'
mt_ctrl = mt.filter_cols(is_control)

# Calculate subject statistics (each aggregate is computed right before it is
# printed, so the order of work and output matches a line-by-line script)
case_age_stats = mt_case.aggregate_cols(hl.agg.stats(mt_case.Age))
print('Age of cases =', case_age_stats)
ctrl_age_stats = mt_ctrl.aggregate_cols(hl.agg.stats(mt_ctrl.Age))
print('Age of controls =', ctrl_age_stats)

case_race_counts = mt_case.aggregate_cols(hl.agg.counter(mt_case.Race))
print('#Individuals of Cases:', case_race_counts)
ctrl_race_counts = mt_ctrl.aggregate_cols(hl.agg.counter(mt_ctrl.Race))
print('#Individuals of Controls:', ctrl_race_counts)

print('Gender  Cases:',
Example #6
0
def populate_clinvar():
    """Build a per-variant ClinVar table from ``clinvar.vcf.gz`` and write it.

    Steps, in order:

    1. Parse the release date from the VCF and import the VCF without samples;
       the release date is stored as the ``version`` global.
    2. Run VEP with the ``vep85-loftee-ruddle-b38.json`` configuration.
    3. Annotate rows with the sorted transcript consequences, the worst
       transcript consequence (``main_transcript``) and the gene id set.
    4. Select the output row fields (the ``main_transcript`` struct is
       flattened into ``main_transcript_<field>`` columns).
    5. Print a variant summary, then write the rows, ordered by
       ``variant_id`` and without ``locus``/``alleles``, to ``clinvar.ht``.
    """

    def sorted_unique_joined(values):
        # Deduplicate via a set, sort, then delimit into a single string.
        # NOTE(review): the sort key rewrites "^_" to "z"; presumably this is
        # a regex that pushes values with a leading underscore towards the
        # end of the ordering - confirm against the Hail `replace` semantics.
        unique_values = hl.array(hl.set(values))
        ordered = hl.sorted(unique_values, key=lambda s: s.replace("^_", "z"))
        return hl.delimit(ordered)

    vcf_path = 'clinvar.vcf.gz'
    release_date = _parse_clinvar_release_date(vcf_path)
    mt = import_vcf(
        vcf_path,
        "38",
        drop_samples=True,
        min_partitions=2000,
        skip_invalid_loci=True,
    )
    mt = mt.annotate_globals(version=release_date)

    print("\n=== Running VEP ===")
    mt = hl.vep(mt, 'vep85-loftee-ruddle-b38.json', name="vep")

    print("\n=== Processing ===")
    # Each annotation below reads a field added by the previous step, so the
    # three annotate_rows calls have to stay separate and in this order.
    sorted_consequences_expr = (
        get_expr_for_vep_sorted_transcript_consequences_array(vep_root=mt.vep))
    mt = mt.annotate_rows(sortedTranscriptConsequences=sorted_consequences_expr)

    worst_consequence_expr = (
        get_expr_for_worst_transcript_consequence_annotations_struct(
            vep_sorted_transcript_consequences_root=(
                mt.sortedTranscriptConsequences)))
    mt = mt.annotate_rows(main_transcript=worst_consequence_expr)

    gene_ids_expr = get_expr_for_vep_gene_ids_set(
        vep_transcript_consequences_root=mt.sortedTranscriptConsequences)
    mt = mt.annotate_rows(gene_ids=gene_ids_expr)

    # Expressions derived from the fully annotated table; they are only used
    # in the select_rows call on this same `mt`.
    review_status_str = sorted_unique_joined(mt.info.CLNREVSTAT)
    sorted_consequences = mt.sortedTranscriptConsequences
    main_transcript_columns = {
        f"main_transcript_{field}": mt.main_transcript[field]
        for field in mt.main_transcript.dtype.fields
    }

    # Keyword order is kept exactly as-is, including the position of the
    # flattened main_transcript columns.
    mt = mt.select_rows(
        allele_id=mt.info.ALLELEID,
        alt=get_expr_for_alt_allele(mt),
        chrom=get_expr_for_contig(mt.locus),
        clinical_significance=sorted_unique_joined(mt.info.CLNSIG),
        domains=get_expr_for_vep_protein_domains_set(
            vep_transcript_consequences_root=mt.vep.transcript_consequences),
        gene_ids=mt.gene_ids,
        gene_id_to_consequence_json=get_expr_for_vep_gene_id_to_consequence_map(
            vep_sorted_transcript_consequences_root=sorted_consequences,
            gene_ids=mt.gene_ids),
        gold_stars=CLINVAR_GOLD_STARS_LOOKUP[review_status_str],
        **main_transcript_columns,
        pos=get_expr_for_start_pos(mt),
        ref=get_expr_for_ref_allele(mt),
        review_status=review_status_str,
        transcript_consequence_terms=get_expr_for_vep_consequence_terms_set(
            vep_transcript_consequences_root=sorted_consequences),
        transcript_ids=get_expr_for_vep_transcript_ids_set(
            vep_transcript_consequences_root=sorted_consequences),
        transcript_id_to_consequence_json=(
            get_expr_for_vep_transcript_id_to_consequence_map(
                vep_transcript_consequences_root=sorted_consequences)),
        variant_id=get_expr_for_variant_id(mt),
        xpos=get_expr_for_xpos(mt.locus),
    )

    print("\n=== Summary ===")
    hl.summarize_variants(mt)

    # Drop key columns for export
    variant_rows = mt.rows()
    export_rows = variant_rows.order_by(variant_rows.variant_id)
    export_rows = export_rows.drop("locus", "alleles")
    export_rows.write('clinvar.ht', overwrite=True)
    '''
Example #7
0
    maf
    hwe
    relatedness
'''

# Getting total counts for samples/variants
printCount(mt_auto)

# Getting variant counts per site
siteVarCount(mt_auto)

# Getting sample counts per site
siteSampleCount(mt_auto)

# Variant summary before filtering (this is where the indel counts come from)
hl.summarize_variants(mt_auto)

# SNP call rate 1st pass filtering: with keep=False, rows whose var_cr_flag
# contains True are removed. `contains` already yields a boolean expression,
# so it is used as the predicate directly rather than compared with True.
mt_qc = mt_auto.filter_rows(mt_auto.var_cr_flag.contains(True), keep=False)

# Printing out counts post filter
printFilterCounts('SNP call rate', mt_qc, mt_auto.count_rows(), 'variants')

# Variant summary after filtering (indel counts post filter)
hl.summarize_variants(mt_qc)

# Getting counts per site for variants and samples post filter
siteVarCount(mt_qc)
siteSampleCount(mt_qc)