def _convert_to_py(self, annotation): if annotation: return Interval._from_java(annotation) else: return annotation
#   * In tier 1 regions.
#
# Definition of probably not somatic:
#   DP > 10 AND
#   (
#     GT != het OR
#     (
#       binomTest(ad, dp, 0.5, "two.sided") >= alpha
#     )
#   )
#
# With alpha = 0.1 we expect a little over 10% of het sites will be falsely
# lost.  For alpha = 0.1, the DP > 10 restriction corresponds to effective
# bounds on VAF of [0.2, 0.8].

# One PLINK export per chromosome, 1 through 22.
for chrom in range(1, 23):
    print('Chromosome %d...' % chrom)
    (vds
     .filter_intervals(Interval.parse('%d' % chrom))
     # Keep only autosomal variants flagged by the tier 1 BED annotation.
     .annotate_variants_table(tier1_bed, root='va.tier1bed')
     .filter_variants_expr('va.tier1bed == true && v.isAutosomal()', keep=True)
     # Remove samples whose ID matches the "^Z" pattern.
     .filter_samples_expr('"^Z" ~ s', keep=False)
     .split_multi()
     # Keep genotypes meeting the "probably not somatic" definition above
     # (alpha = 0.1).
     .filter_genotypes(
         ''' g.dp > 10 && ( (!g.isHet()) || binomTest(g.ad[1], g.dp, 0.5, "two.sided") >= 0.1 )''',
         keep=True)
     .min_rep()
     .export_plink('tmp/08b_genotypes_%d' % chrom))
#!../../software/pyhail.sh
import hail
from hail.representation import Interval
from hail.expr import TString, TBoolean, TFloat, TInt

hc = hail.HailContext(log='log/99_dreamlab2.log', tmp_dir='tmp/hail')

vds = hc.read('../MGRB.phase2.tier12.match.vqsr.minrep.vds')

# Chr22 only, keeping variants whose filter set is empty.
vds = vds.filter_intervals(Interval.parse('22'), keep=True)
vds = vds.filter_variants_expr('va.filters.isEmpty()', keep=True)

# Rough variant quality filters, applied to the split variants once the
# per-variant QC annotations (va.qc.*) have been computed.
vds = vds.split_multi().variant_qc()
vds = vds.filter_variants_expr(
    ''' v.altAllele.isSNP && va.qc.callRate >= 0.99 && va.qc.dpMean >= 20 && va.qc.dpMean <= 60 && va.qc.dpStDev < 8 && va.filters.isEmpty() && va.qc.AF >= 0.05 && va.qc.AF <= 0.95''')

# Drop samples with poor metrics on these filtered variants.
vds = vds.sample_qc()
vds = vds.filter_samples_expr('sa.qc.callRate >= 0.985')
#!/usr/bin/env python
import argparse as ap
import hail
from hail.representation import Interval

# Command-line interface: every option is required.  Options are registered
# in the order listed here.
p = ap.ArgumentParser()
for flag, description in (
        ("--exomes-vds", "Exomes dataset to be loaded, already split"),
        ("--genomes-vds", "Genomes dataset to be loaded, already split"),
        ("--exomes-vds-out", "Exomes file to be written"),
        ("--genomes-vds-out", "Genomes file to be written"),
        ("--interval", "Interval to subset"),
):
    p.add_argument(flag, help=description, required=True)
args = p.parse_args()

hc = hail.HailContext(log="/hail.log")

# Subset each dataset to the requested interval and write it out, replacing
# any existing output: exomes first, then genomes.
for vds_in, vds_out in (
        (args.exomes_vds, args.exomes_vds_out),
        (args.genomes_vds, args.genomes_vds_out),
):
    subset = hc.read(vds_in).filter_intervals(Interval.parse(args.interval))
    subset.write(vds_out, overwrite=True)
#!./bin/pyhail.sh import pyspark import hail from hail.representation import Interval from hail import KeyTable import os.path hc = hail.HailContext(log='log/07_plink_export_45andup_subpops.log', tmp_dir='tmp/hail') interval_list = [Interval.parse('%d' % chrom) for chrom in range(1, 22)] tier1_bed = KeyTable.import_bed( '../../locus-annotations/source_data/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel.bed' ) (hc.read( '../MGRB.phase2.SNPtier12.match.vqsr.minrep.locusannot.WGStier12.unrelated.vds' ).filter_intervals(interval_list, keep=True).annotate_variants_table( tier1_bed, root='va.tier1bed').filter_variants_expr( 'va.tier1bed == true', keep=True).annotate_variants_expr('va = {}'). filter_samples_expr('"^B" ~ s').split_multi().min_rep().write( 'tmp/MGRB.phase2.SNPtier12.match.vqsr.minrep.locusannot.WGStier12.unrelated.45andUp.GiaB_HCR.noannot.split.minrep.vds' )) sample_lists = [ '../45andup_followup_qcpass_anycancer_mf.sample_list', '../45andup_followup_qcpass_breastcancer_f.sample_list', '../45andup_followup_qcpass_breastcancer_mf.sample_list', '../45andup_followup_qcpass_colorectalcancer_mf.sample_list', '../45andup_followup_qcpass_melanomacancer_mf.sample_list', '../45andup_followup_qcpass_nocancer_f.sample_list',