#!./bin/pyhail.sh
import hail
from hail.expr import TString, TBoolean, TFloat, TInt

# Hail logs to log/load.log and uses tmp/hail as its temporary directory.
hc = hail.HailContext(log='log/load.log', tmp_dir='tmp/hail')

# Import every VCF matching the glob as a single dataset.
vds = hc.import_vcf(
    '../../GATK3_fastq2gvcf-hs37d5x-1.0_split-combine_genotype_vqsr/*.vcf.bgz')

# Column types handed to import_table for the phenotype metadata CSV.
sample_column_types = {
    'sampleID': TString(),
    'cohort': TString(),
    'YOB': TInt(),
    'SBPMean': TInt(),
    'HtMtrs': TFloat(),
    'WtKgs': TFloat(),
    'AbdoCircCms': TInt(),
    'GlcmmolL': TFloat(),
    'AMD': TBoolean(),
    'treatedForHighBP': TBoolean(),
    'treatedForHighChol': TBoolean(),
    'isFemale': TBoolean()
}
sample_table = hc.import_table(
    '../../metadata/MGRB_phase2_metadata.csv',
    delimiter=',',
    types=sample_column_types)
# Key the table by sample ID before attaching it to the dataset's samples.
sample_table = sample_table.key_by('sampleID')

# Attach each sample's metadata row under the sa.pheno annotation root.
vds = vds.annotate_samples_table(sample_table, root='sa.pheno')

# Note the use of min_rep only here: vt norm is not performed (as it was for phase 1).
# In phase 1 we observed that vt norm almost never changed GATK's representation (rate
# < 1e-5), so this extra step was considered not worthwhile.  As it is, in the presence
"gs://seqr-reference-data/GRCh38/gnomad/coverage/gnomad.chr4.cov.liftover.GRCh38.txt.gz", "gs://seqr-reference-data/GRCh38/gnomad/coverage/gnomad.chr5.cov.liftover.GRCh38.txt.gz", "gs://seqr-reference-data/GRCh38/gnomad/coverage/gnomad.chr6.cov.liftover.GRCh38.txt.gz", "gs://seqr-reference-data/GRCh38/gnomad/coverage/gnomad.chr7.cov.liftover.GRCh38.txt.gz", "gs://seqr-reference-data/GRCh38/gnomad/coverage/gnomad.chr8.cov.liftover.GRCh38.txt.gz", "gs://seqr-reference-data/GRCh38/gnomad/coverage/gnomad.chr9.cov.liftover.GRCh38.txt.gz", "gs://seqr-reference-data/GRCh38/gnomad/coverage/gnomad.chrX.cov.liftover.GRCh38.txt.gz", ], "output_path": "gs://%(output_bucket)s/GRCh38/gnomad/genomes.coverage.vds" % args.__dict__, }, } field_types = { '#chrom': TString(), 'pos': TInt(), 'mean': TDouble(), 'median': TDouble(), '1': TDouble(), '5': TDouble(), '10': TDouble(), '15': TDouble(), '20': TDouble(), '25': TDouble(), '30': TDouble(), '50': TDouble(), '100': TDouble(), } for label, data_paths in COVERAGE_TSV_PATHS.items():
type=int) args = p.parse_args() hc = hail.HailContext(log="/tmp/hail.log") gene_results_url = "gs://epi-browser/2018-11-07_epi25-exome-browser-gene-results-table-reduced.csv" kt = hc.import_table(gene_results_url, delimiter=",", missing="NA", quote='"', types={ 'gene_name': TString(), 'description': TString(), 'gene_id': TString(), 'xcase_lof': TInt(), 'xctrl_lof': TInt(), 'pval_lof': TDouble(), 'xcase_mpc': TInt(), 'xctrl_mpc': TInt(), 'pval_mpc': TDouble(), 'xcase_infrIndel': TInt(), 'xctrl_infrIndel': TInt(), 'pval_infrIndel': TDouble(), 'pval_meta': TDouble(), 'analysis_group': TString(), }) es = ElasticsearchClient(args.host, args.port) es.export_kt_to_elasticsearch(
--- gnomAD_genomes_NFE_AN: String, --- gnomAD_genomes_NFE_AF: String, --- gnomAD_genomes_OTH_AC: String, --- gnomAD_genomes_OTH_AN: String, --- gnomAD_genomes_OTH_AF: String, --- clinvar_rs: String, --- clinvar_clnsig: String, --- clinvar_trait: String, --- clinvar_golden_stars: String, Interpro_domain: String, GTEx_V6p_gene: String, GTEx_V6p_tissue: String """ DBNSFP_FIELDS["2.9.3"]["field_types"] = { 'pos(1-coor)': TInt(), 'TWINSUK_AC': TInt(), 'TWINSUK_AF': TFloat(), 'ESP6500_AA_AF': TFloat(), 'ESP6500_EA_AF': TFloat(), } DBNSFP_FIELDS["3.5"]["field_types"] = { 'pos(1-based)': TInt(), 'TWINSUK_AC': TInt(), 'TWINSUK_AF': TFloat(), 'ALSPAC_AC': TInt(), 'ALSPAC_AF': TFloat(), 'ESP6500_AA_AC': TInt(), 'ESP6500_AA_AF': TFloat(), 'ESP6500_EA_AC': TInt(),
#! python
import hail
from hail.expr import TString, TBoolean, TFloat, TInt

# Hail logs to log/1.load.log and uses tmp/hail as its temporary directory.
hc = hail.HailContext(log='log/1.load.log', tmp_dir='tmp/hail')

vds = hc.import_vcf('../cohort.vcf')

# Column types handed to import_table for the sample metadata CSV.
metadata_types = {
    'sampleID': TString(),
    'cohort': TString(),
    'YOB': TInt(),
    'isFemale': TBoolean()
}
sample_table = hc.import_table(
    '../metadata/cohort_metadata.csv',
    delimiter=',',
    types=metadata_types)
# Key the table by sample ID before attaching it to the dataset's samples.
sample_table = sample_table.key_by('sampleID')

# Attach each sample's metadata row under the sa.pheno annotation root.
vds = vds.annotate_samples_table(sample_table, root='sa.pheno')

# Note the use of min_rep only here: vt norm is not performed (as it was for phase 1).
# In phase 1 we observed that vt norm almost never changed GATK's representation (rate
# < 1e-5), so this extra step was considered not worthwhile.  As it is, in the presence
# of complex and multi-allelic variants, simple variant normalization is not especially
# useful.
minrep_vds = vds.min_rep()
minrep_vds.repartition(500).write('../cohort.minrep.vds')
--- last_evaluated: String, all_submitters: String, --- submitters_ordered: String, all_traits: String, all_pmids: String, inheritance_modes: String, age_of_onset: String, prevalence: String, disease_mechanism: String, origin: String, xrefs: String, --- dates_ordered: String, """ TYPE_MAP = { 'Int': TInt(), 'String': TString(), 'Float': TFloat(), } for genome_version in ["37", "38"]: input_paths = CLINVAR_FILES[genome_version]["input_paths"] output_path = CLINVAR_FILES[genome_version]["output_path"] print("Reading in %s" % ", ".join(input_paths)) fields_to_keep = _parse_field_names_and_types(CLINVAR_SCHEMA, to_keep=True) fields_to_drop = _parse_field_names_and_types(CLINVAR_SCHEMA, to_keep=False) field_types = {
'../variants.distance.good.GiaB_HG001_highConf.txt', 'va.locus.goodGiaBHighConf') ] vds = vds.annotate_variants_expr('va.locus = {}') for inpath, outpath, root in regions: cmd = './bin/bedtools closest -d -g ../genome.txt -a ../variants.sorted.bed.gz -b {inpath} > {outpath}'.format( inpath=inpath, outpath=outpath) print(cmd) subprocess.call(cmd, shell=True) kt = hc.import_table(outpath, no_header=True, types={ 'f3': TVariant(), 'f7': TInt() }).annotate('v = f3, dist = f7').key_by('v') vds = vds.annotate_variants_table( kt, expr='{}_dist = table.dist'.format(root)).annotate_variants_expr( '{} = {}_dist == 0'.format(root, root)) # os.remove(outpath) vds = (vds.annotate_variants_expr('va.locus.badPAR = v.inXPar() || v.inYPar()' ).annotate_variants_expr('''va.locus.tier = if (!("^([0-9]+|X|Y)$" ~ v.contig)) 3 else (if (va.locus.badCoverage_dist <= 5 || va.locus.badComplexity_dist <= 5 || va.locus.badMappability_dist <= 5 || va.locus.badEncodeExcluded_dist <= 5 || va.locus.badPAR) 3 else (if (va.locus.goodGiaBHighConf_dist > 0) 2 else (if (v.contig ~ "^(X|Y)$") 2 else 1)))''')) vds.write('../ccs.cosmic_cgc.minrep.locusannot.vds')
# Build the genome file passed to bedtools via -g from columns 1 and 3 of the
# genome BED (presumably contig name and contig length -- confirm against the
# BED).  check_call aborts the script on a non-zero exit status instead of
# carrying on with a missing or partial file.  For this pipeline only the exit
# status of the final command (cut) is seen by the shell.
subprocess.check_call(
    'gzip -dc ../../locus-annotations/hs37d5x_data/genome.bed.gz | cut -f 1,3 > ../genome.txt',
    shell=True)

# (region BED, bedtools output path, variant annotation root) per region set.
regions = [
    ('../../locus-annotations/regions/bad.depth.bed.gz',
     '../variants.distance.bad.depth.txt',
     'va.locus.badCoverage'),
    ('../../locus-annotations/regions/bad.mdust.bed.gz',
     '../variants.distance.bad.mdust.txt',
     'va.locus.badComplexity'),
    ('../../locus-annotations/regions/bad.rmsk.bed.gz',
     '../variants.distance.bad.rmsk.txt',
     'va.locus.badRepeat'),
    ('../../locus-annotations/regions/bad.wgEncodeCrgMapabilityAlign100mer.bed.gz',
     '../variants.distance.bad.mappability100.txt',
     'va.locus.badMappability'),
    ('../../locus-annotations/regions/bad.wgEncodeExcludable.bed.gz',
     '../variants.distance.bad.encodeExcludable.txt',
     'va.locus.badEncodeExcluded'),
    ('../../locus-annotations/regions/good.HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel.bed.gz',
     '../variants.distance.good.GiaB_HG001_highConf.txt',
     'va.locus.goodGiaBHighConf'),
]

# Start from an empty va.locus struct so each root below can be added to it.
vds = vds.annotate_variants_expr('va.locus = {}')

# bedtools outputs to delete once the annotated dataset has been written.
distance_files = []

for inpath, outpath, root in regions:
    cmd = './bin/bedtools closest -d -g ../genome.txt -a ../variants.sorted.bed.gz -b {inpath} > {outpath}'.format(
        inpath=inpath, outpath=outpath)
    print(cmd)
    # Fail fast: an unchecked bedtools failure would leave an empty or partial
    # distance file and silently produce wrong annotations.
    subprocess.check_call(cmd, shell=True)
    distance_files.append(outpath)

    # Headerless import names columns f0, f1, ...; f3 is read as the variant
    # and f7 as the distance appended by `closest -d` (assumes a 4-column
    # variants BED and 3-column region BEDs -- TODO confirm).
    kt = hc.import_table(
        outpath,
        no_header=True,
        types={'f3': TVariant(), 'f7': TInt()})
    kt = kt.annotate('v = f3, dist = f7').key_by('v')

    # <root>_dist holds the distance; <root> itself is true when it is 0.
    vds = vds.annotate_variants_table(kt, expr='{}_dist = table.dist'.format(root))
    vds = vds.annotate_variants_expr('{} = {}_dist == 0'.format(root, root))

# Tier assignment, first matching rule wins:
#   3 - contig is not numeric, X or Y; or the variant is within distance 5 of
#       a bad coverage / complexity / mappability / ENCODE-excluded region;
#       or it lies in a pseudo-autosomal region (badPAR)
#   2 - outside the GiaB high-confidence regions (distance > 0), or on X / Y
#   1 - everything else
# badRepeat_dist is annotated above but takes no part in the tier.
tier_expr = (
    'va.locus.tier = '
    'if (!("^([0-9]+|X|Y)$" ~ v.contig)) 3 '
    'else (if (va.locus.badCoverage_dist <= 5 || '
    'va.locus.badComplexity_dist <= 5 || '
    'va.locus.badMappability_dist <= 5 || '
    'va.locus.badEncodeExcluded_dist <= 5 || '
    'va.locus.badPAR) 3 '
    'else (if (va.locus.goodGiaBHighConf_dist > 0) 2 '
    'else (if (v.contig ~ "^(X|Y)$") 2 else \n'
    '1)))'
)

vds = vds.annotate_variants_expr('va.locus.badPAR = v.inXPar() || v.inYPar()')
vds = vds.annotate_variants_expr(tier_expr)

vds.write('../MGRB.phase2.SNPtier12.match.vqsr.minrep.locusannot.vds')

# Clean up only after the write.  Hail builds its pipeline lazily on Spark, so
# the bedtools outputs may not actually be read until write() runs; deleting
# them earlier risks a missing-file failure at write time.
for stale_path in distance_files:
    os.remove(stale_path)