Example #1
#!./bin/pyhail.sh
import hail
from hail.expr import TString, TBoolean, TFloat, TInt

hc = hail.HailContext(log='log/load.log', tmp_dir='tmp/hail')

vds = hc.import_vcf('../../GATK3_fastq2gvcf-hs37d5x-1.0_split-combine_genotype_vqsr/*.vcf.bgz')

sample_table = (hc
    .import_table('../../metadata/MGRB_phase2_metadata.csv', delimiter=',', types={
        'sampleID': TString(),
        'cohort': TString(),
        'YOB': TInt(),
        'SBPMean': TInt(),
        'HtMtrs': TFloat(),
        'WtKgs': TFloat(),
        'AbdoCircCms': TInt(),
        'GlcmmolL': TFloat(),
        'AMD': TBoolean(),
        'treatedForHighBP': TBoolean(),
        'treatedForHighChol': TBoolean(),
        'isFemale': TBoolean()
    })
    .key_by('sampleID')
)

vds = vds.annotate_samples_table(sample_table, root='sa.pheno')

# Note the use of min_rep only here: vt norm is not performed (as it was for phase 1).
# In phase 1 we observed that vt norm almost never changed GATK's representation (rate
# < 1e-5), so this extra step was considered not worthwhile.  As it is, in the presence
# of complex and multi-allelic variants, simple variant normalization is not especially
# useful.
            "gs://seqr-reference-data/GRCh38/gnomad/coverage/gnomad.chr4.cov.liftover.GRCh38.txt.gz",
            "gs://seqr-reference-data/GRCh38/gnomad/coverage/gnomad.chr5.cov.liftover.GRCh38.txt.gz",
            "gs://seqr-reference-data/GRCh38/gnomad/coverage/gnomad.chr6.cov.liftover.GRCh38.txt.gz",
            "gs://seqr-reference-data/GRCh38/gnomad/coverage/gnomad.chr7.cov.liftover.GRCh38.txt.gz",
            "gs://seqr-reference-data/GRCh38/gnomad/coverage/gnomad.chr8.cov.liftover.GRCh38.txt.gz",
            "gs://seqr-reference-data/GRCh38/gnomad/coverage/gnomad.chr9.cov.liftover.GRCh38.txt.gz",
            "gs://seqr-reference-data/GRCh38/gnomad/coverage/gnomad.chrX.cov.liftover.GRCh38.txt.gz",
        ],
        "output_path": "gs://%(output_bucket)s/GRCh38/gnomad/genomes.coverage.vds" % args.__dict__,
    },
}


field_types = {
    '#chrom': TString(),
    'pos': TInt(),
    'mean': TDouble(),
    'median': TDouble(),
    '1': TDouble(),
    '5': TDouble(),
    '10': TDouble(),
    '15': TDouble(),
    '20': TDouble(),
    '25': TDouble(),
    '30': TDouble(),
    '50': TDouble(),
    '100': TDouble(),
}


for label, data_paths in COVERAGE_TSV_PATHS.items():
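    # The body of this loop is not shown in this snippet. A rough, illustrative
    # sketch of what it might do (an assumption, not the pipeline's actual code):
    # import each coverage TSV with the field types defined above and key the
    # resulting table by chromosome and position.
    kt = hc.import_table(data_paths, types=field_types)
    kt = kt.rename({'#chrom': 'chrom'}).key_by(['chrom', 'pos'])
    print('%s: %s' % (label, kt.schema))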
Example #3
               type=int)
args = p.parse_args()

hc = hail.HailContext(log="/tmp/hail.log")

gene_results_url = "gs://epi-browser/2018-11-07_epi25-exome-browser-gene-results-table-reduced.csv"

kt = hc.import_table(gene_results_url,
                     delimiter=",",
                     missing="NA",
                     quote='"',
                     types={
                         'gene_name': TString(),
                         'description': TString(),
                         'gene_id': TString(),
                         'xcase_lof': TInt(),
                         'xctrl_lof': TInt(),
                         'pval_lof': TDouble(),
                         'xcase_mpc': TInt(),
                         'xctrl_mpc': TInt(),
                         'pval_mpc': TDouble(),
                         'xcase_infrIndel': TInt(),
                         'xctrl_infrIndel': TInt(),
                         'pval_infrIndel': TDouble(),
                         'pval_meta': TDouble(),
                         'analysis_group': TString(),
                     })

es = ElasticsearchClient(args.host, args.port)

es.export_kt_to_elasticsearch(
Example #4
         --- gnomAD_genomes_NFE_AN: String,
         --- gnomAD_genomes_NFE_AF: String,
         --- gnomAD_genomes_OTH_AC: String,
         --- gnomAD_genomes_OTH_AN: String,
         --- gnomAD_genomes_OTH_AF: String,
         --- clinvar_rs: String,
         --- clinvar_clnsig: String,
         --- clinvar_trait: String,
         --- clinvar_golden_stars: String,
         Interpro_domain: String,
         GTEx_V6p_gene: String,
         GTEx_V6p_tissue: String
    """

DBNSFP_FIELDS["2.9.3"]["field_types"] = {
    'pos(1-coor)': TInt(),
    'TWINSUK_AC': TInt(),
    'TWINSUK_AF': TFloat(),
    'ESP6500_AA_AF': TFloat(),
    'ESP6500_EA_AF': TFloat(),
}

DBNSFP_FIELDS["3.5"]["field_types"] = {
    'pos(1-based)': TInt(),
    'TWINSUK_AC': TInt(),
    'TWINSUK_AF': TFloat(),
    'ALSPAC_AC': TInt(),
    'ALSPAC_AF': TFloat(),
    'ESP6500_AA_AC': TInt(),
    'ESP6500_AA_AF': TFloat(),
    'ESP6500_EA_AC': TInt(),
Example #5
#!/usr/bin/env python
import hail
from hail.expr import TString, TBoolean, TFloat, TInt

hc = hail.HailContext(log='log/1.load.log', tmp_dir='tmp/hail')

vds = hc.import_vcf('../cohort.vcf')

sample_table = (hc.import_table('../metadata/cohort_metadata.csv',
                                delimiter=',',
                                types={
                                    'sampleID': TString(),
                                    'cohort': TString(),
                                    'YOB': TInt(),
                                    'isFemale': TBoolean()
                                }).key_by('sampleID'))

vds = vds.annotate_samples_table(sample_table, root='sa.pheno')

# Note the use of min_rep only here: vt norm is not performed (as it was for phase 1).
# In phase 1 we observed that vt norm almost never changed GATK's representation (rate
# < 1e-5), so this extra step was considered not worthwhile.  As it is, in the presence
# of complex and multi-allelic variants, simple variant normalization is not especially
# useful.
vds.min_rep().repartition(500).write('../cohort.minrep.vds')
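
A quick sanity check after the write (an illustrative sketch, not part of the original script; it assumes the Hail 0.1 API used above and the output path from the line above):

# Read the written dataset back and confirm the phenotype annotations are present.
check = hc.read('../cohort.minrep.vds')
print(check.sample_schema)                     # expect a pheno struct with sampleID, cohort, YOB, isFemale
print(check.query_samples('samples.count()'))  # number of samples carried through the pipeline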
Example #6
    --- last_evaluated: String,
    all_submitters: String,
    --- submitters_ordered: String,
    all_traits: String,
    all_pmids: String,
    inheritance_modes: String,
    age_of_onset: String,
    prevalence: String,
    disease_mechanism: String,
    origin: String,
    xrefs: String,
    --- dates_ordered: String,
"""

TYPE_MAP = {
    'Int': TInt(),
    'String': TString(),
    'Float': TFloat(),
}
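
# NOTE: _parse_field_names_and_types (used below) is defined elsewhere in the
# pipeline and is not shown in this snippet. The definition here is only an
# illustrative sketch of what such a helper might look like (an assumption,
# not the pipeline's actual implementation): schema lines prefixed with "---"
# are treated as fields to drop, and each remaining "name: Type" entry is
# mapped to a Hail type via TYPE_MAP.
def _parse_field_names_and_types(schema_string, to_keep=True):
    fields = []
    for line in schema_string.strip().split('\n'):
        line = line.strip().rstrip(',')
        if not line:
            continue
        dropped = line.startswith('---')
        if dropped:
            line = line.lstrip('-').strip()
        name, type_name = [part.strip() for part in line.split(':', 1)]
        if dropped != to_keep:
            fields.append((name, TYPE_MAP[type_name]))
    return fields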

for genome_version in ["37", "38"]:
    input_paths = CLINVAR_FILES[genome_version]["input_paths"]
    output_path = CLINVAR_FILES[genome_version]["output_path"]

    print("Reading in %s" % ", ".join(input_paths))

    fields_to_keep = _parse_field_names_and_types(CLINVAR_SCHEMA, to_keep=True)
    fields_to_drop = _parse_field_names_and_types(CLINVAR_SCHEMA,
                                                  to_keep=False)

    field_types = {
Example #7
     '../variants.distance.good.GiaB_HG001_highConf.txt',
     'va.locus.goodGiaBHighConf')
]

vds = vds.annotate_variants_expr('va.locus = {}')

for inpath, outpath, root in regions:
    cmd = './bin/bedtools closest -d -g ../genome.txt -a ../variants.sorted.bed.gz -b {inpath} > {outpath}'.format(
        inpath=inpath, outpath=outpath)
    print(cmd)
    subprocess.call(cmd, shell=True)
    kt = hc.import_table(outpath,
                         no_header=True,
                         types={
                             'f3': TVariant(),
                             'f7': TInt()
                         }).annotate('v = f3, dist = f7').key_by('v')
    vds = vds.annotate_variants_table(
        kt, expr='{}_dist = table.dist'.format(root)).annotate_variants_expr(
            '{} = {}_dist == 0'.format(root, root))
#    os.remove(outpath)

vds = (vds
    .annotate_variants_expr('va.locus.badPAR = v.inXPar() || v.inYPar()')
    .annotate_variants_expr('''va.locus.tier =
        if (!("^([0-9]+|X|Y)$" ~ v.contig)) 3
        else (if (va.locus.badCoverage_dist <= 5 || va.locus.badComplexity_dist <= 5 || va.locus.badMappability_dist <= 5 || va.locus.badEncodeExcluded_dist <= 5 || va.locus.badPAR) 3
        else (if (va.locus.goodGiaBHighConf_dist > 0) 2
        else (if (v.contig ~ "^(X|Y)$") 2
        else 1)))''')
)

vds.write('../ccs.cosmic_cgc.minrep.locusannot.vds')
Example #8
subprocess.call('gzip -dc ../../locus-annotations/hs37d5x_data/genome.bed.gz | cut -f 1,3 > ../genome.txt', shell=True)

regions = [('../../locus-annotations/regions/bad.depth.bed.gz', '../variants.distance.bad.depth.txt', 'va.locus.badCoverage'),
           ('../../locus-annotations/regions/bad.mdust.bed.gz', '../variants.distance.bad.mdust.txt', 'va.locus.badComplexity'),
           ('../../locus-annotations/regions/bad.rmsk.bed.gz', '../variants.distance.bad.rmsk.txt', 'va.locus.badRepeat'),
           ('../../locus-annotations/regions/bad.wgEncodeCrgMapabilityAlign100mer.bed.gz', '../variants.distance.bad.mappability100.txt', 'va.locus.badMappability'),
           ('../../locus-annotations/regions/bad.wgEncodeExcludable.bed.gz', '../variants.distance.bad.encodeExcludable.txt', 'va.locus.badEncodeExcluded'),
           ('../../locus-annotations/regions/good.HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel.bed.gz', '../variants.distance.good.GiaB_HG001_highConf.txt', 'va.locus.goodGiaBHighConf')]

vds = vds.annotate_variants_expr('va.locus = {}')

for inpath, outpath, root in regions:
    cmd = './bin/bedtools closest -d -g ../genome.txt -a ../variants.sorted.bed.gz -b {inpath} > {outpath}'.format(inpath=inpath, outpath=outpath)
    print(cmd)
    subprocess.call(cmd, shell=True)
    kt = hc.import_table(outpath, no_header=True, types={'f3': TVariant(), 'f7': TInt()}).annotate('v = f3, dist = f7').key_by('v')
    vds = vds.annotate_variants_table(kt, expr='{}_dist = table.dist'.format(root)).annotate_variants_expr('{} = {}_dist == 0'.format(root, root))
    os.remove(outpath)
	
vds = (vds
    .annotate_variants_expr('va.locus.badPAR = v.inXPar() || v.inYPar()')
    .annotate_variants_expr('''va.locus.tier = 
        if (!("^([0-9]+|X|Y)$" ~ v.contig)) 3
        else (if (va.locus.badCoverage_dist <= 5 || va.locus.badComplexity_dist <= 5 || va.locus.badMappability_dist <= 5 || va.locus.badEncodeExcluded_dist <= 5 || va.locus.badPAR) 3
        else (if (va.locus.goodGiaBHighConf_dist > 0) 2
        else (if (v.contig ~ "^(X|Y)$") 2
        else 1)))''')
)

vds.write('../MGRB.phase2.SNPtier12.match.vqsr.minrep.locusannot.vds')
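
To see how variants fall across the three tiers assigned above, the annotated dataset can be queried with a variant aggregator before (or after) the write; a minimal illustrative sketch assuming the Hail 0.1 API used in these examples, not part of the original pipeline:

# Tally variants by locus tier (1, 2 or 3 as assigned above).
tier_counts = vds.query_variants('variants.map(v => va.locus.tier).counter()')
print(tier_counts)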