Esempi in Python per parse_locus, esempi in Python per hail.parse_locus

Esempio n. 1

0

Mostra file

File: ld.py Progetto: enriquea/gnomad_hail

def get_r_human_readable(pop: str, var1: str, var2: str, ref_genome: str = "GRCh37"):
    """Look up the LD value for two variants written as readable IDs.

    :param pop: Population label forwarded to ``ld_matrix`` and ``ld_index``.
    :param var1: First variant, formatted ``chrom-pos-ref-alt``.
    :param var2: Second variant, same format as ``var1``.
    :param ref_genome: Reference genome name handed to ``hl.parse_locus``.
    :return: The result of ``get_r_for_pair_of_variants`` for the two variants.
    """
    def _as_locus_and_alleles(variant_id: str):
        # "chrom-pos-ref-alt" -> (locus, [ref, alt])
        contig, position, ref_allele, alt_allele = variant_id.split("-")
        parsed = hl.parse_locus(f"{contig}:{position}", ref_genome)
        return (parsed, [ref_allele, alt_allele])

    # Load the LD resources first, exactly as before, then parse the IDs.
    block_matrix = ld_matrix(pop).bm()
    index_table = ld_index(pop).ht()
    first_variant = _as_locus_and_alleles(var1)
    second_variant = _as_locus_and_alleles(var2)
    return get_r_for_pair_of_variants(block_matrix, index_table,
                                      first_variant, second_variant)

Esempio n. 2

0

Mostra file

def run_gwas(mt,
             phen: str,
             sim_name: str,
             subset_idx: int,
             param_suffix: str,
             wd: str,
             is_logreg=True):
    """Run a single-phenotype GWAS on one sample subset, or reload a cached run.

    :param mt: MatrixTable with an entry field ``dosage`` or ``GT``, a column
        field ``subset_idx`` and a column field named by ``phen``.
    :param phen: Name of the phenotype column field used as ``y``.
    :param sim_name: Simulation name, used only to build the output path.
    :param subset_idx: Only columns whose ``subset_idx`` equals this are kept.
    :param param_suffix: Extra suffix used only to build the output path.
    :param wd: Directory in which the ``.tsv.gz`` results file lives.
    :param is_logreg: Use Wald logistic regression when true, otherwise
        linear regression.
    :return: The regression results table when the GWAS is run now, or the
        re-imported results keyed by ``locus``/``alleles`` when the results
        file already exists.
    :raises ValueError: If ``mt`` has neither a ``dosage`` nor a ``GT`` entry
        field.
    """
    # The previous guard compared a set against ``{}`` (an empty dict), which
    # is always unequal, so it could never fire. Test for emptiness instead.
    if not {'GT', 'dosage'}.intersection(mt.entry):
        raise ValueError(
            "mt does not have an entry field named 'dosage' or 'GT' corresponding to genotype data"
        )

    mt = mt.filter_cols(mt.subset_idx == subset_idx)
    mt = mt.filter_cols(hl.is_defined(mt[phen]))
    print(
        f'\n\ngwas sample count (subset {subset_idx}): {mt.count_cols()}\n\n')

    # Effect-allele frequency: mean alt-allele count (or dosage) divided by 2.
    if 'dosage' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.dosage) / 2)
    elif 'GT' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)

    gwas_path = f'{wd}/gwas.{"logreg" if is_logreg else "linreg"}.{sim_name}.subset_{subset_idx}.{param_suffix}.tsv.gz'

    if not hl.hadoop_is_file(gwas_path):
        gt_field = mt.dosage if 'dosage' in mt.entry else mt.GT.n_alt_alleles()

        if is_logreg:
            gwas_ht = hl.logistic_regression_rows(test='wald',
                                                  y=mt[phen],
                                                  x=gt_field,
                                                  covariates=[1],
                                                  pass_through=['EAF'])
        else:
            gwas_ht = hl.linear_regression_rows(y=mt[phen],
                                                x=gt_field,
                                                covariates=[1],
                                                pass_through=['EAF'])
        gwas_ht.select('EAF', 'beta', 'standard_error',
                       'p_value').export(gwas_path)

    else:
        print(f'GWAS already run! ({gwas_path})')
        gwas_ht = hl.import_table(gwas_path, impute=True, force=True)
        # The exported alleles column looks like ["A","T"]; strip the
        # brackets/quotes and split on the inner separator. Raw strings keep
        # the regex backslashes without relying on invalid escape sequences.
        gwas_ht = gwas_ht.annotate(locus=hl.parse_locus(gwas_ht.locus),
                                   alleles=gwas_ht.alleles.replace(
                                       r'\["', '').replace(r'"\]',
                                                           '').split('","'))
        gwas_ht = gwas_ht.key_by('locus', 'alleles')

    return gwas_ht

Esempio n. 3

0

Mostra file

def load_variant_data(directory: str,
                      pheno_key_dict,
                      ukb_vep_path: str,
                      extension: str = 'single.txt',
                      n_cases: int = -1,
                      n_controls: int = -1,
                      heritability: float = -1.0,
                      saige_version: str = 'NA',
                      inv_normalized: str = 'NA',
                      overwrite: bool = False,
                      legacy_annotations: bool = False,
                      num_partitions: int = 1000):
    """Import per-variant result files from ``directory`` into a keyed table.

    Files matching ``{directory}/*.{extension}`` are read as space-delimited
    text, keyed by locus, alleles and the entries of ``pheno_key_dict``,
    annotated with VEP data and checkpointed to
    ``{directory}/variant_results.ht``.

    The sentinel defaults (``-1``, ``-1.0``, ``'NA'``) are replaced by missing
    values before being stored as globals.

    NOTE(review): the visible body does not return the checkpointed table;
    confirm against the complete file whether a ``return`` follows.
    """
    output_ht_path = f'{directory}/variant_results.ht'
    ht = hl.import_table(f'{directory}/*.{extension}',
                         delimiter=' ',
                         impute=True)
    print(f'Loading: {directory}/*.{extension} ...')

    # The variant identifier column name depends on the file flavour.
    if extension == 'single.txt':
        marker_id_col = 'markerID'
    else:
        marker_id_col = 'SNPID'
    # Marker IDs are split on '_' into a locus part and a 'ref/alt' part.
    locus_alleles = ht[marker_id_col].split('_')

    # Translate sentinel defaults into typed missing values.
    if n_cases == -1:
        n_cases = hl.null(hl.tint)
    if n_controls == -1:
        n_controls = hl.null(hl.tint)
    if heritability == -1.0:
        heritability = hl.null(hl.tfloat)
    if saige_version == 'NA':
        saige_version = hl.null(hl.tstr)
    if inv_normalized == 'NA':
        inv_normalized = hl.null(hl.tstr)

    ht = ht.key_by(locus=hl.parse_locus(locus_alleles[0]),
                   alleles=locus_alleles[1].split('/'),
                   **pheno_key_dict)
    ht = ht.distinct()
    ht = ht.naive_coalesce(num_partitions)

    if marker_id_col == 'SNPID':
        ht = ht.drop('CHR', 'POS', 'rsid', 'Allele1', 'Allele2')

    ht = ht.transmute(Pvalue=ht['p.value'])
    ht = ht.annotate_globals(n_cases=n_cases,
                             n_controls=n_controls,
                             heritability=heritability,
                             saige_version=saige_version,
                             inv_normalized=inv_normalized)
    ht = ht.drop('varT', 'varTstar', 'N', 'Tstat')

    # TODO: fix this for variants that overlap multiple genes
    vep_ht = get_vep_formatted_data(ukb_vep_path,
                                    legacy_annotations=legacy_annotations)
    variant_key = hl.struct(locus=ht.locus, alleles=ht.alleles)
    ht = ht.annotate(**vep_ht[variant_key])

    ht = ht.checkpoint(output_ht_path,
                       overwrite=overwrite,
                       _read_if_exists=not overwrite)
    ht = ht.drop('n_cases', 'n_controls', 'heritability')

Esempio n. 4

0

Mostra file

File: test_api.py Progetto: shulik7/hail

    def test_constructors(self):
        """Locus and interval constructors yield the expected field dtypes."""
        genome = hl.ReferenceGenome("foo", ["1"], {"1": 100})

        row_type = hl.tstruct(a=hl.tfloat64, b=hl.tfloat64, c=hl.tint32, d=hl.tint32)
        table = hl.Table.parallelize([{'a': 2.0, 'b': 4.0, 'c': 1, 'd': 5}], row_type)
        table = table.annotate(d=hl.int64(table.d))

        # l1 uses the default reference genome; the others use the custom one.
        constructed = {
            'l1': hl.parse_locus("1:51"),
            'l2': hl.locus("1", 51, reference_genome=genome),
            'i1': hl.parse_locus_interval("1:51-56", reference_genome=genome),
            'i2': hl.interval(hl.locus("1", 51, reference_genome=genome),
                              hl.locus("1", 56, reference_genome=genome)),
        }
        table = table.annotate(**constructed)

        custom_locus = hl.tlocus(genome)
        expected = {
            'a': hl.tfloat64,
            'b': hl.tfloat64,
            'c': hl.tint32,
            'd': hl.tint64,
            'l1': hl.tlocus(),
            'l2': custom_locus,
            'i1': hl.tinterval(hl.tlocus(genome)),
            'i2': hl.tinterval(hl.tlocus(genome)),
        }

        matches = [expected[name] == dtype
                   for name, dtype in table.row.dtype.items()]
        self.assertTrue(all(matches))

Esempio n. 5

0

Mostra file

def import_var(seqr: str) -> hl.Table:
    '''
    Load a tsv of variants exported from seqr as a hail Table keyed by
    locus and alleles.

    :param str seqr: Path to variants tsv
    :return: Table of variants
    :rtype: hl.Table
    '''
    variants = hl.import_table(seqr, impute=True)

    # Prefix the chromosome with 'chr'; per the original author's note, hail
    # only accepts a b38 locus whose contig carries that prefix.
    prefixed_chrom = hl.format('chr%s', variants.chrom)
    variants = variants.transmute(chrom=prefixed_chrom)

    # Build locus and alleles (both are needed to successfully join with
    # gnomAD data); transmute drops the source columns used here.
    locus_text = hl.format('%s:%s', variants.chrom, variants.pos)
    variants = variants.transmute(locus=hl.parse_locus(locus_text),
                                  alleles=[variants.ref, variants.alt])

    keyed = variants.key_by('locus', 'alleles')
    keyed.describe()
    return keyed

Esempio n. 6

0

Mostra file

def query(output):  # pylint: disable=too-many-locals
    """Query script entry point.

    Writes one value-frequency file per principal component of the loadings
    table and copies it to ``output`` with ``gsutil``, then writes the rows of
    the combined dataset at a fixed list of loci to
    ``{output}/capped_loadings_intervals.mt``.
    """

    hl.init(default_reference='GRCh38')

    # Frequency of absolute loading values, one file per PC.
    loadings = hl.read_table(LOADINGS)
    number_of_pcs = hl.len(loadings.loadings).take(1)[0]
    print(loadings.count())
    for pc_index in range(number_of_pcs):
        counts = Counter(hl.abs(loadings.loadings[pc_index]).collect())
        filename = f'loadings_pc{pc_index + 1}.txt'
        # The context manager closes the handle; no explicit close is needed.
        with open(filename, 'w') as handle:
            for loading_value, occurrences in counts.items():
                handle.write(f'{loading_value!r} {occurrences!r}\n')
        subprocess.run(['gsutil', 'cp', filename, output], check=False)

    # Pull out variants that looked like they're capped in the loadings plot.
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    # Keep NFE samples plus any sample whose ID contains 'TOB'.
    is_nfe = mt.hgdp_1kg_metadata.population_inference.pop == 'nfe'
    is_tob = mt.s.contains('TOB')
    mt = mt.filter_cols(is_nfe | is_tob)

    locus_strings = [
        'chr1:176163025',
        'chr5:272714',
        'chr5:36104012',
        'chr1:183565810',
        'chr3:58111799',
    ]
    capped_loci = [
        hl.parse_locus(text, reference_genome='GRCh38')
        for text in locus_strings
    ]
    mt_hits = mt.filter_rows(hl.literal(capped_loci).contains(mt.locus))
    mt_hits.write(f'{output}/capped_loadings_intervals.mt')

Esempio n. 7

0

Mostra file

File: get_coding_variants.py Progetto: nikbaya/smiles

def annotate_with_coding(ht, fname):
    """Flag each summary-statistics row as coding and export the result.

    Reads ``{wd_data}/{fname}``, builds a GRCh37 ``locus`` either from a
    ``variant`` column (``chr:pos:...``) or from ``chr``/``pos`` columns, sets
    ``coding`` to whether ``ht`` has a row for that locus, drops a fixed set
    of helper columns and exports to the same path with ``.coding`` inserted
    before ``.tsv``.

    :param ht: Table indexed by locus; a defined lookup marks a coding variant.
    :param fname: File name (relative to ``wd_data``) containing ``.tsv``.
    :raises ValueError: If the file has neither a ``variant`` column nor both
        ``chr`` and ``pos`` columns.
    """
    ss0 = hl.import_table(f'{wd_data}/{fname}', impute=True, force=True, types={'chr': hl.tstr})
    if 'variant' in list(ss0.row):
        variant = ss0.variant.split(':')
        # Drop rows whose contig/position is not a valid GRCh37 locus so that
        # parse_locus below cannot fail on them.
        ss = ss0.filter(hl.is_valid_locus(variant[0],
                                          hl.int(variant[1]),
                                          'GRCh37'))
        locus = ss.variant.split(':')
        ss = ss.annotate(locus=hl.parse_locus(locus[0] + ':' + locus[1], reference_genome='GRCh37'))
        if 'ytx' in ss.row:  # a proxy for checking if the sumstats are from UKB
            variant = ss.variant.split(':')
            # annotate returns a new table; the result was previously
            # discarded, so A1/A2 were never actually added.
            ss = ss.annotate(A1=variant[2],
                             A2=variant[3])
    elif 'chr' in list(ss0.row) and 'pos' in list(ss0.row):
        ss = ss0.annotate(locus=hl.locus(contig=ss0.chr, pos=ss0.pos, reference_genome='GRCh37'))
    else:
        # Previously this fell through and failed later with an unbound `ss`.
        raise ValueError(
            f"{fname}: expected a 'variant' column or both 'chr' and 'pos' columns")

    ss = ss.annotate(coding=hl.is_defined(ht[ss.locus]))
    fields = ['locus', 'AC', 'ytx', 'tstat', 'effect_allele', 'other_allele']
    # Only drop the helper columns that are actually present.
    fields_to_drop = [field for field in fields if field in ss.row]
    ss = ss.drop(*fields_to_drop)
    ss.export(f"{wd_data}/{fname.split('.tsv')[0]}.coding.tsv{fname.split('.tsv')[1]}")

Esempio n. 8

0

Mostra file

File: conftest.py Progetto: tpoterba/hail

def generate_datasets(doctest_namespace):
    """Populate ``doctest_namespace`` with the objects used by the doctests.

    Registers the ``hl`` and ``np`` modules plus a collection of example
    MatrixTables, Tables and expressions under fixed names. Several names are
    deliberate aliases of the same object.

    NOTE(review): presumably registered as a pytest fixture in conftest.py;
    the decorator is not visible in this excerpt.

    :param doctest_namespace: Mapping that receives the names; it is mutated
        in place and nothing is returned.
    """
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    # Main example dataset: a row sample (fraction 0.03) of the sample VCF,
    # decorated with placeholder row, column and global annotations.
    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    # Replaces the string `gene` row field set above with an array value.
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    # Checkpoint to output/example.mt, overwriting any existing copy.
    ds = ds.checkpoint(f'output/example.mt', overwrite=True)

    # The same MatrixTable is exposed under several names.
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    # Row-level metadata table derived from ds.
    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    # Column-level metadata table derived from ds; also reused as the
    # keep/remove tables below.
    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    # Small simulated MatrixTable from the Balding-Nichols model.
    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Example Tables imported from the bundled TSV files
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    # Columns B, D and E are given explicit struct types instead of imputed ones.
    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    # Whitespace-delimited table keyed by Name, with an array-typed column.
    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # Dataset for the TDT examples
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    # ds with variant QC re-run, reduced to a single AF row field.
    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Literal expressions and values referenced by name in the examples
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Names used by the overview examples: an unkeyed table and the main dataset
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # Rows table of the example BGEN file
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    # Burden-test dataset: VCF joined with per-sample data, a position-based
    # row weight, variant QC and a gene annotation from an interval list.
    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    # LD score regression summary statistics for one phenotype, imported with
    # explicit column types and keyed by locus and alleles.
    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    # All-phenotype summary statistics as a MatrixTable. Everything is read as
    # strings, then the keys are parsed and each entry string is split on ','
    # into chi_squared and n.
    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")

Esempio n. 9

0

Mostra file

     '/ludc/Home/daniel_c/dva/files/ukb_index/chr{}.idx2'.format(ch)
 }
 # Creating MatrixTable
 mt = hl.import_bgen(bgen_file,
                     entry_fields=['GT'],
                     sample_file=ukb_sf,
                     index_file_map=file_map,
                     _row_fields=['rsid'])
 # Extracting SNPs of interest
 mt_f = hl.filter_intervals(mt, ploci)
 mt_f = hl.variant_qc(mt_f)
 chromdat['chrompos'] = chromdat['chrom'] + ':' + chromdat[
     'hg19_pos'].astype(str)
 chromdat_hl = hl.Table.from_pandas(chromdat)
 chromdat_hl = chromdat_hl.annotate(
     locus=hl.parse_locus(chromdat_hl.chrompos, reference_genome='GRCh37'))
 chromdat_hl = chromdat_hl.key_by('locus')
 mt_f = mt_f.annotate_rows(**chromdat_hl[mt_f.locus])
 flip = hl.case().when(mt_f.ea == mt_f.alleles[0],
                       True).when(mt_f.ea == mt_f.alleles[1],
                                  False).or_missing()
 mt_f = mt_f.annotate_rows(flip=flip)
 mt_f = mt_f.annotate_rows(
     prior=2 *
     hl.if_else(mt_f.flip, mt_f.variant_qc.AF[0], mt_f.variant_qc.AF[1]))
 mt_f = mt_f.select_entries(G=hl.coalesce(
     hl.if_else(mt_f.flip, 2 - mt_f.GT.n_alt_alleles(),
                mt_f.GT.n_alt_alleles()), mt_f.prior))
 ## Exporting result
 output = '/ludc/Home/daniel_c/dva/files/ukbgeno/chrom{}.vcf.bgz'.format(ch)
 hl.export_vcf(mt_f, output)

Esempio n. 10

0

Mostra file

        # TODO: write out matrix table with sim results? Should always be able to get same exact mt if global seed is set

    elif sim_name[:3] != 'bn_' and hl.hadoop_is_file(
            betas_path) and hl.hadoop_is_file(phens_path):
        mt = get_mt(
            remove_withdrawn=False
        )  # no need to remove withdrawn samples because phenotypes have only been calculated for non-withdrawn samples

        betas = hl.import_table(betas_path, impute=True, force=True)
        phens = hl.import_table(phens_path,
                                key=['s'],
                                types={'s': hl.tstr},
                                impute=True,
                                force=True)

        betas = betas.annotate(locus=hl.parse_locus(betas.locus),
                               alleles=betas.alleles.replace(
                                   '\[\"', '').replace('\"\]',
                                                       '').split('\",\"'))
        betas = betas.key_by('locus', 'alleles')

        #        sim_mt = mt.annotate_rows(beta=betas[mt.locus, mt.alleles].beta)
        sim_mt = sim_mt.annotate_cols(y_binarized=phens[sim_mt.s].y_binarized)

    else:
        mt = get_mt(remove_withdrawn=True)
        sim_mt = get_sim_mt(mt=mt, h2=h2, pi=pi, K=K)

        sim_mt.rows().select('beta').export(betas_path)
        sim_mt.cols().select('y', 'y_binarized').export(phens_path)

Esempio n. 11

0

Mostra file

for variant in args.variant:
    print("------------")
    try:
        chrom, pos, ref, alt = variant.split("-")
        chrom_without_prefix = chrom.replace("chr", "")
        chrom = "chr" + chrom_without_prefix

        pos = int(pos)
    except:
        p.error(f"Unable to parse variant: {variant}")
        break

    print(f"locus: {chrom}:{pos-200}-{pos+200}")

    locus = hl.parse_locus(f"{chrom}:{pos}", reference_genome="GRCh38")

    print(f"checking v3: {chrom}-{pos}")
    ht_v3 = hl.read_table(v3_table)
    matches = ht_v3.filter(ht_v3.locus == locus, keep=True).collect()
    print()
    for match in matches:
        print("----")
        print(f"   {match}")
        print_bam_paths(match)

    print(f"checking v3.1: {chrom}-{pos}")
    ht_v3_1 = hl.read_table(v3_1_table)
    matches = ht_v3_1.filter(ht_v3_1.locus == locus, keep=True).collect()
    print()
    for match in matches:

Esempio n. 12

0

Mostra file

# Import modules and init Hail
import hail as hl
from hail_init import DEFAULT_REF

# Load the matrix table of variants that passed variant QC
variantqc_pass = hl.read_matrix_table("variantqc_pass.mt")

# Interval strings, given as exact or approximate coordinates
intervals = ["chr10:52765380-52772784", "chr1:100M-200M"]

# Keep only rows falling in any of the parsed intervals
filtered_mt = hl.filter_intervals(
    variantqc_pass,
    [hl.parse_locus_interval(x, reference_genome=DEFAULT_REF) for x in intervals],
)

# Window of 100,000 nucleotides on each side of a single locus
locus = hl.parse_locus("chrX:23833353", DEFAULT_REF)
window = locus.window(100000, 100000)

# Note: this starts again from variantqc_pass, replacing the interval-filtered
# result above rather than narrowing it.
filtered_mt = variantqc_pass.filter_rows(
    window.contains(variantqc_pass.locus)
)

# Keep rows whose second AF entry is below 0.01
filtered_mt = filtered_mt.filter_rows(
    filtered_mt.variant_qc.AF[1] < 0.01
)

Esempio n. 13

0

Mostra file

File: test_variant_id.py Progetto: leklab/pcgc_hail

 def test_xpos_1(self):
     """xpos of GRCh37 locus 1:55505463 evaluates to 1055505463."""
     xpos_expr = get_expr_for_xpos(hl.parse_locus("1:55505463", "GRCh37"))
     self.assertEqual(hl.eval(xpos_expr), 1055505463)

Esempio n. 14

0

Mostra file

File: test_variant_id.py Progetto: leklab/pcgc_hail

 def test_xpos_grch38(self):
     """xpos of GRCh38 locus chr2:166847734 evaluates to 2166847734."""
     xpos_expr = get_expr_for_xpos(hl.parse_locus("chr2:166847734", "GRCh38"))
     self.assertEqual(hl.eval(xpos_expr), 2166847734)

Esempio n. 15

0

Mostra file

File: test_variant_id.py Progetto: leklab/pcgc_hail

 def test_xpos_2(self):
     """xpos of GRCh37 locus X:18525192 evaluates to 23018525192."""
     xpos_expr = get_expr_for_xpos(hl.parse_locus("X:18525192", "GRCh37"))
     self.assertEqual(hl.eval(xpos_expr), 23018525192)

Esempio n. 16

0

Mostra file

def main(args):
    """Build, summarise and write the final variant-QC table.

    Combines hard filters (inbreeding coefficient, AC0), the VQSR filter, the
    random-forest filter, gnomAD genome coverage and capture-interval
    membership into per-variant flags, stores per-variant-type counts of each
    flag as a global, checkpoints the table and optionally exports it.

    :param args: Parsed arguments; reads ``default_ref_genome``,
        ``exome_cohort``, ``overwrite`` and ``write_to_file``.
    """
    # Start Hail
    hl.init(default_reference=args.default_ref_genome)

    # Read the adj-genotype MT
    adj_mt_path = get_qc_mt_path(dataset=args.exome_cohort,
                                 part='sample_qc_adj_genotypes',
                                 split=True)
    mt = hl.read_matrix_table(adj_mt_path)

    # Keep QC-passing samples and strip column/row fields down to the keys
    mt = mt.filter_cols(mt.pass_filters)
    mt = mt.select_cols()
    mt = mt.select_rows()

    # Variant info fields (vcf info), without the vep annotation
    variant_info_ht = get_vep_annotation_ht().drop('vep')

    # Per-variant inputs for the hard filters
    mt = mt.annotate_rows(
        inbreeding_coeff=variant_info_ht[mt.row_key].info.InbreedingCoeff,
        vqsr_filter=variant_info_ht[mt.row_key].filters,
        VQSLOD=variant_info_ht[mt.row_key].info.VQSLOD,
        # expected MT filtered to high-quality GT
        gt_counts=hl.agg.count_where(hl.is_defined(mt.GT)))
    ht = mt.rows()

    # 1. Hard filters
    ht = ht.annotate(
        fail_inbreeding_coeff=ht.inbreeding_coeff < INBREEDING_COEFFICIENT_CUTOFF,
        AC0=ht.gt_counts == 0)

    # 2. VQSR filter: any entry in the filter set counts as a failure
    ht = ht.annotate(fail_vqsr=hl.len(ht.vqsr_filter) != 0)

    # 3. RF filter, using the TP probability from the final RF table
    ht_rf = hl.read_table(get_variant_qc_ht_path(part='rf_result'))
    ht_rf = ht_rf.select(rf_probability_tp=ht_rf.rf_probability['TP'],
                         variant_type=ht_rf.variant_type)
    ht = ht.annotate(**ht_rf[ht.key])

    snv_below_cutoff = ((ht.rf_probability_tp < RF_PROBABILITY_SNV_CUTOFF)
                        & (ht.variant_type == 'snv'))
    indel_below_cutoff = ((ht.rf_probability_tp < RF_PROBABILITY_INDEL_CUTOFF)
                          & (ht.variant_type == 'indel'))
    fail_rf_expr = (hl.case()
                    .when(snv_below_cutoff, True)
                    .when(indel_below_cutoff, True)
                    .default(False))
    ht = ht.annotate(fail_rf=fail_rf_expr)

    # 4. Coverage / capture-interval filters

    # gnomAD genome coverage: re-key the coverage table by a parsed locus
    gnomad_coverage_ht = get_gnomad_genomes_coverage_ht().key_by()
    parsed_locus = hl.parse_locus(gnomad_coverage_ht.locus,
                                  reference_genome='GRCh38')
    gnomad_coverage_ht = gnomad_coverage_ht.annotate(locus=parsed_locus)
    gnomad_coverage_ht = gnomad_coverage_ht.key_by('locus')
    ht = ht.annotate(gnomad_cov_10X=gnomad_coverage_ht[ht.locus].over_10)
    ht = ht.annotate(is_coveraged_gnomad_genomes=ht.gnomad_cov_10X >= 0.9)

    # Membership in the capture intervals (intersect)
    ht_defined_intervals = filter_capture_intervals(ht)
    in_capture = hl.is_defined(ht_defined_intervals[ht.key])
    ht = ht.annotate(is_defined_capture_intervals=in_capture)

    # 5. Joint pass flag over every filter
    passes_everything = (
        ~ht.fail_inbreeding_coeff & ~ht.AC0 & ~ht.fail_vqsr & ~ht.fail_rf
        & ht.is_coveraged_gnomad_genomes & ht.is_defined_capture_intervals)
    ht = ht.annotate(
        pass_variant_qc_filters=hl.cond(passes_everything, True, False))

    # Count the variants (snv and indels) affected by every filter and store
    # the result as a global field
    filter_flags = [
        'fail_inbreeding_coeff', 'AC0', 'fail_vqsr', 'fail_rf',
        'is_coveraged_gnomad_genomes', 'is_defined_capture_intervals',
        'pass_variant_qc_filters'
    ]
    summary_filter_expr = {}
    for variant_kind in ['snv', 'indel']:
        per_flag_counts = {
            flag: hl.agg.filter(ht.variant_type == variant_kind,
                                hl.agg.counter(ht[flag]))
            for flag in filter_flags
        }
        summary_filter_expr[variant_kind] = hl.struct(**per_flag_counts)

    ht = ht.annotate_globals(
        summary_filter=ht.aggregate(summary_filter_expr, _localize=False))

    # Write the final variant QC table
    output_path = get_variant_qc_ht_path(dataset=args.exome_cohort,
                                         part='final_qc')
    ht = ht.checkpoint(output_path, overwrite=args.overwrite)

    # Log the filter summary
    logger.info(f'Variant QC filter summary: {ht.summary_filter.collect()}')

    # Optional flat-file export
    if args.write_to_file:
        ht.export(f'{output_path}.tsv.bgz')

    # Stop Hail
    hl.stop()

    print("Finished!")