Example #1
    def test_split_multi_hts(self):
        ds1 = hl.import_vcf(resource('split_test.vcf'))
        ds1 = hl.split_multi_hts(ds1)
        ds2 = hl.import_vcf(resource('split_test_b.vcf'))
        df = ds1.rows()
        self.assertTrue(df.all((df.locus.position == 1180) | df.was_split))
        ds1 = ds1.drop('was_split', 'a_index')
        self.assertTrue(ds1._same(ds2))

        ds = self.get_dataset()
        ds = ds.annotate_entries(X=ds.GT)
        with self.assertRaises(utils.FatalError):
            hl.split_multi_hts(ds)
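A note on the expected failure above: split_multi_hts only knows how to split the standard HTS entry schema (GT, AD, DP, GQ, PL, plus PGT and PID), so the extra entry field X is what triggers the FatalError. A minimal workaround sketch, assuming ds is the MatrixTable from the test:

import hail as hl

# Either drop the non-HTS entry field before splitting...
ds = ds.drop('X')
ds = hl.split_multi_hts(ds)
# ...or call hl.split_multi(ds) instead, which splits only row fields and
# leaves custom entry fields (to be downcoded via ds.a_index) to the caller.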
Example #2
def generate_allele_data(ht: hl.Table) -> hl.Table:
    """
    Returns bi-allelic sites HT with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, and n_alt_alleles)

    :param Table ht: Full unsplit HT
    :return: Table with allele data annotations
    :rtype: Table
    """
    ht = ht.select()
    allele_data = hl.struct(nonsplit_alleles=ht.alleles,
                            has_star=hl.any(lambda a: a == "*", ht.alleles))
    ht = ht.annotate(allele_data=allele_data.annotate(
        **add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    ht = ht.filter(hl.len(ht.alleles) > 1)
    allele_type = (hl.case().when(
        hl.is_snp(ht.alleles[0], ht.alleles[1]),
        "snv").when(hl.is_insertion(ht.alleles[0], ht.alleles[1]),
                    "ins").when(hl.is_deletion(ht.alleles[0], ht.alleles[1]),
                                "del").default("complex"))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == "mixed"))
    return ht
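A brief usage sketch for the function above; the input path is hypothetical, and add_variant_type is assumed to be importable from the surrounding project:

import hail as hl

ht = hl.read_table('data/sites.ht')  # hypothetical unsplit sites Table
ht = generate_allele_data(ht)
# Each row is now a split, bi-allelic variant that keeps its pre-split context.
ht.select(allele_type=ht.allele_data.allele_type,
          was_mixed=ht.allele_data.was_mixed).show(5)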
Example #3
def main(args):

    hl.init(log='/liftover.log')

    if args.gnomad:
        gnomad = True
        path = None

        if args.exomes:
            data_type = 'exomes'
        if args.genomes:
            data_type = 'genomes'

        logger.info('Working on gnomAD {} release ht'.format(data_type))
        logger.info('Reading in release ht')
        t = public_release(data_type).ht()
        logger.info('Variants in release ht: {}'.format(t.count()))

    else:
        data_type = None
        gnomad = False

        if args.ht:
            path = args.ht
            t = hl.read_table(args.ht)
        if args.mt:
            path = args.mt
            t = hl.read_matrix_table(args.mt)

    logger.info('Checking if input data has been split')
    if 'was_split' not in t.row:
        t = hl.split_multi(t) if isinstance(
            t, hl.Table) else hl.split_multi_hts(t)

    logger.info('Preparing reference genomes for liftover')
    source, target = get_liftover_genome(t)

    if args.test:
        logger.info('Filtering to chr21 for testing')
        if source.name == 'GRCh38':
            contig = 'chr21'
        else:
            contig = '21'
        t = hl.filter_intervals(
            t, [hl.parse_locus_interval(contig, reference_genome=source.name)])

    logger.info(f'Lifting data to {target.name}')
    t = lift_data(t, gnomad, data_type, path, target, args.overwrite)

    logger.info('Checking SNPs for reference mismatches')
    t = annotate_snp_mismatch(t, data_type, target)

    mismatch = check_mismatch(t) if isinstance(
        t, hl.Table) else check_mismatch(t.rows())
    logger.info('{} total SNPs'.format(mismatch['total_variants']))
    logger.info('{} SNPs on minus strand'.format(mismatch['negative_strand']))
    logger.info('{} reference mismatches in SNPs'.format(
        mismatch['total_mismatch']))
    logger.info('{} mismatches on minus strand'.format(
        mismatch['negative_strand_mismatch']))
Example #4
def vcf_to_mt(path, genome_version):
    '''
    Converts the 1kg VCF to a MatrixTable. The 1kg dataset has multi-allelic variants and duplicates.
    This function splits the multi-allelics separately, then unions the result with
    the bi-allelics.

    :param path: vcf path
    :param genome_version: genome version
    :return:
    '''
    # Import but do not split multis here.
    mt = import_vcf(path,
                    genome_version=genome_version,
                    min_partitions=1000,
                    split_multi_alleles=False)

    multiallelic_mt = mt.filter_rows(hl.len(mt.alleles) > 2)
    multiallelic_mt = hl.split_multi_hts(multiallelic_mt)

    # We annotate some rows manually to conform to the multiallelic_mt (after split).
    # Calling split_multi_hts on biallelic to annotate the rows causes problems.
    biallelic_mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    biallelic_mt = biallelic_mt.annotate_rows(a_index=1, was_split=False)

    all_mt = biallelic_mt.union_rows(multiallelic_mt)
    all_mt = all_mt.key_rows_by(all_mt.locus, all_mt.alleles)

    # GRCh37 is known to have some unneeded symbolic alleles, so we filter them out.
    all_mt = all_mt.filter_rows(hl.allele_type(
        all_mt.alleles[0], all_mt.alleles[1]) == 'Symbolic',
                                keep=False)

    return all_mt
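Splitting only the multi-allelic rows keeps split_multi_hts away from the bi-allelic rows, where it would add nothing but constant was_split/a_index fields; those are annotated by hand above instead. A usage sketch with a hypothetical path:

mt = vcf_to_mt('gs://my-bucket/1kg_phase3.vcf.bgz', genome_version='37')  # hypothetical path
print(mt.count_rows())  # rows are all bi-allelic after the split-and-union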
Example #5
    def split_filter_and_flatten_ht(
            truth_mt: hl.MatrixTable,
            high_confidence_intervals_ht: hl.Table) -> hl.Table:
        """
        Split a truth sample MT, filter it to the given high confidence intervals, and then "flatten" it as a HT by annotating GT in a row field.

        :param truth_mt: Truth sample MT
        :param high_confidence_intervals_ht: High confidence intervals
        :return: Truth sample table with GT as a row annotation
        """
        assert truth_mt.count_cols() == 1

        if not "was_split" in truth_mt.row:
            truth_mt = hl.split_multi_hts(truth_mt)

        truth_mt = truth_mt.filter_rows(
            hl.is_defined(high_confidence_intervals_ht[truth_mt.locus]))
        rename_entries = {"GT": "_GT"}
        if "adj" in truth_mt.entry:
            rename_entries.update({"adj": "_adj"})

        truth_mt = truth_mt.rename(rename_entries)
        return truth_mt.annotate_rows(
            **
            {x: hl.agg.take(truth_mt[f"_{x}"], 1)[0]
             for x in rename_entries}).rows()
Example #6
    def test_tdt(self):
        pedigree = hl.Pedigree.read(resource('tdt.fam'))
        tdt_tab = (hl.transmission_disequilibrium_test(
            hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
            pedigree))

        truth = hl.import_table(
            resource('tdt_results.tsv'),
            types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
                   'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
        truth = (truth
                 .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                            alleles=[truth.REF, truth.ALT])
                 .key_by('locus', 'alleles'))

        if tdt_tab.count() != truth.count():
            self.fail('Result has {} rows but should have {} rows'.format(tdt_tab.count(), truth.count()))

        bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
               .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
        bad.describe()

        bad = bad.filter(~(
                (bad.t == bad.T) &
                (bad.u == bad.U) &
                (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
                (hl.abs(bad.p_value - bad.Pval) < 0.001)))

        if bad.count() != 0:
            bad.order_by(hl.asc(bad.locus)).show()
            self.fail('Found rows in violation of the predicate (see show output)')
Example #7
def generate_allele_data(mt: hl.MatrixTable) -> hl.Table:
    """
    Returns bi-allelic sites HT with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, and n_alt_alleles)

    :param MatrixTable mt: Full unsplit MT
    :return: Table with allele data annotations
    :rtype: Table
    """
    ht = mt.rows().select()
    allele_data = hl.struct(nonsplit_alleles=ht.alleles,
                            has_star=hl.any(lambda a: a == '*', ht.alleles))
    ht = ht.annotate(allele_data=allele_data.annotate(
        **add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    allele_type = (hl.case().when(
        hl.is_snp(ht.alleles[0], ht.alleles[1]),
        'snv').when(hl.is_insertion(ht.alleles[0], ht.alleles[1]),
                    'ins').when(hl.is_deletion(ht.alleles[0], ht.alleles[1]),
                                'del').default('complex'))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == 'mixed'))
    return ht
Example #8
    def test_tdt(self):
        pedigree = hl.Pedigree.read(resource('tdt.fam'))
        tdt_tab = (hl.transmission_disequilibrium_test(
            hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
            pedigree))

        truth = hl.import_table(
            resource('tdt_results.tsv'),
            types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
                   'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
        truth = (truth
            .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                       alleles=[truth.REF, truth.ALT])
            .key_by('locus', 'alleles'))

        if tdt_tab.count() != truth.count():
            self.fail('Result has {} rows but should have {} rows'.format(tdt_tab.count(), truth.count()))

        bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
            .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))

        bad = bad.filter(~(
                (bad.t == bad.T) &
                (bad.u == bad.U) &
                (hl.abs(bad.chi2 - bad.Chi2) < 0.001) &
                (hl.abs(bad.p_value - bad.Pval) < 0.001)))

        if bad.count() != 0:
            bad.order_by(hl.asc(bad.locus)).show()
            self.fail('Found rows in violation of the predicate (see show output)')
Example #9
def main(args):
    ################################

    truthset_table = hl.read_table(args.truthset_table)
    #################################
    group = "raw"
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    #mt = mt.annotate_rows(family_stats=famstats_ht[mt.row_key].family_stats)
    #mt=mt.checkpoint(f'{args.output_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)
    mt = hl.read_matrix_table(f'{args.output_dir}/MegaWESSanger_cohorts_sampleQC_filtered.mt')
    mt = hl.split_multi_hts(
        mt, keep_star=False, left_aligned=False, permit_shuffle=True)
    mt = mt.checkpoint(f'{args.output_dir}/MegaWESSanger_cohorts_sampleQC_filtered_split_multi.mt', overwrite=True)
    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    mt = mt.checkpoint(
        f'{lustre_dir}/Sanger_cohorts_family_stats_gnomad_AF.mt', overwrite=True)
    de_novo_table = hl.de_novo(
        mt, pedigree, mt.gnomad_maf)

    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(
        f'{args.output_dir}/Sanger_cohort_denovo_table.ht', overwrite=True)
Example #10
def read_vcf(dirname: str, basename: str) -> hl.MatrixTable:
    hl._set_flags(no_whole_stage_codegen='1')
    vcf_file = '{}{}.vcf.gz'.format(dirname, basename)
    hl.import_vcf(vcf_file, force_bgz=True,
                  block_size=16).write('{}GWASpy.preimpQC.mt'.format(dirname),
                                       overwrite=True)
    in_mt = hl.read_matrix_table('{}GWASpy.preimpQC.mt'.format(dirname))

    # Unlike array data, a VCF might have multi-allelic sites
    # split multi-allelic sites into bi-allelic
    print("Checking for multi-allelic sites")
    pre_filt_multi_n = in_mt.count_rows()
    bi = in_mt.filter_rows(hl.len(in_mt.alleles) == 2)
    bi = bi.annotate_rows(
        a_index=hl.missing(hl.tint)
    )  # bi-allelic rows were not split, so they carry a missing a_index
    bi = bi.annotate_rows(was_split=False)

    multi = in_mt.filter_rows(hl.len(in_mt.alleles) > 2)
    split = hl.split_multi_hts(multi)

    in_mt = split.union_rows(bi)
    post_filt_multi_n = in_mt.count_rows()
    print("Number of extra rows produced by splitting multi-allelic sites: {}".format(
        post_filt_multi_n - pre_filt_multi_n))

    return in_mt
Example #11
def compute_samocha_denovos(mt, pedigree):
    gnomad_ht = hl.read_table("gs://gnomad-public/release/2.1.1/liftover_grch38/ht/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.ht")
    gnomad_ht = hl.split_multi_hts(gnomad_ht)

    de_novo_priors_ht = gnomad_ht.select(AF=gnomad_ht.freq[gnomad_ht.freq_index_dict["gnomad"]].AF)

    de_novos_ht = hl.de_novo(mt, pedigree, de_novo_priors_ht[mt.row_key].AF)

    de_novos_ht = de_novos_ht.transmute(proband=de_novos_ht.proband.s, father=de_novos_ht.father.s, mother=de_novos_ht.mother.s)

    de_novos_ht = de_novos_ht.annotate(
        proband_AB=de_novos_ht.proband_entry.AD[1] / (de_novos_ht.proband_entry.AD[0] + de_novos_ht.proband_entry.AD[1]),
        proband_DP=de_novos_ht.proband_entry.DP,
        proband_GQ=de_novos_ht.proband_entry.GQ,
        proband_GT=de_novos_ht.proband_entry.GT,
        father_AB=de_novos_ht.father_entry.AD[1] / (de_novos_ht.father_entry.AD[0] + de_novos_ht.father_entry.AD[1]),
        father_DP=de_novos_ht.father_entry.DP,
        father_GQ=de_novos_ht.father_entry.GQ,
        father_GT=de_novos_ht.father_entry.GT,
        mother_AB=de_novos_ht.mother_entry.AD[1] / (de_novos_ht.mother_entry.AD[0] + de_novos_ht.mother_entry.AD[1]),
        mother_DP=de_novos_ht.mother_entry.DP,
        mother_GQ=de_novos_ht.mother_entry.GQ,
        mother_GT=de_novos_ht.mother_entry.GT)

    de_novos_ht = de_novos_ht.drop(de_novos_ht.proband_entry, de_novos_ht.father_entry, de_novos_ht.mother_entry)

    return de_novos_ht
Example #12
    def annotate_old_and_split_multi_hts(self, mt):
        """
        Saves the old allele and locus because while split_multi does this, split_multi_hts drops this. Will see if
        we can add this to split_multi_hts and then this will be deprecated.

        :return: mt that has pre-annotations
        """
        # Named `locus_old` instead of `old_locus` because split_multi_hts drops `old_locus`.
        return hl.split_multi_hts(mt.annotate_rows(locus_old=mt.locus, alleles_old=mt.alleles))
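A hedged usage sketch for the wrapper above, assuming task is an instance of the class defining the method and the input path is hypothetical:

import hail as hl

mt = hl.import_vcf('data/unsplit.vcf')  # hypothetical input
split = task.annotate_old_and_split_multi_hts(mt)
# Each split row keeps its pre-split locus and alleles alongside the
# was_split and a_index fields that split_multi_hts adds.
split.rows().select('locus_old', 'alleles_old', 'was_split', 'a_index').show(5)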
Example #13
def import_vqsr(
    vqsr_path: str,
    vqsr_type: str = "alleleSpecificTrans",
    num_partitions: int = 5000,
    overwrite: bool = False,
    import_header_path: Optional[str] = None,
) -> None:
    """
    Imports a VQSR sites VCF into a Hail Table.

    :param vqsr_path: Path to input VQSR sites VCF. This can be specified as Hadoop glob patterns
    :param vqsr_type: One of `classic`, `alleleSpecific` (allele specific) or `alleleSpecificTrans`
        (allele specific with transmitted singletons)
    :param num_partitions: Number of partitions to use for the VQSR HT
    :param overwrite: Whether to overwrite imported VQSR HT
    :param import_header_path: Optional path to a header file to use for import
    :return: None
    """

    logger.info(f"Importing VQSR annotations for {vqsr_type} VQSR...")
    mt = hl.import_vcf(
        vqsr_path,
        force_bgz=True,
        reference_genome="GRCh38",
        header_file=import_header_path,
    ).repartition(num_partitions)

    ht = mt.rows()

    # AS_* fields are pipe-delimited strings; use raw strings for the regex delimiter.
    ht = ht.annotate(info=ht.info.annotate(
        AS_VQSLOD=ht.info.AS_VQSLOD.map(lambda x: hl.float(x)),
        AS_QUALapprox=ht.info.AS_QUALapprox.split(r"\|")[1:].map(
            lambda x: hl.int(x)),
        AS_VarDP=ht.info.AS_VarDP.split(r"\|")[1:].map(lambda x: hl.int(x)),
        AS_SB_TABLE=ht.info.AS_SB_TABLE.split(r"\|").map(
            lambda x: x.split(",").map(lambda y: hl.int(y))),
    ))

    ht = ht.checkpoint(
        get_vqsr_filters(f"vqsr_{vqsr_type}", split=False,
                         finalized=False).path,
        overwrite=overwrite,
    )

    unsplit_count = ht.count()
    ht = hl.split_multi_hts(ht)

    ht = ht.annotate(
        info=ht.info.annotate(**split_info_annotation(ht.info, ht.a_index)))

    ht = ht.checkpoint(
        get_vqsr_filters(f"vqsr_{vqsr_type}", split=True,
                         finalized=False).path,
        overwrite=overwrite,
    )
    split_count = ht.count()
    logger.info(
        f"Found {unsplit_count} unsplit and {split_count} split variants with VQSR annotations"
    )
Example #14
 def test_fix3307_read_mt_wrong(self):
     mt = hl.import_vcf(resource('sample2.vcf'))
     mt = hl.split_multi_hts(mt)
     mt.write('/tmp/foo.mt', overwrite=True)
     mt2 = hl.read_matrix_table('/tmp/foo.mt')
     t = hl.read_table('/tmp/foo.mt/rows')
     self.assertTrue(mt.rows()._same(t))
     self.assertTrue(mt2.rows()._same(t))
     self.assertTrue(mt._same(mt2))
Example #15
    def test_import_vcf(self):
        vcf = hl.split_multi_hts(
            hl.import_vcf(resource('sample2.vcf'),
                          reference_genome=hl.get_reference('GRCh38'),
                          contig_recoding={"22": "chr22"}))

        vcf_table = vcf.rows()
        self.assertTrue(vcf_table.all(vcf_table.locus.contig == "chr22"))
        self.assertTrue(vcf.locus.dtype, hl.tlocus('GRCh37'))
Example #16
    def test_import_vcf(self):
        vcf = hl.split_multi_hts(
            hl.import_vcf(resource('sample2.vcf'),
                          reference_genome=hl.get_reference('GRCh38'),
                          contig_recoding={"22": "chr22"}))

        vcf_table = vcf.rows()
        self.assertTrue(vcf_table.all(vcf_table.locus.contig == "chr22"))
        self.assertTrue(vcf.locus.dtype, hl.tlocus('GRCh37'))
Example #17
def main(args):
    output_prefix = args.vcf.split(".vcf")[0]

    if args.import_vcf:
        mt = hl.import_vcf(args.vcf, force_bgz=True)
        mt.write(f'{output_prefix}.unsplit.mt', overwrite=args.overwrite)
        mt = hl.read_matrix_table(f'{output_prefix}.unsplit.mt')
        mt = hl.split_multi_hts(mt)
        mt.write(f'{output_prefix}.mt', overwrite=args.overwrite)
Example #18
 def test_fix3307_read_mt_wrong(self):
     mt = hl.import_vcf(resource('sample2.vcf'))
     mt = hl.split_multi_hts(mt)
     mt.write('/tmp/foo.mt', overwrite=True)
     mt2 = hl.read_matrix_table('/tmp/foo.mt')
     t = hl.read_table('/tmp/foo.mt/rows')
     self.assertTrue(mt.rows()._same(t))
     self.assertTrue(mt2.rows()._same(t))
     self.assertTrue(mt._same(mt2))
Example #19
def main(args):
    hl.init()

    data_type = 'genomes' if args.genomes else 'exomes'

    if args.write_hardcalls:
        mt = get_gnomad_data(data_type, split=False, raw=True, meta_root=None)
        ht = hl.read_table(qc_ht_path(data_type, 'hard_filters'))
        mt = annotate_adj(
            mt.select_cols(sex=ht[hl.literal(data_type), mt.s].sex))
        mt = mt.select_entries(GT=hl.case(missing_false=True).when(
            hl.call(mt.PGT[0], mt.PGT[1]) == mt.GT, mt.PGT).default(mt.GT),
                               PID=mt.PID,
                               adj=mt.adj)
        mt = adjust_sex_ploidy(mt, mt.sex)
        mt = mt.select_cols().naive_coalesce(10000)
        mt.write(get_gnomad_data_path(data_type, hardcalls=True, split=False),
                 args.overwrite)

    if args.split_hardcalls:
        mt = get_gnomad_data(data_type, split=False, meta_root=None)
        mt = hl.split_multi_hts(mt)
        mt.write(get_gnomad_data_path(data_type, hardcalls=True, split=True),
                 args.overwrite)

    if args.write_nonrefs:  # CPU-hours: 600 (E)
        mt = get_gnomad_data(data_type, split=False, raw=True,
                             meta_root=None).select_cols()
        mt = mt.annotate_entries(is_missing=hl.is_missing(mt.GT))
        mt = mt.filter_entries(mt.is_missing | mt.GT.is_non_ref())
        mt = annotate_adj(mt)
        if args.exomes:
            mt = mt.naive_coalesce(10000)
        mt.write(
            get_gnomad_data_path(data_type, split=False, non_refs_only=True),
            args.overwrite)

    if args.split_nonrefs:  # CPU-hours: 300 (E)
        mt = get_gnomad_data(data_type, split=False, non_refs_only=True)
        mt = hl.split_multi_hts(mt)
        mt = mt.filter_entries(mt.is_missing | mt.GT.is_non_ref())
        mt.write(
            get_gnomad_data_path(data_type, split=True, non_refs_only=True),
            args.overwrite)
Example #20
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS)
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = hl.split_multi_hts(tob_wgs)
    tob_wgs_path = output_path('tob_wgs_plink')
    hl.export_plink(tob_wgs, tob_wgs_path, ind_id=tob_wgs.s)
Example #21
def import_vcf(vcf_path: str,
               genome_version: str,
               min_partitions: int = None,
               force_bgz: bool = True,
               drop_samples: bool = False,
               skip_invalid_loci: bool = False,
               split_multi_alleles: bool = True):
    """Import vcf and return MatrixTable.

    :param str vcf_path: path of the VCF file to import
    :param str genome_version: "37" or "38"
    :param int min_partitions: min partitions
    :param bool force_bgz: read .gz as a bgzipped file
    :param bool drop_samples: if True, discard genotype info
    :param bool skip_invalid_loci: if True, skip loci that are not consistent with the reference_genome.
    :param bool split_multi_alleles: if True, split multi-allelic variants and re-key rows by their minimal representation.
    """

    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome_version: {genome_version}")

    logger.info(f"\n==> import vcf: {vcf_path}")

    # add (or remove) "chr" prefix from vcf chroms so they match the reference
    ref = hl.get_reference(f"GRCh{genome_version}")
    contig_recoding = {
        **{
            ref_contig.replace("chr", ""): ref_contig
            for ref_contig in ref.contigs if "chr" in ref_contig
        },
        **{
            f"chr{ref_contig}": ref_contig
            for ref_contig in ref.contigs if "chr" not in ref_contig
        }
    }

    mt = hl.import_vcf(vcf_path,
                       reference_genome=f"GRCh{genome_version}",
                       contig_recoding=contig_recoding,
                       min_partitions=min_partitions,
                       force_bgz=force_bgz,
                       drop_samples=drop_samples,
                       skip_invalid_loci=skip_invalid_loci)

    mt = mt.annotate_globals(sourceFilePath=vcf_path,
                             genomeVersion=genome_version)

    mt = mt.annotate_rows(original_alt_alleles=hl.or_missing(
        hl.len(mt.alleles) > 2, get_expr_for_variant_ids(mt.locus,
                                                         mt.alleles)))

    if split_multi_alleles:
        mt = hl.split_multi_hts(mt)
        mt = mt.key_rows_by(**hl.min_rep(mt.locus, mt.alleles))

    return mt
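Re-keying with hl.min_rep after the split normalizes each bi-allelic variant to its minimal representation, so split variants join cleanly against datasets keyed the same way. A usage sketch with hypothetical arguments:

mt = import_vcf('gs://my-bucket/callset.vcf.gz', genome_version='38', min_partitions=500)  # hypothetical path
# original_alt_alleles is non-missing only for rows that came from a multi-allelic site.
mt.rows().select('original_alt_alleles', 'was_split').show(5)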
Example #22
def import_vcf(
        vcf_path: Union[str, List[str]],
        genome_version: str,
        sample_name: str,
        # Exomes VCFs can be split to ~1mb chunks for annotation (good for joins)
        # but they are pretty tiny and I think too big is bad.
        min_partitions: int = 50,
        force_bgz: bool = False,
        drop_samples: bool = False,
        skip_invalid_loci: bool = False,
        split_multi_alleles: bool = True):
    """Import vcf and return MatrixTable.

    :param str vcf_path: path(s) of the VCF file(s) to import
    :param str genome_version: "37" or "38"
    :param str sample_name: sample name
    :param int min_partitions: min partitions
    :param bool force_bgz: read .gz as a bgzipped file
    :param bool drop_samples: if True, discard genotype info
    :param bool skip_invalid_loci: if True, skip loci that are not consistent with the reference_genome.
    :param bool split_multi_alleles: if True, split multi-allelic variants and re-key rows by their minimal representation.
    """
    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome_version: {genome_version}")
    # logger.info(f"\n==> import vcf: {vcf_path}")

    # add (or remove) "chr" prefix from vcf chroms so they match the reference
    ref = hl.get_reference(f"GRCh{genome_version}")
    contig_recoding = {
        **{ref_contig.replace("chr", ""): ref_contig for ref_contig in ref.contigs if "chr" in ref_contig},
        **{f"chr{ref_contig}": ref_contig for ref_contig in ref.contigs if "chr" not in ref_contig}}
    mt = hl.import_vcf(
        vcf_path,
        reference_genome=f"GRCh{genome_version}",
        contig_recoding=contig_recoding,
        min_partitions=min_partitions,
        force_bgz=force_bgz,
        drop_samples=drop_samples,
        skip_invalid_loci=skip_invalid_loci,
        array_elements_required=False)
    valid_chros = {'18', '14', '17', 'Y', '2', '8', 'X', '22', '16', '21', '3', '6', '10', '5', '13', '7', '1', '11', '19', 'MT', '4', '12', '9', '20', '15'}
    mt = mt.filter_rows(hl.literal(valid_chros).contains(mt.locus.contig))
    mt.write("/vep/tmpck1.mt",overwrite=True)
    mt = hl.read_matrix_table("/vep/tmpck1.mt")
    mt = mt.annotate_globals(sourceFilePath=vcf_path, genomeVersion=genome_version)
    mt = mt.annotate_rows(
        originalAltAlleles=hl.or_missing(hl.len(mt.alleles) > 2, get_expr_for_variant_ids(mt.locus, mt.alleles)),
        xpos=get_expr_for_xpos(mt.locus),
        ref=mt.alleles[0])
    if split_multi_alleles:
        mt = hl.split_multi_hts(mt)
        mt = mt.key_rows_by(**hl.min_rep(mt.locus, mt.alleles))
    return mt
Example #23
def genetics_pipeline():
    mt = get_mt()
    mt = hl.split_multi_hts(mt)
    mt = hl.variant_qc(mt)
    mt = hl.sample_qc(mt)
    mt = mt.filter_cols(mt.sample_qc.call_rate > 0.95)
    mt = mt.filter_rows(mt.variant_qc.AC[1] > 5)
    mt = mt.filter_entries(hl.case().when(
        hl.is_indel(mt.alleles[0], mt.alleles[1]),
        mt.GQ > 20).default(mt.GQ > 10))
    mt.write('/tmp/genetics_pipeline.mt', overwrite=True)
Example #24
def main(args):
    ################################

    truthset_table = hl.read_table(args.truthset_table)
    #################################
    group = "raw"

    mt = hl.read_matrix_table(args.matrixtable)
    mt = hl.split_multi_hts(mt,
                            keep_star=False,
                            left_aligned=False,
                            permit_shuffle=True)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWESSanger_cohorts_sampleQC_filtered_autosomes_split.mt',
        overwrite=True)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
    trio_dataset.write(f'{args.output_dir}/MegaWES_trio_table.mt',
                       overwrite=True)

    (ht1, famstats_ht) = generate_family_stats(mt, fam)
    print("Writing mt and family stats_ht")
    ht1.write(f'{args.output_dir}/MegaWES_family_stats.ht', overwrite=True)
    #famstats_ht.write(
    #    f'{args.output_dir}/MegaWES_family_stats.ht', overwrite=True)
    mt = mt.annotate_rows(family_stats=ht1[mt.row_key].family_stats)
    mt = mt.checkpoint(f'{args.output_dir}/MegaWES_family_stats.mt',
                       overwrite=True)
    #(mt1, famstats_ht) = generate_family_stats(mt, fam)
    #print("Writing mt and family stats_ht")
    #mt1.write(f'{tmp_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)
    # famstats_ht.write(
    #    f'{tmp_dir}/Sanger_cohorts_family_stats.ht', overwrite=True)
    #mt = mt.annotate_rows(family_stats=ht[mt.row_key].family_stats)
    #mt.write(f'{tmp_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)

    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    mt = mt.checkpoint(
        f'{lustre_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt',
        overwrite=True)
    #mt = hl.split_multi_hts(mt, keep_star=False, left_aligned=False, permit_shuffle=True)

    de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf)

    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(
        f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht',
        overwrite=True)
Example #25
def run_hail_sample_qc(mt: hl.MatrixTable, data_type: str) -> hl.Table:
    """
    Runs Hail's built-in sample qc function on the MatrixTable. Splits the MatrixTable in order to calculate the
    inbreeding coefficient and annotates the result back onto the original MatrixTable. Applies flags by population
    and platform groups.

    :param MatrixTable mt: QC MatrixTable
    :param str data_type: WGS or WES for write path
    :return: Table of Hail's sample qc metrics with pop and platform outlier flags
    :rtype: Table
    """
    mt = mt.select_entries(mt.GT)
    mt = filter_to_autosomes(mt)
    mt = hl.split_multi_hts(mt)
    mt = hl.sample_qc(mt)
    mt = mt.annotate_cols(
        sample_qc=mt.sample_qc.annotate(
            f_inbreeding=hl.agg.inbreeding(mt.GT, mt.info.AF[0])
        )
    )
    mt = mt.annotate_cols(idx=mt.qc_pop + "_" + hl.str(mt.qc_platform))

    sample_qc = [
        "n_snp",
        "r_ti_tv",
        "r_insertion_deletion",
        "n_insertion",
        "n_deletion",
        "r_het_hom_var",
    ]
    if data_type == "WGS":
        sample_qc = sample_qc + ["call_rate"]

    strat_ht = mt.cols()
    qc_metrics = {metric: strat_ht.sample_qc[metric] for metric in sample_qc}
    strata = {"qc_pop": strat_ht.qc_pop, "qc_platform": strat_ht.qc_platform}

    metric_ht = compute_stratified_metrics_filter(strat_ht, qc_metrics, strata)
    checkpoint_pass = metric_ht.aggregate(
        hl.agg.count_where(hl.len(metric_ht.qc_metrics_filters) == 0)
    )
    logger.info(
        "%i samples found passing pop/platform-specific filtering", checkpoint_pass
    )
    checkpoint_fail = metric_ht.aggregate(
        hl.agg.count_where(hl.len(metric_ht.qc_metrics_filters) != 0)
    )
    logger.info(
        "%i samples found failing pop/platform-specific filtering", checkpoint_fail
    )
    metric_ht = metric_ht.annotate(sample_qc=mt.cols()[metric_ht.key].sample_qc)
    return metric_ht
Example #26
def run_vep(vep_version: str = "101") -> hl.Table:
    """
    Returns a table with a VEP annotation for each variant in the raw MatrixTable.

    :param vep_version: Version of VEPed context Table to use in `vep_or_lookup_vep`
    :return: VEPed Table
    """
    ht = get_gnomad_v3_mt(key_by_locus_and_alleles=True, remove_hard_filtered_samples=False).rows()
    ht = ht.filter(hl.len(ht.alleles) > 1)
    ht = hl.split_multi_hts(ht)
    ht = vep_or_lookup_vep(ht, vep_version=vep_version)
    ht = ht.annotate_globals(version=f'v{vep_version}')

    return ht
Example #27
    def test_import_plink(self):
        vcf = hl.split_multi_hts(
            hl.import_vcf(resource('sample2.vcf'),
                          reference_genome=hl.get_reference('GRCh38'),
                          contig_recoding={"22": "chr22"}))

        hl.export_plink(vcf, '/tmp/sample_plink')

        bfile = '/tmp/sample_plink'
        plink = hl.import_plink(
            bfile + '.bed', bfile + '.bim', bfile + '.fam',
            a2_reference=True,
            contig_recoding={'chr22': '22'},
            reference_genome='GRCh37').rows()
        self.assertTrue(plink.all(plink.locus.contig == "22"))
        self.assertEqual(vcf.count_rows(), plink.count())
        self.assertTrue(plink.locus.dtype, hl.tlocus('GRCh37'))
Example #28
    def run(self):
        mt = self.import_vcf()
        mt = hl.split_multi_hts(mt)
        if self.validate:
            self.validate_mt(mt, self.genome_version, self.sample_type)
        mt = HailMatrixTableTask.run_vep(mt, self.genome_version,
                                         self.vep_runner)
        # We're now adding ref data.
        ref_data = hl.read_table(self.reference_ht_path)
        clinvar = hl.read_table(self.clinvar_ht_path)
        hgmd = hl.read_table(self.hgmd_ht_path)

        mt = SeqrVariantSchema(
            mt, ref_data=ref_data, clinvar_data=clinvar,
            hgmd_data=hgmd).annotate_all(overwrite=True).select_annotated_mt()

        mt.write(self.output().path, stage_locally=True)
Example #29
    def test_import_plink_contig_recoding_w_reference(self):
        vcf = hl.split_multi_hts(
            hl.import_vcf(resource('sample2.vcf'),
                          reference_genome=hl.get_reference('GRCh38'),
                          contig_recoding={"22": "chr22"}))

        hl.export_plink(vcf, '/tmp/sample_plink')

        bfile = '/tmp/sample_plink'
        plink = hl.import_plink(
            bfile + '.bed', bfile + '.bim', bfile + '.fam',
            a2_reference=True,
            contig_recoding={'chr22': '22'},
            reference_genome='GRCh37').rows()
        self.assertTrue(plink.all(plink.locus.contig == "22"))
        self.assertEqual(vcf.count_rows(), plink.count())
        self.assertTrue(plink.locus.dtype, hl.tlocus('GRCh37'))
Example #30
def generate_de_novos(mt: hl.MatrixTable, fam_file: str, freq_data: hl.Table) -> hl.Table:
    mt = mt.select_cols()
    fam_ht = read_fam(fam_file).key_by()
    fam_ht = fam_ht.select(
        s=[fam_ht.s, fam_ht.pat_id, fam_ht.mat_id]).explode('s').key_by('s')
    mt = mt.filter_cols(hl.is_defined(fam_ht[mt.s]))
    mt = mt.select_rows()
    mt = hl.split_multi_hts(mt)
    mt = mt.annotate_rows(family_stats=freq_data[mt.row_key].family_stats)
    ped = hl.Pedigree.read(fam_file, delimiter='\\t')

    de_novo_table = hl.de_novo(
        mt, ped, mt.family_stats[0].unrelated_qc_callstats.AF[1])
    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')

    return de_novo_table
Example #31
    def test_export_plink(self):
        vcf_file = resource('sample.vcf')
        mt = hl.split_multi_hts(hl.import_vcf(vcf_file, min_partitions=10))

        # permute columns so not in alphabetical order!
        import random
        indices = list(range(mt.count_cols()))
        random.shuffle(indices)
        mt = mt.choose_cols(indices)

        split_vcf_file = uri_path(new_temp_file())
        hl_output = uri_path(new_temp_file())
        plink_output = uri_path(new_temp_file())
        merge_output = uri_path(new_temp_file())

        hl.export_vcf(mt, split_vcf_file)
        hl.export_plink(mt, hl_output)

        run_command(["plink", "--vcf", split_vcf_file,
                     "--make-bed", "--out", plink_output,
                     "--const-fid", "--keep-allele-order"])

        data = []
        with open(uri_path(plink_output + ".bim")) as file:
            for line in file:
                row = line.strip().split()
                row[1] = ":".join([row[0], row[3], row[5], row[4]])
                data.append("\t".join(row) + "\n")

        with open(plink_output + ".bim", 'w') as f:
            f.writelines(data)

        run_command(["plink", "--bfile", plink_output,
                     "--bmerge", hl_output, "--merge-mode",
                     "6", "--out", merge_output])

        same = True
        with open(merge_output + ".diff") as f:
            for line in f:
                row = line.strip().split()
                if row != ["SNP", "FID", "IID", "NEW", "OLD"]:
                    same = False
                    break

        self.assertTrue(same)
Example #32
def generate_split_alleles(mt: hl.MatrixTable) -> hl.MatrixTable:

    allele_data = hl.struct(nonsplit_alleles=mt.alleles,
                            has_star=hl.any(lambda a: a == '*', mt.alleles))

    mt = mt.annotate_rows(allele_data=allele_data.annotate(
        **add_variant_type(mt.alleles)))
    mt = hl.split_multi_hts(mt, left_aligned=True)

    allele_type = (hl.case().when(
        hl.is_snp(mt.alleles[0], mt.alleles[1]),
        'snv').when(hl.is_insertion(mt.alleles[0], mt.alleles[1]),
                    'ins').when(hl.is_deletion(mt.alleles[0], mt.alleles[1]),
                                'del').default('complex'))
    mt = mt.annotate_rows(allele_data=mt.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=mt.allele_data.variant_type == 'mixed'))
    return mt
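Unlike the other generate_allele_data variants in this list, this one passes left_aligned=True, which lets split_multi_hts assume every variant is already left aligned (avoiding a shuffle) but raises an error if that assumption is violated. A minimal sketch of the safer default, assuming mt may contain non-left-aligned variants:

import hail as hl

mt = hl.split_multi_hts(mt, left_aligned=False)  # default: handles non-left-aligned input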
Example #33
    def test_skat(self):
        ds2 = hl.import_vcf(resource('sample2.vcf'))

        covariatesSkat = (hl.import_table(resource("skat.cov"), impute=True)
            .key_by("Sample"))

        phenotypesSkat = (hl.import_table(resource("skat.pheno"),
                                          types={"Pheno": hl.tfloat64},
                                          missing="0")
            .key_by("Sample"))

        intervalsSkat = (hl.import_locus_intervals(resource("skat.interval_list")))

        weightsSkat = (hl.import_table(resource("skat.weights"),
                                       types={"locus": hl.tlocus(),
                                              "weight": hl.tfloat64})
            .key_by("locus"))

        ds = hl.split_multi_hts(ds2)
        ds = ds.annotate_rows(gene=intervalsSkat[ds.locus],
                              weight=weightsSkat[ds.locus].weight)
        ds = ds.annotate_cols(pheno=phenotypesSkat[ds.s].Pheno,
                              cov=covariatesSkat[ds.s])
        ds = ds.annotate_cols(pheno=hl.cond(ds.pheno == 1.0,
                                            False,
                                            hl.cond(ds.pheno == 2.0,
                                                    True,
                                                    hl.null(hl.tbool))))

        hl.skat(ds,
                key_expr=ds.gene,
                weight_expr=ds.weight,
                y=ds.pheno,
                x=ds.GT.n_alt_alleles(),
                covariates=[ds.cov.Cov1, ds.cov.Cov2],
                logistic=False).count()

        hl.skat(ds,
                key_expr=ds.gene,
                weight_expr=ds.weight,
                y=ds.pheno,
                x=hl.pl_dosage(ds.PL),
                covariates=[ds.cov.Cov1, ds.cov.Cov2],
                logistic=True).count()
Example #34
def load_files(file_prefix, overwrite, gencove, mt):
    """
    loads VCFs, runs sample QC and variant QC, writes a matrix table
    :param file_prefix: path prefix for the input VCF or MatrixTable
    :param overwrite: whether to overwrite the output MatrixTable
    :param gencove: if True, read an existing GRCh38 MatrixTable instead of importing a VCF
    :param mt: unused
    :return:
    """
    if gencove:
        ngap_downsample = hl.read_matrix_table(file_prefix + '_grch38.mt')
    else:
        ngap_downsample = hl.import_vcf(file_prefix + '.vcf.gz',
                                        force_bgz=True,
                                        reference_genome='GRCh38',
                                        min_partitions=200)
        ngap_downsample = hl.split_multi_hts(ngap_downsample)
    ngap_downsample = ngap_downsample.filter_cols(
        (ngap_downsample.s != 'NGE0018') & (ngap_downsample.s != 'NGE0130'))
    ngap_sample_qc = hl.sample_qc(ngap_downsample)
    ngap_sample_variant_qc = hl.variant_qc(ngap_sample_qc)
    ngap_sample_variant_qc.write(file_prefix + '.mt', overwrite=overwrite)
Example #35
    def test_export_plink(self):
        vcf_file = resource('sample.vcf')
        mt = hl.split_multi_hts(hl.import_vcf(vcf_file, min_partitions=10))

        split_vcf_file = uri_path(new_temp_file())
        hl_output = uri_path(new_temp_file())
        plink_output = uri_path(new_temp_file())
        merge_output = uri_path(new_temp_file())

        hl.export_vcf(mt, split_vcf_file)
        hl.export_plink(mt, hl_output)

        run_command([
            "plink", "--vcf", split_vcf_file, "--make-bed", "--out",
            plink_output, "--const-fid", "--keep-allele-order"
        ])

        data = []
        with open(uri_path(plink_output + ".bim")) as file:
            for line in file:
                row = line.strip().split()
                row[1] = ":".join([row[0], row[3], row[5], row[4]])
                data.append("\t".join(row) + "\n")

        with open(plink_output + ".bim", 'w') as f:
            f.writelines(data)

        run_command([
            "plink", "--bfile", plink_output, "--bmerge", hl_output,
            "--merge-mode", "6", "--out", merge_output
        ])

        same = True
        with open(merge_output + ".diff") as f:
            for line in f:
                row = line.strip().split()
                if row != ["SNP", "FID", "IID", "NEW", "OLD"]:
                    same = False
                    break

        self.assertTrue(same)
Example #36
ds = ds.annotate_cols(is_case=True,
                      pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                      is_female=hl.rand_bool(0.5),
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
                      cov2=hl.rand_norm(0, 1),
                      cohort="SIGMA")
ds = ds.annotate_globals(global_field_1=5,
                         global_field_2=10,
                         pli={'SCN1A': 0.999, 'SONIC': 0.014},
                         populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
ds = ds.annotate_rows(gene=['TTN'])
ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
ds.write('data/example.vds', overwrite=True)

lmmreg_ds = hl.variant_qc(hl.split_multi_hts(hl.import_vcf('data/sample.vcf.bgz')))
lmmreg_tsv = hl.import_table('data/example_lmmreg.tsv', 'Sample', impute=True)
lmmreg_ds = lmmreg_ds.annotate_cols(**lmmreg_tsv[lmmreg_ds['s']])
lmmreg_ds = lmmreg_ds.annotate_rows(use_in_kinship = lmmreg_ds.variant_qc.AF[1] > 0.05)
lmmreg_ds.write('data/example_lmmreg.vds', overwrite=True)

burden_ds = hl.import_vcf('data/example_burden.vcf')
burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
burden_ds = burden_ds.annotate_cols(burden = burden_kt[burden_ds.s])
burden_ds = burden_ds.annotate_rows(weight = hl.float64(burden_ds.locus.position))
burden_ds = hl.variant_qc(burden_ds)
genekt = hl.import_locus_intervals('data/gene.interval_list')
burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
burden_ds.write('data/example_burden.vds', overwrite=True)
Example #37
def get_dataset():
    global _dataset
    if _dataset is None:
        _dataset = hail.split_multi_hts(hail.import_vcf(resource('sample.vcf'))).cache()
    return _dataset
Example #38
def split_multi_hts():
    mt = hl.read_matrix_table(resource('profile.mt'))
    hl.split_multi_hts(mt)._force_count_rows()