Example #1
    def test_split_multi_hts(self):
        ds1 = hl.import_vcf(resource('split_test.vcf'))
        ds1 = hl.split_multi_hts(ds1)
        ds2 = hl.import_vcf(resource('split_test_b.vcf'))
        df = ds1.rows()
        self.assertTrue(df.all((df.locus.position == 1180) | df.was_split))
        ds1 = ds1.drop('was_split', 'a_index')
        self.assertTrue(ds1._same(ds2))

        ds = self.get_dataset()
        ds = ds.annotate_entries(X=ds.GT)
        with self.assertRaises(utils.FatalError):
            hl.split_multi_hts(ds)
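A note on the expected failure above: split_multi_hts only knows how to split the standard HTS entry schema (GT, AD, DP, GQ, PL, plus PGT and PID), so the extra entry field X is what triggers the FatalError. A minimal workaround sketch, assuming ds is the MatrixTable from the test:

import hail as hl

# Either drop the non-HTS entry field before splitting...
ds = ds.drop('X')
ds = hl.split_multi_hts(ds)
# ...or call hl.split_multi(ds) instead, which splits only row fields and
# leaves custom entry fields (to be downcoded via ds.a_index) to the caller.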
Example #2
def generate_allele_data(ht: hl.Table) -> hl.Table:
    """
    Returns bi-allelic sites HT with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, and n_alt_alleles)

    :param Table ht: Full unsplit HT
    :return: Table with allele data annotations
    :rtype: Table
    """
    ht = ht.select()
    allele_data = hl.struct(nonsplit_alleles=ht.alleles,
                            has_star=hl.any(lambda a: a == "*", ht.alleles))
    ht = ht.annotate(allele_data=allele_data.annotate(
        **add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    ht = ht.filter(hl.len(ht.alleles) > 1)
    allele_type = (hl.case().when(
        hl.is_snp(ht.alleles[0], ht.alleles[1]),
        "snv").when(hl.is_insertion(ht.alleles[0], ht.alleles[1]),
                    "ins").when(hl.is_deletion(ht.alleles[0], ht.alleles[1]),
                                "del").default("complex"))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == "mixed"))
    return ht
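A brief usage sketch for the function above; the input path is hypothetical, and add_variant_type is assumed to be importable from the surrounding project:

import hail as hl

ht = hl.read_table('data/sites.ht')  # hypothetical unsplit sites Table
ht = generate_allele_data(ht)
# Each row is now a split, bi-allelic variant that keeps its pre-split context.
ht.select(allele_type=ht.allele_data.allele_type,
          was_mixed=ht.allele_data.was_mixed).show(5)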
Example #3
def main(args):

    hl.init(log='/liftover.log')

    if args.gnomad:
        gnomad = True
        path = None

        if args.exomes:
            data_type = 'exomes'
        if args.genomes:
            data_type = 'genomes'

        logger.info('Working on gnomAD {} release ht'.format(data_type))
        logger.info('Reading in release ht')
        t = public_release(data_type).ht()
        logger.info('Variants in release ht: {}'.format(t.count()))

    else:
        data_type = None
        gnomad = False

        if args.ht:
            path = args.ht
            t = hl.read_table(args.ht)
        if args.mt:
            path = args.mt
            t = hl.read_matrix_table(args.mt)

    logger.info('Checking if input data has been split')
    if 'was_split' not in t.row:
        t = hl.split_multi(t) if isinstance(
            t, hl.Table) else hl.split_multi_hts(t)

    logger.info('Preparing reference genomes for liftover')
    source, target = get_liftover_genome(t)

    if args.test:
        logger.info('Filtering to chr21 for testing')
        if source.name == 'GRCh38':
            contig = 'chr21'
        else:
            contig = '21'
        t = hl.filter_intervals(
            t, [hl.parse_locus_interval(contig, reference_genome=source.name)])

    logger.info(f'Lifting data to {target.name}')
    t = lift_data(t, gnomad, data_type, path, target, args.overwrite)

    logger.info('Checking SNPs for reference mismatches')
    t = annotate_snp_mismatch(t, data_type, target)

    mismatch = check_mismatch(t) if isinstance(
        t, hl.Table) else check_mismatch(t.rows())
    logger.info('{} total SNPs'.format(mismatch['total_variants']))
    logger.info('{} SNPs on minus strand'.format(mismatch['negative_strand']))
    logger.info('{} reference mismatches in SNPs'.format(
        mismatch['total_mismatch']))
    logger.info('{} mismatches on minus strand'.format(
        mismatch['negative_strand_mismatch']))
Example #4
def vcf_to_mt(path, genome_version):
    '''
    Converts the 1kg VCF to a MatrixTable. The 1kg dataset has multi-allelic variants and duplicates.
    This function splits the multi-allelics separately, then unions the result with
    the bi-allelics.

    :param path: vcf path
    :param genome_version: genome version
    :return:
    '''
    # Import but do not split multis here.
    mt = import_vcf(path,
                    genome_version=genome_version,
                    min_partitions=1000,
                    split_multi_alleles=False)

    multiallelic_mt = mt.filter_rows(hl.len(mt.alleles) > 2)
    multiallelic_mt = hl.split_multi_hts(multiallelic_mt)

    # We annotate some rows manually to conform to the multiallelic_mt (after split).
    # Calling split_multi_hts on biallelic to annotate the rows causes problems.
    biallelic_mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    biallelic_mt = biallelic_mt.annotate_rows(a_index=1, was_split=False)

    all_mt = biallelic_mt.union_rows(multiallelic_mt)
    all_mt = all_mt.key_rows_by(all_mt.locus, all_mt.alleles)

    # GRCh37 is known to have some unneeded symbolic alleles, so we filter them out.
    all_mt = all_mt.filter_rows(hl.allele_type(
        all_mt.alleles[0], all_mt.alleles[1]) == 'Symbolic',
                                keep=False)

    return all_mt
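Splitting only the multi-allelic rows keeps split_multi_hts away from the bi-allelic rows, where it would add nothing but constant was_split/a_index fields; those are annotated by hand above instead. A usage sketch with a hypothetical path:

mt = vcf_to_mt('gs://my-bucket/1kg_phase3.vcf.bgz', genome_version='37')  # hypothetical path
print(mt.count_rows())  # rows are all bi-allelic after the split-and-union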
Example #5
    def split_filter_and_flatten_ht(
            truth_mt: hl.MatrixTable,
            high_confidence_intervals_ht: hl.Table) -> hl.Table:
        """
        Split a truth sample MT, filter it to the given high confidence intervals, and then "flatten" it as a HT by annotating GT in a row field.

        :param truth_mt: Truth sample MT
        :param high_confidence_intervals_ht: High confidence intervals
        :return: Truth sample table with GT as a row annotation
        """
        assert truth_mt.count_cols() == 1

        if not "was_split" in truth_mt.row:
            truth_mt = hl.split_multi_hts(truth_mt)

        truth_mt = truth_mt.filter_rows(
            hl.is_defined(high_confidence_intervals_ht[truth_mt.locus]))
        rename_entries = {"GT": "_GT"}
        if "adj" in truth_mt.entry:
            rename_entries.update({"adj": "_adj"})

        truth_mt = truth_mt.rename(rename_entries)
        return truth_mt.annotate_rows(
            **
            {x: hl.agg.take(truth_mt[f"_{x}"], 1)[0]
             for x in rename_entries}).rows()
Example #6
    def test_tdt(self):
        pedigree = hl.Pedigree.read(resource('tdt.fam'))
        tdt_tab = (hl.transmission_disequilibrium_test(
            hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
            pedigree))

        truth = hl.import_table(
            resource('tdt_results.tsv'),
            types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
                   'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
        truth = (truth
                 .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                            alleles=[truth.REF, truth.ALT])
                 .key_by('locus', 'alleles'))

        if tdt_tab.count() != truth.count():
            self.fail('Result has {} rows but should have {} rows'.format(tdt_tab.count(), truth.count()))

        bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
               .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
        bad.describe()

        bad = bad.filter(~(
                (bad.t == bad.T) &
                (bad.u == bad.U) &
                (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
                (hl.abs(bad.p_value - bad.Pval) < 0.001)))

        if bad.count() != 0:
            bad.order_by(hl.asc(bad.locus)).show()
            self.fail('Found rows in violation of the predicate (see show output)')
Example #7
def generate_allele_data(mt: hl.MatrixTable) -> hl.Table:
    """
    Returns bi-allelic sites HT with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, and n_alt_alleles)

    :param MatrixTable mt: Full unsplit MT
    :return: Table with allele data annotations
    :rtype: Table
    """
    ht = mt.rows().select()
    allele_data = hl.struct(nonsplit_alleles=ht.alleles,
                            has_star=hl.any(lambda a: a == '*', ht.alleles))
    ht = ht.annotate(allele_data=allele_data.annotate(
        **add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    allele_type = (hl.case().when(
        hl.is_snp(ht.alleles[0], ht.alleles[1]),
        'snv').when(hl.is_insertion(ht.alleles[0], ht.alleles[1]),
                    'ins').when(hl.is_deletion(ht.alleles[0], ht.alleles[1]),
                                'del').default('complex'))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == 'mixed'))
    return ht
Example #8
    def test_tdt(self):
        pedigree = hl.Pedigree.read(resource('tdt.fam'))
        tdt_tab = (hl.transmission_disequilibrium_test(
            hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
            pedigree))

        truth = hl.import_table(
            resource('tdt_results.tsv'),
            types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
                   'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
        truth = (truth
            .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                       alleles=[truth.REF, truth.ALT])
            .key_by('locus', 'alleles'))

        if tdt_tab.count() != truth.count():
            self.fail('Result has {} rows but should have {} rows'.format(tdt_tab.count(), truth.count()))

        bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
            .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))

        bad = bad.filter(~(
                (bad.t == bad.T) &
                (bad.u == bad.U) &
                (hl.abs(bad.chi2 - bad.Chi2) < 0.001) &
                (hl.abs(bad.p_value - bad.Pval) < 0.001)))

        if bad.count() != 0:
            bad.order_by(hl.asc(bad.locus)).show()
            self.fail('Found rows in violation of the predicate (see show output)')
Example #9
def main(args):
    ################################

    truthset_table = hl.read_table(args.truthset_table)
    #################################
    group = "raw"
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    #mt = mt.annotate_rows(family_stats=famstats_ht[mt.row_key].family_stats)
    #mt=mt.checkpoint(f'{args.output_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)
    mt = hl.read_matrix_table(f'{args.output_dir}/MegaWESSanger_cohorts_sampleQC_filtered.mt')
    mt = hl.split_multi_hts(
        mt, keep_star=False, left_aligned=False, permit_shuffle=True)
    mt = mt.checkpoint(f'{args.output_dir}/MegaWESSanger_cohorts_sampleQC_filtered_split_multi.mt', overwrite=True)
    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    mt = mt.checkpoint(
        f'{lustre_dir}/Sanger_cohorts_family_stats_gnomad_AF.mt', overwrite=True)
    de_novo_table = hl.de_novo(
        mt, pedigree, mt.gnomad_maf)

    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(
        f'{args.output_dir}/Sanger_cohort_denovo_table.ht', overwrite=True)
Example #10
def read_vcf(dirname: str, basename: str) -> hl.MatrixTable:
    hl._set_flags(no_whole_stage_codegen='1')
    vcf_file = '{}{}.vcf.gz'.format(dirname, basename)
    hl.import_vcf(vcf_file, force_bgz=True,
                  block_size=16).write('{}GWASpy.preimpQC.mt'.format(dirname),
                                       overwrite=True)
    in_mt = hl.read_matrix_table('{}GWASpy.preimpQC.mt'.format(dirname))

    # Unlike array data, a VCF might have multi-allelic sites
    # split multi-allelic sites into bi-allelic
    print("Checking for multi-allelic sites")
    pre_filt_multi_n = in_mt.count_rows()
    bi = in_mt.filter_rows(hl.len(in_mt.alleles) == 2)
    bi = bi.annotate_rows(
        a_index=hl.missing(hl.tint)
    )  # bi-allelic rows were not split, so they carry a missing a_index
    bi = bi.annotate_rows(was_split=False)

    multi = in_mt.filter_rows(hl.len(in_mt.alleles) > 2)
    split = hl.split_multi_hts(multi)

    in_mt = split.union_rows(bi)
    post_filt_multi_n = in_mt.count_rows()
    print("Number of extra rows produced by splitting multi-allelic sites: {}".format(
        post_filt_multi_n - pre_filt_multi_n))

    return in_mt
Example #11
def compute_samocha_denovos(mt, pedigree):
    gnomad_ht = hl.read_table("gs://gnomad-public/release/2.1.1/liftover_grch38/ht/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.ht")
    gnomad_ht = hl.split_multi_hts(gnomad_ht)

    de_novo_priors_ht = gnomad_ht.select(AF=gnomad_ht.freq[gnomad_ht.freq_index_dict["gnomad"]].AF)

    de_novos_ht = hl.de_novo(mt, pedigree, de_novo_priors_ht[mt.row_key].AF)

    de_novos_ht = de_novos_ht.transmute(proband=de_novos_ht.proband.s, father=de_novos_ht.father.s, mother=de_novos_ht.mother.s)

    de_novos_ht = de_novos_ht.annotate(
        proband_AB=de_novos_ht.proband_entry.AD[1] / (de_novos_ht.proband_entry.AD[0] + de_novos_ht.proband_entry.AD[1]),
        proband_DP=de_novos_ht.proband_entry.DP,
        proband_GQ=de_novos_ht.proband_entry.GQ,
        proband_GT=de_novos_ht.proband_entry.GT,
        father_AB=de_novos_ht.father_entry.AD[1] / (de_novos_ht.father_entry.AD[0] + de_novos_ht.father_entry.AD[1]),
        father_DP=de_novos_ht.father_entry.DP,
        father_GQ=de_novos_ht.father_entry.GQ,
        father_GT=de_novos_ht.father_entry.GT,
        mother_AB=de_novos_ht.mother_entry.AD[1] / (de_novos_ht.mother_entry.AD[0] + de_novos_ht.mother_entry.AD[1]),
        mother_DP=de_novos_ht.mother_entry.DP,
        mother_GQ=de_novos_ht.mother_entry.GQ,
        mother_GT=de_novos_ht.mother_entry.GT)

    de_novos_ht = de_novos_ht.drop(de_novos_ht.proband_entry, de_novos_ht.father_entry, de_novos_ht.mother_entry)

    return de_novos_ht
Example #12
    def annotate_old_and_split_multi_hts(self, mt):
        """
        Saves the old allele and locus because while split_multi does this, split_multi_hts drops this. Will see if
        we can add this to split_multi_hts and then this will be deprecated.

        :return: mt that has pre-annotations
        """
        # Named `locus_old` instead of `old_locus` because split_multi_hts drops `old_locus`.
        return hl.split_multi_hts(mt.annotate_rows(locus_old=mt.locus, alleles_old=mt.alleles))
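A hedged usage sketch for the wrapper above, assuming task is an instance of the class defining the method and the input path is hypothetical:

import hail as hl

mt = hl.import_vcf('data/unsplit.vcf')  # hypothetical input
split = task.annotate_old_and_split_multi_hts(mt)
# Each split row keeps its pre-split locus and alleles alongside the
# was_split and a_index fields that split_multi_hts adds.
split.rows().select('locus_old', 'alleles_old', 'was_split', 'a_index').show(5)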
Example #13
def import_vqsr(
    vqsr_path: str,
    vqsr_type: str = "alleleSpecificTrans",
    num_partitions: int = 5000,
    overwrite: bool = False,
    import_header_path: Optional[str] = None,
) -> None:
    """
    Imports a VQSR sites VCF into a Hail Table.

    :param vqsr_path: Path to input VQSR sites VCF. This can be specified as Hadoop glob patterns
    :param vqsr_type: One of `classic`, `alleleSpecific` (allele specific) or `alleleSpecificTrans`
        (allele specific with transmitted singletons)
    :param num_partitions: Number of partitions to use for the VQSR HT
    :param overwrite: Whether to overwrite imported VQSR HT
    :param import_header_path: Optional path to a header file to use for import
    :return: None
    """

    logger.info(f"Importing VQSR annotations for {vqsr_type} VQSR...")
    mt = hl.import_vcf(
        vqsr_path,
        force_bgz=True,
        reference_genome="GRCh38",
        header_file=import_header_path,
    ).repartition(num_partitions)

    ht = mt.rows()

    # AS_* fields are pipe-delimited strings; use raw strings for the regex delimiter.
    ht = ht.annotate(info=ht.info.annotate(
        AS_VQSLOD=ht.info.AS_VQSLOD.map(lambda x: hl.float(x)),
        AS_QUALapprox=ht.info.AS_QUALapprox.split(r"\|")[1:].map(
            lambda x: hl.int(x)),
        AS_VarDP=ht.info.AS_VarDP.split(r"\|")[1:].map(lambda x: hl.int(x)),
        AS_SB_TABLE=ht.info.AS_SB_TABLE.split(r"\|").map(
            lambda x: x.split(",").map(lambda y: hl.int(y))),
    ))

    ht = ht.checkpoint(
        get_vqsr_filters(f"vqsr_{vqsr_type}", split=False,
                         finalized=False).path,
        overwrite=overwrite,
    )

    unsplit_count = ht.count()
    ht = hl.split_multi_hts(ht)

    ht = ht.annotate(
        info=ht.info.annotate(**split_info_annotation(ht.info, ht.a_index)))

    ht = ht.checkpoint(
        get_vqsr_filters(f"vqsr_{vqsr_type}", split=True,
                         finalized=False).path,
        overwrite=overwrite,
    )
    split_count = ht.count()
    logger.info(
        f"Found {unsplit_count} unsplit and {split_count} split variants with VQSR annotations"
    )
Example #14
 def test_fix3307_read_mt_wrong(self):
     mt = hl.import_vcf(resource('sample2.vcf'))
     mt = hl.split_multi_hts(mt)
     mt.write('/tmp/foo.mt', overwrite=True)
     mt2 = hl.read_matrix_table('/tmp/foo.mt')
     t = hl.read_table('/tmp/foo.mt/rows')
     self.assertTrue(mt.rows()._same(t))
     self.assertTrue(mt2.rows()._same(t))
     self.assertTrue(mt._same(mt2))
Example #15
    def test_import_vcf(self):
        vcf = hl.split_multi_hts(
            hl.import_vcf(resource('sample2.vcf'),
                          reference_genome=hl.get_reference('GRCh38'),
                          contig_recoding={"22": "chr22"}))

        vcf_table = vcf.rows()
        self.assertTrue(vcf_table.all(vcf_table.locus.contig == "chr22"))
        self.assertTrue(vcf.locus.dtype, hl.tlocus('GRCh37'))
Example #16
    def test_import_vcf(self):
        vcf = hl.split_multi_hts(
            hl.import_vcf(resource('sample2.vcf'),
                          reference_genome=hl.get_reference('GRCh38'),
                          contig_recoding={"22": "chr22"}))

        vcf_table = vcf.rows()
        self.assertTrue(vcf_table.all(vcf_table.locus.contig == "chr22"))
        self.assertTrue(vcf.locus.dtype, hl.tlocus('GRCh37'))
Example #17
def main(args):
    output_prefix = args.vcf.split(".vcf")[0]

    if args.import_vcf:
        mt = hl.import_vcf(args.vcf, force_bgz=True)
        mt.write(f'{output_prefix}.unsplit.mt', overwrite=args.overwrite)
        mt = hl.read_matrix_table(f'{output_prefix}.unsplit.mt')
        mt = hl.split_multi_hts(mt)
        mt.write(f'{output_prefix}.mt', overwrite=args.overwrite)
Example #18
 def test_fix3307_read_mt_wrong(self):
     mt = hl.import_vcf(resource('sample2.vcf'))
     mt = hl.split_multi_hts(mt)
     mt.write('/tmp/foo.mt', overwrite=True)
     mt2 = hl.read_matrix_table('/tmp/foo.mt')
     t = hl.read_table('/tmp/foo.mt/rows')
     self.assertTrue(mt.rows()._same(t))
     self.assertTrue(mt2.rows()._same(t))
     self.assertTrue(mt._same(mt2))
Example #19
def main(args):
    hl.init()

    data_type = 'genomes' if args.genomes else 'exomes'

    if args.write_hardcalls:
        mt = get_gnomad_data(data_type, split=False, raw=True, meta_root=None)
        ht = hl.read_table(qc_ht_path(data_type, 'hard_filters'))
        mt = annotate_adj(
            mt.select_cols(sex=ht[hl.literal(data_type), mt.s].sex))
        mt = mt.select_entries(GT=hl.case(missing_false=True).when(
            hl.call(mt.PGT[0], mt.PGT[1]) == mt.GT, mt.PGT).default(mt.GT),
                               PID=mt.PID,
                               adj=mt.adj)
        mt = adjust_sex_ploidy(mt, mt.sex)
        mt = mt.select_cols().naive_coalesce(10000)
        mt.write(get_gnomad_data_path(data_type, hardcalls=True, split=False),
                 args.overwrite)

    if args.split_hardcalls:
        mt = get_gnomad_data(data_type, split=False, meta_root=None)
        mt = hl.split_multi_hts(mt)
        mt.write(get_gnomad_data_path(data_type, hardcalls=True, split=True),
                 args.overwrite)

    if args.write_nonrefs:  # CPU-hours: 600 (E)
        mt = get_gnomad_data(data_type, split=False, raw=True,
                             meta_root=None).select_cols()
        mt = mt.annotate_entries(is_missing=hl.is_missing(mt.GT))
        mt = mt.filter_entries(mt.is_missing | mt.GT.is_non_ref())
        mt = annotate_adj(mt)
        if args.exomes:
            mt = mt.naive_coalesce(10000)
        mt.write(
            get_gnomad_data_path(data_type, split=False, non_refs_only=True),
            args.overwrite)

    if args.split_nonrefs:  # CPU-hours: 300 (E)
        mt = get_gnomad_data(data_type, split=False, non_refs_only=True)
        mt = hl.split_multi_hts(mt)
        mt = mt.filter_entries(mt.is_missing | mt.GT.is_non_ref())
        mt.write(
            get_gnomad_data_path(data_type, split=True, non_refs_only=True),
            args.overwrite)
Example #20
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS)
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = hl.split_multi_hts(tob_wgs)
    tob_wgs_path = output_path('tob_wgs_plink')
    hl.export_plink(tob_wgs, tob_wgs_path, ind_id=tob_wgs.s)
Example #21
def import_vcf(vcf_path: str,
               genome_version: str,
               min_partitions: int = None,
               force_bgz: bool = True,
               drop_samples: bool = False,
               skip_invalid_loci: bool = False,
               split_multi_alleles: bool = True):
    """Import vcf and return MatrixTable.

    :param str vcf_path: path of the VCF file to import
    :param str genome_version: "37" or "38"
    :param int min_partitions: min partitions
    :param bool force_bgz: read .gz as a bgzipped file
    :param bool drop_samples: if True, discard genotype info
    :param bool skip_invalid_loci: if True, skip loci that are not consistent with the reference_genome.
    :param bool split_multi_alleles: if True, split multi-allelic variants and re-key rows by their minimal representation.
    """

    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome_version: {genome_version}")

    logger.info(f"\n==> import vcf: {vcf_path}")

    # add (or remove) "chr" prefix from vcf chroms so they match the reference
    ref = hl.get_reference(f"GRCh{genome_version}")
    contig_recoding = {
        **{
            ref_contig.replace("chr", ""): ref_contig
            for ref_contig in ref.contigs if "chr" in ref_contig
        },
        **{
            f"chr{ref_contig}": ref_contig
            for ref_contig in ref.contigs if "chr" not in ref_contig
        }
    }

    mt = hl.import_vcf(vcf_path,
                       reference_genome=f"GRCh{genome_version}",
                       contig_recoding=contig_recoding,
                       min_partitions=min_partitions,
                       force_bgz=force_bgz,
                       drop_samples=drop_samples,
                       skip_invalid_loci=skip_invalid_loci)

    mt = mt.annotate_globals(sourceFilePath=vcf_path,
                             genomeVersion=genome_version)

    mt = mt.annotate_rows(original_alt_alleles=hl.or_missing(
        hl.len(mt.alleles) > 2, get_expr_for_variant_ids(mt.locus,
                                                         mt.alleles)))

    if split_multi_alleles:
        mt = hl.split_multi_hts(mt)
        mt = mt.key_rows_by(**hl.min_rep(mt.locus, mt.alleles))

    return mt
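Re-keying with hl.min_rep after the split normalizes each bi-allelic variant to its minimal representation, so split variants join cleanly against datasets keyed the same way. A usage sketch with hypothetical arguments:

mt = import_vcf('gs://my-bucket/callset.vcf.gz', genome_version='38', min_partitions=500)  # hypothetical path
# original_alt_alleles is non-missing only for rows that came from a multi-allelic site.
mt.rows().select('original_alt_alleles', 'was_split').show(5)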
Example #22
def import_vcf(
        vcf_path: Union[str, List[str]],
        genome_version: str,
        sample_name: str,
        # Exomes VCFs can be split to ~1mb chunks for annotation (good for joins)
        # but they are pretty tiny and I think too big is bad.
        min_partitions: int = 50,
        force_bgz: bool = False,
        drop_samples: bool = False,
        skip_invalid_loci: bool = False,
        split_multi_alleles: bool = True):
    """Import vcf and return MatrixTable.

    :param str vcf_path: path(s) of the VCF file(s) to import
    :param str genome_version: "37" or "38"
    :param str sample_name: sample name
    :param int min_partitions: min partitions
    :param bool force_bgz: read .gz as a bgzipped file
    :param bool drop_samples: if True, discard genotype info
    :param bool skip_invalid_loci: if True, skip loci that are not consistent with the reference_genome.
    :param bool split_multi_alleles: if True, split multi-allelic variants and re-key rows by their minimal representation.
    """
    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome_version: {genome_version}")
    # logger.info(f"\n==> import vcf: {vcf_path}")

    # add (or remove) "chr" prefix from vcf chroms so they match the reference
    ref = hl.get_reference(f"GRCh{genome_version}")
    contig_recoding = {
        **{ref_contig.replace("chr", ""): ref_contig for ref_contig in ref.contigs if "chr" in ref_contig},
        **{f"chr{ref_contig}": ref_contig for ref_contig in ref.contigs if "chr" not in ref_contig}}
    mt = hl.import_vcf(
        vcf_path,
        reference_genome=f"GRCh{genome_version}",
        contig_recoding=contig_recoding,
        min_partitions=min_partitions,
        force_bgz=force_bgz,
        drop_samples=drop_samples,
        skip_invalid_loci=skip_invalid_loci,
        array_elements_required=False)
    valid_chros = {'18', '14', '17', 'Y', '2', '8', 'X', '22', '16', '21', '3', '6', '10', '5', '13', '7', '1', '11', '19', 'MT', '4', '12', '9', '20', '15'}
    mt = mt.filter_rows(hl.literal(valid_chros).contains(mt.locus.contig))
    mt.write("/vep/tmpck1.mt",overwrite=True)
    mt = hl.read_matrix_table("/vep/tmpck1.mt")
    mt = mt.annotate_globals(sourceFilePath=vcf_path, genomeVersion=genome_version)
    mt = mt.annotate_rows(
        originalAltAlleles=hl.or_missing(hl.len(mt.alleles) > 2, get_expr_for_variant_ids(mt.locus, mt.alleles)),
        xpos=get_expr_for_xpos(mt.locus),
        ref=mt.alleles[0])
    if split_multi_alleles:
        mt = hl.split_multi_hts(mt)
        mt = mt.key_rows_by(**hl.min_rep(mt.locus, mt.alleles))
    return mt
Example #23
def genetics_pipeline():
    mt = get_mt()
    mt = hl.split_multi_hts(mt)
    mt = hl.variant_qc(mt)
    mt = hl.sample_qc(mt)
    mt = mt.filter_cols(mt.sample_qc.call_rate > 0.95)
    mt = mt.filter_rows(mt.variant_qc.AC[1] > 5)
    mt = mt.filter_entries(hl.case().when(
        hl.is_indel(mt.alleles[0], mt.alleles[1]),
        mt.GQ > 20).default(mt.GQ > 10))
    mt.write('/tmp/genetics_pipeline.mt', overwrite=True)
Example #24
def main(args):
    ################################

    truthset_table = hl.read_table(args.truthset_table)
    #################################
    group = "raw"

    mt = hl.read_matrix_table(args.matrixtable)
    mt = hl.split_multi_hts(mt,
                            keep_star=False,
                            left_aligned=False,
                            permit_shuffle=True)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWESSanger_cohorts_sampleQC_filtered_autosomes_split.mt',
        overwrite=True)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
    trio_dataset.write(f'{args.output_dir}/MegaWES_trio_table.mt',
                       overwrite=True)

    (ht1, famstats_ht) = generate_family_stats(mt, fam)
    print("Writing mt and family stats_ht")
    ht1.write(f'{args.output_dir}/MegaWES_family_stats.ht', overwrite=True)
    #famstats_ht.write(
    #    f'{args.output_dir}/MegaWES_family_stats.ht', overwrite=True)
    mt = mt.annotate_rows(family_stats=ht1[mt.row_key].family_stats)
    mt = mt.checkpoint(f'{args.output_dir}/MegaWES_family_stats.mt',
                       overwrite=True)
    #(mt1, famstats_ht) = generate_family_stats(mt, fam)
    #print("Writing mt and family stats_ht")
    #mt1.write(f'{tmp_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)
    # famstats_ht.write(
    #    f'{tmp_dir}/Sanger_cohorts_family_stats.ht', overwrite=True)
    #mt = mt.annotate_rows(family_stats=ht[mt.row_key].family_stats)
    #mt.write(f'{tmp_dir}/Sanger_cohorts_family_stats.mt', overwrite=True)

    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    mt = mt.checkpoint(
        f'{lustre_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt',
        overwrite=True)
    #mt = hl.split_multi_hts(mt, keep_star=False, left_aligned=False, permit_shuffle=True)

    de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf)

    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(
        f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht',
        overwrite=True)
Example #25
def run_hail_sample_qc(mt: hl.MatrixTable, data_type: str) -> hl.Table:
    """
    Runs Hail's built-in sample qc function on the MatrixTable. Splits the MatrixTable in order to calculate the
    inbreeding coefficient and annotates the result back onto the original MatrixTable. Applies flags by population
    and platform groups.

    :param MatrixTable mt: QC MatrixTable
    :param str data_type: WGS or WES for write path
    :return: Table of Hail's sample qc metrics with pop and platform outlier flags
    :rtype: Table
    """
    mt = mt.select_entries(mt.GT)
    mt = filter_to_autosomes(mt)
    mt = hl.split_multi_hts(mt)
    mt = hl.sample_qc(mt)
    mt = mt.annotate_cols(
        sample_qc=mt.sample_qc.annotate(
            f_inbreeding=hl.agg.inbreeding(mt.GT, mt.info.AF[0])
        )
    )
    mt = mt.annotate_cols(idx=mt.qc_pop + "_" + hl.str(mt.qc_platform))

    sample_qc = [
        "n_snp",
        "r_ti_tv",
        "r_insertion_deletion",
        "n_insertion",
        "n_deletion",
        "r_het_hom_var",
    ]
    if data_type == "WGS":
        sample_qc = sample_qc + ["call_rate"]

    strat_ht = mt.cols()
    qc_metrics = {metric: strat_ht.sample_qc[metric] for metric in sample_qc}
    strata = {"qc_pop": strat_ht.qc_pop, "qc_platform": strat_ht.qc_platform}

    metric_ht = compute_stratified_metrics_filter(strat_ht, qc_metrics, strata)
    checkpoint_pass = metric_ht.aggregate(
        hl.agg.count_where(hl.len(metric_ht.qc_metrics_filters) == 0)
    )
    logger.info(
        "%i samples found passing pop/platform-specific filtering", checkpoint_pass
    )
    checkpoint_fail = metric_ht.aggregate(
        hl.agg.count_where(hl.len(metric_ht.qc_metrics_filters) != 0)
    )
    logger.info(
        "%i samples found failing pop/platform-specific filtering", checkpoint_fail
    )
    metric_ht = metric_ht.annotate(sample_qc=mt.cols()[metric_ht.key].sample_qc)
    return metric_ht
Example #26
def run_vep(vep_version: str = "101") -> hl.Table:
    """
    Returns a table with a VEP annotation for each variant in the raw MatrixTable.

    :param vep_version: Version of VEPed context Table to use in `vep_or_lookup_vep`
    :return: VEPed Table
    """
    ht = get_gnomad_v3_mt(key_by_locus_and_alleles=True, remove_hard_filtered_samples=False).rows()
    ht = ht.filter(hl.len(ht.alleles) > 1)
    ht = hl.split_multi_hts(ht)
    ht = vep_or_lookup_vep(ht, vep_version=vep_version)
    ht = ht.annotate_globals(version=f'v{vep_version}')

    return ht
Example #27
    def test_import_plink(self):
        vcf = hl.split_multi_hts(
            hl.import_vcf(resource('sample2.vcf'),
                          reference_genome=hl.get_reference('GRCh38'),
                          contig_recoding={"22": "chr22"}))

        hl.export_plink(vcf, '/tmp/sample_plink')

        bfile = '/tmp/sample_plink'
        plink = hl.import_plink(
            bfile + '.bed', bfile + '.bim', bfile + '.fam',
            a2_reference=True,
            contig_recoding={'chr22': '22'},
            reference_genome='GRCh37').rows()
        self.assertTrue(plink.all(plink.locus.contig == "22"))
        self.assertEqual(vcf.count_rows(), plink.count())
        self.assertTrue(plink.locus.dtype, hl.tlocus('GRCh37'))
Example #28
    def run(self):
        mt = self.import_vcf()
        mt = hl.split_multi_hts(mt)
        if self.validate:
            self.validate_mt(mt, self.genome_version, self.sample_type)
        mt = HailMatrixTableTask.run_vep(mt, self.genome_version,
                                         self.vep_runner)
        # We're now adding ref data.
        ref_data = hl.read_table(self.reference_ht_path)
        clinvar = hl.read_table(self.clinvar_ht_path)
        hgmd = hl.read_table(self.hgmd_ht_path)

        mt = SeqrVariantSchema(
            mt, ref_data=ref_data, clinvar_data=clinvar,
            hgmd_data=hgmd).annotate_all(overwrite=True).select_annotated_mt()

        mt.write(self.output().path, stage_locally=True)
Example #29
    def test_import_plink_contig_recoding_w_reference(self):
        vcf = hl.split_multi_hts(
            hl.import_vcf(resource('sample2.vcf'),
                          reference_genome=hl.get_reference('GRCh38'),
                          contig_recoding={"22": "chr22"}))

        hl.export_plink(vcf, '/tmp/sample_plink')

        bfile = '/tmp/sample_plink'
        plink = hl.import_plink(
            bfile + '.bed', bfile + '.bim', bfile + '.fam',
            a2_reference=True,
            contig_recoding={'chr22': '22'},
            reference_genome='GRCh37').rows()
        self.assertTrue(plink.all(plink.locus.contig == "22"))
        self.assertEqual(vcf.count_rows(), plink.count())
        self.assertTrue(plink.locus.dtype, hl.tlocus('GRCh37'))
Example #30
def generate_de_novos(mt: hl.MatrixTable, fam_file: str, freq_data: hl.Table) -> hl.Table:
    mt = mt.select_cols()
    fam_ht = read_fam(fam_file).key_by()
    fam_ht = fam_ht.select(
        s=[fam_ht.s, fam_ht.pat_id, fam_ht.mat_id]).explode('s').key_by('s')
    mt = mt.filter_cols(hl.is_defined(fam_ht[mt.s]))
    mt = mt.select_rows()
    mt = hl.split_multi_hts(mt)
    mt = mt.annotate_rows(family_stats=freq_data[mt.row_key].family_stats)
    ped = hl.Pedigree.read(fam_file, delimiter='\\t')

    de_novo_table = hl.de_novo(
        mt, ped, mt.family_stats[0].unrelated_qc_callstats.AF[1])
    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')

    return de_novo_table
Example #31
    def test_export_plink(self):
        vcf_file = resource('sample.vcf')
        mt = hl.split_multi_hts(hl.import_vcf(vcf_file, min_partitions=10))

        # permute columns so not in alphabetical order!
        import random
        indices = list(range(mt.count_cols()))
        random.shuffle(indices)
        mt = mt.choose_cols(indices)

        split_vcf_file = uri_path(new_temp_file())
        hl_output = uri_path(new_temp_file())
        plink_output = uri_path(new_temp_file())
        merge_output = uri_path(new_temp_file())

        hl.export_vcf(mt, split_vcf_file)
        hl.export_plink(mt, hl_output)

        run_command(["plink", "--vcf", split_vcf_file,
                     "--make-bed", "--out", plink_output,
                     "--const-fid", "--keep-allele-order"])

        data = []
        with open(uri_path(plink_output + ".bim")) as file:
            for line in file:
                row = line.strip().split()
                row[1] = ":".join([row[0], row[3], row[5], row[4]])
                data.append("\t".join(row) + "\n")

        with open(plink_output + ".bim", 'w') as f:
            f.writelines(data)

        run_command(["plink", "--bfile", plink_output,
                     "--bmerge", hl_output, "--merge-mode",
                     "6", "--out", merge_output])

        same = True
        with open(merge_output + ".diff") as f:
            for line in f:
                row = line.strip().split()
                if row != ["SNP", "FID", "IID", "NEW", "OLD"]:
                    same = False
                    break

        self.assertTrue(same)
Example #32
def generate_split_alleles(mt: hl.MatrixTable) -> hl.MatrixTable:

    allele_data = hl.struct(nonsplit_alleles=mt.alleles,
                            has_star=hl.any(lambda a: a == '*', mt.alleles))

    mt = mt.annotate_rows(allele_data=allele_data.annotate(
        **add_variant_type(mt.alleles)))
    mt = hl.split_multi_hts(mt, left_aligned=True)

    allele_type = (hl.case().when(
        hl.is_snp(mt.alleles[0], mt.alleles[1]),
        'snv').when(hl.is_insertion(mt.alleles[0], mt.alleles[1]),
                    'ins').when(hl.is_deletion(mt.alleles[0], mt.alleles[1]),
                                'del').default('complex'))
    mt = mt.annotate_rows(allele_data=mt.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=mt.allele_data.variant_type == 'mixed'))
    return mt
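Unlike the other generate_allele_data variants in this list, this one passes left_aligned=True, which lets split_multi_hts assume every variant is already left aligned (avoiding a shuffle) but raises an error if that assumption is violated. A minimal sketch of the safer default, assuming mt may contain non-left-aligned variants:

import hail as hl

mt = hl.split_multi_hts(mt, left_aligned=False)  # default: handles non-left-aligned input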
Example #33
    def test_skat(self):
        ds2 = hl.import_vcf(resource('sample2.vcf'))

        covariatesSkat = (hl.import_table(resource("skat.cov"), impute=True)
            .key_by("Sample"))

        phenotypesSkat = (hl.import_table(resource("skat.pheno"),
                                          types={"Pheno": hl.tfloat64},
                                          missing="0")
            .key_by("Sample"))

        intervalsSkat = (hl.import_locus_intervals(resource("skat.interval_list")))

        weightsSkat = (hl.import_table(resource("skat.weights"),
                                       types={"locus": hl.tlocus(),
                                              "weight": hl.tfloat64})
            .key_by("locus"))

        ds = hl.split_multi_hts(ds2)
        ds = ds.annotate_rows(gene=intervalsSkat[ds.locus],
                              weight=weightsSkat[ds.locus].weight)
        ds = ds.annotate_cols(pheno=phenotypesSkat[ds.s].Pheno,
                              cov=covariatesSkat[ds.s])
        ds = ds.annotate_cols(pheno=hl.cond(ds.pheno == 1.0,
                                            False,
                                            hl.cond(ds.pheno == 2.0,
                                                    True,
                                                    hl.null(hl.tbool))))

        hl.skat(ds,
                key_expr=ds.gene,
                weight_expr=ds.weight,
                y=ds.pheno,
                x=ds.GT.n_alt_alleles(),
                covariates=[ds.cov.Cov1, ds.cov.Cov2],
                logistic=False).count()

        hl.skat(ds,
                key_expr=ds.gene,
                weight_expr=ds.weight,
                y=ds.pheno,
                x=hl.pl_dosage(ds.PL),
                covariates=[ds.cov.Cov1, ds.cov.Cov2],
                logistic=True).count()
Example #34
def load_files(file_prefix, overwrite, gencove, mt):
    """
    loads VCFs, runs sample QC and variant QC, writes a matrix table
    :param file_prefix: path prefix for the input VCF or MatrixTable
    :param overwrite: whether to overwrite the output MatrixTable
    :param gencove: if True, read an existing GRCh38 MatrixTable instead of importing a VCF
    :param mt: unused
    :return:
    """
    if gencove:
        ngap_downsample = hl.read_matrix_table(file_prefix + '_grch38.mt')
    else:
        ngap_downsample = hl.import_vcf(file_prefix + '.vcf.gz',
                                        force_bgz=True,
                                        reference_genome='GRCh38',
                                        min_partitions=200)
        ngap_downsample = hl.split_multi_hts(ngap_downsample)
    ngap_downsample = ngap_downsample.filter_cols(
        (ngap_downsample.s != 'NGE0018') & (ngap_downsample.s != 'NGE0130'))
    ngap_sample_qc = hl.sample_qc(ngap_downsample)
    ngap_sample_variant_qc = hl.variant_qc(ngap_sample_qc)
    ngap_sample_variant_qc.write(file_prefix + '.mt', overwrite=overwrite)
Example #35
    def test_export_plink(self):
        vcf_file = resource('sample.vcf')
        mt = hl.split_multi_hts(hl.import_vcf(vcf_file, min_partitions=10))

        split_vcf_file = uri_path(new_temp_file())
        hl_output = uri_path(new_temp_file())
        plink_output = uri_path(new_temp_file())
        merge_output = uri_path(new_temp_file())

        hl.export_vcf(mt, split_vcf_file)
        hl.export_plink(mt, hl_output)

        run_command([
            "plink", "--vcf", split_vcf_file, "--make-bed", "--out",
            plink_output, "--const-fid", "--keep-allele-order"
        ])

        data = []
        with open(uri_path(plink_output + ".bim")) as file:
            for line in file:
                row = line.strip().split()
                row[1] = ":".join([row[0], row[3], row[5], row[4]])
                data.append("\t".join(row) + "\n")

        with open(plink_output + ".bim", 'w') as f:
            f.writelines(data)

        run_command([
            "plink", "--bfile", plink_output, "--bmerge", hl_output,
            "--merge-mode", "6", "--out", merge_output
        ])

        same = True
        with open(merge_output + ".diff") as f:
            for line in f:
                row = line.strip().split()
                if row != ["SNP", "FID", "IID", "NEW", "OLD"]:
                    same = False
                    break

        self.assertTrue(same)
Example #36
ds = ds.annotate_cols(is_case=True,
                      pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                      is_female=hl.rand_bool(0.5),
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
                      cov2=hl.rand_norm(0, 1),
                      cohort="SIGMA")
ds = ds.annotate_globals(global_field_1=5,
                         global_field_2=10,
                         pli={'SCN1A': 0.999, 'SONIC': 0.014},
                         populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
ds = ds.annotate_rows(gene=['TTN'])
ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
ds.write('data/example.vds', overwrite=True)

lmmreg_ds = hl.variant_qc(hl.split_multi_hts(hl.import_vcf('data/sample.vcf.bgz')))
lmmreg_tsv = hl.import_table('data/example_lmmreg.tsv', 'Sample', impute=True)
lmmreg_ds = lmmreg_ds.annotate_cols(**lmmreg_tsv[lmmreg_ds['s']])
lmmreg_ds = lmmreg_ds.annotate_rows(use_in_kinship = lmmreg_ds.variant_qc.AF[1] > 0.05)
lmmreg_ds.write('data/example_lmmreg.vds', overwrite=True)

burden_ds = hl.import_vcf('data/example_burden.vcf')
burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
burden_ds = burden_ds.annotate_cols(burden = burden_kt[burden_ds.s])
burden_ds = burden_ds.annotate_rows(weight = hl.float64(burden_ds.locus.position))
burden_ds = hl.variant_qc(burden_ds)
genekt = hl.import_locus_intervals('data/gene.interval_list')
burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
burden_ds.write('data/example_burden.vds', overwrite=True)
Example #37
def get_dataset():
    global _dataset
    if _dataset is None:
        _dataset = hail.split_multi_hts(hail.import_vcf(resource('sample.vcf'))).cache()
    return _dataset
Example #38
def split_multi_hts():
    mt = hl.read_matrix_table(resource('profile.mt'))
    hl.split_multi_hts(mt)._force_count_rows()