def annotate_with_genotype_num_alt(mt: hl.MatrixTable) -> hl.MatrixTable:
    if 'AD' in set(mt.entry):
        # GATK-consistent VCF
        mt = mt.annotate_rows(genotypes=(hl.agg.collect(
            hl.struct(num_alt=hl.cond(mt.alleles[1] == '<CNV>', 0,
                                      mt.GT.n_alt_alleles()),
                      ab=hl.cond(
                          mt.alleles[1] == '<CNV>', 0.0,
                          hl.float(hl.array(mt.AD)[1]) /
                          hl.float(hl.fold(lambda i, j: i + j, 0, mt.AD))),
                      gq=mt.GQ,
                      sample_id=mt.s,
                      dp=mt.DP))))
    elif 'AO' in set(mt.entry):
        mt = mt.annotate_rows(
            genotypes=hl.agg.collect(
                hl.struct(num_alt=hl.cond(mt.alleles[1] == '<CNV>', 0,
                                          mt.GT.n_alt_alleles()),
                          # Python `or` does not work on Hail expressions; use `|`
                          ab=hl.cond((mt.alleles[1] == '<CNV>') | (mt.DP == 0),
                                     0.0,
                                     hl.float(mt.AO[0]) / hl.float(mt.DP)),
                          dp=mt.DP,
                          gq=mt.GQ,
                          sample_id=mt.s))
        )
    else:
        raise ValueError("unrecognized vcf")
    return mt
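A minimal usage sketch (the VCF path and import options are assumptions, not from the source): import a GATK-style VCF whose entries carry GT/AD/GQ/DP, run the annotator, and inspect the per-row genotypes array.

import hail as hl

mt = hl.import_vcf('data/example.vcf.bgz', reference_genome='GRCh37')  # hypothetical path
mt = annotate_with_genotype_num_alt(mt)
mt.rows().select('genotypes').show(3)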
def prepare_exac_constraint(exac_constraint_path):
    ds = hl.import_table(exac_constraint_path, force=True)

    ds = ds.select(
        transcript_id=ds.transcript.split("\\.")[0],
        # Expected
        exp_syn=hl.float(ds.exp_syn),
        exp_mis=hl.float(ds.exp_mis),
        exp_lof=hl.float(ds.exp_lof),
        # Actual
        obs_syn=hl.int(ds.n_syn),
        obs_mis=hl.int(ds.n_mis),
        obs_lof=hl.int(ds.n_lof),
        # mu
        mu_syn=hl.float(ds.mu_syn),
        mu_mis=hl.float(ds.mu_mis),
        mu_lof=hl.float(ds.mu_lof),
        # Z
        syn_z=hl.float(ds.syn_z),
        mis_z=hl.float(ds.mis_z),
        lof_z=hl.float(ds.lof_z),
        # Other
        pLI=hl.float(ds.pLI),
    )

    ds = ds.key_by("transcript_id")

    return ds
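A hedged usage sketch: the path below is hypothetical, but the file is expected to carry the ExAC constraint columns referenced above (transcript, exp_*, n_*, mu_*, *_z, pLI).

ds = prepare_exac_constraint('data/exac_constraint.txt')  # hypothetical path
ds.describe()
ds.show(3)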
def format_exac_constraint(ds):
    # Select relevant fields
    ds = ds.select(
        transcript_id=ds.transcript.split("\\.")[0],
        # Expected
        exp_syn=hl.float(ds.exp_syn),
        exp_mis=hl.float(ds.exp_mis),
        exp_lof=hl.float(ds.exp_lof),
        # Actual
        obs_syn=hl.int(ds.n_syn),
        obs_mis=hl.int(ds.n_mis),
        obs_lof=hl.int(ds.n_lof),
        # mu
        mu_syn=hl.float(ds.mu_syn),
        mu_mis=hl.float(ds.mu_mis),
        mu_lof=hl.float(ds.mu_lof),
        # Z
        syn_z=hl.float(ds.syn_z),
        mis_z=hl.float(ds.mis_z),
        lof_z=hl.float(ds.lof_z),
        # Other
        pLI=hl.float(ds.pLI),
    )

    ds = ds.key_by("transcript_id")

    return ds
Example 4
def test_dndarray_sum():
    n_variants = 10
    n_samples = 10
    block_size = 3
    n_blocks = 16
    mt1 = hl.balding_nichols_model(n_populations=2,
                                   n_variants=n_variants,
                                   n_samples=n_samples)
    mt1 = mt1.select_entries(dosage=hl.float(mt1.GT.n_alt_alleles()))
    mt2 = hl.balding_nichols_model(n_populations=2,
                                   n_variants=n_variants,
                                   n_samples=n_samples)
    mt2 = mt2.select_entries(dosage=hl.float(mt2.GT.n_alt_alleles()))

    da1 = hl.experimental.dnd.array(mt1, 'dosage', block_size=block_size)
    da2 = hl.experimental.dnd.array(mt2, 'dosage', block_size=block_size)
    da_sum = (da1 + da2).checkpoint(new_temp_file())
    assert da_sum._force_count_blocks() == n_blocks
    da_result = da_sum.collect()

    a1 = np.array(mt1.dosage.collect()).reshape(n_variants, n_samples)
    a2 = np.array(mt2.dosage.collect()).reshape(n_variants, n_samples)
    a_result = a1 + a2

    assert np.array_equal(da_result, a_result)
Example 5
def prepare_exac_constraint(path):
    ds = hl.import_table(path, force=True)
    ds = ds.repartition(32, shuffle=True)

    # Select relevant fields
    ds = ds.select(
        # Remove version number from transcript ID
        transcript_id=ds.transcript.split("\\.")[0],
        # Expected
        exp_syn=hl.float(ds.exp_syn),
        exp_mis=hl.float(ds.exp_mis),
        exp_lof=hl.float(ds.exp_lof),
        # Actual
        obs_syn=hl.int(ds.n_syn),
        obs_mis=hl.int(ds.n_mis),
        obs_lof=hl.int(ds.n_lof),
        # mu
        mu_syn=hl.float(ds.mu_syn),
        mu_mis=hl.float(ds.mu_mis),
        mu_lof=hl.float(ds.mu_lof),
        # Z
        syn_z=hl.float(ds.syn_z),
        mis_z=hl.float(ds.mis_z),
        lof_z=hl.float(ds.lof_z),
        # Other
        pli=hl.float(ds.pLI),
    )

    ds = ds.key_by("transcript_id")

    return ds
 def _parse_odds_ratio(field_name):
     return hl.rbind(
         ds[field_name].split(" ", n=2),
         lambda parts: hl.rbind(
             parts[0],
             parts[1][1:-1].split("-", 2),
             lambda value, bounds: hl.struct(
                 **{
                     field_name: hl.float(value),
                     field_name + " lower bound": hl.float(bounds[0]),
                     field_name + " upper bound": hl.float(bounds[1]),
                 }),
         ),
     )
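From the splits above, _parse_odds_ratio appears to expect strings like "1.52 (1.15-2.01)" (a point estimate followed by a parenthesized confidence interval), and it closes over an enclosing ds Table. A standalone check of the same parsing logic under that assumed format:

import hail as hl

parsed = hl.rbind(
    hl.str("1.52 (1.15-2.01)").split(" ", n=2),
    lambda parts: hl.rbind(
        parts[0],
        parts[1][1:-1].split("-", 2),  # strip the parentheses, then split the bounds
        lambda value, bounds: hl.struct(
            odds_ratio=hl.float(value),
            lower_bound=hl.float(bounds[0]),
            upper_bound=hl.float(bounds[1]))))
hl.eval(parsed)  # Struct(odds_ratio=1.52, lower_bound=1.15, upper_bound=2.01)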
Example 7
def add_global_af(ht: hl.Table, temp: str) -> hl.Table:
    '''
    Adds gnomAD global AF annotation to Table

    :param Table ht: Input Table
    :param str temp: Path to temp bucket (to store intermediary files)
    :return: Table with gnomAD global AF annotation
    :rtype: Table
    '''
    # checkpoint table after completing both gnomAD exomes and gnomAD genomes join
    temp_path = f'{temp}/join.ht'
    ht = ht.checkpoint(temp_path)

    # set gnomAD ACs and ANs to 0 if they are missing after the join
    ht = ht.transmute(
        gnomad_exomes_AC=hl.if_else(hl.is_defined(ht.gnomad_exomes_AC),
                                    ht.gnomad_exomes_AC, 0),
        gnomad_genomes_AC=hl.if_else(hl.is_defined(ht.gnomad_genomes_AC),
                                     ht.gnomad_genomes_AC, 0),
        gnomad_exomes_AN=hl.if_else(hl.is_defined(ht.gnomad_exomes_AN),
                                    ht.gnomad_exomes_AN, 0),
        gnomad_genomes_AN=hl.if_else(hl.is_defined(ht.gnomad_genomes_AN),
                                     ht.gnomad_genomes_AN, 0),
    )

    ht = ht.annotate(gnomad_global_AF=(
        hl.if_else(((ht.gnomad_exomes_AN == 0)
                    & (ht.gnomad_genomes_AN == 0)), 0.0,
                   hl.float((ht.gnomad_exomes_AC + ht.gnomad_genomes_AC) /
                            (ht.gnomad_exomes_AN + ht.gnomad_genomes_AN)))))
    ht.describe()
    return ht
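A toy check of the arithmetic, assuming Hail is initialized (the row values and temp bucket are made up): after the transmute treats missing counts as 0, AC = 3 + 0 over AN = 100 + 0 gives a global AF of 0.03.

import hail as hl

ht = hl.Table.parallelize([
    hl.struct(v='chr1:1000', gnomad_exomes_AC=3, gnomad_exomes_AN=100,
              gnomad_genomes_AC=hl.null(hl.tint32),
              gnomad_genomes_AN=hl.null(hl.tint32))
], key='v')
ht = add_global_af(ht, 'gs://my-temp-bucket')  # hypothetical temp bucket
ht.show()  # gnomad_global_AF == 0.03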
Example 8
def annotate_meta(mt, meta, key_by):
    '''
    Annotates a matrix table with the following metadata: collaborator participant ID,
    site ID, and autocall call rate. Assumes that meta and mt don't have the same key,
    and keys meta by the `key_by` field (chip_well_barcode in practice).
    :param mt: hail matrix table
    :param meta: table containing sample IDs, reported sex information, and sample metadata
    :param key_by: field to key the meta table by
    :return: annotated matrix table
    '''
    # Setting the key of metaDat as chip_well_barcode
    meta = meta.key_by(key_by)

    # Adding the reported sex column from the sample data table to the matrix table
    mt = mt.annotate_cols(reported_sex=meta[mt.s].reported_gender)

    # Annotating the matrix table with the projID from meta data
    mt = mt.annotate_cols(collab_PID=meta[mt.s].collaborator_participant_id)

    # Creating a new column in mt for siteID which is the 3 letters from the collab participant id
    mt = mt.annotate_cols(siteID=mt.col.collab_PID[0:3])

    # Switching lowercase letters to capital in collabPID and siteID
    mt = mt.annotate_cols(collab_PID=mt.collab_PID.upper())
    mt = mt.annotate_cols(siteID=mt.siteID.upper())

    # Annotating mt with autocall call rate
    mt = mt.annotate_cols(autocall_call_rate=meta[mt.s].autocall_call_rate)

    # Filtering out individuals with an autocall call rate below .95
    mt = mt.filter_cols(hl.float(mt.autocall_call_rate) < .95, keep=False)

    # Filtering NA12878 individual from the matrix table
    samples_to_remove = {'NA12878'}
    set_to_remove = hl.literal(samples_to_remove)
    return mt.filter_cols(~set_to_remove.contains(mt['collab_PID']))
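A hedged usage sketch; the TSV path is hypothetical, and the metadata table is assumed to carry the chip_well_barcode, reported_gender, collaborator_participant_id, and autocall_call_rate fields the function reads.

import hail as hl

meta = hl.import_table('data/sample_meta.tsv', impute=True)  # hypothetical path
mt = annotate_meta(mt, meta, key_by='chip_well_barcode')
mt.cols().select('reported_sex', 'collab_PID', 'siteID', 'autocall_call_rate').show(3)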
Example 9
def test_memory_issue_from_9009():
    mt = hl.utils.range_matrix_table(1000, 1, n_partitions=1)
    mt = mt.annotate_entries(x=hl.float(mt.row_idx * mt.col_idx))
    mt = mt.annotate_rows(big=hl.zeros(100_000_000))
    # an uncaught exception fails the test on its own; no try/except wrapper needed
    hl.linalg.BlockMatrix.write_from_entry_expr(mt.x, new_temp_file(), overwrite=True)
Example 10
 def test_lambda_gc_nans(self):
     N = 5000000
     ht = hl.utils.range_table(N).annotate(x=hl.scan.count() / N,
                                           is_even=hl.scan.count() % 2 == 0)
     lgc_nan = hl.lambda_gc(hl.case().when(ht.is_even,
                                           hl.float('nan')).default(ht.x))
     self.assertAlmostEqual(lgc_nan, 1,
                            places=1)  # approximate, 1 place is safe
Example 11
def test_king_homo_estimator():
    hl.set_global_seed(1)
    mt = hl.balding_nichols_model(2, 5, 5)
    mt = mt.select_entries(genotype_score=hl.float(mt.GT.n_alt_alleles()))
    da = hl.experimental.dnd.array(mt, 'genotype_score', block_size=3)

    def sqr(x):
        return x * x

    score_difference = da.T.inner_product(
        da, lambda l, r: sqr(l - r), lambda l, r: l + r, hl.float(0),
        hl.agg.sum).checkpoint(new_temp_file())
    assert np.array_equal(
        score_difference.collect(),
        np.array([[0., 6., 4., 2., 4.], [6., 0., 6., 4., 6.],
                  [4., 6., 0., 6., 0.], [2., 4., 6., 0., 6.],
                  [4., 6., 0., 6., 0.]]))
Example 12
def import_vqsr(
    vqsr_path: str,
    vqsr_type: str = "alleleSpecificTrans",
    num_partitions: int = 5000,
    overwrite: bool = False,
    import_header_path: Optional[str] = None,
) -> None:
    """
    Imports vqsr site vcf into a HT
    :param vqsr_path: Path to input vqsr site vcf. This can be specified as Hadoop glob patterns
    :param vqsr_type: One of `classic`, `alleleSpecific` (allele specific) or `alleleSpecificTrans`
        (allele specific with transmitted singletons)
    :param num_partitions: Number of partitions to use for the VQSR HT
    :param overwrite: Whether to overwrite imported VQSR HT
    :param import_header_path: Optional path to a header file to use for import
    :return: None
    """

    logger.info(f"Importing VQSR annotations for {vqsr_type} VQSR...")
    mt = hl.import_vcf(
        vqsr_path,
        force_bgz=True,
        reference_genome="GRCh38",
        header_file=import_header_path,
    ).repartition(num_partitions)

    ht = mt.rows()

    ht = ht.annotate(info=ht.info.annotate(
        AS_VQSLOD=ht.info.AS_VQSLOD.map(lambda x: hl.float(x)),
        # raw strings for the regex delimiters avoid invalid-escape warnings
        AS_QUALapprox=ht.info.AS_QUALapprox.split(r"\|")[1:].map(
            lambda x: hl.int(x)),
        AS_VarDP=ht.info.AS_VarDP.split(r"\|")[1:].map(lambda x: hl.int(x)),
        AS_SB_TABLE=ht.info.AS_SB_TABLE.split(r"\|").map(
            lambda x: x.split(",").map(lambda y: hl.int(y))),
    ))

    ht = ht.checkpoint(
        get_vqsr_filters(f"vqsr_{vqsr_type}", split=False,
                         finalized=False).path,
        overwrite=overwrite,
    )

    unsplit_count = ht.count()
    ht = hl.split_multi_hts(ht)

    ht = ht.annotate(
        info=ht.info.annotate(**split_info_annotation(ht.info, ht.a_index)), )

    ht = ht.checkpoint(
        get_vqsr_filters(f"vqsr_{vqsr_type}", split=True,
                         finalized=False).path,
        overwrite=overwrite,
    )
    split_count = ht.count()
    logger.info(
        f"Found {unsplit_count} unsplit and {split_count} split variants with VQSR annotations"
    )
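The AS_* INFO fields arrive as pipe-delimited strings; splitting on the escaped pipe and dropping the first element removes the empty string that precedes the leading delimiter. A small check of that pattern (the value is illustrative):

import hail as hl

hl.eval(hl.str('|20|30').split(r'\|')[1:].map(lambda x: hl.int(x)))  # [20, 30]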
Example 13
def test_medium_collect():
    n_variants = 100
    n_samples = 100
    block_size = 32
    mt = hl.balding_nichols_model(n_populations=2,
                                  n_variants=n_variants,
                                  n_samples=n_samples)
    mt = mt.select_entries(dosage=hl.float(mt.GT.n_alt_alleles()))

    da = hl.experimental.dnd.array(mt, 'dosage', block_size=block_size)
    a = np.array(mt.dosage.collect()).reshape(n_variants, n_samples)

    assert np.array_equal(da.collect(), a)
Example 14
def variant_qc_aggregator(mt) -> hl.MatrixTable:
    """:func:`.variant_qc` as an aggregator."""
    bound_exprs = {}
    gq_dp_exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select(
            'mean', 'stdev', 'min', 'max')
    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select(
            'mean', 'stdev', 'min', 'max')
    if not has_field_of_type('GT', hl.tcall):
        raise ValueError(
            "'variant_qc': expect an entry field 'GT' of type 'call'")
    bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
    # number of filtered entries: total columns minus entries seen by the aggregator
    bound_exprs['n_filtered'] = mt.count_cols(_localize=False) - hl.agg.count()
    bound_exprs['call_stats'] = hl.agg.call_stats(mt.GT, mt.alleles)
    return hl.rbind(
        hl.struct(**bound_exprs), lambda e1: hl.rbind(
            hl.case().when(
                hl.len(mt.alleles) == 2,
                hl.hardy_weinberg_test(
                    e1.call_stats.homozygote_count[0], e1.call_stats.AC[
                        1] - 2 * e1.call_stats.homozygote_count[1], e1.
                    call_stats.homozygote_count[1])).or_missing(), lambda hwe:
            hl.struct(
                **{
                    **gq_dp_exprs,
                    **e1.call_stats, 'call_rate':
                    hl.float(e1.n_called) /
                    (e1.n_called + e1.n_not_called + e1.n_filtered),
                    'n_called':
                    e1.n_called,
                    'n_not_called':
                    e1.n_not_called,
                    'n_filtered':
                    e1.n_filtered,
                    'n_het':
                    e1.n_called - hl.sum(e1.call_stats.homozygote_count),
                    'n_non_ref':
                    e1.n_called - e1.call_stats.homozygote_count[0],
                    'het_freq_hwe':
                    hwe.het_freq_hwe,
                    'p_value_hwe':
                    hwe.p_value
                })))
Example 15
def cast_str(ht: hl.Table, field_names: list, output_type: str) -> hl.Table:
    """
    Casts the given string fields to a numeric type, mapping values that contain
    no digit to 0.

    :param ht: input Table
    :param field_names: list of string fields to cast
    :param output_type: 'float' or 'int'
    :return: Table with the fields cast
    """
    if output_type == 'float':
        ht = (ht.transmute(
            **{
                k: hl.cond(~ht[k].matches(r'\p{Digit}'), hl.float(0),
                           hl.float(ht[k]))
                for k in field_names
            }))
    if output_type == 'int':
        ht = (ht.transmute(
            **{
                k: hl.cond(~ht[k].matches(r'\p{Digit}'), hl.int(0), hl.int(
                    ht[k]))
                for k in field_names
            }))

    return ht
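A minimal sketch of cast_str on toy data (made up here): values containing no digit map to 0, everything else is cast.

import hail as hl

ht = hl.Table.parallelize([hl.struct(x='1.5'), hl.struct(x='NA')])
ht = cast_str(ht, ['x'], 'float')
ht.show()  # '1.5' -> 1.5, 'NA' -> 0.0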
Example 16
def join_mitochondria_vcfs_into_mt(input_tsv: str,
                                   output_bucket: str,
                                   chunk_size: int = 100) -> hl.MatrixTable:
    """Reformats and joins individual mitochondrial vcfs into one MatrixTable

    :param dict confirmed_vcfs: dictionary of samples for which the vcf existence was confirmed (sample as key, path to vcf as value)
    :param str output_bucket: path to bucket to which results should be written
    :param int chunk_size: number of MatrixTables to join per chunk

    :return: joined MatrixTable of samples given in confirmed_vcfs dictionary
    :rtype: hl.MatrixTable
    """

    mt_list = []

    with open(input_tsv, "r") as f:

        for line in f:
            line = line.rstrip()
            items = line.split("\t")
            sample, vcf_path = items[0:2]

            mt = hl.import_vcf(vcf_path, reference_genome="GRCh38")
            # because the VCFs are split, there is only one AF value, though it is represented as an array because Number=A in the VCF header
            # the second value of MMQ is the value for the alternate allele
            mt = mt.select_entries("DP", HL=mt.AF[0])
            mt = mt.annotate_entries(MQ=hl.float(mt.info["MMQ"][1]),
                                     TLOD=mt.info["TLOD"][0],
                                     FT=hl.if_else(
                                         hl.len(mt.filters) == 0, {"PASS"},
                                         mt.filters))
            # use GRCh37 as reference as this is more compatible with mitochondria resources that may be added as annotations in downstream scripts
            mt = mt.key_rows_by(
                locus=hl.locus("MT",
                               mt.locus.position,
                               reference_genome="GRCh37"),
                alleles=mt.alleles,
            )
            mt = mt.key_cols_by(s=sample)
            mt = mt.select_rows()
            mt_list.append(mt)

    temp_out_dir = output_bucket + "/temp"
    combined_mt = multi_way_union_mts(mt_list, temp_out_dir, chunk_size)

    return combined_mt
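The input TSV is read line by line, so it is assumed to hold one sample per line: a sample ID, a tab, and the path to that sample's single-sample VCF. A hedged sketch (all paths are hypothetical):

# sample_vcfs.tsv (tab-separated):
#   HG00096    gs://my-bucket/vcfs/HG00096.vcf.gz
#   HG00097    gs://my-bucket/vcfs/HG00097.vcf.gz
combined_mt = join_mitochondria_vcfs_into_mt('sample_vcfs.tsv', 'gs://my-bucket/mito', chunk_size=100)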
Example 17
def test_matmul_via_inner_product():
    n_variants = 10
    n_samples = 10
    block_size = 3
    n_blocks = 16
    mt = hl.utils.range_matrix_table(n_variants, n_samples)
    mt = mt.select_entries(x=mt.row_idx * mt.col_idx)

    da = hl.experimental.dnd.array(mt, 'x', block_size=block_size)
    prod = (da @ da.T).checkpoint(new_temp_file())
    assert prod._force_count_blocks() == n_blocks
    prod_result = prod.collect()

    ip_result = da.inner_product(da.T, lambda l, r: l * r, lambda l, r: l + r,
                                 hl.float(0.0),
                                 lambda prod: hl.agg.sum(prod)).collect()

    assert np.array_equal(prod_result, ip_result)
Example 18
def test_medium_matmul():
    n_variants = 100
    n_samples = 100
    block_size = 32
    n_blocks = 16
    mt = hl.balding_nichols_model(n_populations=2,
                                  n_variants=n_variants,
                                  n_samples=n_samples)
    mt = mt.select_entries(dosage=hl.float(mt.GT.n_alt_alleles()))

    da = hl.experimental.dnd.array(mt, 'dosage', block_size=block_size)
    da = (da @ da.T).checkpoint(new_temp_file())
    assert da._force_count_blocks() == n_blocks
    da_result = da.collect().reshape(n_variants, n_variants)

    a = np.array(mt.dosage.collect()).reshape(n_variants, n_samples)
    a_result = a @ a.T

    assert np.array_equal(da_result, a_result)
 def _genotype_fields(self):
     # Convert the mt genotype entries into num_alt, gq, ab, dp, and sample_id.
     is_called = hl.is_defined(self.mt.GT)
     return {
         'num_alt':
         hl.cond(is_called, self.mt.GT.n_alt_alleles(), -1),
         'gq':
         hl.cond(is_called, self.mt.GQ, hl.null(hl.tint)),
         'ab':
         hl.bind(
             lambda total: hl.cond(
                 (is_called) & (total != 0) & (hl.len(self.mt.AD) > 1),
                 hl.float(self.mt.AD[1] / total), hl.null(hl.tfloat)),
             hl.sum(self.mt.AD)),
         'dp':
          hl.cond(is_called, hl.int(hl.min(self.mt.DP, 32000)),
                  hl.null(hl.tint)),  # null type matches the int branch, as with 'gq'
         'sample_id':
         self.mt.s
     }
Example 20
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+


    Warning
    -------
        :func:`.ld_score` will fail if ``entry_expr`` results in any missing
        values. The special float value ``nan`` is not considered a
        missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----

    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.


    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus
        position.
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation."""

    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source

    if coord_expr is None:
        mt_coord_expr = mt_locus_expr
    else:
        mt_coord_expr = coord_expr._indices.source

    if not annotation_exprs:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr])
    else:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr]
                        + [mt == x._indices.source
                           for x in wrap_to_list(annotation_exprs)])

    if not check_mts:
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
                            (if specified), and annotation_exprs (if
                            specified) must come from same MatrixTable.""")

    n = mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    starts, stops = hl.linalg.utils.locus_windows(locus_expr,
                                                  radius,
                                                  coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotation_exprs:
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*wrap_to_list(annotation_exprs)).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x),
                           value=hl.float(ht[x]))
               .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = {i: cols[i] for i in range(len(cols))}

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)

        a = BlockMatrix.read(a_tmp)
        l2 = r2_adj_sparse @ a

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({'f{:}'.format(i): col_idxs[i]
                                  for i in range(len(cols))})

    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht
Example 21
def main(args):
    ########################################################################
    ### initialize
    print('Getting started: ' + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # 1. Read in summary stats data
    # 2. Annotate matrix table with effect sizes for each phenotype
    # 3. Compute PRS for each
    start = time.time()

    pheno_gwas = hl.import_table('gs://apcdr/pheno_code_ukb_code.txt')
    pheno_ss = dict([(x.pheno_code, x.ukb_code) for x in pheno_gwas.collect()])
    #pheno_ss = dict([(x.ss_code, x.pheno_code) for x in pheno_gwas.collect()])

    # mt = hl.read_matrix_table('gs://apcdr/prs_sumstats_clumps/ukb_holdout/ukb31063.gwas_holdout_sumstats_pheno37_subset.mt')
    mt = hl.read_matrix_table('gs://apcdr/dosage_bgen/apcdr.mt')
    ss_keys = dict(
        zip(['CHR', 'POS', 'REF', 'ALT', 'P', 'BETA'],
            args.chr_pos_ref_alt_p_beta.split(',')))

    for pheno in list(pheno_ss.keys()):
        #for pheno in ['WHR']:
        print('Pheno: ' + pheno + ', Time: ' +
              datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        suffix_replace = args.ss_suffix.split('.')
        suffix_replace[-2] = 'clumped'
        suffix_replace = '.'.join(suffix_replace)
        if hl.hadoop_exists(args.ss_clump_prefix + pheno + suffix_replace):
            ss_path = args.ss_clump_prefix + pheno + args.ss_suffix
            clump_path = args.ss_clump_prefix + pheno + suffix_replace
        elif hl.hadoop_exists(args.ss_clump_prefix + pheno_ss[pheno] +
                              suffix_replace):
            ss_path = args.ss_clump_prefix + pheno_ss[pheno] + args.ss_suffix
            clump_path = args.ss_clump_prefix + pheno_ss[pheno] + suffix_replace
        else:
            continue

        ss = hl.import_table(ss_path,
                             impute=True,
                             delimiter=r'\s+',
                             min_partitions=1000)
        ss = ss.annotate(locus=hl.locus(hl.str(ss[ss_keys['CHR']]),
                                        ss[ss_keys['POS']]),
                         alleles=[ss[ss_keys['REF']], ss[ss_keys['ALT']]])
        ss = ss.key_by(ss.locus, ss.alleles)

        ## Read in summary statistics and true phenotypes
        mt_annot = mt.annotate_rows(ss=ss[mt.locus,
                                          mt.alleles])  # come back to this
        # ht_samples = hl.import_table('gs://apcdr/ukb_holdout/ukb31063.gwas_samples.gwas_vs_holdout.txt',
        #                              types={'s': hl.tstr}, key='s')
        # ht_samples = hl.import_table('gs://apcdr/ukb_holdout/ukb31063.gwas_samples.holdout_and_target.txt',
        #                              types={'s': hl.tstr}, key='s')
        # # mt_annot = mt_annot.filter_cols(hl.or_else(ht_samples[mt_annot.s].in_gwas != 'TRUE', True))
        # mt_annot = mt_annot.filter_cols(hl.is_defined(ht_samples[mt_annot.s]))
        # # print(mt.count()) # 13364303, 136265)

        print('Starting ' + pheno + ': ' +
              datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        p_max = {
            's1': 5e-8,
            's2': 1e-6,
            's3': 1e-4,
            's4': 1e-3,
            's5': 1e-2,
            's6': .05,
            's7': .1,
            's8': .2,
            's9': .5,
            's10': 1
        }

        pheno_clump = specific_clumps(clump_path)

        mt_annot = mt_annot.filter_rows(pheno_clump.get(mt_annot.locus, False))
        # print(mt.count())

        annot_expr = {
            k: hl.agg.sum(
                hl.float(mt_annot.ss[ss_keys['BETA']]) * mt_annot.dosage *
                hl.int(mt_annot.ss[ss_keys['P']] < v))
            for k, v in p_max.items()
        }

        mt_annot = mt_annot.annotate_cols(**annot_expr)

        ht_out = mt_annot.cols()
        #ht_out.describe()
        #covs = hl.read_table('gs://apcdr/ukb_holdout/uk_round2_allSamples_phenos_phesant.ht').select('age', 'sex')  # added
        # need to add in PCs
        #ht_out = ht_out.annotate(**covs[ht_out.key])
        ht_comb = ht_out.select(*p_max.keys(),
                                age=ht_out.phenotypes.age,
                                sex=ht_out.phenotypes.sex,
                                pheno=ht_out.phenotypes[pheno])

        output_location = args.ss_clump_prefix + pheno + '_apcdr_PRS'
        #ht_comb.describe()
        #ht_comb.write(output_location + '.ht', overwrite=args.overwrite)
        #ht_comb = hl.read_table(output_location + '.ht')
        ht_comb.export(output_location + '.txt.bgz')

    end = time.time()
    print("Success! Job was completed in %s" %
          time.strftime("%H:%M:%S", time.gmtime(end - start)))
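Each s1..s10 score above is a p-value-thresholded PRS: per sample, the sum over clumped variants of beta * dosage * [p < threshold]. A toy check of a single term (numbers are made up): a variant with beta 0.1 and dosage 1.0 contributes 0.1 when its p-value passes the cutoff and 0 when it fails.

import hail as hl

hl.eval(hl.float(0.1) * 1.0 * hl.int(hl.float(2e-8) < 5e-8))  # 0.1
hl.eval(hl.float(0.1) * 1.0 * hl.int(hl.float(1e-7) < 5e-8))  # 0.0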
def create_binned_data_initial(ht: hl.Table, data: str, data_type: str, n_bins: int) -> hl.Table:
    # Count variants for ranking
    count_expr = {x: hl.agg.filter(hl.is_defined(ht[x]), hl.agg.counter(hl.cond(hl.is_snp(
        ht.alleles[0], ht.alleles[1]), 'snv', 'indel'))) for x in ht.row if x.endswith('rank')}
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(rank_variant_counts)}")
    ht_truth_data = hl.read_table(
        f"{temp_dir}/ddd-elgh-ukbb/variant_qc/truthset_table.ht")
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)
    ht = ht.annotate(
        **ht_truth_data[ht.key],
        # **fam_ht[ht.key],
        # **gnomad_ht[ht.key],
        # **denovo_ht[ht.key],
        # clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length()-ht.alleles[1].length()),
        rank_bins=hl.array(
            [hl.Struct(
                rank_id=rank_name,
                bin=hl.int(hl.ceil(hl.float(ht[rank_name] + 1) / hl.floor(ht.globals.rank_variant_counts[rank_name][hl.cond(
                    hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')] / n_bins)))
            )
                for rank_name in rank_variant_counts]
        ),
        # lcr=hl.is_defined(lcr_intervals[ht.locus])
    )

    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(
        rank_id=ht.rank_bins.rank_id,
        bin=ht.rank_bins.bin
    )
    ht = ht.filter(hl.is_defined(ht.bin))

    ht = ht.checkpoint(
        f'{tmp_dir}/gnomad_score_binning_tmp.ht', overwrite=True)

    # Create binned data
    return (
        ht
        .group_by(
            rank_id=ht.rank_id,
            contig=ht.locus.contig,
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            bi_allelic=hl.is_defined(ht.biallelic_rank),
            singleton=ht.transmitted_singleton,
            trans_singletons=hl.is_defined(ht.singleton_rank),
            de_novo_high_quality=ht.de_novo_high_quality_rank,
            de_novo_medium_quality=hl.is_defined(
                ht.de_novo_medium_quality_rank),
            de_novo_synonymous=hl.is_defined(ht.de_novo_synonymous_rank),
            # release_adj=ht.ac > 0,
            bin=ht.bin
        )._set_buffer_size(20000)
        .aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(
                hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(
                hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(hl.is_transition(
                ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(hl.is_transversion(
                ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            # n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.transmitted_singleton),
            n_high_quality_de_novos=hl.agg.count_where(
                ht.de_novo_data.p_de_novo[0] > 0.99),
            n_validated_DDD_denovos=hl.agg.count_where(
                ht.inheritance.contains("De novo")),
            n_medium_quality_de_novos=hl.agg.count_where(
                ht.de_novo_data.p_de_novo[0] > 0.5),
            n_high_confidence_de_novos=hl.agg.count_where(
                ht.de_novo_data.confidence[0] == 'HIGH'),
            n_de_novo=hl.agg.filter(ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0, hl.agg.sum(
                ht.family_stats.mendel[0].errors)),
            n_high_quality_de_novos_synonymous=hl.agg.count_where(
                (ht.de_novo_data.p_de_novo[0] > 0.99) & (ht.consequence == "synonymous_variant")),
            # n_de_novo_no_lcr=hl.agg.filter(~ht.lcr & (
            #    ht.family_stats.unrelated_qc_callstats.AC[1] == 0), hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_sites=hl.agg.filter(ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0, hl.agg.count_where(
                ht.family_stats.mendel[0].errors > 0)),
            # n_de_novo_sites_no_lcr=hl.agg.filter(~ht.lcr & (
            #    ht.family_stats.unrelated_qc_callstats.AC[1] == 0), hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_trans_singletons=hl.agg.filter((ht.ac_raw < 3) & (
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].t)),
            n_trans_singletons_synonymous=hl.agg.filter((ht.ac_raw < 3) & (ht.consequence == "synonymous_variant") & (
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].t)),
            n_untrans_singletons=hl.agg.filter((ht.ac_raw < 3) & (
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].u)),
            n_untrans_singletons_synonymous=hl.agg.filter((ht.ac_raw < 3) & (ht.consequence == "synonymous_variant") & (
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].u)),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1) & (ht.family_stats.tdt[0].t == 1)),
            n_omni=hl.agg.count_where(ht.omni),
            n_mills=hl.agg.count_where(ht.mills),
            n_hapmap=hl.agg.count_where(ht.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(
                ht.kgp_phase1_hc),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            # n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            # n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)
        )
    )
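The rank_bins annotation above maps a variant's rank into one of n_bins roughly equal-size bins: bin = ceil((rank + 1) / floor(count / n_bins)), where count is the number of variants of that type (snv or indel). A plain-Python check with made-up numbers:

import math

count, n_bins = 1000, 100
math.ceil((0 + 1) / math.floor(count / n_bins))    # rank 0   -> bin 1
math.ceil((999 + 1) / math.floor(count / n_bins))  # rank 999 -> bin 100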
Example 23
File: qc.py Project: troels/hail
def variant_qc(mt, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    >>> dataset_result = hl.variant_qc(dataset)

    Notes
    -----
    This method computes variant statistics from the genotype data, returning
    a new struct field `name` with the following metrics based on the fields
    present in the entry schema.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are always
    computed from `GT`:

    - `AF` (``array<float64>``) -- Calculated allele frequency, one element
      per allele, including the reference. Sums to one. Equivalent to
      `AC` / `AN`.
    - `AC` (``array<int32>``) -- Calculated allele count, one element per
      allele, including the reference. Sums to `AN`.
    - `AN` (``int32``) -- Total number of called alleles.
    - `homozygote_count` (``array<int32>``) -- Number of homozygotes per
      allele. One element per allele, including the reference.
    - `call_rate` (``float64``) -- Fraction of calls neither missing nor filtered.
      Equivalent to `n_called` / :meth:`.count_cols`.
    - `n_called` (``int64``) -- Number of samples with a defined `GT`.
    - `n_not_called` (``int64``) -- Number of samples with a missing `GT`.
    - `n_filtered` (``int64``) -- Number of filtered entries.
    - `n_het` (``int64``) -- Number of heterozygous samples.
    - `n_non_ref` (``int64``) -- Number of samples with at least one called
      non-reference allele.
    - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous
      samples under Hardy-Weinberg equilibrium. See
      :func:`.functions.hardy_weinberg_test` for details.
    - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg equilibrium.
      See :func:`.functions.hardy_weinberg_test` for details.

    Warning
    -------
    `het_freq_hwe` and `p_value_hwe` are calculated as in
    :func:`.functions.hardy_weinberg_test`, with non-diploid calls
    (``ploidy != 2``) ignored in the counts. As this test is only
    statistically rigorous in the biallelic setting, :func:`.variant_qc`
    sets both fields to missing for multiallelic variants. Consider using
    :func:`~hail.methods.split_multi` to split multi-allelic variants beforehand.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_row_key_variant(mt, 'variant_qc')

    bound_exprs = {}
    gq_dp_exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select(
            'mean', 'stdev', 'min', 'max')

    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select(
            'mean', 'stdev', 'min', 'max')

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError(
            f"'variant_qc': expect an entry field 'GT' of type 'call'")

    bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
    bound_exprs['n_filtered'] = mt.count_cols(_localize=False) - hl.agg.count()
    bound_exprs['call_stats'] = hl.agg.call_stats(mt.GT, mt.alleles)

    result = hl.rbind(
        hl.struct(**bound_exprs), lambda e1: hl.rbind(
            hl.case().when(
                hl.len(mt.alleles) == 2,
                hl.hardy_weinberg_test(
                    e1.call_stats.homozygote_count[0], e1.call_stats.AC[
                        1] - 2 * e1.call_stats.homozygote_count[1], e1.
                    call_stats.homozygote_count[1])).or_missing(), lambda hwe:
            hl.struct(
                **{
                    **gq_dp_exprs,
                    **e1.call_stats, 'call_rate':
                    hl.float(e1.n_called) /
                    (e1.n_called + e1.n_not_called + e1.n_filtered),
                    'n_called':
                    e1.n_called,
                    'n_not_called':
                    e1.n_not_called,
                    'n_filtered':
                    e1.n_filtered,
                    'n_het':
                    e1.n_called - hl.sum(e1.call_stats.homozygote_count),
                    'n_non_ref':
                    e1.n_called - e1.call_stats.homozygote_count[0],
                    'het_freq_hwe':
                    hwe.het_freq_hwe,
                    'p_value_hwe':
                    hwe.p_value
                })))

    return mt.annotate_rows(**{name: result})
Example 24
def main(args):
    ########################################################################
    ### initialize
    phenos = [
        'height', 'bmi', 'sbp', 'dbp', 'wbc', 'monocyte', 'neutrophil',
        'eosinophil', 'basophil', 'lymphocyte', 'rbc', 'mch', 'mcv', 'mchc',
        'hb', 'ht', 'plt'
    ]
    phenos.sort()
    phenotype = 'ALL17'

    if args.clump_basename is None:
        clumps = args.dirname + args.basename + '_ALL17.clumped'
        prs_loci_table_location = args.dirname + 'keytables/ukb-' + phenotype + '_' + args.basename + '-pt-sumstats-locus-allele-keyed.kt'
        contig_row_dict_location = args.dirname + 'contig_row_dict-' + phenotype + '_' + args.basename
    else:
        clumps = args.dirname + args.clump_basename + '_ALL17.clumped'
        prs_loci_table_location = args.dirname + 'keytables/ukb-' + phenotype + '_' + args.basename_out + '-pt-sumstats-locus-allele-keyed.kt'
        contig_row_dict_location = args.dirname + 'contig_row_dict-' + phenotype + '_' + args.basename_out

        # clumps = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '_beta' + args.which_beta + '.clumped'
        # ss_filename = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '.tsv.gz'
        # out_base = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '_beta' + args.which_beta + '_gwas_PRS'

    clump_table_location = clumps.replace('.clumped', '.kt')

    contigs = {'0{}'.format(x): str(x) for x in range(1, 10)}

    bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'

    start = time.time()
    # large block size because we read very little data (due to filtering & ignoring genotypes)
    hl.init(branching_factor=10, min_block_size=2000)
    # set min_block_size only in import_bgen

    ################################################################################
    ### set up the sumstats table (chr, bp for union SNPs)
    # if (args.generate_prs_loci_table):
    #     t = hl.import_table(sumstats_text_file,
    #                         delimiter='\s+',
    #                         impute=True)
    #     t = t.select(locus = hl.locus(hl.str(t.CHR), t.BP))
    #     t = t.key_by('locus')
    #     t.write(prs_loci_table_location, overwrite=True)
    #
    # ss = hl.read_table(prs_loci_table_location)

    if args.read_clumps:
        clump_file = hl.import_table(clumps, delimiter=r'\s+', impute=True)
        clump_file = clump_file.select(
            locus=hl.locus(hl.str(clump_file.CHR), clump_file.BP))
        clump_file = clump_file.key_by('locus')
        clump_file.write(clump_table_location, overwrite=True)

    clump_file = hl.read_table(clump_table_location)

    # ################################################################################
    # ### determine the indices of the prs variants in bgen
    # if (args.generate_contig_row_dict):
    #     mt = hl.methods.import_bgen(bgen_files,
    #                                 [],
    #                                 contig_recoding=contigs,
    #                                 _row_fields=['file_row_idx'])
    #     prs_rows = mt.filter_rows(hl.is_defined(ss[mt.locus])).rows()
    #     print('about to collect')
    #     # remove all unnecessary data, dropping keys and other irrelevant fields
    #     prs_rows = prs_rows.key_by()
    #     prs_rows = prs_rows.select(contig=prs_rows.locus.contig,
    #                                file_row_idx=prs_rows.file_row_idx)
    #     contig_row_list = prs_rows.collect()
    #     print('finished collecting')
    #     contig_reformed = [(x['contig'], x['file_row_idx']) for x in contig_row_list]
    #     print('reformed')
    #     from collections import defaultdict
    #     contig_row_dict = defaultdict(list)
    #     for k, v in contig_reformed:
    #         contig_row_dict[k].append(v)
    #     print('dictionary created')
    #
    #     with hl.hadoop_open(contig_row_dict_location, 'wb') as f:
    #         pickle.dump(contig_row_dict, f)
    # else:
    #     with hl.hadoop_open(contig_row_dict_location, 'rb') as f:
    #         contig_row_dict = pickle.load(f)

    ################################################################################
    ### Get true phenotypes from UKBB
    if args.pheno_table:
        # phenotypes = hl.import_table('gs://phenotype_31063/ukb31063.phesant_phenotypes.both_sexes.tsv.bgz',
        #                                key='userId', quote='"', impute=True, types={'userId': hl.tstr}, missing='')
        phenotypes = hl.import_table(
            'gs://armartin/disparities/ukbb/UKB_phenos_ALL17.txt.bgz',
            key='eid',
            impute=True,
            types={'eid': hl.tstr})

        covariates = hl.import_table(
            'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv',
            key='s',
            impute=True,
            types={'s': hl.tstr})

        samples = covariates.annotate(**phenotypes[covariates.s])

        # Write pheno/covar/sample info table
        for pheno in phenos:
            #sampleids = hl.import_table('gs://ukb31063-mega-gwas/hail-0.1/qc/ukb31063.gwas_samples.txt', delimiter='\s+').key_by('s')
            gwas_holdout = hl.import_table(
                'gs://armartin/mama/ukb31063.gwas_samples.gwas_vs_holdout.txt',
                delimiter=r'\s+').key_by('s')

            samples = samples.annotate(**{
                pheno + '_holdout':
                gwas_holdout[samples.s].in_gwas == 'FALSE'
            })

        samples.write(args.dirname + args.basename + '_holdout_gwas_phenos.ht',
                      True)

    if args.ss_tables:
        # Write ss info
        for pheno in phenos:
            print(pheno)
            # change sumstats to bgz
            #ss = hl.import_table('gs://armartin/disparities/pheno_31063_holdout_gwas_' + pheno + '.txt.gz',
            ss = hl.import_table(args.dirname + pheno + '_' + args.basename +
                                 '.*.bgz',
                                 delimiter=r'\s+',
                                 impute=True,
                                 types={
                                     'MAMA_BETA': hl.tfloat,
                                     'MAMA_PVAL': hl.tfloat,
                                     'BP': hl.tint
                                 })
            #, 'N': hl.tint})
            ss = ss.key_by(
                locus=hl.locus(hl.str(ss.CHR), hl.int(ss.BP))).repartition(200)

            ss.write(args.dirname + pheno + '_' + args.basename + '.ht', True)

    ################################################################################
    ### Run the PRS using phenotype-specific clump variants
    if args.write_bgen:
        mt_all = hl.import_bgen(
            bgen_files, ['dosage'],
            sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
            variants=clump_file.locus)
        # contig_row_dict2 = {'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{contig}_v3.bgen'.format(contig=k): v for k, v in contig_row_dict.items()}
        # mt_all = hl.methods.import_bgen(bgen_files,
        #                             ['dosage'],
        #                             sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
        #                             contig_recoding=contigs,
        #                             _variants_per_file=contig_row_dict2,
        #                             _row_fields=[])

        #samples.write(args.dirname + args.basename + '_holdout_gwas_phenos.ht', True)
        samples = hl.read_table(args.dirname + args.basename +
                                '_holdout_gwas_phenos.ht')
        mt_all = mt_all.annotate_cols(**samples[
            mt_all.s])  # ok that phenos keyed on userId not s?

        #
        if args.clump_basename is None:
            mt_all.repartition(5000, shuffle=False).write(
                args.dirname + args.basename + '_ALL17.mt', True)
        else:
            mt_all.repartition(5000, shuffle=False).write(
                args.dirname + args.basename_out + '_ALL17.mt', True)

    mt_all = hl.read_matrix_table(args.dirname + args.basename + '_ALL17.mt')

    for pheno in phenos:  #[6:len(phenos)]:
        print(pheno)
        ss = hl.read_table(args.dirname + pheno + '_' + args.basename + '.ht')
        """
        To add:
        - Filter only to samples in holdout GWAS
        - Filter to rows in phenotype-specific clump file
        - Build PRS for 10 p-value thresholds
        - Also fix nt1/nt2 to A1 and A2 (check) from sumstats.
        """
        # filter to only samples held out from GWAS
        mt = mt_all.filter_cols(mt_all[pheno + '_holdout'])

        mt = mt.annotate_rows(ss=ss[mt.locus])
        mt = annotate_beta(mt, mt.ss)

        p_max = {
            's1': 5e-8,
            's2': 1e-6,
            's3': 1e-4,
            's4': 1e-3,
            's5': 1e-2,
            's6': .05,
            's7': .1,
            's8': .2,
            's9': .5,
            's10': 1
        }

        if args.clump_basename is None:
            pheno_clump = specific_clumps(args.dirname + pheno + '_' +
                                          args.basename + '.clumped')
        else:
            pheno_clump = specific_clumps(args.dirname + pheno + '_' +
                                          args.clump_basename + '.clumped')

        mt = mt.filter_rows(pheno_clump.get(mt.locus, False))
        print(mt.count())

        # divide by sd's of frequencies to get standardized betas back to allelic scale for MAMA betas (only, not METAL)
        # sqrt(2pq)
        if args.betas_are_standardized:
            annot_expr = {
                # sqrt(2pq): parenthesize (1 - p); `* 1 - p` computes p, not 2p(1-p)
                k: hl.agg.sum(mt.beta / hl.sqrt(2 * hl.float(mt.ss.FRQ) *
                                                (1 - hl.float(mt.ss.FRQ))) *
                              mt.dosage * hl.int(mt.ss.MAMA_PVAL < v))
                for k, v in p_max.items()
            }
        else:
            annot_expr = {
                k:
                hl.agg.sum(mt.beta * mt.dosage * hl.int(mt.ss.MAMA_PVAL < v))
                for k, v in p_max.items()
            }

        mt = mt.annotate_cols(**annot_expr)

        if args.clump_basename is None:
            mt.cols().write(args.dirname + 'UKB_' + pheno + '_' +
                            args.basename + '_PRS.ht',
                            stage_locally=True,
                            overwrite=True)
            ht = hl.read_table(args.dirname + 'UKB_' + pheno + '_' +
                               args.basename + '_PRS.ht')
        else:
            mt.cols().write(args.dirname + 'UKB_' + pheno + '_' +
                            args.basename_out + '_PRS.ht',
                            stage_locally=True,
                            overwrite=True)
            ht = hl.read_table(args.dirname + 'UKB_' + pheno + '_' +
                               args.basename_out + '_PRS.ht')
        ht_out = ht.drop(*[x for x in list(ht.row) if 'holdout' in x],
                         *[x for x in phenos if pheno not in x])

        if args.clump_basename is None:
            output_location = args.dirname + 'UKB_' + pheno + '_' + args.basename + '_PRS.txt.bgz'
        else:
            output_location = args.dirname + 'UKB_' + pheno + '_' + args.basename_out + '_PRS.txt.bgz'
        ht_out.export(output_location)
    end = time.time()
    print("Success! Job was completed in %s" %
          time.strftime("%H:%M:%S", time.gmtime(end - start)))
def create_binned_data(ht: hl.Table, data: str, data_type: str,
                       n_bins: int) -> hl.Table:
    """
    Creates binned data from a rank Table grouped by rank_id (rank, biallelic, etc.), contig, snv, bi_allelic and singleton
    containing the information needed for evaluation plots.

    :param Table ht: Input rank table
    :param str data: Which data/run hash is being created
    :param str data_type: one of 'exomes' or 'genomes'
    :param int n_bins: Number of bins.
    :return: Binned Table
    :rtype: Table
    """

    # Count variants for ranking
    count_expr = {
        x: hl.agg.filter(
            hl.is_defined(ht[x]),
            hl.agg.counter(
                hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv',
                        'indel')))
        for x in ht.row if x.endswith('rank')
    }
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(rank_variant_counts)}"
    )
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)

    # Load external evaluation data
    clinvar_ht = hl.read_table(clinvar_ht_path)
    denovo_ht = get_validated_denovos_ht()
    if data_type == 'exomes':
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_exomes.high_quality)
    else:
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_genomes.high_quality)
    denovo_ht = denovo_ht.select(
        validated_denovo=denovo_ht.validated,
        high_confidence_denovo=denovo_ht.Confidence == 'HIGH')
    ht_truth_data = hl.read_table(annotations_ht_path(data_type, 'truth_data'))
    fam_ht = hl.read_table(annotations_ht_path(data_type, 'family_stats'))
    fam_ht = fam_ht.select(family_stats=fam_ht.family_stats[0])
    gnomad_ht = get_gnomad_data(data_type).rows()
    gnomad_ht = gnomad_ht.select(
        vqsr_negative_train_site=gnomad_ht.info.NEGATIVE_TRAIN_SITE,
        vqsr_positive_train_site=gnomad_ht.info.POSITIVE_TRAIN_SITE,
        fail_hard_filters=(gnomad_ht.info.QD < 2) | (gnomad_ht.info.FS > 60) |
        (gnomad_ht.info.MQ < 30))
    lcr_intervals = hl.import_locus_intervals(lcr_intervals_path)

    ht = ht.annotate(
        **ht_truth_data[ht.key],
        **fam_ht[ht.key],
        **gnomad_ht[ht.key],
        **denovo_ht[ht.key],
        clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length() - ht.alleles[1].length()),
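        # Assign each variant a rank bin per rank_id:
        # bin = ceil((rank + 1) / floor(count / n_bins)), where count is the
        # total number of SNVs or indels (matching this variant) for that rank.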
        rank_bins=hl.array([
            hl.Struct(
                rank_id=rank_name,
                bin=hl.int(
                    hl.ceil(
                        hl.float(ht[rank_name] + 1) / hl.floor(
                            ht.globals.rank_variant_counts[rank_name][hl.cond(
                                hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv',
                                'indel')] / n_bins))))
            for rank_name in rank_variant_counts
        ]),
        lcr=hl.is_defined(lcr_intervals[ht.locus]))

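    # Explode to one row per (variant, rank_id) pair and promote the struct
    # fields to top-level columns.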
    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(rank_id=ht.rank_bins.rank_id, bin=ht.rank_bins.bin)
    ht = ht.filter(hl.is_defined(ht.bin))

    ht = ht.checkpoint(
        f'gs://gnomad-tmp/gnomad_score_binning_{data_type}_tmp_{data}.ht',
        overwrite=True)

    # Create binned data
    return (ht.group_by(
        rank_id=ht.rank_id,
        contig=ht.locus.contig,
        snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
        bi_allelic=hl.is_defined(ht.biallelic_rank),
        singleton=ht.singleton,
        release_adj=ht.ac > 0,
        bin=ht.bin)._set_buffer_size(20000).aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(
                hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(
                hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(
                hl.is_transition(ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(
                hl.is_transversion(ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.singleton),
            n_validated_de_novos=hl.agg.count_where(ht.validated_denovo),
            n_high_confidence_de_novos=hl.agg.count_where(
                ht.high_confidence_denovo),
            n_de_novo=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_sites=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_de_novo_sites_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_trans_singletons=hl.agg.filter(
                (ht.info_ac < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.t)),
            n_untrans_singletons=hl.agg.filter(
                (ht.info_ac < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.u)),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1)
                & (ht.family_stats.tdt.t == 1)),
            n_omni=hl.agg.count_where(ht.truth_data.omni),
            n_mills=hl.agg.count_where(ht.truth_data.mills),
            n_hapmap=hl.agg.count_where(ht.truth_data.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(
                ht.truth_data.kgp_high_conf_snvs),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)))
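
# A hedged usage sketch (the bucket path and run hash below are hypothetical):
#   ht = hl.read_table('gs://my-bucket/rf_rank.ht')
#   binned = create_binned_data(ht, data='run_hash', data_type='exomes', n_bins=100)
#   binned.export('gs://my-bucket/binned.tsv.bgz')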
Example 26
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+


    Warning
    -------
        :func:`.ld_score` will fail if ``entry_expr`` results in any missing
        values. The special float value ``nan`` is not considered a
        missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----

    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.


    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr : :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus
        position.
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation."""

    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source

    if coord_expr is None:
        mt_coord_expr = mt_locus_expr
    else:
        mt_coord_expr = coord_expr._indices.source

    if not annotation_exprs:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr])
    else:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr] +
                        [mt == x._indices.source
                         for x in wrap_to_list(annotation_exprs)])

    if not check_mts:
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
                            (if specified), and annotation_exprs (if
                            specified) must come from same MatrixTable.""")

    n = mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
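    # Bias-correct the squared correlations: E[r^2] is roughly
    # rho^2 + (1 - rho^2) / (n - 2), so the adjusted estimator is
    # ((n - 1) / (n - 2)) * r^2 - 1 / (n - 2), as computed below.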
    r2_adj = ((n-1.0) / (n-2.0)) * r2 - (1.0 / (n-2.0))

    starts, stops = hl.linalg.utils.locus_windows(locus_expr,
                                                  radius,
                                                  coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotation_exprs:
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
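        # Univariate LD score: row sums of adjusted r^2 over each variant's window.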
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*wrap_to_list(annotation_exprs)).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x),
                           value=hl.float(ht[x]))
                 .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = {i: cols[i] for i in range(len(cols))}

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)

        a = BlockMatrix.read(a_tmp)
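        # Partitioned LD scores: multiply the windowed r^2 matrix by the
        # (variant x annotation) matrix, giving one score column per annotation.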
        l2 = r2_adj_sparse @ a

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({'f{:}'.format(i): col_idxs[i]
                                  for i in range(len(cols))})

    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht
Example 27
File: qc.py Project: jigold/hail
def variant_qc(mt, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    >>> dataset_result = hl.variant_qc(dataset)

    Notes
    -----
    This method computes variant statistics from the genotype data, returning
    a new struct field `name` with the following metrics based on the fields
    present in the entry schema.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are always
    computed from `GT`:

    - `AF` (``array<float64>``) -- Calculated allele frequency, one element
      per allele, including the reference. Sums to one. Equivalent to
      `AC` / `AN`.
    - `AC` (``array<int32>``) -- Calculated allele count, one element per
      allele, including the reference. Sums to `AN`.
    - `AN` (``int32``) -- Total number of called alleles.
    - `homozygote_count` (``array<int32>``) -- Number of homozygotes per
      allele. One element per allele, including the reference.
    - `call_rate` (``float64``) -- Fraction of calls neither missing nor filtered.
      Equivalent to `n_called` / :meth:`.count_cols`.
    - `n_called` (``int64``) -- Number of samples with a defined `GT`.
    - `n_not_called` (``int64``) -- Number of samples with a missing `GT`.
    - `n_filtered` (``int64``) -- Number of filtered entries.
    - `n_het` (``int64``) -- Number of heterozygous samples.
    - `n_non_ref` (``int64``) -- Number of samples with at least one called
      non-reference allele.
    - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous
      samples under Hardy-Weinberg equilibrium. See
      :func:`.functions.hardy_weinberg_test` for details.
    - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg equilibrium.
      See :func:`.functions.hardy_weinberg_test` for details.

    Warning
    -------
    `het_freq_hwe` and `p_value_hwe` are calculated as in
    :func:`.functions.hardy_weinberg_test`, with non-diploid calls
    (``ploidy != 2``) ignored in the counts. As this test is only
    statistically rigorous in the biallelic setting, :func:`.variant_qc`
    sets both fields to missing for multiallelic variants. Consider using
    :func:`~hail.methods.split_multi` to split multi-allelic variants beforehand.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_row_key_variant(mt, 'variant_qc')

    bound_exprs = {}
    gq_dp_exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max')

    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max')

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError("'variant_qc': expect an entry field 'GT' of type 'call'")

    bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
    bound_exprs['n_filtered'] = mt.count_cols(_localize=False) - hl.agg.count()
    bound_exprs['call_stats'] = hl.agg.call_stats(mt.GT, mt.alleles)

    result = hl.rbind(hl.struct(**bound_exprs),
                      lambda e1: hl.rbind(
                          hl.case().when(hl.len(mt.alleles) == 2,
                                         hl.hardy_weinberg_test(e1.call_stats.homozygote_count[0],
                                                                e1.call_stats.AC[1] - 2 *
                                                                e1.call_stats.homozygote_count[1],
                                                                e1.call_stats.homozygote_count[1])
                                         ).or_missing(),
                          lambda hwe: hl.struct(**{
                              **gq_dp_exprs,
                              **e1.call_stats,
                              'call_rate': hl.float(e1.n_called) / (e1.n_called + e1.n_not_called + e1.n_filtered),
                              'n_called': e1.n_called,
                              'n_not_called': e1.n_not_called,
                              'n_filtered': e1.n_filtered,
                              'n_het': e1.n_called - hl.sum(e1.call_stats.homozygote_count),
                              'n_non_ref': e1.n_called - e1.call_stats.homozygote_count[0],
                              'het_freq_hwe': hwe.het_freq_hwe,
                              'p_value_hwe': hwe.p_value})))

    return mt.annotate_rows(**{name: result})
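
# A hedged usage sketch building on the docstring example above:
#   ds = hl.variant_qc(dataset)
#   common = ds.filter_rows((ds.variant_qc.call_rate > 0.97) &
#                           (ds.variant_qc.AF[1] > 0.01))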
Example 28
def prepare_base_level_pext(base_level_pext_path):
    tmp_dir = os.path.expanduser("~")

    #
    # Step 1: rename fields, extract chrom/pos from locus, convert missing values to 0, export to TSV
    #
    ds = hl.read_table(base_level_pext_path)

    ds = ds.select(
        gene_id=ds.ensg,
        chrom=ds.locus.contig,
        pos=ds.locus.position,
        # Replace NaNs and missing values with 0s
        mean=hl.if_else(
            hl.is_missing(ds.mean_proportion) | hl.is_nan(ds.mean_proportion),
            hl.float(0), ds.mean_proportion),
        **{
            renamed: hl.if_else(
                hl.is_missing(ds[original]) | hl.is_nan(ds[original]),
                hl.float(0), ds[original])
            for original, renamed in TISSUE_NAME_MAP.items()
        })

    ds = ds.order_by(ds.gene_id, hl.asc(ds.pos)).drop("locus")
    ds.export("file://" + os.path.join(tmp_dir, "bases.tsv"))

    #
    # Step 2: Collect base-level data into regions
    #
    with open(os.path.join(tmp_dir, "regions.tsv"), "w") as output_file:
        writer = csv.writer(output_file, delimiter="\t")
        writer.writerow(["gene_id", "chrom", "start", "stop", "mean"] +
                        TISSUE_FIELDS)

        def output_region(region):
            writer.writerow([
                region.gene, region.chrom, region.start, region.stop,
                region.tissues["mean"]
            ] + [region.tissues[t] for t in TISSUE_FIELDS])

        rows = read_bases_tsv(os.path.join(tmp_dir, "bases.tsv"))
        first_row = next(rows)
        current_region = Region(gene=first_row.gene,
                                chrom=first_row.chrom,
                                start=first_row.pos,
                                stop=None,
                                tissues=first_row.tissues)
        last_pos = first_row.pos

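        # Run-length encode the bases: a region ends when the gene or contig
        # changes, the position skips ahead, or any tissue value differs.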
        for row in tqdm(rows):
            if (row.gene != current_region.gene
                    or row.chrom != current_region.chrom or row.pos >
                (last_pos + 1)
                    or any(row.tissues[t] != current_region.tissues[t]
                           for t in row.tissues)):
                output_region(current_region._replace(stop=last_pos))
                current_region = Region(gene=row.gene,
                                        chrom=row.chrom,
                                        start=row.pos,
                                        stop=None,
                                        tissues=row.tissues)

            last_pos = row.pos

        output_region(current_region._replace(stop=last_pos))

    # Copy regions file to HDFS
    subprocess.run(
        [
            "hdfs", "dfs", "-cp",
            "file://" + os.path.join(tmp_dir, "regions.tsv"),
            os.path.join("/tmp/regions.tsv")
        ],
        check=True,
    )

    #
    # Step 3: Convert regions to a Hail table.
    #
    types = {t: hl.tfloat for t in TISSUE_FIELDS}
    types["gene_id"] = hl.tstr
    types["chrom"] = hl.tstr
    types["start"] = hl.tint
    types["stop"] = hl.tint
    types["mean"] = hl.tfloat

    ds = hl.import_table("/tmp/regions.tsv",
                         min_partitions=100,
                         missing="",
                         types=types)

    ds = ds.select("gene_id",
                   "chrom",
                   "start",
                   "stop",
                   "mean",
                   tissues=hl.struct(**{t: ds[t]
                                        for t in TISSUE_FIELDS}))

    ds = ds.group_by("gene_id").aggregate(
        regions=hl.agg.collect(ds.row_value.drop("gene_id")))

    return ds
def main(args):
    input_tsv = args.input_tsv
    output_ht = args.output_ht
    chunk_size = args.chunk_size
    overwrite = args.overwrite

    mt_list = []
    logger.info(
        "Reading in individual coverage files as matrix tables and adding to a list of matrix tables..."
    )
    with open(input_tsv, "r") as f:
        for line in f:
            line = line.rstrip()
            items = line.split("\t")
            sample, base_level_coverage_metrics = items[0:2]

            mt = hl.import_matrix_table(
                base_level_coverage_metrics,
                delimiter="\t",
                row_fields={
                    "chrom": hl.tstr,
                    "pos": hl.tint,
                    "target": hl.tstr
                },
                row_key=["chrom", "pos"],
            ).drop("target")
            mt = mt.rename({"x": "coverage"})
            mt = mt.key_cols_by(s=sample)
            mt_list.append(mt)

    logger.info("Joining individual coverage mts...")
    out_dir = dirname(output_ht)
    temp_out_dir = out_dir + "/temp"

    cov_mt = multi_way_union_mts(mt_list, temp_out_dir, chunk_size)
    n_samples = cov_mt.count_cols()

    logger.info("Adding coverage annotations...")
    cov_mt = cov_mt.annotate_rows(
        locus=hl.locus(cov_mt.chrom, cov_mt.pos, reference_genome="GRCh38"),
        mean=hl.float(hl.agg.mean(cov_mt.coverage)),
        median=hl.median(hl.agg.collect(cov_mt.coverage)),
        over_100=hl.float(
            (hl.agg.count_where(cov_mt.coverage > 100) / n_samples)),
        over_1000=hl.float(
            (hl.agg.count_where(cov_mt.coverage > 1000) / n_samples)),
    )
    cov_mt.show()

    cov_mt = cov_mt.key_rows_by("locus").drop("chrom", "pos")

    output_mt = re.sub(r"\.ht$", ".mt", output_ht)
    output_tsv = re.sub(r"\.ht$", ".tsv", output_ht)
    output_samples = re.sub(r"\.ht$", "_sample_level.txt", output_ht)

    logger.info("Writing sample level coverage...")
    sample_mt = cov_mt.key_rows_by(pos=cov_mt.locus.position)
    sample_mt.coverage.export(output_samples)

    logger.info("Writing coverage mt and ht...")
    cov_mt.write(output_mt, overwrite=overwrite)
    cov_ht = cov_mt.rows()
    cov_ht = cov_ht.checkpoint(output_ht, overwrite=overwrite)
    cov_ht.export(output_tsv)
Example 30
def prepare_mitochondrial_variants(path, mnvs_path=None):
    ds = hl.read_table(path)

    haplogroups = hl.eval(ds.globals.hap_order)

    ds = ds.annotate(hl_hist=ds.hl_hist.annotate(
        bin_edges=ds.hl_hist.bin_edges.map(
            lambda n: hl.float(hl.format("%.2f", n)))))

    filter_names = hl.dict({
        "artifact_prone_site": "Artifact-prone site",
        "indel_stack": "Indel stack",
        "npg": "No passing genotype"
    })

    ds = ds.select(
        # ID
        variant_id=variant_id(ds.locus, ds.alleles),
        reference_genome=ds.locus.dtype.reference_genome.name,
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        ref=ds.alleles[0],
        alt=ds.alleles[1],
        rsid=ds.rsid,
        # Quality
        filters=ds.filters.map(lambda f: filter_names.get(f, f)),
        qual=ds.qual,
        genotype_quality_metrics=[
            hl.struct(name="Depth", alt=ds.dp_hist_alt, all=ds.dp_hist_all)
        ],
        genotype_quality_filters=[
            hl.struct(
                name="Base Quality",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.base_qual_hist),
            ),
            hl.struct(
                name="Contamination",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.contamination_hist),
            ),
            hl.struct(
                name="Heteroplasmy below 10%",
                filtered=hl.struct(
                    bin_edges=ds.hl_hist.bin_edges,
                    bin_freq=ds.heteroplasmy_below_10_percent_hist),
            ),
            hl.struct(name="Position",
                      filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                         bin_freq=ds.position_hist)),
            hl.struct(
                name="Strand Bias",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.strand_bias_hist),
            ),
            hl.struct(
                name="Weak Evidence",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.weak_evidence_hist),
            ),
        ],
        site_quality_metrics=[
            hl.struct(name="Mean Depth", value=nullify_nan(ds.dp_mean)),
            hl.struct(name="Mean MQ", value=nullify_nan(ds.mq_mean)),
            hl.struct(name="Mean TLOD", value=nullify_nan(ds.tlod_mean)),
        ],
        # Frequency
        an=ds.AN,
        ac_hom=ds.AC_hom,
        ac_het=ds.AC_het,
        excluded_ac=ds.excluded_AC,
        # Heteroplasmy
        common_low_heteroplasmy=ds.common_low_heteroplasmy,
        heteroplasmy_distribution=ds.hl_hist,
        max_heteroplasmy=ds.max_hl,
        # Populations
        populations=hl.sorted(
            hl.range(hl.len(
                ds.globals.pop_order)).map(lambda pop_index: hl.struct(
                    id=ds.globals.pop_order[pop_index],
                    an=ds.pop_AN[pop_index],
                    ac_het=ds.pop_AC_het[pop_index],
                    ac_hom=ds.pop_AC_hom[pop_index],
                    heteroplasmy_distribution=hl.struct(
                        bin_edges=ds.hl_hist.bin_edges,
                        bin_freq=ds.pop_hl_hist[pop_index],
                        n_smaller=0,
                        n_larger=0,
                    ),
                )),
            key=lambda pop: pop.id,
        ),
        # Haplogroups
        hapmax_af_hom=ds.hapmax_AF_hom,
        hapmax_af_het=ds.hapmax_AF_het,
        faf_hapmax_hom=ds.faf_hapmax_hom,
        haplogroup_defining=ds.hap_defining_variant,
        haplogroups=[
            hl.struct(
                id=haplogroup,
                an=ds.hap_AN[i],
                ac_het=ds.hap_AC_het[i],
                ac_hom=ds.hap_AC_hom[i],
                faf_hom=ds.hap_faf_hom[i],
                heteroplasmy_distribution=ds.hap_hl_hist[i],
            ) for i, haplogroup in enumerate(haplogroups)
        ],
        # Other
        age_distribution=hl.struct(het=ds.age_hist_het, hom=ds.age_hist_hom),
        flags=hl.set([
            hl.or_missing(ds.common_low_heteroplasmy,
                          "common_low_heteroplasmy")
        ]).filter(hl.is_defined),
        mitotip_score=ds.mitotip_score,
        mitotip_trna_prediction=ds.mitotip_trna_prediction,
        pon_ml_probability_of_pathogenicity=ds.pon_ml_probability_of_pathogenicity,
        pon_mt_trna_prediction=ds.pon_mt_trna_prediction,
        variant_collapsed=ds.variant_collapsed,
        vep=ds.vep,
    )

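    # When a table of multi-nucleotide variants is supplied, join it on locus
    # and alleles, record homoplasmic MNV counts, and flag affected variants.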
    if mnvs_path:
        mnvs = hl.import_table(mnvs_path,
                               types={
                                   "pos": hl.tint,
                                   "ref": hl.tstr,
                                   "alt": hl.tstr,
                                   "AC_hom_MNV": hl.tint
                               })
        mnvs = mnvs.key_by(
            locus=hl.locus("chrM",
                           mnvs.pos,
                           reference_genome=ds.locus.dtype.reference_genome),
            alleles=[mnvs.ref, mnvs.alt],
        )
        ds = ds.annotate(ac_hom_mnv=hl.or_else(mnvs[ds.key].AC_hom_MNV, 0))
        ds = ds.annotate(
            flags=hl.if_else(ds.ac_hom_mnv > 0, ds.flags.add("mnv"), ds.flags))

    return ds
import hail as hl
mt = hl.balding_nichols_model(3, 100, 100)
gts_as_rows = mt.annotate_rows(
    mean=hl.agg.mean(hl.float(mt.GT.n_alt_alleles())),
    genotypes=hl.agg.collect(hl.float(mt.GT.n_alt_alleles()))).rows()
groups = gts_as_rows.group_by(
    ld_block=gts_as_rows.locus.position // 10).aggregate(
        genotypes=hl.agg.collect(gts_as_rows.genotypes),
        ys=hl.agg.collect(gts_as_rows.mean))

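# Convert the grouped table to a Spark DataFrame so that each LD block's
# genotype matrix can be handed to scikit-learn inside a Spark UDF.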
df = groups.to_spark()

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType


def get_intercept(X, y):
    from sklearn import linear_model
    clf = linear_model.Lasso(alpha=0.1)
    clf.fit(X, y)
    return float(clf.intercept_)


# Declaring the return type makes Spark produce a double column rather than
# the default string column.
get_intercept_udf = udf(get_intercept, DoubleType())
df.select(get_intercept_udf("genotypes", "ys").alias("intercept")).show()
Example 32
import hail as hl

root = 'gs://hail-datasets-raw-data/LDSC/baselineLD_v2.2'

mt = hl.import_matrix_table(f'{root}/ld_scores.GRCh37.tsv.bgz',
    row_fields={'CHR': hl.tstr, 'SNP': hl.tstr, 'BP': hl.tint}, entry_type=hl.tstr)

mt = mt.annotate_entries(x=hl.float(mt['x']))
mt = mt.annotate_rows(
    locus=hl.locus(mt['CHR'], mt['BP'], 'GRCh37'))
mt = mt.key_rows_by('locus')
mt = mt.select_rows('SNP')

M = hl.import_table(
    f'{root}/M.GRCh37.tsv.bgz', key='annotation')
M_5_50 = hl.import_table(
    f'{root}/M_5_50.GRCh37.tsv.bgz', key='annotation')

mt = mt.rename({'col_id': 'annotation'})
mt = mt.annotate_cols(
    M_5_50=hl.int(hl.float(M_5_50[mt.annotation].M_5_50)),
    M=hl.int(hl.float(M[mt.annotation].M)))

n_rows, n_cols = mt.count()
n_partitions = mt.n_partitions()

mt = mt.annotate_globals(
    metadata=hl.struct(
        name='LDSC_baselineLD_v2.2_ld_scores',
        reference_genome='GRCh37',