Code example #1
File: test_misc.py Project: danking/hail
    def test_window_by_locus(self):
        mt = hl.utils.range_matrix_table(100, 2, n_partitions=10)
        mt = mt.annotate_rows(locus=hl.locus('1', mt.row_idx + 1))
        mt = mt.key_rows_by('locus')
        mt = mt.annotate_entries(e_row_idx=mt.row_idx, e_col_idx=mt.col_idx)
        mt = hl.window_by_locus(mt, 5).cache()
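        # window_by_locus adds a row field prev_rows and an entry field
        # prev_entries, holding the rows/entries within the preceding 5 bp window.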

        self.assertEqual(mt.count_rows(), 100)

        rows = mt.rows()
        self.assertTrue(
            rows.all((rows.row_idx < 5) | (rows.prev_rows.length() == 5)))
        self.assertTrue(
            rows.all(
                hl.all(lambda x: (rows.row_idx - 1 - x[0]) == x[1].row_idx,
                       hl.zip_with_index(rows.prev_rows))))

        entries = mt.entries()
        self.assertTrue(
            entries.all(
                hl.all(lambda x: x.e_col_idx == entries.col_idx,
                       entries.prev_entries)))
        self.assertTrue(
            entries.all(
                hl.all(lambda x: entries.row_idx - 1 - x[0] == x[1].e_row_idx,
                       hl.zip_with_index(entries.prev_entries))))
Code example #2
def add_variant_type(alt_alleles: hl.expr.ArrayExpression) -> hl.expr.StructExpression:
    """
    Get a Struct of variant_type and n_alt_alleles from an ArrayExpression of strings
    containing all alleles, with the reference allele first (despite the parameter name).
    """
    ref = alt_alleles[0]
    alts = alt_alleles[1:]
    non_star_alleles = hl.filter(lambda a: a != '*', alts)
    return hl.struct(variant_type=hl.cond(
        hl.all(lambda a: hl.is_snp(ref, a), non_star_alleles),
        hl.cond(hl.len(non_star_alleles) > 1, "multi-snv", "snv"),
        hl.cond(
            hl.all(lambda a: hl.is_indel(ref, a), non_star_alleles),
            hl.cond(hl.len(non_star_alleles) > 1, "multi-indel", "indel"),
            "mixed")
    ), n_alt_alleles=hl.len(non_star_alleles))
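
A quick sanity check of this helper (a minimal sketch; it assumes hail is imported as hl and a Hail version where hl.cond is still available):

import hail as hl

# hypothetical alleles array: reference 'A', one SNV and one insertion -> "mixed"
result = hl.eval(add_variant_type(hl.literal(['A', 'T', 'AT'])))
print(result)  # Struct(variant_type='mixed', n_alt_alleles=2)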
Code example #3
File: phenotype_loading.py Project: wlu04/ukb_common
def load_activity_monitor_data(first_exposure_and_activity_monitor_data_path):
    ht = hl.import_table(first_exposure_and_activity_monitor_data_path, delimiter=',', quote='"', missing='', impute=True, key='eid')  #, min_partitions=500)
    quality_fields = ['90015-0.0', '90016-0.0', '90017-0.0']
    qual_ht = ht.select(hq=hl.is_missing(ht['90002-0.0']) & hl.all(lambda x: x == 1, [ht[x] for x in quality_fields]))
    mt = filter_and_annotate_ukb_data(ht, lambda x, v: x.startswith('90') and x.endswith('-0.0') and
                                                       v.dtype in {hl.tint32, hl.tfloat64})
    mt = mt.filter_cols(mt.ValueType == 'Continuous')
    mt = mt.annotate_rows(**qual_ht[mt.row_key])
    mt = mt.annotate_entries(value=hl.or_missing(hl.is_defined(mt.hq), mt.value))
    mt = mt.key_cols_by(trait_type='continuous', phenocode=mt.phenocode, pheno_sex='both_sexes', coding=NULL_STR_KEY, modifier=NULL_STR_KEY)
    return mt
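
The hq flag above is built with hl.all applied to a Python list of expressions; the same pattern in isolation (a minimal sketch with hypothetical field names):

import hail as hl

ht = hl.utils.range_table(3)
ht = ht.annotate(q1=1, q2=1, q3=ht.idx)  # hypothetical quality fields
# hq is True only where every quality field equals 1
ht = ht.annotate(hq=hl.all(lambda x: x == 1, [ht[f] for f in ['q1', 'q2', 'q3']]))
ht.show()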
Code example #4
File: export_pheno.py Project: Nealelab/ukb_common
def main(args):
    hl.init(master=f'local[{args.n_threads}]',
            log=hl.utils.timestamp_path(os.path.join(tempfile.gettempdir(),
                                                     'export_pheno'),
                                        suffix='.log'),
            default_reference='GRCh38')

    sys.path.append('/')
    load_module = importlib.import_module(args.load_module)
    add_args = []
    if args.additional_args is not None:
        add_args = args.additional_args.split(',')
    mt = getattr(load_module, args.load_mt_function)(*add_args)

    mt = mt.filter_cols(
        hl.all(lambda x: x, [
            mt[k] == getattr(args, k, False)
            for k in PHENO_KEY_FIELDS if k != 'pheno_sex'
        ]))
    pheno_sex_mt = mt.filter_cols(mt.pheno_sex == args.pheno_sex)
    if pheno_sex_mt.count_cols() == 1:
        mt = pheno_sex_mt
    else:
        mt = mt.filter_cols(mt.pheno_sex == 'both_sexes')
    mt = mt.select_entries(value=mt[args.pheno_sex])
    if args.binary_trait:
        mt = mt.select_entries(value=hl.int(mt.value))
    if args.proportion_single_sex > 0:
        prop_female = mt.n_cases_females / (mt.n_cases_males +
                                            mt.n_cases_females)
        prop_female = prop_female.collect()[0]
        print(f'Female proportion: {prop_female}')
        if prop_female <= args.proportion_single_sex:
            print(
                f'{prop_female} less than {args.proportion_single_sex}. Filtering to males...'
            )
            mt = mt.filter_rows(mt.sex == 1)
        elif prop_female >= 1 - args.proportion_single_sex:
            print(
                f'{prop_female} greater than {1 - args.proportion_single_sex}. Filtering to females...'
            )
            mt = mt.filter_rows(mt.sex == 0)
    ht = mt.key_cols_by().select_cols().entries()
    ht.export(args.output_file)
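
The filter_cols call above uses hl.all(lambda x: x, ...) to AND-fold a Python list of boolean expressions; the same idiom in isolation (a minimal sketch):

import hail as hl

checks = [hl.literal(True), hl.literal(2) > 1, hl.literal('a') == 'a']
print(hl.eval(hl.all(lambda x: x, checks)))  # True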
Code example #5
def test_sampleqc_old_new_equivalence():
    vds = hl.vds.read_vds(
        os.path.join(resource('vds'), '1kg_chr22_5_samples.vds'))
    sqc = hl.vds.sample_qc(vds)

    dense = hl.vds.to_dense_mt(vds)
    dense = dense.transmute_entries(GT=hl.vds.lgt_to_gt(dense.LGT, dense.LA))
    res = hl.sample_qc(dense)

    res = res.annotate_cols(sample_qc_new=sqc[res.s])

    fields_to_test = [
        'n_het', 'n_hom_var', 'n_non_ref', 'n_singleton', 'n_snp',
        'n_insertion', 'n_deletion', 'n_transition', 'n_transversion',
        'n_star', 'r_ti_tv', 'r_het_hom_var', 'r_insertion_deletion'
    ]

    assert res.aggregate_cols(
        hl.all(*(hl.agg.all(res.sample_qc[field] == res.sample_qc_new[field])
                 for field in fields_to_test)))
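
Note the variadic form used in the final assertion: hl.all also accepts boolean expressions directly, in addition to the (function, collection) form seen in the other examples. A tiny sketch:

import hail as hl

print(hl.eval(hl.all(hl.literal(1) < 2, hl.literal(3) < 4)))  # True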
Code example #6
File: nd.py Project: saponas/hail
def concatenate(nds, axis=0):
    """Join a sequence of arrays along an existing axis.

    Examples
    --------

    >>> x = hl.nd.array([[1., 2.], [3., 4.]])
    >>> y = hl.nd.array([[5.], [6.]])
    >>> hl.eval(hl.nd.concatenate([x, y], axis=1))
    array([[1., 2., 5.],
           [3., 4., 6.]])
    >>> x = hl.nd.array([1., 2.])
    >>> y = hl.nd.array([3., 4.])
    >>> hl.eval(hl.nd.concatenate((x, y), axis=0))
    array([1., 2., 3., 4.])

    Parameters
    ----------
    nds : sequence of array_like
        The arrays must have the same shape, except in the dimension
        corresponding to `axis` (the first, by default).
        Note: unlike NumPy, the numerical element type of each array_like
        must match.
    axis : int, optional
        The axis along which the arrays will be joined. Default is 0.
        Note: unlike NumPy, `axis` cannot be None.

    Returns
    -------
    res : ndarray
        The concatenated array.
    """
    head_nd = nds[0]
    head_ndim = head_nd.ndim
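    # NB: as written, this check expression is constructed but its result is
    # discarded, so the "Mismatched ndim" error never actually fires.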
    hl.case().when(hl.all(lambda a: a.ndim == head_ndim, nds),
                   True).or_error("Mismatched ndim")

    makearr = aarray(nds)
    concat_ir = NDArrayConcat(makearr._ir, axis)

    return construct_expr(concat_ir,
                          tndarray(head_nd._type.element_type, head_ndim))
Code example #7
def annotate_variants(mt):
    '''
    Takes a matrix table and annotates variants with gene, LOF, missense, and synonymous
    annotations by parsing VEP transcript consequences.

    :param mt: matrix table to annotate
    :return: matrix table with new row annotations gene, gene_most_severe_conseq, LOF,
        missense, synonymous, and LOF_flag.
    '''
    try:
        test = hl.is_defined(mt.row.was_split)
    except Exception as e:
        print('Split multi-allelics before running!')
        print(e)
        return

    # If there is no canonical and protein-coding transcript consequence for that variant,
    # give the gene corresponding to the most severe consequence.
    # If there is a canonical and protein-coding transcript consequence for that variant,
    # give the gene symbol associated with that transcript consequence.
    canon_pc = mt.row.vep.transcript_consequences.filter(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'))
    most_severe = mt.row.vep.transcript_consequences.filter(
        lambda x: x.consequence_terms.contains(mt.row.vep.most_severe_consequence))

    mt = mt.annotate_rows(gene=hl.if_else(
        hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
               mt.row.vep.transcript_consequences),
        canon_pc.map(lambda x: x.gene_symbol),
        most_severe.map(lambda x: x.gene_symbol)))

    # The above returns gene symbols for all canonical and protein coding transcripts, not just the one related to the
    # most severe consequence. So we will keep the above, but annotate also the gene corresponding to the most severe
    # consequence as well (useful for synonymous, missense, and LOF annotations)

    canon_pc = mt.row.vep.transcript_consequences.filter(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding')
        & x.consequence_terms.contains(mt.vep.most_severe_consequence))
    most_severe = mt.vep.transcript_consequences.filter(
        lambda x: x.consequence_terms.contains(mt.row.vep.most_severe_consequence))

    mt = mt.annotate_rows(gene_most_severe_conseq=hl.if_else(
        hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
               mt.vep.transcript_consequences),
        canon_pc.map(lambda x: x.gene_symbol),
        most_severe.map(lambda x: x.gene_symbol)))

    # either if there is a canonical and protein coding transcript consequence for that variant,
    # and the lof annotation is not missing and equal to HC, and the lof flag is missing or is blank,
    # or if there isn't a canonical and protein coding transcript consequence for that variant and the
    # transcript consequence with consequence terms containing the most severe consequence term has lof not missing,
    # is equal to HC, and lof flags missing or blank,
    # true, else false

    canon_pc = mt.row.vep.transcript_consequences.filter(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'))
    most_severe = mt.row.vep.transcript_consequences.filter(
        lambda x: x.consequence_terms.contains(mt.row.vep.most_severe_consequence))

    canon_bool = (
        hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
               mt.row.vep.transcript_consequences)
        & hl.any(lambda x: hl.is_defined(x.lof), canon_pc) &
        (canon_pc.map(lambda x: x.lof) == ["HC"]) &
        (hl.all(lambda x: hl.is_missing(x.lof_flags) |
                (x.lof_flags == ""), canon_pc)))

    non_canon_bool = (~(hl.any(
        lambda x: (x.canonical == 1) &
        (x.biotype == 'protein_coding'), mt.row.vep.transcript_consequences))
                      & hl.any(lambda x: hl.is_defined(x.lof), most_severe) &
                      (most_severe.map(lambda x: x.lof) == ["HC"]) & (hl.all(
                          lambda x: hl.is_missing(x.lof_flags) |
                          (x.lof_flags == ""), most_severe)))

    mt = mt.annotate_rows(LOF=hl.if_else(canon_bool
                                         | non_canon_bool, True, False))

    # Either if there is a canonical and protein coding transcript consequence for that variant
    # whose consequence terms contain "missense variant"
    # or if there is not a canonical and protein coding transcript consequence for that variant,
    # but the most severe consequence is "missense variant"
    # or if there is a canonical and protein coding transcript consequence for that variant
    # whose consequence terms contain "inframe deletion"
    # or if there is not a canonical and protein coding transcript consequence for that variant,
    # but the variant's most severe consequence is "inframe deletion"
    # true else false

    canon_pc = mt.row.vep.transcript_consequences.filter(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'))

    canon_missense_bool = canon_pc.map(lambda x: x.consequence_terms).contains(
        ["missense_variant"])
    noncanon_missense_bool = (~(hl.any(
        lambda x: (x.canonical == 1) &
        (x.biotype == 'protein_coding'), mt.row.vep.transcript_consequences)) &
                              (mt.row.vep.most_severe_consequence
                               == "missense_variant"))

    canon_inframe_bool = canon_pc.map(lambda x: x.consequence_terms).contains(
        ["inframe_deletion"])
    noncanon_inframe_bool = (~(hl.any(
        lambda x: (x.canonical == 1) &
        (x.biotype == 'protein_coding'), mt.row.vep.transcript_consequences)) &
                             (mt.row.vep.most_severe_consequence
                              == "inframe_deletion"))

    canon_inframe_ins_bool = canon_pc.map(
        lambda x: x.consequence_terms).contains(["inframe_insertion"])
    noncanon_inframe_ins_bool = (~(hl.any(
        lambda x: (x.canonical == 1) &
        (x.biotype == 'protein_coding'), mt.row.vep.transcript_consequences)) &
                                 (mt.row.vep.most_severe_consequence
                                  == "inframe_insertion"))

    mt = mt.annotate_rows(
        missense=hl.if_else((canon_missense_bool | noncanon_missense_bool
                             | canon_inframe_bool | noncanon_inframe_bool
                             | canon_inframe_ins_bool
                             | noncanon_inframe_ins_bool), True, False))

    # If the most severe consequence is "synonymous_variant", true else false
    mt = mt.annotate_rows(synonymous=hl.if_else(
        mt.row.vep.most_severe_consequence == "synonymous_variant", True,
        False))

    # When there is a transcript consequence for that variant that is canonical,
    # protein coding, and lof = "HC", its lof flags
    # When there is not a transcript consequence for that variant that is canonical and protein coding,
    # but there is a transcript consequence whose consequence terms contains the most severe consequence
    # and its lof == HC, its lof flags
    # else blank

    canon_bool = hl.any(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
        mt.row.vep.transcript_consequences)
    canon_hc_bool = hl.any(
        lambda x:
        (x.canonical == 1) & (x.biotype == 'protein_coding') & (x.lof == 'HC'),
        mt.row.vep.transcript_consequences)
    canon_pc_hc = mt.row.vep.transcript_consequences.filter(lambda x: (
        x.canonical == 1) & (x.biotype == 'protein_coding') & (x.lof == "HC"))
    most_severe_bool = hl.any(
        lambda x:
        (x.consequence_terms.contains(mt.row.vep.most_severe_consequence)) &
        (x.lof == 'HC'), mt.row.vep.transcript_consequences)
    most_severe_hc = mt.row.vep.transcript_consequences.filter(lambda x: (
        x.consequence_terms.contains(mt.row.vep.most_severe_consequence)) &
                                                               (x.lof == "HC"))

    mt = mt.annotate_rows(LOF_flag=hl.case().when(
        canon_hc_bool, canon_pc_hc.map(lambda x: x.lof_flags)).when(
            ~canon_bool & most_severe_bool,
            most_severe_hc.map(lambda x: x.lof_flags)).default([""]))

    return mt
Code example #8
File: methods.py Project: chrisvittal/hail
def impute_sex_chromosome_ploidy(vds: VariantDataset, calling_intervals,
                                 normalization_contig: str) -> hl.Table:
    """Impute sex chromosome ploidy from depth of reference data within calling intervals.

    Returns a :class:`.Table` with sample ID keys, with the following fields:

     -  ``autosomal_mean_dp`` (*float64*): Mean depth on calling intervals on normalization contig.
     -  ``x_mean_dp`` (*float64*): Mean depth on calling intervals on X chromosome.
     -  ``x_ploidy`` (*float64*): Estimated ploidy on X chromosome. Equal to ``2 * x_mean_dp / autosomal_mean_dp``.
     -  ``y_mean_dp`` (*float64*): Mean depth on calling intervals on Y chromosome.
     -  ``y_ploidy`` (*float64*): Estimated ploidy on Y chromosome. Equal to ``2 * y_mean_dp / autosomal_mean_dp``.

    Parameters
    ----------
    vds : :class:`.VariantDataset`
        Dataset.
    calling_intervals : :class:`.Table` or :class:`.ArrayExpression`
        Calling intervals with consistent read coverage (for exomes, trim the capture intervals).
    normalization_contig : str
        Autosomal contig for depth comparison.

    Returns
    -------
    :class:`.Table`
    """

    if not isinstance(calling_intervals, Table):
        calling_intervals = hl.Table.parallelize(
            hl.map(lambda i: hl.struct(interval=i), calling_intervals),
            schema=hl.tstruct(interval=calling_intervals.dtype.element_type),
            key='interval')
    else:
        key_dtype = calling_intervals.key.dtype
        if (len(key_dtype) != 1
                or not isinstance(calling_intervals.key[0].dtype, hl.tinterval)
                or calling_intervals.key[0].dtype.point_type != vds.reference_data.locus.dtype):
            raise ValueError(
                f"'impute_sex_chromosome_ploidy': expect calling_intervals to be list of intervals or"
                f" table with single key of type interval<locus>, found table with key: {key_dtype}"
            )

    rg = vds.reference_data.locus.dtype.reference_genome

    par_boundaries = []
    for par_interval in rg.par:
        par_boundaries.append(par_interval.start)
        par_boundaries.append(par_interval.end)

    # segment on PAR interval boundaries
    calling_intervals = hl.segment_intervals(calling_intervals, par_boundaries)

    # remove intervals overlapping PAR
    calling_intervals = calling_intervals.filter(
        hl.all(lambda x: ~x.overlaps(calling_intervals.interval),
               hl.literal(rg.par)))

    # checkpoint for efficient multiple downstream usages
    info("'impute_sex_chromosome_ploidy': checkpointing calling intervals")
    calling_intervals = calling_intervals.checkpoint(
        new_temp_file(extension='ht'))

    interval = calling_intervals.key[0]
    (any_bad_intervals, chrs_represented) = calling_intervals.aggregate(
        (hl.agg.any(interval.start.contig != interval.end.contig),
         hl.agg.collect_as_set(interval.start.contig)))
    if any_bad_intervals:
        raise ValueError(
            "'impute_sex_chromosome_ploidy' does not support calling intervals that span chromosome boundaries"
        )

    if len(rg.x_contigs) != 1:
        raise NotImplementedError(
            f"reference genome {rg.name!r} has multiple X contigs, this is not supported in 'impute_sex_chromosome_ploidy'"
        )
    chr_x = rg.x_contigs[0]
    if len(rg.y_contigs) != 1:
        raise NotImplementedError(
            f"reference genome {rg.name!r} has multiple Y contigs, this is not supported in 'impute_sex_chromosome_ploidy'"
        )
    chr_y = rg.y_contigs[0]

    kept_contig_filter = hl.array(chrs_represented).map(
        lambda x: hl.parse_locus_interval(x, reference_genome=rg))
    vds = VariantDataset(
        hl.filter_intervals(vds.reference_data, kept_contig_filter),
        hl.filter_intervals(vds.variant_data, kept_contig_filter))

    coverage = interval_coverage(vds, calling_intervals,
                                 gq_thresholds=()).drop('gq_thresholds')

    coverage = coverage.annotate_rows(contig=coverage.interval.start.contig)
    coverage = coverage.annotate_cols(__mean_dp=hl.agg.group_by(
        coverage.contig,
        hl.agg.sum(coverage.sum_dp) / hl.agg.sum(coverage.interval_size)))

    mean_dp_dict = coverage.__mean_dp
    auto_dp = mean_dp_dict.get(normalization_contig)
    x_dp = mean_dp_dict.get(chr_x)
    y_dp = mean_dp_dict.get(chr_y)
    per_sample = coverage.transmute_cols(autosomal_mean_dp=auto_dp,
                                         x_mean_dp=x_dp,
                                         x_ploidy=2 * x_dp / auto_dp,
                                         y_mean_dp=y_dp,
                                         y_ploidy=2 * y_dp / auto_dp)
    info(
        "'impute_sex_chromosome_ploidy': computing and checkpointing coverage and karyotype metrics"
    )
    return per_sample.cols().checkpoint(
        new_temp_file('impute_sex_karyotype', extension='ht'))
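
A hypothetical invocation of this method (bucket paths and the normalization contig are placeholders):

import hail as hl

vds = hl.vds.read_vds('gs://my-bucket/my_dataset.vds')
intervals = hl.import_locus_intervals('gs://my-bucket/calling_intervals.interval_list',
                                      reference_genome='GRCh38')
ploidy_ht = impute_sex_chromosome_ploidy(vds, intervals, normalization_contig='chr20')
ploidy_ht.show()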