def compressed_variant_id(locus: hl.expr.LocusExpression, alleles: hl.expr.ArrayExpression) -> hl.expr.StringExpression:
    return hl.rbind(
        hl.len(alleles[0]),
        hl.len(alleles[1]),
        lambda ref_len, alt_len: hl.case()
        .when(
            ref_len > alt_len,
            normalized_contig(locus.contig)
            + "-"
            + hl.str(locus.position)
            + "d"
            + hl.str(ref_len - alt_len)
            + "-"
            + alleles[1],
        )
        .when(
            ref_len < alt_len,
            normalized_contig(locus.contig)
            + "-"
            + hl.str(locus.position)
            + "i"
            + hl.str(alt_len - ref_len)
            + "-"
            + _encode_allele(alleles[1]),
        )
        .default(variant_id(locus, alleles)),
    )
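For reference, a minimal usage sketch, assuming a variant-keyed Table `ht` and that `normalized_contig`, `_encode_allele`, and `variant_id` are defined alongside this helper (the sample variant below is hypothetical):

import hail as hl

# A 2 bp deletion at 1:1000 (ref "ATT", alt "A") renders as "1-1000d2-A".
ht = ht.annotate(cvid=compressed_variant_id(ht.locus, ht.alleles))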
Example #2
    def test_import_bgen_variant_filtering(self):
        desired_variant_indexes = [1, 2, 3, 5, 7, 9, 11, 13, 17, 198]
        actual = hl.import_bgen(resource('example.8bits.bgen'), ['GT'],
                                contig_recoding={'01': '1'},
                                reference_genome=None,
                                n_partitions=10,
                                _row_fields=['file_row_idx'],
                                _variants_per_file={
                                    resource('example.8bits.bgen'):
                                    desired_variant_indexes
                                })
        # doing the expected import_bgen second catches the case where the
        # hadoop configuration is polluted with old data from the
        # _variants_per_file
        everything = hl.import_bgen(resource('example.8bits.bgen'), ['GT'],
                                    contig_recoding={'01': '1'},
                                    reference_genome=None,
                                    _row_fields=['file_row_idx'])
        self.assertEqual(everything.count(), (199, 500))

        expected = everything.filter_rows(
            hl.set(desired_variant_indexes).contains(
                hl.int32(everything.file_row_idx)))

        self.assertTrue(expected._same(actual))
        self.assertEqual(
            (hl.str(actual.locus.contig) + ":" +
             hl.str(actual.locus.position)).collect(), [
                 '1:3000', '1:4000', '1:5000', '1:7000', '1:9000', '1:11000',
                 '1:13000', '1:15000', '1:19000', '1:100001'
             ])
Example #3
def add_coding_information(
        mt: hl.MatrixTable,
        coding_ht: hl.Table,
        phesant_phenotype_info_path: str,
        download_missing_codings: bool = False) -> hl.MatrixTable:
    """
    Add coding information from coding_ht as column annotations into mt

    :param MatrixTable mt: Input MT
    :param Table coding_ht: HT with coding information
    :param str phesant_phenotype_info_path: PHESANT phenotype metadata path
    :param bool download_missing_codings: Whether to download missing coding data
    :return: MT with coding information in column data
    :rtype: MatrixTable
    """
    mt = mt.annotate_cols(**coding_ht[(mt.coding_id, hl.str(mt.coding))])
    if download_missing_codings:
        get_missing_codings(mt.cols())
    phesant_summary = hl.import_table(phesant_phenotype_info_path,
                                      impute=True,
                                      missing='',
                                      key='FieldID')
    phesant_reassign = get_phesant_reassignments(phesant_summary)
    mt = mt.annotate_cols(recoding=hl.or_missing(
        hl.is_missing(mt.meaning), phesant_reassign[mt.col_key.select(
            'phenocode', 'coding')].reassign_from))
    return mt.annotate_cols(
        **hl.cond(hl.is_defined(mt.meaning),
                  hl.struct(**{x: mt[x]
                               for x in list(coding_ht.row_value)}),
                  coding_ht[(mt.coding_id, hl.str(mt.recoding))]))
Example #4
def load_cmg(cmg_csv: str) -> hl.Table:
    cmg_ht = hl.import_table(cmg_csv, impute=True, delimiter=",", quote='"')

    cmg_ht = cmg_ht.transmute(
        locus1_b38=hl.locus("chr" + hl.str(cmg_ht.chrom_1), cmg_ht.pos_1, reference_genome='GRCh38'),
        alleles1_b38=[cmg_ht.ref_1, cmg_ht.alt_1],
        locus2_b38=hl.locus("chr" + hl.str(cmg_ht.chrom_2), cmg_ht.pos_2, reference_genome='GRCh38'),
        alleles2_b38=[cmg_ht.ref_2, cmg_ht.alt_2]
    )

    liftover_references = get_liftover_genome(cmg_ht.rename({'locus1_b38': 'locus'}))
    lifted_over_variants = hl.sorted(
        hl.array([
            liftover_expr(cmg_ht.locus1_b38, cmg_ht.alleles1_b38, liftover_references[1]),
            liftover_expr(cmg_ht.locus2_b38, cmg_ht.alleles2_b38, liftover_references[1])
        ]),
        lambda x: x.locus
    )

    cmg_ht = cmg_ht.key_by(
        locus1=lifted_over_variants[0].locus,
        alleles1=lifted_over_variants[0].alleles,
        locus2=lifted_over_variants[1].locus,
        alleles2=lifted_over_variants[1].alleles
    )

    return cmg_ht.annotate(
        bad_liftover=(
                hl.is_missing(cmg_ht.locus1) |
                hl.is_missing(cmg_ht.locus2) |
                (cmg_ht.locus1.sequence_context() != cmg_ht.alleles1[0][0]) |
                (cmg_ht.locus2.sequence_context() != cmg_ht.alleles2[0][0])
        )
    )
Example #5
def intersect_target_ref(ref_mt_filt,
                         snp_list,
                         grch37_or_grch38,
                         intersect_out,
                         overwrite: bool = False):
    mt = hl.read_matrix_table(ref_mt_filt)
    if grch37_or_grch38.lower() == 'grch38':
        snp_list = snp_list.key_by(locus=hl.locus(hl.str(snp_list.chr),
                                                  hl.int(snp_list.pos),
                                                  reference_genome='GRCh38'),
                                   alleles=[snp_list.ref, snp_list.alt])
        mt = mt.filter_rows(hl.is_defined(snp_list[mt.row_key]))

    elif grch37_or_grch38.lower() == 'grch37':
        snp_list = snp_list.key_by(locus=hl.locus(hl.str(snp_list.chr),
                                                  hl.int(snp_list.pos),
                                                  reference_genome='GRCh37'),
                                   alleles=[snp_list.ref, snp_list.alt])
        # liftover snp list to GRCh38, filter to SNPs in mt
        rg37, rg38 = load_liftover()

        snp_liftover = snp_list.annotate(
            new_locus=hl.liftover(snp_list.locus, 'GRCh38'))
        snp_liftover = snp_liftover.filter(
            hl.is_defined(snp_liftover.new_locus))
        snp_liftover = snp_liftover.key_by(locus=snp_liftover.new_locus,
                                           alleles=snp_liftover.alleles)
        mt = mt.filter_rows(hl.is_defined(snp_liftover[mt.row_key]))

    mt = mt.repartition(5000)
    mt = mt.checkpoint(intersect_out,
                       overwrite=overwrite,
                       _read_if_exists=not overwrite)
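A hypothetical invocation, assuming `snp_ht` is a Table with `chr`, `pos`, `ref`, and `alt` fields; the paths are placeholders:

# Lift a GRCh37 SNP list over to GRCh38 and intersect it with the reference MT.
intersect_target_ref('gs://bucket/ref.mt', snp_ht, 'GRCh37',
                     'gs://bucket/intersected.mt', overwrite=True)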
Example #6
def require_biallelic(dataset, method) -> MatrixTable:
    require_row_key_variant(dataset, method)
    return dataset._select_rows(
        method,
        hl.case().when(dataset.alleles.length() == 2, dataset._rvrow).or_error(
            f"'{method}' expects biallelic variants ('alleles' field of length 2), found "
            + hl.str(dataset.locus) + ", " + hl.str(dataset.alleles)))
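The `hl.case().when(...).or_error(...)` pattern used here aborts the pipeline with a per-row message when no branch matches. A minimal, self-contained sketch (table and field names are hypothetical):

import hail as hl

ht = hl.utils.range_table(3)
ht = ht.annotate(
    label=hl.case()
    .when(ht.idx < 10, 'small-' + hl.str(ht.idx))
    .or_error('unexpected idx: ' + hl.str(ht.idx)))
ht.show()  # every row satisfies the condition, so no error is raised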
Example #7
def annotate_variant_id(
        t: Union[hl.Table, hl.MatrixTable],
        field_name: str = 'vid') -> Union[hl.Table, hl.MatrixTable]:
    """
    Expected input dataset with bi-allelic variant, and fields `locus` and `alleles`.
    Annotate variant ids as follow 'chr:position:ref:alt'.

    :param field_name: variant id field name
    :param t: dataset
    :return: HailTable or MatrixTable
    """

    variant_id_ann_exp = {
        field_name:
        hl.delimit([
            hl.str(t.locus.contig),
            hl.str(t.locus.position),
            hl.str(t.alleles[0]),
            hl.str(t.alleles[1])
        ],
                   delimiter=":")
    }

    if isinstance(t, hl.Table):
        return t.annotate(**variant_id_ann_exp)
    else:
        return t.annotate_rows(**variant_id_ann_exp)
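A short usage sketch, assuming a variant-keyed MatrixTable `mt` (the example values are hypothetical):

# A row with locus 1:1000 and alleles ['A', 'T'] gets vid == "1:1000:A:T".
mt = annotate_variant_id(mt, field_name='vid')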
Example #8
def require_biallelic(dataset, method) -> MatrixTable:
    require_row_key_variant(dataset, method)
    return dataset._select_rows(method,
                                hl.case()
                                .when(dataset.alleles.length() == 2, dataset._rvrow)
                                .or_error(f"'{method}' expects biallelic variants ('alleles' field of length 2), found " +
                                        hl.str(dataset.locus) + ", " + hl.str(dataset.alleles)))
Example #9
def get_lgt(e, n_alleles, has_non_ref, row):
    index = e.GT.unphased_diploid_gt_index()
    n_no_nonref = n_alleles - hl.int(has_non_ref)
    triangle_without_nonref = hl.triangle(n_no_nonref)
    return (hl.case().when(index < triangle_without_nonref, e.GT).when(
        index < hl.triangle(n_alleles),
        hl.null('call')).or_error('invalid GT ' + hl.str(e.GT) +
                                  ' at site ' + hl.str(row.locus)))
Example #10
def parse_first_occurrence(x):
    # `pseudo_dates`, `month`, and `dob` are taken from the enclosing scope.
    return (hl.case(missing_false=True)
        .when(hl.is_defined(hl.parse_float(x)), hl.float64(x))  # Source of the first code ...
        .when(hl.literal(pseudo_dates).contains(hl.str(x)), hl.null(hl.tfloat64))  # Set past and future dates to missing
        .when(hl.str(x) == '1902-02-02', 0.0)  # Matches DOB
        .when(hl.str(x) == '1903-03-03',  # Within year of birth (taking midpoint between month of birth and EOY)
              (hl.experimental.strptime('1970-12-31 00:00:00', '%Y-%m-%d %H:%M:%S', 'GMT') -
               hl.experimental.strptime('1970-' + month + '-15 00:00:00', '%Y-%m-%d %H:%M:%S',
                                        'GMT')) / 2)
        .default(hl.experimental.strptime(hl.str(x) + ' 00:00:00', '%Y-%m-%d %H:%M:%S', 'GMT') - dob
    ))
Example #11
def require_biallelic(dataset,
                      method,
                      tolerate_generic_locus: bool = False) -> MatrixTable:
    if tolerate_generic_locus:
        require_row_key_variant_w_struct_locus(dataset, method)
    else:
        require_row_key_variant(dataset, method)
    return dataset._select_rows(
        method,
        hl.case().when(dataset.alleles.length() == 2, dataset._rvrow).or_error(
            f"'{method}' expects biallelic variants ('alleles' field of length 2), found "
            + hl.str(dataset.locus) + ", " + hl.str(dataset.alleles)))
Example #12
    def test(self):
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
                            f=hl.tarray(hl.tint32),
                            g=hl.tarray(
                                hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
                            h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
                            i=hl.tbool,
                            j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr))

        rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5,
                 'e': "hello", 'f': [1, 2, 3],
                 'g': [hl.Struct(x=1, y=5, z='banana')],
                 'h': hl.Struct(a=5, b=3, c='winter'),
                 'i': True,
                 'j': hl.Struct(x=3, y=2, z='summer')}]

        kt = hl.Table.parallelize(rows, schema)

        result = convert_struct_to_dict(kt.annotate(
            chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
            ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
            dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
            dpois=hl.dpois(4, kt.a),
            drop=kt.h.drop('b', 'c'),
            exp=hl.exp(kt.c),
            fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
            hwe=hl.hardy_weinberg_p(1, 2, 1),
            index=hl.index(kt.g, 'z'),
            is_defined=hl.is_defined(kt.i),
            is_missing=hl.is_missing(kt.i),
            is_nan=hl.is_nan(hl.float64(kt.a)),
            json=hl.json(kt.g),
            log=hl.log(kt.a, kt.b),
            log10=hl.log10(kt.c),
            or_else=hl.or_else(kt.a, 5),
            or_missing=hl.or_missing(kt.i, kt.j),
            pchisqtail=hl.pchisqtail(kt.a, kt.b),
            pcoin=hl.rand_bool(0.5),
            pnorm=hl.pnorm(0.2),
            pow=2.0 ** kt.b,
            ppois=hl.ppois(kt.a, kt.b),
            qchisqtail=hl.qchisqtail(kt.a, kt.b),
            range=hl.range(0, 5, kt.b),
            rnorm=hl.rand_norm(0.0, kt.b),
            rpois=hl.rand_pois(kt.a),
            runif=hl.rand_unif(kt.b, kt.a),
            select=kt.h.select('c', 'b'),
            sqrt=hl.sqrt(kt.a),
            to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
            where=hl.cond(kt.i, 5, 10)
        ).take(1)[0])
Example #13
def to_plink(pops: list,
             subsets_dir,
             mt,
             ht_sample,
             bfile_path,
             export_varid: bool = True,
             overwrite=False):
    r'''
    Exports a MatrixTable to PLINK2 files.
    NOTE: These files will need to be split up by chromosome before
    plink_clump.py can be run.
    '''
    assert 'GT' in mt.entry, "mt must have 'GT' as an entry field"
    assert mt.GT.dtype == hl.tcall, "entry field 'GT' must be of type `Call`"

    if not overwrite and all([
            hl.hadoop_exists(f'{bfile_path}.{suffix}')
            for suffix in ['bed', 'bim']
    ]):
        print(f'\nPLINK .bed and .bim files already exist for {bfile_path}')
        print(bfile_path)
    else:
        print(f'Saving to bfile prefix {bfile_path}')
        mt_sample = mt.annotate_rows(varid=hl.str(mt.locus) + ':' +
                                     mt.alleles[0] + ':' + mt.alleles[1])
        mt_sample = mt_sample.filter_cols(hl.is_defined(
            ht_sample[mt_sample.s]))
        hl.export_plink(dataset=mt_sample,
                        output=bfile_path,
                        ind_id=mt_sample.s,
                        varid=mt_sample.varid)  # varid used to be rsid
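A hypothetical call, assuming `mt` carries a 'GT' entry field and `ht_sample` is keyed by the sample ID `s`; the paths are placeholders:

to_plink(pops=['EUR'], subsets_dir='/tmp/subsets', mt=mt,
         ht_sample=ht_sample, bfile_path='/tmp/subsets/EUR', overwrite=True)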
Example #14
def setupAnnotationDBTests(cls):
    startTestHailContext()
    t = hl.utils.range_table(10)
    t = t.annotate(locus=hl.locus('1', t.idx + 1))
    t = t.annotate(annotation=hl.str(t.idx))
    d = tempfile.TemporaryDirectory()
    fname = d.name + '/f.mt'
    t.write(fname)
    cls.temp_dir = d
    cls.db_json = {
        'unique_dataset': {
            'description': 'now with unique rows!',
            'url': 'https://example.com',
            'key_properties': ['unique'],
            'versions': [{
                'url': fname,
                'version': 'v1-GRCh37'
            }]
        },
        'nonunique_dataset': {
            'description': 'non-unique rows :(',
            'url': 'https://example.net',
            'key_properties': [],
            'versions': [{
                'url': fname,
                'version': 'v1-GRCh37'
            }]
        }
    }
Example #15
def specific_clumps(filename):
    clump = hl.import_table(filename,
                            delimiter=r'\s+',
                            min_partitions=10,
                            types={'P': hl.tfloat})
    clump = clump.key_by(locus=hl.locus(hl.str(clump.CHR), hl.int(clump.BP)))
    return clump
Example #16
def create_gene_map_ht(ht, check_gene_contigs=False):
    from gnomad.utils.vep import process_consequences

    ht = process_consequences(ht)
    ht = ht.explode(ht.vep.worst_csq_by_gene_canonical)
    ht = ht.annotate(
        variant_id=ht.locus.contig + ':' + hl.str(ht.locus.position) + '_' +
        ht.alleles[0] + '/' + ht.alleles[1],
        annotation=annotation_case_builder(ht.vep.worst_csq_by_gene_canonical))
    if check_gene_contigs:
        gene_contigs = ht.group_by(
            gene_id=ht.vep.worst_csq_by_gene_canonical.gene_id,
            gene_symbol=ht.vep.worst_csq_by_gene_canonical.gene_symbol,
        ).aggregate(contigs=hl.agg.collect_as_set(ht.locus.contig))
        assert gene_contigs.all(hl.len(gene_contigs.contigs) == 1)

    gene_map_ht = ht.group_by(
        gene_id=ht.vep.worst_csq_by_gene_canonical.gene_id,
        gene_symbol=ht.vep.worst_csq_by_gene_canonical.gene_symbol,
    ).partition_hint(100).aggregate(
        interval=hl.interval(start=hl.locus(
            hl.agg.take(ht.locus.contig, 1)[0], hl.agg.min(ht.locus.position)),
                             end=hl.locus(
                                 hl.agg.take(ht.locus.contig, 1)[0],
                                 hl.agg.max(ht.locus.position))),
        variants=hl.agg.group_by(ht.annotation, hl.agg.collect(ht.variant_id)),
    )
    return gene_map_ht
Example #17
    def test_make_table_row_equivalence(self):
        mt = hl.utils.range_matrix_table(3, 3)
        mt = mt.annotate_rows(r1 = hl.rand_norm(), r2 = hl.rand_norm())
        mt = mt.annotate_entries(e1 = hl.rand_norm(), e2 = hl.rand_norm())
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        assert mt.make_table().select(*mt.row_value)._same(mt.rows())
Example #18
def table_aggregate_downsample_dense():
    ht = hl.read_table(resource('many_ints_table.ht'))
    ht.aggregate(
        tuple([
            hl.agg.downsample(ht[f'i{i}'], ht['i3'], label=hl.str(ht['i4']))
            for i in range(3)
        ]))
Example #19
    def test_pcrelate(self):
        dataset = hl.balding_nichols_model(3, 100, 100)
        dataset = dataset.annotate_cols(sample_idx = hl.str(dataset.sample_idx))
        t = hl.pc_relate(dataset, 2, 0.05, block_size=64, statistics="phi")

        self.assertTrue(isinstance(t, hl.Table))
        t.count()
Example #20
    def test_make_table_row_equivalence(self):
        mt = hl.utils.range_matrix_table(3, 3)
        mt = mt.annotate_rows(r1=hl.rand_norm(), r2=hl.rand_norm())
        mt = mt.annotate_entries(e1=hl.rand_norm(), e2=hl.rand_norm())
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        assert mt.make_table().select(*mt.row_value)._same(mt.rows())
Example #21
    def test_make_table(self):
        mt = hl.utils.range_matrix_table(3, 2)
        mt = mt.select_entries(x=mt.row_idx * mt.col_idx)
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        t = hl.Table.parallelize(
            [{
                'row_idx': 0,
                '0.x': 0,
                '1.x': 0
            }, {
                'row_idx': 1,
                '0.x': 0,
                '1.x': 1
            }, {
                'row_idx': 2,
                '0.x': 0,
                '1.x': 2
            }],
            hl.tstruct(**{
                'row_idx': hl.tint32,
                '0.x': hl.tint32,
                '1.x': hl.tint32
            }),
            key='row_idx')

        self.assertTrue(mt.make_table()._same(t))
Example #22
def table_aggregate_downsample_dense(ht_path):
    ht = hl.read_table(ht_path)
    ht.aggregate(
        tuple([
            hl.agg.downsample(ht[f'i{i}'], ht['i3'], label=hl.str(ht['i4']))
            for i in range(3)
        ]))
Example #23
def import_key(ss_filename, ss_keys, clump_name):
    keys = ss_keys.split(',')
    ss = hl.import_table(ss_filename,
                         impute=True,
                         delimiter=r'\s+',
                         types={
                             keys[1]: hl.tfloat,
                             keys[0]: hl.tstr
                         },
                         min_partitions=100)
    clump = hl.import_table(clump_name,
                            delimiter=r'\s+',
                            min_partitions=10,
                            types={
                                'P': hl.tfloat,
                                'CHR': hl.tstr,
                                'BP': hl.tint
                            })
    clump = clump.key_by(locus=hl.locus(clump.CHR, clump.BP))
    clump = clump.filter(clump.P < 5e-8)
    ss = ss.annotate(**{keys[1]: hl.int(ss[keys[1]])})
    chroms = set(map(str, range(1, 23)))
    ss = ss.filter(hl.literal(chroms).contains(ss[keys[0]]))
    ss = ss.annotate(locus=hl.locus(hl.str(ss[keys[0]]), ss[keys[1]]),
                     alleles=[ss[keys[2]], ss[keys[3]]])
    ss = ss.key_by(ss.locus)
    ss = ss.annotate(clump=hl.is_defined(clump[ss.key]))
    ss = ss.key_by(ss.locus, ss.alleles)
    p = keys[-1]
    return ss, p
Example #24
def specific_clumps(filename):
    clump = hl.import_table(filename, delimiter=r'\s+', min_partitions=10, types={'P': hl.tfloat})
    clump_dict = clump.aggregate(hl.dict(hl.agg.collect(
        (hl.locus(hl.str(clump.CHR), hl.int(clump.BP)),
         True)
    )), _localize=False)
    return clump_dict
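A hypothetical follow-up, assuming a locus-keyed summary-stats Table `ss`; because of `_localize=False`, `clump_dict` remains a Hail DictExpression that can be used inside annotations:

clump_dict = specific_clumps('clumps.txt')  # placeholder path
ss = ss.annotate(in_clump=hl.coalesce(clump_dict.get(ss.locus), False))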
Example #25
def annotate_phen(tb, phen, sex, phen_tb_dict, filter_to_phen=True):
    r'''
    Annotates `tb` with phenotype `phen` and filters to individuals with the
    phenotype defined. Uses sex-specific IRNT phenotypes.
    sex options: female, male, both_sexes
    '''
    # `phen_dict` is assumed to be a module-level mapping of phenotype codes
    # to (description, ...) tuples.
    print(
        f'\n... Reading UKB phenotype "{phen_dict[phen][0]}" for {sex} (code: {phen}) ...'
    )

    phen_tb0 = phen_tb_dict[sex]
    phen_tb = phen_tb0.select(phen).rename({phen: 'phen'})

    if isinstance(tb, hl.table.Table):
        annotate_fn = hl.Table.annotate
        filter_fn = hl.Table.filter
    elif isinstance(tb, hl.matrixtable.MatrixTable):
        annotate_fn = hl.MatrixTable.annotate_cols
        filter_fn = hl.MatrixTable.filter_cols
    else:
        raise TypeError('`tb` must be a Table or MatrixTable')

    tb0 = annotate_fn(self=tb,
                      phen_str=hl.str(phen_tb[tb.s]['phen']).replace('\"', ''))

    if filter_to_phen:  # filter to individuals with phenotype data defined
        tb1 = filter_fn(self=tb0, expr=tb0.phen_str == '', keep=False)
    else:
        tb1 = tb0

    if phen_tb.phen.dtype == hl.dtype('bool'):
        tb2 = annotate_fn(self=tb1,
                          phen=hl.bool(tb1.phen_str)).drop('phen_str')
    else:
        tb2 = annotate_fn(self=tb1,
                          phen=hl.float64(tb1.phen_str)).drop('phen_str')

    return tb2
Example #26
def assign_platform_from_pcs(
    platform_pca_scores_ht: hl.Table,
    pc_scores_ann: str = "scores",
    hdbscan_min_cluster_size: Optional[int] = None,
    hdbscan_min_samples: Optional[int] = None,
) -> hl.Table:
    """
    Assigns platforms using HDBSCAN on the results of call rate PCA.
    :param platform_pca_scores_ht: Input table with the PCA score for each sample
    :param pc_scores_ann: Field containing the scores
    :param hdbscan_min_cluster_size: HDBSCAN `min_cluster_size` parameter. If not specified the smallest of 500 and 0.1*n_samples will be used.
    :param hdbscan_min_samples: HDBSCAN `min_samples` parameter
    :return: A Table with a `qc_platform` annotation containing the platform based on HDBSCAN clustering
    """

    logger.info("Assigning platforms based on platform PCA clustering")

    # Read and format data for clustering
    data = platform_pca_scores_ht.to_pandas()
    callrate_data = np.matrix(data[pc_scores_ann].tolist())
    logger.info("Assigning platforms to {} samples.".format(
        len(callrate_data)))

    # Cluster data
    if hdbscan_min_cluster_size is None:
        # HDBSCAN expects an integer min_cluster_size
        hdbscan_min_cluster_size = min(500, int(0.1 * data.shape[0]))
    clusterer = hdbscan.HDBSCAN(min_cluster_size=hdbscan_min_cluster_size,
                                min_samples=hdbscan_min_samples)
    cluster_labels = clusterer.fit_predict(callrate_data)
    n_clusters = len(set(cluster_labels)) - (
        -1 in cluster_labels
    )  # NOTE: -1 is the label for noisy (un-classifiable) data points
    logger.info("Found {} unique platforms during platform imputation.".format(
        n_clusters))

    data["qc_platform"] = cluster_labels

    # NOTE: write the pandas DataFrame to disk and re-import it as a Hail Table.
    # This is a temporary workaround until Hail's issue with 'hl.Table.from_pandas'
    # and mismatched Python versions between driver/executors is resolved.
    # `local_dir` and `nfs_dir` are module-level paths assumed to point at the
    # same mount.
    (data.drop(axis=1, labels=pc_scores_ann).to_csv(
        f'{local_dir}/tmp/data_tmp_hdbscan.tsv', index=False, sep='\t'))
    ht_tmp = (hl.import_table(f'{nfs_dir}/tmp/data_tmp_hdbscan.tsv',
                              impute=True).key_by(*platform_pca_scores_ht.key))

    ht = platform_pca_scores_ht.join(ht_tmp)

    # original/elegant solution (TODO: sort issue with 'from_pandas' function)
    # ht = hl.Table.from_pandas(data, key=[*platform_pca_scores_ht.key])

    # Expand the array structure and annotate the scores (PCs) as individual
    # fields; drop the array scores field before exporting the results.
    n_pcs = len(ht[pc_scores_ann].take(1)[0])
    ht = (ht.annotate(
        **{f'platform_PC{i + 1}': ht[pc_scores_ann][i]
           for i in range(n_pcs)}).drop(pc_scores_ann))

    ht = ht.annotate(qc_platform="platform_" + hl.str(ht.qc_platform))
    return ht
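A hypothetical invocation, assuming `scores_ht` is a sample-keyed Table with an array `scores` field from a call-rate PCA:

platform_ht = assign_platform_from_pcs(scores_ht, pc_scores_ann='scores',
                                       hdbscan_min_cluster_size=100)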
Example #27
def default_compute_info(mt: hl.MatrixTable,
                         site_annotations: bool = False,
                         n_partitions: int = 5000) -> hl.Table:
    """
    Computes a HT with the typical GATK allele-specific (AS) info fields 
    as well as ACs and lowqual fields.
    Note that this table doesn't split multi-allelic sites.

    :param mt: Input MatrixTable. Note that this table should be filtered to nonref sites.
    :param site_annotations: Whether to also generate site level info fields. Default is False.
    :param n_partitions: Number of desired partitions for output Table. Default is 5000.
    :return: Table with info fields
    :rtype: Table
    """
    # Move gvcf info entries out from nested struct
    mt = mt.transmute_entries(**mt.gvcf_info)

    # Compute AS info expr
    info_expr = get_as_info_expr(mt)

    if site_annotations:
        info_expr = info_expr.annotate(**get_site_info_expr(mt))

    # Add AC and AC_raw:
    # First compute ACs for each non-ref allele, grouped by adj
    grp_ac_expr = hl.agg.array_agg(
        lambda ai: hl.agg.filter(
            mt.LA.contains(ai),
            hl.agg.group_by(
                get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD),
                hl.agg.sum(
                    mt.LGT.one_hot_alleles(mt.LA.map(lambda x: hl.str(x)))[
                        mt.LA.index(ai)]),
            ),
        ),
        hl.range(1, hl.len(mt.alleles)),
    )

    # Then, for each non-ref allele, compute
    # AC as the adj group
    # AC_raw as the sum of adj and non-adj groups
    info_expr = info_expr.annotate(
        AC_raw=grp_ac_expr.map(
            lambda i: hl.int32(i.get(True, 0) + i.get(False, 0))),
        AC=grp_ac_expr.map(lambda i: hl.int32(i.get(True, 0))),
    )

    info_ht = mt.select_rows(info=info_expr).rows()

    # Add AS lowqual flag
    info_ht = info_ht.annotate(AS_lowqual=get_lowqual_expr(
        info_ht.alleles, info_ht.info.AS_QUALapprox))

    if site_annotations:
        # Add lowqual flag
        info_ht = info_ht.annotate(
            lowqual=get_lowqual_expr(info_ht.alleles, info_ht.info.QUALapprox))

    return info_ht.naive_coalesce(n_partitions)
Example #28
def get_omim():
    ht = hl.import_table("s3://seqr-resources/omim/genemap2.txt",
                         delimiter='|')
    ht = ht.annotate(colname=hl.str("omim"))
    ht = ht.to_matrix_table('Ensembl Gene ID', "colname")
    #ht = import_vcf("s3://seqr-resources/topmed/bravo-dbsnp-all.removed_chr_prefix.liftunder_GRCh37.vcf.gz","37","topmed")
    return ht
Example #29
def import_key(ss_filename, ss_keys):
    ss = hl.import_table(ss_filename, impute=True, delimiter=r'\s+')
    keys = ss_keys.split(',')
    p = keys[-1]
    ss = ss.annotate(locus=hl.locus(hl.str(ss[keys[0]]), ss[keys[1]]),
                     alleles=[ss[keys[2]], ss[keys[3]]])
    ss = ss.key_by(ss.locus, ss.alleles)
    return ss, p
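A hypothetical call, with the column names passed as a comma-separated string ('CHR,BP,A1,A2,P' here) and the file path a placeholder:

ss, p_col = import_key('sumstats.txt', 'CHR,BP,A1,A2,P')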
Example #30
    def compute_variant_id(alt):
        var_id = normalized_contig(locus) + "-" + hl.str(
            locus.position) + "-" + alleles[0] + "-" + alt

        if max_length is not None:
            var_id = var_id[:max_length]

        return var_id
Example #31
    def test_export_gen_exprs(self):
        gen = hl.import_gen(resource('example.gen'),
                            sample_file=resource('example.sample'),
                            contig_recoding={
                                "01": "1"
                            },
                            reference_genome='GRCh37',
                            min_partitions=3).add_col_index().add_row_index()

        out1 = new_temp_file()
        hl.export_gen(gen,
                      out1,
                      id1=hl.str(gen.col_idx),
                      id2=hl.str(gen.col_idx),
                      missing=0.5,
                      varid=hl.str(gen.row_idx),
                      rsid=hl.str(gen.row_idx),
                      gp=[0.0, 1.0, 0.0])

        in1 = (hl.import_gen(out1 + '.gen',
                             sample_file=out1 + '.sample',
                             min_partitions=3).add_col_index().add_row_index())
        self.assertTrue(
            in1.aggregate_entries(hl.agg.fraction(
                in1.GP == [0.0, 1.0, 0.0])) == 1.0)
        self.assertTrue(
            in1.aggregate_rows(
                hl.agg.fraction((in1.varid == hl.str(in1.row_idx))
                                & (in1.rsid == hl.str(in1.row_idx)))) == 1.0)
        self.assertTrue(
            in1.aggregate_cols(hl.agg.fraction(
                (in1.s == hl.str(in1.col_idx)))))
Example #32
    def test_make_table_empty_entry_field(self):
        mt = hl.utils.range_matrix_table(3, 2)
        mt = mt.select_entries(**{'': mt.row_idx * mt.col_idx})
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        t = mt.make_table()
        self.assertEqual(
            t.row.dtype,
            hl.tstruct(**{'row_idx': hl.tint32, '0': hl.tint32, '1': hl.tint32}))
Example #33
    def test_make_table_empty_entry_field(self):
        mt = hl.utils.range_matrix_table(3, 2)
        mt = mt.select_entries(**{'': mt.row_idx * mt.col_idx})
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        t = mt.make_table()
        self.assertEqual(
            t.row.dtype,
            hl.tstruct(**{'row_idx': hl.tint32, '0': hl.tint32, '1': hl.tint32}))
Example #34
    def test_make_table_sep(self):
        mt = hl.utils.range_matrix_table(3, 2)
        mt = mt.select_entries(x=mt.row_idx * mt.col_idx)
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        t = mt.make_table()
        assert list(t.row) == ['row_idx', '0.x', '1.x']

        t = mt.make_table(separator='__')
        assert list(t.row) == ['row_idx', '0__x', '1__x']
Example #35
    def test_rename_duplicates(self):
        mt = hl.utils.range_matrix_table(5, 5)

        assert hl.rename_duplicates(
            mt.key_cols_by(s=hl.str(mt.col_idx))
        ).unique_id.collect() == ['0', '1', '2', '3', '4']

        assert hl.rename_duplicates(
            mt.key_cols_by(s='0')
        ).unique_id.collect() == ['0', '0_1', '0_2', '0_3', '0_4']

        assert hl.rename_duplicates(
            mt.key_cols_by(s=hl.literal(['0', '0_1', '0', '0_2', '0'])[mt.col_idx])
        ).unique_id.collect() == ['0', '0_1', '0_2', '0_2_1', '0_3']

        assert hl.rename_duplicates(
            mt.key_cols_by(s=hl.str(mt.col_idx)),
            'foo'
        )['foo'].dtype == hl.tstr
Example #36
    def test_make_table(self):
        mt = hl.utils.range_matrix_table(3, 2)
        mt = mt.select_entries(x=mt.row_idx * mt.col_idx)
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        t = hl.Table.parallelize(
            [{'row_idx': 0, '0.x': 0, '1.x': 0},
             {'row_idx': 1, '0.x': 0, '1.x': 1},
             {'row_idx': 2, '0.x': 0, '1.x': 2}],
            hl.tstruct(**{'row_idx': hl.tint32, '0.x': hl.tint32, '1.x': hl.tint32}),
            key='row_idx')

        self.assertTrue(mt.make_table()._same(t))
Example #37
    def test_export_import_plink_same(self):
        mt = get_dataset()
        mt = mt.select_rows(rsid=hl.delimit([mt.locus.contig, hl.str(mt.locus.position), mt.alleles[0], mt.alleles[1]], ':'),
                            cm_position=15.0)
        mt = mt.select_cols(fam_id=hl.null(hl.tstr), pat_id=hl.null(hl.tstr), mat_id=hl.null(hl.tstr),
                            is_female=hl.null(hl.tbool), is_case=hl.null(hl.tbool))
        mt = mt.select_entries('GT')

        bfile = '/tmp/test_import_export_plink'
        hl.export_plink(mt, bfile, ind_id=mt.s, cm_position=mt.cm_position)

        mt_imported = hl.import_plink(bfile + '.bed', bfile + '.bim', bfile + '.fam',
                                      a2_reference=True, reference_genome='GRCh37')
        self.assertTrue(mt._same(mt_imported))
        self.assertTrue(mt.aggregate_rows(hl.agg.all(mt.cm_position == 15.0)))
Example #38
def generate_random_gen():
    mt = hl.utils.range_matrix_table(30, 10)
    mt = (mt.annotate_rows(locus = hl.locus('20', mt.row_idx + 1),
                           alleles = ['A', 'G'])
          .key_rows_by('locus', 'alleles'))
    mt = (mt.annotate_cols(s = hl.str(mt.col_idx))
          .key_cols_by('s'))
    # using totally random values leads rounding differences where
    # identical GEN values get rounded differently, leading to
    # differences in the GT call between import_{gen, bgen}
    mt = mt.annotate_entries(a = hl.int32(hl.rand_unif(0.0, 255.0)))
    mt = mt.annotate_entries(b = hl.int32(hl.rand_unif(0.0, 255.0 - mt.a)))
    mt = mt.transmute_entries(GP = hl.array([mt.a, mt.b, 255.0 - mt.a - mt.b]) / 255.0)
    # 20% missing
    mt = mt.filter_entries(hl.rand_bool(0.8))
    hl.export_gen(mt, 'random', precision=4)
Example #39
    def test_joins(self):
        vds = self.get_vds().select_rows(x1=1, y1=1)
        vds2 = vds.select_rows(x2=1, y2=2)
        vds2 = vds2.select_cols(c1=1, c2=2)

        vds = vds.annotate_rows(y2=vds2.index_rows(vds.row_key).y2)
        vds = vds.annotate_cols(c2=vds2.index_cols(vds.s).c2)

        vds = vds.annotate_cols(c2=vds2.index_cols(hl.str(vds.s)).c2)

        rt = vds.rows()
        ct = vds.cols()

        vds.annotate_rows(**rt[vds.locus, vds.alleles])

        self.assertTrue(rt.all(rt.y2 == 2))
        self.assertTrue(ct.all(ct.c2 == 2))
Example #40
    def test_export_gen_exprs(self):
        gen = hl.import_gen(resource('example.gen'),
                            sample_file=resource('example.sample'),
                            contig_recoding={"01": "1"},
                            reference_genome='GRCh37',
                            min_partitions=3).add_col_index().add_row_index()

        out1 = new_temp_file()
        hl.export_gen(gen, out1, id1=hl.str(gen.col_idx), id2=hl.str(gen.col_idx), missing=0.5,
                      varid=hl.str(gen.row_idx), rsid=hl.str(gen.row_idx), gp=[0.0, 1.0, 0.0])

        in1 = (hl.import_gen(out1 + '.gen', sample_file=out1 + '.sample', min_partitions=3)
               .add_col_index()
               .add_row_index())
        self.assertTrue(in1.aggregate_entries(hl.agg.fraction(in1.GP == [0.0, 1.0, 0.0])) == 1.0)
        self.assertTrue(in1.aggregate_rows(hl.agg.fraction((in1.varid == hl.str(in1.row_idx)) &
                                                           (in1.rsid == hl.str(in1.row_idx)))) == 1.0)
        self.assertTrue(in1.aggregate_cols(hl.agg.fraction((in1.s == hl.str(in1.col_idx)))))
Example #41
def _collect_scatter_plot_data(
        x: Tuple[str, NumericExpression],
        y: Tuple[str, NumericExpression],
        fields: Dict[str, Expression] = None,
        n_divisions: int = None,
        missing_label: str = 'NA'
) -> pd.DataFrame:

    expressions = dict()
    if fields is not None:
        expressions.update({k: hail.or_else(v, missing_label) if isinstance(v, StringExpression) else v for k, v in fields.items()})

    if n_divisions is None:
        collect_expr = hail.struct(**dict((k,v) for k,v in (x,y)), **expressions)
        plot_data = [point for point in collect_expr.collect() if point[x[0]] is not None and point[y[0]] is not None]
        source_pd = pd.DataFrame(plot_data)
    else:
        # FIXME: remove the type conversion logic if/when downsample supports continuous values for labels
        # Save all numeric types to cast in DataFrame
        numeric_expr = {k: 'int32' for k,v in expressions.items() if isinstance(v, Int32Expression)}
        numeric_expr.update({k: 'int64' for k,v in expressions.items() if isinstance(v, Int64Expression)})
        numeric_expr.update({k: 'float32' for k, v in expressions.items() if isinstance(v, Float32Expression)})
        numeric_expr.update({k: 'float64' for k, v in expressions.items() if isinstance(v, Float64Expression)})

        # Cast non-string types to string
        expressions = {k: hail.str(v) if not isinstance(v, StringExpression) else v for k,v in expressions.items()}

        agg_f = x[1]._aggregation_method()
        res = agg_f(hail.agg.downsample(x[1], y[1], label=list(expressions.values()) if expressions else None, n_divisions=n_divisions))
        source_pd = pd.DataFrame([
            dict(
                **{x[0]: point[0], y[0]: point[1]},
                **(dict(zip(expressions, point[2])) if point[2] is not None else {})
            ) for point in res
        ])
        source_pd = source_pd.astype(numeric_expr, copy=False)

    return source_pd
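A usage sketch, assuming a Table `ht` with numeric fields `x` and `y` and a string field `pop` (all names hypothetical):

# Downsample to roughly n_divisions bins, keeping `pop` as a hover label.
df = _collect_scatter_plot_data(('x', ht.x), ('y', ht.y),
                                fields={'pop': ht.pop}, n_divisions=500)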
Example #42
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500, significance_line=5e-8):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.
    significance_line : float, optional
        p-value at which to add a horizontal, dotted red line indicating
        genome-wide significance.  If ``None``, no line is added.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    if locus is None:
        locus = pvals._indices.source.locus

    ref = locus.dtype.reference_genome

    if hover_fields is None:
        hover_fields = {}

    hover_fields['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    source_pd = _collect_scatter_plot_data(
        ('_global_locus', locus.global_position()),
        ('_pval', pvals),
        fields=hover_fields,
        n_divisions=None if collect_all else n_divisions
    )
    source_pd['p_value'] = [10 ** (-p) for p in source_pd['_pval']]
    source_pd['_contig'] = [locus.split(":")[0] for locus in source_pd['locus']]

    observed_contigs = set(source_pd['_contig'])
    observed_contigs = [contig for contig in ref.contigs.copy() if contig in observed_contigs]
    contig_ticks = hail.eval([hail.locus(contig, int(ref.lengths[contig]/2)).global_position() for contig in observed_contigs])
    color_mapper = CategoricalColorMapper(factors=ref.contigs, palette=palette[:2] * int((len(ref.contigs) + 1) / 2))

    p = figure(title=title, x_axis_label='Chromosome', y_axis_label='P-value (-log10 scale)', width=1000)
    p, _, legend, _, _, _ = _get_scatter_plot_elements(
        p, source_pd, x_col='_global_locus', y_col='_pval',
        label_cols=['_contig'], colors={'_contig': color_mapper},
        size=size
    )
    legend.visible = False
    p.xaxis.ticker = contig_ticks
    p.xaxis.major_label_overrides = dict(zip(contig_ticks, observed_contigs))
    p.select_one(HoverTool).tooltips = [t for t in p.select_one(HoverTool).tooltips if not t[0].startswith('_')]

    if significance_line is not None:
        p.renderers.append(Span(location=-log10(significance_line),
                                dimension='width',
                                line_color='red',
                                line_dash='dashed',
                                line_width=1.5))

    return p
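A hypothetical rendering, assuming `gwas` is a locus-keyed Table of association results:

p = manhattan(gwas.p_value, locus=gwas.locus, title='GWAS')
# then, e.g., bokeh.io.show(p) to render the figure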
Example #43
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+


    Warning
    -------
        :func:`.ld_score` will fail if ``entry_expr`` results in any missing
        values. The special float value ``nan`` is not considered a
        missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----

    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.


    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus
        position.
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation."""

    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source

    if coord_expr is None:
        mt_coord_expr = mt_locus_expr
    else:
        mt_coord_expr = coord_expr._indices.source

    if not annotation_exprs:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr])
    else:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr] +
                        [mt == x._indices.source
                         for x in wrap_to_list(annotation_exprs)])

    if not check_mts:
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
                            (if specified), and annotation_exprs (if
                            specified) must come from same MatrixTable.""")

    n = mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n-1.0) / (n-2.0)) * r2 - (1.0 / (n-2.0))

    starts, stops = hl.linalg.utils.locus_windows(locus_expr,
                                                  radius,
                                                  coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotation_exprs:
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*wrap_to_list(annotation_exprs)).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x),
                           value=hl.float(ht[x]))
                 .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = {i: cols[i] for i in range(len(cols))}

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)

        a = BlockMatrix.read(a_tmp)
        l2 = r2_adj_sparse @ a

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({'f{:}'.format(i): col_idxs[i]
                                  for i in range(len(cols))})

    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht
Example #44
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    def get_contig_index(x, starts):
        left = 0
        right = len(starts) - 1
        while left <= right:
            mid = (left + right) // 2
            if x < starts[mid]:
                if x >= starts[mid - 1]:
                    return mid - 1
                right = mid
            elif x >= starts[mid+1]:
                left = mid + 1
            else:
                return mid

    if locus is None:
        locus = pvals._indices.source.locus

    if hover_fields is None:
        hover_fields = {}

    hover_fields['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    if collect_all:
        res = hail.tuple([locus.global_position(), pvals, hail.struct(**hover_fields)]).collect()
        hf_struct = [point[2] for point in res]
        for key in hover_fields:
            hover_fields[key] = [item[key] for item in hf_struct]
    else:
        agg_f = pvals._aggregation_method()
        res = agg_f(aggregators.downsample(locus.global_position(), pvals,
                                           label=hail.array([hail.str(x) for x in hover_fields.values()]),
                                           n_divisions=n_divisions))
        fields = [point[2] for point in res]
        for idx, key in enumerate(list(hover_fields.keys())):
            hover_fields[key] = [field[idx] for field in fields]

    x = [point[0] for point in res]
    y = [point[1] for point in res]
    y_linear = [10 ** (-p) for p in y]
    hover_fields['p_value'] = y_linear

    ref = locus.dtype.reference_genome

    total_pos = 0
    start_points = []
    for i in range(0, len(ref.contigs)):
        start_points.append(total_pos)
        total_pos += ref.lengths.get(ref.contigs[i])
    start_points.append(total_pos)  # end point of all contigs

    observed_contigs = set()
    label = []
    for element in x:
        contig_index = get_contig_index(element, start_points)
        label.append(str(contig_index % 2))
        observed_contigs.add(ref.contigs[contig_index])

    labels = ref.contigs.copy()
    num_deleted = 0
    mid_points = []
    for i in range(0, len(ref.contigs)):
        if ref.contigs[i] in observed_contigs:
            length = ref.lengths.get(ref.contigs[i])
            mid = start_points[i] + length / 2
            if mid % 1 == 0:
                mid += 0.5
            mid_points.append(mid)
        else:
            del labels[i - num_deleted]
            num_deleted += 1

    p = scatter(x, y, label=label, title=title, xlabel='Chromosome', ylabel='P-value (-log10 scale)',
                size=size, legend=False, source_fields=hover_fields)

    p.xaxis.ticker = mid_points
    p.xaxis.major_label_overrides = dict(zip(mid_points, labels))
    p.width = 1000

    tooltips = [(key, "@{}".format(key)) for key in hover_fields]
    p.add_tools(HoverTool(
        tooltips=tooltips
    ))

    return p
Example #45
def histogram2d(x, y, bins=40, range=None,
                title=None, width=600, height=600, font_size='7pt',
                colors=bokeh.palettes.all_palettes['Blues'][7][::-1]):
    """Plot a two-dimensional histogram.

    ``x`` and ``y`` must both be a :class:`NumericExpression` from the same :class:`Table`.

    If ``x_range`` or ``y_range`` are not provided, the function will do a pass through the data to determine
    min and max of each variable.

    Examples
    --------

    >>> ht = hail.utils.range_table(1000).annotate(x=hail.rand_norm(), y=hail.rand_norm())
    >>> p_hist = hail.plot.histogram2d(ht.x, ht.y)

    >>> ht = hail.utils.range_table(1000).annotate(x=hail.rand_norm(), y=hail.rand_norm())
    >>> p_hist = hail.plot.histogram2d(ht.x, ht.y, bins=10, range=((0, 1), None))

    Parameters
    ----------
    x : :class:`.NumericExpression`
        Expression for x-axis (from a Hail table).
    y : :class:`.NumericExpression`
        Expression for y-axis (from the same Hail table as ``x``).
    bins : int or [int, int]
        The bin specification:
        -   If int, the number of bins for the two dimensions (nx = ny = bins).
        -   If [int, int], the number of bins in each dimension (nx, ny = bins).
        The default value is 40.
    range : None or ((float, float), (float, float))
        The leftmost and rightmost edges of the bins along each dimension:
        ((xmin, xmax), (ymin, ymax)). All values outside of this range will be considered outliers
        and not tallied in the histogram. If this value is None, or either of the inner lists is None,
        the range will be computed from the data.
    width : int
        Plot width (default 600px).
    height : int
        Plot height (default 600px).
    title : str
        Title of the plot.
    font_size : str
        String of font size in points (default '7pt').
    colors : List[str]
        List of colors (hex codes, or strings as described
        `here <https://bokeh.pydata.org/en/latest/docs/reference/colors.html>`__). Compatible with one of the many
        built-in palettes available `here <https://bokeh.pydata.org/en/latest/docs/reference/palettes.html>`__.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    source = x._indices.source
    y_source = y._indices.source

    if source is None or y_source is None:
        raise ValueError("histogram_2d expects two expressions of 'Table', found scalar expression")
    if isinstance(source, hail.MatrixTable):
        raise ValueError("histogram_2d requires source to be Table, not MatrixTable")
    if source != y_source:
        raise ValueError(f"histogram_2d expects two expressions from the same 'Table', found {source} and {y_source}")
    check_row_indexed('histogram_2d', x)
    check_row_indexed('histogram_2d', y)
    if isinstance(bins, int):
        x_bins = y_bins = bins
    else:
        x_bins, y_bins = bins
    if range is None:
        x_range = y_range = None
    else:
        x_range, y_range = range
    if x_range is None or y_range is None:
        warnings.warn('At least one range was not defined in histogram_2d. Doing two passes...')
        ranges = source.aggregate(hail.struct(x_stats=hail.agg.stats(x),
                                              y_stats=hail.agg.stats(y)))
        if x_range is None:
            x_range = (ranges.x_stats.min, ranges.x_stats.max)
        if y_range is None:
            y_range = (ranges.y_stats.min, ranges.y_stats.max)
    else:
        warnings.warn('If x_range or y_range are specified in histogram_2d, and there are points '
                      'outside of these ranges, they will not be plotted')
    x_range = list(map(float, x_range))
    y_range = list(map(float, y_range))
    x_spacing = (x_range[1] - x_range[0]) / x_bins
    y_spacing = (y_range[1] - y_range[0]) / y_bins

    def frange(start, stop, step):
        from itertools import count, takewhile
        return takewhile(lambda x: x <= stop, count(start, step))

    x_levels = hail.literal(list(frange(x_range[0], x_range[1], x_spacing))[::-1])
    y_levels = hail.literal(list(frange(y_range[0], y_range[1], y_spacing))[::-1])

    grouped_ht = source.group_by(
        x=hail.str(x_levels.find(lambda w: x >= w)),
        y=hail.str(y_levels.find(lambda w: y >= w))
    ).aggregate(c=hail.agg.count())
    data = grouped_ht.filter(hail.is_defined(grouped_ht.x) & (grouped_ht.x != str(x_range[1])) &
                             hail.is_defined(grouped_ht.y) & (grouped_ht.y != str(y_range[1]))).to_pandas()

    mapper = LinearColorMapper(palette=colors, low=data.c.min(), high=data.c.max())

    x_axis = sorted(set(data.x), key=lambda z: float(z))
    y_axis = sorted(set(data.y), key=lambda z: float(z))
    p = figure(title=title,
               x_range=x_axis, y_range=y_axis,
               x_axis_location="above", plot_width=width, plot_height=height,
               tools="hover,save,pan,box_zoom,reset,wheel_zoom", toolbar_location='below')

    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    p.axis.major_label_text_font_size = font_size
    import math
    p.xaxis.major_label_orientation = math.pi / 3

    p.rect(x='x', y='y', width=1, height=1,
           source=data,
           fill_color={'field': 'c', 'transform': mapper},
           line_color=None)

    color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size=font_size,
                         ticker=BasicTicker(desired_num_ticks=6),
                         label_standoff=6, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')

    def set_font_size(p, font_size: str = '12pt'):
        """Set most of the font sizes in a bokeh figure

        Parameters
        ----------
        p : :class:`bokeh.plotting.figure.Figure`
            Input figure.
        font_size : str
            String of font size in points (e.g. '12pt').

        Returns
        -------
        :class:`bokeh.plotting.figure.Figure`
        """
        p.legend.label_text_font_size = font_size
        p.xaxis.axis_label_text_font_size = font_size
        p.yaxis.axis_label_text_font_size = font_size
        p.xaxis.major_label_text_font_size = font_size
        p.yaxis.major_label_text_font_size = font_size
        if hasattr(p.title, 'text_font_size'):
            p.title.text_font_size = font_size
        if hasattr(p.xaxis, 'group_text_font_size'):
            p.xaxis.group_text_font_size = font_size
        return p

    p.select_one(HoverTool).tooltips = [('x', '@x'), ('y', '@y',), ('count', '@c')]
    p = set_font_size(p, font_size)
    return p