Code example #1
def compressed_variant_id(locus: hl.expr.LocusExpression, alleles: hl.expr.ArrayExpression) -> hl.expr.StringExpression:
    return hl.rbind(
        hl.len(alleles[0]),
        hl.len(alleles[1]),
        lambda ref_len, alt_len: hl.case()
        .when(
            ref_len > alt_len,
            normalized_contig(locus.contig)
            + "-"
            + hl.str(locus.position)
            + "d"
            + hl.str(ref_len - alt_len)
            + "-"
            + alleles[1],
        )
        .when(
            ref_len < alt_len,
            normalized_contig(locus.contig)
            + "-"
            + hl.str(locus.position)
            + "i"
            + hl.str(alt_len - ref_len)
            + "-"
            + _encode_allele(alleles[1]),
        )
        .default(variant_id(locus, alleles)),
    )
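
A hedged usage sketch for the function above — it assumes `normalized_contig`, `_encode_allele`, and `variant_id` are importable from the same gnomAD-browser module as this function, and the VCF path is illustrative:

import hail as hl

# Any dataset with standard (locus, alleles) row fields will do.
ht = hl.import_vcf('data/sample.vcf').rows()  # illustrative path

# e.g. a 10 bp deletion 1-1000-ACTGACTGACT-A compresses to "1-1000d10-A"
ht = ht.annotate(compressed_vid=compressed_variant_id(ht.locus, ht.alleles))
ht.select('compressed_vid').show(5)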
Code example #2
File: test_impex.py Project: maccum/hail
    def test_import_bgen_variant_filtering(self):
        desired_variant_indexes = [1, 2, 3, 5, 7, 9, 11, 13, 17, 198]
        actual = hl.import_bgen(resource('example.8bits.bgen'), ['GT'],
                                contig_recoding={'01': '1'},
                                reference_genome=None,
                                n_partitions=10,
                                _row_fields=['file_row_idx'],
                                _variants_per_file={
                                    resource('example.8bits.bgen'):
                                    desired_variant_indexes
                                })
        # doing the expected import_bgen second catches the case where the
        # hadoop configuration is polluted with old data from the
        # _variants_per_file
        everything = hl.import_bgen(resource('example.8bits.bgen'), ['GT'],
                                    contig_recoding={'01': '1'},
                                    reference_genome=None,
                                    _row_fields=['file_row_idx'])
        self.assertEqual(everything.count(), (199, 500))

        expected = everything.filter_rows(
            hl.set(desired_variant_indexes).contains(
                hl.int32(everything.file_row_idx)))

        self.assertTrue(expected._same(actual))
        self.assertEqual(
            (hl.str(actual.locus.contig) + ":" +
             hl.str(actual.locus.position)).collect(), [
                 '1:3000', '1:4000', '1:5000', '1:7000', '1:9000', '1:11000',
                 '1:13000', '1:15000', '1:19000', '1:100001'
             ])
Code example #3
def add_coding_information(
        mt: hl.MatrixTable,
        coding_ht: hl.Table,
        phesant_phenotype_info_path: str,
        download_missing_codings: bool = False) -> hl.MatrixTable:
    """
    Add coding information from coding_ht as column annotations to mt.

    :param MatrixTable mt: Input MT
    :param Table coding_ht: HT with coding information
    :param str phesant_phenotype_info_path: PHESANT phenotype metadata path
    :param bool download_missing_codings: Whether to download missing coding data
    :return: MT with coding information in column data
    :rtype: MatrixTable
    """
    mt = mt.annotate_cols(**coding_ht[(mt.coding_id, hl.str(mt.coding))])
    if download_missing_codings:
        get_missing_codings(mt.cols())
    phesant_summary = hl.import_table(phesant_phenotype_info_path,
                                      impute=True,
                                      missing='',
                                      key='FieldID')
    phesant_reassign = get_phesant_reassignments(phesant_summary)
    mt = mt.annotate_cols(recoding=hl.or_missing(
        hl.is_missing(mt.meaning), phesant_reassign[mt.col_key.select(
            'phenocode', 'coding')].reassign_from))
    return mt.annotate_cols(
        **hl.cond(hl.is_defined(mt.meaning),
                  hl.struct(**{x: mt[x]
                               for x in list(coding_ht.row_value)}),
                  coding_ht[(mt.coding_id, hl.str(mt.recoding))]), )
Code example #4
def load_cmg(cmg_csv: str) -> hl.Table:
    cmg_ht = hl.import_table(cmg_csv, impute=True, delimiter=",", quote='"')

    cmg_ht = cmg_ht.transmute(
        locus1_b38=hl.locus("chr" + hl.str(cmg_ht.chrom_1), cmg_ht.pos_1, reference_genome='GRCh38'),
        alleles1_b38=[cmg_ht.ref_1, cmg_ht.alt_1],
        locus2_b38=hl.locus("chr" + hl.str(cmg_ht.chrom_2), cmg_ht.pos_2, reference_genome='GRCh38'),
        alleles2_b38=[cmg_ht.ref_2, cmg_ht.alt_2]
    )

    liftover_references = get_liftover_genome(cmg_ht.rename({'locus1_b38': 'locus'}))
    lifted_over_variants = hl.sorted(
        hl.array([
            liftover_expr(cmg_ht.locus1_b38, cmg_ht.alleles1_b38, liftover_references[1]),
            liftover_expr(cmg_ht.locus2_b38, cmg_ht.alleles2_b38, liftover_references[1])
        ]),
        lambda x: x.locus
    )

    cmg_ht = cmg_ht.key_by(
        locus1=lifted_over_variants[0].locus,
        alleles1=lifted_over_variants[0].alleles,
        locus2=lifted_over_variants[1].locus,
        alleles2=lifted_over_variants[1].alleles
    )

    return cmg_ht.annotate(
        bad_liftover=(
                hl.is_missing(cmg_ht.locus1) |
                hl.is_missing(cmg_ht.locus2) |
                (cmg_ht.locus1.sequence_context() != cmg_ht.alleles1[0][0]) |
                (cmg_ht.locus2.sequence_context() != cmg_ht.alleles2[0][0])
        )
    )
Code example #5
def intersect_target_ref(ref_mt_filt,
                         snp_list,
                         grch37_or_grch38,
                         intersect_out,
                         overwrite: bool = False):
    mt = hl.read_matrix_table(ref_mt_filt)
    if grch37_or_grch38.lower() == 'grch38':
        snp_list = snp_list.key_by(locus=hl.locus(hl.str(snp_list.chr),
                                                  hl.int(snp_list.pos),
                                                  reference_genome='GRCh38'),
                                   alleles=[snp_list.ref, snp_list.alt])
        mt = mt.filter_rows(hl.is_defined(snp_list[mt.row_key]))

    elif grch37_or_grch38.lower() == 'grch37':
        snp_list = snp_list.key_by(locus=hl.locus(hl.str(snp_list.chr),
                                                  hl.int(snp_list.pos),
                                                  reference_genome='GRCh37'),
                                   alleles=[snp_list.ref, snp_list.alt])
        # liftover snp list to GRCh38, filter to SNPs in mt
        rg37, rg38 = load_liftover()

        snp_liftover = snp_list.annotate(
            new_locus=hl.liftover(snp_list.locus, 'GRCh38'))
        snp_liftover = snp_liftover.filter(
            hl.is_defined(snp_liftover.new_locus))
        snp_liftover = snp_liftover.key_by(locus=snp_liftover.new_locus,
                                           alleles=snp_liftover.alleles)
        mt = mt.filter_rows(hl.is_defined(snp_liftover[mt.row_key]))

    mt = mt.repartition(5000)
    mt = mt.checkpoint(intersect_out,
                       overwrite=overwrite,
                       _read_if_exists=not overwrite)
Code example #6
def require_biallelic(dataset, method) -> MatrixTable:
    require_row_key_variant(dataset, method)
    return dataset._select_rows(
        method,
        hl.case().when(dataset.alleles.length() == 2, dataset._rvrow).or_error(
            f"'{method}' expects biallelic variants ('alleles' field of length 2), found "
            + hl.str(dataset.locus) + ", " + hl.str(dataset.alleles)))
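
A hedged usage sketch: require_biallelic is an internal guard used by Hail's methods, and the error only surfaces when the expression is evaluated. The import path below follows the misc.py attribution under Code example #8 and may vary by Hail version:

import hail as hl
from hail.methods.misc import require_biallelic  # internal API; assumed path

mt = hl.balding_nichols_model(n_populations=1, n_samples=3, n_variants=5)
mt = require_biallelic(mt, 'my_method')  # simulated sites are biallelic, so this passes
mt.rows().show(2)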
Code example #7
File: annotation.py Project: enriquea/wes_chd_ukbb
def annotate_variant_id(
        t: Union[hl.Table, hl.MatrixTable],
        field_name: str = 'vid') -> Union[hl.Table, hl.MatrixTable]:
    """
    Expects an input dataset with bi-allelic variants and the fields `locus` and `alleles`.
    Annotates variant IDs in the form 'chr:position:ref:alt'.

    :param field_name: variant id field name
    :param t: dataset
    :return: HailTable or MatrixTable
    """

    variant_id_ann_exp = {
        field_name:
        hl.delimit([
            hl.str(t.locus.contig),
            hl.str(t.locus.position),
            hl.str(t.alleles[0]),
            hl.str(t.alleles[1])
        ],
                   delimiter=":")
    }

    if isinstance(t, hl.Table):
        return t.annotate(**variant_id_ann_exp)
    else:
        return t.annotate_rows(**variant_id_ann_exp)
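
A minimal usage sketch for annotate_variant_id on a tiny synthetic table (the values are illustrative):

import hail as hl

ht = hl.Table.parallelize(
    [hl.Struct(locus=hl.Locus('1', 12345, reference_genome='GRCh37'),
               alleles=['A', 'T'])],
    hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr)))
ht = annotate_variant_id(ht)
ht.show()  # vid == '1:12345:A:T'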
Code example #8
File: misc.py Project: tpoterba/hail
def require_biallelic(dataset, method) -> MatrixTable:
    require_row_key_variant(dataset, method)
    return dataset._select_rows(method,
                                hl.case()
                                .when(dataset.alleles.length() == 2, dataset._rvrow)
                                .or_error(f"'{method}' expects biallelic variants ('alleles' field of length 2), found " +
                                        hl.str(dataset.locus) + ", " + hl.str(dataset.alleles)))
Code example #9
def get_lgt(e, n_alleles, has_non_ref, row):
    index = e.GT.unphased_diploid_gt_index()
    n_no_nonref = n_alleles - hl.int(has_non_ref)
    triangle_without_nonref = hl.triangle(n_no_nonref)
    return (hl.case().when(index < triangle_without_nonref, e.GT).when(
        index < hl.triangle(n_alleles),
        hl.null('call')).or_error('invalid GT ' + hl.str(e.GT) +
                                  ' at site ' + hl.str(row.locus)))
Code example #10
File: phenotype_loading.py Project: wlu04/ukb_common
def parse_first_occurrence(x):
    return (hl.case(missing_false=True)
        .when(hl.is_defined(hl.parse_float(x)), hl.float64(x))  # Source of the first code ...
        .when(hl.literal(pseudo_dates).contains(hl.str(x)), hl.null(hl.tfloat64))  # Setting past and future dates to missing
        .when(hl.str(x) == '1902-02-02', 0.0)  # Matches DOB
        .when(hl.str(x) == '1903-03-03',  # Within year of birth (taking midpoint between month of birth and EOY)
              (hl.experimental.strptime('1970-12-31 00:00:00', '%Y-%m-%d %H:%M:%S', 'GMT') -
               hl.experimental.strptime('1970-' + month + '-15 00:00:00', '%Y-%m-%d %H:%M:%S',
                                        'GMT')) / 2)
        .default(hl.experimental.strptime(hl.str(x) + ' 00:00:00', '%Y-%m-%d %H:%M:%S', 'GMT') - dob
    ))
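
This closure captures pseudo_dates, month, and dob from its enclosing scope; none of them appear in the snippet. A hedged sketch of plausible definitions (the sentinel dates and date of birth below are assumptions, not the project's actual values):

import hail as hl

pseudo_dates = {'1900-01-01', '2037-07-07'}  # assumed past/future sentinel codes
month = '06'                                 # assumed zero-padded month of birth
dob = hl.experimental.strptime('1970-06-15 00:00:00', '%Y-%m-%d %H:%M:%S', 'GMT')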
Code example #11
def require_biallelic(dataset,
                      method,
                      tolerate_generic_locus: bool = False) -> MatrixTable:
    if tolerate_generic_locus:
        require_row_key_variant_w_struct_locus(dataset, method)
    else:
        require_row_key_variant(dataset, method)
    return dataset._select_rows(
        method,
        hl.case().when(dataset.alleles.length() == 2, dataset._rvrow).or_error(
            f"'{method}' expects biallelic variants ('alleles' field of length 2), found "
            + hl.str(dataset.locus) + ", " + hl.str(dataset.alleles)))
Code example #12
File: test_api.py Project: shulik7/hail
    def test(self):
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
                            f=hl.tarray(hl.tint32),
                            g=hl.tarray(
                                hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
                            h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
                            i=hl.tbool,
                            j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr))

        rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5,
                 'e': "hello", 'f': [1, 2, 3],
                 'g': [hl.Struct(x=1, y=5, z='banana')],
                 'h': hl.Struct(a=5, b=3, c='winter'),
                 'i': True,
                 'j': hl.Struct(x=3, y=2, z='summer')}]

        kt = hl.Table.parallelize(rows, schema)

        result = convert_struct_to_dict(kt.annotate(
            chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
            ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
            dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
            dpois=hl.dpois(4, kt.a),
            drop=kt.h.drop('b', 'c'),
            exp=hl.exp(kt.c),
            fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
            hwe=hl.hardy_weinberg_p(1, 2, 1),
            index=hl.index(kt.g, 'z'),
            is_defined=hl.is_defined(kt.i),
            is_missing=hl.is_missing(kt.i),
            is_nan=hl.is_nan(hl.float64(kt.a)),
            json=hl.json(kt.g),
            log=hl.log(kt.a, kt.b),
            log10=hl.log10(kt.c),
            or_else=hl.or_else(kt.a, 5),
            or_missing=hl.or_missing(kt.i, kt.j),
            pchisqtail=hl.pchisqtail(kt.a, kt.b),
            pcoin=hl.rand_bool(0.5),
            pnorm=hl.pnorm(0.2),
            pow=2.0 ** kt.b,
            ppois=hl.ppois(kt.a, kt.b),
            qchisqtail=hl.qchisqtail(kt.a, kt.b),
            range=hl.range(0, 5, kt.b),
            rnorm=hl.rand_norm(0.0, kt.b),
            rpois=hl.rand_pois(kt.a),
            runif=hl.rand_unif(kt.b, kt.a),
            select=kt.h.select('c', 'b'),
            sqrt=hl.sqrt(kt.a),
            to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
            where=hl.cond(kt.i, 5, 10)
        ).take(1)[0])
Code example #13
def to_plink(pops: list,
             subsets_dir,
             mt,
             ht_sample,
             bfile_path,
             export_varid: bool = True,
             overwrite=False):
    r'''
    Exports a matrix table to PLINK2 files.
    NOTE: These files will need to be split up by chromosome before plink_clump.py
    can be run.
    '''
    assert 'GT' in mt.entry, "mt must have 'GT' as an entry field"
    assert mt.GT.dtype == hl.tcall, "entry field 'GT' must be of type `Call`"

    if not overwrite and all([
            hl.hadoop_exists(f'{bfile_path}.{suffix}')
            for suffix in ['bed', 'bim']
    ]):
        print(f'\nPLINK .bed and .bim files already exist for {bfile_path}')
        print(bfile_path)
    else:
        print(f'Saving to bfile prefix {bfile_path}')
        mt_sample = mt.annotate_rows(varid=hl.str(mt.locus) + ':' +
                                     mt.alleles[0] + ':' + mt.alleles[1])
        mt_sample = mt_sample.filter_cols(hl.is_defined(
            ht_sample[mt_sample.s]))
        hl.export_plink(dataset=mt_sample,
                        output=bfile_path,
                        ind_id=mt_sample.s,
                        varid=mt_sample.varid)  # varid used to be rsid
Code example #14
def setupAnnotationDBTests(cls):
    startTestHailContext()
    t = hl.utils.range_table(10)
    t = t.annotate(locus=hl.locus('1', t.idx + 1))
    t = t.annotate(annotation=hl.str(t.idx))
    d = tempfile.TemporaryDirectory()
    fname = d.name + '/f.mt'
    t.write(fname)
    cls.temp_dir = d
    cls.db_json = {
        'unique_dataset': {
            'description': 'now with unique rows!',
            'url': 'https://example.com',
            'key_properties': ['unique'],
            'versions': [{
                'url': fname,
                'version': 'v1-GRCh37'
            }]
        },
        'nonunique_dataset': {
            'description': 'non-unique rows :(',
            'url': 'https://example.net',
            'key_properties': [],
            'versions': [{
                'url': fname,
                'version': 'v1-GRCh37'
            }]
        }
    }
Code example #15
def specific_clumps(filename):
    clump = hl.import_table(filename,
                            delimiter='\s+',
                            min_partitions=10,
                            types={'P': hl.tfloat})
    clump = clump.key_by(locus=hl.locus(hl.str(clump.CHR), hl.int(clump.BP)))
    return clump
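
A hedged usage sketch joining the clump table back onto a summary-statistics table (paths and column names are illustrative):

import hail as hl

clump = specific_clumps('plink.clumped')  # PLINK --clump output, illustrative path
ss = hl.import_table('sumstats.tsv', impute=True,
                     types={'CHR': hl.tstr, 'BP': hl.tint})
ss = ss.key_by(locus=hl.locus(ss.CHR, ss.BP))
ss = ss.annotate(is_clump_index=hl.is_defined(clump[ss.locus]))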
Code example #16
def create_gene_map_ht(ht, check_gene_contigs=False):
    from gnomad.utils.vep import process_consequences

    ht = process_consequences(ht)
    ht = ht.explode(ht.vep.worst_csq_by_gene_canonical)
    ht = ht.annotate(
        variant_id=ht.locus.contig + ':' + hl.str(ht.locus.position) + '_' +
        ht.alleles[0] + '/' + ht.alleles[1],
        annotation=annotation_case_builder(ht.vep.worst_csq_by_gene_canonical))
    if check_gene_contigs:
        gene_contigs = ht.group_by(
            gene_id=ht.vep.worst_csq_by_gene_canonical.gene_id,
            gene_symbol=ht.vep.worst_csq_by_gene_canonical.gene_symbol,
        ).aggregate(contigs=hl.agg.collect_as_set(ht.locus.contig))
        assert gene_contigs.all(hl.len(gene_contigs.contigs) == 1)

    gene_map_ht = ht.group_by(
        gene_id=ht.vep.worst_csq_by_gene_canonical.gene_id,
        gene_symbol=ht.vep.worst_csq_by_gene_canonical.gene_symbol,
    ).partition_hint(100).aggregate(
        interval=hl.interval(start=hl.locus(
            hl.agg.take(ht.locus.contig, 1)[0], hl.agg.min(ht.locus.position)),
                             end=hl.locus(
                                 hl.agg.take(ht.locus.contig, 1)[0],
                                 hl.agg.max(ht.locus.position))),
        variants=hl.agg.group_by(ht.annotation, hl.agg.collect(ht.variant_id)),
    )
    return gene_map_ht
Code example #17
File: test_matrix_table.py Project: tpoterba/hail
    def test_make_table_row_equivalence(self):
        mt = hl.utils.range_matrix_table(3, 3)
        mt = mt.annotate_rows(r1 = hl.rand_norm(), r2 = hl.rand_norm())
        mt = mt.annotate_entries(e1 = hl.rand_norm(), e2 = hl.rand_norm())
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        assert mt.make_table().select(*mt.row_value)._same(mt.rows())
Code example #18
def table_aggregate_downsample_dense():
    ht = hl.read_table(resource('many_ints_table.ht'))
    ht.aggregate(
        tuple([
            hl.agg.downsample(ht[f'i{i}'], ht['i3'], label=hl.str(ht['i4']))
            for i in range(3)
        ]))
Code example #19
File: test_methods.py Project: shulik7/hail
    def test_pcrelate(self):
        dataset = hl.balding_nichols_model(3, 100, 100)
        dataset = dataset.annotate_cols(sample_idx = hl.str(dataset.sample_idx))
        t = hl.pc_relate(dataset, 2, 0.05, block_size=64, statistics="phi")

        self.assertTrue(isinstance(t, hl.Table))
        t.count()
Code example #20
    def test_make_table_row_equivalence(self):
        mt = hl.utils.range_matrix_table(3, 3)
        mt = mt.annotate_rows(r1=hl.rand_norm(), r2=hl.rand_norm())
        mt = mt.annotate_entries(e1=hl.rand_norm(), e2=hl.rand_norm())
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        assert mt.make_table().select(*mt.row_value)._same(mt.rows())
Code example #21
    def test_make_table(self):
        mt = hl.utils.range_matrix_table(3, 2)
        mt = mt.select_entries(x=mt.row_idx * mt.col_idx)
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        t = hl.Table.parallelize(
            [{
                'row_idx': 0,
                '0.x': 0,
                '1.x': 0
            }, {
                'row_idx': 1,
                '0.x': 0,
                '1.x': 1
            }, {
                'row_idx': 2,
                '0.x': 0,
                '1.x': 2
            }],
            hl.tstruct(**{
                'row_idx': hl.tint32,
                '0.x': hl.tint32,
                '1.x': hl.tint32
            }),
            key='row_idx')

        self.assertTrue(mt.make_table()._same(t))
Code example #22
def table_aggregate_downsample_dense(ht_path):
    ht = hl.read_table(ht_path)
    ht.aggregate(
        tuple([
            hl.agg.downsample(ht[f'i{i}'], ht['i3'], label=hl.str(ht['i4']))
            for i in range(3)
        ]))
Code example #23
def import_key(ss_filename, ss_keys, clump_name):
    keys = ss_keys.split(',')
    ss = hl.import_table(ss_filename,
                         impute=True,
                         delimiter='\s+',
                         types={
                             keys[1]: hl.tfloat,
                             keys[0]: hl.tstr
                         },
                         min_partitions=100)
    clump = hl.import_table(clump_name,
                            delimiter='\s+',
                            min_partitions=10,
                            types={
                                'P': hl.tfloat,
                                'CHR': hl.tstr,
                                'BP': hl.tint
                            })
    clump = clump.key_by(locus=hl.locus(clump.CHR, clump.BP))
    clump = clump.filter(clump.P < 5e-8)
    ss = ss.annotate(**{keys[1]: hl.int(ss[keys[1]])})
    chroms = set(map(str, range(1, 23)))
    ss = ss.filter(hl.literal(chroms).contains(ss[keys[0]]))
    ss = ss.annotate(locus=hl.locus(hl.str(ss[keys[0]]), ss[keys[1]]),
                     alleles=[ss[keys[2]], ss[keys[3]]])
    ss = ss.key_by(ss.locus)
    ss = ss.annotate(clump=hl.is_defined(clump[ss.key]))
    ss = ss.key_by(ss.locus, ss.alleles)
    p = keys[-1]
    return ss, p
Code example #24
def specific_clumps(filename):
    clump = hl.import_table(filename, delimiter='\s+', min_partitions=10, types={'P': hl.tfloat})
    clump_dict = clump.aggregate(hl.dict(hl.agg.collect(
        (hl.locus(hl.str(clump.CHR), hl.int(clump.BP)),
        True)
    )), _localize=False)
    return clump_dict
Code example #25
def annotate_phen(tb, phen, sex, phen_tb_dict, filter_to_phen=True):
    r'''
    Annotates `tb` with phenotype `phen` and filters to individuals with 
    phenotype defined. Uses sex-specific IRNT phenotypes.
    sex options: female, male, both_sexes
    '''
    print(
        f'\n... Reading UKB phenotype "{phen_dict[phen][0]}" for {sex} (code: {phen}) ...'
    )

    phen_tb0 = phen_tb_dict[sex]
    phen_tb = phen_tb0.select(phen).rename({phen: 'phen'})

    if type(tb) == hl.table.Table:
        annotate_fn = hl.Table.annotate
        filter_fn = hl.Table.filter
    elif type(tb) == hl.matrixtable.MatrixTable:
        annotate_fn = hl.MatrixTable.annotate_cols
        filter_fn = hl.MatrixTable.filter_cols

    tb0 = annotate_fn(self=tb,
                      phen_str=hl.str(phen_tb[tb.s]['phen']).replace('\"', ''))

    if filter_to_phen:  # filter to individuals with phenotype data defined
        tb1 = filter_fn(self=tb0, expr=tb0.phen_str == '', keep=False)

    if phen_tb.phen.dtype == hl.dtype('bool'):
        tb2 = annotate_fn(self=tb1,
                          phen=hl.bool(tb1.phen_str)).drop('phen_str')
    else:
        tb2 = annotate_fn(self=tb1,
                          phen=hl.float64(tb1.phen_str)).drop('phen_str')

    return tb2
Code example #26
def assign_platform_from_pcs(
    platform_pca_scores_ht: hl.Table,
    pc_scores_ann: str = "scores",
    hdbscan_min_cluster_size: Optional[int] = None,
    hdbscan_min_samples: Optional[int] = None,
) -> hl.Table:
    """
    Assigns platforms using HDBSCAN on the results of call-rate PCA.

    :param platform_pca_scores_ht: Input table with the PCA score for each sample
    :param pc_scores_ann: Field containing the scores
    :param hdbscan_min_cluster_size: HDBSCAN `min_cluster_size` parameter. If not specified, the smaller of 500 and 0.1*n_samples will be used.
    :param hdbscan_min_samples: HDBSCAN `min_samples` parameter
    :return: A Table with a `qc_platform` annotation containing the platform based on HDBSCAN clustering
    """

    logger.info("Assigning platforms based on platform PCA clustering")

    # Read and format data for clustering
    data = platform_pca_scores_ht.to_pandas()
    callrate_data = np.matrix(data[pc_scores_ann].tolist())
    logger.info("Assigning platforms to {} samples.".format(
        len(callrate_data)))

    # Cluster data
    if hdbscan_min_cluster_size is None:
        hdbscan_min_cluster_size = min(500, 0.1 * data.shape[0])
    clusterer = hdbscan.HDBSCAN(min_cluster_size=hdbscan_min_cluster_size,
                                min_samples=hdbscan_min_samples)
    cluster_labels = clusterer.fit_predict(callrate_data)
    n_clusters = len(set(cluster_labels)) - (
        -1 in cluster_labels
    )  # NOTE: -1 is the label for noisy (un-classifiable) data points
    logger.info("Found {} unique platforms during platform imputation.".format(
        n_clusters))

    data["qc_platform"] = cluster_labels

    # Note: write the pandas dataframe to disk and re-import it as a Hail Table.
    # This is a temporary workaround until Hail's issue with 'hl.Table.from_pandas'
    # and mismatched Python versions between driver and executors is sorted out.
    (data.drop(axis=1, labels=pc_scores_ann).to_csv(
        f'{local_dir}/tmp/data_tmp_hdbscan.tsv', index=False, sep='\t'))
    ht_tmp = (hl.import_table(f'{nfs_dir}/tmp/data_tmp_hdbscan.tsv',
                              impute=True).key_by(*platform_pca_scores_ht.key))

    ht = platform_pca_scores_ht.join(ht_tmp)

    # original/elegant solution (TODO: sort issue with 'from_pandas' function)
    # ht = hl.Table.from_pandas(data, key=[*platform_pca_scores_ht.key])

    # Expand the array structure and annotate the scores (PCs) as individual fields.
    # Drop the array scores field before exporting the results.
    n_pcs = len(ht[pc_scores_ann].take(1)[0])
    ht = (ht.annotate(
        **{f'platform_PC{i + 1}': ht[pc_scores_ann][i]
           for i in range(n_pcs)}).drop(pc_scores_ann))

    ht = ht.annotate(qc_platform="platform_" + hl.str(ht.qc_platform))
    return ht
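
A hedged usage sketch, assuming the input comes from a PCA over a call-rate matrix (the MatrixTable path and entry field are illustrative; hl.pca's scores table uses the default 'scores' field expected by pc_scores_ann, and the function itself relies on module globals such as logger, local_dir, nfs_dir, and the hdbscan package):

import hail as hl

mt = hl.read_matrix_table('callrate.mt')  # hypothetical per-interval call-rate MT
eigenvalues, scores_ht, _ = hl.pca(mt.callrate, k=10)
platform_ht = assign_platform_from_pcs(scores_ht)
print(platform_ht.aggregate(hl.agg.counter(platform_ht.qc_platform)))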
Code example #27
def default_compute_info(mt: hl.MatrixTable,
                         site_annotations: bool = False,
                         n_partitions: int = 5000) -> hl.Table:
    """
    Computes a HT with the typical GATK allele-specific (AS) info fields 
    as well as ACs and lowqual fields.
    Note that this table doesn't split multi-allelic sites.

    :param mt: Input MatrixTable. Note that this table should be filtered to nonref sites.
    :param site_annotations: Whether to also generate site level info fields. Default is False.
    :param n_partitions: Number of desired partitions for output Table. Default is 5000.
    :return: Table with info fields
    :rtype: Table
    """
    # Move gvcf info entries out from nested struct
    mt = mt.transmute_entries(**mt.gvcf_info)

    # Compute AS info expr
    info_expr = get_as_info_expr(mt)

    if site_annotations:
        info_expr = info_expr.annotate(**get_site_info_expr(mt))

    # Add AC and AC_raw:
    # First compute ACs for each non-ref allele, grouped by adj
    grp_ac_expr = hl.agg.array_agg(
        lambda ai: hl.agg.filter(
            mt.LA.contains(ai),
            hl.agg.group_by(
                get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD),
                hl.agg.sum(
                    mt.LGT.one_hot_alleles(mt.LA.map(lambda x: hl.str(x)))[
                        mt.LA.index(ai)]),
            ),
        ),
        hl.range(1, hl.len(mt.alleles)),
    )

    # Then, for each non-ref allele, compute
    # AC as the adj group and
    # AC_raw as the sum of the adj and non-adj groups
    info_expr = info_expr.annotate(
        AC_raw=grp_ac_expr.map(
            lambda i: hl.int32(i.get(True, 0) + i.get(False, 0))),
        AC=grp_ac_expr.map(lambda i: hl.int32(i.get(True, 0))),
    )

    info_ht = mt.select_rows(info=info_expr).rows()

    # Add AS lowqual flag
    info_ht = info_ht.annotate(AS_lowqual=get_lowqual_expr(
        info_ht.alleles, info_ht.info.AS_QUALapprox))

    if site_annotations:
        # Add lowqual flag
        info_ht = info_ht.annotate(
            lowqual=get_lowqual_expr(info_ht.alleles, info_ht.info.QUALapprox))

    return info_ht.naive_coalesce(n_partitions)
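
A hedged usage sketch, assuming a sparse gVCF-combiner-style MatrixTable with LGT/LA/LAD/GQ/DP entry fields and a nested gvcf_info struct, and with the gnomad.utils helpers this function references available (the paths are illustrative):

import hail as hl

mt = hl.read_matrix_table('combined_gvcfs.mt')  # hypothetical sparse MT
info_ht = default_compute_info(mt, site_annotations=True, n_partitions=1000)
info_ht.write('info.ht', overwrite=True)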
Code example #28
def get_omim():

    ht = hl.import_table("s3://seqr-resources/omim/genemap2.txt",
                         delimiter='|')
    ht = ht.annotate(colname=hl.str("omim"))
    ht = ht.to_matrix_table('Ensembl Gene ID', "colname")
    #ht = import_vcf("s3://seqr-resources/topmed/bravo-dbsnp-all.removed_chr_prefix.liftunder_GRCh37.vcf.gz","37","topmed")
    return ht
Code example #29
def import_key(ss_filename, ss_keys):
    ss = hl.import_table(ss_filename, impute=True, delimiter='\s+')
    keys = ss_keys.split(',')
    p = keys[-1]
    ss = ss.annotate(locus=hl.locus(hl.str(ss[keys[0]]), ss[keys[1]]),
                     alleles=[ss[keys[2]], ss[keys[3]]])
    ss = ss.key_by(ss.locus, ss.alleles)
    return ss, p
Code example #30
File: variant_id.py Project: shechter/gnomadjs
    def compute_variant_id(alt):
        var_id = normalized_contig(locus) + "-" + hl.str(
            locus.position) + "-" + alleles[0] + "-" + alt

        if max_length is not None:
            var_id = var_id[:max_length]

        return var_id
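
For context, in the gnomAD browser code this helper is nested inside a variant-ID builder that supplies locus, alleles, and max_length. A self-contained sketch of that builder, assuming normalized_contig merely strips a leading 'chr' prefix:

import hail as hl

def variant_id(locus, alleles, max_length=None):
    # Assumption: normalized_contig strips a leading 'chr' prefix.
    contig = locus.contig.replace('^chr', '')
    var_id = contig + '-' + hl.str(locus.position) + '-' + alleles[0] + '-' + alleles[1]
    if max_length is not None:
        var_id = var_id[:max_length]  # Hail StringExpressions support slicing
    return var_id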
Code example #31
    def test_export_gen_exprs(self):
        gen = hl.import_gen(resource('example.gen'),
                            sample_file=resource('example.sample'),
                            contig_recoding={
                                "01": "1"
                            },
                            reference_genome='GRCh37',
                            min_partitions=3).add_col_index().add_row_index()

        out1 = new_temp_file()
        hl.export_gen(gen,
                      out1,
                      id1=hl.str(gen.col_idx),
                      id2=hl.str(gen.col_idx),
                      missing=0.5,
                      varid=hl.str(gen.row_idx),
                      rsid=hl.str(gen.row_idx),
                      gp=[0.0, 1.0, 0.0])

        in1 = (hl.import_gen(out1 + '.gen',
                             sample_file=out1 + '.sample',
                             min_partitions=3).add_col_index().add_row_index())
        self.assertTrue(
            in1.aggregate_entries(hl.agg.fraction(
                in1.GP == [0.0, 1.0, 0.0])) == 1.0)
        self.assertTrue(
            in1.aggregate_rows(
                hl.agg.fraction((in1.varid == hl.str(in1.row_idx))
                                & (in1.rsid == hl.str(in1.row_idx)))) == 1.0)
        self.assertTrue(
            in1.aggregate_cols(hl.agg.fraction(
                (in1.s == hl.str(in1.col_idx)))))
Code example #32
File: test_matrix_table.py Project: tpoterba/hail
    def test_make_table_empty_entry_field(self):
        mt = hl.utils.range_matrix_table(3, 2)
        mt = mt.select_entries(**{'': mt.row_idx * mt.col_idx})
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        t = mt.make_table()
        self.assertEqual(
            t.row.dtype,
            hl.tstruct(**{'row_idx': hl.tint32, '0': hl.tint32, '1': hl.tint32}))
Code example #33
File: test_matrix_table.py Project: similarface/hail
    def test_make_table_empty_entry_field(self):
        mt = hl.utils.range_matrix_table(3, 2)
        mt = mt.select_entries(**{'': mt.row_idx * mt.col_idx})
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        t = mt.make_table()
        self.assertEqual(
            t.row.dtype,
            hl.tstruct(**{'row_idx': hl.tint32, '0': hl.tint32, '1': hl.tint32}))
Code example #34
File: test_matrix_table.py Project: tpoterba/hail
    def test_make_table_sep(self):
        mt = hl.utils.range_matrix_table(3, 2)
        mt = mt.select_entries(x=mt.row_idx * mt.col_idx)
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        t = mt.make_table()
        assert list(t.row) == ['row_idx', '0.x', '1.x']

        t = mt.make_table(separator='__')
        assert list(t.row) == ['row_idx', '0__x', '1__x']
Code example #35
File: test_misc.py Project: jigold/hail
    def test_rename_duplicates(self):
        mt = hl.utils.range_matrix_table(5, 5)

        assert hl.rename_duplicates(
            mt.key_cols_by(s=hl.str(mt.col_idx))
        ).unique_id.collect() == ['0', '1', '2', '3', '4']

        assert hl.rename_duplicates(
            mt.key_cols_by(s='0')
        ).unique_id.collect() == ['0', '0_1', '0_2', '0_3', '0_4']

        assert hl.rename_duplicates(
            mt.key_cols_by(s=hl.literal(['0', '0_1', '0', '0_2', '0'])[mt.col_idx])
        ).unique_id.collect() == ['0', '0_1', '0_2', '0_2_1', '0_3']

        assert hl.rename_duplicates(
            mt.key_cols_by(s=hl.str(mt.col_idx)),
            'foo'
        )['foo'].dtype == hl.tstr
Code example #36
File: test_matrix_table.py Project: tpoterba/hail
    def test_make_table(self):
        mt = hl.utils.range_matrix_table(3, 2)
        mt = mt.select_entries(x=mt.row_idx * mt.col_idx)
        mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))

        t = hl.Table.parallelize(
            [{'row_idx': 0, '0.x': 0, '1.x': 0},
             {'row_idx': 1, '0.x': 0, '1.x': 1},
             {'row_idx': 2, '0.x': 0, '1.x': 2}],
            hl.tstruct(**{'row_idx': hl.tint32, '0.x': hl.tint32, '1.x': hl.tint32}),
            key='row_idx')

        self.assertTrue(mt.make_table()._same(t))
Code example #37
File: test_impex.py Project: lfrancioli/hail
    def test_export_import_plink_same(self):
        mt = get_dataset()
        mt = mt.select_rows(rsid=hl.delimit([mt.locus.contig, hl.str(mt.locus.position), mt.alleles[0], mt.alleles[1]], ':'),
                            cm_position=15.0)
        mt = mt.select_cols(fam_id=hl.null(hl.tstr), pat_id=hl.null(hl.tstr), mat_id=hl.null(hl.tstr),
                            is_female=hl.null(hl.tbool), is_case=hl.null(hl.tbool))
        mt = mt.select_entries('GT')

        bfile = '/tmp/test_import_export_plink'
        hl.export_plink(mt, bfile, ind_id=mt.s, cm_position=mt.cm_position)

        mt_imported = hl.import_plink(bfile + '.bed', bfile + '.bim', bfile + '.fam',
                                      a2_reference=True, reference_genome='GRCh37')
        self.assertTrue(mt._same(mt_imported))
        self.assertTrue(mt.aggregate_rows(hl.agg.all(mt.cm_position == 15.0)))
Code example #38
File: test_impex.py Project: lfrancioli/hail
def generate_random_gen():
    mt = hl.utils.range_matrix_table(30, 10)
    mt = (mt.annotate_rows(locus = hl.locus('20', mt.row_idx + 1),
                           alleles = ['A', 'G'])
          .key_rows_by('locus', 'alleles'))
    mt = (mt.annotate_cols(s = hl.str(mt.col_idx))
          .key_cols_by('s'))
    # using totally random values leads to rounding differences where
    # identical GEN values get rounded differently, leading to
    # differences in the GT call between import_{gen, bgen}
    mt = mt.annotate_entries(a = hl.int32(hl.rand_unif(0.0, 255.0)))
    mt = mt.annotate_entries(b = hl.int32(hl.rand_unif(0.0, 255.0 - mt.a)))
    mt = mt.transmute_entries(GP = hl.array([mt.a, mt.b, 255.0 - mt.a - mt.b]) / 255.0)
    # 20% missing
    mt = mt.filter_entries(hl.rand_bool(0.8))
    hl.export_gen(mt, 'random', precision=4)
Code example #39
File: test_matrix_table.py Project: tpoterba/hail
    def test_joins(self):
        vds = self.get_vds().select_rows(x1=1, y1=1)
        vds2 = vds.select_rows(x2=1, y2=2)
        vds2 = vds2.select_cols(c1=1, c2=2)

        vds = vds.annotate_rows(y2=vds2.index_rows(vds.row_key).y2)
        vds = vds.annotate_cols(c2=vds2.index_cols(vds.s).c2)

        vds = vds.annotate_cols(c2=vds2.index_cols(hl.str(vds.s)).c2)

        rt = vds.rows()
        ct = vds.cols()

        vds.annotate_rows(**rt[vds.locus, vds.alleles])

        self.assertTrue(rt.all(rt.y2 == 2))
        self.assertTrue(ct.all(ct.c2 == 2))
Code example #40
File: test_impex.py Project: lfrancioli/hail
    def test_export_gen_exprs(self):
        gen = hl.import_gen(resource('example.gen'),
                            sample_file=resource('example.sample'),
                            contig_recoding={"01": "1"},
                            reference_genome='GRCh37',
                            min_partitions=3).add_col_index().add_row_index()

        out1 = new_temp_file()
        hl.export_gen(gen, out1, id1=hl.str(gen.col_idx), id2=hl.str(gen.col_idx), missing=0.5,
                      varid=hl.str(gen.row_idx), rsid=hl.str(gen.row_idx), gp=[0.0, 1.0, 0.0])

        in1 = (hl.import_gen(out1 + '.gen', sample_file=out1 + '.sample', min_partitions=3)
               .add_col_index()
               .add_row_index())
        self.assertTrue(in1.aggregate_entries(hl.agg.fraction(in1.GP == [0.0, 1.0, 0.0])) == 1.0)
        self.assertTrue(in1.aggregate_rows(hl.agg.fraction((in1.varid == hl.str(in1.row_idx)) &
                                                           (in1.rsid == hl.str(in1.row_idx)))) == 1.0)
        self.assertTrue(in1.aggregate_cols(hl.agg.fraction((in1.s == hl.str(in1.col_idx)))))
Code example #41
File: plots.py Project: jigold/hail
def _collect_scatter_plot_data(
        x: Tuple[str, NumericExpression],
        y: Tuple[str, NumericExpression],
        fields: Dict[str, Expression] = None,
        n_divisions: int = None,
        missing_label: str = 'NA'
) -> pd.DataFrame:

    expressions = dict()
    if fields is not None:
        expressions.update({k: hail.or_else(v, missing_label) if isinstance(v, StringExpression) else v for k, v in fields.items()})

    if n_divisions is None:
        collect_expr = hail.struct(**dict((k,v) for k,v in (x,y)), **expressions)
        plot_data = [point for point in collect_expr.collect() if point[x[0]] is not None and point[y[0]] is not None]
        source_pd = pd.DataFrame(plot_data)
    else:
        # FIXME: remove the type conversion logic if/when downsample supports continuous values for labels
        # Save all numeric types to cast in DataFrame
        numeric_expr = {k: 'int32' for k,v in expressions.items() if isinstance(v, Int32Expression)}
        numeric_expr.update({k: 'int64' for k,v in expressions.items() if isinstance(v, Int64Expression)})
        numeric_expr.update({k: 'float32' for k, v in expressions.items() if isinstance(v, Float32Expression)})
        numeric_expr.update({k: 'float64' for k, v in expressions.items() if isinstance(v, Float64Expression)})

        # Cast non-string types to string
        expressions = {k: hail.str(v) if not isinstance(v, StringExpression) else v for k,v in expressions.items()}

        agg_f = x[1]._aggregation_method()
        res = agg_f(hail.agg.downsample(x[1], y[1], label=list(expressions.values()) if expressions else None, n_divisions=n_divisions))
        source_pd = pd.DataFrame([
            dict(
                **{x[0]: point[0], y[0]: point[1]},
                **(dict(zip(expressions, point[2])) if point[2] is not None else {})
            ) for point in res
        ])
        source_pd = source_pd.astype(numeric_expr, copy=False)

    return source_pd
Code example #42
File: plots.py Project: jigold/hail
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500, significance_line=5e-8):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.
    significance_line : float, optional
        p-value at which to add a horizontal, dotted red line indicating
        genome-wide significance.  If ``None``, no line is added.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    if locus is None:
        locus = pvals._indices.source.locus

    ref = locus.dtype.reference_genome

    if hover_fields is None:
        hover_fields = {}

    hover_fields['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    source_pd = _collect_scatter_plot_data(
        ('_global_locus', locus.global_position()),
        ('_pval', pvals),
        fields=hover_fields,
        n_divisions=None if collect_all else n_divisions
    )
    source_pd['p_value'] = [10 ** (-p) for p in source_pd['_pval']]
    source_pd['_contig'] = [locus.split(":")[0] for locus in source_pd['locus']]

    observed_contigs = set(source_pd['_contig'])
    observed_contigs = [contig for contig in ref.contigs.copy() if contig in observed_contigs]
    contig_ticks = hail.eval([hail.locus(contig, int(ref.lengths[contig]/2)).global_position() for contig in observed_contigs])
    color_mapper = CategoricalColorMapper(factors=ref.contigs, palette= palette[:2] * int((len(ref.contigs)+1)/2))

    p = figure(title=title, x_axis_label='Chromosome', y_axis_label='P-value (-log10 scale)', width=1000)
    p, _, legend, _, _, _ = _get_scatter_plot_elements(
        p, source_pd, x_col='_global_locus', y_col='_pval',
        label_cols=['_contig'], colors={'_contig': color_mapper},
        size=size
    )
    legend.visible = False
    p.xaxis.ticker = contig_ticks
    p.xaxis.major_label_overrides = dict(zip(contig_ticks, observed_contigs))
    p.select_one(HoverTool).tooltips = [t for t in p.select_one(HoverTool).tooltips if not t[0].startswith('_')]

    if significance_line is not None:
        p.renderers.append(Span(location=-log10(significance_line),
                                dimension='width',
                                line_color='red',
                                line_dash='dashed',
                                line_width=1.5))

    return p
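
A hedged usage sketch, run in the context of the plots module above (the table path and field names are illustrative):

import hail
from bokeh.plotting import output_file, save

gwas = hail.read_table('gwas_results.ht')  # hypothetical table with locus and p_value
p = manhattan(gwas.p_value, locus=gwas.locus, title='GWAS', significance_line=5e-8)
output_file('manhattan.html')
save(p)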
Code example #43
File: ldscore.py Project: bcajes/hail
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+


    Warning
    -------
        :func:`.ld_score` will fail if ``entry_expr`` results in any missing
        values. The special float value ``nan`` is not considered a
        missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----

    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.


    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus
        position.
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation."""

    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source

    if coord_expr is None:
        mt_coord_expr = mt_locus_expr
    else:
        mt_coord_expr = coord_expr._indices.source

    if not annotation_exprs:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr])
    else:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr] +
                        [mt == x._indices.source
                         for x in wrap_to_list(annotation_exprs)])

    if not check_mts:
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
                            (if specified), and annotation_exprs (if
                            specified) must come from same MatrixTable.""")

    n = mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n-1.0) / (n-2.0)) * r2 - (1.0 / (n-2.0))

    starts, stops = hl.linalg.utils.locus_windows(locus_expr,
                                                  radius,
                                                  coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotation_exprs:
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*wrap_to_list(annotation_exprs)).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x),
                           value=hl.float(ht[x]))
                 .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = {i: cols[i] for i in range(len(cols))}

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)

        a = BlockMatrix.read(a_tmp)
        l2 = r2_adj_sparse @ a

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({'f{:}'.format(i): col_idxs[i]
                                  for i in range(len(cols))})

    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht
Code example #44
File: plots.py Project: lfrancioli/hail
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    def get_contig_index(x, starts):
        left = 0
        right = len(starts) - 1
        while left <= right:
            mid = (left + right) // 2
            if x < starts[mid]:
                if x >= starts[mid - 1]:
                    return mid - 1
                right = mid
            elif x >= starts[mid+1]:
                left = mid + 1
            else:
                return mid

    if locus is None:
        locus = pvals._indices.source.locus

    if hover_fields is None:
        hover_fields = {}

    hover_fields['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    if collect_all:
        res = hail.tuple([locus.global_position(), pvals, hail.struct(**hover_fields)]).collect()
        hf_struct = [point[2] for point in res]
        for key in hover_fields:
            hover_fields[key] = [item[key] for item in hf_struct]
    else:
        agg_f = pvals._aggregation_method()
        res = agg_f(aggregators.downsample(locus.global_position(), pvals,
                                           label=hail.array([hail.str(x) for x in hover_fields.values()]),
                                           n_divisions=n_divisions))
        fields = [point[2] for point in res]
        for idx, key in enumerate(list(hover_fields.keys())):
            hover_fields[key] = [field[idx] for field in fields]

    x = [point[0] for point in res]
    y = [point[1] for point in res]
    y_linear = [10 ** (-p) for p in y]
    hover_fields['p_value'] = y_linear

    ref = locus.dtype.reference_genome

    total_pos = 0
    start_points = []
    for i in range(0, len(ref.contigs)):
        start_points.append(total_pos)
        total_pos += ref.lengths.get(ref.contigs[i])
    start_points.append(total_pos)  # end point of all contigs

    observed_contigs = set()
    label = []
    for element in x:
        contig_index = get_contig_index(element, start_points)
        label.append(str(contig_index % 2))
        observed_contigs.add(ref.contigs[contig_index])

    labels = ref.contigs.copy()
    num_deleted = 0
    mid_points = []
    for i in range(0, len(ref.contigs)):
        if ref.contigs[i] in observed_contigs:
            length = ref.lengths.get(ref.contigs[i])
            mid = start_points[i] + length / 2
            if mid % 1 == 0:
                mid += 0.5
            mid_points.append(mid)
        else:
            del labels[i - num_deleted]
            num_deleted += 1

    p = scatter(x, y, label=label, title=title, xlabel='Chromosome', ylabel='P-value (-log10 scale)',
                size=size, legend=False, source_fields=hover_fields)

    p.xaxis.ticker = mid_points
    p.xaxis.major_label_overrides = dict(zip(mid_points, labels))
    p.width = 1000

    tooltips = [(key, "@{}".format(key)) for key in hover_fields]
    p.add_tools(HoverTool(
        tooltips=tooltips
    ))

    return p
Code example #45
File: plots.py Project: jigold/hail
def histogram2d(x, y, bins=40, range=None,
                 title=None, width=600, height=600, font_size='7pt',
                 colors=bokeh.palettes.all_palettes['Blues'][7][::-1]):
    """Plot a two-dimensional histogram.

    ``x`` and ``y`` must both be a :class:`NumericExpression` from the same :class:`Table`.

    If ``x_range`` or ``y_range`` are not provided, the function will do a pass through the data to determine
    min and max of each variable.

    Examples
    --------

    >>> ht = hail.utils.range_table(1000).annotate(x=hail.rand_norm(), y=hail.rand_norm())
    >>> p_hist = hail.plot.histogram2d(ht.x, ht.y)

    >>> ht = hail.utils.range_table(1000).annotate(x=hail.rand_norm(), y=hail.rand_norm())
    >>> p_hist = hail.plot.histogram2d(ht.x, ht.y, bins=10, range=((0, 1), None))

    Parameters
    ----------
    x : :class:`.NumericExpression`
        Expression for x-axis (from a Hail table).
    y : :class:`.NumericExpression`
        Expression for y-axis (from the same Hail table as ``x``).
    bins : int or [int, int]
        The bin specification:
        -   If int, the number of bins for the two dimensions (nx = ny = bins).
        -   If [int, int], the number of bins in each dimension (nx, ny = bins).
        The default value is 40.
    range : None or ((float, float), (float, float))
        The leftmost and rightmost edges of the bins along each dimension:
        ((xmin, xmax), (ymin, ymax)). All values outside of this range will be considered outliers
        and not tallied in the histogram. If this value is None, or either of the inner lists is None,
        the range will be computed from the data.
    width : int
        Plot width (default 600px).
    height : int
        Plot height (default 600px).
    title : str
        Title of the plot.
    font_size : str
        String of font size in points (default '7pt').
    colors : List[str]
        List of colors (hex codes, or strings as described
        `here <https://bokeh.pydata.org/en/latest/docs/reference/colors.html>`__). Compatible with one of the many
        built-in palettes available `here <https://bokeh.pydata.org/en/latest/docs/reference/palettes.html>`__.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    source = x._indices.source
    y_source = y._indices.source

    if source is None or y_source is None:
        raise ValueError("histogram_2d expects two expressions of 'Table', found scalar expression")
    if isinstance(source, hail.MatrixTable):
        raise ValueError("histogram_2d requires source to be Table, not MatrixTable")
    if source != y_source:
        raise ValueError(f"histogram_2d expects two expressions from the same 'Table', found {source} and {y_source}")
    check_row_indexed('histogram_2d', x)
    check_row_indexed('histogram_2d', y)
    if isinstance(bins, int):
        x_bins = y_bins = bins
    else:
        x_bins, y_bins = bins
    if range is None:
        x_range = y_range = None
    else:
        x_range, y_range = range
    if x_range is None or y_range is None:
        warnings.warn('At least one range was not defined in histogram_2d. Doing two passes...')
        ranges = source.aggregate(hail.struct(x_stats=hail.agg.stats(x),
                                              y_stats=hail.agg.stats(y)))
        if x_range is None:
            x_range = (ranges.x_stats.min, ranges.x_stats.max)
        if y_range is None:
            y_range = (ranges.y_stats.min, ranges.y_stats.max)
    else:
        warnings.warn('If x_range or y_range are specified in histogram_2d, and there are points '
                      'outside of these ranges, they will not be plotted')
    x_range = list(map(float, x_range))
    y_range = list(map(float, y_range))
    x_spacing = (x_range[1] - x_range[0]) / x_bins
    y_spacing = (y_range[1] - y_range[0]) / y_bins

    def frange(start, stop, step):
        from itertools import count, takewhile
        return takewhile(lambda x: x <= stop, count(start, step))

    x_levels = hail.literal(list(frange(x_range[0], x_range[1], x_spacing))[::-1])
    y_levels = hail.literal(list(frange(y_range[0], y_range[1], y_spacing))[::-1])

    grouped_ht = source.group_by(
        x=hail.str(x_levels.find(lambda w: x >= w)),
        y=hail.str(y_levels.find(lambda w: y >= w))
    ).aggregate(c=hail.agg.count())
    data = grouped_ht.filter(hail.is_defined(grouped_ht.x) & (grouped_ht.x != str(x_range[1])) &
                             hail.is_defined(grouped_ht.y) & (grouped_ht.y != str(y_range[1]))).to_pandas()

    mapper = LinearColorMapper(palette=colors, low=data.c.min(), high=data.c.max())

    x_axis = sorted(set(data.x), key=lambda z: float(z))
    y_axis = sorted(set(data.y), key=lambda z: float(z))
    p = figure(title=title,
               x_range=x_axis, y_range=y_axis,
               x_axis_location="above", plot_width=width, plot_height=height,
               tools="hover,save,pan,box_zoom,reset,wheel_zoom", toolbar_location='below')

    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    p.axis.major_label_text_font_size = font_size
    import math
    p.xaxis.major_label_orientation = math.pi / 3

    p.rect(x='x', y='y', width=1, height=1,
           source=data,
           fill_color={'field': 'c', 'transform': mapper},
           line_color=None)

    color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size=font_size,
                         ticker=BasicTicker(desired_num_ticks=6),
                         label_standoff=6, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')

    def set_font_size(p, font_size: str = '12pt'):
        """Set most of the font sizes in a bokeh figure

        Parameters
        ----------
        p : :class:`bokeh.plotting.figure.Figure`
            Input figure.
        font_size : str
            String of font size in points (e.g. '12pt').

        Returns
        -------
        :class:`bokeh.plotting.figure.Figure`
        """
        p.legend.label_text_font_size = font_size
        p.xaxis.axis_label_text_font_size = font_size
        p.yaxis.axis_label_text_font_size = font_size
        p.xaxis.major_label_text_font_size = font_size
        p.yaxis.major_label_text_font_size = font_size
        if hasattr(p.title, 'text_font_size'):
            p.title.text_font_size = font_size
        if hasattr(p.xaxis, 'group_text_font_size'):
            p.xaxis.group_text_font_size = font_size
        return p

    p.select_one(HoverTool).tooltips = [('x', '@x'), ('y', '@y',), ('count', '@c')]
    p = set_font_size(p, font_size)
    return p