コード例 #1
0
def load_cmg(cmg_csv: str) -> hl.Table:
    """
    Import a CSV of variant pairs and key it by their lifted-over coordinates.

    The CSV must provide ``chrom_N``, ``pos_N``, ``ref_N`` and ``alt_N``
    columns for N in (1, 2). Both variants are first built as GRCh38 loci
    (``chr`` is prepended to the chromosome), lifted over with
    ``liftover_expr`` and ordered by lifted locus, so ``locus1``/``alleles1``
    always hold the variant that sorts first.

    :param cmg_csv: Path of the comma-delimited input file.
    :return: Table keyed by ``locus1``, ``alleles1``, ``locus2``, ``alleles2``
        with a boolean ``bad_liftover`` annotation.
    """
    def _b38_locus(chrom, pos):
        # The input chromosome has no "chr" prefix; GRCh38 contigs need it.
        return hl.locus("chr" + hl.str(chrom), pos, reference_genome='GRCh38')

    table = hl.import_table(cmg_csv, impute=True, delimiter=",", quote='"')
    table = table.transmute(
        locus1_b38=_b38_locus(table.chrom_1, table.pos_1),
        alleles1_b38=[table.ref_1, table.alt_1],
        locus2_b38=_b38_locus(table.chrom_2, table.pos_2),
        alleles2_b38=[table.ref_2, table.alt_2],
    )

    # get_liftover_genome expects a `locus` field; the second element of its
    # result is what liftover_expr takes as its destination argument.
    references = get_liftover_genome(table.rename({'locus1_b38': 'locus'}))
    destination = references[1]
    lifted_pair = hl.array([
        liftover_expr(table.locus1_b38, table.alleles1_b38, destination),
        liftover_expr(table.locus2_b38, table.alleles2_b38, destination),
    ])
    ordered = hl.sorted(lifted_pair, lambda variant: variant.locus)
    first = ordered[0]
    second = ordered[1]

    table = table.key_by(
        locus1=first.locus,
        alleles1=first.alleles,
        locus2=second.locus,
        alleles2=second.alleles,
    )

    # A liftover is flagged when either locus is missing or when the reference
    # sequence at the lifted locus differs from the first base of the ref allele.
    bad_liftover = (
        hl.is_missing(table.locus1) |
        hl.is_missing(table.locus2) |
        (table.locus1.sequence_context() != table.alleles1[0][0]) |
        (table.locus2.sequence_context() != table.alleles2[0][0])
    )
    return table.annotate(bad_liftover=bad_liftover)
コード例 #2
0
def create_gene_map_ht(ht, check_gene_contigs=False):
    """
    Build a per-gene table of variant intervals and variants grouped by annotation.

    The input is run through ``process_consequences``, exploded on
    ``vep.worst_csq_by_gene_canonical`` and grouped by gene id and symbol.

    :param ht: Table with ``locus``, ``alleles`` and ``vep`` fields.
    :param check_gene_contigs: If True, assert that every gene's variants lie
        on a single contig before aggregating.
    :return: Table keyed by ``gene_id`` and ``gene_symbol`` with ``interval``
        (from the minimum to the maximum variant position) and ``variants``
        (variant ids collected per annotation).
    """
    from gnomad.utils.vep import process_consequences

    ht = process_consequences(ht)
    ht = ht.explode(ht.vep.worst_csq_by_gene_canonical)

    # Variant id has the form contig:position_ref/alt.
    variant_id = (ht.locus.contig + ':' + hl.str(ht.locus.position) + '_' +
                  ht.alleles[0] + '/' + ht.alleles[1])
    ht = ht.annotate(
        variant_id=variant_id,
        annotation=annotation_case_builder(ht.vep.worst_csq_by_gene_canonical))

    csq = ht.vep.worst_csq_by_gene_canonical
    gene_key = dict(gene_id=csq.gene_id, gene_symbol=csq.gene_symbol)

    if check_gene_contigs:
        gene_contigs = ht.group_by(**gene_key).aggregate(
            contigs=hl.agg.collect_as_set(ht.locus.contig))
        assert gene_contigs.all(hl.len(gene_contigs.contigs) == 1)

    # The contig is taken from an arbitrary row of the group for both ends.
    interval_start = hl.locus(hl.agg.take(ht.locus.contig, 1)[0],
                              hl.agg.min(ht.locus.position))
    interval_end = hl.locus(hl.agg.take(ht.locus.contig, 1)[0],
                            hl.agg.max(ht.locus.position))
    grouped = ht.group_by(**gene_key).partition_hint(100)
    return grouped.aggregate(
        interval=hl.interval(start=interval_start, end=interval_end),
        variants=hl.agg.group_by(ht.annotation, hl.agg.collect(ht.variant_id)),
    )
コード例 #3
0
def intersect_target_ref(ref_mt_filt,
                         snp_list,
                         grch37_or_grch38,
                         intersect_out,
                         overwrite: bool = False):
    """
    Restrict a reference MatrixTable to the variants in a SNP list and checkpoint it.

    :param ref_mt_filt: Path of the MatrixTable to read and filter.
    :param snp_list: Table with ``chr``, ``pos``, ``ref`` and ``alt`` fields.
    :param grch37_or_grch38: Build of ``snp_list`` (case-insensitive). GRCh37
        lists are lifted over to GRCh38 before matching; any other value
        leaves the MatrixTable unfiltered.
    :param intersect_out: Path the repartitioned result is checkpointed to.
    :param overwrite: Overwrite an existing checkpoint; when False an existing
        checkpoint is read instead.
    """
    def _key_by_variant(ht, reference_genome):
        # Key the SNP list by (locus, alleles) on the given build.
        return ht.key_by(locus=hl.locus(hl.str(ht.chr),
                                        hl.int(ht.pos),
                                        reference_genome=reference_genome),
                         alleles=[ht.ref, ht.alt])

    mt = hl.read_matrix_table(ref_mt_filt)
    build = grch37_or_grch38.lower()

    if build == 'grch38':
        keyed = _key_by_variant(snp_list, 'GRCh38')
        mt = mt.filter_rows(hl.is_defined(keyed[mt.row_key]))
    elif build == 'grch37':
        keyed = _key_by_variant(snp_list, 'GRCh37')
        # Called before hl.liftover is used; the returned references themselves
        # are not needed here.
        _rg37, _rg38 = load_liftover()

        lifted = keyed.annotate(new_locus=hl.liftover(keyed.locus, 'GRCh38'))
        # Drop SNPs without a GRCh38 position, then re-key on the new locus.
        lifted = lifted.filter(hl.is_defined(lifted.new_locus))
        lifted = lifted.key_by(locus=lifted.new_locus, alleles=lifted.alleles)
        mt = mt.filter_rows(hl.is_defined(lifted[mt.row_key]))

    mt.repartition(5000).checkpoint(intersect_out,
                                    overwrite=overwrite,
                                    _read_if_exists=not overwrite)
コード例 #4
0
def import_key(ss_filename, ss_keys, clump_name):
    """
    Import summary statistics and flag the loci that appear in a clump file.

    :param ss_filename: Path of the whitespace-delimited summary statistics.
    :param ss_keys: Comma-separated column names; by position they name the
        chromosome, position, first allele and second allele columns. The
        last name is handed back unchanged as the second return value.
    :param clump_name: Path of the whitespace-delimited clump file; it must
        provide ``CHR``, ``BP`` and ``P`` columns.
    :return: Tuple ``(ss, p)`` where ``ss`` is keyed by ``locus`` and
        ``alleles``, restricted to chromosomes 1-22 and annotated with a
        boolean ``clump`` field, and ``p`` is the last entry of ``ss_keys``.
    """
    keys = ss_keys.split(',')
    # Raw strings: '\s' is an invalid escape sequence in a normal literal.
    ss = hl.import_table(ss_filename,
                         impute=True,
                         delimiter=r'\s+',
                         types={
                             keys[1]: hl.tfloat,
                             keys[0]: hl.tstr
                         },
                         min_partitions=100)
    clump = hl.import_table(clump_name,
                            delimiter=r'\s+',
                            min_partitions=10,
                            types={
                                'P': hl.tfloat,
                                'CHR': hl.tstr,
                                'BP': hl.tint
                            })
    clump = clump.key_by(locus=hl.locus(clump.CHR, clump.BP))
    # Only clump rows below the 5e-8 threshold count.
    clump = clump.filter(clump.P < 5e-8)
    # The position column is imported as float and converted to int here.
    ss = ss.annotate(**{keys[1]: hl.int(ss[keys[1]])})
    chroms = set(map(str, range(1, 23)))
    ss = ss.filter(hl.literal(chroms).contains(ss[keys[0]]))
    ss = ss.annotate(locus=hl.locus(hl.str(ss[keys[0]]), ss[keys[1]]),
                     alleles=[ss[keys[2]], ss[keys[3]]])
    # Key by locus alone first so the lookup matches the clump table's key.
    ss = ss.key_by(ss.locus)
    ss = ss.annotate(clump=hl.is_defined(clump[ss.key]))
    ss = ss.key_by(ss.locus, ss.alleles)
    p = keys[-1]
    return ss, p
コード例 #5
0
    def test_haploid(self):
        """Entries imported from ``haploid.vcf`` match the expected GT/AD/GQ values."""
        def entry(position, sample, gt, ad, gq):
            # All expected rows sit on contig X.
            return hl.struct(locus=hl.locus("X", position),
                             s=sample,
                             GT=gt,
                             AD=ad,
                             GQ=gq)

        expected = hl.Table.parallelize(
            [
                entry(16050036, "C1046::HG02024", hl.call(0, 0), [10, 0], 44),
                entry(16050036, "C1046::HG02025", hl.call(1), [0, 6], 70),
                entry(16061250, "C1046::HG02024", hl.call(2, 2), [0, 0, 11], 33),
                entry(16061250, "C1046::HG02025", hl.call(2), [0, 0, 9], 24),
            ],
            key=['locus', 's'])

        imported = hl.import_vcf(resource('haploid.vcf'))
        observed = (imported.entries()
                    .key_by('locus', 's')
                    .select('GT', 'AD', 'GQ'))
        self.assertTrue(observed._same(expected))
コード例 #6
0
ファイル: test_reference_genome.py プロジェクト: zscu/hail
    def test_liftover_strand(self):
        """Check ``hl.liftover`` strand reporting for a locus and an interval.

        Also checks that lifting over the interval parsed from
        ``1:10000-10000`` raises ``FatalError``.
        """
        grch37 = hl.get_reference('GRCh37')
        grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'),
                            'GRCh38')

        # The liftover is registered on the reference object, so remove it
        # even when an assertion fails to avoid leaking it beyond this test.
        try:
            self.assertEqual(
                hl.eval(
                    hl.liftover(hl.locus('20', 60001, 'GRCh37'),
                                'GRCh38',
                                include_strand=True)),
                hl.eval(
                    hl.struct(result=hl.locus('chr20', 79360, 'GRCh38'),
                              is_negative_strand=False)))

            self.assertEqual(
                hl.eval(
                    hl.liftover(hl.locus_interval('20', 37007582, 37007586,
                                                  True, True, 'GRCh37'),
                                'GRCh38',
                                include_strand=True)),
                hl.eval(
                    hl.struct(result=hl.locus_interval('chr12', 32563117,
                                                       32563121, True, True,
                                                       'GRCh38'),
                              is_negative_strand=True)))

            with self.assertRaises(FatalError):
                hl.eval(
                    hl.liftover(
                        hl.parse_locus_interval('1:10000-10000',
                                                reference_genome='GRCh37'),
                        'GRCh38'))
        finally:
            grch37.remove_liftover("GRCh38")
コード例 #7
0
    def test_call_fields(self):
        """``call_fields`` makes ``import_vcf`` parse GT, GTA and GTZ as calls."""
        no_call = hl.null(hl.tcall)

        def entry(position, sample, gt, gta, gtz):
            # All expected rows sit on contig X.
            return hl.struct(locus=hl.locus("X", position),
                             s=sample,
                             GT=gt,
                             GTA=gta,
                             GTZ=gtz)

        expected = hl.Table.parallelize(
            [
                entry(16050036, "C1046::HG02024", hl.call(0, 0), no_call, hl.call(0, 1)),
                entry(16050036, "C1046::HG02025", hl.call(1), no_call, hl.call(0)),
                entry(16061250, "C1046::HG02024", hl.call(2, 2), hl.call(2, 1), hl.call(1, 1)),
                entry(16061250, "C1046::HG02025", hl.call(2), no_call, hl.call(1)),
            ],
            key=['locus', 's'])

        call_fields = ['GT', 'GTA', 'GTZ']
        imported = hl.import_vcf(resource('generic.vcf'),
                                 call_fields=call_fields)
        observed = (imported.entries()
                    .key_by('locus', 's')
                    .select('GT', 'GTA', 'GTZ'))
        self.assertTrue(observed._same(expected))
コード例 #8
0
ファイル: test_reference_genome.py プロジェクト: bcajes/hail
    def test_liftover_strand(self):
        """Check ``hl.liftover`` strand reporting for a locus and an interval."""
        grch37 = hl.get_reference('GRCh37')
        grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')

        # The liftover is registered on the reference object, so remove it
        # even when an assertion fails to avoid leaking it beyond this test.
        try:
            self.assertEqual(hl.eval(hl.liftover(hl.locus('20', 60001, 'GRCh37'), 'GRCh38', include_strand=True)),
                             hl.eval(hl.struct(result=hl.locus('chr20', 79360, 'GRCh38'), is_negative_strand=False)))

            self.assertEqual(hl.eval(hl.liftover(hl.locus_interval('20', 37007582, 37007586, True, True, 'GRCh37'),
                                                 'GRCh38', include_strand=True)),
                             hl.eval(hl.struct(result=hl.locus_interval('chr12', 32563117, 32563121, True, True, 'GRCh38'),
                                               is_negative_strand=True)))
        finally:
            grch37.remove_liftover("GRCh38")
コード例 #9
0
 def test_uniqueness(self):
     """Annotating from a unique and a non-unique dataset yields the expected field types."""
     db = hl.experimental.DB(config=AnnotationDBTests.db_json)
     t = hl.utils.range_table(10)
     t = t.annotate(locus=hl.locus('1', t.idx + 1))
     t = db.annotate_rows_db(t, 'unique_dataset', 'nonunique_dataset')
     # These comparisons used to be bare expressions whose results were
     # discarded, so nothing was actually checked.
     # NOTE(review): the expected dtypes were never enforced before; if an
     # assertion fails, confirm the expectation against annotate_rows_db.
     assert t.unique_dataset.dtype == hl.tstruct(annotation=hl.tstr)
     assert t.nonunique_dataset.dtype == hl.tstruct(annotation=hl.tarray(hl.tstr))
コード例 #10
0
def main(args):
    """
    Load a single summary-statistics file and/or write the combined MatrixTable.

    With ``args.load_single`` the input file is imported, keyed by locus and
    alleles, annotated from ``get_vep_formatted_data`` and checkpointed as a
    Table, then pivoted into a MatrixTable with ``phenocode`` as column key.
    With ``args.combine_all`` the MatrixTable at ``temp_mt_path`` is coalesced
    and written to ``args.output_mt``.

    :param args: Parsed command-line arguments.
    """
    init_kwargs = {}
    if args.n_threads is not None:
        init_kwargs['master'] = f'local[{args.n_threads}]'
    hl.init(default_reference='GRCh38', log='/load_finngen.log', **init_kwargs)

    if args.load_single:
        ht = hl.import_table(args.input_file,
                             impute=True,
                             force_bgz=True,
                             min_partitions=100)
        ht = ht.rename({'#chrom': 'chrom'})
        ht = ht.transmute(locus=hl.locus('chr' + ht.chrom, ht.pos),
                          alleles=[ht.ref, ht.alt])
        ht = ht.key_by('locus', 'alleles')
        ht = ht.transmute(Pvalue=ht.pval)

        # The same extra fields are stored as globals on the checkpointed
        # Table and later as row fields for the MatrixTable pivot.
        extra_fields = json.loads(args.additional_dict)
        ht = ht.annotate_globals(**extra_fields)

        vep_ht = get_vep_formatted_data(args.vep_path)
        ht = ht.annotate(**vep_ht[ht.key])
        ht = ht.checkpoint(args.output_ht,
                           overwrite=args.overwrite,
                           _read_if_exists=not args.overwrite)

        ht = ht.select_globals().annotate(**extra_fields)
        row_key = ['locus', 'alleles']
        col_key = ['phenocode']
        row_fields = ['rsids', 'nearest_genes', 'gene', 'annotation']
        col_fields = ['category', 'name', 'n_cases', 'n_controls']
        mt = ht.to_matrix_table(row_key, col_key, row_fields, col_fields)
        mt.checkpoint(args.output_mt,
                      overwrite=args.overwrite,
                      _read_if_exists=not args.overwrite)

    if args.combine_all:
        # Disabled step that previously produced temp_mt_path, kept for reference:
        # all_hts = list(filter(lambda y: y.endswith('.ht'), map(lambda x: x['path'], hl.hadoop_ls(args.input_directory))))
        # print(f'Got {len(all_hts)} HTs...')
        # mt = mwzj_hts_by_tree(all_hts, temp_bucket + '/finngen', ['phenocode'], debug=True)
        # mt.checkpoint(temp_mt_path, overwrite=args.overwrite, _read_if_exists=not args.overwrite)
        combined = hl.read_matrix_table(temp_mt_path)
        combined.naive_coalesce(5000).write(args.output_mt, args.overwrite)
コード例 #11
0
def main(args):
    """
    Export one VCF per genotyping chip, restricted to that chip's lifted-over loci.

    ``args.chip_loci`` is a whitespace-delimited file whose lines hold a chip
    name followed by the path of that chip's position table (``chr`` and
    ``pos`` columns). Positions are lifted over to GRCh38, and the full
    MatrixTable is filtered to the loci that lift over successfully.

    :param args: Parsed command-line arguments; ``allreads_prefix`` and
        ``chip_loci`` are read.
    """
    full_vcf = hl.read_matrix_table(args.allreads_prefix + '.mt')

    # liftover chains
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)

    # Read the chip list up front so the handle is closed before the
    # long-running exports start.
    with hl.hadoop_open(args.chip_loci) as chips:
        chip_specs = [line.strip().split() for line in chips]

    # Loop-invariant: autosomes plus X and Y.
    valid_contigs = hl.array(list(map(str, range(1, 23))) + ['X', 'Y'])

    for chip in chip_specs:
        chip_pos = hl.import_table(chip[1],
                                   filter=r'\[Controls\]',
                                   skip_blank_lines=True)
        chip_pos = chip_pos.filter(valid_contigs.contains(chip_pos.chr))
        # NOTE(review): hl.locus uses the default reference here; the liftover
        # below presumes that is GRCh37 — confirm against hl.init.
        chip_pos = chip_pos.key_by(
            locus=hl.locus(chip_pos.chr, hl.int(chip_pos.pos)))

        # liftover chip position info
        chip_pos = chip_pos.annotate(
            new_locus=hl.liftover(chip_pos.locus, 'GRCh38'))
        chip_pos = chip_pos.filter(hl.is_defined(chip_pos.new_locus))
        chip_pos = chip_pos.key_by(locus=chip_pos.new_locus)

        # filter full vcf to sites in genotype data
        geno_vcf = full_vcf.filter_rows(hl.is_defined(
            chip_pos[full_vcf.locus]))
        hl.export_vcf(
            geno_vcf,
            'gs://neurogap/high_coverage/NeuroGap_30x_' + chip[0] + '.vcf.bgz')
コード例 #12
0
def import_cadd_table(path: str, genome_version: str, partitions) -> hl.Table:
    """
    Import a headerless, block-gzipped CADD score file as a Table keyed by variant.

    :param path: Path of the file to import; lines starting with ``#`` are
        treated as comments.
    :param genome_version: Either ``"37"`` or ``"38"``; selects the reference
        genome and, for ``"38"``, prefixes contigs with ``chr``.
    :param partitions: Passed to the import as ``min_partitions``.
    :return: Table keyed by ``locus`` and ``alleles`` with ``RawScore`` and
        ``PHRED`` fields.
    :raises ValueError: If ``genome_version`` is neither ``"37"`` nor ``"38"``.
    """
    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome version: {genome_version}")

    # The file has no header, so columns arrive as f0..f5 and are renamed below.
    column_names = {'f0': 'chrom', 'f1': 'pos', 'f2': 'ref', 'f3': 'alt', 'f4': 'RawScore', 'f5': 'PHRED'}
    types = {'f0': hl.tstr, 'f1': hl.tint, 'f4': hl.tfloat32, 'f5': hl.tfloat32}

    cadd_ht = import_table(path, force_bgz=True, comment="#", no_header=True, types=types, min_partitions=partitions)
    cadd_ht = cadd_ht.rename(column_names)

    # GRCh38 contigs carry a "chr" prefix that the input chromosome lacks.
    chrom = hl.format("chr%s", cadd_ht.chrom) if genome_version == "38" else cadd_ht.chrom
    locus = hl.locus(chrom, cadd_ht.pos, reference_genome=hl.get_reference(f"GRCh{genome_version}"))
    alleles = hl.array([cadd_ht.ref, cadd_ht.alt])
    cadd_ht = cadd_ht.transmute(locus=locus, alleles=alleles)

    # Start from an empty table with the same schema and append the rows in two
    # contig groups: 1-9 first, then 10-22, X, Y and MT.
    # NOTE(review): presumably this works around the row order of the input
    # file — confirm why a single pass is not used.
    cadd_union_ht = cadd_ht.head(0)
    for contigs in (range(1, 10), list(range(10, 23)) + ["X", "Y", "MT"]):
        contigs = ["chr%s" % contig for contig in contigs] if genome_version == "38" else contigs
        cadd_ht_subset = cadd_ht.filter(hl.array(list(map(str, contigs))).contains(cadd_ht.locus.contig))
        cadd_union_ht = cadd_union_ht.union(cadd_ht_subset)

    cadd_union_ht = cadd_union_ht.key_by("locus", "alleles")

    # Prints the resulting schema.
    cadd_union_ht.describe()

    return cadd_union_ht
コード例 #13
0
 def setupAnnotationDBTests(cls):
     """Write a small Table to a temporary directory and build the test DB config.

     Both configured datasets point at the same written table; they differ in
     description, URL and ``key_properties``. The temporary directory and the
     config are stored on ``cls`` as ``temp_dir`` and ``db_json``.
     """
     startTestHailContext()
     table = hl.utils.range_table(10)
     table = table.annotate(locus=hl.locus('1', table.idx + 1),
                            annotation=hl.str(table.idx))
     temp_dir = tempfile.TemporaryDirectory()
     table_path = temp_dir.name + '/f.mt'
     table.write(table_path)
     # Keep a reference on the class so the directory object stays alive.
     cls.temp_dir = temp_dir

     def dataset(description, url, key_properties):
         return {
             'description': description,
             'url': url,
             'key_properties': key_properties,
             'versions': [{
                 'url': table_path,
                 'version': 'v1-GRCh37'
             }]
         }

     cls.db_json = {
         'unique_dataset': dataset('now with unique rows!',
                                   'https://example.com',
                                   ['unique']),
         'nonunique_dataset': dataset('non-unique rows :(',
                                      'https://example.net',
                                      []),
     }
コード例 #14
0
def specific_clumps(filename):
    """
    Import a whitespace-delimited clump file keyed by locus.

    :param filename: Path of the clump file; it must provide ``CHR`` and
        ``BP`` columns. ``P`` is imported as a float.
    :return: Table keyed by ``locus``.
    """
    # Raw string: '\s' is an invalid escape sequence in a normal literal.
    clump = hl.import_table(filename,
                            delimiter=r'\s+',
                            min_partitions=10,
                            types={'P': hl.tfloat})
    clump = clump.key_by(locus=hl.locus(hl.str(clump.CHR), hl.int(clump.BP)))
    return clump
コード例 #15
0
def annotate_variants_with_mnvs(variants_path, mnvs_path):
    """
    Attach multi-nucleotide variant (MNV) information to a variants table.

    Each MNV is exploded into its constituent SNVs, which are keyed as GRCh37
    loci and grouped so that every SNV carries the list of MNVs it belongs to.
    Variants with at least one MNV additionally get the ``"mnv"`` flag.

    :param variants_path: Path of the variants Table; ``flags`` and
        ``variant_id`` fields are read.
    :param mnvs_path: Path of the MNV Table.
    :return: Variants Table with a ``multi_nucleotide_variants`` field and
        updated ``flags``.
    """
    mnvs = hl.read_table(mnvs_path)
    mnvs = mnvs.select(
        "changes_amino_acids_for_snvs",
        "constituent_snvs",
        "constituent_snv_ids",
        "n_individuals",
    )

    # One row per constituent SNV; the exploded element is named `snv`.
    mnvs = mnvs.explode(mnvs.constituent_snvs, "snv")
    snv = mnvs.snv
    mnvs = mnvs.annotate(
        locus=hl.locus(snv.chrom, snv.pos, reference_genome="GRCh37"),
        alleles=[snv.ref, snv.alt],
    )
    mnvs_by_snv = mnvs.group_by(mnvs.locus, mnvs.alleles).aggregate(
        multi_nucleotide_variants=hl.agg.collect(mnvs.row.drop("snv"))
    )

    variants = hl.read_table(variants_path)
    variants = variants.annotate(
        multi_nucleotide_variants=mnvs_by_snv[variants.key].multi_nucleotide_variants
    )

    def describe_mnv(mnv):
        # Reshape an MNV record from the point of view of the current variant.
        return mnv.select(
            combined_variant_id=mnv.variant_id,
            changes_amino_acids=mnv.changes_amino_acids_for_snvs.contains(variants.variant_id),
            n_individuals=mnv.n_individuals,
            other_constituent_snvs=mnv.constituent_snv_ids.filter(lambda snv_id: snv_id != variants.variant_id),
        )

    has_mnv = hl.len(variants.multi_nucleotide_variants) > 0
    # missing_false=True: variants without any MNV keep their flags unchanged.
    flags = hl.if_else(
        has_mnv,
        variants.flags.add("mnv"),
        variants.flags,
        missing_false=True,
    )
    return variants.annotate(
        flags=flags,
        multi_nucleotide_variants=variants.multi_nucleotide_variants.map(describe_mnv),
    )
コード例 #16
0
ファイル: sparse_mt.py プロジェクト: enriquea/gnomad_hail
    def get_contig_size(contig: str) -> int:
        """
        Count the bases of ``contig`` that remain after all filters.

        Bases whose reference sequence is ``N`` are dropped. On X and Y contigs
        only non-PAR positions are kept. The enclosing scope's
        ``included_calling_intervals`` / ``excluded_calling_intervals`` are
        applied when they are not None.

        :param contig: Name of a contig in the enclosing scope's ``ref``.
        :return: Number of remaining positions.
        """
        logger.info(f"Working on {contig}")
        contig_length = ref.contig_length(contig)
        contig_ht = hl.utils.range_table(
            contig_length,
            # Roughly one partition per 500 kb; contigs shorter than that
            # would otherwise request zero partitions.
            n_partitions=max(1, contig_length // 500_000),
        )
        # idx is 0-based while locus positions are 1-based.
        contig_ht = contig_ht.annotate(
            locus=hl.locus(contig=contig, pos=contig_ht.idx + 1, reference_genome=ref)
        )
        contig_ht = contig_ht.filter(contig_ht.locus.sequence_context().lower() != "n")

        if contig in ref.x_contigs:
            contig_ht = contig_ht.filter(contig_ht.locus.in_x_nonpar())
        if contig in ref.y_contigs:
            contig_ht = contig_ht.filter(contig_ht.locus.in_y_nonpar())

        contig_ht = contig_ht.key_by("locus")
        if included_calling_intervals is not None:
            contig_ht = contig_ht.filter(
                hl.is_defined(included_calling_intervals[contig_ht.key])
            )
        if excluded_calling_intervals is not None:
            contig_ht = contig_ht.filter(
                hl.is_missing(excluded_calling_intervals[contig_ht.key])
            )
        contig_size = contig_ht.count()
        logger.info(f"Contig {contig} has {contig_size} bases for coverage.")
        return contig_size
コード例 #17
0
def rekey_new_reference(
        t: Union[hl.Table, hl.MatrixTable],
        reference: hl.ReferenceGenome) -> Union[hl.Table, hl.MatrixTable]:
    """
    Rebuild the ``locus`` key of a Table or MatrixTable on another reference genome.

    The contig and position of the existing locus are reused unchanged; only
    the reference genome attached to the locus differs.

    :param t: Input Table/MatrixTable with ``locus`` and ``alleles`` fields.
    :param reference: Reference genome for the new ``locus``.
    :return: Input re-keyed by the new ``locus`` and ``alleles``.
    """
    renamed = t.rename({"locus": "locus_original"})
    original = renamed.locus_original
    new_locus = hl.locus(
        original.contig,
        original.position,
        reference_genome=reference,
    )

    if not isinstance(renamed, hl.MatrixTable):
        rekeyed = renamed.annotate(locus=new_locus).key_by("locus", "alleles")
        return rekeyed.drop("locus_original")

    # MatrixTables need the row-specific annotate/key methods.
    rekeyed = renamed.annotate_rows(locus=new_locus).key_rows_by("locus", "alleles")
    return rekeyed.drop("locus_original")
コード例 #18
0
def compute_prs_mt(genotype_mt_path, prs_mt_path):
    """
    Score every sample with clumped meta-analysis betas and write the scores.

    The beta of a variant is its ``beta_meta`` from the summary statistics
    when the variant is present in the clump results and 0 otherwise; a
    sample's score is the sum of ``beta * dosage`` over variants. The column
    table is written to ``prs_all_samples.ht`` in the scratch directory.

    :param genotype_mt_path: Not used by the current body.
    :param prs_mt_path: Not used by the current body.
    """
    scratch_dir = 'gs://ukbb-diverse-temp-30day/nb-scratch'

    clumped = hl.read_table(
        'gs://ukb-diverse-pops/ld_prune/results_high_quality/not_AMR/phecode-250.2-both_sexes/clump_results.ht/'
    )
    sumstats = hl.import_table(
        'gs://ukb-diverse-pops/sumstats_flat_files/phecode-250.2-both_sexes.tsv.bgz',
        impute=True)
    variant_fields = dict(
        locus=hl.locus(sumstats.chr, sumstats.pos),
        alleles=hl.array([sumstats.ref, sumstats.alt]),
    )
    sumstats = sumstats.annotate(**variant_fields).key_by('locus', 'alleles')
    sumstats.describe()

    # The full genotype data is used here rather than the subset at
    # genotype_mt_path; it is restricted to variants in the meta-analysis results.
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    mt = get_filtered_mt_with_x()
    mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))
    # Keep only the dosage entry field and drop all non-key row/column fields.
    mt = mt.select_entries('dosage').select_rows().select_cols()

    in_clump = hl.is_defined(clumped[mt.row_key])
    mt = mt.annotate_rows(
        beta=hl.if_else(in_clump, sumstats[mt.row_key].beta_meta, 0))
    mt = mt.annotate_cols(score=hl.agg.sum(mt.beta * mt.dosage))

    scores = mt.cols().repartition(1000)
    scores.write(f'{scratch_dir}/prs_all_samples.ht')
コード例 #19
0
ファイル: test_reference_genome.py プロジェクト: zscu/hail
    def test_reference_genome_sequence(self):
        """Sequence lookups work on a FASTA-backed reference and can be removed and re-added."""
        json_reference = ReferenceGenome.read(resource("fake_ref_genome.json"))
        self.assertEqual(json_reference.name, "my_reference_genome")
        self.assertFalse(json_reference.has_sequence())

        fasta_path = resource("fake_reference.fasta")
        index_path = resource("fake_reference.fasta.fai")
        fasta_reference = ReferenceGenome.from_fasta_file(
            "test_rg",
            fasta_path,
            index_path,
            mt_contigs=["b", "c"],
            x_contigs=["a"])
        self.assertTrue(fasta_reference.has_sequence())
        self.assertTrue(fasta_reference.x_contigs == ["a"])

        # Every (contig, pos) in the TSV must map to its listed base.
        table = hl.import_table(resource("fake_reference.tsv"), impute=True)
        bases_match = table.all(
            hl.get_sequence(table.contig, table.pos,
                            reference_genome=fasta_reference) == table.base)
        self.assertTrue(hl.eval(bases_match))

        locus = hl.locus("a", 7, fasta_reference)
        context = locus.sequence_context(before=3, after=3)
        self.assertTrue(hl.eval(context == "TTTCGAA"))

        fasta_reference.remove_sequence()
        assert not fasta_reference.has_sequence()

        fasta_reference.add_sequence(resource("fake_reference.fasta"),
                                     resource("fake_reference.fasta.fai"))
        assert fasta_reference.has_sequence()
コード例 #20
0
ファイル: test_family_methods.py プロジェクト: jigold/hail
    def test_de_novo(self):
        """``hl.de_novo`` agrees with the stored truth table on confidence and probability."""
        join_key = ('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

        mt = hl.import_vcf(resource('denovo.vcf'))
        # de_novo_finder doesn't know about y PAR
        mt = mt.filter_rows(mt.locus.in_y_par(), keep=False)
        pedigree = hl.Pedigree.read(resource('denovo.fam'))

        result = hl.de_novo(mt, pedigree, mt.info.ESP)
        result = result.select(
            prior=result.prior,
            kid_id=result.proband.s,
            dad_id=result.father.s,
            mom_id=result.mother.s,
            p_de_novo=result.p_de_novo,
            confidence=result.confidence)
        result = result.key_by(*join_key)

        truth = hl.import_table(resource('denovo.out'), impute=True, comment='#')
        truth = truth.select(
            locus=hl.locus(truth['Chr'], truth['Pos']),
            alleles=[truth['Ref'], truth['Alt']],
            kid_id=truth['Child_ID'],
            dad_id=truth['Dad_ID'],
            mom_id=truth['Mom_ID'],
            p_de_novo=truth['Prob_dn'],
            # Only the part before the first underscore is the confidence label.
            confidence=truth['Validation_Likelihood'].split('_')[0])
        truth = truth.key_by(*join_key)

        # Fields coming from the right-hand table get a `_1` suffix in the join.
        joined = result.join(truth, how='outer')
        same_confidence = joined.confidence == joined.confidence_1
        close_probability = hl.abs(joined.p_de_novo - joined.p_de_novo_1) < 1e-4
        self.assertTrue(joined.all(same_confidence & close_probability))
コード例 #21
0
    def test_window_by_locus(self):
        """``hl.window_by_locus`` with a window of 5 exposes the previous rows and entries."""
        mt = hl.utils.range_matrix_table(100, 2, n_partitions=10)
        mt = mt.annotate_rows(locus=hl.locus('1', mt.row_idx + 1))
        mt = mt.key_rows_by('locus')
        # Copy the indices into entry fields so they are visible in prev_entries.
        mt = mt.annotate_entries(e_row_idx=mt.row_idx, e_col_idx=mt.col_idx)
        windowed = hl.window_by_locus(mt, 5).cache()

        self.assertEqual(windowed.count_rows(), 100)

        rows = windowed.rows()
        # Every row from index 5 onwards sees exactly five previous rows.
        window_is_full = (rows.row_idx < 5) | (rows.prev_rows.length() == 5)
        self.assertTrue(rows.all(window_is_full))

        # prev_rows[i] is the row i + 1 positions before the current one.
        prev_rows_ordered = hl.all(
            lambda x: (rows.row_idx - 1 - x[0]) == x[1].row_idx,
            hl.zip_with_index(rows.prev_rows))
        self.assertTrue(rows.all(prev_rows_ordered))

        entries = windowed.entries()
        same_column = hl.all(lambda x: x.e_col_idx == entries.col_idx,
                             entries.prev_entries)
        self.assertTrue(entries.all(same_column))

        prev_entries_ordered = hl.all(
            lambda x: entries.row_idx - 1 - x[0] == x[1].e_row_idx,
            hl.zip_with_index(entries.prev_entries))
        self.assertTrue(entries.all(prev_entries_ordered))
コード例 #22
0
    def test_import_keyby_count_ldsc_lowered_shuffle(self):
        """Import, join, re-key and count the LD score regression inputs.

        Integration test pulled out of test_ld_score_regression to isolate
        issues with lowered shuffles and RDD serialization, 2021-07-06.
        If this description no longer reflects the backend system, that's a
        really good thing.
        """
        ld_scores = hl.import_table(
            doctest_resource('ld_score_regression.univariate_ld_scores.tsv'),
            key='SNP',
            types={'L2': hl.tfloat, 'BP': hl.tint})

        sumstats = hl.import_table(
            doctest_resource('ld_score_regression.20160.sumstats.tsv'),
            key='SNP',
            types={'N': hl.tint, 'Z': hl.tfloat})

        # Look up the LD score row of each summary-statistics SNP.
        score_row = ld_scores[sumstats['SNP']]
        sumstats = sumstats.annotate(
            ld_score=score_row['L2'],
            locus=hl.locus(score_row['CHR'], score_row['BP']),
            alleles=hl.array([sumstats['A2'], sumstats['A1']]))

        rekeyed = sumstats.key_by(sumstats['locus'], sumstats['alleles'])
        assert rekeyed._force_count() == 151
コード例 #23
0
ファイル: test_family_methods.py プロジェクト: jigold/hail
    def test_tdt(self):
        """Transmission disequilibrium test results match ``tdt_results.tsv``.

        Row counts must be equal, and for rows whose p-value is not NaN the
        t/u counts must match exactly and chi-squared and p-value within 0.001.
        """
        pedigree = hl.Pedigree.read(resource('tdt.fam'))
        tdt_tab = (hl.transmission_disequilibrium_test(
            hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
            pedigree))

        truth = hl.import_table(
            resource('tdt_results.tsv'),
            types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
                   'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
        truth = (truth
                 .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                            alleles=[truth.REF, truth.ALT])
                 .key_by('locus', 'alleles'))

        # Count once; each count() runs the pipeline.
        n_result, n_truth = tdt_tab.count(), truth.count()
        if n_result != n_truth:
            self.fail('Result has {} rows but should have {} rows'.format(n_result, n_truth))

        # NaN p-values cannot be compared numerically, so they are excluded.
        bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
               .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
        bad.describe()

        bad = bad.filter(~(
                (bad.t == bad.T) &
                (bad.u == bad.U) &
                (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
                (hl.abs(bad.p_value - bad.Pval) < 0.001)))

        if bad.count() != 0:
            # Order by the key fields; both tables are keyed by locus and
            # alleles, and nothing in this test creates a `v` field.
            bad.order_by(hl.asc(bad.locus), hl.asc(bad.alleles)).show()
            self.fail('Found rows in violation of the predicate (see show output)')
コード例 #24
0
    def test_de_novo(self):
        """``hl.de_novo`` agrees with the stored truth table on confidence and probability."""
        join_key = ('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

        mt = hl.import_vcf(resource('denovo.vcf'))
        # de_novo_finder doesn't know about y PAR
        mt = mt.filter_rows(mt.locus.in_y_par(), keep=False)
        pedigree = hl.Pedigree.read(resource('denovo.fam'))

        result = hl.de_novo(mt, pedigree, mt.info.ESP)
        result = result.select(
            prior=result.prior,
            kid_id=result.proband.s,
            dad_id=result.father.s,
            mom_id=result.mother.s,
            p_de_novo=result.p_de_novo,
            confidence=result.confidence)
        result = result.key_by(*join_key)

        truth = hl.import_table(resource('denovo.out'), impute=True, comment='#')
        truth = truth.select(
            locus=hl.locus(truth['Chr'], truth['Pos']),
            alleles=[truth['Ref'], truth['Alt']],
            kid_id=truth['Child_ID'],
            dad_id=truth['Dad_ID'],
            mom_id=truth['Mom_ID'],
            p_de_novo=truth['Prob_dn'],
            # Only the part before the first underscore is the confidence label.
            confidence=truth['Validation_Likelihood'].split('_')[0])
        truth = truth.key_by(*join_key)

        # Fields coming from the right-hand table get a `_1` suffix in the join.
        joined = result.join(truth, how='outer')
        same_confidence = joined.confidence == joined.confidence_1
        close_probability = hl.abs(joined.p_de_novo - joined.p_de_novo_1) < 1e-4
        self.assertTrue(joined.all(same_confidence & close_probability))
コード例 #25
0
ファイル: test_methods.py プロジェクト: shulik7/hail
    def test_tdt(self):
        """Transmission disequilibrium test results match ``tdt_results.tsv``.

        Row counts must be equal, and for rows whose p-value is not NaN the
        t/u counts must match exactly and chi-squared and p-value within 0.001.
        """
        pedigree = hl.Pedigree.read(resource('tdt.fam'))
        tdt_tab = (hl.transmission_disequilibrium_test(
            hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
            pedigree))

        truth = hl.import_table(
            resource('tdt_results.tsv'),
            types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
                   'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
        truth = (truth
            .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                       alleles=[truth.REF, truth.ALT])
            .key_by('locus', 'alleles'))

        # Count once; each count() runs the pipeline.
        n_result, n_truth = tdt_tab.count(), truth.count()
        if n_result != n_truth:
            self.fail('Result has {} rows but should have {} rows'.format(n_result, n_truth))

        # NaN p-values cannot be compared numerically, so they are excluded.
        bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
            .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))

        bad = bad.filter(~(
                (bad.t == bad.T) &
                (bad.u == bad.U) &
                (hl.abs(bad.chi2 - bad.Chi2) < 0.001) &
                (hl.abs(bad.p_value - bad.Pval) < 0.001)))

        if bad.count() != 0:
            # Order by the key fields; both tables are keyed by locus and
            # alleles, and nothing in this test creates a `v` field.
            bad.order_by(hl.asc(bad.locus), hl.asc(bad.alleles)).show()
            self.fail('Found rows in violation of the predicate (see show output)')
コード例 #26
0
def specific_clumps(filename):
    """
    Collect the loci of a whitespace-delimited clump file into a dictionary expression.

    :param filename: Path of the clump file; it must provide ``CHR`` and
        ``BP`` columns. ``P`` is imported as a float.
    :return: Non-localized Hail dictionary expression mapping each locus to True.
    """
    # Raw string: '\s' is an invalid escape sequence in a normal literal.
    clump = hl.import_table(filename, delimiter=r'\s+', min_partitions=10, types={'P': hl.tfloat})
    # _localize=False keeps the aggregation result as a Hail expression
    # instead of returning a Python dict.
    clump_dict = clump.aggregate(hl.dict(hl.agg.collect(
        (hl.locus(hl.str(clump.CHR), hl.int(clump.BP)),
        True)
    )), _localize=False)
    return clump_dict
コード例 #27
0
def create_rf_2_0_2_rank(data_type: str, beta: bool) -> None:
    """
    Rank the 2.0.2 random-forest scores and write the ranking table.

    The imported scores are keyed by (locus, alleles), joined with the gnomAD
    annotations and checkpointed to a temporary table; that step is skipped
    when the temporary table already exists. The checkpoint is then ranked
    with ``add_rank`` and written to ``score_ranking_path``.

    :param str data_type: One of 'exomes' or 'genomes'
    :param bool beta: If set, then creates the table for the "beta" 2.0.2 RF with QD / max(p(AB))
    :return: Nothing
    :rtype: None
    """
    logger.info(
        f"Creating rank file for {data_type} RF 2.0.2{'beta' if beta else ''}")

    tmp_path = f'gs://gnomad-tmp/gnomad_rf_2_0_2_{data_type}_{str(beta)}_tmp.ht'

    if not hl.hadoop_exists(tmp_path):
        scores = hl.import_table(get_2_0_2_rf_path(data_type, beta),
                                 types={'chrom': hl.tstr},
                                 impute=True,
                                 min_partitions=1000)
        if 'chrom' in scores.row:
            # Separate chrom / pos / ref / alt columns.
            scores = scores.transmute(locus=hl.locus(scores.chrom, scores.pos),
                                      alleles=[scores.ref, scores.alt])
        else:
            # Single variant-string column `v`; the probability column is
            # spelled `rf_rpob_tp` in the input (yes, this is awful).
            scores = scores.transmute(
                v=hl.parse_variant(scores.v),
                rfprob=scores.rf_rpob_tp
            )
            scores = scores.transmute(locus=scores.v.locus, alleles=scores.v.alleles)

        scores = scores.key_by('locus', 'alleles')

        gnomad_ht = get_gnomad_annotations(data_type)
        scores = scores.annotate(**gnomad_ht[scores.key], score=scores.rfprob)

        scores.write(tmp_path)

    ht = hl.read_table(tmp_path)

    # Building blocks shared by the sub-rank filters below.
    not_split = ~ht.was_split
    is_adj = ht.ac > 0
    subranks = {
        'singleton_rank': ht.singleton,
        'biallelic_rank': not_split,
        'biallelic_singleton_rank': not_split & ht.singleton,
        'adj_rank': is_adj,
        'adj_biallelic_rank': not_split & is_adj,
        'adj_singleton_rank': ht.singleton & is_adj,
        'adj_biallelic_singleton_rank': not_split & ht.singleton & is_adj,
    }
    ht = add_rank(ht, score_expr=1 - ht.score, subrank_expr=subranks)

    suffix = '_beta' if beta else ''
    ht.write(score_ranking_path(data_type, f'rf_2.0.2{suffix}'),
             overwrite=True)
コード例 #28
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_haploid(self):
        """Entries imported from ``haploid.vcf`` must match the hand-written table."""
        # (position, sample, GT, AD, GQ) for every expected entry on contig X.
        expected_entries = [
            (16050036, "C1046::HG02024", hl.call(0, 0), [10, 0], 44),
            (16050036, "C1046::HG02025", hl.call(1), [0, 6], 70),
            (16061250, "C1046::HG02024", hl.call(2, 2), [0, 0, 11], 33),
            (16061250, "C1046::HG02025", hl.call(2), [0, 0, 9], 24),
        ]
        expected = hl.Table.parallelize(
            [hl.struct(locus=hl.locus("X", position), s=sample, GT=gt, AD=ad, GQ=gq)
             for position, sample, gt, ad, gq in expected_entries],
            key=['locus', 's'])

        observed = (hl.import_vcf(resource('haploid.vcf'))
                    .entries()
                    .key_by('locus', 's')
                    .select('GT', 'AD', 'GQ'))
        self.assertTrue(observed._same(expected))
コード例 #29
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
    def test_call_fields(self):
        """``import_vcf`` must parse every field listed in ``call_fields`` as a call."""
        no_call = hl.null(hl.tcall)
        # (position, sample, GT, GTA, GTZ) for every expected entry on contig X.
        expected_entries = [
            (16050036, "C1046::HG02024", hl.call(0, 0), no_call, hl.call(0, 1)),
            (16050036, "C1046::HG02025", hl.call(1), no_call, hl.call(0)),
            (16061250, "C1046::HG02024", hl.call(2, 2), hl.call(2, 1), hl.call(1, 1)),
            (16061250, "C1046::HG02025", hl.call(2), no_call, hl.call(1)),
        ]
        expected = hl.Table.parallelize(
            [hl.struct(locus=hl.locus("X", position), s=sample, GT=gt, GTA=gta, GTZ=gtz)
             for position, sample, gt, gta, gtz in expected_entries],
            key=['locus', 's'])

        call_fields = ['GT', 'GTA', 'GTZ']
        observed = (hl.import_vcf(resource('generic.vcf'), call_fields=call_fields)
                    .entries()
                    .key_by('locus', 's')
                    .select('GT', 'GTA', 'GTZ'))
        self.assertTrue(observed._same(expected))
コード例 #30
0
def import_key(ss_filename, ss_keys):
    """Import a whitespace-delimited table and key it by (locus, alleles).

    :param ss_filename: Path of the table to import.
    :param ss_keys: Comma-separated column names. The first four are used, in
        order, as contig, position, first allele and second allele; the last
        name is returned unchanged.
    :return: Tuple ``(table keyed by locus and alleles, last name in ss_keys)``.
    """
    # Raw string: '\s' is not a valid Python escape sequence, so the non-raw
    # literal only worked by accident and triggers an invalid-escape warning.
    ss = hl.import_table(ss_filename, impute=True, delimiter=r'\s+')
    keys = ss_keys.split(',')
    p = keys[-1]
    ss = ss.annotate(locus=hl.locus(hl.str(ss[keys[0]]), ss[keys[1]]),
                     alleles=[ss[keys[2]], ss[keys[3]]])
    ss = ss.key_by(ss.locus, ss.alleles)
    return ss, p
コード例 #31
0
ファイル: get_coding_variants.py プロジェクト: nikbaya/smiles
def get_annot_ht():
    """Return the CDS records of the GENCODE v31 (lift37) GFF3 file, keyed by interval.

    Rows whose feature column (``f2``) is not ``"CDS"`` or whose start/end is
    not a valid GRCh37 locus are dropped. The ``key=value`` attribute column
    (``f8``) is unpacked into ``ID``, ``gene_id``, ``gene_name``,
    ``gene_type`` and ``level``; the raw ``f0``..``f8`` columns are removed.
    """
    gff = hl.import_table(f'{wd_data}/gencode.v31lift37.annotation.gff3.gz',
                          no_header=True, impute=True, comment='#', force=True)

    # Split the attribute column into [key, value] pairs.
    cds = gff.annotate(GFF_Columns=gff.f8.split(";").map(lambda attr: attr.split("=")))
    cds = cds.filter(cds.f2 == "CDS")  # only want coding sequences, not entire genes

    # f0[3:] drops the first three characters of the contig name.
    cds = cds.filter(hl.is_valid_locus(cds.f0[3:], cds.f3, 'GRCh37'))
    cds = cds.filter(hl.is_valid_locus(cds.f0[3:], cds.f4, 'GRCh37'))
    contig = cds.f0[3:]
    cds = cds.annotate(interval=hl.interval(hl.locus(contig, cds.f3, 'GRCh37'),
                                            hl.locus(contig, cds.f4, 'GRCh37')))

    cds = cds.annotate(GFF_Columns=hl.dict(cds.GFF_Columns.map(lambda pair: (pair[0], pair[1]))))
    attributes = cds.GFF_Columns
    cds = cds.annotate(ID=attributes["ID"],
                       gene_id=attributes["gene_id"],
                       gene_name=attributes["gene_name"],
                       gene_type=attributes["gene_type"],
                       level=attributes["level"],
                       type=cds.f2,
                       gene_score=cds.f5,
                       gene_strand=cds.f6,
                       gene_phase=cds.f7)

    cds = cds.drop('GFF_Columns', 'f8', 'f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7')
    return cds.key_by('interval')
コード例 #32
0
 def setupAnnotationDBTests(cls):
     """Write a small locus-keyed table and build the annotation DB config used by the tests.

     Both datasets in ``cls.db_json`` point at the same written table; they
     differ only in description, URL and ``key_properties``.
     """
     startTestHailContext()
     table = hl.utils.range_table(10)
     table = table.key_by(locus=hl.locus('1', table.idx + 1))
     table = table.annotate(annotation=hl.str(table.idx))

     # Entered manually and kept on the class so the directory can be
     # released later through the stored manager.
     cls.tempdir_manager = hl.TemporaryDirectory()
     tmp_dir = cls.tempdir_manager.__enter__()
     fname = tmp_dir + '/f.mt'
     table.write(fname)

     def dataset(description, url, key_properties):
         # Fresh nested containers on every call, so the entries share nothing.
         return {
             'description': description,
             'url': url,
             'annotation_db': {'key_properties': key_properties},
             'versions': [{
                 'url': {
                     "aws": {"eu": fname, "us": fname},
                     "gcp": {"eu": fname, "us": fname},
                 },
                 'version': 'v1',
                 'reference_genome': 'GRCh37',
             }],
         }

     cls.db_json = {
         'unique_dataset': dataset('now with unique rows!',
                                   'https://example.com',
                                   ['unique']),
         'nonunique_dataset': dataset('non-unique rows :(',
                                      'https://example.net',
                                      []),
     }
コード例 #33
0
ファイル: test_api.py プロジェクト: shulik7/hail
    def test_constructors(self):
        """Locus and interval constructors must produce the expected field types."""
        rg = hl.ReferenceGenome("foo", ["1"], {"1": 100})

        schema = hl.tstruct(a=hl.tfloat64, b=hl.tfloat64, c=hl.tint32, d=hl.tint32)
        kt = hl.Table.parallelize([{'a': 2.0, 'b': 4.0, 'c': 1, 'd': 5}], schema)
        kt = kt.annotate(d=hl.int64(kt.d))

        start = hl.locus("1", 51, reference_genome=rg)
        end = hl.locus("1", 56, reference_genome=rg)
        kt = kt.annotate(l1=hl.parse_locus("1:51"),
                         l2=start,
                         i1=hl.parse_locus_interval("1:51-56", reference_genome=rg),
                         i2=hl.interval(start, end))

        rg_locus = hl.tlocus(rg)
        expected_schema = {
            'a': hl.tfloat64,
            'b': hl.tfloat64,
            'c': hl.tint32,
            'd': hl.tint64,
            'l1': hl.tlocus(),
            'l2': rg_locus,
            'i1': hl.tinterval(rg_locus),
            'i2': hl.tinterval(rg_locus),
        }

        self.assertTrue(all(expected_schema[name] == dtype
                            for name, dtype in kt.row.dtype.items()))
コード例 #34
0
 def test_uniqueness(self):
     """Check the annotation type for 'unique_dataset' (struct) and 'nonunique_dataset' (array of structs)."""
     row_type = 'struct{idx: int32, annotation: str}'

     db = hl.experimental.DB(region='us',
                             cloud='gcp',
                             config=AnnotationDBTests.db_json)
     base = hl.utils.range_table(10)
     base = base.key_by(locus=hl.locus('1', base.idx + 1))
     annotated = db.annotate_rows_db(base, 'unique_dataset', 'nonunique_dataset')

     assert annotated.unique_dataset.dtype == hl.dtype(row_type)
     assert annotated.nonunique_dataset.dtype == hl.dtype(f'array<{row_type}>')
コード例 #35
0
ファイル: helpers.py プロジェクト: jigold/hail
def create_all_values():
    """Return one struct expression mixing defined and missing values of many Hail types."""
    grch37_locus = hl.tlocus('GRCh37')
    missing_str = hl.null(hl.tstr)

    fields = dict(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5,
                   hl.array(['x', missing_str, 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(grch37_locus),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', missing_str]),
        mt=hl.null(hl.ttuple(grch37_locus, hl.tbool)),
    )
    return hl.struct(**fields)
コード例 #36
0
def create_all_values():
    """Build a struct expression with defined and missing examples of many Hail types."""
    interval_start = hl.locus('1', 999)
    interval_end = hl.locus('1', 1001)
    float_by_array = {
        hl.array(['a', 'b']): 0.5,
        hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
    }

    # Field order below is the field order of the resulting struct.
    values = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        'd': hl.dict(float_by_array),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(interval_start, interval_end),
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    return hl.struct(**values)
コード例 #37
0
ファイル: test_file_formats.py プロジェクト: bcajes/hail
def create_all_values_datasets():
    """Return a cached Table and MatrixTable annotated everywhere with the same mixed-type struct.

    The table carries the values as globals (prefixed ``global_``) and as row
    fields; the matrix table carries them as globals, row, column and entry
    fields (prefixed ``global_``, ``row_``, ``col_`` and ``entry_``).
    """
    grch37_locus = hl.tlocus('GRCh37')
    missing_str = hl.null(hl.tstr)

    all_values = hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5,
                   hl.array(['x', missing_str, 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(grch37_locus),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', missing_str]),
        mt=hl.null(hl.ttuple(grch37_locus, hl.tbool)),
    )

    def prefixed(tag):
        # Same fields as `all_values`, each name prepended with `tag`.
        return hl.struct(**{tag + name: all_values[name] for name in all_values})

    table = hl.utils.range_table(5, n_partitions=3)
    table = table.annotate_globals(**prefixed('global_'))
    table = table.annotate(**all_values)
    all_values_table = table.cache()

    matrix = hl.utils.range_matrix_table(3, 2, n_partitions=2)
    matrix = matrix.annotate_globals(**prefixed('global_'))
    matrix = matrix.annotate_rows(**prefixed('row_'))
    matrix = matrix.annotate_cols(**prefixed('col_'))
    matrix = matrix.annotate_entries(**prefixed('entry_'))
    all_values_matrix_table = matrix.cache()

    return all_values_table, all_values_matrix_table
コード例 #38
0
ファイル: test_reference_genome.py プロジェクト: danking/hail
    def test_reference_genome_sequence(self):
        """Reference genomes built from FASTA expose sequence; ones read from JSON do not."""
        from_json = ReferenceGenome.read(resource("fake_ref_genome.json"))
        self.assertEqual(from_json.name, "my_reference_genome")
        self.assertFalse(from_json.has_sequence())

        from_fasta = ReferenceGenome.from_fasta_file(
            "test_rg",
            resource("fake_reference.fasta"),
            resource("fake_reference.fasta.fai"),
            mt_contigs=["b", "c"],
            x_contigs=["a"])
        self.assertTrue(from_fasta.has_sequence())
        self.assertTrue(from_fasta.x_contigs == ["a"])

        # Every (contig, pos) in the TSV must resolve to the listed base.
        bases = hl.import_table(resource("fake_reference.tsv"), impute=True)
        looked_up = hl.get_sequence(bases.contig, bases.pos, reference_genome=from_fasta)
        self.assertTrue(hl.eval(bases.all(looked_up == bases.base)))

        locus = hl.locus("a", 7, from_fasta)
        context = locus.sequence_context(before=3, after=3)
        self.assertTrue(hl.eval(context == "TTTCGAA"))
コード例 #39
0
ファイル: test_impex.py プロジェクト: lfrancioli/hail
def generate_random_gen():
    """Export a random 30 x 10 matrix table of genotype probabilities with ``hl.export_gen``."""
    mt = hl.utils.range_matrix_table(30, 10)

    mt = mt.annotate_rows(locus=hl.locus('20', mt.row_idx + 1), alleles=['A', 'G'])
    mt = mt.key_rows_by('locus', 'alleles')

    mt = mt.annotate_cols(s=hl.str(mt.col_idx))
    mt = mt.key_cols_by('s')

    # Fully random values lead to rounding differences where identical GEN
    # values get rounded differently, which in turn changes the GT call
    # between import_{gen, bgen}. Draw integer counts out of 255 instead.
    mt = mt.annotate_entries(a=hl.int32(hl.rand_unif(0.0, 255.0)))
    mt = mt.annotate_entries(b=hl.int32(hl.rand_unif(0.0, 255.0 - mt.a)))
    remainder = 255.0 - mt.a - mt.b
    mt = mt.transmute_entries(GP=hl.array([mt.a, mt.b, remainder]) / 255.0)

    # 20% missing
    mt = mt.filter_entries(hl.rand_bool(0.8))

    hl.export_gen(mt, 'random', precision=4)
コード例 #40
0
ファイル: test_misc.py プロジェクト: danking/hail
    def test_window_by_locus(self):
        """``hl.window_by_locus`` must attach the 5 preceding rows/entries in reverse order."""
        mt = hl.utils.range_matrix_table(100, 2, n_partitions=10)
        mt = mt.annotate_rows(locus=hl.locus('1', mt.row_idx + 1)).key_rows_by('locus')
        mt = mt.annotate_entries(e_row_idx=mt.row_idx, e_col_idx=mt.col_idx)
        windowed = hl.window_by_locus(mt, 5).cache()

        self.assertEqual(windowed.count_rows(), 100)

        rows = windowed.rows()
        # From row 5 onwards every row must see exactly 5 previous rows.
        has_full_window = (rows.row_idx < 5) | (rows.prev_rows.length() == 5)
        self.assertTrue(rows.all(has_full_window))
        # prev_rows[i] must be the row i + 1 positions before the current one.
        rows_in_order = hl.all(
            lambda indexed: (rows.row_idx - 1 - indexed[0]) == indexed[1].row_idx,
            hl.zip_with_index(rows.prev_rows))
        self.assertTrue(rows.all(rows_in_order))

        entries = windowed.entries()
        same_column = hl.all(lambda prev: prev.e_col_idx == entries.col_idx,
                             entries.prev_entries)
        self.assertTrue(entries.all(same_column))
        entries_in_order = hl.all(
            lambda indexed: entries.row_idx - 1 - indexed[0] == indexed[1].e_row_idx,
            hl.zip_with_index(entries.prev_entries))
        self.assertTrue(entries.all(entries_in_order))
コード例 #41
0
ファイル: test_matrix_table.py プロジェクト: tpoterba/hail
    def test_hardy_weinberg_test(self):
        """``hl.agg.hardy_weinberg_test`` on ``HWE_test.vcf`` must reproduce known values."""
        mt = hl.import_vcf(resource('HWE_test.vcf'))
        mt = mt.select_rows(**hl.agg.hardy_weinberg_test(mt.GT))
        observed = mt.rows()

        # (position, alleles, het_freq_hwe, p_value) for positions 1-5 on contig 20.
        known = [
            (1, ['A', 'G'], 0.0, 0.5),
            (2, ['A', 'G'], 0.25, 0.5),
            (3, ['T', 'C'], 0.5357142857142857, 0.21428571428571427),
            (4, ['T', 'A'], 0.5714285714285714, 0.6571428571428573),
            (5, ['G', 'A'], 0.3333333333333333, 0.5),
        ]
        expected = hl.Table.parallelize(
            [hl.struct(locus=hl.locus('20', pos),
                       alleles=alleles,
                       het_freq_hwe=het_freq,
                       p_value=p_value)
             for pos, alleles, het_freq, p_value in known],
            key=['locus', 'alleles'])

        at_position_6 = observed.locus.position == 6
        self.assertTrue(observed.filter(~at_position_6)._same(expected))

        # Position 6 is checked separately because het_freq_hwe is NaN there.
        row6 = observed.filter(at_position_6).collect()[0]
        self.assertEqual(row6['p_value'], 0.5)
        self.assertTrue(math.isnan(row6['het_freq_hwe']))
コード例 #42
0
ファイル: conftest.py プロジェクト: bcajes/hail
def init(doctest_namespace):
    """Populate ``doctest_namespace`` with the datasets and expressions used by the doctests.

    The function changes into ``docs/``, fills the namespace, yields once,
    and then restores the original working directory. The restore is done
    in a ``finally`` block so the directory is reset even if the setup
    raises before the ``yield``.

    :param doctest_namespace: Mapping that receives the example objects.
    """
    # This gets run once per process -- must avoid race conditions
    print("setting up doctest...")

    olddir = os.getcwd()
    os.chdir("docs/")

    try:
        doctest_namespace['hl'] = hl
        doctest_namespace['agg'] = agg

        if not os.path.isdir("output/"):
            try:
                os.mkdir("output/")
            except OSError:
                # Best effort: the directory may have been created by another
                # process between the check and the mkdir call.
                pass

        files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
        for f in files:
            if os.path.isdir(f):
                shutil.rmtree(f)

        ds = hl.read_matrix_table('data/example.vds')
        doctest_namespace['ds'] = ds
        doctest_namespace['dataset'] = ds
        doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
        doctest_namespace['dataset_to_union_1'] = ds
        doctest_namespace['dataset_to_union_2'] = ds

        v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
        doctest_namespace['v_metadata'] = v_metadata

        s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
        doctest_namespace['s_metadata'] = s_metadata

        # Table
        table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
        table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
        doctest_namespace['table1'] = table1
        doctest_namespace['other_table'] = table1

        table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
        doctest_namespace['table2'] = table2

        table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                                 types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                        'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                        'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
        doctest_namespace['table4'] = table4

        people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                       types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                       key='Name')
        doctest_namespace['people_table'] = people_table

        # TDT
        doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

        ds2 = hl.variant_qc(ds)
        doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

        # Expressions
        doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
        doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
        doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
        doctest_namespace['t'] = hl.literal(True)
        doctest_namespace['f'] = hl.literal(False)
        doctest_namespace['na'] = hl.null(hl.tbool)
        doctest_namespace['call'] = hl.call(0, 1, phased=False)
        doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
        doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
        doctest_namespace['interval'] = hl.interval(3, 11)
        doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
        doctest_namespace['locus'] = hl.locus('1', 1034245)
        doctest_namespace['x'] = hl.literal(3)
        doctest_namespace['y'] = hl.literal(4.5)
        doctest_namespace['s1'] = hl.literal({1, 2, 3})
        doctest_namespace['s2'] = hl.literal({1, 3, 5})
        doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
        doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
        doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
        doctest_namespace['s'] = hl.literal('The quick brown fox')
        doctest_namespace['interval2'] = hl.Interval(3, 6)

        # Overview
        doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
        doctest_namespace['mt'] = ds

        gnomad_data = ds.rows()
        doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

        # BGEN
        bgen = hl.import_bgen('data/example.8bits.bgen',
                              entry_fields=['GT', 'GP', 'dosage'])
        doctest_namespace['variants_table'] = bgen.rows()

        print("finished setting up doctest...")
        yield
    finally:
        # Always return to the starting directory, including when the setup
        # above fails part-way through.
        os.chdir(olddir)
コード例 #43
0
ファイル: test_reference_genome.py プロジェクト: jigold/hail
    def test_reference_genome_liftover(self):
        """Exercise ``hl.liftover`` for loci and locus intervals between GRCh37 and GRCh38.

        Chain files are registered on the GRCh37 and GRCh38 reference objects
        for the duration of the test. They are removed in a ``finally`` block
        so that a failing assertion does not leave the liftovers attached to
        references that other tests obtain through ``hl.get_reference``.
        """
        grch37 = hl.get_reference('GRCh37')
        grch38 = hl.get_reference('GRCh38')

        self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))

        try:
            grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
            grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
            self.assertTrue(grch37.has_liftover('GRCh38') and grch38.has_liftover('GRCh37'))

            # Round trip: 37 -> 38 -> 37 must give back the original locus.
            ds = hl.import_vcf(resource('sample.vcf'))
            t = ds.annotate_rows(liftover=hl.liftover(hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
            self.assertTrue(t.all(t.locus == t.liftover))

            null_locus = hl.null(hl.tlocus('GRCh38'))

            rows = [
                {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 60001, 'GRCh37'), 'l38': hl.locus('chr20', 79360, 'GRCh38')},
                {'l37': hl.locus('20', 278686, 'GRCh37'), 'l38': hl.locus('chr20', 298045, 'GRCh38')},
                {'l37': hl.locus('20', 278687, 'GRCh37'), 'l38': hl.locus('chr20', 298046, 'GRCh38')},
                {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278691, 'GRCh37'), 'l38': hl.locus('chr20', 298047, 'GRCh38')},
                {'l37': hl.locus('20', 37007586, 'GRCh37'), 'l38': hl.locus('chr12', 32563117, 'GRCh38')},
                {'l37': hl.locus('20', 62965520, 'GRCh37'), 'l38': hl.locus('chr20', 64334167, 'GRCh38')},
                {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus}
            ]
            schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
            t = hl.Table.parallelize(rows, schema)
            # A missing expected locus means the liftover itself must be missing.
            self.assertTrue(t.all(hl.cond(hl.is_defined(t.l38),
                                          hl.liftover(t.l37, 'GRCh38') == t.l38,
                                          hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

            t = t.filter(hl.is_defined(t.l38))
            self.assertTrue(t.count() == 6)

            t = t.key_by('l38')
            t.count()
            self.assertTrue(list(t.key) == ['l38'])

            null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
            rows = [
                {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'), 'i38': null_locus_interval},
                {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
                 'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')}
            ]
            schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)), i38=hl.tinterval(hl.tlocus(grch38)))
            t = hl.Table.parallelize(rows, schema)
            self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))
        finally:
            # Only remove what was actually registered: the second add_liftover
            # may not have run if the first one raised.
            if grch37.has_liftover('GRCh38'):
                grch37.remove_liftover("GRCh38")
            if grch38.has_liftover('GRCh37'):
                grch38.remove_liftover("GRCh37")
コード例 #44
0
ファイル: test_experimental.py プロジェクト: jigold/hail
    def test_ld_score(self):
        """Univariate and annotated LD scores must match the reference values to 3 places."""
        annot = hl.import_table(doctest_resource('ldsc.annot'),
                                types={'BP': hl.tint,
                                       'CM': hl.tfloat,
                                       'binary': hl.tint,
                                       'continuous': hl.tfloat})
        annot = annot.annotate(locus=hl.locus(annot.CHR, annot.BP))
        annot = annot.key_by('locus')

        mt = hl.import_plink(bed=doctest_resource('ldsc.bed'),
                             bim=doctest_resource('ldsc.bim'),
                             fam=doctest_resource('ldsc.fam'))
        mt = mt.annotate_rows(binary=annot[mt.locus].binary,
                              continuous=annot[mt.locus].continuous)

        ht_univariate = hl.experimental.ld_score(
            entry_expr=mt.GT.n_alt_alleles(),
            locus_expr=mt.locus,
            radius=1.0,
            coord_expr=mt.cm_position)

        ht_annotated = hl.experimental.ld_score(
            entry_expr=mt.GT.n_alt_alleles(),
            locus_expr=mt.locus,
            radius=1.0,
            coord_expr=mt.cm_position,
            annotation_exprs=[mt.binary, mt.continuous])

        def score_at(ht, contig, position, score_expr):
            # First collected value of `score_expr` among rows at contig:position.
            at_locus = (ht.locus.contig == contig) & (ht.locus.position == position)
            return hl.agg.filter(at_locus, hl.agg.collect(score_expr))[0]

        univariate = ht_univariate.aggregate(hl.struct(
            chr20=score_at(ht_univariate, '20', 82079, ht_univariate.univariate),
            chr22=score_at(ht_univariate, '22', 16894090, ht_univariate.univariate),
            mean=hl.agg.mean(ht_univariate.univariate)))

        self.assertAlmostEqual(univariate.chr20, 1.601, places=3)
        self.assertAlmostEqual(univariate.chr22, 1.140, places=3)
        self.assertAlmostEqual(univariate.mean, 3.507, places=3)

        def both_scores_at(contig, position):
            return hl.struct(
                binary=score_at(ht_annotated, contig, position, ht_annotated.binary),
                continuous=score_at(ht_annotated, contig, position, ht_annotated.continuous))

        annotated = ht_annotated.aggregate(hl.struct(
            chr20=both_scores_at('20', 82079),
            chr22=both_scores_at('22', 16894090),
            mean_stats=hl.struct(binary=hl.agg.mean(ht_annotated.binary),
                                 continuous=hl.agg.mean(ht_annotated.continuous))))

        self.assertAlmostEqual(annotated.chr20.binary, 1.152, places=3)
        self.assertAlmostEqual(annotated.chr20.continuous, 73.014, places=3)
        self.assertAlmostEqual(annotated.chr22.binary, 1.107, places=3)
        self.assertAlmostEqual(annotated.chr22.continuous, 102.174, places=3)
        self.assertAlmostEqual(annotated.mean_stats.binary, 0.965, places=3)
        self.assertAlmostEqual(annotated.mean_stats.continuous, 176.528, places=3)
コード例 #45
0
ファイル: test_experimental.py プロジェクト: jigold/hail
    def test_ld_score_regression(self):
        """Run ``hl.experimental.ld_score_regression`` on matrix-table and table inputs.

        The same two phenotypes ('50_irnt' and '20160') are regressed twice:
        once from a matrix table with one column per phenotype, and once
        from a table with one chi-squared / sample-size field pair per
        phenotype. Both runs must reproduce the same reference estimates to
        4 decimal places; the first result set is looked up by phenotype
        name, the second by the integer keys 0 and 1.
        """
        ht_scores = hl.import_table(
            doctest_resource('ld_score_regression.univariate_ld_scores.tsv'),
            key='SNP', types={'L2': hl.tfloat, 'BP': hl.tint})

        def load_sumstats(filename, phenotype):
            # Import one summary-statistics file, attach LD scores by SNP and
            # re-key by (locus, alleles), keeping only the regression inputs.
            stats = hl.import_table(
                doctest_resource(filename),
                key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})
            score_row = ht_scores[stats['SNP']]
            stats = stats.annotate(
                chi_squared=stats['Z']**2,
                n=stats['N'],
                ld_score=score_row['L2'],
                locus=hl.locus(score_row['CHR'], score_row['BP']),
                alleles=hl.array([stats['A2'], stats['A1']]),
                phenotype=phenotype)
            stats = stats.key_by(stats['locus'], stats['alleles'])
            return stats.select(stats['chi_squared'],
                                stats['n'],
                                stats['ld_score'],
                                stats['phenotype'])

        def summarize(ht_results):
            # Flatten the regression output into {phenotype: {metric: value}}.
            return {
                row['phenotype']: {
                    'mean_chi_sq': row['mean_chi_sq'],
                    'intercept_estimate': row['intercept']['estimate'],
                    'intercept_standard_error': row['intercept']['standard_error'],
                    'snp_heritability_estimate': row['snp_heritability']['estimate'],
                    'snp_heritability_standard_error':
                        row['snp_heritability']['standard_error']}
                for row in ht_results.collect()}

        def check(results, expected_by_key):
            for key, expected_metrics in expected_by_key.items():
                for metric, expected_value in expected_metrics.items():
                    self.assertAlmostEqual(
                        results[key][metric],
                        expected_value, places=4)

        expected_50_irnt = {
            'mean_chi_sq': 3.4386,
            'intercept_estimate': 0.7727,
            'intercept_standard_error': 0.2461,
            'snp_heritability_estimate': 0.3845,
            'snp_heritability_standard_error': 0.1067,
        }
        expected_20160 = {
            'mean_chi_sq': 1.5209,
            'intercept_estimate': 1.2109,
            'intercept_standard_error': 0.2238,
            'snp_heritability_estimate': 0.0486,
            'snp_heritability_standard_error': 0.0416,
        }

        ht_50_irnt = load_sumstats('ld_score_regression.50_irnt.sumstats.tsv', '50_irnt')
        ht_20160 = load_sumstats('ld_score_regression.20160.sumstats.tsv', '20160')

        # --- Matrix-table input: one column per phenotype -------------------
        ht = ht_50_irnt.union(ht_20160)
        mt = ht.to_matrix_table(row_key=['locus', 'alleles'],
                                col_key=['phenotype'],
                                row_fields=['ld_score'],
                                col_fields=[])

        mt_tmp = new_temp_file()
        mt.write(mt_tmp, overwrite=True)
        mt = hl.read_matrix_table(mt_tmp)

        ht_results = hl.experimental.ld_score_regression(
            weight_expr=mt['ld_score'],
            ld_score_expr=mt['ld_score'],
            chi_sq_exprs=mt['chi_squared'],
            n_samples_exprs=mt['n'],
            n_blocks=20,
            two_step_threshold=5,
            n_reference_panel_variants=1173569)

        check(summarize(ht_results),
              {'50_irnt': expected_50_irnt, '20160': expected_20160})

        # --- Table input: one field pair per phenotype ----------------------
        row_20160 = ht_20160[ht_50_irnt.key]
        ht = ht_50_irnt.annotate(
            chi_squared_50_irnt=ht_50_irnt['chi_squared'],
            n_50_irnt=ht_50_irnt['n'],
            chi_squared_20160=row_20160['chi_squared'],
            n_20160=row_20160['n'])

        ht_results = hl.experimental.ld_score_regression(
            weight_expr=ht['ld_score'],
            ld_score_expr=ht['ld_score'],
            chi_sq_exprs=[ht['chi_squared_50_irnt'],
                          ht['chi_squared_20160']],
            n_samples_exprs=[ht['n_50_irnt'],
                             ht['n_20160']],
            n_blocks=20,
            two_step_threshold=5,
            n_reference_panel_variants=1173569)

        check(summarize(ht_results),
              {0: expected_50_irnt, 1: expected_20160})
コード例 #46
0
ファイル: plots.py プロジェクト: jigold/hail
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500, significance_line=5e-8):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Points are placed at the global position of their locus on the x-axis and
    at ``-log10(p-value)`` on the y-axis. Contigs of the reference genome are
    colored with two alternating palette colors, and only contigs that
    actually occur in the collected data receive an axis tick.

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted. If ``None``, the ``locus`` field of the
        source of `pvals` is used.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of the plot.
        The dictionary is copied, never modified; a ``'locus'`` entry is
        always added to the copy (replacing any entry with that name).
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input results in fewer output datapoints.
        Ignored when `collect_all` is true.
    significance_line : float, optional
        p-value at which to add a horizontal, dotted red line indicating
        genome-wide significance.  If ``None``, no line is added.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    if locus is None:
        locus = pvals._indices.source.locus

    ref = locus.dtype.reference_genome

    # Work on a private copy so that adding the 'locus' entry below does not
    # leak into the dictionary owned by the caller.
    hover_fields = {} if hover_fields is None else dict(hover_fields)
    hover_fields['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    source_pd = _collect_scatter_plot_data(
        ('_global_locus', locus.global_position()),
        ('_pval', pvals),
        fields=hover_fields,
        n_divisions=None if collect_all else n_divisions
    )
    # Recover the raw p-value for the hover tool from the plotted -log10 value.
    source_pd['p_value'] = [10 ** (-p) for p in source_pd['_pval']]
    # The stringified locus has the form "contig:position".
    source_pd['_contig'] = [locus_str.split(":")[0] for locus_str in source_pd['locus']]

    # Keep the reference genome's contig ordering, restricted to contigs present in the data.
    contigs_in_data = set(source_pd['_contig'])
    observed_contigs = [contig for contig in ref.contigs if contig in contigs_in_data]
    # One tick per observed contig, located at the middle of that contig.
    contig_ticks = hail.eval([hail.locus(contig, int(ref.lengths[contig] / 2)).global_position() for contig in observed_contigs])
    # Repeat the first two palette colors often enough to cover every contig.
    color_mapper = CategoricalColorMapper(factors=ref.contigs, palette=palette[:2] * int((len(ref.contigs) + 1) / 2))

    p = figure(title=title, x_axis_label='Chromosome', y_axis_label='P-value (-log10 scale)', width=1000)
    p, _, legend, _, _, _ = _get_scatter_plot_elements(
        p, source_pd, x_col='_global_locus', y_col='_pval',
        label_cols=['_contig'], colors={'_contig': color_mapper},
        size=size
    )
    legend.visible = False
    p.xaxis.ticker = contig_ticks
    p.xaxis.major_label_overrides = dict(zip(contig_ticks, observed_contigs))
    # Hide the internal helper columns (names starting with '_') from the hover tool.
    p.select_one(HoverTool).tooltips = [t for t in p.select_one(HoverTool).tooltips if not t[0].startswith('_')]

    if significance_line is not None:
        p.renderers.append(Span(location=-log10(significance_line),
                                dimension='width',
                                line_color='red',
                                line_dash='dashed',
                                line_width=1.5))

    return p
コード例 #47
0
ファイル: test_linalg.py プロジェクト: danking/hail
    def test_locus_windows(self):
        """Check ``hl.linalg.utils.locus_windows`` on valid inputs and on every rejected input."""
        locus_windows = hl.linalg.utils.locus_windows

        def assert_windows(windows, expected_starts, expected_stops):
            # Compare both halves of a (starts, stops) result with plain lists.
            starts, stops = windows
            self.assertTrue(np.array_equal(starts, np.array(expected_starts)))
            self.assertTrue(np.array_equal(stops, np.array(expected_stops)))

        centimorgans = hl.literal([0.1, 1.0, 1.0, 1.5, 1.9])

        mt = hl.balding_nichols_model(1, 5, 5).add_row_index()
        mt = mt.annotate_rows(cm=centimorgans[hl.int32(mt.row_idx)]).cache()

        # Windows defined by base-pair position.
        assert_windows(locus_windows(mt.locus, 2),
                       [0, 0, 0, 1, 2], [3, 4, 5, 5, 5])

        # Windows defined by a row field used as coordinate.
        assert_windows(locus_windows(mt.locus, 0.5, coord_expr=mt.cm),
                       [0, 1, 1, 1, 3], [1, 4, 4, 5, 5])

        # Coordinate given as a derived expression: doubling both the
        # coordinates and the radius must produce the same windows.
        doubled_cm = 2 * centimorgans[hl.int32(mt.row_idx)]
        assert_windows(locus_windows(mt.locus, 1.0, coord_expr=doubled_cm),
                       [0, 1, 1, 1, 3], [1, 4, 4, 5, 5])

        # Table spanning three contigs, including a duplicated locus.
        schema = hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64)
        rows = [{'locus': hl.Locus(contig, position), 'cm': coord}
                for contig, position, coord in [('1', 1, 1.0),
                                                ('1', 2, 3.0),
                                                ('1', 4, 4.0),
                                                ('2', 1, 2.0),
                                                ('2', 1, 2.0),
                                                ('3', 3, 5.0)]]
        ht = hl.Table.parallelize(rows, schema, key=['locus'])

        assert_windows(locus_windows(ht.locus, 1),
                       [0, 0, 2, 3, 3, 5], [2, 2, 3, 5, 5, 6])
        assert_windows(locus_windows(ht.locus, 1.0, coord_expr=ht.cm),
                       [0, 1, 1, 3, 3, 5], [1, 3, 3, 5, 5, 6])

        # Loci that are not in ascending order are rejected.
        with self.assertRaises(ValueError) as ctx:
            locus_windows(ht.order_by(ht.cm).locus, 1.0)
        self.assertIn('ascending order', str(ctx.exception))

        # The coordinate expression must come from the same source as the locus.
        with self.assertRaises(ExpressionException) as ctx:
            locus_windows(ht.locus, 1.0, coord_expr=hl.utils.range_table(1).idx)
        self.assertIn('different source', str(ctx.exception))

        # Expressions without any source are rejected.
        with self.assertRaises(ExpressionException) as ctx:
            locus_windows(hl.locus('1', 1), 1.0)
        self.assertIn("no source", str(ctx.exception))

        with self.assertRaises(ExpressionException) as ctx:
            locus_windows(ht.locus, 1.0, coord_expr=0.0)
        self.assertIn("no source", str(ctx.exception))

        # Global fields are not row-indexed and are rejected in either argument.
        ht = ht.annotate_globals(x=hl.locus('1', 1), y=1.0)
        with self.assertRaises(ExpressionException) as ctx:
            locus_windows(ht.x, 1.0)
        self.assertIn("row-indexed", str(ctx.exception))
        with self.assertRaises(ExpressionException) as ctx:
            locus_windows(ht.locus, 1.0, ht.y)
        self.assertIn("row-indexed", str(ctx.exception))

        # A missing locus is reported, with or without a coordinate expression.
        ht = hl.Table.parallelize([{'locus': hl.null(hl.tlocus()), 'cm': 1.0}],
                                  schema, key=['locus'])
        with self.assertRaises(ValueError) as ctx:
            locus_windows(ht.locus, 1.0)
        self.assertIn("missing value for 'locus_expr'", str(ctx.exception))
        with self.assertRaises(ValueError) as ctx:
            locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
        self.assertIn("missing value for 'locus_expr'", str(ctx.exception))

        # A missing coordinate is reported as well.
        ht = hl.Table.parallelize([{'locus': hl.Locus('1', 1), 'cm': hl.null(hl.tfloat64)}],
                                  schema, key=['locus'])
        with self.assertRaises(ValueError) as ctx:
            locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
        self.assertIn("missing value for 'coord_expr'", str(ctx.exception))
コード例 #48
0
ファイル: misc.py プロジェクト: jigold/hail
def locus_windows(locus_expr, radius, coord_expr=None, _localize=True):
    """Compute, for every row, the index range of rows lying within a window around its locus.

    Examples
    --------

    Windows with 2bp radius for one contig with positions 1, 2, 3, 4, 5:

    >>> starts, stops = hl.linalg.utils.locus_windows(
    ...     hl.balding_nichols_model(1, 5, 5).locus,
    ...     radius=2)
    >>> starts, stops
    (array([0, 0, 0, 1, 2]), array([3, 4, 5, 5, 5]))

    The following examples involve three contigs.

    >>> loci = [{'locus': hl.Locus('1', 1), 'cm': 1.0},
    ...         {'locus': hl.Locus('1', 2), 'cm': 3.0},
    ...         {'locus': hl.Locus('1', 4), 'cm': 4.0},
    ...         {'locus': hl.Locus('2', 1), 'cm': 2.0},
    ...         {'locus': hl.Locus('2', 1), 'cm': 2.0},
    ...         {'locus': hl.Locus('3', 3), 'cm': 5.0}]

    >>> ht = hl.Table.parallelize(
    ...         loci,
    ...         hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
    ...         key=['locus'])

    Windows with 1bp radius:

    >>> hl.linalg.utils.locus_windows(ht.locus, 1)
    (array([0, 0, 2, 3, 3, 5]), array([2, 2, 3, 5, 5, 6]))

    Windows with 1cm radius:

    >>> hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    (array([0, 1, 1, 3, 3, 5]), array([1, 3, 3, 5, 5, 6]))

    Notes
    -----
    The result is a pair of 1-dimensional integer ndarrays, ``starts`` and
    ``stops``, with one entry per row. Ranges include the start index and
    exclude the stop index.

    Without `coord_expr`, ``[starts[i], stops[i])`` is, for each index ``i``,
    the largest range of row indices ``j`` satisfying
    ``contig[i] == contig[j]`` and
    ``position[i] - radius <= position[j] <= position[i] + radius``.

    The :meth:`.global_position` of `locus_expr` has to be in ascending order
    over the rows, otherwise this method fails. That ordering should hold for
    a matrix table keyed by locus or variant (and for its row table), and for
    a table that has been ordered by `locus_expr`.

    Pass `coord_expr` to define the windows by a value other than the
    position. It must be a row-indexed numeric expression that is never
    missing, never ``nan``, lives on the same source as `locus_expr`, and is
    ascending with respect to locus position within each contig; otherwise
    the function fails.

    In the last example above the coordinates are centimorgans, so
    ``[starts[i], stops[i])`` is the largest range of row indices ``j``
    satisfying ``contig[i] == contig[j]`` and
    ``cm[i] - radius <= cm[j] <= cm[i] + radius``.

    The output pairs naturally with
    :meth:`.BlockMatrix.sparsify_row_intervals`.

    Parameters
    ----------
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression on a table or matrix table.
    radius: :obj:`int` or :obj:`float`
        Non-negative radius of the window, in units of the row value.
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression giving the row value.
        Must be on the same table or matrix table as `locus_expr`.
        When omitted, the locus position is the row value.

    Returns
    -------
    (:class:`ndarray` of :obj:`int64`, :class:`ndarray` of :obj:`int64`)
        Tuple of start indices array and stop indices array.
    """
    if radius < 0:
        raise ValueError(f"locus_windows: 'radius' must be non-negative, found {radius}")

    has_coords = coord_expr is not None
    check_row_indexed('locus_windows', locus_expr)
    if has_coords:
        check_row_indexed('locus_windows', coord_expr)

    source = locus_expr._indices.source
    if locus_expr not in source._fields_inverse:
        # 'locus_expr' is a computed expression rather than an existing field:
        # materialize it (and, where needed, the coordinates) under fresh
        # field names on the source and continue with those fields.
        locus_field = Env.get_uid()
        new_fields = {locus_field: locus_expr}

        if has_coords:
            if coord_expr in source._fields_inverse:
                coord_field = source._fields_inverse[coord_expr]
            else:
                coord_field = Env.get_uid()
                new_fields[coord_field] = coord_expr

        add_row_fields = source.annotate_rows if isinstance(source, hl.MatrixTable) else source.annotate
        annotated = add_row_fields(**new_fields)

        locus_expr = annotated[locus_field]
        if has_coords:
            coord_expr = annotated[coord_field]

    if not has_coords:
        coord_expr = locus_expr.position

    # Collect the coordinates of each contig, keyed by the first locus of that contig.
    reference = locus_expr.dtype.reference_genome
    contig_start = hl.locus(locus_expr.contig, 1, reference_genome=reference)
    contig_group_expr = hl.agg.group_by(contig_start, hl.agg.collect(coord_expr))

    def keep_if_ascending(previous, current):
        # Fold step: carry the newest global position, failing on any decrease.
        return (hl.case()
                .when(previous <= current, current)
                .or_error("locus_windows: 'locus_expr' global position must be in ascending order."))

    global_positions = hl.agg.collect(
        hl.case()
        .when(hl.is_defined(locus_expr), locus_expr.global_position())
        .or_error("locus_windows: missing value for 'locus_expr'."))

    # Starting the fold at -1 means the result stays negative only when no rows were seen.
    last_pos = hl.fold(keep_if_ascending, -1, global_positions)
    checked_contig_groups = (hl.case()
                             .when(last_pos >= 0, contig_group_expr)
                             .or_error("locus_windows: 'locus_expr' has length 0"))

    aggregate = locus_expr._aggregation_method()
    contig_groups = aggregate(checked_contig_groups, _localize=False)

    # Sort the (contig, coordinates) pairs and keep only the coordinate arrays.
    per_contig_coords = hl.sorted(hl.array(contig_groups)).map(lambda pair: pair[1])
    starts_and_stops = hl._locus_windows_per_contig(per_contig_coords, radius)

    if _localize:
        starts, stops = hl.eval(starts_and_stops)
        return np.array(starts), np.array(stops)

    # Internal mode: hand back the unevaluated expression.
    return starts_and_stops
コード例 #49
0
ファイル: test_table.py プロジェクト: lfrancioli/hail
 def test_expand_types(self):
     """A locus key run through ``expand_types`` should match an unkeyed contig/position struct."""
     # Table keyed by a locus, then expanded.
     expanded = hl.utils.range_table(10)
     locus_key = hl.locus('1', expanded.idx + 1)
     expanded = expanded.key_by(x=locus_key).expand_types()

     # Unkeyed table holding the equivalent plain struct.
     expected = hl.utils.range_table(10).key_by()
     plain_struct = hl.struct(contig='1', position=expected.idx + 1)
     expected = expected.annotate(x=plain_struct)

     self.assertTrue(expanded._same(expected))