Esempio n. 1
0
 def filter_by_coordinates(self, gp_range, nlp_range):
     """Keep only the rows of ``self.ht`` inside a rectangular window.

     ``gp_range`` and ``nlp_range`` are two-element sequences ``[low, high]``;
     both bounds are inclusive. ``gp_range`` is compared (as int64) with the
     ``global_position`` field and ``nlp_range`` (as float64) with the
     ``neg_log_pval`` field. Returns the filtered table.
     """
     assert len(gp_range) == 2 and len(nlp_range) == 2

     table = self.ht
     position_window = hl.interval(hl.int64(gp_range[0]),
                                   hl.int64(gp_range[1]),
                                   includes_start=True,
                                   includes_end=True)
     pval_window = hl.interval(hl.float64(nlp_range[0]),
                               hl.float64(nlp_range[1]),
                               includes_start=True,
                               includes_end=True)

     # A row survives only when it lies inside both windows.
     inside = (position_window.contains(table.global_position)
               & pval_window.contains(table.neg_log_pval))
     return table.filter(inside)
Esempio n. 2
0
    def test_row_joins_into_table(self):
        """Check ``Table.index`` when the join keys are matrix table row fields.

        Exercises joins on the full row key, on the first row key field, on
        non-key row fields and into interval-keyed tables, together with the
        key combinations that must raise ``ExpressionException``.
        """
        base_mt = hl.utils.range_matrix_table(9, 13, 3)
        mt1 = base_mt.key_rows_by(idx=base_mt.row_idx)
        mt1 = mt1.select_rows(v=mt1.idx + 2)
        mt2 = base_mt.key_rows_by(idx=base_mt.row_idx,
                                  idx2=base_mt.row_idx + 1)
        mt2 = mt2.select_rows(v=mt2.idx + 2)

        t1 = hl.utils.range_table(10, 3)
        t2 = t1.key_by(t1.idx, idx2=t1.idx + 1)
        t1 = t1.select(v=t1.idx + 2)
        t2 = t2.select(v=t2.idx + 2)

        tinterval1 = t1.key_by(
            k=hl.interval(t1.idx, t1.idx, includes_start=True, includes_end=True))
        tinterval1 = tinterval1.select(v=tinterval1.idx + 2)
        tinterval2 = t2.key_by(
            k=hl.interval(t2.key, t2.key, includes_start=True, includes_end=True))
        tinterval2 = tinterval2.select(v=tinterval2.idx + 2)

        def structs(first, stop):
            # Expected join results: one Struct(v=...) per value in [first, stop).
            return [hl.Struct(v=v) for v in range(first, stop)]

        def joined(table, *keys):
            return table.index(*keys).collect()

        def assert_rejected(table, *keys):
            with self.assertRaises(hl.expr.ExpressionException):
                table.index(*keys).collect()

        values = structs(2, 11)

        # join on mt row key
        accepted_row_key_joins = [
            (t1, (mt1.row_key,)),
            (t2, (mt2.row_key,)),
            (t1, (mt1.idx,)),
            (t2, (mt2.idx, mt2.idx2)),
            (t1, (mt2.idx,)),
        ]
        for table, keys in accepted_row_key_joins:
            self.assertEqual(joined(table, *keys), values)
        assert_rejected(t2, mt2.idx)
        assert_rejected(t2, mt1.row_key)

        # join on not mt row key
        self.assertEqual(joined(t1, mt1.v), structs(4, 12) + [None])
        self.assertEqual(joined(t2, mt2.idx2, mt2.v), structs(3, 12))
        assert_rejected(t2, mt2.v)

        # join on interval of first field of mt row key
        for key in (mt1.idx, mt1.row_key, mt2.idx):
            self.assertEqual(joined(tinterval1, key), values)

        assert_rejected(tinterval1, mt2.row_key)
        assert_rejected(tinterval2, mt2.idx)
        assert_rejected(tinterval2, mt2.row_key)
        assert_rejected(tinterval2, mt2.idx, mt2.idx2)
Esempio n. 3
0
    def test_row_joins_into_table(self):
        """Check ``Table.index`` when the join keys are matrix table row fields.

        Exercises joins on the full row key, on the first row key field, on
        non-key row fields and into interval-keyed tables, together with the
        key combinations that must raise ``ExpressionException``.
        """
        base_mt = hl.utils.range_matrix_table(9, 13, 3)
        mt1 = base_mt.key_rows_by(idx=base_mt.row_idx)
        mt1 = mt1.select_rows(v=mt1.idx + 2)
        mt2 = base_mt.key_rows_by(idx=base_mt.row_idx,
                                  idx2=base_mt.row_idx + 1)
        mt2 = mt2.select_rows(v=mt2.idx + 2)

        t1 = hl.utils.range_table(10, 3)
        t2 = t1.key_by(t1.idx, idx2=t1.idx + 1)
        t1 = t1.select(v=t1.idx + 2)
        t2 = t2.select(v=t2.idx + 2)

        tinterval1 = t1.key_by(
            k=hl.interval(t1.idx, t1.idx, includes_start=True, includes_end=True))
        tinterval1 = tinterval1.select(v=tinterval1.idx + 2)
        tinterval2 = t2.key_by(
            k=hl.interval(t2.key, t2.key, includes_start=True, includes_end=True))
        tinterval2 = tinterval2.select(v=tinterval2.idx + 2)

        def structs(first, stop):
            # Expected join results: one Struct(v=...) per value in [first, stop).
            return [hl.Struct(v=v) for v in range(first, stop)]

        def joined(table, *keys):
            return table.index(*keys).collect()

        def assert_rejected(table, *keys):
            with self.assertRaises(hl.expr.ExpressionException):
                table.index(*keys).collect()

        values = structs(2, 11)

        # join on mt row key
        accepted_row_key_joins = [
            (t1, (mt1.row_key,)),
            (t2, (mt2.row_key,)),
            (t1, (mt1.idx,)),
            (t2, (mt2.idx, mt2.idx2)),
            (t1, (mt2.idx,)),
        ]
        for table, keys in accepted_row_key_joins:
            self.assertEqual(joined(table, *keys), values)
        assert_rejected(t2, mt2.idx)
        assert_rejected(t2, mt1.row_key)

        # join on not mt row key
        self.assertEqual(joined(t1, mt1.v), structs(4, 12) + [None])
        self.assertEqual(joined(t2, mt2.idx2, mt2.v), structs(3, 12))
        assert_rejected(t2, mt2.v)

        # join on interval of first field of mt row key
        for key in (mt1.idx, mt1.row_key, mt2.idx):
            self.assertEqual(joined(tinterval1, key), values)

        assert_rejected(tinterval1, mt2.row_key)
        assert_rejected(tinterval2, mt2.idx)
        assert_rejected(tinterval2, mt2.row_key)
        assert_rejected(tinterval2, mt2.idx, mt2.idx2)
Esempio n. 4
0
def create_gene_map_ht(ht, check_gene_contigs=False):
    """Aggregate a VEP-annotated variant table into one row per gene.

    The table is run through gnomAD's ``process_consequences``, exploded on
    ``vep.worst_csq_by_gene_canonical`` and grouped by ``gene_id`` and
    ``gene_symbol``. Each output row carries:

    * ``interval`` -- from the minimum to the maximum variant position seen
      for the gene, on a contig taken from one of its variants;
    * ``variants`` -- variant IDs (``contig:position_ref/alt``) grouped by the
      value returned by ``annotation_case_builder``.

    When ``check_gene_contigs`` is true, an assertion first verifies that
    every gene's variants sit on a single contig.
    """
    from gnomad.utils.vep import process_consequences

    ht = process_consequences(ht)
    ht = ht.explode(ht.vep.worst_csq_by_gene_canonical)

    locus = ht.locus
    variant_id = (locus.contig + ':' + hl.str(locus.position) + '_' +
                  ht.alleles[0] + '/' + ht.alleles[1])
    ht = ht.annotate(
        variant_id=variant_id,
        annotation=annotation_case_builder(ht.vep.worst_csq_by_gene_canonical))

    # Both aggregations below group on the same pair of gene fields.
    csq = ht.vep.worst_csq_by_gene_canonical
    gene_key = {'gene_id': csq.gene_id, 'gene_symbol': csq.gene_symbol}

    if check_gene_contigs:
        gene_contigs = ht.group_by(**gene_key).aggregate(
            contigs=hl.agg.collect_as_set(ht.locus.contig))
        assert gene_contigs.all(hl.len(gene_contigs.contigs) == 1)

    contig = hl.agg.take(ht.locus.contig, 1)[0]
    first_locus = hl.locus(contig, hl.agg.min(ht.locus.position))
    last_locus = hl.locus(contig, hl.agg.max(ht.locus.position))

    grouped = ht.group_by(**gene_key).partition_hint(100)
    return grouped.aggregate(
        interval=hl.interval(start=first_locus, end=last_locus),
        variants=hl.agg.group_by(ht.annotation, hl.agg.collect(ht.variant_id)),
    )
Esempio n. 5
0
def _dumps_partitions(partitions, row_key_type):
    """Serialise partition intervals to JSON, normalised to struct endpoints.

    ``partitions`` must be an array-of-intervals expression. If the interval
    point type equals the type of the first field of ``row_key_type``, each
    endpoint is wrapped in a one-field struct named after that field.
    Otherwise the point type must already be a struct that is a prefix of
    ``row_key_type``.

    Returns a ``(json_string, partitions_dtype)`` tuple.

    Raises ``ValueError`` when the partitions type is not acceptable.
    """
    parts_type = partitions.dtype
    is_interval_array = (isinstance(parts_type, hl.tarray)
                         and isinstance(parts_type.element_type, hl.tinterval))
    if not is_interval_array:
        raise ValueError(
            f'partitions type invalid: {parts_type} must be array of intervals'
        )

    point_type = parts_type.element_type.point_type
    first_field, first_type = next(iter(row_key_type.items()))

    if point_type == first_type:

        def as_struct_interval(iv):
            # Lift bare key values into single-field structs.
            return hl.interval(start=hl.struct(**{first_field: iv.start}),
                               end=hl.struct(**{first_field: iv.end}),
                               includes_start=iv.includes_start,
                               includes_end=iv.includes_end)

        partitions = hl.map(as_struct_interval, partitions)
    elif not isinstance(point_type, hl.tstruct):
        raise ValueError(
            f'partitions has wrong type: {point_type} must be struct or type of first row key field'
        )
    elif not point_type._is_prefix_of(row_key_type):
        raise ValueError(
            f'partitions type invalid: {point_type} must be prefix of {row_key_type}'
        )

    final_type = partitions.dtype
    encoded = json.dumps(final_type._convert_to_json(hl.eval(partitions)))
    return encoded, partitions.dtype
Esempio n. 6
0
 def test_interval_join(self):
     """Join a range table against interval keys and check every row.

     Four intervals ``[10*i, 10*i + 5)`` are built from a 4-row range table.
     Rows whose ``idx % 10 < 5`` must match interval ``idx // 10``; every
     other row must have a missing match.
     """
     points = hl.utils.range_table(50, n_partitions=10)

     buckets = hl.utils.range_table(4)
     lower = buckets.idx * 10
     buckets = buckets.key_by(interval=hl.interval(lower, lower + 5))

     points = points.annotate(interval_matches=buckets.index(points.key))
     match = points.interval_matches
     row_is_correct = (hl.case()
                       .when(points.idx % 10 < 5, match.idx == points.idx // 10)
                       .default(hl.is_missing(match)))
     self.assertTrue(points.all(row_is_correct))
Esempio n. 7
0
 def test_interval_join(self):
     """Join a range table against interval keys and check every row.

     Four intervals ``[10*i, 10*i + 5)`` are built from a 4-row range table.
     Rows whose ``idx % 10 < 5`` must match interval ``idx // 10``; every
     other row must have a missing match.
     """
     points = hl.utils.range_table(50, n_partitions=10)

     buckets = hl.utils.range_table(4)
     lower = buckets.idx * 10
     buckets = buckets.key_by(interval=hl.interval(lower, lower + 5))

     points = points.annotate(interval_matches=buckets.index(points.key))
     match = points.interval_matches
     row_is_correct = (hl.case()
                       .when(points.idx % 10 < 5, match.idx == points.idx // 10)
                       .default(hl.is_missing(match)))
     self.assertTrue(points.all(row_is_correct))
Esempio n. 8
0
def calculate_new_intervals(ht, n, reference_genome):
    """takes a table, keyed by exactly ['locus'] and produces a list of intervals
    suitable for repartitioning a combiner matrix table

    Parameters
    ----------
    ht : :class:`.Table`
        Table / Rows Table to compute new intervals for
    n : :obj:`int`
        Number of rows each partition should have, (last partition may be smaller)
    reference_genome: :obj:`str` or :class:`.ReferenceGenome`
        Reference genome to use. A name is resolved with
        :func:`.get_reference`.

    Returns
    -------
    :obj:`List[Interval]`
        One interval per group of ``n`` rows, followed by a final interval
        that runs to the end of the last contig of the reference genome.

    Raises
    ------
    ValueError
        If `ht` has no rows.
    """
    # The code below reads ``.contigs`` and ``.lengths``, which only exist on
    # a ReferenceGenome object, so resolve a name up front.
    if isinstance(reference_genome, str):
        reference_genome = hl.get_reference(reference_genome)

    assert list(ht.key) == ['locus']
    assert ht.locus.dtype == hl.tlocus(reference_genome=reference_genome)
    # Last position of the last contig: the end of the final interval.
    end = hl.Locus(reference_genome.contigs[-1],
                   reference_genome.lengths[reference_genome.contigs[-1]],
                   reference_genome=reference_genome)

    n_rows = ht.count()

    if n_rows == 0:
        raise ValueError('empty table!')

    ht = ht.select()
    # x is the number of preceding rows (a 0-based row index), y the next index.
    ht = ht.annotate(x=hl.scan.count())
    ht = ht.annotate(y=ht.x + 1)
    # Keep the last row of every block of n rows, plus the last row of the table.
    ht = ht.filter((ht.x // n != ht.y // n) | (ht.x == (n_rows - 1)))
    ht = ht.select()
    # Each interval starts one global position after the locus carried over
    # from the preceding kept row; the first one starts at global position 0.
    ht = ht.annotate(start=hl.or_else(
        hl.scan._prev_nonnull(
            hl.locus_from_global_position(ht.locus.global_position() + 1,
                                          reference_genome=reference_genome)),
        hl.locus_from_global_position(0, reference_genome=reference_genome)))
    ht = ht.key_by()
    ht = ht.select(
        interval=hl.interval(start=ht.start, end=ht.locus, includes_end=True))

    intervals = ht.aggregate(hl.agg.collect(ht.interval))

    # Append one more interval from just past the last boundary to the end of
    # the reference genome.
    last_st = hl.eval(
        hl.locus_from_global_position(
            hl.literal(intervals[-1].end).global_position() + 1,
            reference_genome=reference_genome))
    interval = hl.Interval(start=last_st, end=end, includes_end=True)
    intervals.append(interval)
    return intervals
Esempio n. 9
0
    def test_segment_intervals(self):
        """Segment three int32 intervals at a list of points and check the pieces."""
        source_bounds = [(0, 10), (20, 50), (52, 52)]
        rows = [hl.struct(interval=hl.interval(lo, hi))
                for lo, hi in source_bounds]
        intervals = hl.Table.parallelize(
            rows,
            schema=hl.tstruct(interval=hl.tinterval(hl.tint32)),
            key='interval')

        points1 = [-1, 5, 30, 40, 52, 53]
        segmented1 = hl.segment_intervals(intervals, points1)

        expected_bounds = [(0, 5), (5, 10), (20, 30), (30, 40), (40, 50),
                           (52, 52)]
        expected = [hl.interval(lo, hi) for lo, hi in expected_bounds]

        collected = hl.agg.collect(segmented1.interval)
        assert segmented1.aggregate(collected == expected)
Esempio n. 10
0
def get_annot_ht():
    """Load the GENCODE v31 (lifted to GRCh37) GFF3 file as an interval-keyed table.

    Only ``CDS`` features whose start and end are valid GRCh37 loci are kept.
    The ``key=value`` pairs of the ninth GFF column are parsed into a dict,
    from which ``ID``, ``gene_id``, ``gene_name``, ``gene_type`` and ``level``
    are extracted. The raw ``f0``..``f8`` columns are dropped and the table is
    keyed by the locus interval of each feature.
    """
    build = 'GRCh37'
    gff_path = f'{wd_data}/gencode.v31lift37.annotation.gff3.gz'
    raw = hl.import_table(gff_path,
                          no_header=True,
                          impute=True,
                          comment='#',
                          force=True)

    cds = raw.annotate(
        GFF_Columns=raw.f8.split(";").map(lambda pair: pair.split("=")))
    cds = cds.filter(cds.f2 == "CDS")  # only want coding sequences, not entire genes

    # Contig names carry a three-character prefix that is sliced off (f0[3:]);
    # both the start (f3) and end (f4) must be valid loci.
    for position_field in ('f3', 'f4'):
        cds = cds.filter(
            hl.is_valid_locus(cds.f0[3:], cds[position_field], build))

    contig = cds.f0[3:]
    cds = cds.annotate(interval=hl.interval(hl.locus(contig, cds.f3, build),
                                            hl.locus(contig, cds.f4, build)))
    cds = cds.annotate(
        GFF_Columns=hl.dict(cds.GFF_Columns.map(lambda pair: (pair[0], pair[1]))))

    attributes = cds.GFF_Columns
    wanted = ("ID", "gene_id", "gene_name", "gene_type", "level")
    cds = cds.annotate(**{name: attributes[name] for name in wanted})
    cds = cds.annotate(type=cds.f2,
                       gene_score=cds.f5,
                       gene_strand=cds.f6,
                       gene_phase=cds.f7)

    raw_columns = ('GFF_Columns', 'f8', 'f0', 'f1', 'f2', 'f3', 'f4', 'f5',
                   'f6', 'f7')
    cds = cds.drop(*raw_columns)
    return cds.key_by(cds.interval)
Esempio n. 11
0
    def test_constructors(self):
        """Check the row field types produced by locus and interval constructors."""
        rg = hl.ReferenceGenome("foo", ["1"], {"1": 100})

        row_type = hl.tstruct(a=hl.tfloat64,
                              b=hl.tfloat64,
                              c=hl.tint32,
                              d=hl.tint32)
        kt = hl.Table.parallelize([{'a': 2.0, 'b': 4.0, 'c': 1, 'd': 5}],
                                  row_type)
        kt = kt.annotate(d=hl.int64(kt.d))

        def locus_on_rg(position):
            return hl.locus("1", position, reference_genome=rg)

        kt = kt.annotate(
            l1=hl.parse_locus("1:51"),
            l2=locus_on_rg(51),
            i1=hl.parse_locus_interval("1:51-56", reference_genome=rg),
            i2=hl.interval(locus_on_rg(51), locus_on_rg(56)))

        expected_schema = {
            'a': hl.tfloat64,
            'b': hl.tfloat64,
            'c': hl.tint32,
            'd': hl.tint64,
            'l1': hl.tlocus(),
            'l2': hl.tlocus(rg),
            'i1': hl.tinterval(hl.tlocus(rg)),
            'i2': hl.tinterval(hl.tlocus(rg)),
        }

        matches = [expected_schema[name] == dtype
                   for name, dtype in kt.row.dtype.items()]
        self.assertTrue(all(matches))
Esempio n. 12
0
def create_all_values():
    """Return a struct expression with one field per kind of value.

    Fields cover numeric, struct, set, dict, locus, interval, call and tuple
    values; each ``m``-prefixed field is the missing (null) value of a type.
    """
    fields = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        'd': hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    return hl.struct(**fields)
Esempio n. 13
0
def create_all_values():
    """Return a struct expression with one field per kind of value.

    Fields cover numeric, struct, set, dict, locus, interval, call and tuple
    values; each ``m``-prefixed field is the missing (null) value of a type.
    """
    fields = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        'd': hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    return hl.struct(**fields)
Esempio n. 14
0
def create_all_values_datasets():
    """Build a cached Table and MatrixTable annotated with one value of each kind.

    The same struct of sample values is attached as globals (``global_``
    prefix) and as row fields of a 5-row table, and as globals, row fields
    (``row_``), column fields (``col_``) and entry fields (``entry_``) of a
    3x2 matrix table.

    Returns ``(all_values_table, all_values_matrix_table)``.
    """
    sample_fields = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        'd': hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    all_values = hl.struct(**sample_fields)

    def prefixed(struct, tag):
        # Same fields, each renamed to tag + original name.
        return hl.struct(**{tag + name: struct[name] for name in struct})

    table = hl.utils.range_table(5, n_partitions=3)
    table = table.annotate_globals(**prefixed(all_values, 'global_'))
    table = table.annotate(**all_values)
    all_values_table = table.cache()

    matrix = hl.utils.range_matrix_table(3, 2, n_partitions=2)
    matrix = matrix.annotate_globals(**prefixed(all_values, 'global_'))
    matrix = matrix.annotate_rows(**prefixed(all_values, 'row_'))
    matrix = matrix.annotate_cols(**prefixed(all_values, 'col_'))
    matrix = matrix.annotate_entries(**prefixed(all_values, 'entry_'))
    all_values_matrix_table = matrix.cache()

    return all_values_table, all_values_matrix_table
Esempio n. 15
0
def create_all_values_datasets():
    """Build a cached Table and MatrixTable annotated with one value of each kind.

    The same struct of sample values is attached as globals (``global_``
    prefix) and as row fields of a 5-row table, and as globals, row fields
    (``row_``), column fields (``col_``) and entry fields (``entry_``) of a
    3x2 matrix table.

    Returns ``(all_values_table, all_values_matrix_table)``.
    """
    sample_fields = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        'd': hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    all_values = hl.struct(**sample_fields)

    def prefixed(struct, tag):
        # Same fields, each renamed to tag + original name.
        return hl.struct(**{tag + name: struct[name] for name in struct})

    table = hl.utils.range_table(5, n_partitions=3)
    table = table.annotate_globals(**prefixed(all_values, 'global_'))
    table = table.annotate(**all_values)
    all_values_table = table.cache()

    matrix = hl.utils.range_matrix_table(3, 2, n_partitions=2)
    matrix = matrix.annotate_globals(**prefixed(all_values, 'global_'))
    matrix = matrix.annotate_rows(**prefixed(all_values, 'row_'))
    matrix = matrix.annotate_cols(**prefixed(all_values, 'col_'))
    matrix = matrix.annotate_entries(**prefixed(all_values, 'entry_'))
    all_values_matrix_table = matrix.cache()

    return all_values_table, all_values_matrix_table
Esempio n. 16
0
def init(doctest_namespace):
    """Populate ``doctest_namespace`` with the objects the doctests refer to.

    This is a generator: it changes the working directory to ``docs/``, fills
    the namespace, yields once, and then restores the original working
    directory. The directory is restored in a ``finally`` clause so that a
    failure during setup (or the generator being closed early) does not leave
    the process inside ``docs/``.

    Parameters
    ----------
    doctest_namespace : dict
        Mapping that receives the example datasets, tables and expressions.
    """
    # This gets run once per process -- must avoid race conditions
    print("setting up doctest...")

    olddir = os.getcwd()
    os.chdir("docs/")

    try:
        doctest_namespace['hl'] = hl
        doctest_namespace['agg'] = agg

        if not os.path.isdir("output/"):
            try:
                os.mkdir("output/")
            except OSError:
                # Deliberate best effort: another process may have created
                # the directory between the check and the mkdir.
                pass

        # Remove outputs left over from a previous run.
        files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
        for f in files:
            if os.path.isdir(f):
                shutil.rmtree(f)

        ds = hl.read_matrix_table('data/example.vds')
        doctest_namespace['ds'] = ds
        doctest_namespace['dataset'] = ds
        doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
        doctest_namespace['dataset_to_union_1'] = ds
        doctest_namespace['dataset_to_union_2'] = ds

        v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
        doctest_namespace['v_metadata'] = v_metadata

        s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
        doctest_namespace['s_metadata'] = s_metadata

        # Table
        table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
        table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
        doctest_namespace['table1'] = table1
        doctest_namespace['other_table'] = table1

        table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
        doctest_namespace['table2'] = table2

        table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                                 types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                        'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                        'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
        doctest_namespace['table4'] = table4

        people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                       types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                       key='Name')
        doctest_namespace['people_table'] = people_table

        # TDT
        doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

        ds2 = hl.variant_qc(ds)
        doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

        # Expressions
        doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
        doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
        doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
        doctest_namespace['t'] = hl.literal(True)
        doctest_namespace['f'] = hl.literal(False)
        doctest_namespace['na'] = hl.null(hl.tbool)
        doctest_namespace['call'] = hl.call(0, 1, phased=False)
        doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
        doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
        doctest_namespace['interval'] = hl.interval(3, 11)
        doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
        doctest_namespace['locus'] = hl.locus('1', 1034245)
        doctest_namespace['x'] = hl.literal(3)
        doctest_namespace['y'] = hl.literal(4.5)
        doctest_namespace['s1'] = hl.literal({1, 2, 3})
        doctest_namespace['s2'] = hl.literal({1, 3, 5})
        doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
        doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
        doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
        doctest_namespace['s'] = hl.literal('The quick brown fox')
        doctest_namespace['interval2'] = hl.Interval(3, 6)

        # Overview
        doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
        doctest_namespace['mt'] = ds

        gnomad_data = ds.rows()
        doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

        # BGEN
        bgen = hl.import_bgen('data/example.8bits.bgen',
                              entry_fields=['GT', 'GP', 'dosage'])
        doctest_namespace['variants_table'] = bgen.rows()

        print("finished setting up doctest...")
        yield
    finally:
        # Always undo the chdir, even when setup raised before the yield.
        os.chdir(olddir)
Esempio n. 17
0
def generate_datasets(doctest_namespace):
    """Populate ``doctest_namespace`` with example datasets for the doctests.

    Imports the sample VCF, annotates it with placeholder row, column and
    global fields, checkpoints it to ``output/example.mt`` and registers it
    under several names. Also registers example tables, expressions, a BGEN
    rows table, a burden-test dataset (checkpointed to
    ``output/example_burden.vds``) and LD score regression summary
    statistics.

    Paths are relative (``data/...`` and ``output/...``), so the caller is
    expected to run this from a directory containing both.

    Parameters
    ----------
    doctest_namespace : dict
        Mapping that receives the generated objects; it is mutated in place.
    """
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    # Main example dataset: a 3% sample of the rows of the sample VCF with
    # placeholder annotations.
    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    # Note: this overwrites the string-valued `gene` row field set above with
    # an array.
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint(f'output/example.mt', overwrite=True)

    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    # Burden-test dataset: per-sample burden table joined onto columns,
    # position-based row weights, variant QC and a gene interval annotation.
    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    # LD score regression: summary statistics for a single phenotype.
    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    # LD score regression: all phenotypes as a matrix table. Each entry is
    # imported as a string and split on "," into chi_squared and n.
    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
Esempio n. 18
0
    ht = ht.annotate(locus=hl.locus('chr' + ht['chromosome'].replace('MT', 'M'), ht['position'], 'hg19'))
    if args.b == 'GRCh37':
        hg19.add_liftover('gs://hail-datasets-extracted-data/assemblies/hg19tob37.chain.gz', 'GRCh37')
        ht = ht.annotate(locus=hl.liftover(ht['locus'], 'GRCh37'))
    if args.b == 'GRCh38':
        hg19.add_liftover('gs://hail-datasets-extracted-data/assemblies/hg19ToHg38.over.chain.gz', 'GRCh38')
        ht = ht.annotate(locus=hl.liftover(ht['locus'], 'GRCh38'))
    ht = ht.filter(hl.is_defined(ht['locus']))
    ht = ht.select('locus', 'N', 'S')
    ht = ht.key_by('locus')

# GERP++ elements: import the table, build a locus interval per row on hg19,
# optionally lift it over to the build requested in args.b, and key the table
# by that interval.
if args.d == 'elements':
    name = 'GERP_elements'
    ht = hl.import_table('gs://hail-datasets-extracted-data/GERP++/GERP++_elements.hg19.tsv.bgz',
                         types={'start': hl.tint, 'end': hl.tint, 'S': hl.tfloat, 'p_value': hl.tfloat})
    ht = ht.annotate(interval=hl.interval(hl.locus(ht['chromosome'], ht['start'], 'hg19'),
                                          hl.locus(ht['chromosome'], ht['end'], 'hg19')))
    # `hg19` is defined outside this fragment; presumably the hg19
    # ReferenceGenome object -- verify against the top of the script.
    if args.b == 'GRCh37':
        hg19.add_liftover('gs://hail-datasets-extracted-data/assemblies/hg19tob37.chain.gz', 'GRCh37')
        ht = ht.annotate(interval=hl.liftover(ht['interval'], 'GRCh37'))
    if args.b == 'GRCh38':
        hg19.add_liftover('gs://hail-datasets-extracted-data/assemblies/hg19ToHg38.over.chain.gz', 'GRCh38')
        ht = ht.annotate(interval=hl.liftover(ht['interval'], 'GRCh38'))
    # Drop rows whose interval is missing (presumably those that could not be
    # lifted over).
    ht = ht.filter(hl.is_defined(ht['interval']))
    ht = ht.select('interval', 'S', 'p_value')
    ht = ht.key_by('interval')

n_rows = ht.count()
n_partitions = ht.n_partitions()
ht = ht.annotate_globals(metadata=hl.struct(name=name,
                                            version='GERP++',
                                            reference_genome=args.b,
Esempio n. 19
0
def create_genome_intervals_file() -> hl.Table:
    """Build a table of genic and intergenic intervals for protein-coding genes.

    Protein-coding ``gene`` records from ``_load_gencode_gtf()`` are written
    to a temporary table and read back. For every gene, the gap between the
    end of the preceding gene (as found by a scan) and the start of the
    current gene is considered, provided both lie on the same contig and the
    gap has positive length. That gap is split at its midpoint: the first
    half is attributed to the preceding gene and the second half to the
    current gene.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``interval`` with fields ``gene_id``, ``gene_name`` and
        ``within_gene`` (``True`` for the gene's own interval, ``False`` for
        the intergenic halves).
    """
    # Load GTF file
    # NOTE(review): this path is '/tmp_<uuid>.ht' at the filesystem root, not
    # under '/tmp/' -- confirm that this location is intended.
    tmp_path = f'/tmp_{uuid.uuid4()}.ht'
    ht = _load_gencode_gtf()
    ht.filter((ht.feature == 'gene')
              & (ht.gene_type == 'protein_coding')).write(tmp_path, True)

    # Scan to get bounds, create intervals.
    # The table on disk already contains only protein-coding genes, so it is
    # not filtered a second time after reading it back.
    ht = hl.read_table(tmp_path)
    ht = ht.select('gene_id', 'gene_name')
    # Among the rows seen so far, keep the single row that sorts first by
    # negated global start position.
    last_locus = hl.scan.take(ht.row,
                              1,
                              ordering=-ht.interval.start.global_position())
    # Gap between the previous gene's end and this gene's start; missing when
    # there is no previous gene or it is on a different contig.
    intergenic_region = hl.or_missing(
        (hl.len(last_locus) > 0) &
        (last_locus[0].interval.end.contig == ht.interval.start.contig),
        hl.interval(last_locus[0].interval.end, ht.interval.start))
    ht = ht.annotate(last_locus=last_locus,
                     intergenic_region=intergenic_region)
    # Discard gaps of non-positive length (e.g. overlapping genes).
    intergenic_length = ht.intergenic_region.end.position - ht.intergenic_region.start.position
    intergenic_region = hl.or_missing(intergenic_length > 0,
                                      ht.intergenic_region)
    # Half of the gap length, truncated to an integer.
    intergenic_dist = hl.int(
        (intergenic_region.end.position - intergenic_region.start.position) /
        2)
    chrom = ht.interval.start.contig

    def interval(pos1, pos2):
        return hl.interval(hl.locus(chrom, pos1), hl.locus(chrom, pos2))

    ht = ht.transmute(
        intergenic_region1=hl.or_missing(
            hl.is_defined(intergenic_region),
            interval(
                intergenic_region.start.position +
                1,  # gene interval is closed
                intergenic_region.start.position + intergenic_dist)),
        intergenic_region2=hl.or_missing(
            hl.is_defined(intergenic_region),
            interval(intergenic_region.start.position + intergenic_dist,
                     intergenic_region.end.position))).key_by()
    # Every row yields its own gene interval ...
    regions = hl.array([
        hl.struct(interval=ht.interval,
                  gene_id=ht.gene_id,
                  gene_name=ht.gene_name,
                  within_gene=True)
    ])
    # ... plus, when a gap exists, the half attributed to the previous gene
    # and the half attributed to the current gene.
    regions = hl.if_else(
        hl.is_defined(ht.intergenic_region1),
        regions.extend([
            hl.struct(interval=ht.intergenic_region1,
                      gene_id=ht.last_locus[0].gene_id,
                      gene_name=ht.last_locus[0].gene_name,
                      within_gene=False),
            hl.struct(interval=ht.intergenic_region2,
                      gene_id=ht.gene_id,
                      gene_name=ht.gene_name,
                      within_gene=False)
        ]), regions)
    # One output row per region, with the region's fields as the row fields.
    ht = ht.annotate(regions=regions).explode('regions')
    ht = ht.select(**ht.regions)
    return ht.key_by('interval')
Esempio n. 20
0
 def interval(pos1, pos2):
     """Return the interval between two positions on the contig ``chrom``.

     ``chrom`` is not a parameter; it is taken from the surrounding scope.
     """
     start_locus = hl.locus(chrom, pos1)
     end_locus = hl.locus(chrom, pos2)
     return hl.interval(start_locus, end_locus)
Esempio n. 21
0
def generate_datasets(doctest_namespace):
    """Populate ``doctest_namespace`` with the objects used by the doctests.

    The function stores the ``hl`` module, a matrix table read from
    ``data/example.vds`` (under several aliases), tables imported from the
    ``data/`` directory, and a set of small literal expressions under fixed
    string keys of ``doctest_namespace``.

    Parameters
    ----------
    doctest_namespace : dict-like
        Mapping that is mutated in place; nothing is returned.
    """
    doctest_namespace['hl'] = hl

    # Make sure "output/" exists; an OSError while creating it is ignored.
    if not os.path.isdir("output/"):
        try:
            os.mkdir("output/")
        except OSError:
            pass

    # Remove these directories from the working directory if they exist.
    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    # The same matrix table is registered under several names.
    ds = hl.read_matrix_table('data/example.vds')
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    # Row table of ds with an extra global and a constant 'consequence' field.
    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    # Column table of ds with constant 'pop', 'is_case' and 'sex' fields.
    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    # Columns B, D and E are given explicit struct types; the remaining
    # columns are imputed.
    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    # This file is split on the '\\s+' delimiter pattern rather than the
    # import_table default.
    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    # ds with variant QC applied, keeping only the AF row field.
    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    # Unlike 'interval' above, this one is built with hl.Interval (capital I)
    # rather than the hl.interval expression constructor.
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    print("finished setting up doctest...")
# Read the GENCODE v19 transcripts GTF as a headerless table: lines starting
# with '#' are comments, '.' marks a missing value, and columns f3/f4 are
# parsed as integers.
ht_transcripts = hl.import_table('gs://hail-datasets/raw-data/gtex/v7/reference/gencode.v19.transcripts.patched_contigs.gtf',
                                 comment='#', no_header=True, types={'f3': hl.tint, 'f4': hl.tint}, missing='.', min_partitions=12)

# Give the positional columns descriptive names.
ht_transcripts = ht_transcripts.rename({'f0': 'contig',
                                        'f1': 'annotation_source',
                                        'f2': 'feature_type',
                                        'f3': 'start',
                                        'f4': 'end',
                                        'f5': 'score',
                                        'f6': 'strand',
                                        'f7': 'phase',
                                        'f8': 'attributes'})

# Keep only rows whose feature type is 'transcript'.
ht_transcripts = ht_transcripts.filter(ht_transcripts.feature_type == 'transcript')
# GRCh37 locus interval from 'start' to 'end + 1'.
# NOTE(review): the '+ 1' presumably compensates for an exclusive end bound
# in hl.interval's defaults -- confirm.
ht_transcripts = ht_transcripts.annotate(interval=hl.interval(hl.locus(ht_transcripts.contig, ht_transcripts.start, 'GRCh37'), hl.locus(ht_transcripts.contig, ht_transcripts.end + 1, 'GRCh37')))
# Turn the attributes string into a dict: split on '; ', then split each piece
# on ' ' into (key, value); double quotes and the ';$' pattern are replaced
# with '' in the value.
ht_transcripts = ht_transcripts.annotate(attributes=hl.dict(hl.map(lambda x: (x.split(' ')[0], x.split(' ')[1].replace('"', '').replace(';$', '')), ht_transcripts.attributes.split('; '))))
# Aggregate the distinct attribute keys across all rows into a local list.
attribute_cols = list(ht_transcripts.aggregate(hl.set(hl.flatten(hl.agg.collect(ht_transcripts.attributes.keys())))))
# Add one field per attribute key; the value is missing on rows lacking that key.
ht_transcripts = ht_transcripts.annotate(**{x: hl.or_missing(ht_transcripts.attributes.contains(x), ht_transcripts.attributes[x]) for x in attribute_cols})
# Keep a fixed set of fields, rename the two havana fields, and key by transcript_id.
ht_transcripts = ht_transcripts.select(*(['transcript_id', 'transcript_name', 'transcript_type', 'strand', 'transcript_status', 'havana_transcript', 'ccdsid', 'ont', 'gene_name', 'interval', 'gene_type', 'annotation_source', 'havana_gene', 'gene_status', 'tag']))
ht_transcripts = ht_transcripts.rename({'havana_transcript': 'havana_transcript_id', 'havana_gene': 'havana_gene_id'})
ht_transcripts = ht_transcripts.key_by(ht_transcripts.transcript_id)

# Import the GTEx v7 transcript expected-count matrix: rows are keyed by
# transcript_id and entries are read as floats, with '' marking a missing value.
mt = hl.import_matrix_table('gs://hail-datasets/raw-data/gtex/v7/rna-seq/processed/GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_expected_count.tsv.bgz',
                            row_fields={'transcript_id': hl.tstr, 'gene_id': hl.tstr}, row_key='transcript_id', missing='', entry_type=hl.tfloat)

# Re-key the columns by a 'sample_id' field copied from 'col_id'.
mt = mt.annotate_cols(sample_id=mt.col_id)
mt = mt.key_cols_by(mt.sample_id)

# Store the entry value 'x' as an integer 'read_count', then drop the
# original 'col_id' and 'x' fields.
mt = mt.annotate_entries(read_count=hl.int(mt.x))
mt = mt.drop(mt.col_id, mt.x)
Esempio n. 23
0
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False, min_partitions=None) -> hl.Table:
    """Import a GTF file.

       The GTF file format is identical to the GFF version 2 file format,
       and so this function can be used to import GFF version 2 files as
       well.

       See https://www.ensembl.org/info/website/upload/gff.html for more
       details on the GTF/GFF2 file format.

       The :class:`.Table` returned by this function will be keyed by the
       ``interval`` row field and will include the following row fields:

       .. code-block:: text

           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'interval': interval<>

       There will also be corresponding fields for every tag found in the
       attribute field of the GTF file.

       Note
       ----

       This function will return an ``interval`` field of type :class:`.tinterval`
       constructed from the ``seqname``, ``start``, and ``end`` fields in the
       GTF file. This interval is inclusive of both the start and end positions
       in the GTF file.

       If the ``reference_genome`` parameter is specified, the start and end
       points of the ``interval`` field will be of type :class:`.tlocus`.
       Otherwise, the start and end points of the ``interval`` field will be of
       type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
       ``position`` (type :class:`.tint32`).

       Furthermore, if the ``reference_genome`` parameter is specified and
       ``skip_invalid_contigs`` is ``True``, this import function will skip
       lines in the GTF where ``seqname`` is not consistent with the reference
       genome specified.

       Example
       -------

       >>> ht = hl.experimental.import_gtf('data/test.gtf',
       ...                                 reference_genome='GRCh37',
       ...                                 skip_invalid_contigs=True)

       >>> ht.describe()  # doctest: +NOTEST
       ----------------------------------------
       Global fields:
       None
       ----------------------------------------
       Row fields:
           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'gene_type': str
           'exon_id': str
           'havana_transcript': str
           'level': str
           'transcript_name': str
           'gene_status': str
           'gene_id': str
           'transcript_type': str
           'tag': str
           'transcript_status': str
           'gene_name': str
           'transcript_id': str
           'exon_number': str
           'havana_gene': str
           'interval': interval<locus<GRCh37>>
       ----------------------------------------
       Key: ['interval']
       ----------------------------------------

       Parameters
       ----------

       path : :obj:`str`
           File to import.
       reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
           Reference genome to use.
       skip_invalid_contigs : :obj:`bool`
           If ``True`` and `reference_genome` is not ``None``, skip lines where
           ``seqname`` is not consistent with the reference genome.
       min_partitions : :obj:`int` or :obj:`None`
           Minimum number of partitions (passed to import_table).

       Returns
       -------
       :class:`.Table`
       """

    # GTF has no header row; the nine columns arrive as f0..f8.
    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint,
                                'f4': hl.tint,
                                'f5': hl.tfloat,
                                'f7': hl.tint},
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({'f0': 'seqname',
                    'f1': 'source',
                    'f2': 'feature',
                    'f3': 'start',
                    'f4': 'end',
                    'f5': 'score',
                    'f6': 'strand',
                    'f7': 'frame',
                    'f8': 'attribute'})

    # Parse the attribute column into a dict: pieces are separated by '; ',
    # each piece is '<tag> <value>'; quotes and a trailing ';' are stripped
    # from the value.
    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    # Collect every tag present anywhere in the file so that each one can
    # become its own row field.
    attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x), ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x),
                                          ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        # The parameter may be a name or a ReferenceGenome object. Resolve it
        # to an object once so the name comparison and the contig lookup
        # below behave the same for both forms.
        if isinstance(reference_genome, str):
            rg = hl.get_reference(reference_genome)
        else:
            rg = reference_genome

        if rg.name == 'GRCh37':
            # GRCh37 contigs carry no 'chr' prefix.
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            # Otherwise make sure contigs carry a 'chr' prefix, except HLA
            # contigs, which are left (or made) unprefixed.
            ht = ht.annotate(seqname=hl.case()
                                       .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                                       .when(ht['seqname'].startswith('chrHLA'), ht['seqname'].replace('^chr', ''))
                                       .when(ht['seqname'].startswith('chr'), ht['seqname'])
                                       .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(rg.contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        # Without a reference genome the endpoints are plain structs.
        ht = ht.transmute(interval=hl.interval(hl.struct(seqname=ht['seqname'], position=ht['start']),
                                               hl.struct(seqname=ht['seqname'], position=ht['end']),
                                               includes_start=True,
                                               includes_end=True))

    ht = ht.key_by('interval')

    return ht
Esempio n. 24
0
def generate_datasets(doctest_namespace, output_dir):
    """Populate ``doctest_namespace`` with the objects used by the doctests.

    A matrix table is imported from ``data/sample.vcf.bgz``, downsampled,
    annotated with placeholder row/column/global fields, checkpointed under
    ``output_dir`` and registered under several names. Further tables, a
    burden dataset and a set of small literal expressions are registered as
    well.

    Parameters
    ----------
    doctest_namespace : dict-like
        Mapping that is mutated in place; nothing is returned.
    output_dir
        Object whose ``name`` attribute is the directory that checkpoints are
        written to (presumably a ``tempfile.TemporaryDirectory`` -- confirm
        against the caller).
    """
    doctest_namespace['hl'] = hl

    # Remove these directories from the working directory if they exist.
    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    # Base dataset: a 3% row sample of the VCF, with constant and random
    # placeholder annotations added.
    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    # 'gene' was annotated above as the string "A"; this annotation assigns
    # an array value to the same field name.
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    # Write the dataset under output_dir; the checkpoint result is what gets
    # registered below.
    ds = ds.checkpoint(f'{output_dir.name}/example.vds', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    # Row table of ds with an extra global and a constant 'consequence' field.
    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    # Column table of ds with constant 'pop', 'is_case' and 'sex' fields.
    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    # Columns B, D and E are given explicit struct types; the remaining
    # columns are imputed.
    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    # This file is split on the '\\s+' delimiter pattern rather than the
    # import_table default.
    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    # ds with variant QC applied, keeping only the AF row field.
    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    # Unlike 'interval' above, this one is built with hl.Interval (capital I)
    # rather than the hl.interval expression constructor.
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    # Burden dataset: VCF columns joined with a per-sample table, rows given
    # a position-based weight and a gene looked up by locus in an interval
    # list, then checkpointed under output_dir.
    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'{output_dir.name}/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    print("finished setting up doctest...")
Esempio n. 25
0
def import_gtf(path,
               reference_genome=None,
               skip_invalid_contigs=False,
               min_partitions=None) -> hl.Table:
    """Import a GTF file.

       The GTF file format is identical to the GFF version 2 file format,
       and so this function can be used to import GFF version 2 files as
       well.

       See https://www.ensembl.org/info/website/upload/gff.html for more
       details on the GTF/GFF2 file format.

       The :class:`.Table` returned by this function will be keyed by the
       ``interval`` row field and will include the following row fields:

       .. code-block:: text

           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'interval': interval<>

       There will also be corresponding fields for every tag found in the
       attribute field of the GTF file.

       Note
       ----

       This function will return an ``interval`` field of type :class:`.tinterval`
       constructed from the ``seqname``, ``start``, and ``end`` fields in the
       GTF file. This interval is inclusive of both the start and end positions
       in the GTF file.

       If the ``reference_genome`` parameter is specified, the start and end
       points of the ``interval`` field will be of type :class:`.tlocus`.
       Otherwise, the start and end points of the ``interval`` field will be of
       type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
       ``position`` (type :class:`.tint32`).

       Furthermore, if the ``reference_genome`` parameter is specified and
       ``skip_invalid_contigs`` is ``True``, this import function will skip
       lines in the GTF where ``seqname`` is not consistent with the reference
       genome specified.

       Example
       -------

       >>> ht = hl.experimental.import_gtf('data/test.gtf',
       ...                                 reference_genome='GRCh37',
       ...                                 skip_invalid_contigs=True)

       >>> ht.describe()  # doctest: +SKIP_OUTPUT_CHECK
       ----------------------------------------
       Global fields:
       None
       ----------------------------------------
       Row fields:
           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'gene_type': str
           'exon_id': str
           'havana_transcript': str
           'level': str
           'transcript_name': str
           'gene_status': str
           'gene_id': str
           'transcript_type': str
           'tag': str
           'transcript_status': str
           'gene_name': str
           'transcript_id': str
           'exon_number': str
           'havana_gene': str
           'interval': interval<locus<GRCh37>>
       ----------------------------------------
       Key: ['interval']
       ----------------------------------------

       Parameters
       ----------

       path : :obj:`str`
           File to import.
       reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
           Reference genome to use.
       skip_invalid_contigs : :obj:`bool`
           If ``True`` and `reference_genome` is not ``None``, skip lines where
           ``seqname`` is not consistent with the reference genome.
       min_partitions : :obj:`int` or :obj:`None`
           Minimum number of partitions (passed to import_table).

       Returns
       -------
       :class:`.Table`
       """

    # GTF has no header row; the nine columns arrive as f0..f8.
    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={
                             'f3': hl.tint,
                             'f4': hl.tint,
                             'f5': hl.tfloat,
                             'f7': hl.tint
                         },
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({
        'f0': 'seqname',
        'f1': 'source',
        'f2': 'feature',
        'f3': 'start',
        'f4': 'end',
        'f5': 'score',
        'f6': 'strand',
        'f7': 'frame',
        'f8': 'attribute'
    })

    # Parse the attribute column into a dict: pieces are separated by '; ',
    # each piece is '<tag> <value>'; quotes and a trailing ';' are stripped
    # from the value.
    ht = ht.annotate(attribute=hl.dict(
        hl.map(
            lambda x: (x.split(' ')[0], x.split(' ')[1].replace('"', '').
                       replace(';$', '')), ht['attribute'].split('; '))))

    # Collect every tag present anywhere in the file so that each one can
    # become its own row field.
    attributes = ht.aggregate(
        hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                       ht['attribute'].keys()))

    ht = ht.transmute(
        **{
            x: hl.or_missing(ht['attribute'].contains(x), ht['attribute'][x])
            for x in attributes if x
        })

    if reference_genome:
        # The parameter may be a name or a ReferenceGenome object. Resolve it
        # to an object once so the name comparison and the contig lookup
        # below behave the same for both forms.
        if isinstance(reference_genome, str):
            rg = hl.get_reference(reference_genome)
        else:
            rg = reference_genome

        if rg.name == 'GRCh37':
            # GRCh37 contigs carry no 'chr' prefix.
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            # Otherwise make sure contigs carry a 'chr' prefix, except HLA
            # contigs, which are left (or made) unprefixed.
            ht = ht.annotate(seqname=hl.case().when(
                ht['seqname'].startswith('HLA'), ht['seqname']).when(
                    ht['seqname'].startswith('chrHLA'), ht['seqname'].replace(
                        '^chr', '')).when(ht['seqname'].startswith(
                            'chr'), ht['seqname']).default('chr' +
                                                           ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(rg.contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(
            interval=hl.locus_interval(ht['seqname'],
                                       ht['start'],
                                       ht['end'],
                                       includes_start=True,
                                       includes_end=True,
                                       reference_genome=reference_genome))
    else:
        # Without a reference genome the endpoints are plain structs.
        ht = ht.transmute(interval=hl.interval(
            hl.struct(seqname=ht['seqname'], position=ht['start']),
            hl.struct(seqname=ht['seqname'], position=ht['end']),
            includes_start=True,
            includes_end=True))

    ht = ht.key_by('interval')

    return ht
Esempio n. 26
0
import hail as hl

# Read the GENCODE v30 GFF3 annotation as a headerless table (columns f0..f8)
# with imputed column types; lines starting with '#' are comments.
t = hl.import_table('/Users/bfranco/downloads/gencode.v30.annotation.gff3', no_header=True, impute=True, comment='#')

# Split the attribute column (f8) into [key, value] pairs: attributes are
# separated by ';' and written as key=value.
t2 = t.annotate(GFF_Columns=t.f8.split(";").map(lambda x: x.split("=")))

# Keep only gene records.
t2 = t2.filter(t2.f2 == "gene")

# Gene interval from the start column (f3) to the end column (f4). The end
# previously reused f3, which produced an empty interval for every gene.
# GFF3 coordinates are 1-based and inclusive at both ends, so both bounds are
# included.
t2 = t2.annotate(interval=hl.interval(hl.locus(t2.f0, t2.f3, 'GRCh38'),
                                      hl.locus(t2.f0, t2.f4, 'GRCh38'),
                                      includes_start=True,
                                      includes_end=True))

# Turn the [key, value] pairs into a dict for lookup by attribute name.
t2 = t2.annotate(GFF_Columns=hl.dict(t2.GFF_Columns.map(lambda x: (x[0], x[1]))))

t2 = t2.annotate(ID=t2.GFF_Columns["ID"], gene_id=t2.GFF_Columns["gene_id"], gene_name=t2.GFF_Columns["gene_name"], gene_type=t2.GFF_Columns["gene_type"], level=t2.GFF_Columns["level"])

t2 = t2.annotate(type=t2.f2, gene_score=t2.f5, gene_strand=t2.f6, gene_phase=t2.f7)

# Drop the raw positional columns and the intermediate attribute dict.
t2 = t2.drop(t2.GFF_Columns, t2.f8, t2.f0, t2.f1, t2.f2, t2.f3, t2.f4, t2.f5, t2.f6, t2.f7)

# NOTE(review): this writes a Table although the path ends in '.mt'; the path
# is left unchanged because downstream readers may depend on it.
t2.write('gs://hail-datasets-hail-data/gencode_v30_annotation.mt', overwrite=True)
Esempio n. 27
0
def segment_intervals(ht, points):
    """Segment the interval keys of `ht` at a given set of points.

    Every row whose interval key contains one or more of the points strictly
    between its endpoints is replaced by one row per resulting sub-interval;
    other rows keep their interval unchanged.

    Parameters
    ----------
    ht : :class:`.Table`
        Table with interval keys.
    points : :class:`.Table` or :class:`.ArrayExpression`
        Points at which to segment the intervals, a table or an array.

    Returns
    -------
    :class:`.Table`

    Raises
    ------
    ValueError
        If `ht` is not keyed by a single interval field, or if the type of
        `points` does not match the point type of those intervals.
    """
    if len(ht.key) != 1 or not isinstance(ht.key[0].dtype, hl.tinterval):
        raise ValueError(
            "'segment_intervals' expects a table with interval keys")
    point_type = ht.key[0].dtype.point_type
    if isinstance(points, Table):
        if len(points.key) != 1 or points.key[0].dtype != point_type:
            raise ValueError(
                "'segment_intervals' expects points to be a table with a single"
                " key of the same type as the intervals in 'ht', or an array of those points:"
                f"\n  expect {point_type}, found {list(points.key.dtype.values())}"
            )
        # Deduplicate the table's contents into an array expression without
        # bringing them to Python.
        points = hl.array(hl.set(points.collect(_localize=False)))
    if points.dtype.element_type != point_type:
        raise ValueError(
            f"'segment_intervals' expects points to be a table with a single"
            f" key of the same type as the intervals in 'ht', or an array of those points:"
            f"\n  expect {point_type}, found {points.dtype.element_type}")

    # The bound searches below need the points in ascending order.
    points = hl._sort_by(points, lambda l, r: hl._compare(l, r) < 0)

    # Store the sorted points as a global so each row refers to one copy.
    ht = ht.annotate_globals(__points=points)

    interval = ht.key[0]
    points = ht.__points
    # NOTE(review): _lower_bound presumably returns the index of the first
    # point that is >= the given value -- confirm against its definition.
    lower = hl.expr.functions._lower_bound(points, interval.start)
    higher = hl.expr.functions._lower_bound(points, interval.end)
    n_points = hl.len(points)
    # A point equal to an endpoint does not split the interval: step past a
    # point at the start, and step back from a point at the end.
    lower = hl.if_else((lower < n_points) & (points[lower] == interval.start),
                       lower + 1, lower)
    higher = hl.if_else((higher < n_points) & (points[higher] == interval.end),
                        higher - 1, higher)
    # hl.if_else replaces the deprecated hl.cond here, matching the two calls
    # above.
    interval_results = hl.rbind(
        lower, higher, lambda lower, higher: hl.if_else(
            lower >= higher, [interval],
            hl.flatten([
                # From the original start up to the first splitting point.
                [
                    hl.interval(interval.start,
                                points[lower],
                                includes_start=interval.includes_start,
                                includes_end=False)
                ],
                # Between consecutive splitting points.
                hl.range(lower, higher - 1).map(lambda x: hl.interval(
                    points[x],
                    points[x + 1],
                    includes_start=True,
                    includes_end=False)),
                # From the last splitting point to the original end.
                [
                    hl.interval(points[higher - 1],
                                interval.end,
                                includes_start=True,
                                includes_end=interval.includes_end)
                ],
            ])))
    # One output row per sub-interval; 'lower' and 'higher' remain on the
    # result as row fields.
    ht = ht.annotate(__new_intervals=interval_results,
                     lower=lower,
                     higher=higher).explode('__new_intervals')
    # Re-key under the original key name using the new sub-intervals.
    return ht.key_by(**{
        list(ht.key)[0]: ht.__new_intervals
    }).drop('__new_intervals')
Esempio n. 28
0
 def _coerce(self, x):
     """Coerce both endpoints of an interval expression to ``self.point_type``.

     The inclusiveness flags of ``x`` are carried over unchanged.
     """
     assert isinstance(x, hl.expr.IntervalExpression)
     point_type = self.point_type
     coerced_start = point_type.coerce(x.start)
     coerced_end = point_type.coerce(x.end)
     return hl.interval(coerced_start,
                        coerced_end,
                        includes_start=x.includes_start,
                        includes_end=x.includes_end)
Esempio n. 29
0
def import_gtf(path,
               reference_genome=None,
               skip_invalid_contigs=False,
               min_partitions=None,
               force_bgz=False,
               force=False) -> hl.Table:
    """Import a GTF file.

       The GTF file format is identical to the GFF version 2 file format,
       and so this function can be used to import GFF version 2 files as
       well.

       See https://www.ensembl.org/info/website/upload/gff.html for more
       details on the GTF/GFF2 file format.

       The :class:`.Table` returned by this function will be keyed by the
       ``interval`` row field and will include the following row fields:

       .. code-block:: text

           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'interval': interval<>

       There will also be corresponding fields for every tag found in the
       attribute field of the GTF file.

       Note
       ----

       This function will return an ``interval`` field of type :class:`.tinterval`
       constructed from the ``seqname``, ``start``, and ``end`` fields in the
       GTF file. This interval is inclusive of both the start and end positions
       in the GTF file.

       If the ``reference_genome`` parameter is specified, the start and end
       points of the ``interval`` field will be of type :class:`.tlocus`.
       Otherwise, the start and end points of the ``interval`` field will be of
       type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
       ``position`` (type :obj:`.tint32`).

       Furthermore, if the ``reference_genome`` parameter is specified and
       ``skip_invalid_contigs`` is ``True``, this import function will skip
       lines in the GTF where ``seqname`` is not consistent with the reference
       genome specified.

       Example
       -------

       >>> ht = hl.experimental.import_gtf('data/test.gtf',
       ...                                 reference_genome='GRCh37',
       ...                                 skip_invalid_contigs=True)

       >>> ht.describe()  # doctest: +SKIP_OUTPUT_CHECK
       ----------------------------------------
       Global fields:
       None
       ----------------------------------------
       Row fields:
           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'gene_type': str
           'exon_id': str
           'havana_transcript': str
           'level': str
           'transcript_name': str
           'gene_status': str
           'gene_id': str
           'transcript_type': str
           'tag': str
           'transcript_status': str
           'gene_name': str
           'transcript_id': str
           'exon_number': str
           'havana_gene': str
           'interval': interval<locus<GRCh37>>
       ----------------------------------------
       Key: ['interval']
       ----------------------------------------

       Parameters
       ----------

       path : :class:`str`
           File to import.
       reference_genome : :class:`str` or :class:`.ReferenceGenome`, optional
           Reference genome to use. A string is resolved to the reference
           genome of that name with :func:`.get_reference`.
       skip_invalid_contigs : :obj:`bool`
           If ``True`` and `reference_genome` is not ``None``, skip lines where
           ``seqname`` is not consistent with the reference genome.
       min_partitions : :obj:`int` or :obj:`None`
           Minimum number of partitions (passed to import_table).
       force_bgz : :obj:`bool`
           If ``True``, load files as blocked gzip files, assuming
           that they were actually compressed using the BGZ codec. This option is
           useful when the file extension is not ``'.bgz'``, but the file is
           blocked gzip, so that the file can be read in parallel and not on a
           single node.
       force : :obj:`bool`
           If ``True``, load gzipped files serially on one core. This should
           be used only when absolutely necessary, as processing time will be
           increased due to lack of parallelism.

       Returns
       -------
       :class:`.Table`
       """

    # The file is read without a header, so columns arrive as f0..f8; only
    # the numeric ones need an explicit type, '.' marks a missing value.
    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={
                             'f3': hl.tint,
                             'f4': hl.tint,
                             'f5': hl.tfloat,
                             'f7': hl.tint
                         },
                         missing='.',
                         delimiter='\t',
                         force_bgz=force_bgz,
                         force=force)

    ht = ht.rename({
        'f0': 'seqname',
        'f1': 'source',
        'f2': 'feature',
        'f3': 'start',
        'f4': 'end',
        'f5': 'score',
        'f6': 'strand',
        'f7': 'frame',
        'f8': 'attribute'
    })

    def parse_attributes(unparsed_attributes):
        """Turn the raw attribute column into a ``dict<str, str>`` expression."""

        def parse_attribute(attribute):
            key_and_value = attribute.split(' ')
            key = key_and_value[0]
            value = key_and_value[1]
            # ``replace`` takes a regex: drop every double quote and a
            # trailing ';'. Splitting on '; ' leaves the terminator on the
            # last attribute, so the pattern must use the end anchor ``$``
            # (an escaped ``\$`` would only match a literal dollar sign).
            return (key, value.replace('"|;$', ''))

        return hl.dict(unparsed_attributes.split('; ').map(parse_attribute))

    ht = ht.annotate(attribute=parse_attributes(ht['attribute']))

    # Checkpoint so the aggregation below and the later transformations
    # both start from the already-parsed table.
    ht = ht.checkpoint(new_temp_file())

    # Collect the distinct attribute keys over all rows; each one becomes
    # its own row field.
    attributes = ht.aggregate(
        hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                       ht['attribute'].keys()))

    # Rows that lack a given key get a missing value; empty keys are skipped.
    ht = ht.transmute(
        **{
            x: hl.or_missing(ht['attribute'].contains(x), ht['attribute'][x])
            for x in attributes if x
        })

    if reference_genome:
        # The documented API accepts a reference genome name; resolve it so
        # that ``.name`` and ``.contigs`` below are available.
        if isinstance(reference_genome, str):
            reference_genome = hl.get_reference(reference_genome)

        seqname = ht['seqname']
        if reference_genome.name == 'GRCh37':
            # Target naming: no 'chr' prefix, mitochondrial contig is 'MT'.
            normalized = (hl.case()
                          .when((seqname == 'M') | (seqname == 'chrM'), 'MT')
                          .when(seqname.startswith('chr'),
                                seqname.replace('^chr', ''))
                          .default(seqname))
        else:
            # Target naming: 'chr' prefix on everything except HLA contigs,
            # which lose the prefix if they carry one.
            normalized = (hl.case()
                          .when(seqname.startswith('HLA'), seqname)
                          .when(seqname.startswith('chrHLA'),
                                seqname.replace('^chr', ''))
                          .when(seqname.startswith('chr'), seqname)
                          .default('chr' + seqname))
        ht = ht.annotate(seqname=normalized)

        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(reference_genome.contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))

        ht = ht.transmute(
            interval=hl.locus_interval(ht['seqname'],
                                       ht['start'],
                                       ht['end'],
                                       includes_start=True,
                                       includes_end=True,
                                       reference_genome=reference_genome))
    else:
        # Without a reference genome the endpoints are plain structs.
        ht = ht.transmute(interval=hl.interval(
            hl.struct(seqname=ht['seqname'], position=ht['start']),
            hl.struct(seqname=ht['seqname'], position=ht['end']),
            includes_start=True,
            includes_end=True))

    ht = ht.key_by('interval')

    return ht
Esempio n. 30
0
                'SMRRNANM',
                'SMVQCFL',
                'SMTRSCPT',
                'SMMPPDPR',
                'SMCGLGTH',
                'SMUNPDRD',
                'SMMPPDUN',
                'SME2ANTI',
                'SMALTALG',
                'SME2SNSE',
                'SMMFLGTH',
                'SMSPLTRD',
                'SME1ANTI',
                'SME1SNSE',
                'SMNUM5CD']

    ht_samples = ht_samples.annotate(**{x: hl.float(ht_samples[x]) for x in float_cols})
    ht_samples = ht_samples.annotate(**{x: hl.int(ht_samples[x].replace('.0$', '')) for x in int_cols})

    ht = ht.filter(ht.feature_type == 'gene')
    ht = ht.annotate(interval=hl.interval(hl.locus(ht['contig'], ht['start'], 'GRCh37'), hl.locus(ht['contig'], ht['end'] + 1, 'GRCh37')))
    ht = ht.annotate(attributes=hl.dict(hl.map(lambda x: (x.split(' ')[0], x.split(' ')[1].replace('"', '').replace(';$', '')), ht['attributes'].split('; '))))
    attribute_cols = list(ht.aggregate(hl.set(hl.flatten(hl.agg.collect(ht.attributes.keys())))))
    ht = ht.annotate(**{x: hl.or_missing(ht_genes.attributes.contains(x), ht_genes.attributes[x]) for x in attribute_cols})
    ht = ht.select(*(['gene_id', 'interval', 'gene_type', 'strand', 'annotation_source', 'havana_gene', 'gene_status', 'tag']))
    ht = ht.rename({'havana_gene': 'havana_gene_id'})
    ht = ht.key_by(ht_genes.gene_id)

"""

Esempio n. 31
0
 def _coerce(self, x):
     """Return interval expression ``x`` with its endpoints coerced.

     ``self.point_type.coerce`` is applied to ``x.start`` and ``x.end``;
     the inclusivity flags are copied from ``x``.
     """
     assert isinstance(x, hl.expr.IntervalExpression)
     coerce_point = self.point_type.coerce
     new_start = coerce_point(x.start)
     new_end = coerce_point(x.end)
     return hl.interval(new_start,
                        new_end,
                        includes_start=x.includes_start,
                        includes_end=x.includes_end)
Esempio n. 32
0
    ht = ht.key_by('locus')

if args.d == 'lcr':
    # Build a table keyed by one interval per row of the extracted TSV.
    name = 'Ensembl_low_complexity_regions'
    lcr_path = (
        'gs://hail-datasets-extracted-data/Ensembl/{n}.{v}.{rg}.tsv.bgz'.
        format(n=name, v=version, rg=reference_genome))
    ht = hl.import_table(lcr_path, types={'start': hl.tint, 'end': hl.tint})
    if reference_genome == 'GRCh38':
        # For GRCh38, rewrite 'MT' to 'M' and prepend 'chr' to the contig.
        renamed_contig = ht['chromosome'].replace('MT', 'M')
        ht = ht.annotate(chromosome='chr' + renamed_contig)
    region_start = hl.locus(ht['chromosome'], ht['start'], reference_genome)
    region_end = hl.locus(ht['chromosome'], ht['end'], reference_genome)
    ht = ht.annotate(interval=hl.interval(region_start, region_end))
    # Keep only the interval field and use it as the key.
    ht = ht.select('interval').key_by('interval')

if args.d in set(['cdna', 'cds', 'ncrna']):
    if args.d == 'cdna':
        name = 'Ensembl_cDNA_regions'
    elif args.d == 'cds':
        name = 'Ensembl_CDS_regions'
    else:
        name = 'Ensembl_ncRNA_regions'
    ht = hl.import_table(
        'gs://hail-datasets-extracted-data/Ensembl/Ensembl_{0}_regions.{1}.{2}.tsv.bgz'
        .format(args.d, version, reference_genome),
        types={