def with_pl(pl):
    """Build the split (biallelic) entry expression from the local-allele entry.

    Closes over ``old_entry``, ``fields``, ``local_a_index`` and ``ds`` from the
    enclosing scope.

    :param pl: Expression for the downcoded PL array. Only read when ``'LPL'``
        is in ``fields``, so callers may pass ``None`` otherwise.
    :return: Entry struct expression in which each present local field
        (LGT, LPGT, LAD, LPL) is replaced by its global counterpart
        (GT, PGT, AD, PL) and ``LA`` is dropped.
    """
    new_exprs = {}
    dropped_fields = ['LA']

    def _downcode_non_ref(call):
        # Non-ref calls are downcoded against the target allele; when the
        # target allele is missing from LA, len(LA) is used as the index
        # instead. Calls that are not non-ref are passed through unchanged.
        return hl.rbind(
            call,
            lambda c: hl.if_else(
                c.is_non_ref(),
                hl.downcode(
                    c, hl.or_else(local_a_index, hl.len(old_entry.LA))),
                c))

    if 'LGT' in fields:
        new_exprs['GT'] = _downcode_non_ref(old_entry.LGT)
        dropped_fields.append('LGT')
    if 'LPGT' in fields:
        new_exprs['PGT'] = _downcode_non_ref(old_entry.LPGT)
        dropped_fields.append('LPGT')
    if 'LAD' in fields:
        non_ref_ad = hl.or_else(old_entry.LAD[local_a_index], 0)  # zeroed if not in LAD
        new_exprs['AD'] = hl.or_missing(
            hl.is_defined(old_entry.LAD),
            [hl.sum(old_entry.LAD) - non_ref_ad, non_ref_ad])
        dropped_fields.append('LAD')
    if 'LPL' in fields:
        new_exprs['PL'] = pl
        if 'GQ' in fields:
            # Prefer a GQ recomputed from the new PL; keep the old GQ when
            # that is missing.
            new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
        dropped_fields.append('LPL')

    # Case 1: only one allele at the site, so local fields are copied across
    # under their global names without any recoding.
    builder = hl.case().when(
        hl.len(ds.alleles) == 1,
        old_entry.annotate(**{
            f[1:]: old_entry[f]
            for f in ['LGT', 'LPGT', 'LAD', 'LPL']
            if f in fields
        }).drop(*dropped_fields))

    # Case 2: hom-ref entries keep their original calls. This case can only
    # be expressed when LGT exists; referencing old_entry.LGT unconditionally
    # would fail for entries without that field.
    if 'LGT' in fields:
        builder = builder.when(
            hl.or_else(old_entry.LGT.is_hom_ref(), False),
            old_entry.annotate(**{
                f: old_entry[f'L{f}'] if f in ['GT', 'PGT'] else e
                for f, e in new_exprs.items()
            }).drop(*dropped_fields))

    # Default: use the fully recomputed fields.
    return builder.default(
        old_entry.annotate(**new_exprs).drop(*dropped_fields))
def with_pl(pl):
    """Build the split (biallelic) entry expression from the local-allele entry.

    Closes over ``old_entry``, ``fields``, ``local_a_index`` and ``ds`` from the
    enclosing scope.

    :param pl: Expression for the downcoded PL array. Only read when ``'LPL'``
        is in ``fields``, so callers may pass ``None`` otherwise.
    :return: Entry struct expression in which each present local field
        (LGT, LPGT, LAD, LPL) is replaced by its global counterpart
        (GT, PGT, AD, PL) and ``LA`` is dropped.
    """
    new_exprs = {}
    dropped_fields = ['LA']

    if 'LGT' in fields:
        # When the target allele is missing from LA, len(LA) is used as the
        # downcode index instead.
        new_exprs['GT'] = hl.downcode(
            old_entry.LGT,
            hl.or_else(local_a_index, hl.len(old_entry.LA)))
        dropped_fields.append('LGT')
    if 'LPGT' in fields:
        new_exprs['PGT'] = hl.downcode(
            old_entry.LPGT,
            hl.or_else(local_a_index, hl.len(old_entry.LA)))
        dropped_fields.append('LPGT')
    if 'LAD' in fields:
        new_exprs['AD'] = hl.or_missing(
            hl.is_defined(old_entry.LAD),
            [
                old_entry.LAD[0],
                # second entry zeroed for lack of non-ref AD
                hl.or_else(old_entry.LAD[local_a_index], 0),
            ])
        dropped_fields.append('LAD')
    if 'LPL' in fields:
        new_exprs['PL'] = pl
        if 'GQ' in fields:
            # Prefer a GQ recomputed from the new PL; keep the old GQ when
            # that is missing.
            new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
        dropped_fields.append('LPL')

    # Case 1: only one allele at the site, so local fields are copied across
    # under their global names without any recoding.
    builder = hl.case().when(
        hl.len(ds.alleles) == 1,
        old_entry.annotate(**{
            f[1:]: old_entry[f]
            for f in ['LGT', 'LPGT', 'LAD', 'LPL']
            if f in fields
        }).drop(*dropped_fields))

    # Case 2: hom-ref entries keep their original calls. This case can only
    # be expressed when LGT exists; referencing old_entry.LGT unconditionally
    # would fail for entries without that field.
    if 'LGT' in fields:
        builder = builder.when(
            hl.or_else(old_entry.LGT.is_hom_ref(), False),
            old_entry.annotate(**{
                f: old_entry[f'L{f}'] if f in ['GT', 'PGT'] else e
                for f, e in new_exprs.items()
            }).drop(*dropped_fields))

    # Default: use the fully recomputed fields.
    return builder.default(
        old_entry.annotate(**new_exprs).drop(*dropped_fields))
def with_local_a_index(local_a_index):
    """Build the split entry expression for one target allele.

    Closes over ``old_entry`` and ``ds`` from the enclosing scope.

    :param local_a_index: Expression for the index of the target allele
        within ``old_entry.LA``; may be missing.
    :return: Entry struct expression with local fields (LGT, LPGT, LAD, LPL)
        replaced by GT, PGT, AD, PL and with ``LA`` dropped.
    """
    fields = set(old_entry.keys())

    def with_pl(pl):
        # `pl` is only read when 'LPL' is in `fields`.
        new_exprs = {}
        dropped_fields = ['LA']
        if 'LGT' in fields:
            # When the target allele is missing from LA, len(LA) is used as
            # the downcode index instead.
            new_exprs['GT'] = hl.downcode(
                old_entry.LGT,
                hl.or_else(local_a_index, hl.len(old_entry.LA)))
            dropped_fields.append('LGT')
        if 'LPGT' in fields:
            new_exprs['PGT'] = hl.downcode(
                old_entry.LPGT,
                hl.or_else(local_a_index, hl.len(old_entry.LA)))
            dropped_fields.append('LPGT')
        if 'LAD' in fields:
            new_exprs['AD'] = hl.or_missing(
                hl.is_defined(old_entry.LAD),
                [
                    old_entry.LAD[0],
                    # second entry zeroed for lack of non-ref AD
                    hl.or_else(old_entry.LAD[local_a_index], 0),
                ])
            dropped_fields.append('LAD')
        if 'LPL' in fields:
            new_exprs['PL'] = pl
            if 'GQ' in fields:
                new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
            dropped_fields.append('LPL')

        # With a single allele at the site the local fields are copied across
        # under their global names; otherwise the recomputed fields are used.
        return hl.cond(
            hl.len(ds.alleles) == 1,
            old_entry.annotate(**{
                f[1:]: old_entry[f]
                for f in ['LGT', 'LPGT', 'LAD', 'LPL']
                if f in fields
            }).drop(*dropped_fields),
            old_entry.annotate(**new_exprs).drop(*dropped_fields))

    if 'LPL' not in fields:
        return with_pl(None)

    # Only touch old_entry.LPL once we know the field exists; building this
    # expression eagerly would fail for entries without LPL.
    # For each of the three biallelic genotype indices i, take the minimum
    # LPL over the local genotype indices j whose downcoded call equals the
    # call for i. Missing when LPL or the local allele index is missing.
    new_pl = hl.or_missing(
        hl.is_defined(old_entry.LPL),
        hl.or_missing(
            hl.is_defined(local_a_index),
            hl.range(0, 3).map(lambda i: hl.min(
                hl.range(0, hl.triangle(hl.len(old_entry.LA)))
                .filter(lambda j: hl.downcode(
                    hl.unphased_diploid_gt_index_call(j),
                    local_a_index) == hl.unphased_diploid_gt_index_call(i))
                .map(lambda idx: old_entry.LPL[idx])))))
    return hl.bind(with_pl, new_pl)
def split_multi(ds):
    """Split multi-allelic rows of ``ds`` using ``hl.SplitMulti``.

    Rows gain ``a_index`` and ``was_split``. Entries are rewritten to GT
    (downcoded to the split allele), AD (two elements: everything except the
    split allele, then the split allele; missing when AD is missing) and DP
    (unchanged).

    :param ds: Dataset with GT, AD and DP entry fields.
    :return: The split dataset.
    """
    splitter = hl.SplitMulti(ds)
    allele_idx = splitter.a_index()

    splitter.update_rows(a_index=allele_idx, was_split=splitter.was_split())

    alt_depth = ds.AD[allele_idx]
    biallelic_ad = hl.or_missing(
        hl.is_defined(ds.AD),
        [hl.sum(ds.AD) - alt_depth, alt_depth])

    splitter.update_entries(
        GT=hl.downcode(ds.GT, allele_idx),
        AD=biallelic_ad,
        DP=ds.DP,
    )
    return splitter.result()
def with_pl(pl):
    """Build the split (biallelic) entry expression from the local-allele entry.

    Closes over ``old_entry``, ``fields``, ``local_a_index`` and ``ds`` from the
    enclosing scope.

    :param pl: Expression for the downcoded PL array. Only read when ``'LPL'``
        is in ``fields``.
    :return: Entry struct expression with local fields (LGT, LPGT, LAD, LPL)
        replaced by GT, PGT, AD, PL and with ``LA`` dropped.
    """
    recoded = {}
    to_drop = ['LA']

    def _downcoded(call):
        # When the target allele is missing from LA, len(LA) is used as the
        # downcode index instead.
        return hl.downcode(
            call, hl.or_else(local_a_index, hl.len(old_entry.LA)))

    if 'LGT' in fields:
        recoded['GT'] = _downcoded(old_entry.LGT)
        to_drop.append('LGT')

    if 'LPGT' in fields:
        recoded['PGT'] = _downcoded(old_entry.LPGT)
        to_drop.append('LPGT')

    if 'LAD' in fields:
        ref_ad = old_entry.LAD[0]
        # second entry zeroed for lack of non-ref AD
        alt_ad = hl.or_else(old_entry.LAD[local_a_index], 0)
        recoded['AD'] = hl.or_missing(
            hl.is_defined(old_entry.LAD), [ref_ad, alt_ad])
        to_drop.append('LAD')

    if 'LPL' in fields:
        recoded['PL'] = pl
        if 'GQ' in fields:
            recoded['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
        to_drop.append('LPL')

    # Single-allele sites: copy each present local field to its global name
    # (the leading 'L' stripped) without recoding.
    passthrough = {}
    for local_name in ['LGT', 'LPGT', 'LAD', 'LPL']:
        if local_name in fields:
            passthrough[local_name[1:]] = old_entry[local_name]

    single_allele_entry = old_entry.annotate(**passthrough).drop(*to_drop)
    recoded_entry = old_entry.annotate(**recoded).drop(*to_drop)
    return hl.cond(
        hl.len(ds.alleles) == 1, single_allele_entry, recoded_entry)
def with_local_a_index(local_a_index):
    """Build the split entry expression for one target allele.

    Closes over ``old_entry`` and ``ds`` from the enclosing scope.

    :param local_a_index: Expression for the index of the target allele
        within ``old_entry.LA``; may be missing.
    :return: Entry struct expression with local fields (LGT, LPGT, LAD, LPL)
        replaced by GT, PGT, AD, PL and with ``LA`` dropped.
    """
    fields = set(old_entry.keys())

    def with_pl(pl):
        # `pl` is only read when 'LPL' is in `fields`.
        new_exprs = {}
        dropped_fields = ['LA']
        # When the target allele is missing from LA, len(LA) is used as the
        # downcode index instead.
        if 'LGT' in fields:
            new_exprs['GT'] = hl.downcode(old_entry.LGT, hl.or_else(local_a_index, hl.len(old_entry.LA)))
            dropped_fields.append('LGT')
        if 'LPGT' in fields:
            new_exprs['PGT'] = hl.downcode(old_entry.LPGT, hl.or_else(local_a_index, hl.len(old_entry.LA)))
            dropped_fields.append('LPGT')
        if 'LAD' in fields:
            new_exprs['AD'] = hl.or_missing(
                hl.is_defined(old_entry.LAD),
                [old_entry.LAD[0], hl.or_else(old_entry.LAD[local_a_index], 0)])  # second entry zeroed for lack of non-ref AD
            dropped_fields.append('LAD')
        if 'LPL' in fields:
            new_exprs['PL'] = pl
            if 'GQ' in fields:
                new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
            dropped_fields.append('LPL')

        # With a single allele at the site the local fields are copied across
        # under their global names; otherwise the recomputed fields are used.
        return hl.cond(hl.len(ds.alleles) == 1,
                       old_entry.annotate(**{f[1:]: old_entry[f] for f in ['LGT', 'LPGT', 'LAD', 'LPL'] if f in fields}).drop(*dropped_fields),
                       old_entry.annotate(**new_exprs).drop(*dropped_fields))

    if 'LPL' in fields:
        # The PL expression is built only here: referencing old_entry.LPL
        # before this check would fail for entries that have no LPL field.
        # For each of the three biallelic genotype indices i, take the minimum
        # LPL over the local genotype indices j whose downcoded call equals
        # the call for i. Missing when LPL or the local allele index is
        # missing.
        new_pl = hl.or_missing(
            hl.is_defined(old_entry.LPL),
            hl.or_missing(
                hl.is_defined(local_a_index),
                hl.range(0, 3).map(lambda i: hl.min(
                    hl.range(0, hl.triangle(hl.len(old_entry.LA)))
                    .filter(lambda j: hl.downcode(hl.unphased_diploid_gt_index_call(j), local_a_index) == hl.unphased_diploid_gt_index_call(i))
                    .map(lambda idx: old_entry.LPL[idx])))))
        return hl.bind(with_pl, new_pl)
    else:
        return with_pl(None)
ht_samples = hl.read_table( 'gs://hail-datasets-hail-data/1000_Genomes_phase3_samples.ht') ht_relationships = hl.read_table( 'gs://hail-datasets-hail-data/1000_Genomes_phase3_sample_relationships.ht') mt = hl.import_vcf( 'gs://hail-datasets-raw-data/1000_Genomes/1000_Genomes_phase3_chrMT_GRCh37.vcf.bgz', reference_genome='GRCh37') mt = mt.annotate_cols(**ht_samples[mt.s]) mt = mt.annotate_cols(**ht_relationships[mt.s]) mt_split = hl.split_multi(mt) mt_split = mt_split.select_entries( GT=hl.downcode(mt_split.GT, mt_split.a_index)) mt_split = mt_split.annotate_rows(info=hl.struct( AC=mt_split.info.AC[mt_split.a_index - 1], VT=(hl.case().when((mt_split.alleles[0].length() == 1) & ( mt_split.alleles[1].length() == 1), 'SNP').when( mt_split.alleles[0].matches('<CN*>') | mt_split.alleles[1].matches('<CN*>'), 'SV').default('INDEL')))) n_rows, n_cols = mt_split.count() n_partitions = mt_split.n_partitions() mt_split = hl.sample_qc(mt_split) mt_split = hl.variant_qc(mt_split) mt_split = mt_split.annotate_globals( metadata=hl.struct(name='1000_Genomes_phase3_chrMT',
def with_local_a_index(local_a_index):
    """Build the split entry expression for one target allele.

    Closes over ``old_entry`` and ``ds`` from the enclosing scope.

    :param local_a_index: Expression for the index of the target allele
        within ``old_entry.LA``; may be missing.
    :return: Entry struct expression with local fields (LGT, LPGT, LAD, LPL)
        replaced by GT, PGT, AD, PL and with ``LA`` dropped.
    """
    fields = set(old_entry.keys())

    def with_pl(pl):
        # `pl` is only read when 'LPL' is in `fields`.
        new_exprs = {}
        dropped_fields = ['LA']
        if 'LGT' in fields:
            # When the target allele is missing from LA, len(LA) is used as
            # the downcode index instead.
            new_exprs['GT'] = hl.downcode(
                old_entry.LGT,
                hl.or_else(local_a_index, hl.len(old_entry.LA)))
            dropped_fields.append('LGT')
        if 'LPGT' in fields:
            new_exprs['PGT'] = hl.downcode(
                old_entry.LPGT,
                hl.or_else(local_a_index, hl.len(old_entry.LA)))
            dropped_fields.append('LPGT')
        if 'LAD' in fields:
            non_ref_ad = hl.or_else(old_entry.LAD[local_a_index], 0)  # zeroed if not in LAD
            new_exprs['AD'] = hl.or_missing(
                hl.is_defined(old_entry.LAD),
                [hl.sum(old_entry.LAD) - non_ref_ad, non_ref_ad])
            dropped_fields.append('LAD')
        if 'LPL' in fields:
            new_exprs['PL'] = pl
            if 'GQ' in fields:
                new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
            dropped_fields.append('LPL')

        # Case 1: only one allele at the site, so local fields are copied
        # across under their global names without any recoding.
        builder = hl.case().when(
            hl.len(ds.alleles) == 1,
            old_entry.annotate(**{
                f[1:]: old_entry[f]
                for f in ['LGT', 'LPGT', 'LAD', 'LPL']
                if f in fields
            }).drop(*dropped_fields))

        # Case 2: hom-ref entries keep their original calls. This case can
        # only be expressed when LGT exists; referencing old_entry.LGT
        # unconditionally would fail for entries without that field.
        if 'LGT' in fields:
            builder = builder.when(
                hl.or_else(old_entry.LGT.is_hom_ref(), False),
                old_entry.annotate(**{
                    f: old_entry[f'L{f}'] if f in ['GT', 'PGT'] else e
                    for f, e in new_exprs.items()
                }).drop(*dropped_fields))

        # Default: use the fully recomputed fields.
        return builder.default(
            old_entry.annotate(**new_exprs).drop(*dropped_fields))

    if 'LPL' in fields:
        # For each of the three biallelic genotype indices i, take the minimum
        # LPL over the local genotype indices j whose downcoded call equals
        # the call for i. Missing when LPL or the local allele index is
        # missing.
        new_pl = hl.or_missing(
            hl.is_defined(old_entry.LPL),
            hl.or_missing(
                hl.is_defined(local_a_index),
                hl.range(0, 3).map(lambda i: hl.min(
                    hl.range(0, hl.triangle(hl.len(old_entry.LA)))
                    .filter(lambda j: hl.downcode(
                        hl.unphased_diploid_gt_index_call(j),
                        local_a_index) == hl.unphased_diploid_gt_index_call(i))
                    .map(lambda idx: old_entry.LPL[idx])))))
        return hl.bind(with_pl, new_pl)
    else:
        return with_pl(None)
# Builds the 1000 Genomes phase 3 chrMT MatrixTable: imports the VCF, splits
# multi-allelic sites, annotates variant type and sample metadata, runs
# sample/variant QC and VEP, and writes the result.
import hail as hl

ht_samples = hl.import_table('gs://hail-datasets/raw-data/1000_genomes/samples_1kg.tsv', key='sample')

mt = hl.import_vcf('gs://hail-datasets/raw-data/1000_genomes/ALL.chrMT.phase3_callmom-v0_4.20130502.genotypes.vcf.bgz')

# Deduplicate rows by locus, then restore the (locus, alleles) row key with
# partitioning on locus.
mt = mt.key_rows_by('locus')
mt = mt.distinct_by_row()
mt = mt.partition_rows_by(['locus'], 'locus', 'alleles')

mt_split = hl.split_multi(mt)
mt_split = mt_split.select_entries(GT=hl.downcode(mt_split.GT, mt_split.a_index))

# Variant type: SNP when both alleles are one base long, SV when either
# allele is a symbolic copy-number allele, INDEL otherwise.
# `matches` takes a regular expression, not a glob: the previous pattern
# '<CN*>' meant "<C, zero or more N, >" and therefore never matched alleles
# such as <CN0> or <CN2>.
is_snp = (mt_split.alleles[0].length() == 1) & (mt_split.alleles[1].length() == 1)
is_sv = mt_split.alleles[0].matches('<CN.*>') | mt_split.alleles[1].matches('<CN.*>')
variant_type = (hl.case()
                .when(is_snp, 'SNP')
                .when(is_sv, 'SV')
                .default('INDEL'))

# NOTE(review): info.AC is carried over as-is after splitting, i.e. it is not
# indexed by a_index - 1. Confirm whether the per-allele count is wanted here.
mt_split = mt_split.annotate_rows(info=hl.struct(AC=mt_split.info.AC, VT=variant_type))
mt_split.describe()

mt_split = mt_split.drop('old_locus', 'old_alleles', 'a_index')

sample_meta = ht_samples[mt_split.s]
mt_split = mt_split.annotate_cols(sex=sample_meta.gender,
                                  super_population=sample_meta.super_pop,
                                  population=sample_meta.pop)

mt_split = hl.sample_qc(mt_split)
mt_split = hl.variant_qc(mt_split)
mt_split = hl.vep(mt_split, 'gs://hail-common/vep/vep/vep85-gcloud.json')
mt_split.describe()

mt_split.write('gs://hail-datasets/hail-data/1000_genomes_phase3_chrMT.GRCh37.mt', overwrite=True)
def split_multi_dynamic(
        t: Union[hl.MatrixTable, hl.Table],
        keep_star: bool = False,
        left_aligned: bool = True) -> Union[hl.MatrixTable, hl.Table]:
    """
    Splits a MatrixTable (or Table) into one row per alternate allele,
    downcoding whatever entry fields it recognizes.

    Entry fields supported so far:
    GT, DP, AD, PL, GQ
    PGT, PID
    ADALL

    For a Table, rows are exploded over the alternate alleles and ``alleles``
    is rewritten to ``[ref, alt]``; ``keep_star`` and ``left_aligned`` are not
    used on that path and no ``was_split`` field is added.

    :param t: Input MatrixTable or Table
    :param bool keep_star: whether to keep star alleles (passed to SplitMulti)
    :param bool left_aligned: whether matrix table is already left_aligned (passed to SplitMulti)
    :return: Split MatrixTable, or split Table when a Table was given
    :rtype: MatrixTable or Table
    """
    if isinstance(t, hl.Table):
        # a_index is 1-based (index 0 of `alleles` is the reference), which
        # matches the a_index produced by SplitMulti below. Ranging over
        # 0..n_alleles-2 would emit a [ref, ref] row and skip the last
        # alternate allele.
        t = t.annotate(a_index=hl.range(1, hl.len(t.alleles))).explode('a_index')
        return t.annotate(alleles=[t.alleles[0], t.alleles[t.a_index]])  # Note: does not minrep at the moment

    fields = list(t.entry)
    sm = hl.SplitMulti(t, keep_star=keep_star, left_aligned=left_aligned)
    sm.update_rows(a_index=sm.a_index(), was_split=sm.was_split())
    expression = {}

    # HTS/standard
    if 'GT' in fields:
        expression['GT'] = hl.downcode(t.GT, sm.a_index())
    if 'DP' in fields:
        expression['DP'] = t.DP
    if 'AD' in fields:
        # Two-element depth: everything except the split allele, then the
        # split allele itself.
        expression['AD'] = hl.or_missing(
            hl.is_defined(t.AD),
            [hl.sum(t.AD) - t.AD[sm.a_index()], t.AD[sm.a_index()]])
    if 'PL' in fields:
        # For each of the three biallelic genotype indices i, take the minimum
        # PL over the original genotype indices j whose downcoded call equals
        # the call for i.
        pl = hl.or_missing(
            hl.is_defined(t.PL),
            (hl.range(0, 3).map(lambda i: hl.min(
                (hl.range(0, hl.triangle(t.alleles.length()))
                 .filter(lambda j: hl.downcode(
                     hl.unphased_diploid_gt_index_call(j),
                     sm.a_index()) == hl.unphased_diploid_gt_index_call(i))
                 .map(lambda j: t.PL[j]))))))
        expression['PL'] = pl
        if 'GQ' in fields:
            # GQ is recomputed from the downcoded PL.
            expression['GQ'] = hl.gq_from_pl(pl)
    else:
        if 'GQ' in fields:
            # Without PL there is nothing to recompute from; keep GQ as-is.
            expression['GQ'] = t.GQ

    # Phased data
    if 'PGT' in fields:
        expression['PGT'] = hl.downcode(t.PGT, sm.a_index())
    if 'PID' in fields:
        expression['PID'] = t.PID

    # Custom data
    if 'ADALL' in fields:  # found in NA12878
        expression['ADALL'] = hl.or_missing(
            hl.is_defined(t.ADALL),
            [hl.sum(t.ADALL) - t.ADALL[sm.a_index()], t.ADALL[sm.a_index()]])

    sm.update_entries(**expression)
    return sm.result()