def with_pl(pl):
    """Build the split entry struct from an entry that uses local-allele fields.

    Relies on names bound in the enclosing scope: ``fields`` (the names of the
    entry fields that are present), ``old_entry`` (the entry struct being
    rewritten), ``local_a_index`` (index of the allele being split out within
    the entry's local alleles; treated as possibly missing) and ``ds`` (the
    dataset whose ``alleles`` row field is read).

    Each local field that is present is replaced by its non-local counterpart:

    * ``LGT`` -> ``GT`` and ``LPGT`` -> ``PGT``: downcoded to the split allele,
      but only when the call is non-ref; other calls are kept as they are.
    * ``LAD`` -> ``AD``: ``[sum(LAD) - alt_depth, alt_depth]``, missing when
      ``LAD`` is missing.
    * ``LPL`` -> ``PL``: set to ``pl``; ``GQ`` (if present) is recomputed from
      ``pl`` and falls back to the existing ``GQ`` when that is missing.

    ``LA`` and every converted local field are dropped from the result.

    :param pl: Expression stored as ``PL``; only used when ``LPL`` is in ``fields``.
    :return: Expression for the rewritten entry struct.
    """
    new_exprs = {}
    dropped_fields = ['LA']

    def _downcode_if_non_ref(call):
        # A missing local_a_index falls back to len(LA), which is past the last
        # valid index into LA.
        return hl.rbind(
            call,
            lambda c: hl.if_else(
                c.is_non_ref(),
                hl.downcode(c, hl.or_else(local_a_index, hl.len(old_entry.LA))),
                c,
            ),
        )

    def _rewritten(exprs):
        # Every branch of the case expression annotates, then drops the same fields.
        return old_entry.annotate(**exprs).drop(*dropped_fields)

    if 'LGT' in fields:
        new_exprs['GT'] = _downcode_if_non_ref(old_entry.LGT)
        dropped_fields.append('LGT')
    if 'LPGT' in fields:
        new_exprs['PGT'] = _downcode_if_non_ref(old_entry.LPGT)
        dropped_fields.append('LPGT')
    if 'LAD' in fields:
        non_ref_ad = hl.or_else(old_entry.LAD[local_a_index], 0)  # zeroed if not in LAD
        new_exprs['AD'] = hl.or_missing(
            hl.is_defined(old_entry.LAD),
            [hl.sum(old_entry.LAD) - non_ref_ad, non_ref_ad],
        )
        dropped_fields.append('LAD')
    if 'LPL' in fields:
        new_exprs['PL'] = pl
        if 'GQ' in fields:
            new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
        dropped_fields.append('LPL')

    # Only one allele at the site: the local fields are copied over under
    # their non-local names without any recomputation.
    result = hl.case().when(
        hl.len(ds.alleles) == 1,
        _rewritten({
            f[1:]: old_entry[f]
            for f in ['LGT', 'LPGT', 'LAD', 'LPL']
            if f in fields
        }),
    )

    # Hom-ref calls keep LGT/LPGT verbatim while the other fields use the
    # recomputed values. The case can only be expressed when LGT exists; like
    # every other use of LGT in this function it is guarded by `fields`, so an
    # entry without LGT no longer fails while the expression is being built.
    if 'LGT' in fields:
        result = result.when(
            hl.or_else(old_entry.LGT.is_hom_ref(), False),
            _rewritten({
                f: old_entry[f'L{f}'] if f in ['GT', 'PGT'] else e
                for f, e in new_exprs.items()
            }),
        )

    return result.default(_rewritten(new_exprs))
def with_pl(pl):
    """Convert a local-allele entry struct into its split, non-local form.

    Uses names bound in the enclosing scope: ``fields`` (names of the entry
    fields that are present), ``old_entry`` (the entry struct being
    converted), ``local_a_index`` (index of the split allele within the entry's
    local alleles; treated as possibly missing) and ``ds`` (the dataset whose
    ``alleles`` row field is read).

    Conversions applied to the fields that are present:

    * ``LGT`` -> ``GT`` and ``LPGT`` -> ``PGT`` via ``hl.downcode``.
    * ``LAD`` -> ``AD`` as ``[LAD[0], LAD[local_a_index] or 0]``, missing when
      ``LAD`` is missing.
    * ``LPL`` -> ``PL`` set to ``pl``; ``GQ`` (if present) is recomputed from
      ``pl``, falling back to the existing ``GQ`` when that is missing.

    ``LA`` and all converted local fields are dropped from the result.

    :param pl: Expression stored as ``PL``; only used when ``LPL`` is in ``fields``.
    :return: Expression for the converted entry struct.
    """
    new_exprs = {}
    dropped_fields = ['LA']

    def _downcoded(call):
        # When local_a_index is missing, len(LA) is used instead; that value is
        # past the last valid index into LA.
        return hl.downcode(call, hl.or_else(local_a_index, hl.len(old_entry.LA)))

    if 'LGT' in fields:
        new_exprs['GT'] = _downcoded(old_entry.LGT)
        dropped_fields.append('LGT')
    if 'LPGT' in fields:
        new_exprs['PGT'] = _downcoded(old_entry.LPGT)
        dropped_fields.append('LPGT')
    if 'LAD' in fields:
        new_exprs['AD'] = hl.or_missing(
            hl.is_defined(old_entry.LAD),
            [
                old_entry.LAD[0],
                # second entry zeroed for lack of non-ref AD
                hl.or_else(old_entry.LAD[local_a_index], 0),
            ],
        )
        dropped_fields.append('LAD')
    if 'LPL' in fields:
        new_exprs['PL'] = pl
        if 'GQ' in fields:
            new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
        dropped_fields.append('LPL')

    # Site with a single allele: copy the local fields across under their
    # non-local names, with no recomputation.
    single_allele_exprs = {
        f[1:]: old_entry[f]
        for f in ['LGT', 'LPGT', 'LAD', 'LPL']
        if f in fields
    }
    builder = hl.case().when(
        hl.len(ds.alleles) == 1,
        old_entry.annotate(**single_allele_exprs).drop(*dropped_fields),
    )

    # Hom-ref calls keep LGT/LPGT unchanged; the remaining fields use the
    # recomputed expressions. This case reads old_entry.LGT, so it is added
    # only when LGT is present -- consistent with the other `fields` guards --
    # instead of failing at expression-build time for entries without LGT.
    if 'LGT' in fields:
        hom_ref_exprs = {
            f: old_entry[f'L{f}'] if f in ['GT', 'PGT'] else e
            for f, e in new_exprs.items()
        }
        builder = builder.when(
            hl.or_else(old_entry.LGT.is_hom_ref(), False),
            old_entry.annotate(**hom_ref_exprs).drop(*dropped_fields),
        )

    return builder.default(old_entry.annotate(**new_exprs).drop(*dropped_fields))
def with_pl(pl):
    """Rewrite a local-allele entry struct into its split, non-local form.

    Uses names bound in the enclosing scope: ``fields`` (names of the entry
    fields that are present), ``old_entry`` (the entry struct being
    rewritten), ``local_a_index`` (index of the split allele within the entry's
    local alleles; treated as possibly missing) and ``ds`` (the dataset whose
    ``alleles`` row field is read).

    When the site has a single allele, the local fields that are present are
    copied over under their non-local names. Otherwise:

    * ``LGT`` -> ``GT`` and ``LPGT`` -> ``PGT`` via ``hl.downcode``.
    * ``LAD`` -> ``AD`` as ``[LAD[0], LAD[local_a_index] or 0]``, missing when
      ``LAD`` is missing.
    * ``LPL`` -> ``PL`` set to ``pl``; ``GQ`` (if present) is recomputed from
      ``pl``, falling back to the existing ``GQ`` when that is missing.

    ``LA`` and all converted local fields are dropped in both cases.

    :param pl: Expression stored as ``PL``; only used when ``LPL`` is in ``fields``.
    :return: Expression for the rewritten entry struct.
    """
    new_exprs = {}
    dropped_fields = ['LA']

    if 'LGT' in fields:
        new_exprs['GT'] = hl.downcode(
            old_entry.LGT,
            hl.or_else(local_a_index, hl.len(old_entry.LA)),
        )
        dropped_fields.append('LGT')
    if 'LPGT' in fields:
        new_exprs['PGT'] = hl.downcode(
            old_entry.LPGT,
            hl.or_else(local_a_index, hl.len(old_entry.LA)),
        )
        dropped_fields.append('LPGT')
    if 'LAD' in fields:
        new_exprs['AD'] = hl.or_missing(
            hl.is_defined(old_entry.LAD),
            [
                old_entry.LAD[0],
                # second entry zeroed for lack of non-ref AD
                hl.or_else(old_entry.LAD[local_a_index], 0),
            ],
        )
        dropped_fields.append('LAD')
    if 'LPL' in fields:
        new_exprs['PL'] = pl
        if 'GQ' in fields:
            new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
        dropped_fields.append('LPL')

    renamed_only = {
        f[1:]: old_entry[f]
        for f in ['LGT', 'LPGT', 'LAD', 'LPL']
        if f in fields
    }

    # hl.if_else is the current name for the deprecated hl.cond.
    return hl.if_else(
        hl.len(ds.alleles) == 1,
        old_entry.annotate(**renamed_only).drop(*dropped_fields),
        old_entry.annotate(**new_exprs).drop(*dropped_fields),
    )
def split_multi_dynamic(
    t: Union[hl.MatrixTable, hl.Table],
    keep_star: bool = False,
    left_aligned: bool = True,
) -> Union[hl.MatrixTable, hl.Table]:
    """
    Splits a MatrixTable (or Table) based on the entry fields found. Downcodes whatever it can.

    Supported entry fields so far:
    GT, DP, AD, PL, GQ
    PGT, PID
    ADALL

    For a Table, every row is exploded into one row per alternate allele: ``a_index`` is added
    (0-based over the alternate alleles) and ``alleles`` becomes ``[ref, alt]``. ``keep_star`` and
    ``left_aligned`` are not used in this case, and alleles are not min-repped.

    For a MatrixTable, splitting is done with ``hl.SplitMulti``; the row fields ``a_index`` and
    ``was_split`` are added and only the supported entry fields listed above are updated.

    :param t: Input MatrixTable or Table
    :param bool keep_star: whether to keep star alleles (passed to SplitMulti)
    :param bool left_aligned: whether matrix table is already left_aligned (passed to SplitMulti)
    :return: Split MatrixTable or Table
    :rtype: MatrixTable or Table
    """
    if isinstance(t, hl.Table):
        t = t.annotate(a_index=hl.range(0, hl.len(t.alleles) - 1)).explode('a_index')
        # a_index runs over 0 .. n_alt - 1 while alleles[0] is the reference, so the matching
        # alternate allele is at a_index + 1. Indexing with a_index itself yields [ref, ref] for
        # the first row and never emits the last alternate allele.
        return t.annotate(
            alleles=[t.alleles[0], t.alleles[t.a_index + 1]]
        )  # Note: does not minrep at the moment

    fields = list(t.entry)
    sm = hl.SplitMulti(t, keep_star=keep_star, left_aligned=left_aligned)
    sm.update_rows(a_index=sm.a_index(), was_split=sm.was_split())
    expression = {}

    def _ref_alt_depths(depths):
        # [everything that is not the split allele, the split allele]; missing stays missing.
        return hl.or_missing(
            hl.is_defined(depths),
            [hl.sum(depths) - depths[sm.a_index()], depths[sm.a_index()]],
        )

    # HTS/standard
    if 'GT' in fields:
        expression['GT'] = hl.downcode(t.GT, sm.a_index())
    if 'DP' in fields:
        expression['DP'] = t.DP
    if 'AD' in fields:
        expression['AD'] = _ref_alt_depths(t.AD)
    if 'PL' in fields:
        # For each of the 3 biallelic genotypes, take the minimum PL over all original
        # genotypes that downcode to it.
        pl = hl.or_missing(
            hl.is_defined(t.PL),
            hl.range(0, 3).map(
                lambda i: hl.min(
                    hl.range(0, hl.triangle(t.alleles.length()))
                    .filter(
                        lambda j: hl.downcode(hl.unphased_diploid_gt_index_call(j), sm.a_index())
                        == hl.unphased_diploid_gt_index_call(i)
                    )
                    .map(lambda j: t.PL[j])
                )
            ),
        )
        expression['PL'] = pl
        if 'GQ' in fields:
            # GQ is recomputed from the split PL.
            expression['GQ'] = hl.gq_from_pl(pl)
    elif 'GQ' in fields:
        # No PL to recompute from: carry the existing GQ over.
        expression['GQ'] = t.GQ

    # Phased data
    if 'PGT' in fields:
        expression['PGT'] = hl.downcode(t.PGT, sm.a_index())
    if 'PID' in fields:
        expression['PID'] = t.PID

    # Custom data
    if 'ADALL' in fields:  # found in NA12878
        expression['ADALL'] = _ref_alt_depths(t.ADALL)

    sm.update_entries(**expression)
    return sm.result()