def test_filter_alleles(self):
    """Sanity-check hl.filter_alleles on several sample VCFs.

    A predicate that rejects every allele must leave zero rows; a
    predicate that accepts every allele must leave the row count
    unchanged.  (Poor man's Gen — a fixed handful of inputs instead of
    generated ones.)
    """
    vcf_paths = (
        resource('sample.vcf'),
        resource('multipleChromosomes.vcf'),
        resource('sample2.vcf'),
    )
    for vcf_path in vcf_paths:
        dataset = hl.import_vcf(vcf_path)
        # Reject everything -> nothing left.
        rejected = hl.filter_alleles(dataset, lambda a, i: False)
        self.assertEqual(rejected.count_rows(), 0)
        # Accept everything -> row count preserved.
        accepted = hl.filter_alleles(dataset, lambda a, i: True)
        self.assertEqual(accepted.count_rows(), dataset.count_rows())
def split_mt_to_indels(mt: hl.MatrixTable) -> hl.MatrixTable:
    '''Keep only the indel alternate alleles of *mt*.

    :param mt: hail matrixtable of all samples with both indels and SNVs
    :return: hail matrixtable with only the indels
    '''
    # An allele is kept when it forms an indel against the reference
    # (mt.alleles[0]); the allele index is not needed.
    return hl.filter_alleles(
        mt, lambda alt, _: hl.is_indel(mt.alleles[0], alt))
def split_mt_to_snps(mt: hl.MatrixTable) -> hl.MatrixTable:
    '''Keep only the SNP alternate alleles of *mt*.

    :param mt: hail matrixtable of all samples with both indels and SNVs
    :return: matrixtable with only the SNPs from all the samples
    '''
    # An allele is kept when it is a SNP relative to the reference
    # (mt.alleles[0]); the allele index is not needed.
    return hl.filter_alleles(
        mt, lambda alt, _: hl.is_snp(mt.alleles[0], alt))
# Split the trio matrix table into per-role column subsets by matching each
# sample id (exomes.s) against the ids recorded in the source_trio struct.
exomes_proband = exomes.filter_cols(exomes.s == exomes.source_trio.proband.s)
exomes_fath = exomes.filter_cols(exomes.s == exomes.source_trio.father.s)
exomes_moth = exomes.filter_cols(exomes.s == exomes.source_trio.mother.s)
# Re-key all three subsets by family id so entries can be joined across roles.
exomes_proband = exomes_proband.key_cols_by(
    exomes_proband['source_trio'].fam_id)
exomes_fath = exomes_fath.key_cols_by(exomes_fath['source_trio'].fam_id)
exomes_moth = exomes_moth.key_cols_by(exomes_moth['source_trio'].fam_id)
# Pull each parent's PBT_GT (presumably phased-by-transmission genotype —
# TODO confirm against the upstream pipeline) onto the proband's entries via
# (row key, col key) lookups into the parental subsets.
exomes_proband = exomes_proband.annotate_entries(
    mother_PBT_GT=exomes_moth[exomes_proband.row_key,
                              exomes_proband.col_key]["PBT_GT"],
    father_PBT_GT=exomes_fath[exomes_proband.row_key,
                              exomes_proband.col_key]["PBT_GT"])
# Flag entries where both parents are het-ref.
exomes_proband = exomes_proband.annotate_entries(
    hethet=((exomes_proband.mother_PBT_GT.is_het_ref()) &
            (exomes_proband.father_PBT_GT.is_het_ref())))
exomes_proband = hl.filter_alleles(exomes_proband, lambda allele, i: hl.is_snp(
    exomes_proband.alleles[0], allele))  # currently take only SNP
exomes_proband = exomes_proband.filter_entries(
    exomes_proband.GT.is_het())  # throw away unwanted entries (non alt)
# Flatten to an entries table and keep only adj-passing genotypes.
exomes_proband_et = exomes_proband.entries()
exomes_proband_et = exomes_proband_et.filter(exomes_proband_et.adj)
#exomes_proband_et = exomes_proband_et.filter(exomes_proband_et.GT.is_het_ref() & exomes_proband_et.GT.is_diploid())
# Count entries per (GT, PBT_GT, mother_PBT_GT, father_PBT_GT) combination
# and persist the aggregate table to GCS.
aggstats = exomes_proband_et.group_by(
    "GT", 'PBT_GT', 'mother_PBT_GT',
    "father_PBT_GT").aggregate(n=hl.agg.count())
aggstats.write("gs://gnomad_qingbowang/MNV/hethet_aggstats_exome_wGT_re.ht")
class DataException(Exception):
    """Generic data-access error raised by this script's helpers."""
    pass


# NOTE(review): this loads the *genomes* dataset even though the variable is
# named "exomes" — the original comment (Japanese: "実際はgenomeだけど")
# says "it is actually genome data".
exomes = get_gnomad_data("genomes", release_samples=True, adj=True,
                         release_annotations=True)
#ex20 = hl.filter_intervals(exomes.select_rows("allele_info").select_cols(), [hl.parse_locus_interval("20:start-2M")]) #first 2Mb
# Restrict to chromosome 20 and keep only the info_DP row field (no col fields).
ex20 = hl.filter_intervals(
    exomes.select_rows("info_DP").select_cols(),
    [hl.parse_locus_interval("20")])
#ex20 = ex20.filter_cols(hl.rand_bool(0.1)) #~=15000 samples for small test, exome
#ex20 = ex20.filter_cols(hl.rand_bool(0.1)) #~=1500 samples for small test, genome
ex20 = hl.filter_alleles(ex20, lambda allele, i: hl.is_snp(
    ex20.alleles[0], allele))  # currently take only SNP
ex20 = ex20.filter_entries(
    ex20.GT.is_het())  # throw away unwanted entries (non alt)
# Pair each variant with variants up to 2 bp upstream.
ex20_pair = hl.window_by_locus(ex20, 2)  #just look at nearby pairs for now
# Keep entries with a defined genotype and at least one nearby previous entry,
# at least one of which is also het.
ex20_pair = ex20_pair.filter_entries(
    (hl.is_defined(ex20_pair.GT) & (ex20_pair.prev_entries.length() > 0)))
ex20_pair = ex20_pair.filter_entries(
    ex20_pair.prev_entries.filter(lambda x: x.GT.is_het()).length() > 0)
# Explode to one row per (current, previous) variant pair.
et = ex20_pair.entries()
et = et.annotate(indices=hl.range(0, hl.len(et.prev_rows)))
et = et.explode('indices')
et = et.transmute(prev_row=et.prev_rows[et.indices],
                  prev_entry=et.prev_entries[et.indices])
et = et.filter(hl.is_defined(
    et.prev_entry.GT))  # and remove non-corresponding entries
# NOTE(review): the statement below is cut off in this chunk of the file —
# the remainder of the agrees_PID expression is outside the visible source.
et = et.annotate(agrees_PID=((et.GT.phased) & (et.prev_entry.GT.phased)
def phase_sensitivity_fast(mt, windowsize=1, adj=True):
    """Count nearby het-variant pairs per distance, in phasing categories.

    Builds (current, previous) pairs of het SNP entries within `windowsize`
    bp, classifies each pair (phased, has PBT_GT, agrees with PBT, same PID,
    MNV candidate), and returns a pandas DataFrame whose rows are the
    categories and whose columns are pair distances (bp), with counts summed
    over all individuals.

    :param mt: matrix table with PID, GT, PBT_GT (and adj) entry fields
    :param windowsize: max distance in bp between paired loci
    :param adj: if True, restrict to adj-passing entry pairs
    """
    # trying to make the above faster.
    # takes matrix table that has PID, GT, PBT_GT, calculate the phase sensitivity, sum of all individuals
    # for window size k, get the result of window size=1, 2, ... k
    import pandas as pd
    mt = hl.filter_alleles(mt, lambda allele, i: hl.is_snp(
        mt.alleles[0], allele))  # currently take only SNP
    mt = mt.select_rows()  # throw away unwanted rows
    mt = mt.filter_entries(
        mt.GT.is_het())  # throw away unwanted entries (non alt)
    # Pair each het entry with entries up to `windowsize` bp upstream.
    mt = hl.window_by_locus(mt, windowsize)
    mt = mt.filter_entries(
        (hl.is_defined(mt.GT) & (mt.prev_entries.length() > 0)))
    mt = mt.filter_entries(
        mt.prev_entries.filter(lambda x: x.GT.is_het()).length() > 0)
    # Explode to one row per (current, previous) variant pair.
    et = mt.entries()
    et = et.annotate(indices=hl.range(0, hl.len(et.prev_rows)))
    et = et.explode('indices')
    et = et.transmute(prev_row=et.prev_rows[et.indices],
                      prev_entry=et.prev_entries[et.indices])
    et = et.filter(hl.is_defined(
        et.prev_entry.GT))  # and remove non-corresponding entries
    if adj:  # restrict to adj
        pass
        # NOTE(review): reconstructed from a line-collapsed source; the filter
        # below appears to sit inside this `if adj:` branch (matching the
        # "restrict to adj" comment and the stray `pass`) — confirm the
        # original indentation against the upstream file.
        et = et.filter(et.adj & et.prev_entry.adj)
    # NOTE(review): `tm` is presumably `import time as tm` at file top — verify.
    print("\n et created and filtered. \n Starting to look at phase info \n"
          + tm.ctime())
    # annotate columns
    et = et.annotate(dist=et.locus.position - et.prev_row.locus.position,
                     pair_phased=(et.GT.phased) & (et.prev_entry.GT.phased),
                     has_PBT=(hl.is_defined(et.PBT_GT)) &
                             (hl.is_defined(et.prev_entry.PBT_GT)))
    # MNV candidate: both sides phased, same PID, identical call.
    et = et.annotate(is_mnv=(et.pair_phased & (et.PID == et.prev_entry.PID) &
                             (et.GT == et.prev_entry.GT)))
    # Allele-swapped versions of each call so PBT agreement can be checked
    # regardless of which haplotype order the caller emitted.
    et = et.annotate(flipped_GT=hl.call(et.GT[1], et.GT[0],
                                        phased=et.GT.phased),
                     prev_entry_flipped_GT=hl.call(
                         et.prev_entry.GT[1], et.prev_entry.GT[0],
                         phased=et.prev_entry.GT.phased))
    # Agrees with PBT when both calls match as-is, or both match after the
    # consistent allele swap.
    et = et.annotate(agrees_PBT=(
        ((et.GT == et.PBT_GT) & (et.prev_entry.GT == et.prev_entry.PBT_GT)) |
        ((et.flipped_GT == et.PBT_GT) &
         (et.prev_entry_flipped_GT == et.prev_entry.PBT_GT))))
    et = et.annotate(agrees_PID=((et.pair_phased) &
                                 (et.PID == et.prev_entry.PID) &
                                 hl.is_defined(et.PID)))
    #agrees PID only if they are phased at all
    # define each categ
    et_has_PBT = et.filter(et.has_PBT)
    et_agrees_PBT = et.filter(et.agrees_PBT)
    et_phased = et.filter(et.pair_phased)
    et_phased_and_has_PBT = et_phased.filter(et_phased.has_PBT)
    et_phased_and_agrees_PBT = et_phased.filter(et_phased.agrees_PBT)
    et_same_PID = et.filter(et.agrees_PID)
    et_same_PID_and_has_PBT = et_same_PID.filter(et_same_PID.has_PBT)
    et_same_PID_and_agrees_PBT = et_same_PID.filter(et_same_PID.agrees_PBT)
    et_mnv = et.filter(et.is_mnv)
    et_mnv_and_has_PBT = et_mnv.filter(et_mnv.has_PBT)
    et_mnv_and_agrees_PBT = et_mnv.filter(et_mnv.agrees_PBT)
    print("Starting to aggregate \n" + tm.ctime())
    # Count pairs per distance (dict: distance -> count) in each category.
    n_all = et.aggregate(hl.agg.counter(et.dist))
    n_has_PBT = et_has_PBT.aggregate(hl.agg.counter(et_has_PBT.dist))
    n_agrees_PBT = et_agrees_PBT.aggregate(hl.agg.counter(et_agrees_PBT.dist))
    n_phased = et_phased.aggregate(hl.agg.counter(et_phased.dist))
    n_phased_and_has_PBT = et_phased_and_has_PBT.aggregate(
        hl.agg.counter(et_phased_and_has_PBT.dist))
    n_phased_and_agrees_PBT = et_phased_and_agrees_PBT.aggregate(
        hl.agg.counter(et_phased_and_agrees_PBT.dist))
    n_same_PID = et_same_PID.aggregate(hl.agg.counter(et_same_PID.dist))
    n_same_PID_and_has_PBT = et_same_PID_and_has_PBT.aggregate(
        hl.agg.counter(et_same_PID_and_has_PBT.dist))
    n_same_PID_and_agrees_PBT = et_same_PID_and_agrees_PBT.aggregate(
        hl.agg.counter(et_same_PID_and_agrees_PBT.dist))
    n_mnv = et_mnv.aggregate(hl.agg.counter(et_mnv.dist))
    n_mnv_and_has_PBT = et_mnv_and_has_PBT.aggregate(
        hl.agg.counter(et_mnv_and_has_PBT.dist))
    n_mnv_and_agrees_PBT = et_mnv_and_agrees_PBT.aggregate(
        hl.agg.counter(et_mnv_and_agrees_PBT.dist))
    #also some missing: same PID and has PBT in general (not restricting to MNV) / those that agrees.
    #no we actually have it.
    print("Done aggregate \n" + tm.ctime())
    # and if we return these we are done
    # One single-row DataFrame per counter: columns are distances, the row
    # index names the category.
    df = pd.DataFrame(n_all, index=["n_all"])
    df2 = pd.DataFrame(n_has_PBT, index=["n_has_PBT"])
    df3 = pd.DataFrame(n_agrees_PBT, index=["n_agrees_PBT"])
    df4 = pd.DataFrame(n_phased, index=["n_phased"])
    df5 = pd.DataFrame(n_phased_and_has_PBT, index=["n_phased_and_has_PBT"])
    df6 = pd.DataFrame(n_phased_and_agrees_PBT,
                       index=["n_phased_and_agrees_PBT"])
    df7 = pd.DataFrame(n_mnv, index=["n_mnv"])
    df8 = pd.DataFrame(n_mnv_and_has_PBT, index=["n_mnv_and_has_PBT"])
    df9 = pd.DataFrame(n_mnv_and_agrees_PBT, index=["n_mnv_and_agrees_PBT"])
    df10 = pd.DataFrame(n_same_PID, index=["n_same_PID"])
    df11 = pd.DataFrame(n_same_PID_and_has_PBT,
                        index=["n_same_PID_and_has_PBT"])
    df12 = pd.DataFrame(n_same_PID_and_agrees_PBT,
                        index=["n_same_PID_and_agrees_PBT"])
    print(n_all)
    return (pd.concat(
        [df, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12]))