def test_window_by_locus(self):
    mt = hl.utils.range_matrix_table(100, 2, n_partitions=10)
    mt = mt.annotate_rows(locus=hl.locus('1', mt.row_idx + 1))
    mt = mt.key_rows_by('locus')
    mt = mt.annotate_entries(e_row_idx=mt.row_idx, e_col_idx=mt.col_idx)
    mt = hl.window_by_locus(mt, 5).cache()

    self.assertEqual(mt.count_rows(), 100)

    rows = mt.rows()
    self.assertTrue(
        rows.all((rows.row_idx < 5) | (rows.prev_rows.length() == 5)))
    self.assertTrue(
        rows.all(
            hl.all(lambda x: (rows.row_idx - 1 - x[0]) == x[1].row_idx,
                   hl.zip_with_index(rows.prev_rows))))

    entries = mt.entries()
    self.assertTrue(
        entries.all(
            hl.all(lambda x: x.e_col_idx == entries.col_idx,
                   entries.prev_entries)))
    self.assertTrue(
        entries.all(
            hl.all(lambda x: entries.row_idx - 1 - x[0] == x[1].e_row_idx,
                   hl.zip_with_index(entries.prev_entries))))
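# For reference: `window_by_locus` adds a row field `prev_rows` (the row fields
# of every variant within the base-pair window upstream, nearest first, as the
# test above checks) and a parallel entry field `prev_entries` (that sample's
# entries at those variants). A minimal sketch of inspecting both, assuming the
# same Hail version as these snippets, where the function is exposed as
# `hl.window_by_locus` (in current Hail it lives under `hl.experimental`):
import hail as hl

mt = hl.utils.range_matrix_table(10, 2)
mt = mt.annotate_rows(locus=hl.locus('1', mt.row_idx + 1))
mt = mt.annotate_entries(x=mt.row_idx * 10 + mt.col_idx)  # toy entry payload
mt = mt.key_rows_by('locus')
mt = hl.window_by_locus(mt, 2)  # window of 2 bp upstream of each locus
mt.rows().select('prev_rows').show(5)        # row-level view of the window
mt.entries().select('prev_entries').show(5)  # per-sample view, aligned with prev_rows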
exomes = get_gnomad_data("genomes", release_samples=True, adj=True,
                         release_annotations=True)  # actually genomes, despite the variable name
#ex20 = hl.filter_intervals(exomes.select_rows("allele_info").select_cols(), [hl.parse_locus_interval("20:start-2M")]) #first 2Mb
ex20 = hl.filter_intervals(exomes.select_rows("info_DP").select_cols(),
                           [hl.parse_locus_interval("20")])
#ex20 = ex20.filter_cols(hl.rand_bool(0.1)) #~=15000 samples for a small test, exomes
#ex20 = ex20.filter_cols(hl.rand_bool(0.1)) #~=1500 samples for a small test, genomes
ex20 = hl.filter_alleles(ex20, lambda allele, i: hl.is_snp(
    ex20.alleles[0], allele))  # currently take only SNPs
ex20 = ex20.filter_entries(
    ex20.GT.is_het())  # throw away unwanted entries (non-alt)
ex20_pair = hl.window_by_locus(ex20, 2)  # just look at nearby pairs for now
ex20_pair = ex20_pair.filter_entries(
    hl.is_defined(ex20_pair.GT) & (ex20_pair.prev_entries.length() > 0))
ex20_pair = ex20_pair.filter_entries(
    ex20_pair.prev_entries.filter(lambda x: x.GT.is_het()).length() > 0)
et = ex20_pair.entries()
et = et.annotate(indices=hl.range(0, hl.len(et.prev_rows)))
et = et.explode('indices')
et = et.transmute(prev_row=et.prev_rows[et.indices],
                  prev_entry=et.prev_entries[et.indices])
et = et.filter(hl.is_defined(
    et.prev_entry.GT))  # and remove non-corresponding entries
et = et.annotate(agrees_PID=((et.GT.phased) & (et.prev_entry.GT.phased) &
                             (et.PID == et.prev_entry.PID) &
                             hl.is_defined(et.PID)))
et = et.annotate(dist=et.locus.position - et.prev_row.locus.position)  # inter-variant distance
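# The annotate/explode/transmute sequence above is the generic recipe for
# flattening the parallel prev_rows/prev_entries arrays into one table row per
# (variant, previous-variant) pair. A self-contained toy version of just that
# step (same top-level `hl.window_by_locus` assumption as above):
import hail as hl

mt = hl.utils.range_matrix_table(10, 1)
mt = mt.annotate_rows(locus=hl.locus('1', mt.row_idx + 1))
mt = mt.annotate_entries(dp=mt.row_idx)  # toy entry payload
mt = mt.key_rows_by('locus')
mt = hl.window_by_locus(mt, 2)
et = mt.entries()
et = et.annotate(indices=hl.range(0, hl.len(et.prev_rows)))  # one index per windowed variant
et = et.explode('indices')  # rows with empty windows are dropped here
et = et.transmute(prev_row=et.prev_rows[et.indices],
                  prev_entry=et.prev_entries[et.indices])
et.show()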
#repartition - actually not needed; 10000 partitions from the beginning.
mt = hl.filter_intervals(mt_all, [hl.parse_locus_interval(chr)])
rf = hl.filter_intervals(rf_all, [hl.parse_locus_interval(chr)])
mt = mt.repartition(1000)
rf = rf.repartition(1000)
#let's actually filter to >15x from the beginning..
#no, will do it downstream, but not here.
#keep also AF etc. info
mt = mt.select_cols()
mt = mt.select_rows(mt.info.AF, mt.info.AC, mt.a_index)
mt = mt.annotate_rows(filters=rf[mt.row_key].filters)  # rf as a new "filters" row field
mt = mt.annotate_rows(AC=mt.AC[mt.a_index - 1],
                      AF=mt.AF[mt.a_index - 1])  # re-annotating the AF/AC
mt = mt.filter_entries(mt.GT.is_non_ref() & hl.is_defined(mt.PID))  # throwing away unneeded entries
mt = hl.window_by_locus(mt, 10)  # pair variants within a window of at most 10 bp
mt = mt.filter_entries(hl.is_defined(mt.GT) &
                       (mt.prev_entries.length() > 0))  # throwing away SNPs with no MNV partner
mt = mt.filter_entries(
    mt.prev_entries.filter(lambda x: x.GT.is_non_ref()).length() > 0)  # same
et = mt.key_cols_by().entries()  # matrix with 1000 rows (variants) x 1000 cols (samples) => 1 million entries
et = et.annotate(indices=hl.range(0, hl.len(et.prev_rows)))
et = et.explode('indices')
et = et.transmute(prev_row=et.prev_rows[et.indices],
                  prev_entry=et.prev_entries[et.indices])
et = et.annotate(dist=et.locus.position - et.prev_row.locus.position)  # annotating the distance
#et.cache() #should make everything faster -> no, actually seems to make it slower..
#het x het
et_het = et.filter((et.GT.phased) & (et.prev_entry.GT.phased) &
                   (et.PID == et.prev_entry.PID) &
                   (et.GT == et.prev_entry.GT) &
                   (et.GT.is_het_ref()) &
                   (et.prev_entry.GT.is_het_ref()))  # only het-het MNVs (= same phase)
et_het = et_het.repartition(1000)
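# A toy check (hypothetical values) of the het x het "same phase" criterion
# used for et_het above: phased-call equality is order-sensitive, so two 0|1
# calls sharing a PID sit on the same haplotype, whereas 1|0 vs 0|1 do not:
import hail as hl

gt_a = hl.call(0, 1, phased=True)  # 0|1
gt_b = hl.call(0, 1, phased=True)  # 0|1
gt_c = hl.call(1, 0, phased=True)  # 1|0
print(hl.eval(gt_a == gt_b))  # True: same phased genotype
print(hl.eval(gt_a == gt_c))  # False: alt allele on the opposite haplotype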
])  # change the call_fields according to the Hail documentation
vcf = hl.split_multi_hts(vcf)
vcf.write(sys.argv[0] + ".mt", overwrite=True)
mt = hl.read_matrix_table(sys.argv[0] + ".mt")

#calling
mt = mt.select_cols()  # dropping unneeded columns makes things faster
mt = mt.annotate_rows(AC=mt.info.AC[mt.a_index - 1],
                      AF=mt.info.AF[mt.a_index - 1])  # for the multiallelic case
mt = mt.select_rows(mt.filters, mt.AC,
                    mt.AF)  # or any row fields that you want to keep for future investigation
mt = mt.filter_entries(hl.is_defined(mt.GT) &
                       mt.GT.is_non_ref())  # interested in non-ref only
mt = hl.window_by_locus(
    mt, 2)  # pair variants within a window -- we only care within a codon reading frame, so the max distance is 2
mt = mt.filter_entries(mt.prev_entries.length() > 0)
mt = mt.filter_entries(hl.is_defined(mt.GT) &
                       (mt.prev_entries.length() > 0))  # throwing away SNPs with no MNV partner
mt = mt.filter_entries(
    mt.prev_entries.filter(lambda x: x.GT.is_non_ref()).length() > 0)  # same
et = mt.key_cols_by().entries()  # matrix with 1000 rows (variants) x 1000 cols (samples) => 1 million entries
et = et.annotate(indices=hl.range(0, hl.len(et.prev_rows)))
et = et.explode('indices')  # for the case where there is more than one prev_row for a variant
et = et.transmute(prev_row=et.prev_rows[et.indices],
                  prev_entry=et.prev_entries[et.indices])
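# Illustration only (made-up numbers) of why AC/AF are indexed with a_index - 1:
# after split_multi_hts, each biallelic row keeps `a_index`, the 1-based index
# of the alternate allele it came from, while per-allele INFO arrays still hold
# one entry per original alt allele:
import hail as hl

info_AC = hl.literal([5, 12])  # hypothetical AC for two original alt alleles
a_index = 2                    # this split row came from the second alt allele
print(hl.eval(info_AC[a_index - 1]))  # 12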
def phase_sensitivity_fast(mt, windowsize=1, adj=True):
    # trying to make the above faster.
    # takes a matrix table that has PID, GT, PBT_GT and calculates the phase
    # sensitivity, summed over all individuals.
    # for window size k, get the result for window size = 1, 2, ..., k
    import pandas as pd
    mt = hl.filter_alleles(mt, lambda allele, i: hl.is_snp(
        mt.alleles[0], allele))  # currently take only SNPs
    mt = mt.select_rows()  # throw away unwanted rows
    mt = mt.filter_entries(
        mt.GT.is_het())  # throw away unwanted entries (non-alt)
    mt = hl.window_by_locus(mt, windowsize)
    mt = mt.filter_entries(
        hl.is_defined(mt.GT) & (mt.prev_entries.length() > 0))
    mt = mt.filter_entries(
        mt.prev_entries.filter(lambda x: x.GT.is_het()).length() > 0)
    et = mt.entries()
    et = et.annotate(indices=hl.range(0, hl.len(et.prev_rows)))
    et = et.explode('indices')
    et = et.transmute(prev_row=et.prev_rows[et.indices],
                      prev_entry=et.prev_entries[et.indices])
    et = et.filter(hl.is_defined(
        et.prev_entry.GT))  # and remove non-corresponding entries
    if adj:  # restrict to entries passing adj
        et = et.filter(et.adj & et.prev_entry.adj)
    print("\n et created and filtered. \n Starting to look at phase info \n"
          + tm.ctime())
    # annotate columns
    et = et.annotate(dist=et.locus.position - et.prev_row.locus.position,
                     pair_phased=(et.GT.phased) & (et.prev_entry.GT.phased),
                     has_PBT=(hl.is_defined(et.PBT_GT)) &
                             (hl.is_defined(et.prev_entry.PBT_GT)))
    et = et.annotate(is_mnv=(et.pair_phased &
                             (et.PID == et.prev_entry.PID) &
                             (et.GT == et.prev_entry.GT)))
    et = et.annotate(flipped_GT=hl.call(et.GT[1], et.GT[0],
                                        phased=et.GT.phased),
                     prev_entry_flipped_GT=hl.call(
                         et.prev_entry.GT[1],
                         et.prev_entry.GT[0],
                         phased=et.prev_entry.GT.phased))
    et = et.annotate(agrees_PBT=(
        ((et.GT == et.PBT_GT) &
         (et.prev_entry.GT == et.prev_entry.PBT_GT)) |
        ((et.flipped_GT == et.PBT_GT) &
         (et.prev_entry_flipped_GT == et.prev_entry.PBT_GT))))
    et = et.annotate(agrees_PID=((et.pair_phased) &
                                 (et.PID == et.prev_entry.PID) &
                                 hl.is_defined(et.PID)))
    #agrees_PID only if the pair is phased at all
    # define each category
    et_has_PBT = et.filter(et.has_PBT)
    et_agrees_PBT = et.filter(et.agrees_PBT)
    et_phased = et.filter(et.pair_phased)
    et_phased_and_has_PBT = et_phased.filter(et_phased.has_PBT)
    et_phased_and_agrees_PBT = et_phased.filter(et_phased.agrees_PBT)
    et_same_PID = et.filter(et.agrees_PID)
    et_same_PID_and_has_PBT = et_same_PID.filter(et_same_PID.has_PBT)
    et_same_PID_and_agrees_PBT = et_same_PID.filter(et_same_PID.agrees_PBT)
    et_mnv = et.filter(et.is_mnv)
    et_mnv_and_has_PBT = et_mnv.filter(et_mnv.has_PBT)
    et_mnv_and_agrees_PBT = et_mnv.filter(et_mnv.agrees_PBT)
    print("Starting to aggregate \n" + tm.ctime())
    n_all = et.aggregate(hl.agg.counter(et.dist))
    n_has_PBT = et_has_PBT.aggregate(hl.agg.counter(et_has_PBT.dist))
    n_agrees_PBT = et_agrees_PBT.aggregate(hl.agg.counter(et_agrees_PBT.dist))
    n_phased = et_phased.aggregate(hl.agg.counter(et_phased.dist))
    n_phased_and_has_PBT = et_phased_and_has_PBT.aggregate(
        hl.agg.counter(et_phased_and_has_PBT.dist))
    n_phased_and_agrees_PBT = et_phased_and_agrees_PBT.aggregate(
        hl.agg.counter(et_phased_and_agrees_PBT.dist))
    n_same_PID = et_same_PID.aggregate(hl.agg.counter(et_same_PID.dist))
    n_same_PID_and_has_PBT = et_same_PID_and_has_PBT.aggregate(
        hl.agg.counter(et_same_PID_and_has_PBT.dist))
    n_same_PID_and_agrees_PBT = et_same_PID_and_agrees_PBT.aggregate(
        hl.agg.counter(et_same_PID_and_agrees_PBT.dist))
    n_mnv = et_mnv.aggregate(hl.agg.counter(et_mnv.dist))
    n_mnv_and_has_PBT = et_mnv_and_has_PBT.aggregate(
        hl.agg.counter(et_mnv_and_has_PBT.dist))
    n_mnv_and_agrees_PBT = et_mnv_and_agrees_PBT.aggregate(
        hl.agg.counter(et_mnv_and_agrees_PBT.dist))
    #also some were missing: same PID and has PBT in general (not restricted to MNV) / those that agree.
    #no, we actually have them.
    print("Done aggregating \n" + tm.ctime())
    # and if we return these we are done
    df = pd.DataFrame(n_all, index=["n_all"])
    df2 = pd.DataFrame(n_has_PBT, index=["n_has_PBT"])
    df3 = pd.DataFrame(n_agrees_PBT, index=["n_agrees_PBT"])
    df4 = pd.DataFrame(n_phased, index=["n_phased"])
    df5 = pd.DataFrame(n_phased_and_has_PBT, index=["n_phased_and_has_PBT"])
    df6 = pd.DataFrame(n_phased_and_agrees_PBT,
                       index=["n_phased_and_agrees_PBT"])
    df7 = pd.DataFrame(n_mnv, index=["n_mnv"])
    df8 = pd.DataFrame(n_mnv_and_has_PBT, index=["n_mnv_and_has_PBT"])
    df9 = pd.DataFrame(n_mnv_and_agrees_PBT, index=["n_mnv_and_agrees_PBT"])
    df10 = pd.DataFrame(n_same_PID, index=["n_same_PID"])
    df11 = pd.DataFrame(n_same_PID_and_has_PBT,
                        index=["n_same_PID_and_has_PBT"])
    df12 = pd.DataFrame(n_same_PID_and_agrees_PBT,
                        index=["n_same_PID_and_agrees_PBT"])
    print(n_all)
    return pd.concat(
        [df, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12])
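# For context: each hl.agg.counter above returns a plain Python dict keyed by
# inter-variant distance, and pd.concat stacks the one-row DataFrames into a
# categories x distances table. A tiny sketch with made-up counts:
import pandas as pd

n_all = {1: 120, 2: 75}  # hypothetical counts per distance
n_mnv = {1: 40}          # hypothetical; distance 2 never observed
df = pd.concat([pd.DataFrame(n_all, index=["n_all"]),
                pd.DataFrame(n_mnv, index=["n_mnv"])])
print(df)  # columns are distances; NaN where a category lacks that distance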