def dnsvFilter(dnsv, precisionlimit=True, sizemin=50, typelimit=True):
    """Return the rows of ``dnsv`` that pass the precision, type and size filters.

    Each row is examined on its own (as a one-row frame handed to ``svType``
    and ``svLen``). A row that raises while being examined is reported on
    stdout with its 1-based position and skipped, so one malformed record
    does not abort the whole filter.

    Args:
        dnsv: DataFrame of SV records. Needs an ``INFO`` column when
            ``precisionlimit`` is true.
        precisionlimit: When true, drop rows whose ``INFO`` value contains
            ``'IMPRECISE'``.
        sizemin: Minimum ``abs(svLen(row))`` for INS/DEL/DUP/INV rows;
            ``None`` disables the size filter.
        typelimit: When true, keep only rows whose ``svType`` is one of
            INS, DEL, DUP, INV.

    Returns:
        A DataFrame with the same columns as ``dnsv`` holding the surviving
        rows in their original order (empty if nothing survives).
    """
    sized_types = ('INS', 'DEL', 'DUP', 'INV')
    kept_positions = []

    for i in range(dnsv.shape[0]):
        try:
            if precisionlimit and 'IMPRECISE' in dnsv['INFO'].iloc[i]:
                continue

            row = dnsv.iloc[[i]]
            sv_type = svType(row)

            if typelimit and sv_type not in sized_types:
                continue

            if (sizemin is not None
                    and sv_type in sized_types
                    and abs(svLen(row)) < sizemin):
                continue
        except Exception:
            # Best-effort: report the bad record and move on. Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            print('Data Format Error Loc: %s ' % (i + 1))
            continue

        kept_positions.append(i)

    # One positional selection instead of growing a frame with pd.concat on
    # every iteration: linear time, and the input dtypes are preserved.
    return dnsv.iloc[kept_positions]
def judgeIfDenovo(father_SVs, mother_SVs, son_SVs, refdist, typeignore, overlap_rate, i):
    """Check whether the ``i``-th SV of the son has a match in either parent.

    The son's record is looked up first against the father's records on the
    same chromosome and, only if nothing matched, against the mother's.
    For each parent the neighbourhood of the SV start is probed; if that
    finds nothing and the type is not ``'INS'`` / ``'None'``, the
    neighbourhood of the SV end is probed as well.

    Per-chromosome parent records are read from the module-level
    ``bench_father_dict`` / ``bench_mother_dict`` (keyed by chromosome
    string); they are only accessed when the chromosome is present in the
    corresponding parent's index.

    Args:
        father_SVs: Father's SV table; its index holds chromosome labels.
        mother_SVs: Mother's SV table; its index holds chromosome labels.
        son_SVs: Son's SV table with a ``POS`` column, indexed by chromosome.
        refdist: Forwarded to ``judgeNeighbour``.
        typeignore: Forwarded to ``judgeNeighbour``.
        overlap_rate: Forwarded to ``judgeNeighbour``.
        i: Positional row number of the son's SV to test.

    Returns:
        The flag produced by ``judgeNeighbour`` for the first parent that
        matches, or 0 when neither parent matches (presumably 0 marks a
        de novo candidate -- confirm against the caller).
    """
    son_row = son_SVs.iloc[[i]]
    # Positional access: the index holds chromosome labels, so a plain
    # ``son_SVs['POS'][i]`` would depend on pandas' deprecated
    # integer-key fallback.
    son_sv_pos = int(son_SVs['POS'].iloc[i])
    son_sv_chrom = str(son_SVs.index[i])
    son_sv_end = svEnd(son_row)
    son_sv_start, son_sv_end = getStartAndEnd(son_sv_pos, son_sv_end)
    son_sv_type = svType(son_row)

    def _match_in(bench_df):
        # Probe around the start; for types other than INS/'None' also
        # probe around the end when the start found nothing.
        found = judgeNeighbour(bench_df, son_sv_start, son_sv_chrom,
                               son_sv_start, son_sv_end, son_sv_type,
                               typeignore, refdist, overlap_rate)
        if found == 0 and son_sv_type not in ['INS', 'None']:
            found = judgeNeighbour(bench_df, son_sv_end, son_sv_chrom,
                                   son_sv_start, son_sv_end, son_sv_type,
                                   typeignore, refdist, overlap_rate)
        return found

    flag = 0
    if son_sv_chrom in father_SVs.index:
        flag = _match_in(bench_father_dict[son_sv_chrom])
    if flag == 0 and son_sv_chrom in mother_SVs.index:
        flag = _match_in(bench_mother_dict[son_sv_chrom])
    return flag
def judgeNeighbour(bench_df, home_pos, compared_sv_chrom, compared_sv_start,
                   compared_sv_end, compared_sv_type, typeignore,
                   refdist=200, overlap_rate=0.5):
    """Test whether an SV overlaps one of the benchmark records next to ``home_pos``.

    Up to two benchmark records are examined: the "left" and "right"
    neighbours returned by ``binarySearch`` for ``home_pos``. A neighbour
    counts as a hit when ``judgeIfOverlap`` accepts it and, unless types are
    ignored, its ``svType`` equals ``compared_sv_type``. The left neighbour
    is tried first.

    Args:
        bench_df: Benchmark records for one chromosome, with a ``POS``
            column. May also be a single record given as a Series (what
            label-based selection yields when only one row matches); it is
            then treated as a one-row table.
        home_pos: Position whose neighbours are looked up.
        compared_sv_chrom: Unused here; kept for interface compatibility.
        compared_sv_start: Start of the SV being compared.
        compared_sv_end: End of the SV being compared.
        compared_sv_type: Type of the SV being compared.
        typeignore: When this compares equal to ``False`` the neighbour's
            type must match; any other value accepts an overlap regardless
            of type.
        refdist: Forwarded to ``judgeIfOverlap``.
        overlap_rate: Forwarded to ``judgeIfOverlap``.

    Returns:
        1 if a neighbouring benchmark record matches, otherwise 0.
    """
    if bench_df.ndim == 1:
        # A lone record arrives as a Series, for which ``shape`` is never
        # ``()``. Promote it to a one-row frame so the row-based helpers
        # below work, and pick the single neighbour by hand.
        bench_df = bench_df.to_frame().T
        if home_pos < bench_df['POS'].iloc[0]:
            left_loc, right_loc = None, 0
        else:
            left_loc, right_loc = 0, None
    else:
        left_loc, right_loc = binarySearch(bench_df, home_pos, 0,
                                           bench_df.shape[0] - 1)

    # Same truth test as the original ``typeignore == False`` so that values
    # such as 0 or None keep their previous meaning.
    types_must_match = typeignore == False  # noqa: E712

    for loc in (left_loc, right_loc):
        if loc is None:
            continue

        neighbour_row = bench_df.iloc[[loc]]
        neighbour_end = svEnd(neighbour_row)
        neighbour_type = svType(neighbour_row)
        neighbour_start, neighbour_end = getStartAndEnd(
            bench_df['POS'].iloc[loc], neighbour_end)

        if not judgeIfOverlap(compared_sv_start, compared_sv_end,
                              neighbour_start, neighbour_end,
                              compared_sv_type, refdist, overlap_rate):
            continue

        if not types_must_match or neighbour_type == compared_sv_type:
            return 1

    return 0
def judgeIfSame(data_1, data_2, refdist, typeignore, overlap_rate, i):
    """Check whether the ``i``-th SV of ``data_1`` has a match in ``data_2``.

    The records of ``data_2`` on the same chromosome are taken from the
    module-level ``bench_dict`` (keyed by chromosome string). The
    neighbourhood of the SV start is probed first; if that finds nothing and
    the type is not ``'INS'`` / ``'None'``, the neighbourhood of the SV end
    is probed too.

    Args:
        data_1: SV table with a ``POS`` column, indexed by chromosome.
        data_2: SV table whose index holds chromosome labels.
        refdist: Forwarded to ``judgeNeighbour``.
        typeignore: Forwarded to ``judgeNeighbour``.
        overlap_rate: Forwarded to ``judgeNeighbour``.
        i: Positional row number of the ``data_1`` SV to test.

    Returns:
        The flag from ``judgeNeighbour``, or 0 when the chromosome does not
        occur in ``data_2`` at all.
    """
    global bench_dict

    pos = int(data_1['POS'].iloc[i])
    chrom = str(data_1.index[i])
    raw_end = svEnd(data_1.iloc[[i]])
    start, end = getStartAndEnd(pos, raw_end)
    sv_type = svType(data_1.iloc[[i]])

    # Nothing to compare against when the chromosome is absent from data_2.
    if chrom not in data_2.index:
        return 0

    candidates = bench_dict[chrom]
    result = judgeNeighbour(candidates, start, chrom, start, end, sv_type,
                            typeignore, refdist, overlap_rate)
    if result != 0 or sv_type in ['INS', 'None']:
        return result

    # Start-side probe found nothing: retry around the end coordinate.
    return judgeNeighbour(candidates, end, chrom, start, end, sv_type,
                          typeignore, refdist, overlap_rate)