def _intersect_anchor(target, reference, side, margin):
    """Intersect one anchor of the reference dots with the target dots.

    Parameters
    ----------
    target, reference : pandas.DataFrame
        Dot tables holding ``chrom<side>`` and ``pos<side>`` columns.
    side : str
        Anchor suffix, ``'1'`` or ``'2'``; it is stripped from the column
        names before the frames are handed to ``insert_margin``.
    margin
        Passed straight through to ``insert_margin``.

    Returns
    -------
    pandas.DataFrame
        The ``bedtools intersect`` output, with reference columns suffixed
        ``_r`` and target columns suffixed ``_t``, indexed by
        ``(dot_id_r, dot_id_t)``.
    """
    anchor_cols = ['chrom' + side, 'pos' + side]

    def strip_side(name):
        return name.replace(side, '')

    # insert_margin must yield a `dot_id` column: the result is indexed on
    # it below. Presumably it also widens each position by `margin` into an
    # interval -- confirm against insert_margin itself.
    anchor_target = insert_margin(
        target[anchor_cols].rename(columns=strip_side), margin)
    anchor_ref = insert_margin(
        reference[anchor_cols].rename(columns=strip_side), margin)

    with tsv(anchor_ref) as a, tsv(anchor_target) as b:
        hits = bedtools.intersect(a=a.name, b=b.name, wa=True, wb=True)
        # With wa/wb the output carries the `a` (reference) fields first,
        # followed by the `b` (target) fields.
        hits.columns = (
            [col + '_r' for col in anchor_ref.columns]
            + [col + '_t' for col in anchor_target.columns]
        )
        hits.set_index(['dot_id_r', 'dot_id_t'], inplace=True)
    return hits


def overlap_2d_bedtools(target, reference, margin, return_ref=False):
    """Select target dots that overlap a reference dot on both anchors.

    Each anchor (``chrom1``/``pos1`` and ``chrom2``/``pos2``) is intersected
    separately via bedtools; a (reference, target) pair is kept only when it
    shows up in both the left-anchor and the right-anchor intersection.

    Parameters
    ----------
    target, reference : pandas.DataFrame
        Dot tables with ``chrom1``, ``pos1``, ``chrom2`` and ``pos2``
        columns. The ``dot_id`` values produced by ``insert_margin`` are
        used as index labels into these frames.
    margin
        Passed straight through to ``insert_margin``.
    return_ref : bool, optional
        When true, also return the matching rows of ``reference``.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        Matching rows of ``target``, sorted by index with duplicate rows
        dropped; ``(target_rows, reference_rows)`` when ``return_ref`` is
        set.
    """
    l_intersect = _intersect_anchor(target, reference, '1', margin)
    r_intersect = _intersect_anchor(target, reference, '2', margin)

    # The inner join on (dot_id_r, dot_id_t) keeps only the pairs that
    # overlap on the left anchor AND on the right anchor.
    merged_df = l_intersect.merge(
        r_intersect, how='inner', left_index=True, right_index=True
    ).reset_index()

    # A target dot can match several reference dots, so its label may repeat.
    # NOTE(review): drop_duplicates() compares row values, not index labels,
    # so distinct dots with identical contents collapse too -- confirm that
    # this is intended.
    target_inds = merged_df.dot_id_t.values
    target_result = (
        target.loc[target_inds].copy().sort_index().drop_duplicates()
    )

    if return_ref:
        ref_inds = merged_df.dot_id_r.values
        reference_result = (
            reference.loc[ref_inds].copy().sort_index().drop_duplicates()
        )
        return target_result, reference_result
    return target_result
def pair_sites(sites, separation, slop):
    """
    Pair sites by intersecting their flanking "hand" intervals.

    Every site gets a left hand centred at ``mid - separation`` and a right
    hand centred at ``mid + separation``, each reaching ``slop`` to either
    side of its centre, where ``mid`` is the floored midpoint of the site's
    ``start`` and ``end``. Right hands are then intersected with left hands
    using bedtools.

    ``sites`` must provide ``chrom``, ``start``, ``end`` and ``strand``
    columns. The result is the bedtools output with the right-hand columns
    suffixed ``_r`` followed by the left-hand columns suffixed ``_l``.
    """
    from bioframe.tools import tsv, bedtools

    mids = (sites["start"] + sites["end"]) // 2

    def build_hand(center, direction):
        # Column order matters: it fixes the layout of the TSV handed to
        # bedtools and therefore the names assigned to the output below.
        hand = sites[["chrom"]].copy()
        hand["start"] = center - slop
        hand["end"] = center + slop
        hand["site_id"] = hand.index
        hand["direction"] = direction
        hand["snip_mid"] = mids
        hand["snip_strand"] = sites["strand"]
        return hand

    left_hand = build_hand(mids - separation, "L")
    right_hand = build_hand(mids + separation, "R")

    # Drop a site entirely unless both of its hands start past position 0
    # (only the lower bound is checked here).
    in_bounds = (left_hand["start"] > 0) & (right_hand["start"] > 0)
    left_hand = left_hand[in_bounds].copy()
    right_hand = right_hand[in_bounds].copy()

    # The right hand belongs to the left anchor site and the left hand to
    # the right anchor site.
    with tsv(right_hand) as R, tsv(left_hand) as L:
        out = bedtools.intersect(a=R.name, b=L.name, wa=True, wb=True)
        right_names = [c + "_r" for c in right_hand.columns]
        left_names = [c + "_l" for c in left_hand.columns]
        out.columns = right_names + left_names
    return out
def pair_sites(sites, separation, slop):
    """
    Build left/right "hand" windows around each site and join them.

    For each row of ``sites`` (columns ``chrom``, ``start``, ``end`` and
    ``strand`` are read) the floored midpoint is computed, and two windows
    of half-width ``slop`` are placed ``separation`` to the left and to the
    right of it. The right-hand windows are intersected with the left-hand
    windows via bedtools, and the resulting table is returned with the
    right-hand columns suffixed ``_r`` and the left-hand columns ``_l``.
    """
    from bioframe.tools import tsv, bedtools

    mids = (sites['start'] + sites['end']) // 2

    hands = {}
    for direction, center in (('L', mids - separation),
                              ('R', mids + separation)):
        hand = sites[['chrom']].copy()
        hand['start'] = center - slop
        hand['end'] = center + slop
        hand['site_id'] = hand.index
        hand['direction'] = direction
        hand['snip_mid'] = mids
        hand['snip_strand'] = sites['strand']
        hands[direction] = hand

    # A site is kept only when both of its hands start at a positive
    # coordinate; no upper bound is checked.
    keep = (hands['L']['start'] > 0) & (hands['R']['start'] > 0)
    left_hand = hands['L'][keep].copy()
    right_hand = hands['R'][keep].copy()

    # Right hands (left anchor site) form the `a` file and left hands
    # (right anchor site) the `b` file.
    with tsv(right_hand) as R, tsv(left_hand) as L:
        out = bedtools.intersect(a=R.name, b=L.name, wa=True, wb=True)
        columns = [c + '_r' for c in right_hand.columns]
        columns += [c + '_l' for c in left_hand.columns]
        out.columns = columns
    return out