def getAnchorPETs(jdf, loops, pre, cut=0):
    """
    Select the PETs whose ends fall inside merged loop anchors and dump
    them to a new .jd file.

    @param jdf: str, file of .jd
    @param loops: dict, 'chr8-chr8-605': ['chr8', 61242502, 61242734, 'chr8', 61244107, 61244150]
    @param pre: str, output directory for the filtered .jd file
    @param cut: int, distance cutoff forwarded to parseJd
    @return: (loop count, merged anchor count, raw PET count, anchor PET count)
    """
    anchors = getAnchors(loops)
    key, mat = parseJd(jdf, cut)
    logger.info("%s:%s & %s loops,merged %s anchors" %
                (key, jdf, len(loops), len(anchors)))
    # positional indexes for the left (column 1) and right (column 2) PET ends
    left_keys, left_link = getCorLink(mat[:, 1])
    right_keys, right_link = getCorLink(mat[:, 2])
    hits = set()
    # scan both PET ends against every merged anchor interval
    for keys, link in ((left_keys, left_link), (right_keys, right_link)):
        for anchor in anchors:
            lo = np.searchsorted(keys, anchor[0], side="left")
            hi = np.searchsorted(keys, anchor[1], side="right")
            for k in keys[lo:hi]:
                hits.update(link[k])
    nmat = mat[list(hits), ]
    joblib.dump(nmat, os.path.join(pre, "-".join(key) + ".jd"))
    logger.info("%s:%s raw PETs %s PETs in anchors" %
                (key, mat.shape[0], nmat.shape[0]))
    return len(loops), len(anchors), mat.shape[0], nmat.shape[0]
def singleDBSCAN(f, eps, minPts, cut=0):
    """
    Run DBSCAN to detect interactions for one chromosome.
    #mat is list, every is [ pointId,x,y ]

    @param f: str, .jd file for one chromosome pair
    @param eps: DBSCAN eps parameter
    @param minPts: DBSCAN minPts parameter
    @param cut: int, pre-set distance cutoff; PETs with y-x < cut are removed
        before clustering and their distances recorded in dss
    @return: (key, f, dataI, dataS, dis, dss) where dataI are inter-ligation
        cluster records (BEDPE-like lists), dataS self-ligation cluster
        records, dis/dss the corresponding y-x distance lists
    """
    dataI, readI, dataS, readS, dis, dss = [], [], [], [], [], []
    # NOTE(review): cut=0 here looks deliberate — the full matrix is loaded so
    # that sub-cutoff distances can be collected into dss below; confirm
    # parseJd's own cut filtering is meant to be bypassed.
    key, mat = parseJd(f, cut=0)
    if cut > 0:
        # drop PETs shorter than the cutoff, keeping their distances in dss
        d = mat[:, 2] - mat[:, 1]
        p = np.where(d >= cut)[0]
        mat = mat[p, :]
        dss.extend(list(d[d < cut]))
    if len(mat) == 0:
        # nothing left to cluster after filtering
        return key, f, dataI, dataS, list(dis), list(dss)
    #data for interaction records, read for readId
    report = "Clustering %s and %s using eps as %s, minPts as %s,pre-set distance cutoff as > %s" % (
        key[0], key[1], eps, minPts, cut)
    logger.info(report)
    db = DBSCAN(mat, eps, minPts)
    labels = pd.Series(db.labels)
    mat = np.array(mat)
    # re-index by pointId with float X/Y coordinate columns for .loc selection
    mat = pd.DataFrame(
        mat[:, 1:].astype("float"), index=mat[:, 0], columns=["X", "Y"])
    nlabels = set(labels.values)
    #collect clusters
    for label in nlabels:
        # point ids belonging to this cluster label
        los = list(labels[labels == label].index)
        sub = mat.loc[los, :]
        #BEDPE format,+1 to escape the error that exact the same start and end
        #2017-05-18, changed to remove such interactions
        if int(np.min(sub["X"])) == int(np.max(sub["X"])) or int(
                np.min(sub["Y"])) == int(np.max(sub["Y"])):
            continue
        # cluster bounding box as a BEDPE-like record
        r = [
            key[0],
            int(np.min(sub["X"])),
            int(np.max(sub["X"])),
            key[1],
            int(np.min(sub["Y"])),
            int(np.max(sub["Y"])),
            #sub.shape[0],
            #",".join(map(str, los)),
            #los
        ]
        # non-overlapping anchors (left end < right start) -> inter-ligation;
        # otherwise treated as self-ligation
        if r[2] < r[4]:
            dataI.append(r)
            readI.extend(los)
        else:
            dataS.append(r)
            readS.extend(los)
    report = "Clustering %s and %s finished. Estimated %s self-ligation reads and %s inter-ligation reads" % (
        key[0], key[1], len(readS), len(readI))
    logger.info(report)
    if len(dataI) > 0:
        dis = mat.loc[readI, "Y"] - mat.loc[readI, "X"]
    if len(dataS) > 0:
        dss.extend(list(mat.loc[readS, "Y"] - mat.loc[readS, "X"]))
    return key, f, dataI, dataS, list(dis), list(dss)
def getGenomeCoverage(f, cut=0):
    """
    Build the genomic model for random access. Could use a lot of memory.

    NOTE(review): a later definition in this file re-declares
    getGenomeCoverage and therefore shadows this one — confirm which
    implementation is intended to be live.

    @param f: .jd file
    @param cut: distance cutoff for self-ligation PETs.
    @return: ([[left_keys, left_link], [right_keys, right_link]], PET count),
        or (None, 0) when fewer than two PETs remain
    """
    key, mat = parseJd(f, cut)
    n_pets = mat.shape[0]
    if n_pets < 2:
        return None, 0
    # positional indexes for the left (column 1) and right (column 2) ends
    left_keys, left_link = getCorLink(mat[:, 1])
    right_keys, right_link = getCorLink(mat[:, 2])
    return [[left_keys, left_link], [right_keys, right_link]], n_pets
def getGenomeCoverage(f, cut=0):
    """
    Build the genomic model for random access. Could use a lot of memory.

    NOTE(review): this re-declares getGenomeCoverage and shadows the earlier
    definition in this file — confirm which implementation is intended.

    @param f: .jd file
    @param cut: distance cutoff for self-ligation PETs.
    @return: (model, 2 * PET count) where model is a coordinate-indexed list;
        each occupied slot holds point ids (+id for a left end, -id for a
        right end) and unoccupied slots hold False. (None, 0) on empty input.
    """
    key, mat = parseJd(f, cut)
    total = mat.shape[0]
    if total == 0:
        return None, 0
    # one slot per genomic coordinate, padded 1,000,000 past the largest
    # coordinate seen so boundary lookups cannot escape the list
    top = max([np.max(mat[:, 1]), np.max(mat[:, 2])])
    model = [False] * (top + 1000000)
    for pet in mat:
        pid, left, right = pet[0], pet[1], pet[2]
        if model[left] is False:
            model[left] = []
        if model[right] is False:
            model[right] = []
        # sign encodes which end of the PET landed here
        model[left].append(pid)
        model[right].append(0 - pid)
    # every PET contributes two ends
    return model, total * 2