def _strand_mask(bin_starts, bin_ends, values, start, end, resolution, min_expre):
    """Return a 0/1 array with 1 on the bins covered by intervals whose value exceeds min_expre.

    The array has one entry per step of ``np.arange(start, end, resolution)``.
    """
    mask = np.arange(start, end, resolution) * 0
    # Only `start` is divided by the resolution: the interval bounds are used
    # as they come, i.e. as bin indices.
    # NOTE(review): this matches the caller returning `X * resolution`, but
    # confirm the units of chromStart/chromEnd in the raw "ExpGenes" table.
    offset = start / resolution
    for istart, iend, v in zip(bin_starts, bin_ends, values):
        if v > min_expre:
            mask[int(round(istart - offset)):int(round(iend - offset))] = 1
    return mask


def get_expression(cell, ch, start, end, resolution, min_expre=0):
    """Load the "ExpGenes" table of a region and split it by strand.

    Parameters
    ----------
    cell : forwarded to ``replication_data`` as the cell identifier.
    ch : chromosome, forwarded to ``replication_data``.
    start, end : bounds of the region; also define the length of the
        direction array through ``np.arange(start, end, resolution)``.
    resolution : bin size used for the request and for the direction array.
    min_expre : an interval contributes to the direction array only when its
        ``signalValue`` is strictly greater than this value.

    Returns
    -------
    tuple ``(X * resolution, Y, Xm * resolution, Ym, direction)`` where

    * ``X`` / ``Y`` describe the intervals whose strand is ``"+"`` as
      consecutive triplets ``(start, end, end + 1)`` / ``(v, v, nan)``
      (presumably so that a line plot is broken between intervals);
    * ``Xm`` / ``Ym`` hold the same layout for every other strand value;
    * ``direction`` is ``+1`` on bins covered only by a "+" interval above
      ``min_expre``, ``-1`` on bins covered only by an interval of the other
      strand, and ``0`` elsewhere (including bins covered by both).

    The mean and standard deviation of the "+" strand values are printed.
    """
    records = replication_data(cell, "ExpGenes", chromosome=ch,
                               start=start, end=end, resolution=resolution, raw=True)

    X, Y = [], []      # "+" strand
    Xm, Ym = [], []    # any other strand value
    for istart, iend, v, strand in zip(records["chromStart"], records["chromEnd"],
                                       records["signalValue"], records["strand"]):
        if strand == "+":
            xs, ys = X, Y
        else:
            xs, ys = Xm, Ym
        xs.extend([istart, iend, iend + 1])
        ys.extend([v, v, np.nan])

    mean = np.nanmean(Y)
    stdv = np.nanstd(Y)
    print(mean, stdv)

    X = np.array(X)
    Y = np.array(Y)
    Xm = np.array(Xm)
    Ym = np.array(Ym)

    # Every third element starting at 0 (resp. 1) is an interval start (resp. end).
    directionp = _strand_mask(X[::3], X[1::3], Y[::3], start, end, resolution, min_expre)
    directionm = _strand_mask(Xm[::3], Xm[1::3], Ym[::3], start, end, resolution, min_expre)

    return X * resolution, Y, Xm * resolution, Ym, directionp - directionm
# Script section: load one signal per input file and per chromosome.
# Reads args.files (iterable of input names) and args.remove (optional
# upper threshold) from the parsed command line.
args = parser.parse_args()
# One entry per chromosome; the loop below numbers them starting at 1.
# NOTE(review): presumably chromosome lengths in base pairs — confirm the assembly.
chromlength = [248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, 138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, 83257441, 80373285, 58617616, 64444167, 46709983, 50818468]
#os.makedirs(args.root, exist_ok=True)
data = []
# NOTE(review): `l` (the chromosome length) is not used in the visible code.
for ch, l in enumerate(chromlength, 1):
    # Signals of every input file for the current chromosome.
    Y = []
    for file in args.files:
        # The file name is passed both as the signal name and as `filename`.
        x, y = replication_data("hela", file, filename=file, chromosome=ch, start=0, end=None, resolution=5)
        if args.remove is not None:
            # Report how many values exceed the threshold, then blank them.
            print(file,"removing %i points" %np.sum(y>args.remove))
            y[y>args.remove] = np.nan
        Y.append(y)
    #if len(args.files) == 1:
    #    data.append(Y)
    #else:
    #    data.append(np.nanmean(Y,axis=0))
# NOTE(review): with the block above commented out nothing is ever appended
# to `data`, so X is built from an empty list — confirm this is intended.
# One chromosome label ("chr1", "chr2", ...) per element of each entry of `data`.
X = [["chr%i" % i] * len(d) for i, d in enumerate(data, 1)]
start, end, resolution, min_expre=1) d3p = direction x = Xg d3p = Yg #ymg[ymg<1] = np.nan sup_sig = [xmg, -ymg, "neg"] #d3p[np.abs(d3p)<1]=np.nan elif "[" not in signal and "--" not in signal: print("H") x, d3p = replication_data(cell, signal, chromosome=ch, start=start, end=end, resolution=mini(resolution, signal), raw=False, filename=None) elif "--" in signal or ":" in signal: weights_list = [] if "--" in signal: signal, sigv = signal.split("--") if ":" in sigv: sigv, *weights_list = sigv.split(":") x, d3p = replication_data(cell, signal, chromosome=ch, start=start, end=end,
MRTstd = score["MRTstd"][0] RFDp = float(score["RFDp"][0].split(",")[0][1:]) RFDstd = score["RFDstd"][0] RepTime = score["RepTime"][0] #scorev = 2-c1-c2 return MRTp, MRTstd, RFDp, RFDstd, RepTime for mark in marks: x, d = replication_data(cell, mark, chromosome=ch, start=start, end=end, resolution=5, raw=False, oData=False, bp=True, bpc=False) print(mark, d) if d == []: print("Skipping %s" % mark) continue for kon in [5e-7]: for ndiff in [30, 45, 60, 75, 90, 105, 120]: #ndiff = 60 for random_activation in [0, 0.05, 0.1, 0.2]: for dori in [5, 15, 30]: if "/" in mark: mark0 = "Epi_Bigger"
def detect_peaks(start, end, ch, resolution_polarity=5, exp_factor=4, percentile=85, cell="K562",
                 cellMRT=None, cellRFD=None, nanpolate=False, fsmooth=None, gsmooth=5,
                 recomp=False, dec=None, fich_name=None, sim=True, expRFD="OKSeq", rfd_only=False):
    """Score the rising steps of a polarity profile.

    The polarity signal is smoothed, differentiated (first difference),
    shifted so that its minimum is zero, thresholded at a percentile and,
    unless ``rfd_only`` is set, weighted by ``exp(-exp_factor * MRT)``.

    Parameters
    ----------
    start, end, ch : region and chromosome forwarded to ``replication_data``.
    resolution_polarity : resolution used to load the polarity signal; the
        loaded signal is also divided by this value.
    exp_factor : strength of the exponential MRT weighting.
    percentile : percentile of the (non-NaN) step values below which steps
        are set to zero.
    cell : default cell used when ``cellMRT`` / ``cellRFD`` are None.
    cellMRT, cellRFD : cells used for the "MRT" and polarity data sets.
    nanpolate : when true, the polarity signal goes through ``nan_polate``.
    fsmooth : when not None, the polarity signal goes through
        ``smooth(pol_exp, fsmooth)``.
    gsmooth : window given to ``sm`` in the MRT-dependent smoothing passes.
    recomp : when true, steps are recomputed from the polarity signal
        smoothed with ``smooth(pol_exp, 2)`` and kept only where the
        thresholded steps are positive.
    dec : None, or 2 to additionally zero the steps of every window of three
        consecutive polarity values that does not rise by at least 0.05.
    fich_name : optional comma-separated file read with pandas; it must
        provide ``chromStart`` plus ``RFDs``/``MRTs`` (``sim`` true) or
        ``RFDe``/``MRTe`` (``sim`` false). When given, nothing is loaded
        through ``replication_data`` and ``nanpolate``/``fsmooth`` are ignored.
    sim : selects which pair of columns of ``fich_name`` is used.
    expRFD : name of the polarity data set requested from ``replication_data``.
    rfd_only : skip everything that needs the MRT signal and use a single
        ``sm(Smpol, 10)`` smoothing instead.

    Returns
    -------
    ``(x_pol, steps)`` where ``steps`` is the step array with a leading 0
    prepended, so that it is one element longer than the first difference.

    Raises
    ------
    NotImplementedError
        If ``dec`` is neither None nor 2. (The original bare ``raise``
        surfaced as a ``RuntimeError``; ``NotImplementedError`` is a subclass
        of it, so existing handlers still match.)
    """
    rpol = resolution_polarity

    if fich_name is None:
        if cellMRT is None:
            cellMRT = cell
        if cellRFD is None:
            cellRFD = cell

        print(start, end, cellRFD, ch, rpol)

        x_pol, pol_exp = replication_data(cellRFD, expRFD, chromosome=ch,
                                          start=start, end=end, resolution=rpol, raw=False, pad=True)
        resolution = 1 if "Yeast" in cellMRT else 10
        if not rfd_only:
            # In rfd_only mode no MRT signal is loaded at all.
            _, mrt_exp = replication_data(cellMRT, "MRT", chromosome=ch,
                                          start=start, end=end, resolution=resolution, raw=False)

        if nanpolate:
            pol_exp = nan_polate(pol_exp)
        if fsmooth is not None:
            pol_exp = smooth(pol_exp, fsmooth)

        # Number of polarity bins per MRT bin.
        ratio_res = resolution // rpol
        pol_exp /= rpol
        Smpol = np.copy(pol_exp)
        if not rfd_only:
            # MRT brought onto the polarity grid.
            # NOTE(review): inferred from the arguments; confirm against `mapboth`.
            nmrt = mapboth(mrt_exp, pol_exp, ratio_res, pad=True)
    else:
        strain = pd.read_csv(fich_name, sep=",")
        x_pol = strain.chromStart
        if sim:
            pol_exp = strain.RFDs
            mrt_exp = strain.MRTs
        else:
            pol_exp = strain.RFDe
            mrt_exp = strain.MRTe
        # Both columns come from the same rows, hence no resampling.
        nmrt = mrt_exp
        Smpol = np.copy(pol_exp)
        ratio_res = 1

    if not rfd_only:
        # Thresholds go from 0.8 down to 0.2; a bin is re-smoothed once for
        # every threshold its MRT value exceeds.
        for mrt_cut in [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8][::-1]:
            late = nmrt > mrt_cut
            Smpol[late] = np.array(sm(Smpol, gsmooth))[late]
        Smpol = sm(Smpol, 3)
    else:
        Smpol = sm(Smpol, 10)

    delta = Smpol[1:] - Smpol[:-1]
    delta -= np.nanmin(delta)

    threshold = np.percentile(delta[~np.isnan(delta)], percentile)
    print("Threshold value", threshold)
    delta[delta < threshold] = 0.0

    if recomp:
        pol_exp = smooth(pol_exp, 2)
        deltap = pol_exp[1:] - pol_exp[:-1]
        # NOTE(review): the shift uses the minimum of the thresholded `delta`,
        # not of `deltap` — kept as in the original, confirm it is intended.
        deltap -= np.nanmin(delta)
        deltap[delta <= 0] = 0
        delta = deltap

    delta[delta < 0] = 0

    if dec is not None:
        if dec != 2:
            raise NotImplementedError("detect_peaks: dec must be None or 2, got %r" % (dec,))
        for i, (ok0, _, ok2) in enumerate(zip(pol_exp, pol_exp[1:], pol_exp[2:])):
            if ok0 + 0.05 > ok2:
                delta[i] = 0  # shifted from one on purpose
                delta[i + 1] = 0

    if not rfd_only:
        delta *= mapboth(np.exp(-exp_factor * mrt_exp), delta, ratio_res, pad=True)

    delta[np.isnan(delta)] = 0
    return x_pol, np.concatenate(([0], delta))
def compare(simu, signal, cell, res, ch, start, end, trim=0.05, return_exp=False, rescale=1, nanpolate=False,
            smoothf=None, trunc=False, pad=False, return_mask=False, masking=True, propagateNan=True):
    """Compare a simulated profile with an experimental signal.

    Parameters
    ----------
    simu : simulated profile (array), or None to only retrieve the
        experimental signal and its mask.
    signal, cell, ch, start, end : forwarded to ``replication_data``.
    res : resolution of the request; also converts ``masking`` to a number
        of bins.
    trim, nanpolate : accepted for compatibility; not used by this function.
    return_exp : append the (rescaled, possibly smoothed) experimental signal
        to the result.
    rescale : factor applied to the experimental signal.
    smoothf : when not None, the experimental signal goes through
        ``nan_polate`` and then ``smooth(exp_signal, smoothf)``.
    trunc : when true and the lengths differ, both profiles are cut to the
        shorter length.
    pad : forwarded to ``replication_data``.
    return_mask : append ``[mask, truncation_length]`` to the result
        (``truncation_length`` is None when nothing was truncated).
    masking : falsy to disable; otherwise a length (the original comment
        says kb) converted to ``int(masking / res)`` bins that are excluded
        at both ends and, with ``propagateNan``, handed to
        ``propagate_n_false``.
    propagateNan : whether ``propagate_n_false`` is applied to the mask.

    Returns
    -------
    list ``[pearsonr_result, rmse]`` computed on the unmasked bins (or
    ``[None, None]`` when ``simu`` is None), followed by the optional items
    described above.
    """
    _, exp_signal = replication_data(cell, signal, chromosome=ch,
                                     start=start, end=end,
                                     resolution=res, raw=False, pad=pad)

    # The original had a `simu is None` branch but crashed before reaching
    # it (len / isnan on None); every use of `simu` is now guarded.
    has_simu = simu is not None

    print(len(exp_signal), len(simu) if has_simu else None)
    exp_signal *= rescale

    trunc_len = None
    if trunc and has_simu and len(simu) != len(exp_signal):
        print("Truncating", len(simu), len(exp_signal))
        trunc_len = min(len(simu), len(exp_signal))
        simu = simu[:trunc_len]
        exp_signal = exp_signal[:trunc_len]

    # True where the experimental signal is usable.
    mask_exp = ~np.isnan(exp_signal)

    if masking:
        exclude = int(masking / res)
        if propagateNan:
            mask_exp = propagate_n_false(mask_exp, exclude)
        # `mask[-0:]` is the whole array, so a zero-width margin (e.g. the
        # default masking=True with res > 1) must not touch the mask.
        if exclude > 0:
            mask_exp[:exclude] = False
            mask_exp[-exclude:] = False

    if has_simu:
        # Bins where the simulation itself is undefined are ignored too.
        mask_exp[np.isnan(simu)] = False

    if smoothf is not None:
        exp_signal = nan_polate(exp_signal)
        exp_signal = smooth(exp_signal, smoothf)

    if has_simu:
        simu_kept = simu[mask_exp]
        exp_kept = exp_signal[mask_exp]
        ret = [stats.pearsonr(simu_kept, exp_kept),
               np.mean((simu_kept - exp_kept) ** 2) ** 0.5]
    else:
        ret = [None, None]

    if return_exp:
        ret.append(exp_signal)
    if return_mask:
        ret.append([mask_exp, trunc_len])
    return ret
170805979, 159345973, 145138636, 138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, 83257441, 80373285, 58617616, 64444167, 46709983, 50818468] data = [] X = [] for ch in range(1,len(chroms)+1): if type(chroms) == list: end = chroms[ch-1] end=int(end / 1000) else: end = None print(ch,end) x, y = replication_data("hela", args.file, filename=args.file, chromosome=ch, start=0, end=end, resolution=resolution) if to1: y = nan_polate(y) data.append(y) X = [["chr%i" % i] * len(d) for i, d in enumerate(data, 1)] Pos = [range(0, len(d) * resolution * 1000, resolution * 1000) for i, d in enumerate(data, 1)] X = np.concatenate(X).tolist() Pos = np.concatenate(Pos).tolist() data = np.concatenate(data, axis=0) pd.DataFrame({"chrom":X, "chromStart":np.array(Pos),"chromEnd":np.array(Pos) ,"signalValue":data}).to_csv(args.output,sep="\t",index=False)
gsmooth=args.gsmooth) f = resolution // resolution_polarity #ext = mapboth(d3p0, d3p, f) #d3p[ext == 0] = 0 for i in range(len(d3p0)): d3p0[i] = sum(d3p[i * f:min(i * f + 1, len(d3p))]) d3p = d3p0 if args.correct: x, DNaseI = replication_data(cell, "DNaseI", chromosome=ch, start=start, end=end, resolution=resolution, raw=False) x, CNV = replication_data(cell, "CNV", chromosome=ch, start=start, end=end, resolution=resolution, raw=False) CNV[CNV == 0] = 2 DNaseI[np.isnan(DNaseI)] = 0 DNaseI /= CNV DNaseIsm = smooth(DNaseI, 100)
if args.signal == "peak": x, d3p = detect_peaks(start, end, ch, resolution_polarity=resolution_polarity, exp_factor=exp_factor, percentile=percentile, cell=cell, nanpolate=True) if args.correct: x, DNaseI = replication_data(cell, "DNaseI", chromosome=ch, start=start, end=end, resolution=resolution, raw=False) x, CNV = replication_data(cell, "CNV", chromosome=ch, start=start, end=end, resolution=resolution, raw=False) CNV[CNV == 0] = 2 DNaseI[np.isnan(DNaseI)] = 0 DNaseI /= CNV DNaseIsm = smooth(DNaseI, 100)