Exemple #1
0
def _strand_direction(bounds, values, start, end, resolution, min_expre):
    """Return a per-bin 0/1 array flagging bins covered by expressed genes.

    ``bounds`` and ``values`` are the flat arrays built by
    ``get_expression``: every gene contributes three consecutive entries
    (start, end, end + 1 / value, value, NaN), hence the stride-3 slicing.
    """
    # One entry per bin of the [start, end) window, all zeros to begin with.
    covered = np.arange(start, end, resolution) * 0
    # Origin of the window expressed in bins.
    first_bin = start / resolution
    for g_start, g_end, value in zip(bounds[::3], bounds[1::3], values[::3]):
        if value > min_expre:
            lo = int(round(g_start - first_bin))
            hi = int(round(g_end - first_bin))
            covered[lo:hi] = 1
    return covered


def get_expression(cell, ch, start, end, resolution, min_expre=0):
    """Load expressed genes and build per-strand step curves and a direction track.

    Calls ``replication_data(cell, "ExpGenes", ..., raw=True)`` and reads its
    ``chromStart``, ``chromEnd``, ``signalValue`` and ``strand`` columns.

    NOTE(review): the index arithmetic and the ``* resolution`` applied on
    return imply that ``chromStart``/``chromEnd`` are already expressed in
    bins of ``resolution`` -- confirm against ``replication_data``.

    Args:
        cell: Cell identifier forwarded to ``replication_data``.
        ch: Chromosome forwarded to ``replication_data``.
        start: Start of the window.
        end: End of the window.
        resolution: Bin size of the window.
        min_expre: A gene marks its bins only if its signal value is
            strictly greater than this threshold.

    Returns:
        A 5-tuple ``(X, Y, Xm, Ym, direction)``:
        ``X``/``Y`` are the step-curve coordinates and values of "+" strand
        genes (three points per gene, the third value being NaN so that
        consecutive genes are not joined); ``Xm``/``Ym`` are the same for all
        other strands; ``direction`` is the per-bin difference between the
        "+" mask and the other-strand mask (values in {-1, 0, 1}).
        ``X`` and ``Xm`` are multiplied by ``resolution``.
    """
    genes = replication_data(cell,
                             "ExpGenes",
                             chromosome=ch,
                             start=start,
                             end=end,
                             resolution=resolution,
                             raw=True)

    plus_x, plus_y = [], []
    minus_x, minus_y = [], []
    for g_start, g_end, value, strand in zip(genes["chromStart"],
                                             genes["chromEnd"],
                                             genes["signalValue"],
                                             genes["strand"]):
        # Anything that is not "+" is accumulated with the minus strand.
        if strand == "+":
            xs, ys = plus_x, plus_y
        else:
            xs, ys = minus_x, minus_y
        # The trailing NaN breaks the curve between consecutive genes.
        xs.extend([g_start, g_end, g_end + 1])
        ys.extend([value, value, np.nan])

    # Diagnostic output: mean and standard deviation of the "+" strand values.
    print(np.nanmean(plus_y), np.nanstd(plus_y))

    Y = np.array(plus_y)
    X = np.array(plus_x)
    Ym = np.array(minus_y)
    Xm = np.array(minus_x)

    directionp = _strand_direction(X, Y, start, end, resolution, min_expre)
    directionm = _strand_direction(Xm, Ym, start, end, resolution, min_expre)

    return X * resolution, Y, Xm * resolution, Ym, directionp - directionm
Exemple #2
0
# Command-line options (`parser` is defined outside this excerpt).
args = parser.parse_args()

# Lengths of chromosomes 1..22, in chromosome order.
# NOTE(review): units are presumably base pairs -- confirm.
chromlength = [
    248956422, 242193529, 198295559, 190214555,
    181538259, 170805979, 159345973, 145138636,
    138394717, 133797422, 135086622, 133275309,
    114364328, 107043718, 101991189, 90338345,
    83257441, 80373285, 58617616, 64444167,
    46709983, 50818468,
]

# data[k] receives the profile of chromosome k + 1 averaged over all files.
data = []
for ch, l in enumerate(chromlength, 1):
    # One profile per input file for the current chromosome.
    Y = []
    for file in args.files:
        x, y = replication_data(
            "hela",
            file,
            filename=file,
            chromosome=ch,
            start=0,
            end=None,
            resolution=5,
        )
        if args.remove is not None:
            # Points above the threshold are reported, then blanked to NaN.
            outliers = y > args.remove
            print(file, "removing %i points" % np.sum(outliers))
            y[outliers] = np.nan
        Y.append(y)

    # NaN-aware mean across files, position by position.
    data.append(np.nanmean(Y, axis=0))

# One "chrN" label per data point, grouped by chromosome.
X = [["chr%i" % (k + 1)] * len(profile) for k, profile in enumerate(data)]
                                                             start,
                                                             end,
                                                             resolution,
                                                             min_expre=1)
                d3p = direction
                x = Xg
                d3p = Yg
                #ymg[ymg<1] = np.nan
                sup_sig = [xmg, -ymg, "neg"]
                #d3p[np.abs(d3p)<1]=np.nan
            elif "[" not in signal and "--" not in signal:
                print("H")
                x, d3p = replication_data(cell,
                                          signal,
                                          chromosome=ch,
                                          start=start,
                                          end=end,
                                          resolution=mini(resolution, signal),
                                          raw=False,
                                          filename=None)
            elif "--" in signal or ":" in signal:
                weights_list = []
                if "--" in signal:
                    signal, sigv = signal.split("--")

                    if ":" in sigv:
                        sigv, *weights_list = sigv.split(":")
                    x, d3p = replication_data(cell,
                                              signal,
                                              chromosome=ch,
                                              start=start,
                                              end=end,
Exemple #4
0
    MRTstd = score["MRTstd"][0]
    RFDp = float(score["RFDp"][0].split(",")[0][1:])
    RFDstd = score["RFDstd"][0]
    RepTime = score["RepTime"][0]
    #scorev = 2-c1-c2

    return MRTp, MRTstd, RFDp, RFDstd, RepTime


# For every mark: load its signal over the region, skip marks without data,
# then walk a grid of parameter values.
# NOTE(review): `marks`, `cell`, `ch`, `start` and `end` are defined outside
# this excerpt, and the innermost loop body is cut off after `mark0`.
for mark in marks:

    # NOTE(review): the meaning of oData / bp / bpc is defined by
    # replication_data, which is not visible here.
    x, d = replication_data(cell,
                            mark,
                            chromosome=ch,
                            start=start,
                            end=end,
                            resolution=5,
                            raw=False,
                            oData=False,
                            bp=True,
                            bpc=False)
    print(mark, d)
    # NOTE(review): `d == []` assumes d is a plain list; if replication_data
    # returns a NumPy array this comparison is element-wise -- confirm.
    if d == []:
        print("Skipping %s" % mark)
        continue
    # Parameter grid: one kon value, seven ndiff values, four
    # random_activation values and three dori values.
    for kon in [5e-7]:
        for ndiff in [30, 45, 60, 75, 90, 105, 120]:
            #ndiff = 60
            for random_activation in [0, 0.05, 0.1, 0.2]:
                for dori in [5, 15, 30]:
                    # Marks whose name contains "/" get the label "Epi_Bigger".
                    if "/" in mark:
                        mark0 = "Epi_Bigger"
Exemple #5
0
def detect_peaks(start,
                 end,
                 ch,
                 resolution_polarity=5,
                 exp_factor=4,
                 percentile=85,
                 cell="K562",
                 cellMRT=None,
                 cellRFD=None,
                 nanpolate=False,
                 fsmooth=None,
                 gsmooth=5,
                 recomp=False,
                 dec=None,
                 fich_name=None,
                 sim=True,
                 expRFD="OKSeq",
                 rfd_only=False):
    """Build a peak signal from the positive jumps of a smoothed polarity track.

    The polarity track (and, unless ``rfd_only``, an MRT track) is either
    fetched through ``replication_data`` or read from the CSV ``fich_name``.
    The polarity is smoothed, differentiated between consecutive bins,
    shifted so that its minimum is zero, thresholded at a percentile and,
    unless ``rfd_only``, weighted by ``exp(-exp_factor * mrt)``.

    Args:
        start, end, ch: Region forwarded to ``replication_data``.
        resolution_polarity: Resolution requested for the polarity track;
            the polarity values are also divided by it.
        exp_factor: Decay factor of the MRT weighting.
        percentile: Percentile of the finite deltas used as threshold;
            deltas below it are set to 0.
        cell: Default cell for both tracks.
        cellMRT: Cell for the MRT track (defaults to ``cell``).
        cellRFD: Cell for the polarity track (defaults to ``cell``).
        nanpolate: If true, pass the polarity through ``nan_polate``.
        fsmooth: If not None (and ``nanpolate`` is true), the polarity is
            additionally passed to ``smooth`` with this argument.
        gsmooth: Argument given to ``sm`` during the MRT-dependent smoothing.
        recomp: If true, recompute the deltas from ``smooth(pol_exp, 2)``
            and keep them only where the thresholded delta is positive.
        dec: Only ``None`` and ``2`` are supported. With ``2``, deltas at
            positions ``i`` and ``i + 1`` are zeroed whenever
            ``pol_exp[i] + 0.05 > pol_exp[i + 2]``.
        fich_name: Optional CSV path; when given, the columns ``chromStart``
            and ``RFDs``/``MRTs`` (``sim`` true) or ``RFDe``/``MRTe``
            (``sim`` false) are used instead of ``replication_data``.
        sim: Selects which CSV columns are read (see ``fich_name``).
        expRFD: Name of the polarity signal requested from
            ``replication_data``.
        rfd_only: If true, no MRT track is loaded or used.

    Returns:
        ``(x_pol, peaks)`` where ``x_pol`` is the position vector of the
        polarity track and ``peaks`` is the delta array with a leading 0
        prepended.

    Raises:
        NotImplementedError: If ``dec`` is neither ``None`` nor ``2``.
            (``NotImplementedError`` derives from ``RuntimeError``, which is
            what the former bare ``raise`` produced.)
    """
    rpol = resolution_polarity

    if fich_name is None:
        if cellMRT is None:
            cellMRT = cell
        if cellRFD is None:
            cellRFD = cell
        print(start, end, cellRFD, ch, rpol)
        x_pol, pol_exp = replication_data(cellRFD,
                                          expRFD,
                                          chromosome=ch,
                                          start=start,
                                          end=end,
                                          resolution=rpol,
                                          raw=False,
                                          pad=True)
        # Resolution used for the MRT track depends on the cell name.
        resolution = 1 if "Yeast" in cellMRT else 10

        if not rfd_only:
            _, mrt_exp = replication_data(cellMRT,
                                          "MRT",
                                          chromosome=ch,
                                          start=start,
                                          end=end,
                                          resolution=resolution,
                                          raw=False)

        if nanpolate:
            pol_exp = nan_polate(pol_exp)

            # The optional smoothing only happens together with nanpolate.
            if fsmooth is not None:
                pol_exp = smooth(pol_exp, fsmooth)

        ratio_res = resolution // rpol

        pol_exp /= rpol

        Smpol = np.copy(pol_exp)

        if not rfd_only:
            # MRT mapped onto the polarity track (see mapboth).
            nmrt = mapboth(mrt_exp, pol_exp, ratio_res, pad=True)
    else:
        strain = pd.read_csv(fich_name, sep=",")
        x_pol = strain.chromStart
        if sim:
            pol_exp = strain.RFDs
            mrt_exp = strain.MRTs
        else:
            pol_exp = strain.RFDe
            mrt_exp = strain.MRTe
        nmrt = mrt_exp
        Smpol = np.copy(pol_exp)
        ratio_res = 1

    if not rfd_only:
        # From the highest MRT threshold down to the lowest, replace the bins
        # whose MRT exceeds the threshold by their smoothed value; bins above
        # several thresholds are therefore smoothed repeatedly.
        for mrt_threshold in [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8][::-1]:
            late = nmrt > mrt_threshold
            Smpol[late] = np.array(sm(Smpol, gsmooth))[late]

        Smpol = sm(Smpol, 3)
    else:
        Smpol = sm(Smpol, 10)

    # Bin-to-bin increments, shifted so that the smallest one is zero.
    delta = Smpol[1:] - Smpol[:-1]
    delta -= np.nanmin(delta)

    threshold = np.percentile(delta[~np.isnan(delta)], percentile)
    print("Threshold value", threshold)
    delta[delta < threshold] = 0.0

    if recomp:
        pol_exp = smooth(pol_exp, 2)
        deltap = pol_exp[1:] - pol_exp[:-1]
        # NOTE(review): this subtracts the minimum of the thresholded
        # `delta`, not of `deltap` -- confirm that this is intended.
        deltap -= np.nanmin(delta)
        deltap[delta <= 0] = 0
        delta = deltap
        delta[delta < 0] = 0

    if dec is not None:
        if dec != 2:
            raise NotImplementedError(
                "detect_peaks only supports dec=None or dec=2, got %r" %
                (dec, ))
        for i, (ok0, ok2) in enumerate(zip(pol_exp, pol_exp[2:])):
            if ok0 + 0.05 > ok2:
                delta[i] = 0  # shifted from one on purpose
                delta[i + 1] = 0

    if not rfd_only:
        # Weight the deltas by exp(-exp_factor * MRT).
        delta *= mapboth(np.exp(-exp_factor * mrt_exp),
                         delta,
                         ratio_res,
                         pad=True)

    delta[np.isnan(delta)] = 0

    # The leading 0 restores the length lost by the differencing.
    return x_pol, np.concatenate(([0], delta))
Exemple #6
0
def compare(simu,
            signal,
            cell,
            res,
            ch,
            start,
            end,
            trim=0.05,
            return_exp=False,
            rescale=1,
            nanpolate=False,
            smoothf=None,
            trunc=False,
            pad=False,
            return_mask=False,
            masking=True,
            propagateNan=True):
    """Compare a simulated profile with the experimental ``signal`` track.

    The experimental track is fetched with ``replication_data``, rescaled,
    optionally truncated to the length of ``simu`` and masked; the Pearson
    correlation and the root-mean-square error are computed on the unmasked
    positions.

    Args:
        simu: Simulated profile, or ``None`` to only retrieve the
            experimental signal and/or mask.
        signal, cell, ch, start, end: Forwarded to ``replication_data``.
        res: Resolution forwarded to ``replication_data``; also converts
            ``masking`` into a number of bins.
        trim: Unused by this function (kept for interface compatibility).
        return_exp: Append the (possibly smoothed) experimental signal to
            the result.
        rescale: Factor applied in place to the experimental signal.
        nanpolate: Unused by this function (kept for interface
            compatibility).
        smoothf: If not None, the experimental signal is passed through
            ``nan_polate`` and then ``smooth(..., smoothf)`` before the
            comparison.
        trunc: If true and the lengths differ, both profiles are cut to the
            shorter length.
        pad: Forwarded to ``replication_data``.
        return_mask: Append ``[mask, l]`` to the result, ``l`` being the
            truncation length or ``None``.
        masking: Falsy to disable edge masking; otherwise a width that is
            divided by ``res`` to obtain the number of bins excluded at both
            ends (and given to ``propagate_n_false``).
        propagateNan: If true, the NaN mask is widened with
            ``propagate_n_false``.

    Returns:
        A list ``[pearson, rmse]`` (both ``None`` when ``simu`` is ``None``),
        followed by the experimental signal if ``return_exp`` and by
        ``[mask, l]`` if ``return_mask``.
    """
    x, exp_signal = replication_data(cell,
                                     signal,
                                     chromosome=ch,
                                     start=start,
                                     end=end,
                                     resolution=res,
                                     raw=False,
                                     pad=pad)

    has_simu = simu is not None

    print(len(exp_signal), len(simu) if has_simu else None)
    exp_signal *= rescale

    l = None
    if has_simu and trunc and len(simu) != len(exp_signal):
        print("Truncating", len(simu), len(exp_signal))
        l = min(len(simu), len(exp_signal))
        simu = simu[:l]
        exp_signal = exp_signal[:l]

    # True where the experimental signal is usable.
    mask_exp = ~np.isnan(np.asarray(exp_signal, dtype=float))
    if masking:
        exclude = int(masking / res)

        if propagateNan:
            mask_exp = propagate_n_false(mask_exp, exclude)

        # Guard against exclude == 0: `mask_exp[-0:]` is the whole array, so
        # an unguarded assignment would discard every point.
        if exclude > 0:
            mask_exp[:exclude] = False
            mask_exp[-exclude:] = False

    if has_simu:
        # Positions where the simulation itself is undefined are dropped too.
        mask_exp[np.isnan(simu)] = False

    if smoothf is not None:
        exp_signal = nan_polate(exp_signal)
        exp_signal = smooth(exp_signal, smoothf)

    if has_simu:
        kept_simu = simu[mask_exp]
        kept_exp = exp_signal[mask_exp]
        ret = [
            stats.pearsonr(kept_simu, kept_exp),
            np.mean((kept_simu - kept_exp)**2)**0.5
        ]
    else:
        ret = [None, None]
    if return_exp:
        ret.append(exp_signal)
    if return_mask:
        ret.append([mask_exp, l])
    return ret
               170805979, 159345973, 145138636, 138394717,
               133797422, 135086622, 133275309, 114364328, 107043718,
               101991189, 90338345, 83257441,
               80373285, 58617616, 64444167, 46709983, 50818468]

# Load one track per chromosome, then write all of them as a single
# tab-separated table.
data = []
X = []
for ch in range(1, len(chroms) + 1):

    # A list of chromosome lengths gives an explicit end (length / 1000);
    # any other kind of `chroms` leaves the end unspecified.
    end = int(chroms[ch - 1] / 1000) if type(chroms) is list else None
    print(ch, end)
    x, y = replication_data(
        "hela",
        args.file,
        filename=args.file,
        chromosome=ch,
        start=0,
        end=end,
        resolution=resolution,
    )

    if to1:
        y = nan_polate(y)
    data.append(y)

# Distance between consecutive positions in the output table.
step = resolution * 1000

# Flat per-point chromosome labels and positions, in chromosome order.
X = np.concatenate(
    [["chr%i" % i] * len(d) for i, d in enumerate(data, 1)]).tolist()
Pos = np.concatenate([range(0, len(d) * step, step) for d in data]).tolist()

data = np.concatenate(data, axis=0)

# NOTE(review): chromEnd is written with the same values as chromStart --
# confirm that consumers of the file expect this.
table = pd.DataFrame({
    "chrom": X,
    "chromStart": np.array(Pos),
    "chromEnd": np.array(Pos),
    "signalValue": data,
})
table.to_csv(args.output, sep="\t", index=False)
Exemple #8
0
                                   gsmooth=args.gsmooth)

            # Number of polarity bins per bin at `resolution`.
            f = resolution // resolution_polarity
            #ext = mapboth(d3p0, d3p, f)
            #d3p[ext == 0] = 0

            # Fill d3p0 from d3p.
            # NOTE(review): the slice runs from i * f to i * f + 1, i.e. it
            # holds a single element, so d3p0[i] receives d3p[i * f] (or 0
            # past the end) rather than a sum over f fine bins. If an
            # aggregation over the f bins is intended, the upper bound would
            # be (i + 1) * f -- confirm.
            for i in range(len(d3p0)):
                d3p0[i] = sum(d3p[i * f:min(i * f + 1, len(d3p))])

            d3p = d3p0

        # Optional correction tracks: DNaseI divided by copy number.
        if args.correct:
            x, DNaseI = replication_data(cell,
                                         "DNaseI",
                                         chromosome=ch,
                                         start=start,
                                         end=end,
                                         resolution=resolution,
                                         raw=False)
            x, CNV = replication_data(cell,
                                      "CNV",
                                      chromosome=ch,
                                      start=start,
                                      end=end,
                                      resolution=resolution,
                                      raw=False)
            # Zero copy-number entries are replaced by 2 before dividing
            # (this also avoids a division by zero).
            CNV[CNV == 0] = 2
            # Missing DNaseI values count as 0.
            DNaseI[np.isnan(DNaseI)] = 0
            DNaseI /= CNV

            DNaseIsm = smooth(DNaseI, 100)
# "peak" signal: derive the peaks from the polarity data, then optionally
# prepare a copy-number-normalised DNaseI track.
if args.signal == "peak":
    x, d3p = detect_peaks(
        start,
        end,
        ch,
        resolution_polarity=resolution_polarity,
        exp_factor=exp_factor,
        percentile=percentile,
        cell=cell,
        nanpolate=True,
    )

    if args.correct:
        # Both tracks are requested over the same region with the same
        # settings.
        _track_kwargs = dict(chromosome=ch,
                             start=start,
                             end=end,
                             resolution=resolution,
                             raw=False)
        x, DNaseI = replication_data(cell, "DNaseI", **_track_kwargs)
        x, CNV = replication_data(cell, "CNV", **_track_kwargs)

        # Zero copy-number entries are replaced by 2 before the division.
        CNV[CNV == 0] = 2
        # Missing DNaseI values count as 0.
        DNaseI[np.isnan(DNaseI)] = 0
        DNaseI /= CNV

        DNaseIsm = smooth(DNaseI, 100)