def getIntSig(f, records, minPts, discut):
    """
    Estimate significance statistics for candidate interactions.

    @param f: input handed to getGenomeCoverage to build the coverage model;
              also used in progress messages
    @param records: iterable of indexable records; r[0] and r[3] are used as
                    labels, r[1], r[2] and r[4], r[5] as the numeric bounds of
                    the two anchors (presumably chromosomes and coordinates
                    -- confirm with the caller)
    @param minPts: minimum value of rab (as returned by getPETsforRegions)
                   required to keep a candidate
    @param discut: distance cutoff determined for self-ligation PETs;
                   candidates whose anchor centres are closer than this are
                   dropped
    @return: pandas DataFrame with one row per kept candidate, including
             corrected p-value columns from getBonPvalues, or None when no
             PETs were loaded or no candidate survives the filters
    """
    # Single-argument print(...) calls behave identically under Python 2 and
    # Python 3, unlike the print statement.
    print("Starting estimate significance for interactions in %s" % f)
    model, N = getGenomeCoverage(f, discut)
    print("Genomic coverage model built from %s" % f)
    if N == 0:
        print("No cis-PETs parsed as requiring distance cutoff >%s from %s" %
              (discut, f))
        return None
    ds = {}
    # i counts candidates that passed both filters; it is part of the key, so
    # it must only advance after the filters.
    i = 0
    for r in records:
        chrom = r[0]
        key = "%s-%s-%s" % (r[0], r[3], i)
        # widen both anchors by one unit on each side
        iva = [r[1] - 1, r[2] + 1]
        ivb = [r[4] - 1, r[5] + 1]
        # filter loops: distance between the two anchor centres
        distance = abs(sum(ivb) / 2.0 - sum(iva) / 2.0)
        if distance < discut:
            continue
        ra, rb, rab = getPETsforRegions(iva, ivb, model)
        # filter clusters that contain many self-ligation PETs within the
        # distance cutoff
        if rab < minPts:
            continue
        i += 1
        if i % 100 == 0:
            cFlush("%s interaction p-values estimated for %s" % (i, f))
        ra, rb, rab, es, fdr, hyp, chyp, pop, nbp = getMultiplePsFdr(
            iva, ivb, model, N)
        # NOTE(review): the original author flagged this for further work --
        # for ideal, noise-free data the enrichment score would be infinite.
        # The string "None" appears to be the helper's "no score" marker;
        # confirm against getMultiplePsFdr.
        if es == "None":
            continue
        ds[key] = {
            "distance": distance,
            "ra": ra,
            "rb": rb,
            "rab": rab,
            "ES": es,
            "FDR": fdr,
            "hypergeometric_p-value": hyp,
            "hypergeometric_local_FDR": chyp,
            "poisson_p-value": pop,
            "binomal_p-value": nbp,
            "iva": "%s:%s-%s" % (chrom, iva[0], iva[1]),
            "ivb": "%s:%s-%s" % (chrom, ivb[0], ivb[1])
        }
    # memory usage: drop the coverage model before building the DataFrame
    del model
    gc.collect()
    # terminate the progress line written through cFlush
    print("")
    if not ds:
        return None
    ds = pd.DataFrame(ds).T
    ds["poisson_p-value_corrected"] = getBonPvalues(ds["poisson_p-value"])
    ds["binomal_p-value_corrected"] = getBonPvalues(ds["binomal_p-value"])
    ds["hypergeometric_p-value_corrected"] = getBonPvalues(
        ds["hypergeometric_p-value"])
    return ds
def parseRawBedpe2(fs, fout, cs, cut, logger):
    """
    Get the cis-PETs, organized by chromosomes. Input could be mixed PETs in
    bedpe.gz or bedpe. Read ids are replaced by running numbers to minimize
    memory usage.

    One tab-separated text file per chromosome is written into fout, named
    "<chromA>-<chromB>.txt", with the columns: running id, pet.cA, pet.cB.

    @param fs: bedpe files of replicates, could be .bedpe or .bedpe.gz
    @param fout: output prefix, the name for directory
    @param cs: chroms that wanted, list like ["chr1","chr2"]; an empty list
               keeps every chromosome
    @param cut: distance cutoff; when > 0, PETs with pet.distance below it are
                dropped
    @param logger: logger whose .info() receives the progress messages
    @return: list of the per-chromosome file paths, in the order the
             chromosomes were first seen
    """
    # chroms data: chrom -> {"f": open output handle, "c": PETs written so far}
    chroms = {}
    # cis files
    cfs = []
    # i counts every input line, j counts the cis PETs actually written
    i, j = 0, 0
    try:
        for f in fs:
            r = "Parsing PETs from %s, requiring initial distance cutoff > %s" % (
                f, cut)
            logger.info(r)
            # Open gzipped input in text mode so lines are str, like the
            # plain-text branch; binary mode yields bytes on Python 3 and the
            # str-based split below would raise TypeError.
            if f.endswith(".gz"):
                source = gzip.open(f, "rt")
            else:
                source = open(f)
            with source as of:
                for line in of:
                    i += 1
                    if i % 100000 == 0:
                        cFlush("%s PETs processed from %s" % (i, f))
                    line = line.split("\n")[0].split("\t")
                    # skip records that carry both a "*" and a "-1" field
                    if "*" in line and "-1" in line:
                        continue
                    if len(line) < 6:
                        continue
                    # Best-effort parsing: a record PET cannot parse is
                    # skipped, but KeyboardInterrupt/SystemExit propagate.
                    try:
                        pet = PET(line)
                    except Exception:
                        continue
                    # cis reads only
                    if pet.chromA != pet.chromB:
                        continue
                    # filtering unwanted PETs in chroms (chromA == chromB
                    # here, so testing one side is enough)
                    if len(cs) > 0 and pet.chromA not in cs:
                        continue
                    # filtering too close PETs
                    if cut > 0 and pet.distance < cut:
                        continue
                    if pet.chromA not in chroms:
                        cf = os.path.join(
                            fout,
                            "%s-%s" % (pet.chromA, pet.chromB) + ".txt")
                        chroms[pet.chromA] = {"f": open(cf, "w"), "c": 0}
                        cfs.append(cf)
                    rec = chroms[pet.chromA]
                    nline = [rec["c"], pet.cA, pet.cB]
                    rec["f"].write("\t".join(map(str, nline)) + "\n")
                    rec["c"] += 1
                    j += 1
    finally:
        # Close (and thereby flush) every per-chromosome file explicitly
        # instead of relying on garbage collection.
        for rec in chroms.values():
            rec["f"].close()
    # terminate the progress line written through cFlush
    print()
    r = "Totaly %s PETs from %s, in which %s cis PETs" % (i, ",".join(fs), j)
    logger.info(r)
    return cfs
def estSigOneChr(rs, jdf, pre, dis=0, win=5):
    """
    Estimate the significance of the loops in one chromosome.

    @param rs: dict mapping a loop key to an indexable record; r[0] is used
               as the chromosome label, r[1], r[2] and r[4], r[5] as the
               bounds of the two anchors
    @param jdf: input passed to getGenomeCoverage to build the coverage model
    @param pre: label used in the progress messages
    @param dis: distance cutoff forwarded to getGenomeCoverage
    @param win: window setting forwarded to getNearbyPairRegions
    @return: pandas DataFrame with one row per loop key, or None when rs
             yields no loops
    """
    logger.info("Building genomic coverage model for %s" % jdf)
    # only the model is needed here; the second returned value is unused
    model, _ = getGenomeCoverage(jdf, dis)
    results = {}
    for count, (key, rec) in enumerate(rs.items(), 1):
        if count % 100 == 0:
            cFlush("Estimating %s loops for %s" % (count, pre))
        chrom = rec[0]
        iva = [rec[1], rec[2]]
        ivb = [rec[4], rec[5]]
        ra, rb, rab = getPETsforRegions(iva, ivb, model)
        # background level taken from the regions near the two anchors
        ivas, ivbs = getNearbyPairRegions(iva, ivb, win=win)
        mrab = getPermutatedBg(ivas, ivbs, model)
        # enrichment score falls back to 100 when the background is not
        # positive
        es = rab / mrab if mrab > 0 else 100
        # floor the p-value at 1e-300
        pop = max(1e-300, poisson.sf(rab - 1.0, mrab))
        results[key] = {
            "iva": "%s:%s-%s" % (chrom, iva[0], iva[1]),
            "ivb": "%s:%s-%s" % (chrom, ivb[0], ivb[1]),
            "ra": ra,
            "rb": rb,
            "rab": rab,
            "ES": es,
            "poisson_p-value": pop,
        }
    if not results:
        return None
    return pd.DataFrame(results).T