Example #1
0
def getIntSig(f, records, minPts, discut):
    """
    Estimate significance statistics for candidate interactions of one file.

    @param f: input handed to getGenomeCoverage to build the coverage model;
              also shown in the progress messages.
    @param records: iterable of indexable records. r[0] is the chromosome,
              r[1]/r[2] bound the first anchor, r[4]/r[5] bound the second
              anchor and r[3] becomes part of the result key.
    @param minPts: records whose rab value (third value returned by
              getPETsforRegions) is below this threshold are skipped.
    @param discut: distance cutoff determined for self-ligation pets; records
              whose anchor centers are closer than this are skipped.
    @return: pandas DataFrame with one row per retained record (statistics
              plus Bonferroni-style corrected p-value columns produced by
              getBonPvalues), or None when nothing could be estimated.
    """
    # Single-argument parenthesised print behaves identically on Python 2
    # and Python 3, so the function is importable under both.
    print("Starting estimate significance for interactions in %s" % f)
    model, N = getGenomeCoverage(f, discut)
    print("Genomic coverage model built from %s" % f)
    if N == 0:
        print("No cis-PETs parsed as requiring distance cutoff >%s from %s" %
              (discut, f))
        return None
    ds = {}
    # i counts the records that passed both filters; it is part of the key,
    # so the key is built before the increment exactly as before.
    i = 0
    for r in records:
        chrom = r[0]
        key = "%s-%s-%s" % (r[0], r[3], i)
        # anchors are widened by one position on each side
        iva = [r[1] - 1, r[2] + 1]
        ivb = [r[4] - 1, r[5] + 1]
        #filter loops: distance between the two anchor centers
        distance = abs(sum(ivb) / 2.0 - sum(iva) / 2.0)
        if distance < discut:
            continue
        _, _, rab = getPETsforRegions(iva, ivb, model)
        #filter clusters contain many self-ligation PETs within distance cutoff
        if rab < minPts:
            continue
        i += 1
        if i % 100 == 0:
            cFlush("%s interaction p-values estimated for %s" % (i, f))
        ra, rb, rab, es, fdr, hyp, chyp, pop, nbp = getMultiplePsFdr(
            iva, ivb, model, N)
        #this part should be further modified: for ideal data there is no
        #noise, so the ES should be inf, which cannot be represented
        # NOTE(review): the string "None" is presumably the sentinel
        # getMultiplePsFdr returns in that case -- confirm against the helper.
        if es == "None":
            continue
        ds[key] = {
            "distance": distance,
            "ra": ra,
            "rb": rb,
            "rab": rab,
            "ES": es,
            "FDR": fdr,
            "hypergeometric_p-value": hyp,
            "hypergeometric_local_FDR": chyp,
            "poisson_p-value": pop,
            "binomal_p-value": nbp,
            "iva": "%s:%s-%s" % (chrom, iva[0], iva[1]),
            "ivb": "%s:%s-%s" % (chrom, ivb[0], ivb[1])
        }
    #memory usage: drop the model before building the DataFrame
    del model
    gc.collect()
    # emit a newline (same output as the former bare Python 2 `print`)
    print("")
    if not ds:
        return None
    # inner dicts become columns, so transpose to get one row per interaction
    ds = pd.DataFrame(ds).T
    ds["poisson_p-value_corrected"] = getBonPvalues(ds["poisson_p-value"])
    ds["binomal_p-value_corrected"] = getBonPvalues(ds["binomal_p-value"])
    ds["hypergeometric_p-value_corrected"] = getBonPvalues(
        ds["hypergeometric_p-value"])
    return ds
Example #2
0
def parseRawBedpe2(fs, fout, cs, cut, logger):
    """
    Get the cis-PETs, organized by chromosomes. Input could be mixed PETs in bedpe.gz or bedpe. Also change read id to numbers to minimize memory usage.

    One text file named "<chromA>-<chromB>.txt" is written into fout for every
    chromosome that has at least one retained PET; each line holds a running
    per-chromosome id, pet.cA and pet.cB separated by tabs.

    @param fs: bedpe files of replicates, could be .bedpe or .bedpe.gz
    @param fout: output prefix, the name for directory (must already exist)
    @param cs: chroms that wanted, list like ["chr1","chr2"]; an empty
               collection keeps every chromosome
    @param cut: PETs with pet.distance below this value are dropped when
               cut > 0
    @param logger: object with an info(msg) method used for progress reports
    @return: list of the paths of the per-chromosome files that were written
    """
    #chroms data: chromosome -> {"f": output handle, "c": PETs written so far}
    chroms = {}
    #cis files
    cfs = []
    # i counts every input line over all files, j the retained cis PETs
    i, j = 0, 0
    try:
        for f in fs:
            r = "Parsing PETs from %s, requiring initial distance cutoff > %s" % (
                f, cut)
            logger.info(r)
            if f.endswith(".gz"):
                # text mode so lines are str, not bytes, and can be split on
                # "\n" / "\t" below
                of = gzip.open(f, "rt")
            else:
                of = open(f)
            # close the input as soon as it is consumed, even on errors
            with of:
                for line in of:
                    i += 1
                    if i % 100000 == 0:
                        cFlush("%s PETs processed from %s" % (i, f))
                    line = line.split("\n")[0].split("\t")
                    # skip records with both a "*" and a "-1" field
                    if "*" in line and "-1" in line:
                        continue
                    if len(line) < 6:
                        continue
                    # Malformed records are skipped on purpose; Exception
                    # (not a bare except) keeps Ctrl-C / SystemExit working.
                    try:
                        pet = PET(line)
                    except Exception:
                        continue
                    #cis reads
                    if pet.chromA != pet.chromB:
                        continue
                    #filtering unwanted PETs in chroms
                    if len(cs) > 0 and (not (pet.chromA in cs
                                             and pet.chromB in cs)):
                        continue
                    #filtering too close PETs
                    if cut > 0 and pet.distance < cut:
                        continue
                    if pet.chromA not in chroms:
                        cf = os.path.join(
                            fout, "%s-%s" % (pet.chromA, pet.chromB) + ".txt")
                        chroms[pet.chromA] = {"f": open(cf, "w"), "c": 0}
                        cfs.append(cf)
                    rec = chroms[pet.chromA]
                    nline = [rec["c"], pet.cA, pet.cB]
                    rec["f"].write("\t".join(map(str, nline)) + "\n")
                    rec["c"] += 1
                    j += 1
    finally:
        # flush and close every per-chromosome output explicitly instead of
        # relying on garbage collection
        for rec in chroms.values():
            rec["f"].close()
    print()
    r = "Totaly %s PETs from %s, in which %s cis PETs" % (i, ",".join(fs), j)
    logger.info(r)
    return cfs
Example #3
0
def estSigOneChr(rs, jdf, pre, dis=0, win=5):
    """
    Score every loop of one chromosome against a permuted local background.

    @param rs: dict mapping a loop key to an indexable record; r[0] is the
               chromosome, r[1]/r[2] bound the first anchor and r[4]/r[5]
               bound the second anchor
    @param jdf: input handed to getGenomeCoverage to build the coverage model
    @param pre: label shown in the progress messages
    @param dis: distance cutoff forwarded to getGenomeCoverage
    @param win: window setting forwarded to getNearbyPairRegions
    @return: pandas DataFrame with one row per loop key (anchors, ra, rb, rab,
             ES and poisson_p-value), or None when rs is empty
    """
    logger.info("Building genomic coverage model for %s" % jdf)
    # the second return value is not needed here
    model, _ = getGenomeCoverage(jdf, dis)
    stats = {}
    for count, (key, rec) in enumerate(rs.items(), 1):
        if count % 100 == 0:
            cFlush("Estimating %s loops for %s" % (count, pre))
        chrom = rec[0]
        iva = [rec[1], rec[2]]
        ivb = [rec[4], rec[5]]
        # observed counts for the two anchors and for the pair
        ra, rb, rab = getPETsforRegions(iva, ivb, model)
        # background estimated from nearby region pairs
        nearA, nearB = getNearbyPairRegions(iva, ivb, win=win)
        background = getPermutatedBg(nearA, nearB, model)
        # enrichment score; capped at 100 when the background is empty
        es = rab / background if background > 0 else 100
        # floor the p-value so it never becomes exactly zero
        pvalue = max(1e-300, poisson.sf(rab - 1.0, background))
        stats[key] = {
            "iva": "%s:%s-%s" % (chrom, iva[0], iva[1]),
            "ivb": "%s:%s-%s" % (chrom, ivb[0], ivb[1]),
            "ra": ra,
            "rb": rb,
            "rab": rab,
            "ES": es,
            "poisson_p-value": pvalue,
        }
    if not stats:
        return None
    # inner dicts become columns, so transpose to get one row per loop
    return pd.DataFrame(stats).T