Example #1
    def test_entry_creation(self):
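        # Build four single-replicate BED files with overlapping chrX peaks, then
        # take their union; the second argument to union() is presumably the
        # minimum number of replicates that must support an interval.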
        entry1_1 = bed.BedEntry('X', 8000, 9000)
        entry1_1.signalValue = 10
        entry1_2 = bed.BedEntry('X', 80, 900)
        entry1_2.signalValue = 3
        bed1 = bed.BedFile([entry1_1, entry1_2])

        entry2_1 = bed.BedEntry('X', 8500, 9000)
        entry2_2 = bed.BedEntry('X', 80, 900)
        entry2_1.signalValue = 10
        entry2_2.signalValue = 3
        bed2 = bed.BedFile([entry2_1, entry2_2])

        entry3_1 = bed.BedEntry('X', 7500, 9000)
        entry3_2 = bed.BedEntry('X', 80, 900)
        entry3_1.signalValue = 10
        entry3_2.signalValue = 3
        bed3 = bed.BedFile([entry3_1, entry3_2])

        entry4_1 = bed.BedEntry('X', 5000, 8999)
        entry4_2 = bed.BedEntry('X', 80, 900)

        entry4_1.signalValue = 10
        entry4_2.signalValue = 3
        bed4 = bed.BedFile([entry4_1, entry4_2])

        unions = union([bed1, bed2, bed3, bed4], 2)
        for fragment in unions[0]:
            print(fragment)
Example #2
    def test_single_chrom(self):
        med1 = bed.BedFile("test/data/med/med1_peaks.broadPeak", "Peaks").getChrom("chr17")
        med2 = bed.BedFile("test/data/med/med2_peaks.broadPeak", "Peaks").getChrom("chr17")
        med3 = bed.BedFile("test/data/med/med3_peaks.broadPeak", "Peaks").getChrom("chr17")

        bedf = [med1, med2, med3]
        minentries = 2

        # First create the intersection, rank the entries in each replicate, and return the rank product values
        ranks = rankreps(bedf, minentries, rankmethod='signalValue')

        # Calculate rank product for each entry that contributes to a union entry
        # Calculate the pvalues of the rank product values
        print('Calculating rank product probabilities...')
        rpb_up = rankprodbounds(ranks[1], len(ranks[1]), len(bedf), 'geometric')
        print('Calculating binomial threshold...')
        # Calculate rpb and binomial intersection point
        Pks = thresholdCalc(rpb_up, k=len(bedf) - (minentries - 1))
        if len(Pks[2]) != 0:
            binomAlpha = round(min(Pks[2]), 3)
        else:
            print('No binomial convergence, defaulting to 0.1')
            binomAlpha = 0.1

        # Perform multiple hypothesis testing correction upon the pvals
        fdr = multipletesting.fdrcorrection(rpb_up)

        # Annotate each union entry with its score, p-value and FDR-corrected q-value
        print('Cleaning up output...')
        for i, v in enumerate(ranks[0][0]):
            p = rpb_up[i]

            if p != 0.0:
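                # Convert the p-value to a BED-style score: scaled |log2 p|, capped at 1000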
                ranks[0][0][i].addOption(name='TBD',
                                         score=min([abs(int(125 * math.log2(rpb_up[i]))), 1000]),
                                         strand='.',
                                         pValue=rpb_up[i],
                                         qValue=fdr[1][i])
            else:
                ranks[0][0][i].addOption(name='TBD',
                                         score=1000,
                                         strand='.',
                                         pValue=2.5e-20,
                                         qValue=2.5e-20)
        collapsed = bed.BedFile(ranks[0][0], 'IDR')

        connected = connect_entries(collapsed, bedf, 20, True)
        self.assertNotEqual(len(collapsed), len(connected))
Example #3
def joinChIA(chia):
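    """Flatten a list of ChIA-PET replicates into a single BEDPE BedFile."""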
    interactions = []
    for rep in chia:
        for intr in rep:
            interactions.append(intr)

    pe = bed.BedFile(interactions, 'bedpe')
    return pe
Example #4
File: chipr.py  Project: rhysnewell/ChIP-R
    def run_1(self, args):
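        # List the input peak files, load each one as a BedFile, and run the
        # rank product analysis over all of them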
        for i in args.input:
            print(str(i.name))

        print('Processing Input...')

        bedfs = [bed.BedFile(str(i.name), 'Peaks') for i in args.input]

        rankprod.performrankprod(bedfs, args.minentries, args.rankmethod,
                                 'all', args.duphandling, args.random_seed,
                                 args.alpha, args.output, args.size, False,
                                 args.fragment)
Example #5
    def test_from_bed(self):
        med1 = bed.BedFile("test/data/med/med1_peaks.broadPeak", "Peaks")
        med2 = bed.BedFile("test/data/med/med2_peaks.broadPeak", "Peaks")
        med3 = bed.BedFile("test/data/med/med3_peaks.broadPeak", "Peaks")

        bedf = [med1, med2, med3]
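        # Run the analysis twice, differing only in the 'fragment' flag and the
        # output filename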

        performrankprod(bedf, minentries=2, rankmethod="pvalue", specifyMax=None,
                        duphandling='average', random_seed=0.5,
                        alpha=0.05,
                        filename="test_fragments_true",
                        default_min_peak=20,
                        print_pvals=True,
                        fragment=True)

        performrankprod(bedf, minentries=2, rankmethod="pvalue", specifyMax=None,
                        duphandling='average', random_seed=0.5,
                        alpha=0.05,
                        filename="test_fragments_false",
                        default_min_peak=20,
                        print_pvals=True,
                        fragment=False)
Example #6
def ChIAreproducibility(chia,
                        chip,
                        minentries=None,
                        rank='signalValue',
                        threshold='all',
                        alpha=0.05,
                        filename='NA'):
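    """Check ChIA-PET interactions against reproducible ChIP peaks; with multiple
    ChIP replicates, peaks are first called with the rank product analysis and
    read back at the chosen threshold."""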
    if minentries is None:
        minentries = len(chip)
    if len(chia) == 1 and len(chip) == 1:
        return verifyChIAPeaks(chia[0],
                               chip[0],
                               filename=filename,
                               alpha=alpha)
    else:
        try:  # Check to see if multiple chiapet files are entered
            chia[0]
            inters = joinChIA(chia)
        except TypeError:
            inters = chia
        rankprod.performrankprod(chip,
                                 minentries=minentries,
                                 rankmethod=rank,
                                 alpha=alpha,
                                 filename=filename + '.bed')
        if threshold == 'binom':
            RP = bed.BedFile('T2_' + filename + '.bed', 'Peaks')

            return verifyChIAPeaks(inters, RP, filename=filename, alpha=alpha)
        elif threshold == 'alpha':
            RP = bed.BedFile('T1_' + filename + '.bed', 'Peaks')
            return verifyChIAPeaks(inters, RP, filename=filename, alpha=alpha)

        elif threshold == 'all':
            RP = bed.BedFile('ALL_' + filename + '.bed', 'Peaks')
            return verifyChIAPeaks(inters, RP, filename=filename, alpha=alpha)
Example #7
    def test_bed(self):
        bf = bed.BedFile("test/data/med/med1_peaks.broadPeak", "Peaks")
        print(bf.chroms.keys())
        g = bf.generate('chr1')
        print(next(g))
        print(next(g))
        print(next(g))
        cnt = 0
        for entry in bf:
            cnt += 1
            print(str(cnt) + '\t' + str(entry))
            if cnt == 100:
                break
        entry1 = bed.BedEntry('chrX', 3858266, 3858530)
        print(entry1 in bf)
        entry2 = bed.BedEntry('chrX', 10047550, 10067694)
        for x in bf.getOverlap(entry2):
            print(x)
        entry3 = bed.BedEntry('chr9', 102699903, 102700167)
        for x in bf.getClosest(entry3):
            print(x)
            for y in x:
                print(y)
Example #8
def verifyChIAPeaks(chiaData, chipData, filename, alpha):
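    """Score each ChIA-PET interaction by the ChIP peaks overlapping both of its
    anchors, write all interactions out as BEDPE/BED12, and keep those passing
    the FDR threshold."""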
    listinters = []
    PETs = []
    probs = []
    inters = {}
    PETgroups = {}
    prs = []
    fdr = []
    RP = []
    if getattr(chiaData, 'format') == 'BED12':
        chiaData = bed.BED12toBEDPE(chiaData)
    for link in chiaData:
        if link.PETs <= 2:
            continue
        o1 = chipData.getOverlap(link.partner1)
        o2 = chipData.getOverlap(link.partner2)
        try:
            PETgroups[link.PETs].append(link)
        except KeyError:
            PETgroups[link.PETs] = [link]

        try:
            if len(o1) > 0 and len(o2) > 0:
                for peaks1, peaks2 in zip(o1, o2):
                    # Create entries in dictionary with peaks as support
                    try:
                        inters[link].append([peaks1, peaks2])
                    except KeyError:
                        # Store the first supporting pair as a nested list so every
                        # element of inters[link] has the same shape
                        inters[link] = [[peaks1, peaks2]]
        except TypeError:
            continue
    dists = []
    for k, v in inters.items():
        pvals = []
        rs = []
        # if k.PETs <= 1:
        #     continue
        # kvals = []
        for peak in v:
            try:
                pvals.append(peak.pValue)
                rs.append(peak.signalValue)

                # combined = scipy.stats.combine_pvalues([peak1.pValue, peak2.pValue], 'fisher')
                # pvals.append(combined[1])
            except AttributeError:
                if isinstance(peak, list):
                    for p in peak:
                        try:
                            pvals.append(p.pValue)
                            rs.append(p.signalValue)
                        except AttributeError:
                            continue
                else:
                    continue
        if len(pvals) != 0:
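            # Combine the supporting peaks' p-values with Stouffer's method,
            # weighted by their signalValue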
            combined = scipy.stats.combine_pvalues(pvals, 'stouffer', rs)
            probs.append(combined[1])
            # rs = iterflatten(rs)
            # RP.append(rs)
            listinters.append(k)
            PETs.append(k.PETs)
            dists.append(k.getDistance())

    # probcor = multipletesting.fdrcorrection(probs, alpha)
    PETs, listinters, probcor, dists = (list(x) for x in zip(
        *sorted(zip(PETs, listinters, probs, dists),
                key=lambda pair: pair[0],
                reverse=True)))

    bins = np.arange(min(dists), max(dists), max(dists) / 50, dtype=int)
    brobs = []
    ints = []
    pets2 = []
    ainters = []
    tags = 0
    # for bin in range(1, 50):
    #     pets = []
    #     lints = []
    #     ps = []
    #     ds = []
    #     # rp = []
    #     for idx, j in enumerate(np.digitize(dists, bins)):
    #         if j == bin:
    #             pets.append(PETs[idx])
    #             lints.append(listinters[idx])
    #             ps.append(probcor[idx])
    #             ds.append(dists[idx])
    # rp.append(RP[idx])
    # try:
    #     pets, lints, ps, ds, rp = (list(x) for x in zip(*sorted(zip(pets, lints, ps, ds, rp),
    #                                                                  key=lambda pair: pair[0], reverse=True)))
    # except ValueError:
    #     continue
    #
    n = len(RP)
    # k = sum(pets)
    t = sum(PETs)
    # brobs = []
    tags = 0
    # RPranks = scipy.stats.rankdata(RP)
    # mRPrank = max(RPranks)
    # RPranks1 = [mRPrank+1 - x for x in RPranks]
    # probranks = scipy.stats.rankdata(probcor)
    # mprobs = max(probranks)
    # probranks1 = [mprobs+1 -x for x in probranks]
    # # distranks = scipy.stats.rankdata(dists)
    # # mdist = max(distranks)
    # # distranks1 = [mdist+1 - x for x in distranks]
    # # PETranks1 = scipy.stats.rankdata(PETs)
    # # mpet = max(PETranks1)
    # rankprods = []
    # # PETranks = [mpet + 1 - x for x in PETranks1]
    # PETdists = [x/y for x,y in zip(dists, PETs)]
    # PDranks = scipy.stats.rankdata(PETdists)
    # mPD = max(PDranks)
    # PDranks1 = [mPD+1 - x for x in PDranks]
    # for rank in range(len(RP)):
    #     print(PDranks[rank], PETdists[rank])
    #     try:
    #         rankprods.append(PDranks1[rank]*RPranks1[rank])
    #         pets2.append(PETs[rank])
    #     except IndexError:
    #         continue
    # print(rankprods)
    # rpb = rankprodbounds(rankprods, len(rankprods), 2, 'geometric')
    # for p in rpb:
    #     brobs.append(p)
    # print(rpb)
    # mRP = max(rankprods)
    # mPETs = max(PETs)
    # mdists = max(dists)
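    # Walk the interactions in descending PET order, scoring the cumulative PET
    # count out of the total with a binomial CDF whose success probability is
    # one minus the combined p-value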
    for i, p in enumerate(zip(probcor, PETs)):
        tags += p[1]
        b = scipy.stats.binom.cdf(tags, t, 1 - p[0])
        # b2 = scipy.stats.combine_pvalues([b, [i]], 'stouffer', [(mPETs-PETs[i])/mPETs, (mRP-rankprods[i])/mRP])
        print(p[0], p[1])
        brobs.append(b)
        pets2.append(p[1])
        ints.append(listinters[i])
    #     # secinters = []
    #     # secPETs = []
    #     # secprobcor = []
    #     print(rpb)
    # ints, probs = (list(x) for x in zip(*sorted(zip(ainters, probs), key=lambda pair: pair[1], reverse=False)))
    corrected = multipletesting.fdrcorrection(brobs, alpha)
    thresh = rankprod.thresholdCalc(brobs)
    try:
        binomAlpha = round(min(thresh[2]), 3)
        if binomAlpha == 0:
            binomAlpha = 0.05
    except ValueError:
        binomAlpha = 0.05
    # brobs.append(rpb)
    print(binomAlpha)
    for i, p in zip(ints, corrected[1]):
        i.addOption(pValue=p)
        prs.append(p)
        ainters.append(i)
        # print(p, binomAlpha)
        if p <= alpha:
            if p == 0:
                i.addOption(pValue=0.00000000000001)
            fdr.append(i)

            # else:
            #     secinters.append(i)
            #     secPETs.append(pet)
            #     secprobcor.append(pc)

    # n = sum(secPETs)
    # brobs = []
    # tags = 0
    # for i, p in zip(secPETs, secprobcor):
    #     tags += i
    #     b = scipy.stats.binom.cdf(tags, n, p)
    #     brobs.append(b)
    #
    # corrected = multipletesting.fdrcorrection(brobs, alpha)
    # for i, p in zip(listinters, brobs):
    #     i.addOption(pValue=p)
    #     if p <= alpha:
    #         if p == 0:
    #             i.addOption(pValue=0.00000000000001)
    #         fdr.append(i)

    fdr = bed.BedFile(fdr, "BEDPE")
    allinters = bed.BedFile(ainters, "BEDPE")
    print(len(fdr), 'Interactions pass FDR threshold')
    bed.writeBedFile(allinters, filename + '.bedpe', format='BEDPE')
    bed.writeBedFile(fdr, filename + '.bedpe', format="BEDPE")
    intersBED12 = bed.BEDPEtoBED12(allinters)
    bed.writeBedFile(intersBED12, filename + '.bed', format='BED12')

    return ainters, fdr, allinters, PETs, dists, brobs, corrected[1]