Example 1
    def test_multiple_test_correction(self):
        pvalues = [0.001, 0.005, 0.1, 0.5, 0.01]
        rejected_hypotheses = np.array([True, True, False, False, True])
        corrected_pvalues = np.array([0.005, 0.0125, 0.125, 0.5, 0.01666667])

        rej, cor = multiple_test_correction(pvalues)

        self.assertTrue(np.array_equal(rej, rejected_hypotheses), msg="multiple_test_correction returns wrong list of "
                                                                      "rejected hypotheses")
        self.assertTrue(np.allclose(cor, corrected_pvalues), msg="multiple_test_correction returns wrong list of"
                                                                 " corrected pvalues")

        pvalues = [0.01, 0.05, 0.1, 0.5]
        rejected_hypotheses = np.array([False, False, False, False])
        corrected_pvalues = np.array([0.08333333, 0.20833333, 0.27777777, 1])

        rej, cor = multiple_test_correction(pvalues, method="negcorr")

        self.assertTrue(np.array_equal(rej, rejected_hypotheses), msg="multiple_test_correction(negcorr) returns wrong"
                                                                      " list of rejected hypotheses")
        self.assertTrue(np.allclose(cor, corrected_pvalues), msg="multiple_test_correction(negcorr) returns wrong"
                                                                 " list of corrected pvalues")

        with self.assertRaises(ValueError):
            multiple_test_correction(pvalues, method="some_method")
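
For reference, the corrected p-values asserted in this test match a plain Benjamini-Hochberg adjustment. The sketch below is a hypothetical bh_correction helper, not the tested multiple_test_correction itself; it assumes the default 'indep' method is BH, as the expected numbers suggest, and reproduces them with NumPy only.

import numpy as np

def bh_correction(pvals, alpha=0.05):
    # Benjamini-Hochberg: scale sorted p-values by n/rank, then enforce monotonicity
    pvals = np.asarray(pvals, dtype=float)
    n = len(pvals)
    order = np.argsort(pvals)
    scaled = pvals[order] * n / np.arange(1, n + 1)
    # running minimum from the largest p-value downwards keeps the adjustment monotone
    scaled = np.minimum.accumulate(scaled[::-1])[::-1]
    corrected = np.empty(n)
    corrected[order] = np.minimum(scaled, 1.0)
    return corrected <= alpha, corrected

# bh_correction([0.001, 0.005, 0.1, 0.5, 0.01]) yields the rejection flags and
# corrected p-values asserted above.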
Example 2
def main():

    # get command line arguments
    args = parseArgs()

    print(f"Using bam: {args.bam}")
    bf = args.bam

    print(f"Will write peaks: {args.outfile}_peaks.tsv")
    of = args.outfile

    cf = args.controlfile
    cs = args.chromsizes
    pvalue = args.pvalue
    minreads = args.minreads
    minsize = args.minsize
    bs = args.binsize

    corr = args.correct_pval
    if corr not in ["bh", "by", None]:
        print("Invalid correction method (please pass either 'bh' or 'by'")
        sys.exit(1)

    res, cov, cov2 = call_peaks(bf, cs, pvalue, minreads, bs, cfile=cf)

    # rpm norm the signal before writing to bigwig
    cov2.coverage = np.array(cov2.coverage,
                             dtype='object') * (1e6 / float(cov2.reads))
    bwfile = of + ".bw"
    write_bigwig(cov2, bwfile, cs)

    # write peaks to file
    outbed = of + "_peaks.tsv"
    write_bed(res, outbed, minsize)

    # read the peaks back in so they can be corrected and/or counted
    dat = pd.read_csv(
        outbed,
        sep="\t",
        names=["chr", "start", "end", "name", "score", "strand"])

    if corr is not None:
        if corr == "bh":
            b, corr = multiple_test_correction(dat.score, method="p")
        elif corr == "by":
            b, corr = multiple_test_correction(dat.score, method="n")
        dat["score"] = corr
        # store scores as -log10(p) so the filter threshold below is on the same scale
        dat["score"] = dat["score"].apply(lambda x: -np.log10(x))
        dat[dat.score > -np.log10(pvalue)].to_csv(outbed,
                                                  sep="\t",
                                                  header=False,
                                                  index=False)
    write_counts(of, dat)
Example 3
def filter_by_pvalue_strand_lag(ratios, pcutoff, pvalues, output, no_correction, name, singlestrand):
    """Filter DPs by strang lag and pvalue"""
    if not singlestrand:
        zscore_ratios = zscore(ratios)
        ratios_pass = (zscore_ratios > -2) & (zscore_ratios < 2)
    if not no_correction:
        pv_pass = [True] * len(pvalues)
        # convert -log10 p-values back to raw p-values; list() keeps len() working on Python 3
        pvalues = list(map(lambda x: 10**-x, pvalues))
        
        _output_BED(name + '-uncor', output, pvalues, pv_pass)
        _output_narrowPeak(name + '-uncor', output, pvalues, pv_pass)
        
        pv_pass, pvalues = multiple_test_correction(pvalues, alpha=pcutoff)
    else:
        pv_pass = np.where(np.asarray(pvalues) >= -log10(pcutoff), True, False)
    
    if not singlestrand:
        filter_pass = np.bitwise_and(ratios_pass, pv_pass)
        assert len(pv_pass) == len(ratios_pass)
    else:
        filter_pass = pv_pass
    
    assert len(output) == len(pvalues)
    assert len(filter_pass) == len(pvalues)
    
    return output, pvalues, filter_pass
Example 4
def filter_by_pvalue_strand_lag(ratios, pcutoff, pvalues, output,
                                no_correction, name, singlestrand):
    """Filter DPs by strang lag and pvalue"""
    if not singlestrand:
        zscore_ratios = zscore(ratios)
        ratios_pass = (zscore_ratios > -2) & (zscore_ratios < 2)
    if not no_correction:
        pv_pass = [True] * len(pvalues)
        # convert -log10 p-values back to raw p-values; list() keeps len() working on Python 3
        pvalues = list(map(lambda x: 10**-x, pvalues))

        _output_BED(name + '-uncor', output, pvalues, pv_pass)
        _output_narrowPeak(name + '-uncor', output, pvalues, pv_pass)

        pv_pass, pvalues = multiple_test_correction(pvalues, alpha=pcutoff)
    else:
        pv_pass = np.where(np.asarray(pvalues) >= -log10(pcutoff), True, False)

    if not singlestrand:
        filter_pass = np.bitwise_and(ratios_pass, pv_pass)
        assert len(pv_pass) == len(ratios_pass)
    else:
        filter_pass = pv_pass

    assert len(output) == len(pvalues)
    assert len(filter_pass) == len(pvalues)

    return output, pvalues, filter_pass
Example 5
def multiple_correction(dic):
    """
    dic[ty][r][q] = p
    """
    for ty in dic.keys():
        all_p = []
        rn = len(dic[ty].keys())
        qn = len(list(dic[ty].values())[0].keys())
        cue = {}
        i = 0
        if rn == 1 and qn == 1: return
        # get all p values from the dictionary
        for r in dic[ty].keys():
            for q in dic[ty][r].keys():

                if isinstance(dic[ty][r][q], str):
                    pass
                else:
                    all_p.append(dic[ty][r][q])
                    cue[ty + r + q] = i
                    i = i + 1
        # correction
        reject, pvals_corrected = multiple_test_correction(all_p, alpha=0.05, method='indep')
        # modify all p values
        for ir, r in enumerate(dic[ty].keys()):
            for iq, q in enumerate(dic[ty][r].keys()):
                try:
                    dic[ty][r][q] = pvals_corrected[cue[ty + r + q]]
                except KeyError:
                    # entries stored as strings were never added to `cue`
                    pass
Example 6
def multiple_correction(dic):
    """
    dic[ty][r][q] = p
    """
    for ty in dic.keys():
        all_p = []
        rn = len(dic[ty].keys())
        qn = len(list(dic[ty].values())[0].keys())
        cue = {}
        i = 0
        if rn == 1 and qn == 1: return
        # get all p values from the dictionary
        for r in dic[ty].keys():
            for q in dic[ty][r].keys():

                if isinstance(dic[ty][r][q], str):
                    pass
                else:
                    all_p.append(dic[ty][r][q])
                    cue[ty + r + q] = i
                    i = i + 1
        # correction
        reject, pvals_corrected = multiple_test_correction(all_p,
                                                           alpha=0.05,
                                                           method='indep')
        # modify all p values
        for ir, r in enumerate(dic[ty].keys()):
            for iq, q in enumerate(dic[ty][r].keys()):
                try:
                    dic[ty][r][q] = pvals_corrected[cue[ty + r + q]]
                except KeyError:
                    # entries stored as strings were never added to `cue`
                    pass
Example 7
    def test_multiple_test_correction_using_R(self):
        pvalues_list = [[0.001, 0.005, 0.1, 0.5, 0.01], [0.03, 0.8, 0.47, 0.1], [0.0003, 0.4, 0.002]]

        # corrected pvalues calculated by using R function p.adjust
        corrected_pvalues = [np.array([0.005, 0.0125, 0.125, 0.5, 0.01666667]), np.array([0.12, 0.8, 0.62666667, 0.2]),
                             np.array([0.0009, 0.4, 0.003])]

        # get corrected pvalues from own function
        for i in range(len(pvalues_list)):
            res = multiple_test_correction(pvalues_list[i])
            self.assertTrue(np.allclose(res[1], corrected_pvalues[i]), msg="multiple_test_correction returns wrong list "
                                                                           "of corrected pvalues")
Example 8
def generate_rna_exp_pv_table(root, multi_corr=True):
    "Generate p value table for Experiments vs RNA in the same project"
    
    nested_dict = lambda: defaultdict(nested_dict)
    #nested_dict = lambda: defaultdict(lambda: 'n.a.')

    data = nested_dict()
    rnas = []
    
    for item in os.listdir(root):
        pro = os.path.join(root, item, "profile.txt")
        if os.path.isfile(pro):
            with open(pro) as f:
                for line in f:
                    if line.startswith("Experiment"): continue
                    else:
                        line = line.strip().split("\t")
                        data[item][line[0]] = float(line[7])
                        rnas.append(line[0])
                        
    
    exp_list = sorted(data.keys())
    rnas = sorted(list(set(rnas)))
    
    pvs = []
    for rna in rnas:
        for exp in exp_list:
            if data[exp][rna]: pvs.append(data[exp][rna])
    if multi_corr:
        reject, pvals_corrected = multiple_test_correction(pvs, alpha=0.05, method='indep')
    else:
        pvals_corrected = pvs

    with open(os.path.join(root, "table_exp_rna_pv.txt"), "w") as t:
        print("\t".join(["RNA_ID"] + exp_list), file=t)
        i = 0
        for rna in rnas:
            newline = [rna]
            for exp in exp_list:
                if data[exp][rna]:
                    newline.append(str(pvals_corrected[i]))
                    i += 1
                else:
                    newline.append("n.a.")
            print("\t".join(newline), file=t)

    
Example 9
def generate_rna_exp_pv_table(root, multi_corr=True):
    "Generate p value table for Experiments vs RNA in the same project"

    nested_dict = lambda: defaultdict(nested_dict)
    # nested_dict = lambda: defaultdict(lambda: 'n.a.')

    data = nested_dict()
    rnas = []

    for item in os.listdir(root):
        pro = os.path.join(root, item, "profile.txt")
        if os.path.isfile(pro):
            with open(pro) as f:
                for line in f:
                    if line.startswith("Experiment"):
                        continue
                    else:
                        line = line.strip().split("\t")
                        data[item][line[0]] = float(line[7])
                        rnas.append(line[0])

    exp_list = sorted(data.keys())
    rnas = sorted(list(set(rnas)))

    pvs = []
    for rna in rnas:
        for exp in exp_list:
            if data[exp][rna]: pvs.append(data[exp][rna])
    if multi_corr:
        reject, pvals_corrected = multiple_test_correction(pvs, alpha=0.05, method='indep')
    else:
        pvals_corrected = pvs

    with open(os.path.join(root, "table_exp_rna_pv.txt"), "w") as t:
        print("\t".join(["RNA_ID"] + exp_list), file=t)
        i = 0
        for rna in rnas:
            newline = [rna]
            for exp in exp_list:
                if data[exp][rna]:
                    newline.append(str(pvals_corrected[i]))
                    i += 1
                else:
                    newline.append("n.a.")
            print("\t".join(newline), file=t)
Example 10
def get_peaks(name, DCS, states, ext_size, merge, distr, pcutoff,
              no_correction):
    indices_of_interest = DCS.indices_of_interest
    first_overall_coverage = DCS.first_overall_coverage
    second_overall_coverage = DCS.second_overall_coverage

    c1 = list(first_overall_coverage)
    c2 = list(second_overall_coverage)

    tmp_peaks = []

    for i in range(len(indices_of_interest)):
        if states[i] not in [1, 2]:
            continue  #ignore background states

        strand = '+' if states[i] == 1 else '-'

        cov1 = c1[indices_of_interest[i]]
        cov2 = c2[indices_of_interest[i]]
        chrom, start, end = DCS._index2coordinates(indices_of_interest[i])

        tmp_peaks.append((chrom, start, end, cov1, cov2, strand))

    i, j = 0, 0

    peaks = []
    pvalues = []

    while i < len(tmp_peaks):
        j += 1
        c, s, e, c1, c2, strand = tmp_peaks[i]
        v1 = [c1]
        v2 = [c2]

        #merge bins
        while i + 1 < len(tmp_peaks) and e == tmp_peaks[
                i + 1][1] and strand == tmp_peaks[i + 1][5]:
            e = tmp_peaks[i + 1][2]
            v1.append(tmp_peaks[i + 1][3])
            v2.append(tmp_peaks[i + 1][4])
            i += 1

        s1 = sum(v1)
        s2 = sum(v2)

        if s1 + s2 > SIGNAL_CUTOFF:
            pvalues.append(('NA', 'NA', 'NA', 'NA'))
        else:
            if strand == '+':
                pvalues.append((s1, s2, 'l', distr))
            else:
                pvalues.append((s1, s2, 'r', distr))

        peaks.append((c, s, e, s1, s2, strand))
        i += 1

    print('Number of Peaks where p-value is not calculated: ',
          pvalues.count(('NA', 'NA', 'NA', 'NA')),
          file=sys.stderr)

    #pool = multiprocessing.Pool(processes=2)#multiprocessing.cpu_count() * 3/2)
    # list() so the p-values can be measured with len() and reused on Python 3
    pvalues = list(map(_compute_pvalue, pvalues))

    assert len(pvalues) == len(peaks)

    merge_delete(ext_size, merge, peaks, pvalues, name)

    #peaks = [(c, s, e, s1, s2, strand)]
    if not no_correction:
        # convert -log10 p-values back to raw p-values before correcting
        pvalues = list(map(lambda x: 10**-x, pvalues))
        pv_pass, pvalues = multiple_test_correction(pvalues, alpha=pcutoff)
        pvalues = list(map(_get_log10pvalue, pvalues))
    else:
        pv_pass = [True] * len(pvalues)

    _output_BED(name, pvalues, peaks, pv_pass)
    _output_narrowPeak(name, pvalues, peaks, pv_pass)
Example 11
def get_peaks(name, DCS, states, ext_size, merge, distr, pcutoff, no_correction):
    indices_of_interest = DCS.indices_of_interest
    first_overall_coverage = DCS.first_overall_coverage
    second_overall_coverage = DCS.second_overall_coverage
    
    c1 = list(first_overall_coverage)
    c2 = list(second_overall_coverage)
    
    tmp_peaks = []
    
    for i in range(len(indices_of_interest)):
        if states[i] not in [1,2]:
            continue #ignore background states
        
        strand = '+' if states[i] == 1 else '-'
        
        cov1 = c1[indices_of_interest[i]]
        cov2 = c2[indices_of_interest[i]]
        chrom, start, end = DCS._index2coordinates(indices_of_interest[i])
        
        tmp_peaks.append((chrom, start, end, cov1, cov2, strand))

    i, j = 0, 0
    
    peaks = []
    pvalues = []
    
    while i < len(tmp_peaks):
        j+=1
        c, s, e, c1, c2, strand = tmp_peaks[i]
        v1 = [c1]
        v2 = [c2]
        
        #merge bins
        while i+1 < len(tmp_peaks) and e == tmp_peaks[i+1][1] and strand == tmp_peaks[i+1][5]:
            e = tmp_peaks[i+1][2]
            v1.append(tmp_peaks[i+1][3])
            v2.append(tmp_peaks[i+1][4])
            i += 1
        
        s1 = sum(v1)
        s2 = sum(v2)

        if s1 + s2 > SIGNAL_CUTOFF:
            pvalues.append(('NA', 'NA', 'NA', 'NA'))
        else:
            if strand == '+':
                pvalues.append((s1, s2, 'l', distr))
            else:
                pvalues.append((s1, s2, 'r', distr))

        peaks.append((c, s, e, s1, s2, strand))
        i += 1
    
    print('Number of Peaks where p-value is not calculated: ', pvalues.count(('NA', 'NA', 'NA', 'NA')), file=sys.stderr)
    
    #pool = multiprocessing.Pool(processes=2)#multiprocessing.cpu_count() * 3/2)
    # list() so the p-values can be measured with len() and reused on Python 3
    pvalues = list(map(_compute_pvalue, pvalues))
    
    assert len(pvalues) == len(peaks)
    
    merge_delete(ext_size, merge, peaks, pvalues, name)
    
    #peaks = [(c, s, e, s1, s2, strand)]
    if not no_correction:
        #first output uncorrected p-values
        pv_pass = [True] * len(pvalues)
        _output_BED(name + '-uncor', pvalues, peaks, pv_pass)
        _output_narrowPeak(name + '-uncor', pvalues, peaks, pv_pass)
        
        #then correct p-values and output
        pvalues = list(map(lambda x: 10**-x, pvalues))
        pv_pass, pvalues = multiple_test_correction(pvalues, alpha=pcutoff)
        pvalues = list(map(_get_log10pvalue, pvalues))
    else:
        pv_pass = list(np.where(np.asarray(pvalues) >= -log10(pcutoff), True, False))
    
    _output_BED(name, pvalues, peaks, pv_pass)
    _output_narrowPeak(name, pvalues, peaks, pv_pass)