コード例 #1
0
ファイル: projection_test.py プロジェクト: rafalcode/reg-gen
class Projection:
    def __init__(self, reference_path, query_path):
        """Load the reference and query experimental matrices.

        Each path is read into its own ``ExperimentalMatrix``, after which
        ``remove_empty_regionset()`` is called on it. The matrices, their
        region sets and the region-set names are stored on the instance.

        Args:
            reference_path: path handed to ``read()`` for the reference matrix.
            query_path: path handed to ``read()`` for the query matrix.
        """
        def load(path):
            # Build one matrix: construct, read, then drop empty region sets.
            matrix = ExperimentalMatrix()
            matrix.read(path)
            matrix.remove_empty_regionset()
            return matrix

        # Reference
        self.rEM = load(reference_path)
        self.references = self.rEM.get_regionsets()
        self.referencenames = self.rEM.get_regionsnames()
        # Query
        self.qEM = load(query_path)
        self.query = self.qEM.get_regionsets()
        self.querynames = self.qEM.get_regionsnames()
        # Filled in later by other methods.
        self.parameter = []
        self.background = None

    def group_refque(self, groupby=False):
        """Group the reference and query matrices.

        Delegates to the module-level ``group_refque`` function and stores
        its two results as ``self.groupedreference`` and
        ``self.groupedquery``.

        Args:
            groupby: forwarded unchanged to the module-level helper.
        """
        grouped = group_refque(self.rEM, self.qEM, groupby)
        self.groupedreference, self.groupedquery = grouped

    def colors(self, colorby, definedinEM):
        """Build ``self.color_list`` for the grouped queries.

        The mapping comes from ``color_groupded_region``; an extra
        'Background' entry with the value '0.70' is added to it.

        Args:
            colorby: forwarded to ``color_groupded_region``.
            definedinEM: forwarded to ``color_groupded_region``.
        """
        palette = color_groupded_region(self.qEM, self.groupedquery, colorby, definedinEM)
        palette['Background'] = '0.70'
        self.color_list = palette

    def ref_union(self):
        """Use the union of each group's references as that group's background.

        For every group a new ``GenomicRegionSet`` is filled by combining all
        reference sets of the group and then merged. Afterwards every
        reference and query set of the group is replaced by the result of
        ``trim_by`` against that background.
        """
        self.background = OrderedDict()
        for ty, refs in self.groupedreference.items():
            union = GenomicRegionSet("union of references")
            self.background[ty] = union
            for r in refs:
                union.combine(r)
            union.merge()

        for ty in self.groupedreference.keys():
            bg = self.background[ty]
            self.groupedreference[ty] = [r.trim_by(background=bg)
                                         for r in self.groupedreference[ty]]
            self.groupedquery[ty] = [q.trim_by(background=bg)
                                     for q in self.groupedquery[ty]]

    def set_background(self, bed_path):
        """Use the regions of a BED file as background for every group.

        The same background object is registered for each group, and every
        reference and query set of the group is replaced by the result of
        ``trim_by`` against it.

        Args:
            bed_path: path handed to ``GenomicRegionSet.read_bed``.
        """
        bg = GenomicRegionSet("background")
        bg.read_bed(bed_path)

        def trimmed(regionsets):
            # Trim each set of the list against the shared background.
            return [rs.trim_by(background=bg) for rs in regionsets]

        self.background = OrderedDict()
        for ty in self.groupedreference.keys():
            self.background[ty] = bg
            self.groupedreference[ty] = trimmed(self.groupedreference[ty])
            self.groupedquery[ty] = trimmed(self.groupedquery[ty])

    def projection_test(self, organism):
        """Run the projection test of every query against every reference.

        For each group, every reference ``r`` is tested against every query
        ``q`` of the same group (pairs with identical names are skipped) by
        calling ``r.projection_test(q, organism, extra=True, background=...)``.
        The four returned values are stored, keyed by group, reference name
        and query name, in:

        * ``self.bglist``      -- first returned value (background)
        * ``self.qlist``       -- second returned value (ratio)
        * ``self.plist``       -- third returned value (p value)
        * ``self.interq_list`` -- fourth returned value (intersected query)

        ``self.lenlist`` maps each region-set name to ``len()`` of the set.
        After all tests ``multiple_correction(self.plist)`` is called, and a
        'Background' entry is added to ``self.qlist[group][reference]``.

        Args:
            organism: forwarded to each region set's ``projection_test``.
        """
        self.bglist = OrderedDict()
        self.qlist = OrderedDict()
        self.plist = OrderedDict()
        self.interq_list = OrderedDict()
        self.lenlist = {}

        for ty in self.groupedquery.keys():
            self.bglist[ty] = OrderedDict()
            self.qlist[ty] = OrderedDict()
            self.plist[ty] = OrderedDict()
            self.interq_list[ty] = OrderedDict()
            # No (or an empty) background mapping means "no explicit background".
            bgset = self.background[ty] if self.background else None

            for r in self.groupedreference[ty]:
                self.bglist[ty][r.name] = OrderedDict()
                self.qlist[ty][r.name] = OrderedDict()
                self.plist[ty][r.name] = OrderedDict()
                self.interq_list[ty][r.name] = OrderedDict()
                self.lenlist[r.name] = len(r)
                for q in self.groupedquery[ty]:
                    if r.name == q.name:
                        continue
                    bg, ratio, p, interq = r.projection_test(q, organism, extra=True, background=bgset)
                    self.bglist[ty][r.name][q.name] = bg
                    self.qlist[ty][r.name][q.name] = ratio
                    self.plist[ty][r.name][q.name] = p
                    self.interq_list[ty][r.name][q.name] = interq
                    self.lenlist[q.name] = len(q)

        # multiple test correction
        multiple_correction(self.plist)

        for ty in self.groupedquery.keys():
            for r in self.groupedreference[ty]:
                tested = self.bglist[ty][r.name]
                # The first recorded background value is used for the whole
                # reference. next(iter(...)) works on Python 2 lists and
                # Python 3 views alike, whereas values()[0] fails on Python 3.
                # A reference without any tested query gets no entry.
                if tested:
                    self.qlist[ty][r.name]['Background'] = next(iter(tested.values()))

    def output_interq(self, directory):
        """Output the intersected query to the reference in BED format"""
        try:
            os.stat(os.path.dirname(directory))
        except:
            os.mkdir(os.path.dirname(directory))
        try:
            os.stat(directory)
        except:
            os.mkdir(directory)
        for ty in self.interq_list.keys():
            if ty: g = ty+ "_"
            else:
                g = ""
            for r in self.interq_list[ty].keys():
                for q in self.interq_list[ty][r].keys():
                    self.interq_list[ty][r][q].write_bed(os.path.join(directory, g + q + "_intersected_" + r + ".bed"))

    def plot(self, logt=None, pw=3, ph=3):
        """Draw one bar chart per group from ``self.qlist`` and keep the figure.

        Each group gets its own subplot (stacked vertically). Within a
        subplot, every reference occupies one unit on the x axis and holds
        one bar per key of ``self.qlist[group][reference]``, coloured via
        ``self.color_list``. The figure is stored in ``self.fig``.

        Args:
            logt: when truthy, bars are drawn with ``log=logt``, the y axis
                is set to log scale and zero values are replaced by 0.000001.
            pw: figure width passed to ``figsize``.
            ph: height per group; the figure height is ``ph`` times the
                number of groups.
        """

        tw = pw
        th = len(self.qlist.keys()) * ph
        f, ax = plt.subplots(len(self.qlist.keys()), 1, dpi=300,figsize=(tw, th))

        # With a single subplot ``ax`` has no reshape(); wrap it in a list so
        # it can be indexed like the multi-subplot case.
        try:
            ax = ax.reshape(-1)
        except:
            ax = [ax]

        g_label = []
        for ind_ty, ty in enumerate(self.qlist.keys()):
            g_label.append(ty)
            r_label = []
            for ind_r, r in enumerate(self.qlist[ty].keys()):
                r_label.append(r)
                width = 0.8 / (len(self.qlist[ty][r].keys()) + 1)  # Plus one background
                for ind_q, q in enumerate(self.qlist[ty][r].keys()):
                    # Bars of one reference sit side by side, starting 0.1
                    # after the reference's integer position.
                    x = ind_r + ind_q * width + 0.1
                    y = self.qlist[ty][r][q]
                    if y == 0 and logt: y = 0.000001
                    ax[ind_ty].bar(x, y, width=width, color=self.color_list[q], edgecolor="none",
                                   align='edge', log=logt, label=q)
            if logt:
                ax[ind_ty].set_yscale('log')
            else:
                ax[ind_ty].locator_params(axis='y', nbins=2)

            ax[ind_ty].set_title(ty)
            ax[ind_ty].yaxis.tick_left()
            ax[ind_ty].set_ylabel('Percentage of intersected regions', fontsize=8)
            # NOTE(review): ``width`` here is the value left over from the last
            # reference of the loop above; it is unbound if the group is empty.
            ax[ind_ty].set_xticks([i + 0.5 - 0.5 * width for i in range(len(r_label))])
            ax[ind_ty].set_xticklabels(r_label, rotation=30, ha="right", fontsize=8)
            ax[ind_ty].tick_params(axis='x', which='both', top='off', bottom='off', labelbottom='on')

            # Every bar carries a label, so pick one handle per query name
            # (plus 'Background') to avoid duplicate legend entries.
            handles, labels = ax[ind_ty].get_legend_handles_labels()
            uniq_labels = [q.name for q in self.groupedquery[ty]] + ["Background"]

            ax[ind_ty].legend([handles[labels.index(l)] for l in uniq_labels], uniq_labels,
                              loc='center left', handlelength=1, handletextpad=1,
                              columnspacing=2, borderaxespad=0., prop={'size': 10}, bbox_to_anchor=(1.05, 0.5))
            for spine in ['top', 'right']:  # 'left', 'bottom'
                ax[ind_ty].spines[spine].set_visible(False)
        f.tight_layout()
        self.fig = f

    def heatmap(self):
        """Unfinished heatmap of the p values in ``self.plist``.

        Creates one subplot per group and walks the nested ``self.plist``
        mapping, but collects no data and draws nothing: ``data`` stays
        empty and the transposed array is discarded. Nothing is returned
        or stored on the instance.
        """
        n_groups = len(self.plist.keys())
        f, ax = plt.subplots(1, n_groups)
        # A single subplot has no reshape(); wrap it in a list instead.
        try:
            ax = ax.reshape(-1)
        except:
            ax = [ax]

        g_label = []
        for ty in self.plist.keys():
            g_label.append(ty)
            r_label = []
            data = []
            for r in self.plist[ty].keys():
                r_label.append(r)
                for q in self.plist[ty][r].keys():
                    pass
            da = numpy.array(data).transpose()

    def gen_html(self, directory, title, args, align=50):
        """Write the HTML report pages of the projection test.

        Two pages are written below ``directory/title``: ``index.html`` with
        the figure, one results table per group and a table of assumptions,
        and ``parameters.html`` listing the command-line arguments. For every
        group the collected plain-text rows are also passed to
        ``output_array`` with the file name ``"statistics" + group + ".txt"``.

        Rows are skipped when the p value equals ``"na"`` (the reference is
        then reported in the assumptions table) or when the proportion is
        below ``args.cfp``.

        Args:
            directory: base output directory.
            title: sub-folder of ``directory`` that receives the pages.
            args: object providing ``cfp``, ``r``, ``q``, ``o``, ``t`` and
                ``organism`` attributes.
            align: forwarded to ``Html.add_zebra_table``.
        """
        dir_name = os.path.basename(directory)
        statistic_table = []
        html_header = "Projection Test: " + dir_name
        link_d = OrderedDict()
        link_d["Projection test"] = "index.html"
        link_d["Parameters"] = "parameters.html"

        html = Html(name=html_header, links_dict=link_d,
                    fig_rpath="../style", RGT_header=False, other_logo="viz", homepage="../index.html")
        html.add_figure("projection_test.png", align="center")

        header_list = ["No.",
                       "Reference<br>name",
                       "Query<br>name",
                       "Reference<br>number",
                       "Query<br>number",
                       "Proportion",
                       "Background<br>proportion",
                       "Positive<br>association<br>p-value",
                       "Negative<br>association<br>p-value"]
        statistic_table.append(["Reference_name", "Query_name", "Reference_number",
                                "Query_number", "Proportion", "Background_proportion",
                                "Positive_association_p-value", "Negative_association_p-value"])
        type_list = 'ssssssssssssssss'
        col_size_list = [5, 10, 10, 10, 10, 10, 10, 15, 15]

        # References whose p value came back as "na".
        nalist = []
        for ind_ty, ty in enumerate(self.plist.keys()):
            html.add_heading(ty, size=4, bold=False)
            data_table = []
            for ind_r, r in enumerate(self.plist[ty].keys()):
                rlen = str(self.lenlist[r])
                for ind_q, q in enumerate(self.plist[ty][r].keys()):
                    qlen = str(self.lenlist[q])
                    backv = value2str(self.qlist[ty][r]['Background'])
                    propor = value2str(self.qlist[ty][r][q])
                    pv = self.plist[ty][r][q]
                    if pv == "na":
                        nalist.append(r)
                        continue
                    elif self.qlist[ty][r][q] < args.cfp:
                        continue
                    else:
                        pvn = 1 - pv

                        if self.plist[ty][r][q] < 0.05:
                            # Significant: pv goes (in red) into the positive
                            # column when the proportion exceeds the background,
                            # otherwise into the negative column.
                            if self.qlist[ty][r]['Background'] < self.qlist[ty][r][q]:
                                data_table.append([str(ind_ty), r, q, rlen, qlen, propor, backv,
                                                   "<font color=\"red\">" + value2str(pv) + "</font>", value2str(pvn)])
                                statistic_table.append([r, q, rlen, qlen, propor, backv, value2str(pv), value2str(pvn)])
                            else:
                                data_table.append([str(ind_ty), r, q, rlen, qlen, propor, backv,
                                                   value2str(pvn), "<font color=\"red\">" + value2str(pv) + "</font>"])
                                statistic_table.append([r, q, rlen, qlen, propor, backv, value2str(pvn), value2str(pv)])
                        else:
                            data_table.append(
                                [str(ind_ty), r, q, rlen, qlen, propor, backv, value2str(pv), value2str(pvn)])
                            statistic_table.append([r, q, rlen, qlen, propor, backv, value2str(pv), value2str(pvn)])

            html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, sortable=True)
            # NOTE(review): statistic_table is not reset per group, so each
            # statistics file also contains the rows of the groups before it.
            output_array(statistic_table, directory=directory, folder=title, filename="statistics" + ty + ".txt")

        header_list = ["Assumptions and hypothesis"]
        data_table = [['If the background proportion is too small, it may cause bias in p value.'],
                      [
                          'For projection test, the reference GenomicRegionSet should have non-zero length in order to calculate its background proportion.'],
                      ['P values are corrected by multiple test correction.'],
                      ['Positive association is defined by: Proportion > Background.'],
                      ['Negative association is defined by: Proportion < Background.']]

        # Report each affected reference once.
        nalist = set(nalist)
        if len(nalist) > 0:
            data_table.append([
                                  'The following references contain zero-length region which cause error in proportion calculation, please check it:<br>' +
                                  '     <font color=\"red\">' + ', '.join([s for s in nalist]) + '</font></p>'])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left")
        html.add_fixed_rank_sortable()

        html.write(os.path.join(directory, os.path.join(title, "index.html")))

        # Parameters
        html = Html(name=html_header, links_dict=link_d,
                    fig_rpath="../style", RGT_header=False, other_logo="viz", homepage="../index.html")
        header_list = ["Description", "Argument", "Value"]
        data_table = [["Reference", "-r", args.r],
                      ["Query", "-q", args.q],
                      ["Output directory", "-o", os.path.basename(args.o)],
                      ["Experiment title", "-t", args.t],
                      ["Organism", "-organism", args.organism],
                      ["Cutoff of proportion", "-cfp", str(args.cfp)]]

        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left")
        html.add_free_content([
                                  '<a href="reference_experimental_matrix.txt" style="margin-left:100">See reference experimental matrix</a>'])
        html.add_free_content(
            ['<a href="query_experimental_matrix.txt" style="margin-left:100">See query experimental matrix</a>'])
        html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See details</a>'])
        html.write(os.path.join(directory, os.path.join(title, "parameters.html")))

    def table(self, directory, folder):
        """Write the test results as ``output_table.txt`` via ``output_array``.

        The table has a header row followed by one row per
        (reference, query) pair holding the reference name, query name,
        background value, proportion and p value.

        Args:
            directory: forwarded to ``output_array``.
            folder: forwarded to ``output_array``.
        """
        rows = [numpy.array([["#reference", "query", "background", "proportion", "p-value"]])]
        for ty, by_reference in self.plist.items():
            for r, by_query in by_reference.items():
                for q, p_value in by_query.items():
                    rows.append(numpy.array(
                        [[r, q, self.qlist[ty][r]['Background'], self.qlist[ty][r][q], p_value]]))
        # Stack the header and all rows in one go.
        arr = numpy.vstack(rows)
        output_array(arr, directory, folder, filename="output_table.txt")

    def distribution(self, organism):
        """Compute per-chromosome fractions for every reference and query set.

        ``self.chrom_list`` becomes the sorted chromosome names of the genome
        obtained through ``get_genome_data(organism)``. ``self.disperDict``
        maps group -> region-set name -> list with one value per chromosome
        of ``self.chrom_list``: the summed ``len()`` of the regions that
        ``any_chrom`` returns for that chromosome, divided by the set's
        ``total_coverage()``. A "Genome" entry holds the corresponding
        genome values divided by the genome's total coverage.

        Every reference and query set is merged in place first.

        Args:
            organism: forwarded to ``GenomicRegionSet.get_genome_data``.
        """
        genome = GenomicRegionSet("genome")
        genome.get_genome_data(organism)
        all_cov = genome.total_coverage()
        self.chrom_list = sorted(ss.chrom for ss in genome)

        self.disperDict = OrderedDict()

        def chrom_fractions(regionset):
            # One value per chromosome, relative to the set's total coverage.
            # NOTE(review): on Python 2 without ``from __future__ import
            # division`` this would be an integer division -- confirm.
            regionset.merge()
            total = regionset.total_coverage()
            return [sum([len(s) for s in regionset.any_chrom(chrom=ch)]) / total
                    for ch in self.chrom_list]

        for ty in self.groupedreference.keys():
            per_set = OrderedDict()
            self.disperDict[ty] = per_set
            # Reference
            for r in self.groupedreference[ty]:
                per_set[r.name] = chrom_fractions(r)
            # Query (the original passed the builtin ``chr`` instead of the
            # chromosome name here)
            for q in self.groupedquery[ty]:
                per_set[q.name] = chrom_fractions(q)
            # Genome
            per_set["Genome"] = [len(genome.any_chrom(chrom=ch)[0]) / all_cov
                                 for ch in self.chrom_list]

    def plot_distribution(self):
        """Draw one horizontal bar chart per group from ``self.disperDict``.

        Every chromosome of ``self.chrom_list`` gets one bar per entry of the
        group, and the x axis is formatted as a percentage. The figures are
        collected in the list ``self.fig`` in group order.
        """
        def to_percentage(x, pos=0):
            # Tick formatter: fraction -> percentage text.
            return '{:.2f} %'.format(100 * x)

        self.fig = []

        for ty in self.disperDict.keys():
            names = list(self.disperDict[ty].keys())
            colors = plt.cm.Set1(numpy.linspace(0.1, 0.9, len(names))).tolist()

            f, ax = plt.subplots()
            f.set_size_inches(10.5, 30)
            width = 0.9 / len(names)
            # Use the same ``numpy`` name as the rest of the class.
            ind = numpy.arange(len(self.chrom_list))

            for ind_r, r in enumerate(names):
                # Offset each entry's bars so they sit side by side.
                ax.barh(ind + width * ind_r, self.disperDict[ty][r], width, color=colors[ind_r])

            plt.xlabel('Percentage')
            ax.xaxis.set_major_formatter(mtick.FuncFormatter(to_percentage))

            ax.minorticks_off()
            ax.set_yticks([x + 0.5 for x in range(len(self.chrom_list))])
            ax.set_yticklabels(self.chrom_list, rotation=0, ha="right")
            ax.tick_params(axis='y', which='both', top='off', bottom='off', labelbottom='on')

            # A real list is passed because legend() needs a label sequence.
            ax.legend(names, loc='center left', handlelength=1, handletextpad=1,
                      columnspacing=2, borderaxespad=0., prop={'size': 10}, bbox_to_anchor=(1.05, 0.5))
            for spine in ['top', 'right', 'left', 'bottom']:
                ax.spines[spine].set_visible(False)
            f.tight_layout(pad=1.08, h_pad=None, w_pad=None)
            self.fig.append(f)

    def gen_html_distribution(self, outputname, title, align=50):
        """Write ``distribution.html`` with the figures and a percentage table.

        The page references one ``distribution_test_<i>.png`` per figure in
        ``self.fig`` and a table with one row per chromosome and group,
        showing the values of ``self.disperDict`` as percentages.

        Args:
            outputname: folder name joined below ``dir``.
            title: sub-folder that receives the page; also the link label.
            align: forwarded to ``Html.add_zebra_table`` and used for the
                left margin of the free-content paragraph.
        """
        # NOTE(review): ``dir`` is presumably a module-level variable holding
        # the base directory (it would otherwise be the builtin) -- confirm.
        fp = os.path.join(dir, outputname, title)
        link_d = {title: "distribution.html"}
        html = Html(name="Viz", links_dict=link_d, fig_dir=os.path.join(dir, outputname, "fig"),
                    other_logo="viz", homepage="../index.html")
        for i, f in enumerate(self.fig):
            html.add_figure("distribution_test_" + str(i) + ".png", align="center")

        html.add_free_content(['<p style=\"margin-left: ' + str(align + 150) + '">' +
                               '** </p>'])

        type_list = 'ssssssssssssssssssssssssssssssssssssssssssssss'
        col_size_list = [10] * 26
        # Defined up front so the table call below works without any group.
        header_list = ["Chromosome"]
        data_table = []
        for ty in self.disperDict.keys():
            names = list(self.disperDict[ty].keys())
            # list() is required: a Python 3 keys view cannot be added to a list.
            header_list = ["Chromosome"] + names
            html.add_heading(ty, size=4, bold=False)
            for i, ch in enumerate(self.chrom_list):
                data_table.append(
                    [ch] + ["{:.3f} %".format(100 * self.disperDict[ty][r][i]) for r in names])

        # NOTE(review): a single table is emitted after the loop, with the
        # header of the last group and the rows of all groups -- confirm
        # whether one table per group was intended.
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align)

        html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See parameters</a>'])
        html.add_free_content([
                                  '<a href="reference_experimental_matrix.txt" style="margin-left:100">See reference experimental matrix</a>'])
        html.add_free_content(
            ['<a href="query_experimental_matrix.txt" style="margin-left:100">See query experimental matrix</a>'])
        html.write(os.path.join(fp, "distribution.html"))
コード例 #2
0
ファイル: genesFromBed.py プロジェクト: CostaLab/reg-gen
    parser.add_option("--distance", "-d", dest="distance", default=50000, help="distance from peak to gene", type="int")
    parser.add_option("--type", "-t", dest="type", default="bed", help="type of bed file (<bed>, <THOR>)", type="str")
    parser.add_option("--metric", dest="metric", default="max", help="metric to merge peaks' scores (mean, max)", type="str")
    (options, args) = parser.parse_args()
     
    i = 3
    if len(args) > i:
        parser.error("Exactly %s parameters are needed" %i)
        
    path_exp_matrix = args[0]
    path_annotation = args[1]
    
    genome_file = os.path.join(path_annotation, "chrom.sizes")
    gene_file = os.path.join(path_annotation, "association_file.bed")
    
    exp_matrix = ExperimentalMatrix()
    exp_matrix.read(path_exp_matrix, is_bedgraph=False)
    
    print("Use metric %s to merge peaks' score." %options.metric, file=sys.stderr)
    
    if options.mode is 1:
        mode_1(exp_matrix,options.distance)
    elif options.mode is 2:
        mode_2(exp_matrix,options.distance)
    elif options.mode is 3:
        mode_3(exp_matrix,options.distance,options.type)
    elif options.mode is 4:
        geneexp_file = args[2]
        mode_4(exp_matrix,options.distance,options.type,geneexp_file)

コード例 #3
0
ファイル: geneAssociation.py プロジェクト: Marvin84/reg-gen
import sys
import os.path
from rgt.GenomicRegionSet import *
from rgt.ExperimentalMatrix import *
from fisher import pvalue

# Command line: <design file> <annotation path> [<background peaks>]
back = False
designFile = sys.argv[1]
anotationPath = sys.argv[2]
# os.path.join adds the separator when the annotation path lacks a trailing
# one; plain concatenation would glue the file name onto the directory name.
genomeFile = os.path.join(anotationPath, "chrom.sizes")
geneFile = os.path.join(anotationPath, "association_file.bed")

exps = ExperimentalMatrix()
exps.read(designFile)

beds = []
geneLists = []

# this should be improved
bedGenes = GenomicRegionSet(geneFile)
bedGenes.read_bed(geneFile)
# Unique names of all regions in the association file.
allgenes = list(set(r.name for r in bedGenes))

genesets = exps.get_genesets()

# An optional third argument switches on the background mode.
if len(sys.argv) > 3:
    back = True
    backGroundPeaks = sys.argv[3]
コード例 #4
0
    backBed = GenomicRegionSet("BACK")
    backBed.read_bed(backGroundPeaksName)
    backGroundPeaks = True

# Default distance, optionally overridden by the sixth argument.
distance = 50000
if len(sys.argv) > 6:
    # Parse the value itself; len() would give the number of characters
    # of the argument instead of the distance.
    distance = int(sys.argv[6])

if len(sys.argv) > 7:
    outdir = sys.argv[7]

#genomeFile=anotationPath+"chrom.sizes"
#geneFile=anotationPath+"association_file.bed"

exps = ExperimentalMatrix()
exps.read(designFile)

beds = []
geneLists = []

#this should be improved
bedGenes = GenomicRegionSet(geneFile)
bedGenes.read_bed(geneFile)
# Unique names of all regions in the association file.
allgenes = []
for r in bedGenes:
    allgenes.append(r.name)
allgenes = list(set(allgenes))

genesets = exps.get_genesets()

if len(outdir) > 0:
コード例 #5
0
 def test_jaccard_test(self):
     """Read ``path_input2`` into two separate experimental matrices."""
     matrix1, matrix2 = ExperimentalMatrix(), ExperimentalMatrix()
     for matrix in (matrix1, matrix2):
         matrix.read(path_input2)
コード例 #6
0
 def test_projection_test(self):
     """Run ``AssociationAnalysis.projection_test`` on two matrices read from ``path_input2``."""
     matrix1, matrix2 = ExperimentalMatrix(), ExperimentalMatrix()
     for matrix in (matrix1, matrix2):
         matrix.read(path_input2)
     AssociationAnalysis.projection_test(matrix1, matrix2)
コード例 #7
0
        c.append(cov.coverage)
    return numpy.transpose(c)


def printTable(namesCol, namesLines, table, fileName):
    """Write ``table`` to ``fileName`` as tab-separated text.

    The first line is an empty cell followed by the column names. Each
    following line is the row name followed by the ``str()`` of the row's
    values.

    Args:
        namesCol: column names (strings).
        namesLines: row names, indexed in step with ``table``.
        table: iterable of rows, each an iterable of values.
        fileName: path of the file to (over)write.

    Raises:
        IndexError: if ``table`` has more rows than ``namesLines``.
    """
    # The context manager closes (and flushes) the file even on error.
    with open(fileName, "w") as f:
        f.write("\t" + "\t".join(namesCol) + "\n")
        for i, line in enumerate(table):
            cells = "\t".join([str(j) for j in line])
            f.write(namesLines[i] + "\t" + cells + "\n")


# Command line: <experimental matrix> <output dir> [<second experimental matrix>]
out = ""
experimentalFile = sys.argv[1]
exps = ExperimentalMatrix()
exps.read(experimentalFile)
# Region sets always come from the first matrix.
beds = exps.get_regionsets()
reads = exps.get_readsfiles()
readsnames = exps.get_readsnames()
outputDir = sys.argv[2]
if len(sys.argv) > 3:
    # With a second matrix, the reads files and their names are taken from
    # it instead, and ``out`` is set to the output directory.
    experimentalFile2 = sys.argv[3]
    exps2 = ExperimentalMatrix()
    exps2.read(experimentalFile2)
    reads = exps2.get_readsfiles()
    readsnames = exps2.get_readsnames()
    out = outputDir

for bed in beds:
    bednames = [
        r.chrom + ":" + str(r.initial) + "-" + str(r.final) for r in bed
コード例 #8
0
        parser.error("Exactly three parameters are needed: experimental matrix, gene expression, annotation path and prefix for output")
    
    #map arguments
    experimental_matrix_file = args[0]
    gene_exp = args[1]
    annotation_path = args[2]
    outputdir = args[3]
    
    
#     experimental_matrix_file = "/home/manuel/workspace/cluster_p/THOR/exp/exp23_macs2_payton/1"
#     gene_exp = "/home/manuel/workspace/cluster_p/allhoff/project_THOR/data/payton/gene_expression/CCmean.data"
#     annotation_path = "/home/manuel/workspace/cluster_h/rgtdata/hg19/"
#     outputdir = "/home/manuel/test/"
    
    exps = ExperimentalMatrix()
    exps.read(experimental_matrix_file)
    regionsets = exps.get_regionsets()
    
    genome_file = annotation_path + "/chrom.sizes"
    gene_file = annotation_path + "/association_file.bed"
    
    genes = GeneSet("Expression")
    genes.read_expression(gene_exp)
    
    for region in regionsets:
        bedNew = GenomicRegionSet("")
        [degenes, de_peak_genes, mappedGenes, totalPeaks, regionsToGenes] \
        = bedNew.filter_by_gene_association_old(region.fileName, genes.genes, gene_file, genome_file)
        
        [ct, labels] = averageExpression(region, genes, regionsToGenes)
        aux = region.fileName.split("/")
コード例 #9
0
    parser.add_option("--distance", "-d", dest="distance", default=50000, help="distance from peak to gene", type="int")
    parser.add_option("--type", "-t", dest="type", default="bed", help="type of bed file (<bed>, <THOR>)", type="str")
    parser.add_option("--metric", dest="metric", default="max", help="metric to merge peaks' scores (mean, max)", type="str")
    (options, args) = parser.parse_args()
     
    i = 3
    if len(args) > i:
        parser.error("Exactly %s parameters are needed" %i)
        
    path_exp_matrix = args[0]
    path_annotation = args[1]
    
    genome_file = os.path.join(path_annotation, "chrom.sizes")
    gene_file = os.path.join(path_annotation, "association_file.bed")
    
    exp_matrix = ExperimentalMatrix()
    exp_matrix.read(path_exp_matrix, is_bedgraph=False)
    
    print("Use metric %s to merge peaks' score." %options.metric, file=sys.stderr)
    
    if options.mode is 1:
        mode_1(exp_matrix,options.distance)
    elif options.mode is 2:
        mode_2(exp_matrix,options.distance)
    elif options.mode is 3:
        mode_3(exp_matrix,options.distance,options.type)
    elif options.mode is 4:
        geneexp_file = args[2]
        mode_4(exp_matrix,options.distance,options.type,geneexp_file)

コード例 #10
0
      #cov.normRPM()
      c.append(cov.coverage)
  return numpy.transpose(c)   


def printTable(namesCol,namesLines,table,fileName):
    """Write ``table`` to ``fileName`` as tab-separated text.

    The first line is an empty cell followed by the column names. Each
    following line is the row name followed by the ``str()`` of the row's
    values.

    Args:
        namesCol: column names (strings).
        namesLines: row names, indexed in step with ``table``.
        table: iterable of rows, each an iterable of values.
        fileName: path of the file to (over)write.

    Raises:
        IndexError: if ``table`` has more rows than ``namesLines``.
    """
    # The context manager closes (and flushes) the file even on error.
    with open(fileName, "w") as f:
        f.write("\t" + "\t".join(namesCol) + "\n")
        for i, line in enumerate(table):
            cells = "\t".join([str(j) for j in line])
            f.write(namesLines[i] + "\t" + cells + "\n")


# Command line: <experimental matrix> <output dir> [<second experimental matrix>]
out=""
experimentalFile = sys.argv[1]
exps=ExperimentalMatrix()
exps.read(experimentalFile)
# Region sets always come from the first matrix.
beds = exps.get_regionsets()
reads = exps.get_readsfiles()
readsnames = exps.get_readsnames()
outputDir = sys.argv[2]
if len(sys.argv) > 3:
  # With a second matrix, the reads files and their names are taken from
  # it instead, and ``out`` is set to the output directory.
  experimentalFile2 = sys.argv[3]
  exps2=ExperimentalMatrix()
  exps2.read(experimentalFile2)
  reads = exps2.get_readsfiles()
  readsnames = exps2.get_readsnames()
  out=outputDir

for bed in beds:
  bednames=[r.chrom+":"+str(r.initial)+"-"+str(r.final) for r in bed]
  c=bedCoverage(bed,reads)
コード例 #11
0
class Lineplot:
    def __init__(self, EMpath, title, annotation, organism, center, extend, rs,
                 bs, ss, df, dft, fields, test, sense):
        """Load the experimental matrix and remember the plotting settings.

        Parameters:
            EMpath  -- path of the experimental matrix to read
            title   -- stored as ``self.title``
            center, extend, rs, bs, ss, df, dft, sense
                    -- stored unchanged under attributes of the same name
            test    -- forwarded to ``ExperimentalMatrix.read`` and
                       ``match_ms_tags``
            annotation, organism, fields
                    -- accepted for interface compatibility; this
                       constructor does not use them
        """
        # Columns that every matrix carries; only the remaining (user
        # defined) columns are passed through match_ms_tags.
        builtin_columns = ('name', 'type', 'file', "reads", "regions",
                           "factors")

        self.title = title

        matrix = ExperimentalMatrix()
        self.exps = matrix
        matrix.read(EMpath, test=test)
        for column in matrix.fields:
            if column in builtin_columns:
                continue
            matrix.match_ms_tags(column, test=test)
            matrix.remove_name()

        # Region sets (a list of GenomicRegionSets) and their names.
        self.beds = matrix.get_regionsets()
        self.bednames = matrix.get_regionsnames()
        self.annotation = None

        # Reads files and their names.
        self.reads = matrix.get_readsfiles()
        self.readsnames = matrix.get_readsnames()
        self.fieldsDict = matrix.fieldsDict

        self.parameter = []

        # Plain settings, kept as given.
        self.center, self.extend = center, extend
        self.rs, self.bs, self.ss = rs, bs, ss
        self.df, self.dft = df, dft
        self.sense = sense

    def relocate_bed(self):
        self.processed_beds = []
        self.processed_bedsF = []  # Processed beds to be flapped

        for bed in self.beds:
            if self.center == 'bothends':
                newbed = bed.relocate_regions(
                    center='leftend',
                    left_length=self.extend + self.bs,
                    right_length=self.extend + self.bs)
                self.processed_beds.append(newbed)
                newbedF = bed.relocate_regions(
                    center='rightend',
                    left_length=self.extend + self.bs,
                    right_length=self.extend + self.bs)
                self.processed_bedsF.append(newbedF)
            elif self.center == 'upstream' or self.center == 'downstream':
                allbed = bed.relocate_regions(
                    center=self.center,
                    left_length=self.extend + self.bs,
                    right_length=self.extend + self.bs)
                newbed = allbed.filter_strand(strand="+")
                self.processed_beds.append(newbed)
                newbedF = allbed.filter_strand(strand="-")
                self.processed_bedsF.append(newbedF)
            else:
                newbed = bed.relocate_regions(
                    center=self.center,
                    left_length=self.extend + int(0.5 * self.bs) + 2 * self.ss,
                    right_length=self.extend + int(0.5 * self.bs) +
                    2 * self.ss)
                self.processed_beds.append(newbed)

    def group_tags(self, groupby, sortby, colorby):
        """Generate the tags for the grouping of plot
        Parameters:
            groupby = 'reads','regions','cell',or 'factor'
            colorby = 'reads','regions','cell',or 'factor'
            sortby = 'reads','regions','cell',or 'factor'
        """
        self.tag_type = [sortby, groupby, colorby, self.dft]
        if "None" in self.tag_type: self.tag_type.remove("None")

        if groupby == "None":
            self.group_tags = [""]
        elif groupby == "regions" and self.annotation:
            self.group_tags = self.bednames
        else:
            self.group_tags = gen_tags(self.exps, groupby)

        if sortby == "None":
            self.sort_tags = [""]
        elif sortby == "regions" and self.annotation:
            self.sort_tags = self.bednames
        else:
            self.sort_tags = gen_tags(self.exps, sortby)

        if colorby == "None":
            self.color_tags = [""]
        elif colorby == "regions" and self.annotation:
            self.color_tags = self.bednames
        else:
            self.color_tags = gen_tags(self.exps, colorby)

        print("\tColumn labels:\t" + ",".join(self.group_tags))
        print("\tRow labels:\t" + ",".join(self.sort_tags))
        print("\tColor labels:\t" + ",".join(self.color_tags))

    def gen_cues(self):
        self.cuebed = OrderedDict()
        self.cuebam = OrderedDict()

        # if self.annotation:
        #     #all_tags = []
        #     #for dictt in self.exps.fieldsDict.values():
        #     #    for tag in dictt.keys():
        #     #        all_tags.append(tag)
        #     for bed in self.bednames:
        #     #    self.cuebed[bed] = set([bed]+all_tags)
        #         self.cuebed[bed] = set([bed])
        # else:
        for bed in self.bednames:
            self.cuebed[bed] = set(tag_from_r(self.exps, self.tag_type, bed))
            try:
                self.cuebed[bed].remove("None")
            except:
                pass
        for bam in self.readsnames:
            self.cuebam[bam] = set(tag_from_r(self.exps, self.tag_type, bam))

    def coverage(self, sortby, heatmap=False, logt=False, mp=0, log=False):
        def annot_ind(bednames, tags):
            """Find the index for annotation tag"""
            for ind, a in enumerate(bednames):
                if a in tags: return ind

        if mp > 0: ts = time.time()
        normRPM = False
        # Calculate for coverage
        mp_input = []
        data = OrderedDict()

        bi = 0
        for s in self.sort_tags:
            data[s] = OrderedDict()
            for g in self.group_tags:
                data[s][g] = OrderedDict()
                for c in self.color_tags:
                    # if self.df: data[s][g][c] = []
                    data[s][g][c] = OrderedDict()
                    if not self.dft:
                        dfs = [c]
                    else:
                        dfs = self.exps.fieldsDict[self.dft].keys()
                    for d in dfs:
                        data[s][g][c][d] = defaultdict(list)
                        for bed in self.cuebed.keys():
                            # print(self.cuebed[bed])
                            # print(set([s,g,c,d]))
                            # print(self.cuebed[bed].issubset(set([s,g,c,d])))
                            if len(self.cuebed[bed].intersection(
                                    set([s, g, c, d
                                         ]))) > 2 or self.cuebed[bed].issubset(
                                             set([s, g, c, d])):
                                # if self.cuebed[bed] <= set([s,g,c]):
                                for bam in self.cuebam.keys():

                                    # print(self.cuebam[bam])
                                    # print(set([s,g,c]))
                                    if self.cuebam[bam] <= set([s, g, c, d]):
                                        i = self.bednames.index(bed)
                                        j = self.readsnames.index(bam)
                                        # print(bed + "." + bam)

                                        # if len(self.processed_beds[i]) == 0:
                                        #     try:
                                        #         data[s][g][c][d].append(numpy.empty(1, dtype=object))
                                        #     except:
                                        #         data[s][g][c][d] = [numpy.empty(1, dtype=object)]
                                        #     continue
                                        #########################################################################
                                        if mp > 0:  # Multiple processing
                                            mp_input.append([
                                                self.processed_beds[i],
                                                self.reads[j], self.rs,
                                                self.bs, self.ss, self.center,
                                                heatmap, logt, s, g, c, d
                                            ])
                                            data[s][g][c][d] = None

                                        #########################################################################
                                        else:  # Single thread
                                            ts = time.time()
                                            cov = CoverageSet(
                                                bed + "." + bam,
                                                self.processed_beds[i])

                                            # print(len(self.processed_beds[i]))
                                            if "Conservation" in [s, g, c, d]:
                                                cov.phastCons46way_score(
                                                    stepsize=self.ss)

                                            elif ".bigwig" in self.reads[
                                                    j].lower(
                                                    ) or ".bw" in self.reads[
                                                        j].lower():
                                                cov.coverage_from_bigwig(
                                                    bigwig_file=self.reads[j],
                                                    stepsize=self.ss)
                                            else:
                                                if not self.sense:
                                                    cov.coverage_from_bam(
                                                        bam_file=self.reads[j],
                                                        extension_size=self.rs,
                                                        binsize=self.bs,
                                                        stepsize=self.ss)
                                                    if normRPM: cov.normRPM()
                                                else:  # Sense specific
                                                    cov.coverage_from_bam(
                                                        bam_file=self.reads[j],
                                                        extension_size=self.rs,
                                                        binsize=self.bs,
                                                        stepsize=self.ss,
                                                        get_sense_info=True,
                                                        paired_reads=True)
                                                    cov.array_transpose()
                                                    if normRPM: cov.normRPM()

                                            # When bothends, consider the fliping end
                                            if self.center == 'bothends' or self.center == 'upstream' or self.center == 'downstream':
                                                if "Conservation" in [
                                                        s, g, c, d
                                                ]:
                                                    flap = CoverageSet(
                                                        "for flap", self.
                                                        processed_bedsF[i])
                                                    flap.phastCons46way_score(
                                                        stepsize=self.ss)
                                                    ffcoverage = numpy.fliplr(
                                                        flap.coverage)
                                                    cov.coverage = numpy.concatenate(
                                                        (cov.coverage,
                                                         ffcoverage),
                                                        axis=0)
                                                elif ".bigwig" in self.reads[
                                                        j].lower(
                                                        ) or ".bw" in self.reads[
                                                            j].lower():
                                                    flap = CoverageSet(
                                                        "for flap", self.
                                                        processed_bedsF[i])
                                                    flap.coverage_from_bigwig(
                                                        bigwig_file=self.
                                                        reads[j],
                                                        stepsize=self.ss)
                                                    ffcoverage = numpy.fliplr(
                                                        flap.coverage)
                                                    cov.coverage = numpy.concatenate(
                                                        (cov.coverage,
                                                         ffcoverage),
                                                        axis=0)
                                                else:
                                                    flap = CoverageSet(
                                                        "for flap", self.
                                                        processed_bedsF[i])
                                                    if not self.sense:
                                                        flap.coverage_from_bam(
                                                            self.reads[j],
                                                            extension_size=self
                                                            .rs,
                                                            binsize=self.bs,
                                                            stepsize=self.ss)
                                                        if normRPM:
                                                            flap.normRPM()
                                                    else:  # Sense specific
                                                        flap.coverage_from_bam(
                                                            bam_file=self.
                                                            reads[j],
                                                            extension_size=self
                                                            .rs,
                                                            binsize=self.bs,
                                                            stepsize=self.ss,
                                                            get_sense_info=True,
                                                            paired_reads=True)
                                                        flap.array_transpose(
                                                            flip=True)
                                                        if normRPM:
                                                            flap.normRPM()
                                                    ffcoverage = numpy.fliplr(
                                                        flap.coverage)
                                                    try:
                                                        cov.coverage = numpy.concatenate(
                                                            (cov.coverage,
                                                             ffcoverage),
                                                            axis=0)
                                                    except:
                                                        pass

                                                    if self.sense:
                                                        cov.transpose_cov1 = numpy.concatenate(
                                                            (cov.
                                                             transpose_cov1,
                                                             flap.
                                                             transpose_cov1),
                                                            axis=0)
                                                        cov.transpose_cov2 = numpy.concatenate(
                                                            (cov.
                                                             transpose_cov2,
                                                             flap.
                                                             transpose_cov2),
                                                            axis=0)

                                            # Averaging the coverage of all regions of each bed file
                                            if heatmap:
                                                if logt:
                                                    data[s][g][c][
                                                        d] = numpy.log10(
                                                            numpy.vstack(
                                                                cov.coverage) +
                                                            1
                                                        )  # Store the array into data list
                                                else:
                                                    data[s][g][c][
                                                        d] = numpy.vstack(
                                                            cov.coverage
                                                        )  # Store the array into data list
                                            else:
                                                if len(cov.coverage) == 0:
                                                    data[s][g][c][d] = None
                                                    print(
                                                        "** Warning: Cannot open "
                                                        + self.reads[j])
                                                    continue
                                                else:
                                                    for i, car in enumerate(
                                                            cov.coverage):
                                                        if i == 0:
                                                            avearr = np.array(
                                                                car, ndmin=2)
                                                        else:
                                                            # avearr = numpy.vstack((avearr, np.array(car, ndmin=2)))
                                                            try:
                                                                avearr = numpy.vstack(
                                                                    (avearr,
                                                                     np.array(
                                                                         car,
                                                                         ndmin=2
                                                                     )))
                                                            except:
                                                                print(bed +
                                                                      "." +
                                                                      bam +
                                                                      "." +
                                                                      str(i))
                                                    if log:
                                                        avearr = numpy.log2(
                                                            avearr + 1)

                                                    avearr = numpy.average(
                                                        avearr, axis=0)
                                                    if self.sense:
                                                        if log:
                                                            sense_1 = numpy.average(
                                                                numpy.log2(
                                                                    cov.
                                                                    transpose_cov1
                                                                    + 1),
                                                                axis=0)
                                                            sense_2 = numpy.average(
                                                                numpy.log2(
                                                                    cov.
                                                                    transpose_cov2
                                                                    + 1),
                                                                axis=0)
                                                        else:
                                                            sense_1 = numpy.average(
                                                                cov.
                                                                transpose_cov1,
                                                                axis=0)
                                                            sense_2 = numpy.average(
                                                                cov.
                                                                transpose_cov2,
                                                                axis=0)
                                                    cut_end = int(self.bs /
                                                                  self.ss)
                                                    avearr = avearr[
                                                        cut_end:-cut_end]
                                                    data[s][g][c][d][
                                                        "all"].append(avearr)

                                                    if self.sense:
                                                        sense_1 = sense_1[
                                                            cut_end:-cut_end]
                                                        sense_2 = sense_2[
                                                            cut_end:-cut_end]
                                                        data[s][g][c][d][
                                                            "sense_1"].append(
                                                                sense_1)
                                                        data[s][g][c][d][
                                                            "sense_2"].append(
                                                                sense_2)

                                            bi += 1
                                            te = time.time()
                                            print2(
                                                self.parameter,
                                                "\t" + str(bi) + "\t" +
                                                "{0:30}\t--{1:<5.1f}s".format(
                                                    bed + "." + bam, ts - te))

        if mp > 0:
            pool = MyPool(mp)
            mp_output = pool.map(compute_coverage, mp_input)
            pool.close()
            pool.join()
            for s in data.keys():
                for g in data[s].keys():
                    for c in data[s][g].keys():
                        for d in data[s][g][c].keys():
                            for out in mp_output:
                                if out[0] == s and out[1] == g and out[
                                        2] == c and out[3] == d:
                                    if self.df:
                                        try:
                                            data[s][g][c][d][-1].append(out[4])
                                        except:
                                            data[s][g][c][d] = [[out[4]]]
                                    else:
                                        try:
                                            data[s][g][c][d].append(out[4])
                                        except:
                                            data[s][g][c][d] = [out[4]]
        if self.df:
            for s in data.keys():
                for g in data[s].keys():
                    for c in data[s][g].keys():
                        for d in data[s][g][c].keys():
                            if isinstance(
                                    data[s][g][c][d]["all"],
                                    list) and len(data[s][g][c][d]["all"]) > 1:
                                diff = numpy.subtract(
                                    data[s][g][c][d]["all"][0],
                                    data[s][g][c][d]["all"][1])
                                data[s][g][c][d]["df"].append(diff.tolist())
                            else:
                                print(
                                    "Warning: There is no repetitive reads for calculating difference.\n"
                                    "         Please add one more entry in experimental matrix."
                                )
        self.data = data

    def colormap(self, colorby, definedinEM):
        """Assign a colour to every colour tag.

        Calls the module-level ``colormap`` helper and stores the result
        as ``self.colors``, a dict mapping each entry of
        ``self.color_tags`` to the colour at the same position in the
        helper's return value.
        """
        palette = colormap(self.exps,
                           colorby,
                           definedinEM,
                           annotation=self.annotation)
        self.colors = {
            tag: palette[position]
            for position, tag in enumerate(self.color_tags)
        }

    def plot(self,
             groupby,
             colorby,
             output,
             printtable=False,
             scol=False,
             srow=False,
             w=2,
             h=2):

        rot = 50
        if len(self.data.values()[0].keys()) < 2:
            ticklabelsize = w * 1.5
        else:
            ticklabelsize = w * 3
        tw = len(self.data.values()[0].keys()) * w
        th = len(self.data.keys()) * (h * 0.8)

        f, axs = plt.subplots(len(self.data.keys()),
                              len(self.data.values()[0].keys()),
                              dpi=300,
                              figsize=(tw, th))

        yaxmax = [0] * len(self.data.values()[0])
        sx_ymax = [0] * len(self.data.keys())
        if self.df:
            yaxmin = [0] * len(self.data.values()[0])
            sx_ymin = [0] * len(self.data.keys())

        if printtable:
            bott = self.extend - int(0.5 * self.ss)
            pArr = [["Group_tag", "Sort_tag", "Color_tag"] +
                    [str(x) for x in range(-bott, bott, self.ss)]]  # Header
        nit = len(self.data.keys())
        for it, s in enumerate(self.data.keys()):

            for i, g in enumerate(self.data[s].keys()):
                try:
                    ax = axs[it, i]
                except:
                    if len(self.data.keys()) == 1 and len(
                            self.data[s].keys()) == 1:
                        ax = axs
                    elif len(self.data.keys()) == 1 and len(
                            self.data[s].keys()) > 1:
                        ax = axs[i]
                    else:
                        ax = axs[it]

                if it == 0:
                    if self.df:
                        ax.set_title(g + "_df", fontsize=ticklabelsize + 2)
                    else:
                        ax.set_title(g, fontsize=ticklabelsize + 2)

                # Processing for future output
                for j, c in enumerate(self.data[s][g].keys()):

                    for k, d in enumerate(self.data[s][g][c].keys()):
                        if not self.data[s][g][c][d]:
                            continue
                        else:
                            if not self.sense:
                                if self.df: pt = self.data[s][g][c][d]["df"]
                                else: pt = self.data[s][g][c][d]["all"]

                                for l, y in enumerate(pt):
                                    # print(y)
                                    yaxmax[i] = max(numpy.amax(y), yaxmax[i])
                                    sx_ymax[it] = max(numpy.amax(y),
                                                      sx_ymax[it])
                                    if self.df:
                                        yaxmin[i] = min(
                                            numpy.amin(y), yaxmin[i])
                                        sx_ymin[it] = min(
                                            numpy.amin(y), sx_ymin[it])

                                    x = numpy.linspace(-self.extend,
                                                       self.extend, len(y))
                                    ax.plot(x,
                                            y,
                                            color=self.colors[c],
                                            lw=1,
                                            label=c)
                                    if it < nit - 1:
                                        ax.set_xticklabels([])
                                    # Processing for future output
                                    if printtable:
                                        pArr.append([g, s, c, d] + list(y))
                            else:
                                plt.text(0.5,
                                         0.51,
                                         'sense',
                                         transform=ax.transAxes,
                                         fontsize=ticklabelsize,
                                         horizontalalignment='center',
                                         verticalalignment='bottom')
                                plt.text(0.5,
                                         0.49,
                                         'anti-sense',
                                         transform=ax.transAxes,
                                         fontsize=ticklabelsize,
                                         horizontalalignment='center',
                                         verticalalignment='top')
                                plt.plot((-self.extend, self.extend), (0, 0),
                                         '0.1',
                                         linewidth=0.2)
                                print(self.data[s][g][c][d])
                                for l, y in enumerate(
                                        self.data[s][g][c][d]["sense_1"]):
                                    # print(y)
                                    ymax1 = numpy.amax(y)
                                    yaxmax[i] = max(ymax1, yaxmax[i])
                                    sx_ymax[it] = max(ymax1, sx_ymax[it])
                                    x = numpy.linspace(-self.extend,
                                                       self.extend, y.shape[0])
                                    ax.plot(x,
                                            y,
                                            color=self.colors[c],
                                            lw=1,
                                            label=c)
                                    if it < nit - 1: ax.set_xticklabels([])
                                    # Processing for future output
                                    if printtable:
                                        pArr.append([g, s, c, d, "+"] +
                                                    list(y))

                                for l, y in enumerate(
                                        self.data[s][g][c][d]["sense_2"]):
                                    # print(y)
                                    ymax2 = numpy.amax(y)
                                    yaxmax[i] = max(ymax2, yaxmax[i])
                                    sx_ymax[it] = max(ymax2, sx_ymax[it])
                                    x = numpy.linspace(-self.extend,
                                                       self.extend, y.shape[0])
                                    ax.plot(x,
                                            -y,
                                            color=self.colors[c],
                                            lw=1,
                                            label=c)
                                    if it < nit - 1: ax.set_xticklabels([])
                                    # Processing for future output
                                    if printtable:
                                        pArr.append([g, s, c, d, "-"] +
                                                    list(y))
                                ym = 1.2 * max(max(yaxmax), max(sx_ymax))
                                ax.set_ylim([-ym, ym])

                ax.get_yaxis().set_label_coords(-0.1, 0.5)
                ax.set_xlim([-self.extend, self.extend])
                plt.setp(ax.get_xticklabels(),
                         fontsize=ticklabelsize,
                         rotation=rot)
                plt.setp(ax.get_yticklabels(), fontsize=ticklabelsize)

                ax.locator_params(axis='x', nbins=4)
                ax.locator_params(axis='y', nbins=2)
                # try:
                #
                # except:
                #     ax.locator_params(axis='y', nbins=2)
                #     pass
        if printtable:
            output_array(pArr,
                         directory=output,
                         folder=self.title,
                         filename="plot_table.txt")

        for it, ty in enumerate(self.data.keys()):
            try:
                axs[it, 0].set_ylabel("{}".format(ty),
                                      fontsize=ticklabelsize + 1)
            except:
                try:
                    axs[it].set_ylabel("{}".format(ty),
                                       fontsize=ticklabelsize + 1)
                except:
                    axs.set_ylabel("{}".format(ty), fontsize=ticklabelsize + 1)

            for i, g in enumerate(self.data[ty].keys()):
                try:
                    axx = axs[it, i]
                except:
                    try:
                        if len(self.data.keys()) == 1:
                            axx = axs[i]
                        else:
                            axx = axs[it]
                    except:
                        axx = axs

                if self.df:
                    if scol:
                        ymin = yaxmin[i] - abs(yaxmin[i] * 0.2)
                        ymax = yaxmax[i] + abs(yaxmax[i] * 0.2)
                    elif srow:
                        ymin = sx_ymin[it] - abs(sx_ymin[it] * 0.2)
                        ymax = sx_ymax[it] + abs(sx_ymax[it] * 0.2)

                else:
                    if scol: ymax = yaxmax[i] * 1.2
                    elif srow: ymax = sx_ymax[it] * 1.2
                    else:
                        ymax = axx.get_ylim()[1]
                    if self.sense: ymin = -ymax
                    else: ymin = 0

                try:
                    axx.set_ylim([ymin, ymax])
                except:
                    pass

        handles, labels = ax.get_legend_handles_labels()
        uniq_labels = unique(labels)

        plt.legend([handles[labels.index(l)] for l in uniq_labels],
                   uniq_labels,
                   loc='center left',
                   handlelength=1,
                   handletextpad=1,
                   columnspacing=2,
                   borderaxespad=0.,
                   prop={'size': ticklabelsize},
                   bbox_to_anchor=(1.05, 0.5))

        f.tight_layout()
        self.fig = f

    def gen_html(self, directory, title, align=50):
        """Write the two HTML report pages that accompany the line plot.

        ``index.html`` embeds ``lineplot.png``; ``parameters.html`` shows a
        one-column table with the run settings stored on the instance plus
        links to ``parameters.txt`` and ``experimental_matrix.txt``.  Both
        files are written to ``os.path.join(directory, title)``.

        Parameters:
            directory = output directory; its base name starts the page header
            title = sub-folder of ``directory`` and second part of the header
            align = forwarded unchanged to ``Html.add_zebra_table``
        """
        page_header = os.path.basename(directory) + " / " + title
        nav_links = OrderedDict()
        nav_links["Lineplot"] = "index.html"
        nav_links["Parameters"] = "parameters.html"

        # Both pages are created with exactly the same Html settings.
        html_options = dict(name=page_header,
                            links_dict=nav_links,
                            fig_rpath="../style",
                            RGT_header=False,
                            other_logo="viz",
                            homepage="../index.html")

        # Page 1: the figure itself.
        plot_page = Html(**html_options)
        plot_page.add_figure("lineplot.png", align="center", width="80%")
        plot_page.write(os.path.join(directory, title, "index.html"))

        # Page 2: the parameters used for this run.
        param_page = Html(**html_options)
        column_types = 'ssssssssss'
        column_widths = [20] * 9
        table_header = ["Assumptions and hypothesis"]

        rows = []
        if self.annotation:
            rows.append([
                "Genomic annotation: TSS - Transcription Start Site; TTS - Transcription Termination Site."
            ])
        rows.append(["Directory:      " + directory.rpartition("/")[2]])
        rows.append(["Title:          " + title])
        numeric_settings = (("Extend length:  ", self.extend),
                            ("Read size:      ", self.rs),
                            ("Bin size:       ", self.bs),
                            ("Step size:      ", self.ss))
        for label, value in numeric_settings:
            rows.append([label + str(value)])
        rows.append(["Center mode:    " + self.center])

        param_page.add_zebra_table(table_header,
                                   column_widths,
                                   column_types,
                                   rows,
                                   align=align,
                                   cell_align="left")

        for anchor in (
                '<a href="parameters.txt" style="margin-left:100">See parameters</a>',
                '<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'):
            param_page.add_free_content([anchor])

        param_page.write(os.path.join(directory, title, "parameters.html"))

    def hmsort(self, sort):
        """Reorder the rows of every heatmap matrix in ``self.data`` by signal.

        ``self.data`` is read as ``data[t][g][c] -> 2-D array``.  Within each
        ``data[t][g]`` all matrices are reordered with the same permutation,
        so a given row index still refers to the same region in every column.
        Rows end up in descending order of the chosen row sum (largest first).
        Rows with equal sums get distinct ordinal ranks from
        ``stats.rankdata``.  The reordered matrices are newly allocated float
        arrays that replace the originals in ``self.data``.

        Parameters:
            sort = None: leave the data untouched
                   0:    rank rows by their sum over all matrices of the group
                   k:    (any other integer) rank rows by their sum in the
                         k-th matrix of the group (1-based)
        """
        if sort is None:
            return

        for t in self.data:
            for g in self.data[t]:
                matrices = self.data[t][g]
                if sort == 0:
                    row_signal = numpy.sum(
                        [numpy.sum(m, axis=1) for m in matrices.values()],
                        axis=0)
                else:
                    # list() is needed on Python 3, where dict views cannot
                    # be indexed; it is harmless on Python 2.
                    row_signal = numpy.sum(list(matrices.values())[sort - 1],
                                           axis=1)

                # Ordinal ranks are 1..n with no duplicates, so -rank is a
                # valid, unique destination row (rank n -> row 0, rank 1 ->
                # last row).  The int cast keeps an empty result usable as an
                # index array.
                ranks = numpy.asarray(
                    stats.rankdata(row_signal, method='ordinal'), dtype=int)

                for c in list(matrices):
                    reordered = numpy.empty(shape=matrices[c].shape)
                    reordered[-ranks, :] = matrices[c]
                    matrices[c] = reordered

    def hmcmlist(self, colorby, definedinEM):
        """Store the list of colormap names for the heatmaps in ``self.colors``.

        ``colorby`` and ``definedinEM`` are accepted but not used here; the
        same fixed palette is always assigned.
        """
        palette = ("Reds", "Blues", "Oranges", "Greens", "Purples")
        self.colors = list(palette)

    def heatmap(self, logt):
        """Draw one heatmap figure per top-level key of ``self.data``.

        ``self.data`` is read as ``data[t][g][c] -> 2-D array``.  For every
        ``t`` a figure is created with one row of panels per ``g`` and one
        column per ``c``; each panel shows the array with ``imshow`` using the
        "Blues" colormap, scaled from 0 to the integer part of the array's
        maximum.  A horizontal colorbar is placed under each column of the
        last row.

        The figures are collected in ``self.figs`` and the matching file name
        stems (``"heatmap_" + t``) in ``self.hmfiles``.

        Parameters:
            logt = when true, the colorbar tick labels are re-rendered with
                   one decimal and the bar is labelled 'log10'.  The data
                   themselves are not transformed in this method.
        """
        tickfontsize = 6
        ratio = 10  # grid rows given to each heatmap panel
        self.hmfiles = []
        self.figs = []
        for t in self.data.keys():
            # list() keeps this working on Python 3, where dict views cannot
            # be indexed; the column count is taken from the first group.
            columns = len(list(self.data[t].values())[0].keys())
            fig = plt.figure(t)
            plt.suptitle("Heatmap: " + t, y=1.05)
            rows = len(self.data[t].keys())

            axs = numpy.empty(shape=(rows + 1, columns), dtype=object)

            for bi, g in enumerate(self.data[t].keys()):
                for bj, c in enumerate(self.data[t][g].keys()):
                    # The color scale tops out at the truncated maximum.
                    max_value = int(numpy.amax(self.data[t][g][c]))
                    axs[bi, bj] = plt.subplot2grid(shape=(rows * ratio + 1,
                                                          columns),
                                                   loc=(bi * ratio, bj),
                                                   rowspan=ratio)
                    if bi == 0: axs[bi, bj].set_title(c, fontsize=7)

                    im = axs[bi, bj].imshow(
                        self.data[t][g][c],
                        extent=[-self.extend, self.extend, 0, 1],
                        aspect='auto',
                        vmin=0,
                        vmax=max_value,
                        interpolation='nearest',
                        cmap=plt.get_cmap("Blues"))

                    axs[bi, bj].set_xlim([-self.extend, self.extend])
                    axs[bi, bj].set_xticks([-self.extend, 0, self.extend])
                    plt.setp(axs[bi, bj].get_xticklabels(),
                             fontsize=tickfontsize,
                             rotation=0)
                    for spine in ['top', 'right', 'left', 'bottom']:
                        axs[bi, bj].spines[spine].set_visible(False)
                    # Real booleans: the 'on'/'off' strings are no longer
                    # interpreted by current matplotlib ('off' is a truthy
                    # string), while booleans work on old and new versions.
                    axs[bi, bj].tick_params(axis='x',
                                            which='both',
                                            bottom=False,
                                            top=False,
                                            labelbottom=True)
                    axs[bi, bj].tick_params(axis='y',
                                            which='both',
                                            left=False,
                                            right=False,
                                            labelleft=False)

                    axs[bi, bj].minorticks_off()
                    if bj == 0:
                        axs[bi, bj].set_ylabel(g, fontsize=7)
                    if bi == rows - 1:
                        # Colorbar cell below the last panel of this column.
                        cbar_ax = plt.subplot2grid((rows * ratio + 4, columns),
                                                   (rows * ratio + 3, bj))
                        cbar = plt.colorbar(im,
                                            cax=cbar_ax,
                                            ticks=[0, max_value],
                                            orientation='horizontal')
                        cbar.ax.set_xticklabels([0, int(max_value)])
                        if logt:
                            cbar.ax.set_xticklabels(
                                ['0', '{:1.1f}'.format(max_value)],
                                fontsize=tickfontsize)  # horizontal colorbar
                            cbar.set_label('log10', fontsize=tickfontsize)

            self.figs.append(fig)
            self.hmfiles.append("heatmap" + "_" + t)

    def gen_htmlhm(self, outputname, title, align=50):
        """Write the HTML report pages for the heatmaps.

        ``index.html`` embeds one ``<name>.png`` per entry of
        ``self.hmfiles``; ``parameters.html`` only links to
        ``parameters.txt`` and ``experimental_matrix.txt``.  Both files are
        written to ``os.path.join(outputname, title)``.

        Parameters:
            outputname = output directory
            title = sub-folder of ``outputname``; also used as page header
            align = accepted but not used by this method
        """
        nav_links = OrderedDict([("Lineplot", "index.html"),
                                 ("Parameters", "parameters.html")])

        def new_page():
            # Both pages share the same header, links and styling.
            return Html(name=title,
                        links_dict=nav_links,
                        fig_rpath="../style",
                        RGT_header=False,
                        other_logo="viz",
                        homepage="../index.html")

        # Page 1: every heatmap image, in the order they were generated.
        figure_page = new_page()
        for stem in self.hmfiles:
            figure_page.add_figure(stem + ".png", align="center")
        figure_page.write(os.path.join(outputname, title, "index.html"))

        # Page 2: links to the plain-text parameter files.
        links_page = new_page()
        for anchor in (
                '<a href="parameters.txt" style="margin-left:100">See parameters</a>',
                '<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'):
            links_page.add_free_content([anchor])
        links_page.write(os.path.join(outputname, title, "parameters.html"))
コード例 #12
0
if __name__ == '__main__':
    # Command-line entry point.
    # Usage: <script> [--mode N] <experimental matrix> <annotation directory>
    parser = HelpfulOptionParser(usage=__doc__)
    parser.add_option("--mode", "-m", dest="mode", default=1, help="choose mode", type="int")
    (options, args) = parser.parse_args()

    i = 2
    if len(args) != i:
        parser.error("Exactly %s parameters are needed" %i)

    path_exp_matrix = args[0]
    path_annotation = args[1]

    # These names are module-level here (this is not a function body), so
    # they are kept even though nothing below reads them.
    # NOTE(review): presumably the mode_* functions use them as globals - confirm.
    genome_file = os.path.join(path_annotation, "chrom.sizes")
    gene_file = os.path.join(path_annotation, "association_file.bed")

    exp_matrix = ExperimentalMatrix()
    exp_matrix.read(path_exp_matrix, is_bedgraph=True)

    # Compare by value: "is" tests object identity and only happened to work
    # for small integers because CPython caches them.
    if options.mode == 1:
        mode_1(exp_matrix)
    elif options.mode == 2:
        mode_2(exp_matrix)
    elif options.mode == 3:
        mode_3(exp_matrix)

コード例 #13
0
        parser.error("Exactly three parameters are needed: experimental matrix, gene expression, annotation path and prefix for output")
    
    #map arguments
    experimental_matrix_file = args[0]
    gene_exp = args[1]
    annotation_path = args[2]
    outputdir = args[3]
    
    
#     experimental_matrix_file = "/home/manuel/workspace/cluster_p/THOR/exp/exp23_macs2_payton/1"
#     gene_exp = "/home/manuel/workspace/cluster_p/allhoff/project_THOR/data/payton/gene_expression/CCmean.data"
#     annotation_path = "/home/manuel/workspace/cluster_h/rgtdata/hg19/"
#     outputdir = "/home/manuel/test/"
    
    exps = ExperimentalMatrix()
    exps.read(experimental_matrix_file)
    regionsets = exps.get_regionsets()
    
    genome_file = annotation_path + "/chrom.sizes"
    gene_file = annotation_path + "/association_file.bed"
    
    genes = GeneSet("Expression")
    genes.read_expression(gene_exp)
    
    for region in regionsets:
        bedNew = GenomicRegionSet("")
        [degenes, de_peak_genes, mappedGenes, totalPeaks, regionsToGenes] \
        = bedNew.filter_by_gene_association_old(region.fileName, genes.genes, gene_file, genome_file, threshDist=options.dist)
        
        [ct, labels] = averageExpression(region, genes, regionsToGenes)
        fileName = path.splitext(path.basename(region.fileName))[0]
コード例 #14
0
ファイル: lineplot.py プロジェクト: eggduzao/reg-gen
class Lineplot:
    def __init__(self, EMpath, title, annotation, organism, center, extend, rs, bs, ss, df, dft, fields, test, sense):
        """Load the experimental matrix and store the plot settings.

        The matrix at ``EMpath`` is read into ``self.exps``.  For every field
        of the matrix that is not one of the standard columns,
        ``match_ms_tags`` and ``remove_name`` are called on it.  Region sets,
        reads files and their names are then cached on the instance.

        ``center``, ``extend``, ``rs``, ``bs``, ``ss``, ``df``, ``dft`` and
        ``sense`` are stored under the same names.  ``annotation``,
        ``organism`` and ``fields`` are accepted but not used here;
        ``self.annotation`` is always set to None.
        """
        self.title = title

        # Read the Experimental Matrix
        self.exps = ExperimentalMatrix()
        self.exps.read(EMpath, test=test)
        standard_fields = ['name', 'type', 'file', "reads", "regions", "factors"]
        for field in self.exps.fields:
            if field in standard_fields:
                continue
            self.exps.match_ms_tags(field, test=test)
            self.exps.remove_name()

        # Region sets (a list of GenomicRegionSets) and their names
        self.beds = self.exps.get_regionsets()
        self.bednames = self.exps.get_regionsnames()
        self.annotation = None

        # Reads files and their names
        self.reads = self.exps.get_readsfiles()
        self.readsnames = self.exps.get_readsnames()
        self.fieldsDict = self.exps.fieldsDict
        self.parameter = []

        # Plot settings, stored as given
        self.center, self.extend = center, extend
        self.rs, self.bs, self.ss = rs, bs, ss
        self.df, self.dft = df, dft
        self.sense = sense

    def relocate_bed(self):
        """Build the relocated region sets used for the coverage windows.

        Fills ``self.processed_beds`` (one entry per bed in ``self.beds``)
        and, for the 'bothends', 'upstream' and 'downstream' center modes,
        ``self.processed_bedsF`` with the counterpart sets that are flipped
        later on.  For every other center mode ``self.processed_bedsF`` stays
        empty.
        """
        primary = []
        flipped = []  # Processed beds to be flapped
        self.processed_beds = primary
        self.processed_bedsF = flipped

        for bed in self.beds:
            mode = self.center
            if mode == 'bothends':
                # One window around each end of the region.
                flank = self.extend + self.bs
                primary.append(bed.relocate_regions(center='leftend',
                                                    left_length=flank,
                                                    right_length=flank))
                flank = self.extend + self.bs
                flipped.append(bed.relocate_regions(center='rightend',
                                                    left_length=flank,
                                                    right_length=flank))
            elif mode in ('upstream', 'downstream'):
                # Relocate once, then split the result by strand.
                flank = self.extend + self.bs
                relocated = bed.relocate_regions(center=mode,
                                                 left_length=flank,
                                                 right_length=flank)
                primary.append(relocated.filter_strand(strand="+"))
                flipped.append(relocated.filter_strand(strand="-"))
            else:
                # Half a bin plus two steps of margin on each side.
                flank = self.extend + int(0.5 * self.bs) + 2 * self.ss
                primary.append(bed.relocate_regions(center=mode,
                                                    left_length=flank,
                                                    right_length=flank))

    def group_tags(self, groupby, sortby, colorby):
        """Generate the tags for the grouping of plot

        Sets ``self.tag_type`` plus the column (``self.group_tags``), row
        (``self.sort_tags``) and color (``self.color_tags``) label lists,
        then prints the three lists.  Note that ``self.group_tags`` is an
        instance attribute with the same name as this method.

        Parameters:
            groupby = 'reads','regions','cell',or 'factor'
            colorby = 'reads','regions','cell',or 'factor'
            sortby = 'reads','regions','cell',or 'factor'
        Each may also be the string "None", which yields a single empty tag.
        """
        tag_type = [sortby, groupby, colorby, self.dft]
        try:
            # Only the first "None" entry is dropped.
            tag_type.remove("None")
        except ValueError:
            pass
        self.tag_type = tag_type

        def labels_for(field):
            # "None" -> one unnamed tag; annotated regions -> the bed names;
            # anything else is looked up in the experimental matrix.
            if field == "None":
                return [""]
            if field == "regions" and self.annotation:
                return self.bednames
            return gen_tags(self.exps, field)

        self.group_tags = labels_for(groupby)
        self.sort_tags = labels_for(sortby)
        self.color_tags = labels_for(colorby)

        for caption, labels in (("\tColumn labels:\t", self.group_tags),
                                ("\tRow labels:\t", self.sort_tags),
                                ("\tColor labels:\t", self.color_tags)):
            print(caption + ",".join(labels))

    def gen_cues(self):
        """Build the tag sets ("cues") for every region set and reads file.

        ``self.cuebed`` maps each name in ``self.bednames`` to the set of
        tags returned by ``tag_from_r`` for it, with the tag "None" removed
        if present.  ``self.cuebam`` does the same for ``self.readsnames``
        but keeps every tag.
        """
        bed_cues = OrderedDict()
        bam_cues = OrderedDict()
        self.cuebed = bed_cues
        self.cuebam = bam_cues

        for bed_name in self.bednames:
            tags = set(tag_from_r(self.exps, self.tag_type, bed_name))
            bed_cues[bed_name] = tags
            # discard() is a no-op when "None" is not among the tags
            tags.discard("None")

        for reads_name in self.readsnames:
            bam_cues[reads_name] = set(tag_from_r(self.exps, self.tag_type, reads_name))

    def coverage(self, sortby, heatmap=False, logt=False, mp=0, log=False):

        def annot_ind(bednames, tags):
            """Find the index for annotation tag"""
            for ind, a in enumerate(bednames):
                if a in tags: return ind

        if mp>0: ts = time.time()
        normRPM = False
        # Calculate for coverage
        mp_input = []
        data = OrderedDict()

        bi = 0
        for s in self.sort_tags:
            data[s] = OrderedDict()
            for g in self.group_tags:
                data[s][g] = OrderedDict()
                for c in self.color_tags:
                    # if self.df: data[s][g][c] = []
                    data[s][g][c] = OrderedDict()
                    if not self.dft:
                        dfs = [c]
                    else:
                        dfs = self.exps.fieldsDict[self.dft].keys()
                    for d in dfs:
                        data[s][g][c][d] = defaultdict(list)
                        for bed in self.cuebed.keys():
                            # print(self.cuebed[bed])
                            # print(set([s,g,c,d]))
                            # print(self.cuebed[bed].issubset(set([s,g,c,d])))
                            if len(self.cuebed[bed].intersection(set([s, g, c, d]))) > 2 or self.cuebed[bed].issubset(
                                    set([s, g, c, d])):
                                # if self.cuebed[bed] <= set([s,g,c]):
                                for bam in self.cuebam.keys():

                                    # print(self.cuebam[bam])
                                    # print(set([s,g,c]))
                                    if self.cuebam[bam] <= set([s, g, c, d]):
                                        i = self.bednames.index(bed)
                                        j = self.readsnames.index(bam)
                                        # print(bed + "." + bam)

                                        # if len(self.processed_beds[i]) == 0:
                                        #     try:
                                        #         data[s][g][c][d].append(numpy.empty(1, dtype=object))
                                        #     except:
                                        #         data[s][g][c][d] = [numpy.empty(1, dtype=object)]
                                        #     continue
                                        #########################################################################
                                        if mp > 0:  # Multiple processing
                                            mp_input.append([self.processed_beds[i], self.reads[j],
                                                             self.rs, self.bs, self.ss, self.center, heatmap, logt,
                                                             s, g, c, d])
                                            data[s][g][c][d] = None

                                        #########################################################################
                                        else:  # Single thread
                                            ts = time.time()
                                            cov = CoverageSet(bed + "." + bam, self.processed_beds[i])

                                            # print(len(self.processed_beds[i]))
                                            if "Conservation" in [s,g,c,d]:
                                                cov.phastCons46way_score(stepsize=self.ss)

                                            elif ".bigwig" in self.reads[j].lower() or ".bw" in self.reads[j].lower():
                                                cov.coverage_from_bigwig(bigwig_file=self.reads[j], stepsize=self.ss)
                                            else:
                                                if not self.sense:
                                                    cov.coverage_from_bam(bam_file=self.reads[j],
                                                                          extension_size=self.rs, binsize=self.bs,
                                                                          stepsize=self.ss)
                                                    if normRPM: cov.normRPM()
                                                else:  # Sense specific
                                                    cov.coverage_from_bam(bam_file=self.reads[j],
                                                                          extension_size=self.rs, binsize=self.bs,
                                                                          stepsize=self.ss, get_sense_info=True,
                                                                          paired_reads=True)
                                                    cov.array_transpose()
                                                    if normRPM: cov.normRPM()

                                            # When bothends, consider the fliping end
                                            if self.center == 'bothends' or self.center == 'upstream' or self.center == 'downstream':
                                                if "Conservation" in [s,g,c,d]:
                                                    flap = CoverageSet("for flap", self.processed_bedsF[i])
                                                    flap.phastCons46way_score(stepsize=self.ss)
                                                    ffcoverage = numpy.fliplr(flap.coverage)
                                                    cov.coverage = numpy.concatenate((cov.coverage, ffcoverage), axis=0)
                                                elif ".bigwig" in self.reads[j].lower() or ".bw" in self.reads[j].lower():
                                                    flap = CoverageSet("for flap", self.processed_bedsF[i])
                                                    flap.coverage_from_bigwig(bigwig_file=self.reads[j],
                                                                              stepsize=self.ss)
                                                    ffcoverage = numpy.fliplr(flap.coverage)
                                                    cov.coverage = numpy.concatenate((cov.coverage, ffcoverage), axis=0)
                                                else:
                                                    flap = CoverageSet("for flap", self.processed_bedsF[i])
                                                    if not self.sense:
                                                        flap.coverage_from_bam(self.reads[j], extension_size=self.rs,
                                                                               binsize=self.bs, stepsize=self.ss)
                                                        if normRPM: flap.normRPM()
                                                    else:  # Sense specific
                                                        flap.coverage_from_bam(bam_file=self.reads[j],
                                                                               extension_size=self.rs, binsize=self.bs,
                                                                               stepsize=self.ss, get_sense_info=True,
                                                                               paired_reads=True)
                                                        flap.array_transpose(flip=True)
                                                        if normRPM: flap.normRPM()
                                                    ffcoverage = numpy.fliplr(flap.coverage)
                                                    try: cov.coverage = numpy.concatenate((cov.coverage, ffcoverage), axis=0)
                                                    except: pass

                                                    if self.sense:
                                                        cov.transpose_cov1 = numpy.concatenate((cov.transpose_cov1,
                                                                                                flap.transpose_cov1),axis=0)
                                                        cov.transpose_cov2 = numpy.concatenate((cov.transpose_cov2,
                                                                                                flap.transpose_cov2), axis=0)

                                            # Averaging the coverage of all regions of each bed file
                                            if heatmap:
                                                if logt:
                                                    data[s][g][c][d] = numpy.log10(numpy.vstack(
                                                        cov.coverage) + 1)  # Store the array into data list
                                                else:
                                                    data[s][g][c][d] = numpy.vstack(
                                                        cov.coverage)  # Store the array into data list
                                            else:
                                                if len(cov.coverage) == 0:
                                                    data[s][g][c][d] = None
                                                    print("** Warning: Cannot open " + self.reads[j])
                                                    continue
                                                else:
                                                    for i, car in enumerate(cov.coverage):
                                                        if i == 0: avearr = np.array(car, ndmin=2)
                                                        else:
                                                            # avearr = numpy.vstack((avearr, np.array(car, ndmin=2)))
                                                            try: avearr = numpy.vstack((avearr, np.array(car, ndmin=2)))
                                                            except: print(bed+"."+bam+"."+str(i))
                                                    if log:
                                                        avearr = numpy.log2(avearr+1)

                                                    avearr = numpy.average(avearr, axis=0)
                                                    if self.sense:
                                                        if log:
                                                            sense_1 = numpy.average(numpy.log2(cov.transpose_cov1+1), axis=0)
                                                            sense_2 = numpy.average(numpy.log2(cov.transpose_cov2+1), axis=0)
                                                        else:
                                                            sense_1 = numpy.average(cov.transpose_cov1,axis=0)
                                                            sense_2 = numpy.average(cov.transpose_cov2,axis=0)
                                                    cut_end = int(self.bs/self.ss)
                                                    avearr = avearr[cut_end:-cut_end]
                                                    data[s][g][c][d]["all"].append(avearr)

                                                    if self.sense:
                                                        sense_1 = sense_1[cut_end:-cut_end]
                                                        sense_2 = sense_2[cut_end:-cut_end]
                                                        data[s][g][c][d]["sense_1"].append(sense_1)
                                                        data[s][g][c][d]["sense_2"].append(sense_2)

                                            bi += 1
                                            te = time.time()
                                            print2(self.parameter,
                                                   "\t" + str(bi) + "\t" + "{0:30}\t--{1:<5.1f}s".format(
                                                       bed + "." + bam, ts - te))


        if mp > 0:
            pool = MyPool(mp)
            mp_output = pool.map(compute_coverage, mp_input)
            pool.close()
            pool.join()
            for s in data.keys():
                for g in data[s].keys():
                    for c in data[s][g].keys():
                        for d in data[s][g][c].keys():
                            for out in mp_output:
                                if out[0] == s and out[1] == g and out[2] == c and out[3] == d:
                                    if self.df:
                                        try:
                                            data[s][g][c][d][-1].append(out[4])
                                        except:
                                            data[s][g][c][d] = [[out[4]]]
                                    else:
                                        try:
                                            data[s][g][c][d].append(out[4])
                                        except:
                                            data[s][g][c][d] = [out[4]]
        if self.df:
            for s in data.keys():
                for g in data[s].keys():
                    for c in data[s][g].keys():
                        for d in data[s][g][c].keys():
                            if isinstance(data[s][g][c][d]["all"], list) and len(data[s][g][c][d]["all"]) > 1:
                                diff = numpy.subtract(data[s][g][c][d]["all"][0], data[s][g][c][d]["all"][1])
                                data[s][g][c][d]["df"].append(diff.tolist())
                            else:
                                print("Warning: There is no repetitive reads for calculating difference.\n"
                                      "         Please add one more entry in experimental matrix.")
        self.data = data

    def colormap(self, colorby, definedinEM):
        """Assign one plotting color to every entry of ``self.color_tags``.

        The module-level ``colormap`` helper is called with ``self.exps``,
        ``colorby``, ``definedinEM`` and ``self.annotation``; its n-th result
        is mapped to the n-th color tag and the mapping is stored in
        ``self.colors``.
        """
        palette = colormap(self.exps, colorby, definedinEM, annotation=self.annotation)
        tag_to_color = {}
        self.colors = tag_to_color
        for position, tag in enumerate(self.color_tags):
            tag_to_color[tag] = palette[position]

    def plot(self, groupby, colorby, output, printtable=False, scol=False, srow=False, w=2, h=2):
        """Draw all line plots on one grid of subplots and keep the figure in ``self.fig``.

        The grid has one row per first-level key of ``self.data`` and one
        column per second-level key (the column count is taken from the first
        row). For every remaining key pair the stored profiles are drawn on
        the matching subplot: the "all" profiles by default, the "df"
        profiles when ``self.df`` is set, or "sense_1" (upwards) and
        "sense_2" (mirrored downwards) when ``self.sense`` is set.

        Args:
            groupby: Not used by this method.
            colorby: Not used by this method.
            output: Directory passed to ``output_array`` when ``printtable``
                is true.
            printtable: When true, every plotted profile is also collected
                and written to "plot_table.txt" through ``output_array``.
            scol: Use a common y-range for all subplots of a column.
            srow: Use a common y-range for all subplots of a row; only
                consulted when ``scol`` is false.
            w: Width contributed by each column to the figure size; also
                scales the tick label font size.
            h: Height factor of each row (multiplied by 0.8).
        """
        rot = 50
        row_keys = list(self.data.keys())
        # list() keeps the lookup valid on Python 3, where dict views cannot be indexed.
        first_row = list(self.data.values())[0]
        nrows = len(row_keys)
        ncols = len(first_row.keys())

        if ncols < 2:
            ticklabelsize = w * 1.5
        else:
            ticklabelsize = w * 3
        tw = ncols * w
        th = nrows * (h * 0.8)

        # squeeze=False always returns a 2-D array of axes, so axs[row, col]
        # is valid for 1x1, 1xN and Nx1 grids as well.
        f, axs = plt.subplots(nrows, ncols, dpi=300, figsize=(tw, th), squeeze=False)

        # Running maxima (and minima for difference plots) per column / per row.
        yaxmax = [0] * len(first_row)
        sx_ymax = [0] * nrows
        if self.df:
            yaxmin = [0] * len(first_row)
            sx_ymin = [0] * nrows

        if printtable:
            bott = self.extend - int(0.5 * self.ss)
            # NOTE(review): the header names three tag columns while the rows
            # below carry four (g, s, c, d) plus a strand in sense mode -- confirm.
            pArr = [["Group_tag", "Sort_tag", "Color_tag"] + [str(x) for x in range(-bott, bott, self.ss)]]  # Header
        nit = nrows
        for it, s in enumerate(row_keys):

            for i, g in enumerate(self.data[s].keys()):
                ax = axs[it, i]

                if it == 0:
                    if self.df:
                        ax.set_title(g + "_df", fontsize=ticklabelsize + 2)
                    else:
                        ax.set_title(g, fontsize=ticklabelsize + 2)

                for c in self.data[s][g].keys():

                    for d in self.data[s][g][c].keys():
                        entry = self.data[s][g][c][d]
                        if not entry:
                            continue

                        if not self.sense:
                            if self.df:
                                pt = entry["df"]
                            else:
                                pt = entry["all"]

                            for y in pt:
                                yaxmax[i] = max(numpy.amax(y), yaxmax[i])
                                sx_ymax[it] = max(numpy.amax(y), sx_ymax[it])
                                if self.df:
                                    yaxmin[i] = min(numpy.amin(y), yaxmin[i])
                                    sx_ymin[it] = min(numpy.amin(y), sx_ymin[it])

                                x = numpy.linspace(-self.extend, self.extend, len(y))
                                ax.plot(x, y, color=self.colors[c], lw=1, label=c)
                                # Only the bottom row keeps its x tick labels.
                                if it < nit - 1:
                                    ax.set_xticklabels([])
                                # Collected for the optional table output
                                if printtable:
                                    pArr.append([g, s, c, d] + list(y))
                        else:
                            # NOTE(review): plt.text / plt.plot act on pyplot's current
                            # axes rather than on ``ax`` -- confirm this is intended.
                            plt.text(0.5, 0.51, 'sense', transform=ax.transAxes, fontsize=ticklabelsize,
                                     horizontalalignment='center', verticalalignment='bottom')
                            plt.text(0.5, 0.49, 'anti-sense', transform=ax.transAxes, fontsize=ticklabelsize,
                                     horizontalalignment='center', verticalalignment='top')
                            plt.plot((-self.extend, self.extend), (0, 0), '0.1', linewidth=0.2)

                            for y in entry["sense_1"]:
                                ymax1 = numpy.amax(y)
                                yaxmax[i] = max(ymax1, yaxmax[i])
                                sx_ymax[it] = max(ymax1, sx_ymax[it])
                                x = numpy.linspace(-self.extend, self.extend, y.shape[0])
                                ax.plot(x, y, color=self.colors[c], lw=1, label=c)
                                if it < nit - 1:
                                    ax.set_xticklabels([])
                                # Collected for the optional table output
                                if printtable:
                                    pArr.append([g, s, c, d, "+"] + list(y))

                            for y in entry["sense_2"]:
                                ymax2 = numpy.amax(y)
                                yaxmax[i] = max(ymax2, yaxmax[i])
                                sx_ymax[it] = max(ymax2, sx_ymax[it])
                                x = numpy.linspace(-self.extend, self.extend, y.shape[0])
                                # The second strand is mirrored below the x axis.
                                ax.plot(x, -y, color=self.colors[c], lw=1, label=c)
                                if it < nit - 1:
                                    ax.set_xticklabels([])
                                # Collected for the optional table output
                                if printtable:
                                    pArr.append([g, s, c, d, "-"] + list(y))
                            # Symmetric range around zero based on the maxima seen so far.
                            ym = 1.2 * max(max(yaxmax), max(sx_ymax))
                            ax.set_ylim([-ym, ym])

                ax.get_yaxis().set_label_coords(-0.1, 0.5)
                ax.set_xlim([-self.extend, self.extend])
                plt.setp(ax.get_xticklabels(), fontsize=ticklabelsize, rotation=rot)
                plt.setp(ax.get_yticklabels(), fontsize=ticklabelsize)

                ax.locator_params(axis='x', nbins=4)
                ax.locator_params(axis='y', nbins=2)

        if printtable:
            output_array(pArr, directory=output, folder=self.title, filename="plot_table.txt")

        # Second pass: row labels and the final y-range of every subplot.
        for it, ty in enumerate(row_keys):
            axs[it, 0].set_ylabel("{}".format(ty), fontsize=ticklabelsize + 1)

            for i, _ in enumerate(self.data[ty].keys()):
                axx = axs[it, i]

                if self.df:
                    if scol:
                        ymin = yaxmin[i] - abs(yaxmin[i] * 0.2)
                        ymax = yaxmax[i] + abs(yaxmax[i] * 0.2)
                    elif srow:
                        ymin = sx_ymin[it] - abs(sx_ymin[it] * 0.2)
                        ymax = sx_ymax[it] + abs(sx_ymax[it] * 0.2)
                    else:
                        # No shared range requested for a difference plot:
                        # leave the automatic limits untouched.
                        continue
                else:
                    if scol:
                        ymax = yaxmax[i] * 1.2
                    elif srow:
                        ymax = sx_ymax[it] * 1.2
                    else:
                        ymax = axx.get_ylim()[1]
                    if self.sense:
                        ymin = -ymax
                    else:
                        ymin = 0

                try:
                    axx.set_ylim([ymin, ymax])
                except ValueError:
                    # Non-finite limits (e.g. NaN in the data): keep the automatic range.
                    pass

        # The legend is built from the last subplot drawn, one entry per distinct label.
        handles, labels = ax.get_legend_handles_labels()
        uniq_labels = unique(labels)

        plt.legend([handles[labels.index(lab)] for lab in uniq_labels], uniq_labels, loc='center left',
                   handlelength=1, handletextpad=1,
                   columnspacing=2, borderaxespad=0., prop={'size': ticklabelsize}, bbox_to_anchor=(1.05, 0.5))

        f.tight_layout()
        self.fig = f

    def gen_html(self, directory, title, align=50):
        """Write the two HTML pages that accompany a line plot.

        ``index.html`` shows "lineplot.png"; ``parameters.html`` lists the
        settings stored on this object (extend length, read/bin/step size,
        center mode) and links to the parameter and experimental-matrix text
        files. Both pages are written below ``directory/title``.

        Args:
            directory: Output directory; its base name is used in the page header.
            title: Sub-folder of ``directory`` and second part of the page header.
            align: Alignment value forwarded to the parameter table.
        """
        page_name = os.path.basename(directory) + " / " + title
        nav = OrderedDict([("Lineplot", "index.html"), ("Parameters", "parameters.html")])

        def new_page():
            # Both pages share the same header, navigation links and styling.
            return Html(name=page_name, links_dict=nav,
                        fig_rpath="../style", RGT_header=False, other_logo="viz", homepage="../index.html")

        # Page 1: the figure itself
        figure_page = new_page()
        figure_page.add_figure("lineplot.png", align="center", width="80%")
        figure_page.write(os.path.join(directory, title, "index.html"))

        # Page 2: the parameters, one single-cell table row per setting
        param_page = new_page()
        lines = []
        if self.annotation:
            lines.append(
                "Genomic annotation: TSS - Transcription Start Site; TTS - Transcription Termination Site.")
        lines.extend([
            "Directory:      " + directory.rpartition("/")[2],
            "Title:          " + title,
            "Extend length:  " + str(self.extend),
            "Read size:      " + str(self.rs),
            "Bin size:       " + str(self.bs),
            "Step size:      " + str(self.ss),
            "Center mode:    " + self.center,
        ])

        param_page.add_zebra_table(["Assumptions and hypothesis"], [20] * 9, 'ssssssssss',
                                   [[line] for line in lines], align=align,
                                   cell_align="left")

        for anchor in ('<a href="parameters.txt" style="margin-left:100">See parameters</a>',
                       '<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'):
            param_page.add_free_content([anchor])

        param_page.write(os.path.join(directory, title, "parameters.html"))

    def hmsort(self, sort):
        """Reorder the rows of every heatmap matrix, strongest signal first.

        All matrices of one group (``self.data[t][g]``) are reordered with the
        same permutation so their rows stay aligned with each other.

        Args:
            sort: ``None`` leaves the data untouched. ``0`` ranks the rows by
                their signal summed over all matrices of the group. Any other
                integer ``n`` ranks the rows by the row sums of the n-th
                matrix (1-based) of the group.
        """
        if sort is None:
            return

        for t in self.data.keys():
            for g in self.data[t].keys():
                matrices = self.data[t][g]
                if sort == 0:
                    sumarr = numpy.sum([numpy.sum(m, axis=1) for m in matrices.values()], axis=0)
                else:
                    # list() keeps the positional lookup valid on Python 3,
                    # where dict views cannot be indexed.
                    sumarr = numpy.sum(list(matrices.values())[sort - 1], axis=1)

                # Ordinal ranks are the distinct whole numbers 1..n; cast them
                # so they can be used as row indices.
                ind = stats.rankdata(sumarr, method='ordinal').astype(int)

                for c in matrices.keys():
                    source = matrices[c]
                    d = numpy.empty(shape=source.shape)
                    # Rank n (largest sum) lands on row -n, i.e. the first row.
                    d[-ind, :] = source
                    matrices[c] = d

    def hmcmlist(self, colorby, definedinEM):
        """Store the fixed list of colormap names in ``self.colors``.

        Both arguments are accepted but not used by this method.
        """
        palette = ("Reds", "Blues", "Oranges", "Greens", "Purples")
        self.colors = list(palette)

    def heatmap(self, logt):
        """Render one heatmap figure per first-level key of ``self.data``.

        Each figure holds a grid of panels: one row per second-level key and
        one column per third-level key, with a horizontal colour bar under
        every column. All panels use the "Blues" colormap; ``self.colors`` is
        not consulted here. The figures are collected in ``self.figs`` and
        their file stems ("heatmap_<key>") in ``self.hmfiles``.

        Args:
            logt: When true, the colour bar is labelled "log10" and its upper
                tick is printed with one decimal.
        """
        tickfontsize = 6
        ratio = 10  # grid rows taken up by each heatmap panel
        self.hmfiles = []
        self.figs = []
        for t in self.data.keys():
            groups = self.data[t]
            # list() keeps the lookup valid on Python 3, where dict views cannot be indexed.
            columns = len(list(groups.values())[0].keys())
            fig = plt.figure(t)
            plt.suptitle("Heatmap: " + t, y=1.05)
            rows = len(groups.keys())

            axs = numpy.empty(shape=(rows + 1, columns), dtype=object)

            for bi, g in enumerate(groups.keys()):
                for bj, c in enumerate(groups[g].keys()):
                    # The colour scale runs from 0 to the integer part of the panel maximum.
                    max_value = int(numpy.amax(groups[g][c]))
                    panel = plt.subplot2grid(shape=(rows * ratio + 1, columns), loc=(bi * ratio, bj),
                                             rowspan=ratio)
                    axs[bi, bj] = panel
                    if bi == 0:
                        panel.set_title(c, fontsize=7)

                    im = panel.imshow(groups[g][c], extent=[-self.extend, self.extend, 0, 1], aspect='auto',
                                      vmin=0, vmax=max_value, interpolation='nearest',
                                      cmap=plt.get_cmap("Blues"))

                    panel.set_xlim([-self.extend, self.extend])
                    panel.set_xticks([-self.extend, 0, self.extend])
                    plt.setp(panel.get_xticklabels(), fontsize=tickfontsize, rotation=0)
                    for spine in ['top', 'right', 'left', 'bottom']:
                        panel.spines[spine].set_visible(False)
                    # Booleans instead of the strings 'on'/'off': a non-empty
                    # string would be taken as "show" by newer matplotlib.
                    panel.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=True)
                    panel.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)

                    panel.minorticks_off()
                    if bj == 0:
                        panel.set_ylabel(g, fontsize=7)
                    if bi == rows - 1:
                        # Colour bar for this column, placed below the last row of panels.
                        cbar_ax = plt.subplot2grid((rows * ratio + 4, columns), (rows * ratio + 3, bj))
                        cbar = plt.colorbar(im, cax=cbar_ax, ticks=[0, max_value], orientation='horizontal')
                        cbar.ax.set_xticklabels([0, max_value])
                        if logt:
                            cbar.ax.set_xticklabels(['0', '{:1.1f}'.format(max_value)],
                                                    fontsize=tickfontsize)  # horizontal colorbar
                            cbar.set_label('log10', fontsize=tickfontsize)

            self.figs.append(fig)
            self.hmfiles.append("heatmap" + "_" + t)

    def gen_htmlhm(self, outputname, title, align=50):
        """Write the HTML pages that accompany the heatmaps.

        ``index.html`` embeds one "<name>.png" per entry of ``self.hmfiles``;
        ``parameters.html`` links to the parameter and experimental-matrix
        text files. Both pages are written below ``outputname/title``.

        Args:
            outputname: Output directory.
            title: Sub-folder of ``outputname``; also used as the page header.
            align: Not used by this method.
        """
        html_header = title
        link_d = OrderedDict()
        link_d["Lineplot"] = "index.html"
        link_d["Parameters"] = "parameters.html"

        # Both pages share the same header, navigation links and styling.
        page_options = dict(name=html_header, links_dict=link_d,
                            fig_rpath="../style", RGT_header=False, other_logo="viz",
                            homepage="../index.html")

        # Each row is a plot with its data
        html = Html(**page_options)
        for name in self.hmfiles:
            html.add_figure(name + ".png", align="center")
        html.write(os.path.join(outputname, title, "index.html"))

        # Parameters
        html = Html(**page_options)
        html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See parameters</a>'])
        html.add_free_content(['<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'])
        html.write(os.path.join(outputname, title, "parameters.html"))
コード例 #15
0
if __name__ == '__main__':
    # Script entry point. Expects two positional arguments (the experimental
    # matrix file and the annotation directory) plus an optional --mode.
    parser = HelpfulOptionParser(usage=__doc__)
    parser.add_option("--mode", "-m", dest="mode", default=1, help="choose mode", type="int")
    (options, args) = parser.parse_args()

    i = 2
    if len(args) != i:
        parser.error("Exactly %s parameters are needed" % i)

    path_exp_matrix = args[0]
    path_annotation = args[1]

    # Annotation files looked up inside the annotation directory.
    genome_file = os.path.join(path_annotation, "chrom.sizes")
    gene_file = os.path.join(path_annotation, "association_file.bed")

    exp_matrix = ExperimentalMatrix()
    exp_matrix.read(path_exp_matrix, is_bedgraph=True)

    # Compare by value: ``is`` tests object identity, which only happens to
    # work for small integers on some interpreters.
    if options.mode == 1:
        mode_1(exp_matrix)
    elif options.mode == 2:
        mode_2(exp_matrix)
    elif options.mode == 3:
        mode_3(exp_matrix)