コード例 #1
0
ファイル: projection_test.py プロジェクト: rafalcode/reg-gen
    def gen_html_distribution(self, outputname, title, align=50):
        """Write ``distribution.html`` for the distribution test.

        The page contains one ``distribution_test_<i>.png`` figure per entry
        of ``self.fig``, one heading per key of ``self.disperDict``, a single
        table with one row per (type, chromosome) pair and links to the
        parameter / experimental-matrix text files.

        Args:
            outputname: Output directory name, joined below ``dir``.
            title: Experiment title; used as sub-directory name and as the
                label of the navigation link.
            align: Alignment value forwarded to the table; the note paragraph
                uses ``align + 150`` as its left margin.
        """
        # NOTE(review): `dir` is not defined inside this method; presumably a
        # module-level base directory (otherwise it resolves to the builtin
        # `dir`) -- confirm against the module header.
        fp = os.path.join(dir, outputname, title)
        link_d = {title: "distribution.html"}
        html = Html(name="Viz", links_dict=link_d, fig_dir=os.path.join(dir, outputname, "fig"),
                    other_logo="viz", homepage="../index.html")
        # Only the position of each figure matters for the file name.
        for i, _ in enumerate(self.fig):
            html.add_figure("distribution_test_" + str(i) + ".png", align="center")

        html.add_free_content(['<p style=\"margin-left: ' + str(align + 150) + '">' +
                               '** </p>'])

        type_list = 'ssssssssssssssssssssssssssssssssssssssssssssss'
        col_size_list = [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                         10, 10]
        data_table = []
        # Fallback header so an empty `disperDict` yields an empty table
        # instead of a NameError on `header_list`.
        header_list = ["Chromosome"]
        for ty in self.disperDict:
            # list(...) is required on Python 3, where keys() is a view that
            # cannot be concatenated to a list.
            region_names = list(self.disperDict[ty].keys())
            header_list = ["Chromosome"] + region_names
            html.add_heading(ty, size=4, bold=False)
            for i, ch in enumerate(self.chrom_list):
                data_table.append(
                    [ch] + ["{:.3f} %".format(100 * self.disperDict[ty][r][i]) for r in region_names])

        # NOTE(review): rows of every type end up in this single table, whose
        # header comes from the last type processed -- confirm this is intended.
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align)

        html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See parameters</a>'])
        html.add_free_content([
                                  '<a href="reference_experimental_matrix.txt" style="margin-left:100">See reference experimental matrix</a>'])
        html.add_free_content(
            ['<a href="query_experimental_matrix.txt" style="margin-left:100">See query experimental matrix</a>'])
        html.write(os.path.join(fp, "distribution.html"))
コード例 #2
0
ファイル: Main.py プロジェクト: Marvin84/reg-gen
def list_all_index(path):
    """Create an 'index.html' in the defined directory.

    The table on the page links to the ``index.html`` of every directory
    found below *path*; directories whose name equals the name of *path*
    itself are skipped. The navigation links point to the directories that
    contain an ``index.html`` and whose parent directory is named like the
    parent of *path*.

    Args:
        path: Directory that is scanned and that receives the ``index.html``.
    """
    dirname = os.path.basename(path)
    parent_path = os.path.dirname(path)
    parentdir = os.path.basename(parent_path)

    # Navigation links to the sibling experiment directories.
    link_d = {}
    for root, _, filenames in os.walk(parent_path):
        if not fnmatch.filter(filenames, 'index.html'):
            continue
        # os.path instead of root.split('/')[-2]: no IndexError for a root
        # without any separator (e.g. "."), and independent of the separator.
        if os.path.basename(os.path.dirname(root)) == parentdir:
            name = os.path.basename(root)
            link_d[name] = "../" + name + "/index.html"
    # Tuple-unpacking lambdas (``lambda (k, v): k``) are a syntax error on
    # Python 3; index the (key, value) pair instead.
    link_d = OrderedDict(sorted(link_d.items(), key=lambda item: item[0]))

    html = Html(name="Directory: " + dirname, links_dict=link_d,
                fig_dir=os.path.join(path, "style"), fig_rpath="./style", RGT_header=False, other_logo="viz")
    header_list = ["No.", "Experiments"]
    html.add_heading("All experiments in: " + dirname + "/")
    data_table = []
    type_list = 'ssss'
    col_size_list = [10, 10, 10]
    for root, _, filenames in os.walk(path):
        name = os.path.basename(root)
        if 'index.html' not in filenames or name == dirname:
            continue
        # Link relative to *path* so that nested directories resolve correctly
        # (identical to the bare directory name for direct children).
        href = os.path.join(os.path.relpath(root, path), 'index.html')
        data_table.append([str(len(data_table) + 1),
                           '<a href="' + href + '"><font size="4">' + name + "</font></a>"])
    html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=50, cell_align="left", sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(path, "index.html"))
コード例 #3
0
ファイル: shared_function.py プロジェクト: rafalcode/reg-gen
def list_all_index(path):
    """Create an 'index.html' in the defined directory.

    The table on the page links to the ``index.html`` of every directory
    found below *path*; directories whose name equals the name of *path*
    itself are skipped. The navigation links point to the directories that
    contain an ``index.html`` and whose parent directory is named like the
    parent of *path*.

    Args:
        path: Directory that is scanned and that receives the ``index.html``.
    """
    dirname = os.path.basename(path)
    parent_path = os.path.dirname(path)
    parentdir = os.path.basename(parent_path)

    # Navigation links to the sibling experiment directories.
    link_d = {}
    for root, _, filenames in os.walk(parent_path):
        if not fnmatch.filter(filenames, 'index.html'):
            continue
        # os.path instead of root.split('/')[-2]: no IndexError for a root
        # without any separator (e.g. "."), and independent of the separator.
        if os.path.basename(os.path.dirname(root)) == parentdir:
            name = os.path.basename(root)
            link_d[name] = "../" + name + "/index.html"
    # Tuple-unpacking lambdas (``lambda (k, v): k``) are a syntax error on
    # Python 3; index the (key, value) pair instead.
    link_d = OrderedDict(sorted(link_d.items(), key=lambda item: item[0]))

    html = Html(name="Directory: " + dirname,
                links_dict=link_d,
                fig_dir=os.path.join(path, "style"),
                fig_rpath="./style",
                RGT_header=False,
                other_logo="viz")
    header_list = ["No.", "Experiments"]
    html.add_heading("All experiments in: " + dirname + "/")
    data_table = []
    type_list = 'ssss'
    col_size_list = [10, 10, 10]
    for root, _, filenames in os.walk(path):
        name = os.path.basename(root)
        if 'index.html' not in filenames or name == dirname:
            continue
        # Link relative to *path* so that nested directories resolve correctly
        # (identical to the bare directory name for direct children).
        href = os.path.join(os.path.relpath(root, path), 'index.html')
        data_table.append([
            str(len(data_table) + 1),
            '<a href="' + href + '"><font size="4">' + name + "</font></a>"
        ])
    html.add_zebra_table(header_list,
                         col_size_list,
                         type_list,
                         data_table,
                         align=50,
                         cell_align="left",
                         sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(path, "index.html"))
コード例 #4
0
def list_all_index(path):
    """Create an 'index.html' in the defined directory.

    Every directory below *path* that contains an ``index.html`` becomes one
    table row (directories named like *path* itself are skipped). When a
    directory name contains underscores, its first and last
    underscore-separated parts are shown in the "Tag1" / "Tag2" columns.

    Args:
        path: Directory that is scanned and that receives the ``index.html``.
    """
    dirname = os.path.basename(path)

    link_d = {"List": "index.html"}
    html = Html(name="Directory: " + dirname,
                links_dict=link_d,
                fig_dir=os.path.join(path, "style"),
                fig_rpath="./style",
                RGT_header=False,
                other_logo="TDF")

    html.add_heading("All experiments in: " + dirname + "/")
    data_table = []
    type_list = 'sssss'
    col_size_list = [10, 10, 10, 10, 10]
    has_tags = False
    for root, _, filenames in os.walk(path):
        name = os.path.basename(root)
        if 'index.html' not in filenames or name == dirname:
            continue
        # Link relative to *path* so that nested directories resolve correctly
        # (identical to the bare directory name for direct children).
        href = os.path.join(os.path.relpath(root, path), 'index.html')
        row = [str(len(data_table) + 1),
               '<a href="' + href + '">' + name + "</a>"]
        if "_" in name:
            tags = name.split("_")
            row += [tags[0], tags[-1]]
            has_tags = True
        data_table.append(row)

    # Decide the header from all rows rather than from whichever row happened
    # to be visited last; this also defines it when nothing was found.
    if has_tags:
        header_list = ["No.", "Experiments", "Tag1", "Tag2"]
        # Pad untagged rows so every row has as many cells as the header.
        for row in data_table:
            row.extend([""] * (len(header_list) - len(row)))
    else:
        header_list = ["No.", "Experiments"]

    html.add_zebra_table(header_list,
                         col_size_list,
                         type_list,
                         data_table,
                         align=50,
                         cell_align="left",
                         sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(path, "index.html"))
コード例 #5
0
    def gen_html(self, directory, title, align=50):
        """Write the lineplot report pages below ``<directory>/<title>/``.

        ``index.html`` shows one ``lineplot_<tag>.png`` figure per entry of
        ``self.group_tags``; ``parameters.html`` lists the run settings held
        on the instance and links to the parameter / matrix text files.

        Args:
            directory: Output directory; its base name is part of the page title.
            title: Experiment title, used as sub-directory name.
            align: Alignment value forwarded to the parameters table.
        """
        page_title = os.path.basename(directory) + " / " + title
        nav_links = OrderedDict([("Lineplot", "index.html"),
                                 ("Parameters", "parameters.html")])
        # Both pages are created with exactly the same layout options.
        page_options = dict(name=page_title, links_dict=nav_links, fig_rpath="../style",
                            RGT_header=False, other_logo="viz", homepage="../index.html")

        # --- index.html: one heading + figure per group tag
        figure_page = Html(**page_options)
        for tag in self.group_tags:
            figure_page.add_heading(heading=tag)
            figure_page.add_figure("lineplot_" + tag + ".png", align="center", width="80%")
        figure_page.write(os.path.join(directory, title, "index.html"))

        # --- parameters.html: single-column table of settings
        param_page = Html(**page_options)
        rows = []
        if self.annotation:
            rows.append(
                ["Genomic annotation: TSS - Transcription Start Site; TTS - Transcription Termination Site."])
        rows += [
            ["Directory:      " + directory.rpartition("/")[2]],
            ["Title:          " + title],
            ["Extend length:  " + str(self.extend)],
            ["Read size:      " + str(self.rs)],
            ["Bin size:       " + str(self.bs)],
            ["Step size:      " + str(self.ss)],
            ["Center mode:    " + self.center],
        ]
        param_page.add_zebra_table(["Assumptions and hypothesis"],
                                   [20, 20, 20, 20, 20, 20, 20, 20, 20],
                                   'ssssssssss',
                                   rows,
                                   align=align,
                                   cell_align="left")

        for link in ('<a href="parameters.txt" style="margin-left:100">See parameters</a>',
                     '<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'):
            param_page.add_free_content([link])

        param_page.write(os.path.join(directory, title, "parameters.html"))
コード例 #6
0
ファイル: lineplot.py プロジェクト: CostaLab/reg-gen
    def gen_html(self, directory, title, align=50):
        """Write the lineplot report pages below ``<directory>/<title>/``.

        ``index.html`` shows one ``lineplot_<tag>.png`` figure per entry of
        ``self.group_tags``; ``parameters.html`` lists the run settings held
        on the instance and links to the parameter / matrix text files.

        Args:
            directory: Output directory; its base name is part of the page title.
            title: Experiment title, used as sub-directory name.
            align: Alignment value forwarded to the parameters table.
        """
        page_title = os.path.basename(directory) + " / " + title
        nav_links = OrderedDict([("Lineplot", "index.html"),
                                 ("Parameters", "parameters.html")])
        # Both pages are created with exactly the same layout options.
        page_options = dict(name=page_title, links_dict=nav_links, fig_rpath="../style",
                            RGT_header=False, other_logo="viz", homepage="../index.html")

        # --- index.html: one heading + figure per group tag
        figure_page = Html(**page_options)
        for tag in self.group_tags:
            figure_page.add_heading(heading=tag)
            figure_page.add_figure("lineplot_" + tag + ".png", align="center", width="80%")
        figure_page.write(os.path.join(directory, title, "index.html"))

        # --- parameters.html: single-column table of settings
        param_page = Html(**page_options)
        rows = []
        if self.annotation:
            rows.append(
                ["Genomic annotation: TSS - Transcription Start Site; TTS - Transcription Termination Site."])
        rows += [
            ["Directory:      " + directory.rpartition("/")[2]],
            ["Title:          " + title],
            ["Extend length:  " + str(self.extend)],
            ["Read size:      " + str(self.rs)],
            ["Bin size:       " + str(self.bs)],
            ["Step size:      " + str(self.ss)],
            ["Center mode:    " + self.center],
        ]
        param_page.add_zebra_table(["Assumptions and hypothesis"],
                                   [20, 20, 20, 20, 20, 20, 20, 20, 20],
                                   'ssssssssss',
                                   rows,
                                   align=align,
                                   cell_align="left")

        for link in ('<a href="parameters.txt" style="margin-left:100">See parameters</a>',
                     '<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'):
            param_page.add_free_content([link])

        param_page.write(os.path.join(directory, title, "parameters.html"))
コード例 #7
0
ファイル: projection_test.py プロジェクト: rafalcode/reg-gen
    def gen_html(self, directory, title, args, align=50):
        """Write the projection-test report and the per-type statistics files.

        ``index.html`` gets the overview figure, one result table per key of
        ``self.plist`` and a table of notes; ``parameters.html`` lists the
        command-line arguments taken from *args*. For every type the collected
        statistics are also passed to ``output_array`` as
        ``statistics<type>.txt``.

        Args:
            directory: Output directory; its base name is part of the page title.
            title: Experiment title, used as sub-directory name.
            args: Argument object; ``r``, ``q``, ``o``, ``t``, ``organism`` and
                ``cfp`` are read.
            align: Alignment value forwarded to every table.
        """
        html_header = "Projection Test: " + os.path.basename(directory)
        link_d = OrderedDict([("Projection test", "index.html"),
                              ("Parameters", "parameters.html")])
        # Both pages are created with exactly the same layout options.
        page_options = dict(name=html_header, links_dict=link_d, fig_rpath="../style",
                            RGT_header=False, other_logo="viz", homepage="../index.html")
        type_list = 'ssssssssssssssss'
        col_size_list = [5, 10, 10, 10, 10, 10, 10, 15, 15]

        # ------------------------------------------------------------ index.html
        html = Html(**page_options)
        html.add_figure("projection_test.png", align="center")

        header_list = ["No.",
                       "Reference<br>name",
                       "Query<br>name",
                       "Reference<br>number",
                       "Query<br>number",
                       "Proportion",
                       "Background<br>proportion",
                       "Positive<br>association<br>p-value",
                       "Negative<br>association<br>p-value"]
        # NOTE(review): this table is never reset inside the loop below, so the
        # file written for each type also contains the rows of the types
        # processed before it -- confirm this is intended.
        statistic_table = [["Reference_name", "Query_name", "Reference_number",
                            "Query_number", "Proportion", "Background_proportion",
                            "Positive_association_p-value", "Negative_association_p-value"]]

        na_refs = []
        for ind_ty, ty in enumerate(self.plist.keys()):
            html.add_heading(ty, size=4, bold=False)
            data_table = []
            for r in self.plist[ty].keys():
                rlen = str(self.lenlist[r])
                for q in self.plist[ty][r].keys():
                    qlen = str(self.lenlist[q])
                    background = self.qlist[ty][r]['Background']
                    proportion = self.qlist[ty][r][q]
                    backv = value2str(background)
                    propor = value2str(proportion)
                    pv = self.plist[ty][r][q]

                    # Guard clauses: no p-value available, or proportion below cutoff.
                    if pv == "na":
                        na_refs.append(r)
                        continue
                    if proportion < args.cfp:
                        continue

                    pv_text = value2str(pv)
                    pvn_text = value2str(1 - pv)
                    if pv < 0.05:
                        marked = '<font color="red">' + pv_text + '</font>'
                        if background < proportion:
                            # Significant positive association.
                            page_cells = [marked, pvn_text]
                            stat_cells = [pv_text, pvn_text]
                        else:
                            # Significant value goes to the negative column.
                            page_cells = [pvn_text, marked]
                            stat_cells = [pvn_text, pv_text]
                    else:
                        page_cells = [pv_text, pvn_text]
                        stat_cells = [pv_text, pvn_text]

                    data_table.append([str(ind_ty), r, q, rlen, qlen, propor, backv] + page_cells)
                    statistic_table.append([r, q, rlen, qlen, propor, backv] + stat_cells)

            html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, sortable=True)
            output_array(statistic_table, directory=directory, folder=title, filename="statistics" + ty + ".txt")

        # Notes shown below the result tables.
        notes = [['If the background proportion is too small, it may cause bias in p value.'],
                 [
                     'For projection test, the reference GenomicRegionSet should have non-zero length in order to calculate its background proportion.'],
                 ['P values are corrected by multiple test correction.'],
                 ['Positive association is defined by: Proportion > Background.'],
                 ['Negative association is defined by: Proportion < Background.']]

        unique_na_refs = set(na_refs)
        if unique_na_refs:
            notes.append([
                'The following references contain zero-length region which cause error in proportion calculation, please check it:<br>' +
                '     <font color="red">' + ', '.join(unique_na_refs) + '</font></p>'])
        html.add_zebra_table(["Assumptions and hypothesis"], col_size_list, type_list, notes,
                             align=align, cell_align="left")
        html.add_fixed_rank_sortable()

        html.write(os.path.join(directory, title, "index.html"))

        # ------------------------------------------------------- parameters.html
        html = Html(**page_options)
        param_rows = [["Reference", "-r", args.r],
                      ["Query", "-q", args.q],
                      ["Output directory", "-o", os.path.basename(args.o)],
                      ["Experiment title", "-t", args.t],
                      ["Organism", "-organism", args.organism],
                      ["Cutoff of proportion", "-cfp", str(args.cfp)]]

        html.add_zebra_table(["Description", "Argument", "Value"], col_size_list, type_list, param_rows,
                             align=align, cell_align="left")
        for link in (
                '<a href="reference_experimental_matrix.txt" style="margin-left:100">See reference experimental matrix</a>',
                '<a href="query_experimental_matrix.txt" style="margin-left:100">See query experimental matrix</a>',
                '<a href="parameters.txt" style="margin-left:100">See details</a>'):
            html.add_free_content([link])
        html.write(os.path.join(directory, title, "parameters.html"))
コード例 #8
0
ファイル: tdf_regiontest.py プロジェクト: rafalcode/reg-gen
    def gen_html(self,
                 directory,
                 parameters,
                 obed,
                 align=50,
                 alpha=0.05,
                 score=False):
        """Generate the HTML file"""
        dir_name = os.path.basename(directory)
        html_header = "Genomic Region Test: " + dir_name
        link_ds = OrderedDict()
        link_ds["RNA"] = "index.html"
        link_ds["Sig Target Regions"] = "starget_regions.html"
        link_ds["Target Regions"] = "target_regions.html"
        link_ds["Parameters"] = "parameters.html"

        ##################################################
        # index.html

        html = Html(
            name=html_header,
            links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
            fig_rpath="../style",
            RGT_header=False,
            other_logo="TDF",
            homepage="../index.html")
        # Plots
        html.add_figure("lineplot_region.png",
                        align="left",
                        width="45%",
                        more_images=["boxplot_regions.png"])
        if self.showdbs:
            html.add_figure("lineplot_dbs.png",
                            align="left",
                            width="45%",
                            more_images=["boxplot_dbs.png"])

        if self.showdbs:
            header_list = [[
                "#", "DBD", "Target Regions", None, "Non-target Regions", None,
                "Statistics", "Target Regions", "Non-target Regions", None,
                "Statistics"
            ],
                           [
                               "", "", "with DBS", "without DBS",
                               "with DBS (average)", "s.d.", "<i>p</i>-value",
                               "NO. DBSs", "NO. DBSs (average)", "s.d.",
                               "<i>p</i>-value"
                           ]]
            header_titles = [
                [
                    "Rank", "DNA Binding Domain",
                    "Given target regions on DNA", None,
                    "Regions from randomization", None,
                    "Statistics based on target regions",
                    "Given target regions on DNA",
                    "Regions from randomization", None,
                    "Statistics based on DNA Binding Sites"
                ],
                [
                    "", "", "Number of target regions with DBS binding",
                    "Number of target regions without DBS binding",
                    "Average number of regions from randomization with DBS binding",
                    "Standard deviation", "P value",
                    "Number of related DNA Binding Sites binding to target regions",
                    "Average number of DNA Binding Sites binding to random regions",
                    "Standard deviation", "P-value"
                ]
            ]
            border_list = [
                " style=\"border-right:1pt solid gray\"",
                " style=\"border-right:1pt solid gray\"", "",
                " style=\"border-right:1pt solid gray\"", "",
                " style=\"border-right:1pt solid gray\"",
                " style=\"border-right:2pt solid gray\"",
                " style=\"border-right:1pt solid gray\"", "",
                " style=\"border-right:1pt solid gray\"",
                " style=\"border-right:1pt solid gray\""
            ]
        else:
            header_list = [[
                "#", "DBD", "Target Regions", None, "Non-target Regions", None,
                "Statistics", None
            ],
                           [
                               "", "", "with DBS", "without DBS",
                               "with DBS (average)", "s.d.", "<i>p</i>-value",
                               "z-score"
                           ]]
            header_titles = [
                [
                    "Rank", "DNA Binding Domain",
                    "Given target regions on DNA", None,
                    "Regions from randomization", None,
                    "Statistics based on target regions", None
                ],
                [
                    "", "", "Number of target regions with DBS binding",
                    "Number of target regions without DBS binding",
                    "Average number of regions from randomization with DBS binding",
                    "Standard deviation", "P value", "Z-score"
                ]
            ]
            border_list = [
                " style=\"border-right:1pt solid gray\"",
                " style=\"border-right:1pt solid gray\"", "",
                " style=\"border-right:1pt solid gray\"", "",
                " style=\"border-right:1pt solid gray\"",
                " style=\"border-right:1pt solid gray\"", ""
            ]

        type_list = 'ssssssssssssssss'
        col_size_list = [
            50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50
        ]
        data_table = []

        for i, rbs in enumerate(self.rbss):
            if self.data["region"]["p"][i] < alpha:
                p_region = "<font color=\"red\">" + value2str(
                    self.data["region"]["p"][i]) + "</font>"

            else:
                p_region = value2str(self.data["region"]["p"][i])
            zs = (self.counts_tr[rbs][0] -
                  self.data["region"]["ave"][i]) / self.data["region"]["sd"][i]
            new_line = [
                str(i + 1),
                rbs.str_rna(pa=False), '<a href="dbd_region.html#' +
                rbs.str_rna() + '" style="text-align:left">' +
                str(self.counts_tr[rbs][0]) + '</a>',
                str(self.counts_tr[rbs][1]),
                value2str(self.data["region"]["ave"][i]),
                value2str(self.data["region"]["sd"][i]), p_region,
                value2str(zs)
            ]
            if self.showdbs:
                if self.data["dbs"]["p"][i] < alpha:
                    p_dbs = "<font color=\"red\">" + value2str(
                        self.data["dbs"]["p"][i]) + "</font>"
                else:
                    p_dbs = value2str(self.data["dbs"]["p"][i])

                new_line += [
                    str(self.counts_dbs[rbs]),
                    value2str(self.data["dbs"]["ave"][i]),
                    value2str(self.data["dbs"]["sd"][i]), p_dbs
                ]
            data_table.append(new_line)

        data_table = natsort.natsorted(data_table, key=lambda x: x[6])
        html.add_zebra_table(header_list,
                             col_size_list,
                             type_list,
                             data_table,
                             align=align,
                             cell_align="left",
                             auto_width=True,
                             header_titles=header_titles,
                             border_list=border_list,
                             sortable=True)

        html.add_heading("Notes")
        html.add_list([
            "RNA name: " + self.rna_name,
            "Randomization is performed for " + str(self.repeats) + " times.",
            "DBD stands for DNA Binding Domain on RNA.",
            "DBS stands for DNA Binding Site on DNA."
        ])
        html.add_fixed_rank_sortable()
        html.write(os.path.join(directory, "index.html"))

        #############################################################
        # RNA subpage: Profile of targeted regions for each merged DNA Binding Domain
        #############################################################

        header_list = [
            "#", "Target Region", "Associated Gene", "No. of DBSs",
            "DBS coverage"
        ]
        header_titles = [
            "Rank", "Given target regions from BED files",
            "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
            "Number of DNA Binding Sites locate within the region",
            "The proportion of the region covered by DBS binding"
        ]

        #########################################################
        # dbd_region.html
        html = Html(
            name=html_header,
            links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
            fig_rpath="../style",
            RGT_header=False,
            other_logo="TDF",
            homepage="../index.html")

        for rbsm in self.rbss:
            html.add_heading("DNA Binding Domain: " + rbsm.str_rna(),
                             idtag=rbsm.str_rna())
            data_table = []
            for i, region in enumerate(self.txp.merged_dict[rbsm]):
                # Add information
                data_table.append([
                    str(i + 1),
                    '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' +
                    self.organism + "&position=" + region.chrom + "%3A" +
                    str(region.initial) + "-" + str(region.final) +
                    '" style="text-align:left">' +
                    region.toString(space=True) + '</a>',
                    split_gene_name(gene_name=region.name, org=self.organism),
                    str(len(self.region_dbs[region.toString()])),
                    value2str(self.region_coverage[region.toString()])
                ])

            html.add_zebra_table(header_list,
                                 col_size_list,
                                 type_list,
                                 data_table,
                                 align=align,
                                 cell_align="left",
                                 auto_width=True,
                                 header_titles=header_titles,
                                 sortable=True)
        html.add_fixed_rank_sortable()
        html.write(os.path.join(directory, "dbd_region.html"))

        #############################################################
        # Targeted regions centered
        #############################################################

        ##############################################################################################
        # target_regions.html
        html = Html(
            name=html_header,
            links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
            fig_rpath="../style",
            RGT_header=False,
            other_logo="TDF",
            homepage="../index.html")

        if score:
            header_list = [
                "#", "Target region", "Associated Gene", "DBSs Count",
                "DBS coverage", "Score", "Sum of ranks"
            ]
            header_titles = [
                "Rank", "Target regions loaded from the given BED file",
                "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                "Number of DNA Binding Sites within the region",
                "The proportion of the region covered by DBS binding",
                "Scores from BED file", "Sum of all the left-hand-side ranks"
            ]
        else:
            header_list = [
                "#", "Target region", "Associated Gene", "DBSs Count",
                "DBS coverage", "Sum of ranks"
            ]
            header_titles = [
                "Rank", "Target regions loaded from the given BED file",
                "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                "Number of DNA Binding Sites within the region",
                "The proportion of the region covered by DBS binding",
                "Sum of all the left-hand-side ranks"
            ]
        html.add_heading("Target Regions")
        data_table = []

        if not self.dna_region.sorted: self.dna_region.sort()

        # Calculate the ranking
        rank_count = len(self.dna_region) - rank_array(
            [len(self.region_dbs[p.toString()]) for p in self.dna_region])
        rank_coverage = len(self.dna_region) - rank_array(
            [self.region_coverage[p.toString()] for p in self.dna_region])

        if score:
            try:
                score_list = [
                    float(p.data.split("\t")[0]) for p in self.dna_region
                ]
                rank_score = len(self.dna_region) - rank_array(
                    [abs(s) for s in score_list])
                rank_sum = [
                    x + y + z
                    for x, y, z in zip(rank_count, rank_coverage, rank_score)
                ]
                # sum_rank = rank_array(rank_sum)  # method='min'
            except ImportError:
                print(
                    "There is no score in BED file, please don't use '-score' argument."
                )
        else:
            rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]
            sum_rank = rank_array(rank_sum)

        for i, region in enumerate(self.dna_region):
            dbs_counts = str(len(self.region_dbs[region.toString()]))
            dbs_cover = value2str(self.region_coverage[region.toString()])

            newline = [
                str(i + 1),
                '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' +
                self.organism + "&position=" + region.chrom + "%3A" +
                str(region.initial) + "-" + str(region.final) +
                '" style="text-align:left">' + region.toString(space=True) +
                '</a>',
                split_gene_name(gene_name=region.name, org=self.organism),
                '<a href="region_dbs.html#' + region.toString() +
                '" style="text-align:left">' + dbs_counts + '</a>', dbs_cover
            ]

            if score:
                dbs_score = value2str(score_list[i])
                region.data = "\t".join(
                    [dbs_counts, dbs_cover, dbs_score,
                     str(rank_sum[i])])
                newline.append(dbs_score)
                newline.append(str(rank_sum[i]))
            else:
                region.data = "\t".join(
                    [dbs_counts, dbs_cover,
                     str(rank_sum[i])])
                newline.append(str(rank_sum[i]))
            data_table.append(newline)

        data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
        # data_table = sorted(data_table, key=lambda x: x[-1])
        html.add_zebra_table(header_list,
                             col_size_list,
                             type_list,
                             data_table,
                             align=align,
                             cell_align="left",
                             auto_width=True,
                             header_titles=header_titles,
                             sortable=True)
        html.add_heading("Notes")
        html.add_list(["All target regions without any bindings are ignored."])
        html.add_fixed_rank_sortable()
        html.write(os.path.join(directory, "target_regions.html"))

        self.dna_region.sort_score()
        self.dna_region.write_bed(
            os.path.join(directory, obed + "_target_regions.bed"))

        ##############################################################################################
        # starget_regions.html    for significant target regions

        stargets = GenomicRegionSet("sig_targets")
        sig_dbs = {}
        sig_dbs_coverage = {}
        for i, r in enumerate(self.dna_region):
            sig_bindings = self.region_dbs[r.toString()].overlap_rbss(
                rbss=self.data["region"]["sig_region"])
            dbs = sig_bindings.get_dbs()
            if len(dbs) > 0:
                stargets.add(r)
                m_dbs = dbs.merge(w_return=True)
                sig_dbs[r] = len(dbs)
                # self.promoter["de"]["merged_dbs"][promoter.toString()] = len(m_dbs)
                sig_dbs_coverage[r] = float(m_dbs.total_coverage()) / len(r)

        html = Html(
            name=html_header,
            links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
            fig_rpath="../style",
            RGT_header=False,
            other_logo="TDF",
            homepage="../index.html")

        # Select promoters in sig DBD
        if len(self.data["region"]["sig_region"]) == 0:
            html.add_heading("There is no significant DBD.")
        else:
            html.add_heading("Target regions bound by significant DBD")
            data_table = []
            # Calculate the ranking
            rank_count = len(stargets) - rank_array(
                [sig_dbs[p] for p in stargets])
            rank_coverage = len(stargets) - rank_array(
                [sig_dbs_coverage[p] for p in stargets])
            if score:
                score_list = [float(p.data.split("\t")[0]) for p in stargets]
                rank_score = len(stargets) - rank_array(
                    [abs(s) for s in score_list])
                rank_sum = [
                    x + y + z
                    for x, y, z in zip(rank_count, rank_coverage, rank_score)
                ]
                sum_rank = rank_array(rank_sum)  # method='min'
            else:
                rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]
                sum_rank = rank_array(rank_sum)

            for i, region in enumerate(stargets):
                dbssount = '<a href="region_dbs.html#' + region.toString() + \
                           '" style="text-align:left">' + str(sig_dbs[region]) + '</a>'

                region_link = region_link_internet(self.organism, region)

                newline = [
                    str(i + 1), region_link,
                    split_gene_name(gene_name=region.name, org=self.organism),
                    dbssount,
                    value2str(sig_dbs_coverage[region])
                ]
                if score:
                    dbs_score = value2str(score_list[i])
                    # region.data = "\t".join([dbs_counts, dbs_cover, dbs_score, str(sum_rank[i])])
                    newline.append(dbs_score)
                    newline.append(str(rank_sum[i]))
                    # print([dbs_score, str(sum_rank[i])])
                else:
                    # region.data = "\t".join([dbs_counts, dbs_cover, str(sum_rank[i])])
                    newline.append(str(rank_sum[i]))

                # newline += ["<i>" + str(rank_sum[i]) + "</i>"]
                # print(newline)
                data_table.append(newline)

            # print(data_table)
            # data_table = sorted(data_table, key=lambda x: x[-1])
            data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
            html.add_zebra_table(header_list,
                                 col_size_list,
                                 type_list,
                                 data_table,
                                 align=align,
                                 cell_align="left",
                                 header_titles=header_titles,
                                 border_list=None,
                                 sortable=True)
            html.add_heading("Notes")
            html.add_list([
                "DBS stands for DNA Binding Site on DNA.",
                "DBS coverage is the proportion of the region where has potential to form triple helices with the given RNA."
            ])
            html.add_fixed_rank_sortable()
            html.write(os.path.join(directory, "starget_regions.html"))

        ############################
        # Subpages for targeted region centered page
        # region_dbs.html
        header_list = ["RBS", "DBS", "Strand", "Score", "Motif", "Orientation"]

        html = Html(
            name=html_header,
            links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
            fig_rpath="../style",
            RGT_header=False,
            other_logo="TDF",
            homepage="../index.html")

        for i, region in enumerate(self.dna_region):
            if len(self.region_dbs[region.toString()]) == 0:
                continue
            else:
                html.add_heading(
                    "Associated gene: " +
                    split_gene_name(gene_name=region.name, org=self.organism),
                    idtag=region.toString())
                html.add_free_content([
                    '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' +
                    self.organism + "&position=" + region.chrom + "%3A" +
                    str(region.initial) + "-" + str(region.final) +
                    '" style="margin-left:50">' + region.toString(space=True) +
                    '</a>'
                ])
                data_table = []
                for rd in self.region_dbs[region.toString()]:
                    rbs = rd.rna.str_rna(pa=False)
                    for rbsm in self.data["region"]["sig_region"]:
                        # rbsm = rbsm.partition(":")[2].split("-")
                        if rd.rna.overlap(rbsm):
                            rbs = "<font color=\"red\">" + rbs + "</font>"
                    data_table.append([
                        rbs,
                        '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db='
                        + self.organism + "&position=" + rd.dna.chrom + "%3A" +
                        str(rd.dna.initial) + "-" + str(rd.dna.final) +
                        '" style="text-align:left">' +
                        rd.dna.toString(space=True) + '</a>',
                        rd.dna.orientation, rd.score, rd.motif, rd.orient
                    ])
                html.add_zebra_table(header_list,
                                     col_size_list,
                                     type_list,
                                     data_table,
                                     align=align,
                                     cell_align="left",
                                     auto_width=True)
        html.write(os.path.join(directory, "region_dbs.html"))

        ###############################################################################33
        ################ Parameters.html

        html = Html(
            name=html_header,
            links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
            fig_rpath="../style",
            RGT_header=False,
            other_logo="TDF",
            homepage="../index.html")
        html.add_heading("Parameters")
        header_list = ["Description", "Arguments", "Value"]

        data_table = [
            ["RNA sequence name", "-rn", parameters.rn],
            ["Input RNA sequence file", "-r",
             os.path.basename(parameters.r)],
            ["Input BED file", "-bed",
             os.path.basename(parameters.bed)],
            ["Output directory", "-o",
             os.path.basename(parameters.o)],
            ["Organism", "-organism", parameters.organism],
            ["Number of repitetion of andomization", "-n",
             str(parameters.n)],
            ["Alpha level for rejection p value", "-a",
             str(parameters.a)],
            [
                "Cut off value for filtering out the low counts of DBSs",
                "-ccf",
                str(parameters.ccf)
            ], ["Remove temporary files", "-rt",
                str(parameters.rt)],
            [
                "Input BED file for masking in randomization", "-f",
                str(parameters.f)
            ], ["Input file for RNA accecibility", "-ac",
                str(parameters.ac)],
            [
                "Cut off value for RNA accecibility", "-accf",
                str(parameters.accf)
            ],
            [
                "Output the BED files for DNA binding sites.", "-obed",
                str(parameters.obed)
            ],
            [
                "Show parallel and antiparallel bindings in the plot separately.",
                "-showpa",
                str(parameters.showpa)
            ], ["Minimum length", "-l",
                str(self.triplexator_p[0])],
            ["Maximum error rate", "-e",
             str(self.triplexator_p[1])],
            [
                "Tolerated number of consecutive errors", "-c",
                str(self.triplexator_p[2])
            ], ["Filtering repeats", "-fr",
                str(self.triplexator_p[3])],
            ["Filtering mode", "-fm",
             str(self.triplexator_p[4])],
            ["Output format", "-of",
             str(self.triplexator_p[5])],
            ["Merge features", "-mf",
             str(self.triplexator_p[6])]
        ]
        html.add_zebra_table(header_list,
                             col_size_list,
                             type_list,
                             data_table,
                             align=align,
                             cell_align="left",
                             auto_width=True)
        html.add_free_content(
            ['<a href="summary.txt" style="margin-left:100">See details</a>'])
        html.write(os.path.join(directory, "parameters.html"))
コード例 #9
0
ファイル: tracker.py プロジェクト: rafalcode/reg-gen
    def make_html(self):
        """Assemble the THOR report page and write it to FOLDER_REPORT/index.html.

        The page consists of a navigation bar (``links_dict``) followed by
        several sections.  Most sections are rendered on a best-effort basis:
        if rendering one of them raises, that section is abandoned and the
        report continues with the next one.  Because each heading is added
        before the section's content, a failed section may leave its heading
        in the page without any content below it.

        Only ``Exception`` subclasses are suppressed, so KeyboardInterrupt and
        SystemExit still propagate to the caller.
        """
        html_header = "THOR"
        from rgt.THOR.dpc_help import FOLDER_REPORT

        # Navigation links shown at the top of the report.
        links_dict = OrderedDict()
        links_dict['Experimental Configuration'] = 'index.html#extinfo'
        links_dict['Sample Information'] = 'index.html#sampleinfo'
        links_dict['HMM Information'] = 'index.html#hmminfo'
        # NOTE(review): this link is unconditional, while the 'mvfunction'
        # heading below is only emitted when the corresponding picture exists.
        links_dict['Mean Variance Function Estimate'] = 'index.html#mvfunction'

        p = path.join(FOLDER_REPORT, 'pics/fragment_size_estimate.png')
        if path.isfile(p):
            links_dict['Fragment Size Estimate'] = 'index.html#fsestimate'

        # NOTE(review): the link is keyed on sample.data, but the heading
        # carrying idtag 'norm' is only added when gene.data exists -- confirm
        # that both files are always produced together.
        p = path.join(FOLDER_REPORT, 'pics/data/sample.data')
        if path.isfile(p):
            links_dict['Housekeeping Gene Normalization'] = 'index.html#norm'

        links_dict['References'] = 'index.html#ref'
        links_dict['Contact'] = 'index.html#contact'

        # copy basic rgt logo, style etc to local directory inside report
        fig_path = path.join(FOLDER_REPORT, "fig")
        html = Html(name=html_header,
                    links_dict=links_dict,
                    fig_dir=fig_path,
                    fig_rpath="fig")

        # Experimental configuration (best effort).
        try:
            html.add_heading("Experimental Configuration", idtag='extinfo')
            self.make_ext_config(html)
        except Exception:
            pass

        # This section is not guarded: a failure here aborts the report.
        html.add_heading("Pre- and post-processing Features",
                         idtag='prepostinfo')
        self.make_pre_post(html)

        # Sample information (best effort).
        try:
            html.add_heading("Sample Information", idtag='sampleinfo')
            self.make_ext_scaling_table(html)
        except Exception:
            pass

        # Run info (best effort).
        try:
            html.add_heading("HMM Information", idtag='hmminfo')
            self.make_hmm(html)
        except Exception:
            pass

        # Mean variance function: only shown when the first plot exists.
        try:
            p = path.join(FOLDER_REPORT,
                          'pics/mean_variance_func_cond_0_original.png')
            if path.isfile(p):
                html.add_heading("Mean Variance Function", idtag='mvfunction')
                # Figure paths are given relative to the report folder.
                html.add_figure(
                    path.relpath(p, FOLDER_REPORT),
                    align="left",
                    width="45%",
                    more_images=[
                        'pics/mean_variance_func_cond_1_original.png'
                    ])
                info = "THOR uses a polynomial function to empirically describe the relationship between mean and variance in the data.\
                The data the plot is based on can be found at report/pics/data for further downstream analysis."

                self._write_text(html, info)
        except Exception:
            pass

        # Fragment size estimate: only shown when the plot exists.
        try:
            p = path.join(FOLDER_REPORT, 'pics/fragment_size_estimate.png')
            if path.isfile(p):
                html.add_heading("Fragment Size Estimate", idtag='fsestimate')
                html.add_figure(path.relpath(p, FOLDER_REPORT),
                                align="left",
                                width="45%")
                info = "THOR estimates the fragmentation sizes of each sample's reads. Here, the cross-correlation function [1] is shown. Their maxima give the\
                fragmentation extension sizes.<br> The data the plot is based on can be found at report/pics/data for further downstream analysis."

                self._write_text(html, info)
        except Exception:
            pass

        # Housekeeping gene normalization: one table per data file present.
        try:
            p = path.join(FOLDER_REPORT, 'pics/data/gene.data')
            if path.isfile(p):
                d = self._read_hk(p)
                html.add_heading("Housekeeping Gene Normalization",
                                 idtag='norm')
                html.add_zebra_table(header_list=['gene', 'quality q'],
                                     col_size_list=[1, 150],
                                     type_list='s' * len(d),
                                     data_table=d)
                info = "For active histone marks, housekeeping genes given by [4] can be used for normalization [1]. Here, the genes for the experiments are\
                evaluated. For each gene i, we estimate the normalization factors with gene i and without gene i and compute the sums of squared deviations q.\
                High values (higher than 2) indicate striking genes which should be considered to be left our for normalization.,<br> One can also \
                use other genes or regions for normalization.<br> The data the plot is based on can be found at report/pics/data for further downstream analysis."

                self._write_text(html, info)

            p = path.join(FOLDER_REPORT, 'pics/data/sample.data')
            if path.isfile(p):
                d = self._read_hk(p)
                html.add_zebra_table(header_list=['sample', 'quality p'],
                                     col_size_list=[1, 150],
                                     type_list='s' * len(d),
                                     data_table=d)
                info = "We evaluate the effect of samples to the normalization factors. For sample j, we estimate the normalization factors with sample j\
                and without sample j and compute the sums of squared deviations p. High values (higher than 2) indicate striking samples which should be\
                considered to be left out for the analysis.<br> The data the plot is based on can be found at report/pics/data for further downstream analysis."

                self._write_text(html, info)
        except Exception:
            pass

        # Static trailing sections.
        html.add_heading("References", idtag='ref')
        info = "[1] M. Allhoff, J. F. Pires, K. Ser&eacute;, M. Zenke, and I. G. Costa. Differential Peak Calling of ChIP-Seq \
        Signals with Replicates with THOR. <i>submitted.</i> <br>\
        [2] A. Mammana, M. Vingron, and H.-R. Chung. Inferring nucleosome positions with their histone mark annotation from chip data. \
        Bioinformatics, 29(20):2547-2554, 2013. <br>\
        [3] M. D. Robinson and A. Oshlack. A scaling normalization method for differential expression analysis of RNA-seq data. \
        Genome Biology, 11(3):R25, 2010. <br>\
        [4] E. Eisenberg and E. Y. Levanon. Human housekeeping genes, revisited. Trends in genetics: TIG, 29(10):569-574, 2013."

        self._write_text(html, info)

        html.add_heading("Contact", idtag='contact')
        info = "If you have any questions, please don't hesitate to contact us: [email protected]"
        self._write_text(html, info)

        html.write(path.join(FOLDER_REPORT, "index.html"))
コード例 #10
0
ファイル: tracker.py プロジェクト: Marvin84/reg-gen
 def make_html(self):
     """Assemble the THOR report page and write it to FOLDER_REPORT/index.html.

     Most sections are rendered on a best-effort basis: if rendering one of
     them raises, that section is abandoned and the report continues with the
     next one.  Because each heading is added before the section's content, a
     failed section may leave its heading in the page without content.

     Only ``Exception`` subclasses are suppressed, so KeyboardInterrupt and
     SystemExit still propagate to the caller.
     """
     html_header = "THOR"
     from rgt.THOR.dpc_help import FOLDER_REPORT

     # Navigation links shown at the top of the report.
     links_dict = OrderedDict()
     links_dict['Experimental Configuration'] = 'index.html#extinfo'
     links_dict['Sample Information'] = 'index.html#sampleinfo'
     links_dict['HMM Information'] = 'index.html#hmminfo'
     links_dict['Mean Variance Function Estimate'] = 'index.html#mvfunction'

     p = path.join(FOLDER_REPORT, 'pics/fragment_size_estimate.png')
     if path.isfile(p):
         links_dict['Fragment Size Estimate'] = 'index.html#fsestimate'

     # NOTE(review): the link is keyed on sample.data, but the heading with
     # idtag 'norm' is only added when gene.data exists -- confirm that both
     # files are always produced together.
     p = path.join(FOLDER_REPORT, 'pics/data/sample.data')
     if path.isfile(p):
         links_dict['Housekeeping Gene Normalization'] = 'index.html#norm'

     links_dict['References'] = 'index.html#ref'
     links_dict['Contact'] = 'index.html#contact'

     # Logo/style assets are referenced from the configured data directory.
     config_class = ConfigurationFile()
     html = Html(name=html_header, links_dict=links_dict, fig_rpath= config_class.data_dir + '/fig/')

     # Experimental configuration (best effort).
     try:
         html.add_heading("Experimental Configuration", idtag = 'extinfo')
         self.make_ext_config(html)
     except Exception:
         pass

     # This section is not guarded: a failure here aborts the report.
     html.add_heading("Pre- and post-processing Features", idtag = 'prepostinfo')
     self.make_pre_post(html)

     # Sample information (best effort).
     try:
         html.add_heading("Sample Information", idtag = 'sampleinfo')
         self.make_ext_scaling_table(html)
     except Exception:
         pass

     # Run info (best effort).
     try:
         html.add_heading("HMM Information", idtag = 'hmminfo')
         self.make_hmm(html)
     except Exception:
         pass

     # Mean variance function: only shown when the first plot exists.
     try:
         p = path.join(FOLDER_REPORT, "pics/mean_variance_func_cond_0_original.png")
         if path.isfile(p):
             html.add_heading("Mean Variance Function", idtag='mvfunction')
             html.add_figure(p, align="left", width="45%", more_images=[path.join(FOLDER_REPORT, 'pics/mean_variance_func_cond_1_original.png')])
             info = "THOR uses a polynomial function to empirically describe the relationship between mean and variance in the data.\
             The data the plot is based on can be found at report/pics/data for further downstream analysis."
             self._write_text(html, info)
     except Exception:
         pass

     # Fragment size estimate: only shown when the plot exists.
     try:
         p = path.join(FOLDER_REPORT, 'pics/fragment_size_estimate.png')
         if path.isfile(p):
             html.add_heading("Fragment Size Estimate", idtag = 'fsestimate')
             html.add_figure(p, align="left", width="45%")
             info = "THOR estimates the fragmentation sizes of each sample's reads. Here, the cross-correlation function [1] is shown. Their maxima give the\
             fragmentation extension sizes.<br> The data the plot is based on can be found at report/pics/data for further downstream analysis."
             self._write_text(html, info)
     except Exception:
         pass

     # Housekeeping gene normalization: one table per data file present.
     try:
         p = path.join(FOLDER_REPORT, 'pics/data/gene.data')
         if path.isfile(p):
             d = self._read_hk(p)
             html.add_heading("Housekeeping Gene Normalization", idtag = 'norm')
             html.add_zebra_table(header_list=['gene', 'quality q'], col_size_list=[1,150], type_list='s'*len(d), data_table=d)
             info = "For active histone marks, housekeeping genes given by [4] can be used for normalization [1]. Here, the genes for the experiments are\
             evaluated. For each gene i, we estimate the normalization factors with gene i and without gene i and compute the sums of squared deviations q.\
             High values (higher than 2) indicate striking genes which should be considered to be left our for normalization.,<br> One can also \
             use other genes or regions for normalization.<br> The data the plot is based on can be found at report/pics/data for further downstream analysis."
             self._write_text(html, info)

         p = path.join(FOLDER_REPORT, 'pics/data/sample.data')
         if path.isfile(p):
             d = self._read_hk(p)
             html.add_zebra_table(header_list=['sample', 'quality p'], col_size_list=[1,150], type_list='s'*len(d), data_table=d)
             info = "We evaluate the effect of samples to the normalization factors. For sample j, we estimate the normalization factors with sample j\
             and without sample j and compute the sums of squared deviations p. High values (higher than 2) indicate striking samples which should be\
             considered to be left out for the analysis.<br> The data the plot is based on can be found at report/pics/data for further downstream analysis."
             self._write_text(html, info)
     except Exception:
         pass

     # Static trailing sections.
     html.add_heading("References", idtag = 'ref')
     info = "[1] M. Allhoff, J. F. Pires, K. Ser&eacute;, M. Zenke, and I. G. Costa. Differential Peak Calling of ChIP-Seq \
     Signals with Replicates with THOR. <i>submitted.</i> <br>\
     [2] A. Mammana, M. Vingron, and H.-R. Chung. Inferring nucleosome positions with their histone mark annotation from chip data. \
     Bioinformatics, 29(20):2547-2554, 2013. <br>\
     [3] M. D. Robinson and A. Oshlack. A scaling normalization method for differential expression analysis of RNA-seq data. \
     Genome Biology, 11(3):R25, 2010. <br>\
     [4] E. Eisenberg and E. Y. Levanon. Human housekeeping genes, revisited. Trends in genetics: TIG, 29(10):569-574, 2013."
     self._write_text(html, info)

     html.add_heading("Contact", idtag = 'contact')
     info = "If you have any questions, please don't hesitate to contact us: [email protected]"
     self._write_text(html, info)

     html.write(path.join(FOLDER_REPORT, "index.html"))
     
コード例 #11
0
ファイル: triplexTools.py プロジェクト: rafalcode/reg-gen
def list_all_index(path, link_d=None):
    """Create an 'index.html' in ``path`` listing the experiments in profile.txt.

    Args:
        path: Directory that contains ``profile.txt``; ``index.html`` is
            written into the same directory.
        link_d: Optional mapping passed to ``Html`` as ``links_dict``.  When
            falsy, a single ``{"List": "index.html"}`` entry is used.

    Each tab-separated line of ``profile.txt`` is keyed by its first field.
    The header line (first field ``"Experiment"``) and lines with five or
    fewer fields are ignored.  Rows are ranked by the sum of three ranks
    (normalised DBD, normalised DBS and expression) and naturally sorted on
    that sum before being written.
    """
    dirname = os.path.basename(path)

    if not link_d:
        link_d = {"List": "index.html"}

    html = Html(name="Directory: " + dirname, links_dict=link_d,
                fig_rpath="./style", fig_dir=os.path.join(path, "style"),
                RGT_header=False, other_logo="TDF", homepage="../index.html")

    html.add_heading("All experiments in: " + dirname + "/")

    type_list = 'sssssssssssssssssss'
    col_size_list = [20] * 20

    header_list = ["No.", "Experiments", "RNA", "Closest genes",
                   "Exon", "Length", "Expression*",
                   "Norm DBS*",
                   "Norm DBD*",  "No sig. DBD",
                   "Organism", "Target region",
                   "Rank*"]

    # Read the profile; the context manager closes the file even if a line
    # fails to parse.
    profile = {}
    with open(os.path.join(path, "profile.txt"), 'r') as profile_f:
        for line in profile_f:
            fields = line.strip().split("\t")
            if fields[0] == "Experiment":
                continue
            if len(fields) > 5:
                profile[fields[0]] = fields[1:]

    data_table = []
    for number, (exp, record) in enumerate(profile.items(), start=1):
        # Experiments whose field 10 is "-" get a plain name instead of a
        # link to their own index page.
        if record[10] == "-":
            new_line = [str(number), exp, record[0]]
        else:
            new_line = [str(number),
                        '<a href="' + os.path.join(exp, "index.html") +
                        '">' + exp + "</a>", record[0]]
        new_line += [record[12],  # 3 close genes
                     record[1],   # 4 exon
                     record[2],   # 5 length
                     record[13]]  # 6 exp

        # The normalised counts are only reported when field 11 is below
        # 0.05; otherwise they are zeroed so they do not affect the ranking.
        if float(record[11]) < 0.05:
            new_line += [record[6],  # 7 norm DBS
                         record[8],  # 8 norm DBD
                         record[9]]  # 9 sig DBD
        else:
            new_line += [str(0),     # 7 norm DBS
                         str(0),     # 8 norm DBD
                         record[9]]  # 9 sig DBD

        new_line += [record[4], record[5]]

        data_table.append(new_line)

    # rank_array's result is subtracted from the row count to invert the rank.
    n_rows = len(data_table)
    rank_dbd = n_rows - rank_array([float(x[8]) for x in data_table])
    rank_dbs = n_rows - rank_array([float(x[7]) for x in data_table])

    # Missing expression values ("n.a.") are ranked as 0.
    rank_exp = n_rows - rank_array(
        [0 if x[6] == "n.a." else float(x[6]) for x in data_table])

    rank_sum = [x + y + z for x, y, z in zip(rank_dbd, rank_dbs, rank_exp)]

    # Append the combined rank as the last column and sort on it.
    nd = [row + [str(rank)] for row, rank in zip(data_table, rank_sum)]

    nd = natsort_ob.natsorted(nd, key=lambda x: x[-1])
    html.add_zebra_table(header_list, col_size_list, type_list, nd,
                         align=10, cell_align="left", sortable=True)

    html.add_fixed_rank_sortable()
    html.write(os.path.join(path, "index.html"))
コード例 #12
0
ファイル: Main.py プロジェクト: jovesus/reg-gen
def list_all_index(path, show_RNA_ass_gene=False):
    """Create an 'index.html' in ``path`` listing its experiment directories.

    Args:
        path: Directory that contains ``profile.txt`` and the experiment
            sub-directories; ``index.html`` is written into it.
        show_RNA_ass_gene: When true, a "Closest genes" column (built from
            profile field 7 via ``split_gene_name``) is added.

    Every directory found by ``os.walk(path)`` whose name is a key of
    ``profile.txt`` produces one table row.  Records that are too short or
    carry a non-numeric p-value are reported on stdout and skipped; their
    row number is still consumed, so numbering may have gaps.
    """
    dirname = os.path.basename(path)

    link_d = {"List": "index.html"}
    html = Html(name="Directory: " + dirname, links_dict=link_d,
                fig_dir=os.path.join(path, "style"), fig_rpath="./style",
                RGT_header=False, other_logo="TDF")

    html.add_heading("All experiments in: " + dirname + "/")
    data_table = []
    type_list = 'sssssssssssss'
    col_size_list = [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
    c = 0
    if show_RNA_ass_gene:
        header_list = ["No.", "Experiments", "RNA", "Closest genes", "Organism",
                       "Target region", "No significant DBD", "Top DBD", "p-value"]
    else:
        header_list = ["No.", "Experiments", "RNA", "Organism",
                       "Target region", "No significant DBD", "Top DBD", "p-value"]

    # Load the tab-separated profile keyed by experiment name; the context
    # manager makes sure the file handle is released.
    profile = {}
    with open(os.path.join(path, "profile.txt"), 'r') as profile_f:
        for line in profile_f:
            fields = line.strip().split("\t")
            profile[fields[0]] = fields[1:]

    for _root, dirnames, _filenames in os.walk(path):
        for exp in dirnames:
            if exp not in profile:
                continue
            c += 1
            record = profile[exp]

            try:
                rna = record[0]
                # A record with fewer than seven fields fails to unpack
                # (ValueError) and is handled below.
                organism, target, n_sig_dbd, top_dbd, pvalue = record[2:7]
                gene = record[7] if show_RNA_ass_gene else None
                # "-" marks a missing p-value; anything else must be numeric.
                significant = pvalue != "-" and float(pvalue) < 0.05
            except (IndexError, ValueError):
                print("Error in loading profile: "+exp)
                continue

            # Experiments without a top DBD get a plain name, not a link.
            if top_dbd == "-":
                new_line = [str(c), exp, rna]
            else:
                new_line = [str(c),
                            '<a href="' + os.path.join(exp, "index.html") +
                            '">' + exp + "</a>",
                            rna]

            if show_RNA_ass_gene:
                new_line.append(split_gene_name(gene_name=gene, org=organism))

            # Significant p-values are highlighted in red.
            if significant:
                pvalue = "<font color=\"red\">" + pvalue + "</font>"

            new_line += [organism, target, n_sig_dbd, top_dbd, pvalue]
            data_table.append(new_line)

    html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=50, cell_align="left", sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(path, "index.html"))
コード例 #13
0
ファイル: Main.py プロジェクト: Marvin84/reg-gen
def list_all_index(path, link_d=None, show_RNA_ass_gene=False):
    """Create an 'index.html' in the given directory listing its experiments.

    The tab-separated file ``profile.txt`` inside ``path`` is read; the first
    field of every line is the experiment name and the remaining fields are
    laid out under the table headers as follows: field 0 -> "RNA",
    field 4 -> "No sig. DBD", field 5 -> "Top DBD", field 6 -> "p-value",
    field 2 -> "Organism", field 3 -> "Target region" and, when
    ``show_RNA_ass_gene`` is set, field 7 -> "Closest genes".

    Args:
        path: Directory that holds ``profile.txt``; ``index.html`` is written
            into the same directory.
        link_d: Optional links dictionary handed to ``Html``. When it is empty
            or None, ``{"List": "index.html"}`` is used.
        show_RNA_ass_gene: When true, insert the "Closest genes" column built
            with ``split_gene_name``.

    Rows that cannot be processed are skipped. The failure is printed unless
    the row key is "Experiment" (presumably the header line of profile.txt).
    """
    dirname = os.path.basename(path)

    if not link_d:
        link_d = {"List": "index.html"}

    html = Html(name="Directory: "+dirname, links_dict=link_d,
                fig_rpath="./style", fig_dir=os.path.join(path, "style"),
                RGT_header=False, other_logo="TDF", homepage="../index.html")

    html.add_heading("All experiments in: "+dirname+"/")

    data_table = []
    type_list = 'sssssssssssss'
    col_size_list = [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
    if show_RNA_ass_gene:
        header_list = ["No.", "Experiments", "RNA", "Closest genes",
                       "No sig. DBD",
                       "Top DBD", "p-value", "Organism", "Target region"]
    else:
        header_list = ["No.", "Experiments", "RNA", "No sig. DBD",
                       "Top DBD", "p-value", "Organism",
                       "Target region"]

    # Read the whole profile inside a context manager so the file handle is
    # closed even if a line cannot be processed.
    profile = {}
    with open(os.path.join(path, "profile.txt"), 'r') as profile_f:
        for line in profile_f:
            fields = line.strip().split("\t")
            profile[fields[0]] = fields[1:]

    c = 0
    for exp, fields in profile.items():
        # The counter advances for every profile entry, including entries
        # that are skipped below, exactly as before.
        c += 1

        try:
            # Experiments without a top DBD ("-") get no link to a sub-page.
            if fields[5] == "-":
                new_line = [str(c), exp, fields[0]]
            else:
                new_line = [str(c),
                            '<a href="'+os.path.join(exp, "index.html") +
                            '">'+exp+"</a>", fields[0]]

            if show_RNA_ass_gene:
                new_line.append(split_gene_name(gene_name=fields[7],
                                                org=fields[2]))

            # Highlight significant p-values in red; "-" means no p-value.
            p_value = fields[6]
            if p_value != "-" and float(p_value) < 0.05:
                p_value = "<font color=\"red\">"+p_value+"</font>"

            new_line += [fields[4], fields[5], p_value, fields[2], fields[3]]
            data_table.append(new_line)
        except Exception:
            # Best effort: a malformed row must not abort the whole index.
            # `Exception` (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            if exp != "Experiment":
                print("Error in loading profile: "+exp)
            continue

    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=10, cell_align="left", sortable=True)

    html.add_fixed_rank_sortable()
    html.write(os.path.join(path, "index.html"))
コード例 #14
0
ファイル: boxplot.py プロジェクト: eggduzao/reg-gen
    def gen_html(self, directory, title, align=50):
        """Write the boxplot report pages ``index.html`` and ``parameters.html``.

        Both pages are written into ``os.path.join(directory, title)``.
        ``index.html`` shows ``boxplot.png`` followed by, for every group in
        ``self.sortDict``, a square table of pairwise p-values computed with
        ``mannwhitneyu`` and then passed through ``multiple_correction``.
        P-values not greater than 0.05 are rendered in red.

        Args:
            directory: Parent output directory.
            title: Page title; also the name of the sub-directory written to.
            align: Alignment value forwarded to ``Html.add_zebra_table``
                (the p-value tables use ``align + 50``).
        """
        html_header = title
        link_d = OrderedDict()
        link_d["Boxplot"] = "index.html"
        link_d["Parameters"] = "parameters.html"

        html = Html(name=html_header, links_dict=link_d,
                    fig_rpath="../style", RGT_header=False, other_logo="viz", homepage="../index.html")

        html.add_figure("boxplot.png", align="center")

        type_list = 'ssssssssssssssssssssssssssssssssssssssssssssss'

        #### Calculate p value ####
        # plist[group][s1 + c1][s2 + c2] holds the p-value of comparing the
        # data of (s1, c1) against (s2, c2); a combination is never compared
        # with itself.
        plist = {}
        for g in self.sortDict.keys():
            plist[g] = {}
            for s1 in self.sortDict[g].keys():
                for c1 in self.sortDict[g][s1].keys():
                    data1 = self.sortDict[g][s1][c1]
                    plist[g][s1 + c1] = {}
                    for s2 in self.sortDict[g].keys():
                        for c2 in self.sortDict[g][s2].keys():
                            if s2 == s1 and c2 == c1:
                                continue
                            data2 = self.sortDict[g][s2][c2]
                            u, p_value = mannwhitneyu(data1, data2)
                            plist[g][s1 + c1][s2 + c2] = p_value

        print("Multiple test correction.")
        # NOTE(review): the return value is ignored, so multiple_correction
        # presumably adjusts plist in place - confirm.
        multiple_correction(plist)

        for g in self.sortDict.keys():
            html.add_heading(g, size=4, bold=False)
            data_table = []
            col_size_list = [15]
            header_list = ["p-value"]
            for s in self.sortDict[g].keys():
                # Iterate the columns of *this* s. The previous code indexed
                # with the stale loop variable `s1` left over from the p-value
                # computation, so the header could disagree with the rows
                # (or raise KeyError when that key is absent from the group).
                for c in self.sortDict[g][s].keys():
                    header_list.append(s + "\n" + c)
                    col_size_list.append(15)

            for s1 in self.sortDict[g].keys():
                for c1 in self.sortDict[g][s1].keys():
                    row = [s1 + "\n" + c1]
                    for s2 in self.sortDict[g].keys():
                        for c2 in self.sortDict[g][s2].keys():
                            if s2 == s1 and c2 == c1:
                                # Diagonal cell: nothing to compare.
                                row.append("-")
                            else:
                                p = plist[g][s1 + c1][s2 + c2]
                                if p > 0.05:
                                    row.append(value2str(p))
                                else:
                                    row.append("<font color=\"red\">" + value2str(p) + "</font>")
                    data_table.append(row)

            html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align + 50)

        html.write(os.path.join(directory, title, "index.html"))

        ## Parameters
        html = Html(name=html_header, links_dict=link_d,
                    fig_rpath="../style", RGT_header=False, other_logo="viz", homepage="../index.html")

        header_list = ["Assumptions and hypothesis"]
        col_size_list = [50]
        data_table = [['All the regions among different BED files are normalized by quantile normalization.'],
                      [
                          'If there is any grouping problem, please check all the optional columns in input experimental matrix.']]
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left")

        html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See parameters</a>'])
        html.add_free_content(['<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'])
        html.write(os.path.join(directory, title, "parameters.html"))
コード例 #15
0
ファイル: tdf_regiontest.py プロジェクト: eggduzao/reg-gen
    def gen_html(self, directory, parameters, obed, align=50, alpha=0.05, score=False):
        """Generate the HTML report of the genomic region test.

        Files written into ``directory``:

        * ``index.html`` - one row per DNA binding domain in ``self.rbss``.
        * ``dbd_region.html`` - target regions per DNA binding domain.
        * ``target_regions.html`` - every region of ``self.dna_region`` with
          its DBS count, DBS coverage and sum of ranks.
        * ``<obed>_target_regions.bed`` - ``self.dna_region`` written as BED.
        * ``starget_regions.html`` - regions bound by significant DBDs
          (only written when there is at least one significant DBD).
        * ``region_dbs.html`` - binding sites per target region.
        * ``parameters.html`` - the parameters of the run.

        Side effects: ``self.dna_region`` is sorted and the ``data`` attribute
        of each of its regions is overwritten with the tab-joined DBS count,
        DBS coverage, (score) and sum of ranks.

        Args:
            directory: Output directory of the report.
            parameters: Parsed command-line arguments shown in parameters.html.
            obed: Prefix of the BED file written for the target regions.
            align: Alignment value forwarded to ``Html.add_zebra_table``.
            alpha: P-values below this threshold are rendered in red.
            score: When true, the first tab-separated field of every region's
                ``data`` is read as a score and included in the ranking.

        Raises:
            ValueError, IndexError, AttributeError: when ``score`` is true and
                a region carries no parsable score (a hint is printed first).
        """
        dir_name = os.path.basename(directory)
        html_header = "Genomic Region Test: " + dir_name
        link_ds = OrderedDict()
        link_ds["RNA"] = "index.html"
        link_ds["Sig Target Regions"] = "starget_regions.html"
        link_ds["Target Regions"] = "target_regions.html"
        link_ds["Parameters"] = "parameters.html"

        ##################################################
        # index.html

        html = Html(name=html_header, links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
                    fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html")
        # Plots
        html.add_figure("lineplot_region.png", align="left", width="45%", more_images=["boxplot_regions.png"])
        if self.showdbs:
            html.add_figure("lineplot_dbs.png", align="left", width="45%", more_images=["boxplot_dbs.png"])

        if self.showdbs:
            header_list = [["#", "DBD", "Target Regions", None, "Non-target Regions", None, "Statistics",
                            "Target Regions", "Non-target Regions", None, "Statistics"],
                           ["", "", "with DBS", "without DBS", "with DBS (average)", "s.d.", "<i>p</i>-value",
                            "NO. DBSs", "NO. DBSs (average)", "s.d.", "<i>p</i>-value"]]
            header_titles = [["Rank", "DNA Binding Domain", "Given target regions on DNA", None,
                              "Regions from randomization", None, "Statistics based on target regions",
                              "Given target regions on DNA", "Regions from randomization", None,
                              "Statistics based on DNA Binding Sites"],
                             ["", "",
                              "Number of target regions with DBS binding",
                              "Number of target regions without DBS binding",
                              "Average number of regions from randomization with DBS binding",
                              "Standard deviation", "P value",
                              "Number of related DNA Binding Sites binding to target regions",
                              "Average number of DNA Binding Sites binding to random regions",
                              "Standard deviation", "P-value"]]
            border_list = [" style=\"border-right:1pt solid gray\"",
                           " style=\"border-right:1pt solid gray\"", "",
                           " style=\"border-right:1pt solid gray\"", "",
                           " style=\"border-right:1pt solid gray\"",
                           " style=\"border-right:2pt solid gray\"",
                           " style=\"border-right:1pt solid gray\"", "",
                           " style=\"border-right:1pt solid gray\"",
                           " style=\"border-right:1pt solid gray\""]
        else:
            header_list = [["#", "DBD", "Target Regions", None, "Non-target Regions", None, "Statistics", None],
                           ["", "", "with DBS", "without DBS", "with DBS (average)", "s.d.", "<i>p</i>-value",
                            "z-score"]]
            header_titles = [["Rank", "DNA Binding Domain", "Given target regions on DNA", None,
                              "Regions from randomization", None, "Statistics based on target regions", None],
                             ["", "",
                              "Number of target regions with DBS binding",
                              "Number of target regions without DBS binding",
                              "Average number of regions from randomization with DBS binding",
                              "Standard deviation", "P value", "Z-score"]]
            border_list = [" style=\"border-right:1pt solid gray\"",
                           " style=\"border-right:1pt solid gray\"", "",
                           " style=\"border-right:1pt solid gray\"", "",
                           " style=\"border-right:1pt solid gray\"",
                           " style=\"border-right:1pt solid gray\"", ""]

        type_list = 'ssssssssssssssss'
        col_size_list = [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]
        data_table = []

        for i, rbs in enumerate(self.rbss):
            if self.data["region"]["p"][i] < alpha:
                p_region = "<font color=\"red\">" + value2str(self.data["region"]["p"][i]) + "</font>"
            else:
                p_region = value2str(self.data["region"]["p"][i])
            # z-score of the observed count against the randomization.
            # NOTE(review): not guarded against a standard deviation of 0 -
            # confirm the upstream data can never produce one.
            zs = (self.counts_tr[rbs][0] - self.data["region"]["ave"][i]) / self.data["region"]["sd"][i]
            new_line = [str(i + 1),
                        rbs.str_rna(pa=False),
                        '<a href="dbd_region.html#' + rbs.str_rna() +
                        '" style="text-align:left">' + str(self.counts_tr[rbs][0]) + '</a>',
                        str(self.counts_tr[rbs][1]),
                        value2str(self.data["region"]["ave"][i]),
                        value2str(self.data["region"]["sd"][i]),
                        p_region,
                        value2str(zs)]
            if self.showdbs:
                # NOTE(review): with showdbs the headers above define 11
                # columns, but each row carries 12 cells because the z-score
                # is always included - confirm how add_zebra_table copes.
                if self.data["dbs"]["p"][i] < alpha:
                    p_dbs = "<font color=\"red\">" + value2str(self.data["dbs"]["p"][i]) + "</font>"
                else:
                    p_dbs = value2str(self.data["dbs"]["p"][i])

                new_line += [str(self.counts_dbs[rbs]),
                             value2str(self.data["dbs"]["ave"][i]),
                             value2str(self.data["dbs"]["sd"][i]),
                             p_dbs]
            data_table.append(new_line)

        # Order the rows by the region p-value cell (a string, possibly
        # wrapped in a <font> tag).
        data_table = natsort.natsorted(data_table, key=lambda x: x[6])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                             auto_width=True, header_titles=header_titles, border_list=border_list, sortable=True)

        html.add_heading("Notes")
        html.add_list(["RNA name: " + self.rna_name,
                       "Randomization is performed for " + str(self.repeats) + " times.",
                       "DBD stands for DNA Binding Domain on RNA.",
                       "DBS stands for DNA Binding Site on DNA."])
        html.add_fixed_rank_sortable()
        html.write(os.path.join(directory, "index.html"))

        #############################################################
        # RNA subpage: Profile of targeted regions for each merged DNA Binding Domain
        #############################################################

        header_list = ["#", "Target Region",
                       "Associated Gene",
                       "No. of DBSs",
                       "DBS coverage"]
        header_titles = ["Rank", "Given target regions from BED files",
                         "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                         "Number of DNA Binding Sites locate within the region",
                         "The proportion of the region covered by DBS binding"]

        #########################################################
        # dbd_region.html
        html = Html(name=html_header, links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
                    fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html")

        for rbsm in self.rbss:
            # The idtag is the anchor targeted by the links in index.html.
            html.add_heading("DNA Binding Domain: " + rbsm.str_rna(),
                             idtag=rbsm.str_rna())
            data_table = []
            for i, region in enumerate(self.txp.merged_dict[rbsm]):
                # Add information
                data_table.append([str(i + 1),
                                   '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism +
                                   "&position=" + region.chrom + "%3A" + str(region.initial) + "-" + str(region.final) +
                                   '" style="text-align:left">' + region.toString(space=True) + '</a>',
                                   split_gene_name(gene_name=region.name, org=self.organism),
                                   str(len(self.region_dbs[region.toString()])),
                                   value2str(self.region_coverage[region.toString()])
                                   ])

            html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                                 auto_width=True, header_titles=header_titles, sortable=True)
        html.add_fixed_rank_sortable()
        html.write(os.path.join(directory, "dbd_region.html"))

        #############################################################
        # Targeted regions centered
        #############################################################

        ##############################################################################################
        # target_regions.html
        html = Html(name=html_header, links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
                    fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html")

        if score:
            header_list = ["#", "Target region", "Associated Gene", "DBSs Count",
                           "DBS coverage", "Score", "Sum of ranks"]
            header_titles = ["Rank",
                             "Target regions loaded from the given BED file",
                             "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                             "Number of DNA Binding Sites within the region",
                             "The proportion of the region covered by DBS binding",
                             "Scores from BED file",
                             "Sum of all the left-hand-side ranks"]
        else:
            header_list = ["#", "Target region", "Associated Gene", "DBSs Count",
                           "DBS coverage", "Sum of ranks"]
            header_titles = ["Rank",
                             "Target regions loaded from the given BED file",
                             "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                             "Number of DNA Binding Sites within the region",
                             "The proportion of the region covered by DBS binding",
                             "Sum of all the left-hand-side ranks"]
        html.add_heading("Target Regions")
        data_table = []

        if not self.dna_region.sorted: self.dna_region.sort()

        # Calculate the ranking
        # NOTE(review): subtracting rank_array(...) from the set size
        # presumably makes larger counts/coverages rank first - confirm.
        rank_count = len(self.dna_region) - rank_array([len(self.region_dbs[p.toString()]) for p in self.dna_region])
        rank_coverage = len(self.dna_region) - rank_array([self.region_coverage[p.toString()] for p in self.dna_region])

        if score:
            try:
                score_list = [float(p.data.split("\t")[0]) for p in self.dna_region]
            except (ValueError, IndexError, AttributeError):
                # A missing/non-numeric score surfaces as one of these errors
                # (the former `except ImportError` could never trigger), so
                # print the hint and let the original error propagate.
                print("There is no score in BED file, please don't use '-score' argument.")
                raise
            # Remember the BED scores per region: region.data is overwritten
            # below, and the significant-targets page needs the scores again.
            original_scores = dict((p.toString(), s) for p, s in zip(self.dna_region, score_list))
            rank_score = len(self.dna_region) - rank_array([abs(s) for s in score_list])
            rank_sum = [x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score)]
        else:
            rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]

        for i, region in enumerate(self.dna_region):
            dbs_counts = str(len(self.region_dbs[region.toString()]))
            dbs_cover = value2str(self.region_coverage[region.toString()])

            newline = [str(i + 1),
                       '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism +
                       "&position=" + region.chrom + "%3A" + str(region.initial) + "-" + str(region.final) +
                       '" style="text-align:left">' + region.toString(space=True) + '</a>',
                       split_gene_name(gene_name=region.name, org=self.organism),
                       '<a href="region_dbs.html#' + region.toString() +
                       '" style="text-align:left">' + dbs_counts + '</a>',
                       dbs_cover]

            # region.data is replaced so the BED file written below carries
            # the computed statistics.
            if score:
                dbs_score = value2str(score_list[i])
                region.data = "\t".join([dbs_counts, dbs_cover, dbs_score, str(rank_sum[i])])
                newline.append(dbs_score)
                newline.append(str(rank_sum[i]))
            else:
                region.data = "\t".join([dbs_counts, dbs_cover, str(rank_sum[i])])
                newline.append(str(rank_sum[i]))
            data_table.append(newline)

        data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                             auto_width=True, header_titles=header_titles, sortable=True)
        html.add_heading("Notes")
        html.add_list(["All target regions without any bindings are ignored."])
        html.add_fixed_rank_sortable()
        html.write(os.path.join(directory, "target_regions.html"))

        self.dna_region.sort_score()
        self.dna_region.write_bed(os.path.join(directory, obed + "_target_regions.bed"))

        ##############################################################################################
        # starget_regions.html    for significant target regions

        stargets = GenomicRegionSet("sig_targets")
        sig_dbs = {}
        sig_dbs_coverage = {}
        for r in self.dna_region:
            sig_bindings = self.region_dbs[r.toString()].overlap_rbss(rbss=self.data["region"]["sig_region"])
            dbs = sig_bindings.get_dbs()
            if len(dbs) > 0:
                stargets.add(r)
                m_dbs = dbs.merge(w_return=True)
                sig_dbs[r] = len(dbs)
                sig_dbs_coverage[r] = float(m_dbs.total_coverage()) / len(r)

        html = Html(name=html_header, links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
                    fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html")

        # Select promoters in sig DBD
        if len(self.data["region"]["sig_region"]) == 0:
            # NOTE(review): this page is never written in this branch, so the
            # "Sig Target Regions" link has no target - confirm whether the
            # write below should happen in both cases.
            html.add_heading("There is no significant DBD.")
        else:
            html.add_heading("Target regions bound by significant DBD")
            data_table = []
            # Calculate the ranking
            rank_count = len(stargets) - rank_array([sig_dbs[p] for p in stargets])
            rank_coverage = len(stargets) - rank_array([sig_dbs_coverage[p] for p in stargets])
            if score:
                # Use the scores captured before region.data was overwritten;
                # re-parsing region.data here would return the DBS count.
                score_list = [original_scores[p.toString()] for p in stargets]
                rank_score = len(stargets) - rank_array([abs(s) for s in score_list])
                rank_sum = [x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score)]
            else:
                rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]

            for i, region in enumerate(stargets):
                dbssount = '<a href="region_dbs.html#' + region.toString() + \
                           '" style="text-align:left">' + str(sig_dbs[region]) + '</a>'

                region_link = region_link_internet(self.organism, region)

                newline = [str(i + 1), region_link,
                           split_gene_name(gene_name=region.name, org=self.organism),
                           dbssount, value2str(sig_dbs_coverage[region])]
                if score:
                    dbs_score = value2str(score_list[i])
                    newline.append(dbs_score)
                    newline.append(str(rank_sum[i]))
                else:
                    newline.append(str(rank_sum[i]))

                data_table.append(newline)

            data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
            html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                                 header_titles=header_titles, border_list=None, sortable=True)
            html.add_heading("Notes")
            html.add_list(["DBS stands for DNA Binding Site on DNA.",
                           "DBS coverage is the proportion of the region where has potential to form triple helices with the given RNA."])
            html.add_fixed_rank_sortable()
            html.write(os.path.join(directory, "starget_regions.html"))

        ############################
        # Subpages for targeted region centered page
        # region_dbs.html
        header_list = ["RBS", "DBS", "Strand", "Score", "Motif", "Orientation"]

        html = Html(name=html_header, links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
                    fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html")

        for i, region in enumerate(self.dna_region):
            # Regions without any binding site get no section.
            if len(self.region_dbs[region.toString()]) == 0:
                continue
            else:
                html.add_heading("Associated gene: " + split_gene_name(gene_name=region.name, org=self.organism),
                                 idtag=region.toString())
                html.add_free_content(['<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism +
                                       "&position=" + region.chrom + "%3A" + str(region.initial) +
                                       "-" + str(region.final) + '" style="margin-left:50">' +
                                       region.toString(space=True) + '</a>'])
                data_table = []
                for rd in self.region_dbs[region.toString()]:
                    rbs = rd.rna.str_rna(pa=False)
                    # One <font> wrapper is added per overlapping significant
                    # region, as in the original implementation.
                    for rbsm in self.data["region"]["sig_region"]:
                        if rd.rna.overlap(rbsm):
                            rbs = "<font color=\"red\">" + rbs + "</font>"
                    data_table.append([rbs,
                                       '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism +
                                       "&position=" + rd.dna.chrom + "%3A" + str(rd.dna.initial) + "-" + str(
                                           rd.dna.final) +
                                       '" style="text-align:left">' + rd.dna.toString(space=True) + '</a>',
                                       rd.dna.orientation, rd.score, rd.motif, rd.orient])
                html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                                     auto_width=True)
        html.write(os.path.join(directory, "region_dbs.html"))

        ################################################################################
        # parameters.html

        html = Html(name=html_header, links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
                    fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html")
        html.add_heading("Parameters")
        header_list = ["Description", "Arguments", "Value"]

        data_table = [["RNA sequence name", "-rn", parameters.rn],
                      ["Input RNA sequence file", "-r", os.path.basename(parameters.r)],
                      ["Input BED file", "-bed", os.path.basename(parameters.bed)],
                      ["Output directory", "-o", os.path.basename(parameters.o)],
                      ["Organism", "-organism", parameters.organism],
                      ["Number of repitetion of andomization", "-n", str(parameters.n)],
                      ["Alpha level for rejection p value", "-a", str(parameters.a)],
                      ["Cut off value for filtering out the low counts of DBSs", "-ccf", str(parameters.ccf)],
                      ["Remove temporary files", "-rt", str(parameters.rt)],
                      ["Input BED file for masking in randomization", "-f", str(parameters.f)],
                      ["Input file for RNA accecibility", "-ac", str(parameters.ac)],
                      ["Cut off value for RNA accecibility", "-accf", str(parameters.accf)],
                      ["Output the BED files for DNA binding sites.", "-obed", str(parameters.obed)],
                      ["Show parallel and antiparallel bindings in the plot separately.", "-showpa",
                       str(parameters.showpa)],
                      ["Minimum length", "-l", str(self.triplexator_p[0])],
                      ["Maximum error rate", "-e", str(self.triplexator_p[1])],
                      ["Tolerated number of consecutive errors", "-c", str(self.triplexator_p[2])],
                      ["Filtering repeats", "-fr", str(self.triplexator_p[3])],
                      ["Filtering mode", "-fm", str(self.triplexator_p[4])],
                      ["Output format", "-of", str(self.triplexator_p[5])],
                      ["Merge features", "-mf", str(self.triplexator_p[6])]]
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                             auto_width=True)
        html.add_free_content(['<a href="summary.txt" style="margin-left:100">See details</a>'])
        html.write(os.path.join(directory, "parameters.html"))
コード例 #16
0
ファイル: Main.py プロジェクト: eggduzao/reg-gen
def main():
    ##########################################################################
    ##### PARAMETERS #########################################################
    ##########################################################################
    
    parser = argparse.ArgumentParser(description='Triplex Domain Finder is a statistical framework \
                                                  for detection of triple helix potential of \
                                                  lncRNAs from genome-wide functional data. \
                                                  Author: Chao-Chung Kuo\
                                                  \nVersion: ' + __version__,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    
    subparsers = parser.add_subparsers(help='sub-command help',dest='mode')
    
    ################### Promoter test ##########################################

    h_promotor = "Promoter test evaluates the association between the given lncRNA to the target promoters."
    parser_promotertest = subparsers.add_parser('promotertest', help=h_promotor)
    parser_promotertest.add_argument('-r', type=str, metavar='  ', help="Input file name for RNA sequence (in fasta format)")
    parser_promotertest.add_argument('-rl', type=str, default=None, metavar='  ', help="Input list for paths to all RNA sequences (in fasta format)")
    parser_promotertest.add_argument('-rn', type=str, default=None, metavar='  ', help="Define the RNA name")
    parser_promotertest.add_argument('-de', default=False, metavar='  ', help="Input file for target gene list (gene symbols or Ensembl ID)")
    parser_promotertest.add_argument('-bed', default=False, metavar='  ', help="Input BED file of the promoter regions of target genes")
    parser_promotertest.add_argument('-bg', default=False, metavar='  ', help="Input BED file of the promoter regions of background genes")
    parser_promotertest.add_argument('-o', metavar='  ', help="Output directory name for all the results")
    parser_promotertest.add_argument('-t', metavar='  ', default=False, help="Define the title name for the results under the Output name. (default: %(default)s)")
    
    parser_promotertest.add_argument('-organism', metavar='  ', help='Define the organism (hg19 or mm9)')
    parser_promotertest.add_argument('-gtf', metavar='  ', default=None, help='Define the GTF file for annotation (optional)')

    parser_promotertest.add_argument('-pl', type=int, default=1000, metavar='  ', help="Define the promotor length (default: %(default)s)")
    
    parser_promotertest.add_argument('-showdbs', action="store_true", help="Show the plots and statistics of DBS (DNA Binding sites)")
    parser_promotertest.add_argument('-score', action="store_true", help="Load score column from input gene list or BED file for analysis.")
    parser_promotertest.add_argument('-scoreh', action="store_true", help="Use the header of scores from the given gene list or BED file.")
    parser_promotertest.add_argument('-a', type=float, default=0.05, metavar='  ', help="Define significance level for rejection null hypothesis (default: %(default)s)")
    parser_promotertest.add_argument('-ccf', type=int, default=100, metavar='  ', help="Define the cut off value for promoter counts (default: %(default)s)")
    parser_promotertest.add_argument('-rt', action="store_true", default=False, help="Remove temporary files (fa, txp...etc)")
    parser_promotertest.add_argument('-log', action="store_true", default=False, help="Set the plots in log scale")
    parser_promotertest.add_argument('-ac', type=str, default=False, metavar='  ', help="Input file for RNA accecibility ")
    parser_promotertest.add_argument('-accf', type=float, default=500, metavar='  ', help="Define the cut off value for RNA accecibility")
    parser_promotertest.add_argument('-obed', action="store_true", default=True, help="Output the BED files for DNA binding sites.")
    parser_promotertest.add_argument('-showpa', action="store_true", default=False, help="Show parallel and antiparallel bindings in the plot separately.")
    # parser_promotertest.add_argument('-motif', action="store_true", default=False, help="Show motif of binding sites.")
    parser_promotertest.add_argument('-filter_havana', type=str, default="F", metavar='  ', help="Apply filtering to remove HAVANA entries.")
    parser_promotertest.add_argument('-protein_coding', type=str, default="F", metavar='  ', help="Apply filtering to get only protein coding genes.")
    parser_promotertest.add_argument('-known_only', type=str, default="F", metavar='  ', help="Apply filtering to get only known genes.")
    parser_promotertest.add_argument('-dump', action="store_true", default=False, help="Only dump the experimental file and leave the program.")
    parser_promotertest.add_argument('-rnaexp', type=str, default=None, metavar='  ',
                                     help="Given a file with RNA name and the expression value")

    parser_promotertest.add_argument('-l', type=int, default=20, metavar='  ', help="[Triplexator] Define the minimum length of triplex (default: %(default)s)")
    parser_promotertest.add_argument('-e', type=int, default=20, metavar='  ', help="[Triplexator] Set the maximal error-rate in %% tolerated (default: %(default)s)")
    parser_promotertest.add_argument('-c', type=int, default=2, metavar='  ', help="[Triplexator] Sets the tolerated number of consecutive errors with respect to the canonical triplex rules as such were found to greatly destabilize triplexes in vitro (default: %(default)s)")
    parser_promotertest.add_argument('-fr', type=str, default="off", metavar='  ', help="[Triplexator] Activates the filtering of low complexity regions and repeats in the sequence data (default: %(default)s)")
    parser_promotertest.add_argument('-fm', type=int, default=0, metavar='  ', help="[Triplexator] Method to quickly discard non-hits (default: %(default)s).'0' = greedy approach; '1' = q-gram filtering.")
    parser_promotertest.add_argument('-of', type=int, default=1, metavar='  ', help="[Triplexator] Define output formats of Triplexator (default: %(default)s)")
    parser_promotertest.add_argument('-mf', action="store_true", default=False, help="[Triplexator] Merge overlapping features into a cluster and report the spanning region.")
    parser_promotertest.add_argument('-rm', type=int, default=0, metavar='  ', help="[Triplexator] Set the multiprocessing")
    parser_promotertest.add_argument('-par', type=str, default="", metavar='  ', help="[Triplexator] Define other parameters for Triplexator")
    
    ################### Genomic Region Test ##########################################
    h_region = "Genomic region test evaluates the association between the given lncRNA to the target regions by randomization."
    parser_randomtest = subparsers.add_parser('regiontest', help=h_region)
    parser_randomtest.add_argument('-r', type=str, metavar='  ', help="Input file name for RNA sequence (in fasta format)")
    parser_randomtest.add_argument('-rl', type=str, default=None, metavar='  ', help="Input list for paths to all RNA sequences (in fasta format)")
    parser_randomtest.add_argument('-rn', type=str, default=False, metavar='  ', help="Define the RNA name")
    parser_randomtest.add_argument('-bed', metavar='  ', help="Input BED file for interested regions on DNA")
    parser_randomtest.add_argument('-o', metavar='  ', help="Output directory name for all the results and temporary files")
    parser_randomtest.add_argument('-t', metavar='  ', default=False, help="Define the title name for the results under the Output name. (default: %(default)s)")
    
    parser_randomtest.add_argument('-n', type=int, default=10000, metavar='  ', 
                                   help="Number of times for randomization (default: %(default)s)")

    parser_randomtest.add_argument('-organism', metavar='  ', help='Define the organism (hg19 or mm9)')
 
    parser_randomtest.add_argument('-showdbs', action="store_true", help="Show the plots and statistics of DBS (DNA Binding sites)")
    parser_randomtest.add_argument('-score', action="store_true", help="Load score column from input BED file")
    parser_randomtest.add_argument('-a', type=float, default=0.05, metavar='  ', help="Define significance level for rejection null hypothesis (default: %(default)s)")
    parser_randomtest.add_argument('-ccf', type=int, default=40, metavar='  ', help="Define the cut off value for DBS counts (default: %(default)s)")
    parser_randomtest.add_argument('-rt', action="store_true", default=False, help="Remove temporary files (fa, txp...etc)")
    parser_randomtest.add_argument('-log', action="store_true", default=False, help="Set the plots in log scale")
    parser_randomtest.add_argument('-f', type=str, default=False, metavar='  ', help="Input BED file as mask in randomization")
    parser_randomtest.add_argument('-ac', type=str, default=False, metavar='  ', help="Input file for RNA accecibility ")
    parser_randomtest.add_argument('-accf', type=float, default=500, metavar='  ', help="Define the cut off value for RNA accecibility")
    parser_randomtest.add_argument('-obed', action="store_true", default=True, help="Output the BED files for DNA binding sites.")
    parser_randomtest.add_argument('-showpa', action="store_true", default=False, help="Show parallel and antiparallel bindings in the plot separately.")
    
    parser_randomtest.add_argument('-l', type=int, default=20, metavar='  ', help="[Triplexator] Define the minimum length of triplex (default: %(default)s)")
    parser_randomtest.add_argument('-e', type=int, default=20, metavar='  ', help="[Triplexator] Set the maximal error-rate in %% tolerated (default: %(default)s)")
    parser_randomtest.add_argument('-c', type=int, default=2, metavar='  ', help="[Triplexator] Sets the tolerated number of consecutive errors with respect to the canonical triplex rules as such were found to greatly destabilize triplexes in vitro (default: %(default)s)")
    parser_randomtest.add_argument('-fr', type=str, default="off", metavar='  ', help="[Triplexator] Activates the filtering of low complexity regions and repeats in the sequence data (default: %(default)s)")
    parser_randomtest.add_argument('-fm', type=int, default=0, metavar='  ', help="[Triplexator] Method to quickly discard non-hits (default: %(default)s).'0' = greedy approach; '1' = q-gram filtering.")
    parser_randomtest.add_argument('-of', type=int, default=1, metavar='  ', help="[Triplexator] Define output formats of Triplexator (default: %(default)s)")
    parser_randomtest.add_argument('-mf', action="store_true", default=False, help="[Triplexator] Merge overlapping features into a cluster and report the spanning region.")
    parser_randomtest.add_argument('-rm', type=int, default=0, metavar='  ', help="[Triplexator] Set the multiprocessing")
    parser_randomtest.add_argument('-par', type=str, default="", metavar='  ', help="[Triplexator] Define other parameters for Triplexator")

    ##########################################################################
    parser_bed2bed = subparsers.add_parser('get_dbss', help="Get DBSs in BED format from the single BED file")
    parser_bed2bed.add_argument('-i',type=str, metavar='  ', help='Input BED file of the target regions')
    parser_bed2bed.add_argument('-dbs',type=str, metavar='  ', help='Output BED file of the DBSs')
    parser_bed2bed.add_argument('-rbs',type=str, metavar='  ', help='Output BED file of the RBSs')
    parser_bed2bed.add_argument('-r',type=str, metavar='  ', help='Input FASTA file of the RNA')
    parser_bed2bed.add_argument('-organism', metavar='  ', help='Define the organism (hg19 or mm9)')
    parser_bed2bed.add_argument('-l', type=int, default=20, metavar='  ', help="[Triplexator] Define the minimum length of triplex (default: %(default)s)")
    parser_bed2bed.add_argument('-e', type=int, default=20, metavar='  ', help="[Triplexator] Set the maximal error-rate in %% tolerated (default: %(default)s)")
    parser_bed2bed.add_argument('-c', type=int, default=2, metavar='  ', help="[Triplexator] Sets the tolerated number of consecutive errors with respect to the canonical triplex rules as such were found to greatly destabilize triplexes in vitro (default: %(default)s)")
    parser_bed2bed.add_argument('-fr', type=str, default="off", metavar='  ', help="[Triplexator] Activates the filtering of low complexity regions and repeats in the sequence data (default: %(default)s)")
    parser_bed2bed.add_argument('-fm', type=int, default=0, metavar='  ', help="[Triplexator] Method to quickly discard non-hits (default: %(default)s).'0' = greedy approach; '1' = q-gram filtering.")
    parser_bed2bed.add_argument('-of', type=int, default=1, metavar='  ', help="[Triplexator] Define output formats of Triplexator (default: %(default)s)")
    parser_bed2bed.add_argument('-mf', action="store_true", default=False, help="[Triplexator] Merge overlapping features into a cluster and report the spanning region.")
    parser_bed2bed.add_argument('-rm', type=int, default=0, metavar='  ', help="[Triplexator] Set the multiprocessing")
    
    ##########################################################################
    # rgt-TDF integrate -path 
    parser_integrate = subparsers.add_parser('integrate', help="Integrate the project's links and generate project-level statistics.")
    parser_integrate.add_argument('-path',type=str, metavar='  ', help='Define the path of the project.')
    ##########################################################################
    parser_updatehtml = subparsers.add_parser('updatehtml', help="Update the project's html.")
    parser_updatehtml.add_argument('-path',type=str, metavar='  ', help='Define the path of the project.')
    parser_updatehtml.add_argument('-exp', type=str, metavar='  ', help='Define file with expression data.')

    ################### Parsing the arguments ################################
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    elif len(sys.argv) == 2:  
        # retrieve subparsers from parser
        subparsers_actions = [action for action in parser._actions if isinstance(action, argparse._SubParsersAction)]
        # there will probably only be one subparser_action, but better safe than sorry
        for subparsers_action in subparsers_actions:
            # get all subparsers and print help
            for choice, subparser in subparsers_action.choices.items():
                if choice == sys.argv[1]:
                    print("\nYou need more arguments.")
                    print("\nSubparser '{}'".format(choice))        
                    subparser.print_help()
        sys.exit(1)
    else:   
        args = parser.parse_args()

        ####################################################################################
        ######### Integration
        if args.mode == "integrate":
            condition_list = [] # name, link, no. tests, no. sig.
            for item in os.listdir(args.path):
                if item == "style": continue
                if os.path.isfile(os.path.join(args.path,item)): continue
                elif os.path.isdir(os.path.join(args.path,item)):
                    h = os.path.join(item, "index.html")
                    pro = os.path.join(args.path, item, "profile.txt")
                    if os.path.isfile(pro):
                        integrate_stat(path=os.path.join(args.path, item))
                        nt = 0
                        ns = 0
                        with open(pro) as f:
                            for line in f:
                                line = line.strip().split("\t")
                                if line[0] == "Experiment": continue
                                nt += 1
                                if float(line[7]) < 0.05: ns += 1
                        # print([item, h, str(nt), str(ns)])
                        condition_list.append( [item, h, str(nt), str(ns)] )
            # print(condition_list)
            link_d = {"List":"index.html"}
            fp = condition_list[0][0] + "/style"
            html = Html(name="Directory: "+args.path, links_dict=link_d, 
                        fig_rpath=fp, #fig_dir=fp, 
                        RGT_header=False, other_logo="TDF")
            html.add_heading("All conditions in: "+args.path+"/")
            data_table = []
            type_list = 'sssssssssssss'
            col_size_list = [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
            c = 0
            header_list = ["No.", "Conditions", "No. tests", "No. sig. tests" ]
            for i, exp in enumerate(condition_list):
                c += 1
                data_table.append([str(c), 
                                   '<a href="'+exp[1]+'">'+exp[0]+"</a>",
                                   exp[2], exp[3] ])
            html.add_zebra_table( header_list, col_size_list, type_list, data_table, 
                                  align=10, cell_align="left", sortable=True)
            html.add_fixed_rank_sortable()
            html.write(os.path.join(args.path,"index.html"))
            gen_heatmap(path=args.path)
            generate_rna_exp_pv_table(root=args.path, multi_corr=False)
            merge_DBD_regions(path=args.path)

            sys.exit(0)

        ####################################################################################
        ######### updatehtml
        elif args.mode == "updatehtml":
            for item in os.listdir(args.path):
                pro = os.path.join(args.path, item, "profile.txt")
                if os.path.isfile(pro): update_profile(dirpath=os.path.join(args.path, item),
                                                       expression=args.exp)
            revise_index(root=args.path)
            generate_rna_exp_pv_table(root=args.path, multi_corr=True)
            sys.exit(0)
        
        ####################################################################################
        ######### get_dbss
        elif args.mode == "get_dbss":

            get_dbss(input_BED=args.i,output_BED=args.dbs,rna_fasta=args.r,output_rbss=args.rbs,
                     organism=args.organism,l=args.l,e=args.e,c=args.c,
                     fr=args.fr,fm=args.fm,of=args.of,mf=args.mf,rm=args.rm,temp=dir)
            os.remove("dna_targeted_region.fa")
            os.remove("dna_targeted_region.txp")
            os.remove("rna_temp.fa")
            sys.exit(0)


        #######################################################################
        #### Checking arguments
        if not args.o: 
            print("Please define the output directory name. \n")
            sys.exit(1)
        if not args.organism: 
            print("Please define the organism. (hg19 or mm9)")
            sys.exit(1)
        if not args.rn and not args.rl: 
            print("Please define RNA sequence name.")
            sys.exit(1)
        if args.r and args.rl:
            print("Both -r and -rl are given. TDF will skip -r and process -rl ")
        if args.rl:
            with open(args.rl) as f:
                for line in f:
                    line = line.strip()
                    rn = os.path.basename(line).rpartition(".")[0]
                    print("\tProcessing: "+rn)
                    command = ["rgt-TDF", args.mode, 
                               "-r", line, "-rn", rn,
                               "-o", os.path.join(args.o, rn),
                               "-organism", args.organism ]
                    if args.de and not args.bed: command += ["-de", args.de]
                    if args.bed and args.bg: command += ["-bed", args.bed, "-bg", args.bg]

                    if args.score: command += ["-score"]
                    if args.rt: command += ["-rt" ]
                    if args.pl != 1000: command += ["-pl", args.pl]
                    if args.ccf != 40: command += ["-ccf", args.ccf]
                    if args.obed: command += ["-obed"]
                    if args.a != 0.05: command += ["-a", args.a]
                    if args.filter_havana == 'F': command += ["-filter_havana", 'F']
                    if args.protein_coding == 'T': command += ["-protein_coding", 'T']
                    if args.known_only == 'F': command += ["-known_only", 'F']
                    
                    if args.rm > 0: command += ["-rm", args.rm ]
                    if args.fr != 'off': command += ["-fr", args.fr ]
                    if args.c != 2: command += ["-c", args.c ]
                    if args.e != 20: command += ["-e", args.e ]
                    if args.of != 1: command += ["-of", args.of ]
                    if args.l != 15: command += ["-l", args.l ]
                    if args.fr != 'off': command += ["-fr", args.fr ]
                    if args.fr != 'off': command += ["-fr", args.fr ]
                    if args.fr != 'off': command += ["-fr", args.fr ]
                    subprocess.call(command)
            sys.exit(0)

        t0 = time.time()
        # Normalised output path
        if not args.t: title = args.rn
        else: title = args.t
        
        args.o = os.path.normpath(os.path.join(dir,args.o,title))
        check_dir(os.path.dirname(os.path.dirname(args.o)))
        check_dir(os.path.dirname(args.o))
        check_dir(args.o)
        # Input parameters dictionary
        summary = []
        summary.append("Time: " + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        summary.append("User: "******"\nCommand:\n\t$ " + " ".join(sys.argv))
           
    ################################################################################
    ##### Promoter Test ############################################################
    ################################################################################
    if args.mode == 'promotertest':


################################################################################################3

        if args.bed and not args.bg:
            print("Please add background promoters in BED format. (-bg)")
            sys.exit(1)
        if args.scoreh and not args.score:
            print("Score header (-scoreh) can only be used when scores (-score) are loaded.")
            print("Please add '-score'.")
            sys.exit(1)

        print2(summary, "\n"+"*************** Promoter Test ****************")
        print2(summary, "*** Input RNA sequence: "+args.r)
        
        if args.o.count("/") < 3:
            print2(summary, "*** Output directory: "+ args.o)
        else:
            n = args.o.count("/") - 3 + 1
            print2(summary, "*** Output directory: "+ args.o.split("/",n)[-1] )

        args.r = os.path.normpath(os.path.join(dir,args.r))
        
        if args.de: args.de = os.path.normpath(os.path.join(dir,args.de))
        if args.bed: args.bed = os.path.normpath(os.path.join(dir,args.bed))
        if args.bg: args.bg = os.path.normpath(os.path.join(dir,args.bg))

        # Get GenomicRegionSet from the given genes
        print2(summary, "Step 1: Calculate the triplex forming sites on RNA and DNA.")
        promoter = PromoterTest(gene_list_file=args.de, gtf=args.gtf, rna_name=args.rn, bed=args.bed, bg=args.bg, 
                                organism=args.organism, promoterLength=args.pl, summary=summary, 
                                temp=dir, output=args.o, showdbs=args.showdbs, score=args.score, 
                                scoreh=args.scoreh, filter_havana=args.filter_havana, 
                                protein_coding=args.protein_coding, known_only=args.known_only)
        if args.dump: sys.exit(0)
        promoter.get_rna_region_str(rna=args.r, expfile=args.rnaexp)
        promoter.connect_rna(rna=args.r, temp=args.o)
        promoter.search_triplex(temp=args.o, l=args.l, e=args.e, remove_temp=args.rt, 
                                c=args.c, fr=args.fr, fm=args.fm, of=args.of, mf=args.mf, par=args.par)
        
        t1 = time.time()
        print2(summary, "\tRunning time is: " + str(datetime.timedelta(seconds=round(t1-t0))))

        print2(summary, "Step 2: Calculate the frequency of DNA binding sites within the promotors.")
        if args.obed: obedp = os.path.basename(args.o)
        else: obedp = None
        promoter.count_frequency(temp=args.o, remove_temp=args.rt, obedp=obedp, cutoff=args.ccf, l=args.l)
        promoter.fisher_exact(alpha=args.a)
        t2 = time.time()
        print2(summary, "\tRunning time is: " + str(datetime.timedelta(seconds=round(t2-t1))))
        
        if len(promoter.rbss) == 0:
            no_binding_response(args=args, rna_regions=promoter.rna_regions,
                                rna_name=promoter.rna_name, organism=promoter.organism,
                                stat=promoter.stat, expression=promoter.rna_expression)
        promoter.dbd_regions(output=args.o)
        os.remove(os.path.join(args.o,"rna_temp.fa"))
        try: os.remove(os.path.join(args.o,"rna_temp.fa.fai"))
        except: pass
        print2(summary, "Step 3: Establishing promoter profile.")
        t3 = time.time()
        print2(summary, "\tRunning time is: " + str(datetime.timedelta(seconds=round(t3-t2))))

        print2(summary, "Step 4: Generate plot and output html files.")
        promoter.plot_lines(txp=promoter.txp_def, rna=args.r, dirp=args.o, ac=args.ac, 
                            cut_off=args.accf, log=args.log, showpa=args.showpa,
                            sig_region=promoter.sig_DBD,
                            ylabel="Number of DBSs", 
                            linelabel="No. DBSs", filename="plot_promoter.png")

        promoter.barplot(dirp=args.o, filename="bar_promoter.png", sig_region=promoter.sig_DBD
                        )
        #if args.showdbs:
        #    promoter.plot_lines(txp=promoter.txp_def, rna=args.r, dirp=args.o, ac=args.ac, 
        #                        cut_off=args.accf, log=args.log, showpa=args.showpa,
        #                        sig_region=promoter.sig_region_dbs,
        #                        ylabel="Number of DBSs on target promoters", 
        #                        linelabel="No. DBSs", filename="plot_dbss.png")
        #    promoter.barplot(dirp=args.o, filename="bar_dbss.png", sig_region=promoter.sig_region_dbs, dbs=True)
        # if args.motif: promoter.gen_motifs(temp=args.o)

        promoter.gen_html(directory=args.o, parameters=args, ccf=args.ccf, align=50, alpha=args.a)
        promoter.gen_html_genes(directory=args.o, align=50, alpha=args.a, nonDE=False)
        # promoter.save_table(path=os.path.dirname(args.o), table=promoter.ranktable,
        #                         filename="lncRNA_target_ranktable.txt")
        # promoter.save_table(path=os.path.dirname(args.o), table=promoter.dbstable,
        #                         filename="lncRNA_target_dbstable.txt")

        #promoter.heatmap(table="ranktable.txt", temp=os.path.dirname(args.o))

        t4 = time.time()
        print2(summary, "\tRunning time is: " + str(datetime.timedelta(seconds=round(t4-t3))))
        print2(summary, "\nTotal running time is: " + str(datetime.timedelta(seconds=round(t4-t0))))
    
        output_summary(summary, args.o, "summary.txt")
        save_profile(rna_regions=promoter.rna_regions, rna_name=promoter.rna_name,
                     organism=promoter.organism, output=args.o, bed=args.bed,
                     geneset=args.de, stat=promoter.stat, topDBD=promoter.topDBD,
                     sig_DBD=promoter.sig_DBD, expression=promoter.rna_expression)
        revise_index(root=os.path.dirname(os.path.dirname(args.o)))
        try: os.remove(os.path.join(args.o, "de.fa"))
        except OSError: pass
        try: os.remove(os.path.join(args.o, "nde.fa"))
        except OSError: pass
        write_stat(stat=promoter.stat, filename=os.path.join(args.o, "stat.txt"))


    ################################################################################
    ##### Genomic Region Test ######################################################
    ################################################################################
    if args.mode == 'regiontest':
        def no_binding_code():
            """Record a "No triplex found" result for this run, clean up and exit.

            Uses ``args`` and ``randomtest`` from the enclosing scope.  The
            ``profile.txt`` file next to the output directory ``args.o`` is
            rewritten: every existing row whose first field equals the
            experiment name is replaced by a placeholder row, and one
            placeholder row is appended when no such row exists (a header line
            is written first when the file did not exist).  Afterwards the
            index is regenerated with ``list_all_index``, ``args.o`` is removed
            and the process exits with status 1.
            """
            print("*** Find no triple helices binding on the given RNA")

            parent_dir = os.path.dirname(args.o)
            pro_path = os.path.join(parent_dir, "profile.txt")
            exp = os.path.basename(args.o)
            tar_reg = os.path.basename(args.bed)
            r_genes = rna_associated_gene(rna_regions=randomtest.rna_regions,
                                          name=randomtest.rna_name,
                                          organism=randomtest.organism)

            # Row describing this experiment as having no significant binding.
            # NOTE(review): this row carries ten fields while the header written
            # below names only nine columns -- confirm the intended layout.
            placeholder = [exp, args.rn, args.o.split("_")[-1],
                           args.organism, tar_reg, "0",
                           "-", "1.0", r_genes, "No triplex found"]

            if os.path.isfile(pro_path):
                # Keep all other experiments, swap in the placeholder for ours.
                with open(pro_path, 'r') as handle:
                    existing = [raw.strip().split("\t") for raw in handle]
                already_listed = any(fields[0] == exp for fields in existing)
                rows = [placeholder if fields[0] == exp else fields
                        for fields in existing]
                if not already_listed:
                    rows.append(placeholder)
            else:
                # Fresh profile: header line followed by the placeholder row.
                rows = [["Experiment", "RNA_names", "Tag", "Organism", "Target_region", "No_sig_DBDs",
                         "Top_DBD", "p-value", "closest_genes"],
                        placeholder]

            with open(pro_path, 'w') as handle:
                for fields in rows:
                    print("\t".join(fields), file=handle)

            # The index must be rebuilt before the output directory disappears.
            list_all_index(path=parent_dir, show_RNA_ass_gene=randomtest.rna_regions)
            shutil.rmtree(args.o)
            sys.exit(1)

            #########################################################
        print2(summary, "\n"+"*************** Genomic Region Test ***************")
        print2(summary, "*** Input RNA sequence: "+args.r)
        print2(summary, "*** Input regions in BED: "+os.path.basename(args.bed))
        print2(summary, "*** Number of randomization: "+str(args.n))
        print2(summary, "*** Output directoey: "+os.path.basename(args.o))

        args.r = os.path.normpath(os.path.join(dir,args.r))
        
        print2(summary, "\nStep 1: Calculate the triplex forming sites on RNA and the given regions")
        randomtest = RandomTest(rna_fasta=args.r, rna_name=args.rn, dna_region=args.bed, 
                                organism=args.organism, showdbs=args.showdbs)
        randomtest.get_rna_region_str(rna=args.r)
        obed = os.path.basename(args.o)
        randomtest.connect_rna(rna=args.r, temp=args.o)

        randomtest.target_dna(temp=args.o, remove_temp=args.rt, l=args.l, e=args.e, obed=obed,
                              c=args.c, fr=args.fr, fm=args.fm, of=args.of, mf=args.mf, par=args.par, cutoff=args.ccf )
        t1 = time.time()
        print2(summary, "\tRunning time is: " + str(datetime.timedelta(seconds=round(t1-t0))))
        # print(args.par)
        if len(randomtest.rbss) == 0:
            # no_binding_code()
            no_binding_response(args=args, rna_regions=randomtest.rna_regions,
                                rna_name=randomtest.rna_name, organism=randomtest.organism,
                                stat=randomtest.stat, expression=randomtest.rna_expression)

        print2(summary, "Step 2: Randomization and counting number of binding sites")

        randomtest.random_test(repeats=args.n, temp=args.o, remove_temp=args.rt, l=args.l, e=args.e,
                               c=args.c, fr=args.fr, fm=args.fm, of=args.of, mf=args.mf, par=args.par, rm=args.rm,
                               filter_bed=args.f, alpha=args.a)
        


        t2 = time.time()
        print2(summary, "\tRunning time is: " + str(datetime.timedelta(seconds=round(t2-t1))))
        
        print2(summary, "Step 3: Generating plot and output HTML")
        randomtest.dbd_regions(sig_region=randomtest.data["region"]["sig_region"], output=args.o)

        os.remove(os.path.join(args.o, "rna_temp.fa"))
        try:
            os.remove(os.path.join(args.o, "rna_temp.fa.fai"))
        except:
            pass
        

        randomtest.lineplot(txp=randomtest.txpf, dirp=args.o, ac=args.ac, cut_off=args.accf, showpa=args.showpa,
                            log=args.log, ylabel="Number of DBS",
                            sig_region=randomtest.data["region"]["sig_region"], 
                            linelabel="No. DBS", filename="lineplot_region.png")

        #randomtest.lineplot(txp=randomtest.txp, dirp=args.o, ac=args.ac, cut_off=args.accf, showpa=args.showpa,
        #                    log=args.log, ylabel="Number of target regions with DBS", 
        #                    sig_region=randomtest.data["region"]["sig_region"],
        #                    linelabel="No. target regions", filename="lineplot_region.png")
        
        randomtest.boxplot(dir=args.o, matrix=randomtest.region_matrix, 
                           sig_region=randomtest.data["region"]["sig_region"], 
                           truecounts=[r[0] for r in randomtest.counts_tr.values()],
                           sig_boolean=randomtest.data["region"]["sig_boolean"], 
                           ylabel="Number of target regions",
                           filename="boxplot_regions" )
        #if args.showdbs:
        #    randomtest.lineplot(txp=randomtest.txpf, dirp=args.o, ac=args.ac, cut_off=args.accf, showpa=args.showpa,
        #                        log=args.log, ylabel="Number of DBS on target regions",
        #                        sig_region=randomtest.data["dbs"]["sig_region"], 
        #                        linelabel="No. DBS", filename="lineplot_dbs.png")
            
        #    randomtest.boxplot(dir=args.o, matrix=randomtest.dbss_matrix, 
        #                       sig_region=randomtest.data["dbs"]["sig_region"], 
        #                       truecounts=randomtest.counts_dbs.values(),
        #                       sig_boolean=randomtest.data["dbs"]["sig_boolean"], 
        #                       ylabel="Number of DBS on target regions",
        #                       filename="boxplot_dbs" )

        randomtest.gen_html(directory=args.o, parameters=args, align=50, alpha=args.a, 
                            score=args.score, obed=obed)

        t3 = time.time()
        print2(summary, "\tRunning time is: " + str(datetime.timedelta(seconds=round(t3-t2))))

        print2(summary, "\nTotal running time is: " + str(datetime.timedelta(seconds=round(t3-t0))))

        output_summary(summary, args.o, "summary.txt")
        # save_profile(output=args.o, bed=args.bed)
        save_profile(rna_regions=randomtest.rna_regions, rna_name=randomtest.rna_name,
                     organism=randomtest.organism, output=args.o, bed=args.bed,
                     stat=randomtest.stat, topDBD=randomtest.topDBD,
                     sig_DBD=randomtest.data["region"]["sig_region"],
                     expression=randomtest.rna_expression)
        list_all_index(path=os.path.dirname(args.o))
        for f in os.listdir(args.o):
            if re.search("dna*.fa", f) or re.search("dna*.txp", f):
                os.remove(os.path.join(args.o, f))
        write_stat(stat=randomtest.stat, filename=os.path.join(args.o, "stat.txt"))
コード例 #17
0
ファイル: Main.py プロジェクト: rafalcode/reg-gen
def main():
    ##########################################################################
    ##### PARAMETERS #########################################################
    ##########################################################################

    parser = argparse.ArgumentParser(
        description='Triplex Domain Finder is a statistical framework \
                                                  for detection of triple helix potential of \
                                                  lncRNAs from genome-wide functional data. \
                                                  Author: Chao-Chung Kuo\
                                                  \nVersion: ' + __version__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    subparsers = parser.add_subparsers(help='sub-command help', dest='mode')

    ################### Promoter test ##########################################

    h_promotor = "Promoter test evaluates the association between the given lncRNA to the target promoters."
    parser_promotertest = subparsers.add_parser('promotertest',
                                                help=h_promotor)
    parser_promotertest.add_argument(
        '-r',
        type=str,
        metavar='  ',
        help="Input file name for RNA sequence (in fasta format)")
    parser_promotertest.add_argument(
        '-rl',
        type=str,
        default=None,
        metavar='  ',
        help="Input list for paths to all RNA sequences (in fasta format)")
    parser_promotertest.add_argument('-rn',
                                     type=str,
                                     default=None,
                                     metavar='  ',
                                     help="Define the RNA name")
    parser_promotertest.add_argument(
        '-de',
        default=False,
        metavar='  ',
        help="Input file for target gene list (gene symbols or Ensembl ID)")
    parser_promotertest.add_argument(
        '-bed',
        default=False,
        metavar='  ',
        help="Input BED file of the promoter regions of target genes")
    parser_promotertest.add_argument(
        '-bg',
        default=False,
        metavar='  ',
        help="Input BED file of the promoter regions of background genes")
    parser_promotertest.add_argument(
        '-o', metavar='  ', help="Output directory name for all the results")
    parser_promotertest.add_argument(
        '-t',
        metavar='  ',
        default=False,
        help=
        "Define the title name for the results under the Output name. (default: %(default)s)"
    )

    parser_promotertest.add_argument('-organism',
                                     metavar='  ',
                                     help='Define the organism (hg19 or mm9)')
    parser_promotertest.add_argument(
        '-gtf',
        metavar='  ',
        default=None,
        help='Define the GTF file for annotation (optional)')

    parser_promotertest.add_argument(
        '-pl',
        type=int,
        default=1000,
        metavar='  ',
        help="Define the promotor length (default: %(default)s)")

    parser_promotertest.add_argument(
        '-showdbs',
        action="store_true",
        help="Show the plots and statistics of DBS (DNA Binding sites)")
    parser_promotertest.add_argument(
        '-score',
        action="store_true",
        help="Load score column from input gene list or BED file for analysis."
    )
    parser_promotertest.add_argument(
        '-scoreh',
        action="store_true",
        help="Use the header of scores from the given gene list or BED file.")
    parser_promotertest.add_argument(
        '-a',
        type=float,
        default=0.05,
        metavar='  ',
        help=
        "Define significance level for rejection null hypothesis (default: %(default)s)"
    )
    parser_promotertest.add_argument(
        '-ccf',
        type=int,
        default=100,
        metavar='  ',
        help=
        "Define the cut off value for promoter counts (default: %(default)s)")
    parser_promotertest.add_argument(
        '-rt',
        action="store_true",
        default=False,
        help="Remove temporary files (fa, txp...etc)")
    parser_promotertest.add_argument('-log',
                                     action="store_true",
                                     default=False,
                                     help="Set the plots in log scale")
    parser_promotertest.add_argument('-ac',
                                     type=str,
                                     default=False,
                                     metavar='  ',
                                     help="Input file for RNA accecibility ")
    parser_promotertest.add_argument(
        '-accf',
        type=float,
        default=500,
        metavar='  ',
        help="Define the cut off value for RNA accecibility")
    parser_promotertest.add_argument(
        '-obed',
        action="store_true",
        default=True,
        help="Output the BED files for DNA binding sites.")
    parser_promotertest.add_argument(
        '-showpa',
        action="store_true",
        default=False,
        help="Show parallel and antiparallel bindings in the plot separately.")
    # parser_promotertest.add_argument('-motif', action="store_true", default=False, help="Show motif of binding sites.")
    parser_promotertest.add_argument(
        '-filter_havana',
        type=str,
        default="F",
        metavar='  ',
        help="Apply filtering to remove HAVANA entries.")
    parser_promotertest.add_argument(
        '-protein_coding',
        type=str,
        default="F",
        metavar='  ',
        help="Apply filtering to get only protein coding genes.")
    parser_promotertest.add_argument(
        '-known_only',
        type=str,
        default="F",
        metavar='  ',
        help="Apply filtering to get only known genes.")
    parser_promotertest.add_argument(
        '-dump',
        action="store_true",
        default=False,
        help="Only dump the experimental file and leave the program.")
    parser_promotertest.add_argument(
        '-rnaexp',
        type=str,
        default=None,
        metavar='  ',
        help="Given a file with RNA name and the expression value")

    parser_promotertest.add_argument(
        '-l',
        type=int,
        default=20,
        metavar='  ',
        help=
        "[Triplexator] Define the minimum length of triplex (default: %(default)s)"
    )
    parser_promotertest.add_argument(
        '-e',
        type=int,
        default=20,
        metavar='  ',
        help=
        "[Triplexator] Set the maximal error-rate in %% tolerated (default: %(default)s)"
    )
    parser_promotertest.add_argument(
        '-c',
        type=int,
        default=2,
        metavar='  ',
        help=
        "[Triplexator] Sets the tolerated number of consecutive errors with respect to the canonical triplex rules as such were found to greatly destabilize triplexes in vitro (default: %(default)s)"
    )
    parser_promotertest.add_argument(
        '-fr',
        type=str,
        default="off",
        metavar='  ',
        help=
        "[Triplexator] Activates the filtering of low complexity regions and repeats in the sequence data (default: %(default)s)"
    )
    parser_promotertest.add_argument(
        '-fm',
        type=int,
        default=0,
        metavar='  ',
        help=
        "[Triplexator] Method to quickly discard non-hits (default: %(default)s).'0' = greedy approach; '1' = q-gram filtering."
    )
    parser_promotertest.add_argument(
        '-of',
        type=int,
        default=1,
        metavar='  ',
        help=
        "[Triplexator] Define output formats of Triplexator (default: %(default)s)"
    )
    parser_promotertest.add_argument(
        '-mf',
        action="store_true",
        default=False,
        help=
        "[Triplexator] Merge overlapping features into a cluster and report the spanning region."
    )
    parser_promotertest.add_argument(
        '-rm',
        type=int,
        default=0,
        metavar='  ',
        help="[Triplexator] Set the multiprocessing")
    parser_promotertest.add_argument(
        '-par',
        type=str,
        default="",
        metavar='  ',
        help="[Triplexator] Define other parameters for Triplexator")

    ################### Genomic Region Test ##########################################
    h_region = "Genomic region test evaluates the association between the given lncRNA to the target regions by randomization."
    parser_randomtest = subparsers.add_parser('regiontest', help=h_region)
    parser_randomtest.add_argument(
        '-r',
        type=str,
        metavar='  ',
        help="Input file name for RNA sequence (in fasta format)")
    parser_randomtest.add_argument(
        '-rl',
        type=str,
        default=None,
        metavar='  ',
        help="Input list for paths to all RNA sequences (in fasta format)")
    parser_randomtest.add_argument('-rn',
                                   type=str,
                                   default=False,
                                   metavar='  ',
                                   help="Define the RNA name")
    parser_randomtest.add_argument(
        '-bed',
        metavar='  ',
        help="Input BED file for interested regions on DNA")
    parser_randomtest.add_argument(
        '-o',
        metavar='  ',
        help="Output directory name for all the results and temporary files")
    parser_randomtest.add_argument(
        '-t',
        metavar='  ',
        default=False,
        help=
        "Define the title name for the results under the Output name. (default: %(default)s)"
    )

    parser_randomtest.add_argument(
        '-n',
        type=int,
        default=10000,
        metavar='  ',
        help="Number of times for randomization (default: %(default)s)")

    parser_randomtest.add_argument('-organism',
                                   metavar='  ',
                                   help='Define the organism (hg19 or mm9)')

    parser_randomtest.add_argument(
        '-showdbs',
        action="store_true",
        help="Show the plots and statistics of DBS (DNA Binding sites)")
    parser_randomtest.add_argument(
        '-score',
        action="store_true",
        help="Load score column from input BED file")
    parser_randomtest.add_argument(
        '-a',
        type=float,
        default=0.05,
        metavar='  ',
        help=
        "Define significance level for rejection null hypothesis (default: %(default)s)"
    )
    parser_randomtest.add_argument(
        '-ccf',
        type=int,
        default=40,
        metavar='  ',
        help="Define the cut off value for DBS counts (default: %(default)s)")
    parser_randomtest.add_argument(
        '-rt',
        action="store_true",
        default=False,
        help="Remove temporary files (fa, txp...etc)")
    parser_randomtest.add_argument('-log',
                                   action="store_true",
                                   default=False,
                                   help="Set the plots in log scale")
    parser_randomtest.add_argument(
        '-f',
        type=str,
        default=False,
        metavar='  ',
        help="Input BED file as mask in randomization")
    parser_randomtest.add_argument('-ac',
                                   type=str,
                                   default=False,
                                   metavar='  ',
                                   help="Input file for RNA accecibility ")
    parser_randomtest.add_argument(
        '-accf',
        type=float,
        default=500,
        metavar='  ',
        help="Define the cut off value for RNA accecibility")
    parser_randomtest.add_argument(
        '-obed',
        action="store_true",
        default=True,
        help="Output the BED files for DNA binding sites.")
    parser_randomtest.add_argument(
        '-showpa',
        action="store_true",
        default=False,
        help="Show parallel and antiparallel bindings in the plot separately.")

    parser_randomtest.add_argument(
        '-l',
        type=int,
        default=20,
        metavar='  ',
        help=
        "[Triplexator] Define the minimum length of triplex (default: %(default)s)"
    )
    parser_randomtest.add_argument(
        '-e',
        type=int,
        default=20,
        metavar='  ',
        help=
        "[Triplexator] Set the maximal error-rate in %% tolerated (default: %(default)s)"
    )
    parser_randomtest.add_argument(
        '-c',
        type=int,
        default=2,
        metavar='  ',
        help=
        "[Triplexator] Sets the tolerated number of consecutive errors with respect to the canonical triplex rules as such were found to greatly destabilize triplexes in vitro (default: %(default)s)"
    )
    parser_randomtest.add_argument(
        '-fr',
        type=str,
        default="off",
        metavar='  ',
        help=
        "[Triplexator] Activates the filtering of low complexity regions and repeats in the sequence data (default: %(default)s)"
    )
    parser_randomtest.add_argument(
        '-fm',
        type=int,
        default=0,
        metavar='  ',
        help=
        "[Triplexator] Method to quickly discard non-hits (default: %(default)s).'0' = greedy approach; '1' = q-gram filtering."
    )
    parser_randomtest.add_argument(
        '-of',
        type=int,
        default=1,
        metavar='  ',
        help=
        "[Triplexator] Define output formats of Triplexator (default: %(default)s)"
    )
    parser_randomtest.add_argument(
        '-mf',
        action="store_true",
        default=False,
        help=
        "[Triplexator] Merge overlapping features into a cluster and report the spanning region."
    )
    parser_randomtest.add_argument(
        '-rm',
        type=int,
        default=0,
        metavar='  ',
        help="[Triplexator] Set the multiprocessing")
    parser_randomtest.add_argument(
        '-par',
        type=str,
        default="",
        metavar='  ',
        help="[Triplexator] Define other parameters for Triplexator")

    ##########################################################################
    parser_bed2bed = subparsers.add_parser(
        'get_dbss', help="Get DBSs in BED format from the single BED file")
    parser_bed2bed.add_argument('-i',
                                type=str,
                                metavar='  ',
                                help='Input BED file of the target regions')
    parser_bed2bed.add_argument('-dbs',
                                type=str,
                                metavar='  ',
                                help='Output BED file of the DBSs')
    parser_bed2bed.add_argument('-rbs',
                                type=str,
                                metavar='  ',
                                help='Output BED file of the RBSs')
    parser_bed2bed.add_argument('-r',
                                type=str,
                                metavar='  ',
                                help='Input FASTA file of the RNA')
    parser_bed2bed.add_argument('-organism',
                                metavar='  ',
                                help='Define the organism (hg19 or mm9)')
    parser_bed2bed.add_argument(
        '-l',
        type=int,
        default=20,
        metavar='  ',
        help=
        "[Triplexator] Define the minimum length of triplex (default: %(default)s)"
    )
    parser_bed2bed.add_argument(
        '-e',
        type=int,
        default=20,
        metavar='  ',
        help=
        "[Triplexator] Set the maximal error-rate in %% tolerated (default: %(default)s)"
    )
    parser_bed2bed.add_argument(
        '-c',
        type=int,
        default=2,
        metavar='  ',
        help=
        "[Triplexator] Sets the tolerated number of consecutive errors with respect to the canonical triplex rules as such were found to greatly destabilize triplexes in vitro (default: %(default)s)"
    )
    parser_bed2bed.add_argument(
        '-fr',
        type=str,
        default="off",
        metavar='  ',
        help=
        "[Triplexator] Activates the filtering of low complexity regions and repeats in the sequence data (default: %(default)s)"
    )
    parser_bed2bed.add_argument(
        '-fm',
        type=int,
        default=0,
        metavar='  ',
        help=
        "[Triplexator] Method to quickly discard non-hits (default: %(default)s).'0' = greedy approach; '1' = q-gram filtering."
    )
    parser_bed2bed.add_argument(
        '-of',
        type=int,
        default=1,
        metavar='  ',
        help=
        "[Triplexator] Define output formats of Triplexator (default: %(default)s)"
    )
    parser_bed2bed.add_argument(
        '-mf',
        action="store_true",
        default=False,
        help=
        "[Triplexator] Merge overlapping features into a cluster and report the spanning region."
    )
    parser_bed2bed.add_argument('-rm',
                                type=int,
                                default=0,
                                metavar='  ',
                                help="[Triplexator] Set the multiprocessing")

    ##########################################################################
    # rgt-TDF integrate -path
    parser_integrate = subparsers.add_parser(
        'integrate',
        help=
        "Integrate the project's links and generate project-level statistics.")
    parser_integrate.add_argument('-path',
                                  type=str,
                                  metavar='  ',
                                  help='Define the path of the project.')
    ##########################################################################
    parser_updatehtml = subparsers.add_parser(
        'updatehtml', help="Update the project's html.")
    parser_updatehtml.add_argument('-path',
                                   type=str,
                                   metavar='  ',
                                   help='Define the path of the project.')
    parser_updatehtml.add_argument('-exp',
                                   type=str,
                                   metavar='  ',
                                   help='Define file with expression data.')

    ################### Parsing the arguments ################################
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    elif len(sys.argv) == 2:
        # retrieve subparsers from parser
        subparsers_actions = [
            action for action in parser._actions
            if isinstance(action, argparse._SubParsersAction)
        ]
        # there will probably only be one subparsers_action, but better safe than sorry
        for subparsers_action in subparsers_actions:
            # get all subparsers and print help
            for choice, subparser in subparsers_action.choices.items():
                if choice == sys.argv[1]:
                    print("\nYou need more arguments.")
                    print("\nSubparser '{}'".format(choice))
                    subparser.print_help()
        sys.exit(1)
    else:
        args = parser.parse_args()

        ####################################################################################
        ######### Integration
        if args.mode == "integrate":
            condition_list = []  # name, link, no. tests, no. sig.
            for item in os.listdir(args.path):
                if item == "style": continue
                if os.path.isfile(os.path.join(args.path, item)): continue
                elif os.path.isdir(os.path.join(args.path, item)):
                    h = os.path.join(item, "index.html")
                    pro = os.path.join(args.path, item, "profile.txt")
                    if os.path.isfile(pro):
                        integrate_stat(path=os.path.join(args.path, item))
                        nt = 0
                        ns = 0
                        with open(pro) as f:
                            for line in f:
                                line = line.strip().split("\t")
                                if line[0] == "Experiment": continue
                                nt += 1
                                if float(line[7]) < 0.05: ns += 1
                        # print([item, h, str(nt), str(ns)])
                        condition_list.append([item, h, str(nt), str(ns)])
            # print(condition_list)
            link_d = {"List": "index.html"}
            fp = condition_list[0][0] + "/style"
            html = Html(
                name="Directory: " + args.path,
                links_dict=link_d,
                fig_rpath=fp,  #fig_dir=fp, 
                RGT_header=False,
                other_logo="TDF")
            html.add_heading("All conditions in: " + args.path + "/")
            data_table = []
            type_list = 'sssssssssssss'
            col_size_list = [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
            c = 0
            header_list = ["No.", "Conditions", "No. tests", "No. sig. tests"]
            for i, exp in enumerate(condition_list):
                c += 1
                data_table.append([
                    str(c), '<a href="' + exp[1] + '">' + exp[0] + "</a>",
                    exp[2], exp[3]
                ])
            html.add_zebra_table(header_list,
                                 col_size_list,
                                 type_list,
                                 data_table,
                                 align=10,
                                 cell_align="left",
                                 sortable=True)
            html.add_fixed_rank_sortable()
            html.write(os.path.join(args.path, "index.html"))
            gen_heatmap(path=args.path)
            generate_rna_exp_pv_table(root=args.path, multi_corr=False)
            merge_DBD_regions(path=args.path)

            sys.exit(0)

        ####################################################################################
        ######### updatehtml
        elif args.mode == "updatehtml":
            for item in os.listdir(args.path):
                pro = os.path.join(args.path, item, "profile.txt")
                if os.path.isfile(pro):
                    update_profile(dirpath=os.path.join(args.path, item),
                                   expression=args.exp)
            revise_index(root=args.path)
            generate_rna_exp_pv_table(root=args.path, multi_corr=True)
            sys.exit(0)

        ####################################################################################
        ######### get_dbss
        elif args.mode == "get_dbss":

            get_dbss(input_BED=args.i,
                     output_BED=args.dbs,
                     rna_fasta=args.r,
                     output_rbss=args.rbs,
                     organism=args.organism,
                     l=args.l,
                     e=args.e,
                     c=args.c,
                     fr=args.fr,
                     fm=args.fm,
                     of=args.of,
                     mf=args.mf,
                     rm=args.rm,
                     temp=dir)
            os.remove("dna_targeted_region.fa")
            os.remove("dna_targeted_region.txp")
            os.remove("rna_temp.fa")
            sys.exit(0)

        #######################################################################
        #### Checking arguments
        if not args.o:
            print("Please define the output directory name. \n")
            sys.exit(1)
        if not args.organism:
            print("Please define the organism. (hg19 or mm9)")
            sys.exit(1)
        if not args.rn and not args.rl:
            print("Please define RNA sequence name.")
            sys.exit(1)
        if args.r and args.rl:
            print(
                "Both -r and -rl are given. TDF will skip -r and process -rl ")
        if args.rl:
            with open(args.rl) as f:
                for line in f:
                    line = line.strip()
                    rn = os.path.basename(line).rpartition(".")[0]
                    print("\tProcessing: " + rn)
                    command = [
                        "rgt-TDF", args.mode, "-r", line, "-rn", rn, "-o",
                        os.path.join(args.o, rn), "-organism", args.organism
                    ]
                    if args.de and not args.bed: command += ["-de", args.de]
                    if args.bed and args.bg:
                        command += ["-bed", args.bed, "-bg", args.bg]

                    if args.score: command += ["-score"]
                    if args.rt: command += ["-rt"]
                    if args.pl != 1000: command += ["-pl", args.pl]
                    if args.ccf != 40: command += ["-ccf", args.ccf]
                    if args.obed: command += ["-obed"]
                    if args.a != 0.05: command += ["-a", args.a]
                    if args.filter_havana == 'F':
                        command += ["-filter_havana", 'F']
                    if args.protein_coding == 'T':
                        command += ["-protein_coding", 'T']
                    if args.known_only == 'F': command += ["-known_only", 'F']

                    if args.rm > 0: command += ["-rm", args.rm]
                    if args.fr != 'off': command += ["-fr", args.fr]
                    if args.c != 2: command += ["-c", args.c]
                    if args.e != 20: command += ["-e", args.e]
                    if args.of != 1: command += ["-of", args.of]
                    if args.l != 15: command += ["-l", args.l]
                    if args.fr != 'off': command += ["-fr", args.fr]
                    if args.fr != 'off': command += ["-fr", args.fr]
                    if args.fr != 'off': command += ["-fr", args.fr]
                    subprocess.call(command)
            sys.exit(0)

        t0 = time.time()
        # Normalised output path
        if not args.t: title = args.rn
        else: title = args.t

        args.o = os.path.normpath(os.path.join(dir, args.o, title))
        check_dir(os.path.dirname(os.path.dirname(args.o)))
        check_dir(os.path.dirname(args.o))
        check_dir(args.o)
        # Input parameters dictionary
        summary = []
        summary.append("Time: " +
                       datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        summary.append("User: "******"\nCommand:\n\t$ " + " ".join(sys.argv))

    ################################################################################
    ##### Promoter Test ############################################################
    ################################################################################
    if args.mode == 'promotertest':

        ################################################################################################3

        if args.bed and not args.bg:
            print("Please add background promoters in BED format. (-bg)")
            sys.exit(1)
        if args.scoreh and not args.score:
            print(
                "Score header (-scoreh) can only be used when scores (-score) are loaded."
            )
            print("Please add '-score'.")
            sys.exit(1)

        print2(summary,
               "\n" + "*************** Promoter Test ****************")
        print2(summary, "*** Input RNA sequence: " + args.r)

        if args.o.count("/") < 3:
            print2(summary, "*** Output directory: " + args.o)
        else:
            n = args.o.count("/") - 3 + 1
            print2(summary,
                   "*** Output directory: " + args.o.split("/", n)[-1])

        args.r = os.path.normpath(os.path.join(dir, args.r))

        if args.de: args.de = os.path.normpath(os.path.join(dir, args.de))
        if args.bed: args.bed = os.path.normpath(os.path.join(dir, args.bed))
        if args.bg: args.bg = os.path.normpath(os.path.join(dir, args.bg))

        # Get GenomicRegionSet from the given genes
        print2(summary,
               "Step 1: Calculate the triplex forming sites on RNA and DNA.")
        promoter = PromoterTest(gene_list_file=args.de,
                                gtf=args.gtf,
                                rna_name=args.rn,
                                bed=args.bed,
                                bg=args.bg,
                                organism=args.organism,
                                promoterLength=args.pl,
                                summary=summary,
                                temp=dir,
                                output=args.o,
                                showdbs=args.showdbs,
                                score=args.score,
                                scoreh=args.scoreh,
                                filter_havana=args.filter_havana,
                                protein_coding=args.protein_coding,
                                known_only=args.known_only)
        if args.dump: sys.exit(0)
        promoter.get_rna_region_str(rna=args.r, expfile=args.rnaexp)
        promoter.connect_rna(rna=args.r, temp=args.o)
        promoter.search_triplex(temp=args.o,
                                l=args.l,
                                e=args.e,
                                remove_temp=args.rt,
                                c=args.c,
                                fr=args.fr,
                                fm=args.fm,
                                of=args.of,
                                mf=args.mf,
                                par=args.par)

        t1 = time.time()
        print2(
            summary, "\tRunning time is: " +
            str(datetime.timedelta(seconds=round(t1 - t0))))

        print2(
            summary,
            "Step 2: Calculate the frequency of DNA binding sites within the promotors."
        )
        if args.obed: obedp = os.path.basename(args.o)
        else: obedp = None
        promoter.count_frequency(temp=args.o,
                                 remove_temp=args.rt,
                                 obedp=obedp,
                                 cutoff=args.ccf,
                                 l=args.l)
        promoter.fisher_exact(alpha=args.a)
        t2 = time.time()
        print2(
            summary, "\tRunning time is: " +
            str(datetime.timedelta(seconds=round(t2 - t1))))

        if len(promoter.rbss) == 0:
            no_binding_response(args=args,
                                rna_regions=promoter.rna_regions,
                                rna_name=promoter.rna_name,
                                organism=promoter.organism,
                                stat=promoter.stat,
                                expression=promoter.rna_expression)
        promoter.dbd_regions(output=args.o)
        os.remove(os.path.join(args.o, "rna_temp.fa"))
        try:
            os.remove(os.path.join(args.o, "rna_temp.fa.fai"))
        except:
            pass
        print2(summary, "Step 3: Establishing promoter profile.")
        t3 = time.time()
        print2(
            summary, "\tRunning time is: " +
            str(datetime.timedelta(seconds=round(t3 - t2))))

        print2(summary, "Step 4: Generate plot and output html files.")
        promoter.plot_lines(txp=promoter.txp_def,
                            rna=args.r,
                            dirp=args.o,
                            ac=args.ac,
                            cut_off=args.accf,
                            log=args.log,
                            showpa=args.showpa,
                            sig_region=promoter.sig_DBD,
                            ylabel="Number of DBSs",
                            linelabel="No. DBSs",
                            filename="plot_promoter.png")

        promoter.barplot(dirp=args.o,
                         filename="bar_promoter.png",
                         sig_region=promoter.sig_DBD)
        #if args.showdbs:
        #    promoter.plot_lines(txp=promoter.txp_def, rna=args.r, dirp=args.o, ac=args.ac,
        #                        cut_off=args.accf, log=args.log, showpa=args.showpa,
        #                        sig_region=promoter.sig_region_dbs,
        #                        ylabel="Number of DBSs on target promoters",
        #                        linelabel="No. DBSs", filename="plot_dbss.png")
        #    promoter.barplot(dirp=args.o, filename="bar_dbss.png", sig_region=promoter.sig_region_dbs, dbs=True)
        # if args.motif: promoter.gen_motifs(temp=args.o)

        promoter.gen_html(directory=args.o,
                          parameters=args,
                          ccf=args.ccf,
                          align=50,
                          alpha=args.a)
        promoter.gen_html_genes(directory=args.o,
                                align=50,
                                alpha=args.a,
                                nonDE=False)
        # promoter.save_table(path=os.path.dirname(args.o), table=promoter.ranktable,
        #                         filename="lncRNA_target_ranktable.txt")
        # promoter.save_table(path=os.path.dirname(args.o), table=promoter.dbstable,
        #                         filename="lncRNA_target_dbstable.txt")

        #promoter.heatmap(table="ranktable.txt", temp=os.path.dirname(args.o))

        t4 = time.time()
        print2(
            summary, "\tRunning time is: " +
            str(datetime.timedelta(seconds=round(t4 - t3))))
        print2(
            summary, "\nTotal running time is: " +
            str(datetime.timedelta(seconds=round(t4 - t0))))

        output_summary(summary, args.o, "summary.txt")
        save_profile(rna_regions=promoter.rna_regions,
                     rna_name=promoter.rna_name,
                     organism=promoter.organism,
                     output=args.o,
                     bed=args.bed,
                     geneset=args.de,
                     stat=promoter.stat,
                     topDBD=promoter.topDBD,
                     sig_DBD=promoter.sig_DBD,
                     expression=promoter.rna_expression)
        revise_index(root=os.path.dirname(os.path.dirname(args.o)))
        try:
            os.remove(os.path.join(args.o, "de.fa"))
        except OSError:
            pass
        try:
            os.remove(os.path.join(args.o, "nde.fa"))
        except OSError:
            pass
        write_stat(stat=promoter.stat,
                   filename=os.path.join(args.o, "stat.txt"))

    ################################################################################
    ##### Genomic Region Test ######################################################
    ################################################################################
    if args.mode == 'regiontest':

        def no_binding_code():
            """Record a "No triplex found" result for this run, then exit.

            Rewrites ``profile.txt`` in the parent directory of ``args.o`` so
            that the row whose first column equals the basename of ``args.o``
            is replaced by (or, if absent, appended as) a placeholder row.
            When the file does not exist yet it is created with a header line
            followed by that row.  Afterwards the index is regenerated, the
            output directory ``args.o`` is deleted and the process exits with
            status 1.
            """
            print("*** Find no triple helices binding on the given RNA")

            parent_dir = os.path.dirname(args.o)
            profile_file = os.path.join(parent_dir, "profile.txt")
            experiment = os.path.basename(args.o)
            target_name = os.path.basename(args.bed)
            nearby_genes = rna_associated_gene(
                rna_regions=randomtest.rna_regions,
                name=randomtest.rna_name,
                organism=randomtest.organism)

            # Placeholder row written for this experiment.  It carries ten
            # fields, one more than the nine-name header created below.
            empty_row = [
                experiment,
                args.rn,
                args.o.split("_")[-1],
                args.organism,
                target_name,
                "0",
                "-",
                "1.0",
                nearby_genes,
                "No triplex found",
            ]

            if not os.path.isfile(profile_file):
                # First run in this directory: start a fresh profile.
                rows = [
                    [
                        "Experiment", "RNA_names", "Tag", "Organism",
                        "Target_region", "No_sig_DBDs", "Top_DBD", "p-value",
                        "closest_genes"
                    ],
                    empty_row,
                ]
            else:
                rows = []
                replaced = False
                with open(profile_file, 'r') as handle:
                    for raw in handle:
                        fields = raw.strip().split("\t")
                        if fields[0] == experiment:
                            # Overwrite the stale entry for this experiment.
                            rows.append(empty_row)
                            replaced = True
                        else:
                            rows.append(fields)
                if not replaced:
                    rows.append(empty_row)

            with open(profile_file, 'w') as handle:
                for fields in rows:
                    handle.write("\t".join(fields) + "\n")

            # Refresh the index before removing this run's output directory.
            list_all_index(path=parent_dir,
                           show_RNA_ass_gene=randomtest.rna_regions)
            shutil.rmtree(args.o)
            sys.exit(1)

            #########################################################

        print2(summary,
               "\n" + "*************** Genomic Region Test ***************")
        print2(summary, "*** Input RNA sequence: " + args.r)
        print2(summary,
               "*** Input regions in BED: " + os.path.basename(args.bed))
        print2(summary, "*** Number of randomization: " + str(args.n))
        print2(summary, "*** Output directoey: " + os.path.basename(args.o))

        args.r = os.path.normpath(os.path.join(dir, args.r))

        print2(
            summary,
            "\nStep 1: Calculate the triplex forming sites on RNA and the given regions"
        )
        randomtest = RandomTest(rna_fasta=args.r,
                                rna_name=args.rn,
                                dna_region=args.bed,
                                organism=args.organism,
                                showdbs=args.showdbs)
        randomtest.get_rna_region_str(rna=args.r)
        obed = os.path.basename(args.o)
        randomtest.connect_rna(rna=args.r, temp=args.o)

        randomtest.target_dna(temp=args.o,
                              remove_temp=args.rt,
                              l=args.l,
                              e=args.e,
                              obed=obed,
                              c=args.c,
                              fr=args.fr,
                              fm=args.fm,
                              of=args.of,
                              mf=args.mf,
                              par=args.par,
                              cutoff=args.ccf)
        t1 = time.time()
        print2(
            summary, "\tRunning time is: " +
            str(datetime.timedelta(seconds=round(t1 - t0))))
        # print(args.par)
        if len(randomtest.rbss) == 0:
            # no_binding_code()
            no_binding_response(args=args,
                                rna_regions=randomtest.rna_regions,
                                rna_name=randomtest.rna_name,
                                organism=randomtest.organism,
                                stat=randomtest.stat,
                                expression=randomtest.rna_expression)

        print2(summary,
               "Step 2: Randomization and counting number of binding sites")

        randomtest.random_test(repeats=args.n,
                               temp=args.o,
                               remove_temp=args.rt,
                               l=args.l,
                               e=args.e,
                               c=args.c,
                               fr=args.fr,
                               fm=args.fm,
                               of=args.of,
                               mf=args.mf,
                               par=args.par,
                               rm=args.rm,
                               filter_bed=args.f,
                               alpha=args.a)

        t2 = time.time()
        print2(
            summary, "\tRunning time is: " +
            str(datetime.timedelta(seconds=round(t2 - t1))))

        print2(summary, "Step 3: Generating plot and output HTML")
        randomtest.dbd_regions(
            sig_region=randomtest.data["region"]["sig_region"], output=args.o)

        os.remove(os.path.join(args.o, "rna_temp.fa"))
        try:
            os.remove(os.path.join(args.o, "rna_temp.fa.fai"))
        except:
            pass

        randomtest.lineplot(txp=randomtest.txpf,
                            dirp=args.o,
                            ac=args.ac,
                            cut_off=args.accf,
                            showpa=args.showpa,
                            log=args.log,
                            ylabel="Number of DBS",
                            sig_region=randomtest.data["region"]["sig_region"],
                            linelabel="No. DBS",
                            filename="lineplot_region.png")

        #randomtest.lineplot(txp=randomtest.txp, dirp=args.o, ac=args.ac, cut_off=args.accf, showpa=args.showpa,
        #                    log=args.log, ylabel="Number of target regions with DBS",
        #                    sig_region=randomtest.data["region"]["sig_region"],
        #                    linelabel="No. target regions", filename="lineplot_region.png")

        randomtest.boxplot(
            dir=args.o,
            matrix=randomtest.region_matrix,
            sig_region=randomtest.data["region"]["sig_region"],
            truecounts=[r[0] for r in randomtest.counts_tr.values()],
            sig_boolean=randomtest.data["region"]["sig_boolean"],
            ylabel="Number of target regions",
            filename="boxplot_regions")
        #if args.showdbs:
        #    randomtest.lineplot(txp=randomtest.txpf, dirp=args.o, ac=args.ac, cut_off=args.accf, showpa=args.showpa,
        #                        log=args.log, ylabel="Number of DBS on target regions",
        #                        sig_region=randomtest.data["dbs"]["sig_region"],
        #                        linelabel="No. DBS", filename="lineplot_dbs.png")

        #    randomtest.boxplot(dir=args.o, matrix=randomtest.dbss_matrix,
        #                       sig_region=randomtest.data["dbs"]["sig_region"],
        #                       truecounts=randomtest.counts_dbs.values(),
        #                       sig_boolean=randomtest.data["dbs"]["sig_boolean"],
        #                       ylabel="Number of DBS on target regions",
        #                       filename="boxplot_dbs" )

        randomtest.gen_html(directory=args.o,
                            parameters=args,
                            align=50,
                            alpha=args.a,
                            score=args.score,
                            obed=obed)

        t3 = time.time()
        print2(
            summary, "\tRunning time is: " +
            str(datetime.timedelta(seconds=round(t3 - t2))))

        print2(
            summary, "\nTotal running time is: " +
            str(datetime.timedelta(seconds=round(t3 - t0))))

        output_summary(summary, args.o, "summary.txt")
        # save_profile(output=args.o, bed=args.bed)
        save_profile(rna_regions=randomtest.rna_regions,
                     rna_name=randomtest.rna_name,
                     organism=randomtest.organism,
                     output=args.o,
                     bed=args.bed,
                     stat=randomtest.stat,
                     topDBD=randomtest.topDBD,
                     sig_DBD=randomtest.data["region"]["sig_region"],
                     expression=randomtest.rna_expression)
        list_all_index(path=os.path.dirname(args.o))
        for f in os.listdir(args.o):
            if re.search("dna*.fa", f) or re.search("dna*.txp", f):
                os.remove(os.path.join(args.o, f))
        write_stat(stat=randomtest.stat,
                   filename=os.path.join(args.o, "stat.txt"))
コード例 #18
0
ファイル: boxplot.py プロジェクト: rafalcode/reg-gen
    def gen_html(self, directory, title, align=50):
        """Write the boxplot report pages ``index.html`` and ``parameters.html``.

        Both pages are written into ``os.path.join(directory, title)``.

        ``index.html`` shows ``boxplot.png`` followed by one table per
        first-level key of ``self.sortDict``.  Each table holds the pairwise
        Mann-Whitney U p-values between every (second-level, third-level)
        key combination; values that are not greater than 0.05 are rendered
        in red and the diagonal is shown as "-".

        ``parameters.html`` lists the fixed assumptions and links to
        ``parameters.txt`` and ``experimental_matrix.txt``.

        Args:
            directory: Base output directory.
            title: Page title; also the sub-directory the pages go into.
            align: Left alignment passed to the HTML tables (the p-value
                tables use ``align + 50``).
        """
        html_header = title
        link_d = OrderedDict()
        link_d["Boxplot"] = "index.html"
        link_d["Parameters"] = "parameters.html"

        html = Html(name=html_header,
                    links_dict=link_d,
                    fig_rpath="../style",
                    RGT_header=False,
                    other_logo="viz",
                    homepage="../index.html")

        html.add_figure("boxplot.png", align="center")

        # NOTE(review): fixed 46-character column-type string; presumably
        # one character per column -- confirm against Html.add_zebra_table
        # if a table can have more columns than that.
        type_list = 'ssssssssssssssssssssssssssssssssssssssssssssss'

        #### Calculate p value ####
        # plist[g][s1 + c1][s2 + c2] -> p-value of data(s1, c1) vs data(s2, c2)
        plist = {}
        for g in self.sortDict.keys():
            plist[g] = {}
            for s1 in self.sortDict[g].keys():
                for c1 in self.sortDict[g][s1].keys():
                    data1 = self.sortDict[g][s1][c1]
                    plist[g][s1 + c1] = {}
                    for s2 in self.sortDict[g].keys():
                        for c2 in self.sortDict[g][s2].keys():
                            if s2 == s1 and c2 == c1:
                                # No self-comparison.
                                continue
                            data2 = self.sortDict[g][s2][c2]
                            _, p_value = mannwhitneyu(data1, data2)
                            plist[g][s1 + c1][s2 + c2] = p_value

        print("Multiple test correction.")
        # The corrected values are read back from plist below, so this call
        # is expected to adjust plist in place.
        multiple_correction(plist)

        for g in self.sortDict.keys():
            html.add_heading(g, size=4, bold=False)
            data_table = []
            col_size_list = [15]
            header_list = ["p-value"]
            for s in self.sortDict[g].keys():
                # Iterate the third-level keys of the *current* s.  The
                # previous code indexed with a stale ``s1`` left over from an
                # earlier loop, producing wrong headers or a KeyError.
                for c in self.sortDict[g][s].keys():
                    header_list.append(s + "\n" + c)
                    col_size_list.append(15)

            for s1 in self.sortDict[g].keys():
                for c1 in self.sortDict[g][s1].keys():
                    row = [s1 + "\n" + c1]
                    for s2 in self.sortDict[g].keys():
                        for c2 in self.sortDict[g][s2].keys():
                            if s2 == s1 and c2 == c1:
                                row.append("-")
                                continue
                            p = plist[g][s1 + c1][s2 + c2]
                            if p > 0.05:
                                row.append(value2str(p))
                            else:
                                # Highlight p-values at or below 0.05.
                                row.append("<font color=\"red\">" +
                                           value2str(p) + "</font>")
                    data_table.append(row)

            html.add_zebra_table(header_list,
                                 col_size_list,
                                 type_list,
                                 data_table,
                                 align=align + 50)

        html.write(os.path.join(directory, title, "index.html"))

        ## Parameters
        html = Html(name=html_header,
                    links_dict=link_d,
                    fig_rpath="../style",
                    RGT_header=False,
                    other_logo="viz",
                    homepage="../index.html")

        header_list = ["Assumptions and hypothesis"]
        col_size_list = [50]
        data_table = [
            [
                'All the regions among different BED files are normalized by quantile normalization.'
            ],
            [
                'If there is any grouping problem, please check all the optional columns in input experimental matrix.'
            ]
        ]
        html.add_zebra_table(header_list,
                             col_size_list,
                             type_list,
                             data_table,
                             align=align,
                             cell_align="left")

        html.add_free_content([
            '<a href="parameters.txt" style="margin-left:100">See parameters</a>'
        ])
        html.add_free_content([
            '<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'
        ])
        html.write(os.path.join(directory, title, "parameters.html"))