Esempio n. 1
0
 def find_threshold(self, percentile, universal=True):
     """Compute the score cut-off that keeps only the top ``percentile`` percent.

     The ``"score"`` entries of every result set in ``self.res`` are pooled
     into one ascending array, from which these attributes are (re)assigned:

     * ``self.allscore``     -- the pooled, sorted scores
     * ``self.threshold``    -- the ``100 - percentile`` percentile of the pool
     * ``self.max_score`` / ``self.min_score`` -- last / first pooled score
     * ``self.plot_padding`` -- 5% of the score range

     percentile : percentage of top-scoring entries that should be kept
     universal  : described as choosing between one threshold for all result
                  sets and one per result set; the body never reads it, so a
                  single shared threshold is always computed
     """
     sys_write("Preparing data for plotting... ")
     pooled = np.hstack([result_set["score"] for result_set in self.res])
     pooled.sort()
     self.allscore = pooled
     self.threshold = np.percentile(pooled, 100 - percentile)
     # The pool is sorted ascending, so the extremes sit at the two ends.
     highest = pooled[-1]
     lowest = pooled[0]
     self.max_score = highest
     self.min_score = lowest
     self.plot_padding = 0.05 * (highest - lowest)
     sys_write("Done.\n")
Esempio n. 2
0
 def transform(self):
     """Apply the score transformation named by ``self.transform_type``.

     Only ``"nlog"`` is handled: the ``"score"`` entry of every result set in
     ``self.res`` is replaced by its negative base-10 logarithm.  Any other
     value leaves the scores untouched; the progress messages are written
     either way.
     """
     # NOTE: the misspelling in the message below is runtime output and is
     # deliberately left unchanged.
     sys_write("Tranforming score values by ")
     wants_negative_log = self.transform_type == "nlog"
     if wants_negative_log:
         sys_write("Negative log... ")
         for result_set in self.res:
             raw_scores = result_set["score"]
             result_set["score"] = -np.log10(raw_scores)
     sys_write("Done.\n")
Esempio n. 3
0
    def debug(self, outfilename="", distance_allow=200000, percentile=0.02, restricted=False):
        """
        Debug helper: return the top-scoring entries of ``self.res``.

        ``self.res`` is sorted in place by descending ``score``.  The leading
        ``percentile`` percent of entries are copied into a new list, that
        list is sorted with the entries' own ordering, and it is returned.

        outfilename, distance_allow, restricted : accepted but not used by
            this method
        percentile : percentage (0-100) of the ranked entries to keep

        Returns the selected entries as a list.  The list is empty when the
        percentile is too small to select a single entry; previously that
        case raised IndexError while printing the minimum score.
        """
        sys_write("Ranking result entries... ")
        self.res.sort(key=lambda d: -d.score)
        sys_write("Done.\n")

        n_top = int(len(self.res) * percentile / 100)
        l_top = self.res[:n_top]
        # Single-argument print(...) behaves the same on Python 2 and 3.
        if l_top:
            print("Selected top %s SNPs, with a minimum score of %s" % (len(l_top), l_top[-1].score))
        else:
            print("Selected top 0 SNPs: the requested percentile leaves no entries")

        # Sort the top SNPs by position
        # NOTE(review): relies on the comparison methods of the entry class,
        # which are not visible here -- presumably they order by position.
        sys_write("Sorting top SNPs by position... ")
        l_top.sort()
        sys_write("Done.\n")

        return l_top
Esempio n. 4
0
    def get_prob_distribution(self, chr, posbeg, posend, savefigname):
        """Save a histogram of the SNP scores found in one chromosome interval.

        Entries of ``self.res`` are kept when their ``chr`` equals ``chr`` and
        their ``pos`` lies strictly between ``posbeg`` and ``posend``.  Their
        scores are drawn as a 25-bin histogram with matplotlib (Agg backend)
        and written to ``savefigname`` as a 300 dpi PNG, after which the
        current axes are cleared.

        chr            : chromosome identifier to match
        posbeg, posend : exclusive bounds of the position interval
        savefigname    : output path passed to ``plt.savefig``
        """
        sys_write("Getting list of SNP within range... ")
        snps_in_window = [
            entry for entry in self.res if entry.chr == chr and posbeg < entry.pos < posend
        ]
        sys_write("Done.\n")

        sys_write("Plotting histogram... ")
        import matplotlib

        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        plt.hist([entry.score for entry in snps_in_window], bins=25)
        plt.savefig(savefigname, format="png", dpi=300, bbox_inches="tight")
        plt.cla()
        sys_write("Done.\n")
Esempio n. 5
0
    def find_peaks(self, outfilename="", distance_allow=200000, percentile=1, restricted=False, replace_with_top=False):
        """
        Find peaks among the top-scoring entries of ``self.res``.

        ``self.res`` is sorted in place by descending ``score``; the leading
        ``percentile`` percent of entries are copied out and sorted with the
        entries' own ordering before being scanned.

        outfilename      : accepted but not used by this method
        distance_allow   : window size, in the same units as ``snp.pos``
        percentile       : percentage (0-100) of ranked entries to scan
        restricted       : when true, entries whose ``indicator`` is 1 are
                           skipped during the scan
        replace_with_top : selects the scanning strategy (see below)

        replace_with_top false -- a window opens at a SNP; once a later SNP
        lies more than ``distance_allow`` past the window's left bound, or a
        higher chromosome starts, the element preceding it in the sorted list
        is recorded as the peak.  A new window only opens beyond
        ``distance_allow`` past the last recorded peak (reset at every
        chromosome change).

        replace_with_top true -- the highest-scoring SNP of each window is
        recorded; a new window starts on a chromosome change, when a SNP is
        more than ``distance_allow`` past the current best, or when a better
        SNP is more than ``distance_allow`` past the window's first SNP.

        Returns the list of peak entries, empty when the percentile selects
        no entry (previously an IndexError).

        - Bad repetitive codes! Should be merged somewhere into a shared private function
        """
        sys_write("Ranking result entries... ")
        self.res.sort(key=lambda d: -d.score)
        sys_write("Done.\n")

        l_top = self.res[: int(len(self.res) * percentile / 100)]
        # Single-argument print(...) behaves the same on Python 2 and 3.
        if not l_top:
            print("Selected top 0 SNPs: the requested percentile leaves no entries")
            return []
        print("Selected top %s SNPs, with a minimum score of %s" % (len(l_top), l_top[-1].score))

        # Sort the top SNPs by position
        # NOTE(review): relies on the comparison methods of the entry class,
        # which are not visible here -- presumably they order by chr, then pos.
        sys_write("Sorting top SNPs by position... ")
        l_top.sort()
        sys_write("Done.\n")

        sys_write("Calculating number of peaks... ")
        peaks = []
        if not replace_with_top:
            current_chr = 0
            new_peak_left_bound = None
            # Pre-set so the first SNP can be compared even when its
            # chromosome is not greater than the initial current_chr.
            last_peak_right_limit = 0
            for snp_i, snp in enumerate(l_top):
                if restricted and snp.indicator == 1:
                    continue
                if snp.chr > current_chr:
                    current_chr = snp.chr
                    # Close the window left open on the previous chromosome.
                    # NOTE(review): l_top[snp_i - 1] may be an entry that the
                    # restricted filter skipped -- confirm this is intended.
                    if new_peak_left_bound is not None:
                        peaks.append(l_top[snp_i - 1])
                    new_peak_left_bound = None
                    last_peak_right_limit = 0
                if new_peak_left_bound is not None:
                    if snp.pos > new_peak_left_bound + distance_allow:
                        new_peak_left_bound = None
                        peaks.append(l_top[snp_i - 1])
                        last_peak_right_limit = l_top[snp_i - 1].pos + distance_allow
                if new_peak_left_bound is None and snp.pos > last_peak_right_limit:
                    new_peak_left_bound = snp.pos
            # process far right: if reach end, simply add the last snp
            # NOTE(review): `snp` is the last element of l_top even if the
            # restricted filter skipped it.
            if new_peak_left_bound is not None:
                peaks.append(snp)
        else:
            # Additional step: replace the peaks found with top snps in the
            # peak range. A larger window is allowed in this case
            # NOTE(review): the first entry seeds the scan without going
            # through the restricted filter.
            current_chr = l_top[0].chr
            current_peak = l_top[0]
            left_limit = l_top[0].pos
            for snp in l_top[1:]:
                if restricted and snp.indicator == 1:
                    continue
                if snp.chr > current_chr:
                    peaks.append(current_peak)
                    current_peak = snp
                    current_chr = snp.chr
                    left_limit = snp.pos
                elif snp.pos > current_peak.pos + distance_allow:
                    peaks.append(current_peak)
                    current_peak = snp
                    left_limit = snp.pos
                elif snp.score > current_peak.score:
                    if snp.pos > left_limit + distance_allow:
                        # a higher peak, but does not cover the leftmost snp
                        peaks.append(current_peak)
                        current_peak = snp
                        left_limit = snp.pos
                    else:
                        current_peak = snp
            peaks.append(current_peak)
        sys_write("Done.\n")
        return peaks
Esempio n. 6
0
    def region_plot(
        self,
        chr,
        posbeg,
        posend,
        savefigname="trial.png",
        percentile=2,
        plot_snp_list=None,
        gene_file_name="/Volumes/samaras/Data/TAIR9/TAIR9_GFF3_genes.gff",
    ):
        """
        OBSOLETE
        gwa plot, regional: scores of one chromosome interval on top, gene
        models of the same interval underneath (skipped above 50 genes).

        chr            : chromosome number; ``chr - 1`` indexes ``self.chr_offsets``
        posbeg, posend : bounds of the region; ``posend`` must exceed ``posbeg``
        savefigname    : output path, written as a 300 dpi PNG
        percentile     : the top #% to be included in the final plot
        plot_snp_list  : optional SNPs to mark with vertical lines; only those
                         on ``chr`` and inside the region are drawn
        gene_file_name : annotation file handed to ``gene_info.genes`` when
                         ``self.G`` is falsy

        Raises ValueError when ``posend <= posbeg``.

        NOTE(review): points are filtered with ``self.thresholds`` (plural) and
        entries are read through attributes (``res.chr``, ``res.score``, ...),
        whereas ``find_threshold`` as visible here assigns ``self.threshold``
        and indexes result sets by key -- confirm before reviving this method.
        """
        if posend <= posbeg:
            raise ValueError("posend (%s) must be greater than posbeg (%s)" % (posend, posbeg))

        import matplotlib

        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        self.find_threshold(percentile)

        # initializing plot
        plt.figure(figsize=(12, 7.5))
        plt.axes([0, 0.25, 1, 0.75])

        # plotting data points
        sys_write("Plotting data points... ")
        for data_indicator in range(1, self.num_dif_results + 1):
            scores = []
            positions = []
            for res in self.res:
                if res.chr != chr:
                    continue
                if res.pos > posend or res.pos < posbeg:
                    continue
                if res.indicator == data_indicator:  # This is so not optimal!
                    if res.score >= self.thresholds[data_indicator - 1]:
                        scores.append(res.score)
                        positions.append(res.plotpos)
            plt.plot(positions, scores, ".", markersize=3, alpha=0.7)
        sys_write("Done.\n")

        # plotting axis
        sys_write("Plotting axes and extras... ")
        # x offset of this chromosome; ticks, axis limits and gene models must
        # all use the same one.
        chr_offset = self.chr_offsets[chr - 1]
        y_low = self.min_score - self.plot_padding
        y_high = self.max_score + self.plot_padding
        # Creating a ticklist for the region; there should be separate scale lists
        ticklist = []
        ticklabels = []
        # dynamic labeling of intervals: pick 0.2, 0.5, 1 or 2 times the
        # largest power of ten not exceeding the region length
        pos_range = posend - posbeg
        log10scale = int(math.log10(pos_range))
        log10residue = float(pos_range) / math.pow(10, log10scale)
        if log10residue < 1.2:
            log10multiplier = 0.2
        elif log10residue < 2.6:
            log10multiplier = 0.5
        elif log10residue < 5.8:
            log10multiplier = 1
        else:
            log10multiplier = 2
        # Never below 1: a zero interval would divide by zero just below.
        scale_interval = max(1, int(log10multiplier * math.pow(10, log10scale)))
        # Floor division keeps the labels integral on Python 2 and 3 alike.
        if scale_interval >= 1000000:
            label_scale_interval = scale_interval // 1000000
            label_scale_alphabet = "Mb"
        elif scale_interval >= 1000:
            label_scale_interval = scale_interval // 1000
            label_scale_alphabet = "kb"
        else:
            label_scale_interval = scale_interval
            label_scale_alphabet = "bp"
        first_tick = int((posbeg - 1) // scale_interval) + 1
        last_tick = int(posend // scale_interval)
        for i in range(first_tick, last_tick + 1):
            ticklabels.append(i * label_scale_interval)
            ticklist.append(chr_offset + i * scale_interval)
        # Was indexed with `chr`, i.e. a different offset than the ticks and
        # the gene axes use.
        plt.axis([chr_offset + posbeg, chr_offset + posend, y_low, y_high])
        plt.xticks(ticklist, ticklabels)
        plt.ylabel("$-log(p-$value$)$", size="large")
        plt.xlabel(label_scale_alphabet, size="large")
        # Plots custom snp list
        if plot_snp_list is not None:
            for custom_snp in plot_snp_list:
                if custom_snp.chr == chr and posbeg <= custom_snp.pos <= posend:
                    plt.plot(
                        [custom_snp.plotpos, custom_snp.plotpos],
                        [y_low, y_high],
                        "b-",
                        linewidth=0.5,
                    )
        sys_write("Done\n")

        # plotting genomes
        if not self.G:
            import gene_info

            sys_write("Loading gene model information... ")
            self.G = gene_info.genes(gene_file_name)
            sys_write("Done.\n")

        l_genes = self.G.get_genes_in_range(chr, posbeg, posend)
        if len(l_genes) > 50:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Skipping gene plots: too many genes in range: %s. " % len(l_genes))
        else:
            sys_write("Plotting gene models... ")
            plt.axes([0, 0, 1, 0.18])
            plt.axis([chr_offset + posbeg, chr_offset + posend, -3, 0])
            plt.axis("off")
            broken_barh_xranges = []
            broken_barh_yranges = (-0.5, 0.5)
            annotate_y = -1
            for gene in l_genes:
                gene_start = chr_offset + gene.posbeg
                broken_barh_xranges.append((gene_start, gene.posend - gene.posbeg))
                plt.annotate(gene.id, (gene_start, annotate_y), rotation=270, size="small")
            plt.broken_barh(broken_barh_xranges, broken_barh_yranges, facecolors="yellow", alpha=0.5)
            sys_write("Done.\n")

        # saving figure
        sys_write("Outputting figure to file... ")
        plt.savefig(savefigname, format="png", dpi=300, bbox_inches="tight")
        sys_write("Done.\n")
Esempio n. 7
0
    def manhattan_plot(self, savefigname="trial.png", percentile=2, full_plot=True, plot_snp_list=None, tick_gap=10):
        """ gwa plot, manhattan style

        savefigname   : output path, written as a 300 dpi PNG
        percentile    : the top #% to be included in the final plot
        full_plot     : when true, x ticks are laid out every 5,000,000
                        positions from ``self.chr_offsets`` / ``self.chr_sizes``,
                        independently of the data.  When false, matplotlib's
                        automatic ticks are kept (this case used to fail with
                        a NameError because no tick list was ever built).
        plot_snp_list : optional SNPs to mark with vertical lines at their
                        ``plotpos``
        tick_gap      : spacing, in millions of positions, between labelled ticks
        """
        import matplotlib

        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        self.find_threshold(percentile)
        y_low = self.min_score - self.plot_padding
        y_high = self.max_score + self.plot_padding

        # initializing plot
        plt.figure(figsize=(12, 2.8))
        plt.axes([0.045, 0.15, 0.95, 0.71])

        # plotting data points
        sys_write("Plotting data points... ")
        for r in self.res:
            indices_to_plot = r["score"] >= self.threshold
            plt.plot(r["plotpos"][indices_to_plot], r["score"][indices_to_plot], ".", markersize=3, alpha=0.7)
        sys_write("Done.\n")

        # plotting axis
        sys_write("Plotting axes and extras... ")
        # vertical separators between chromosomes
        for chr_ind in self.chr_range[1:]:
            plt.plot(
                [self.chr_offsets[chr_ind], self.chr_offsets[chr_ind]],
                [y_low, y_high],
                "k-",
                linewidth=0.5,
            )
        # bonferroni
        # Count the scores themselves: len(r) would count keys, not SNPs, if a
        # result set is a dict; for a record array both lengths agree.
        n_tests = sum(len(r["score"]) for r in self.res)
        bonf = -np.log10(0.05 / n_tests)
        if bonf < y_high:  # only plot if relevant
            plt.plot([0, self.chr_offsets[-1]], [bonf, bonf], "r--")
        # ticks
        ticklist = []
        ticklabels = []
        if full_plot:
            # This part is independent of data
            # Mostly copied from Bjarni's
            for chr_ind in self.chr_range:
                ticklist.extend(range(self.chr_offsets[chr_ind], self.chr_offsets[chr_ind + 1], 5000000))
                for chr_ticklabel in range(0, self.chr_sizes[chr_ind], 5000000):  # no tick for 0
                    # NOTE(review): the middle test compares a position within
                    # the chromosome against a genome-wide offset; it may have
                    # been meant to use self.chr_sizes[chr_ind] -- confirm.
                    if (
                        chr_ticklabel % (tick_gap * 1000000) == 0
                        and chr_ticklabel < self.chr_offsets[chr_ind + 1] - (tick_gap * 300000)
                        and chr_ticklabel > 0
                    ):
                        # Floor division keeps the label integral on Python 2 and 3.
                        ticklabels.append(chr_ticklabel // 1000000)
                    else:
                        ticklabels.append("")
        plt.axis([0, self.chr_offsets[-1], y_low, y_high])
        if full_plot:
            plt.xticks(ticklist, ticklabels)
        plt.ylabel("$-log(p-$value$)$", size="large")
        # Plots custom snp list
        if plot_snp_list is not None:
            for custom_snp in plot_snp_list:
                plt.plot(
                    [custom_snp.plotpos, custom_snp.plotpos],
                    [y_low, y_high],
                    "b-",
                    linewidth=0.5,
                )
        sys_write("Done\n")

        # saving figure
        sys_write("Outputting figure to file... ")
        plt.savefig(savefigname, format="png", dpi=300, bbox_inches="tight")
        sys_write("Done.\n")
Esempio n. 8
0
 def sort(self, whichcolumn=2):
     """Reorder the rows of ``self.res`` by ascending value of one column.

     whichcolumn : index of the column to sort on (default 2).  It used to be
                   ignored in favour of a hard-coded column 2.

     ``self.res`` is rebound to the reordered rows.

     NOTE(review): the original docstring marked this method as "not
     working".  It needs ``self.res`` to support ``[:, col]`` indexing and
     index-array selection (e.g. a 2-D NumPy array) -- confirm that is what
     ``self.res`` holds before relying on it.
     """
     sys_write("Sorting through result entries... ")
     row_order = self.res[:, whichcolumn].argsort()
     self.res = self.res[row_order]
     sys_write("Done.\n")