def find_threshold(self, percentile, universal=True):
    """Compute the plotting threshold and score range over all result sets.

    The "score" field of every entry in ``self.res`` is pooled, and the
    following attributes are stored on the instance:

    * ``allscore``     -- the pooled scores, sorted in ascending order
    * ``threshold``    -- the (100 - percentile)-th percentile of the pool
    * ``max_score`` / ``min_score`` -- largest and smallest pooled score
    * ``plot_padding`` -- 5% of the score range

    ``universal`` is accepted but not consulted by this implementation: the
    threshold is always computed on the pooled scores.
    """
    sys_write("Preparing data for plotting... ")
    pooled = np.hstack([result["score"] for result in self.res])
    pooled.sort()
    self.allscore = pooled
    self.threshold = np.percentile(pooled, 100 - percentile)
    # The pool is sorted ascending, so the extremes sit at both ends.
    self.max_score, self.min_score = pooled[-1], pooled[0]
    self.plot_padding = 0.05 * (self.max_score - self.min_score)
    sys_write("Done.\n")
def transform(self):
    """Apply the configured score transformation to every result set.

    Only ``self.transform_type == "nlog"`` is handled: the "score" field of
    each entry in ``self.res`` is replaced by ``-log10(score)``. For any other
    transform type the scores are left untouched and the progress output says
    so, instead of printing a dangling "... by Done.".
    """
    sys_write("Transforming score values by ")
    if self.transform_type == "nlog":
        sys_write("Negative log... ")
        for r in self.res:
            r["score"] = -np.log10(r["score"])
    else:
        # Keep the progress line truthful when no transformation is applied.
        sys_write("nothing (transform type %r is not handled)... " % (self.transform_type,))
    sys_write("Done.\n")
def debug(self, outfilename="", distance_allow=200000, percentile=0.02, restricted=False):
    """Debug helper: return the top-scoring result entries.

    ``self.res`` is ranked in place by descending ``score``; the top
    ``percentile`` percent of entries are copied into a new list, which is then
    ordered with ``list.sort()`` (the original comments call this "by
    position" -- presumably the entry type compares by position; confirm
    against its definition).

    ``outfilename``, ``distance_allow`` and ``restricted`` are accepted but not
    used by this method.

    Returns:
        The selected entries. The list is empty when the percentile selects no
        entry (previously this raised IndexError while printing the summary).
    """
    sys_write("Ranking result entries... ")
    self.res.sort(key=lambda d: -d.score)
    sys_write("Done.\n")

    n_top = int(len(self.res) * percentile / 100)
    l_top = list(self.res[:n_top])
    if l_top:
        print("Selected top %s SNPs, with a minimum score of %s" % (len(l_top), l_top[-1].score))
    else:
        # Small result sets combined with a small percentile select nothing.
        print("Selected top 0 SNPs (percentile %s of %s entries)" % (percentile, len(self.res)))

    # Sort the top SNPs by position
    sys_write("Sorting top SNPs by position... ")
    l_top.sort()
    sys_write("Done.\n")
    return l_top
def get_prob_distribution(self, chr, posbeg, posend, savefigname):
    """Save a histogram of the scores of SNPs inside a chromosome window.

    Entries of ``self.res`` are kept when their ``chr`` equals ``chr`` and
    their ``pos`` lies strictly between ``posbeg`` and ``posend``. The scores
    of those entries are drawn as a 25-bin histogram and written to
    ``savefigname`` as a 300 dpi PNG; the current axes are cleared afterwards.

    The selection is a plain linear scan over ``self.res``.
    """
    sys_write("Getting list of SNP within range... ")
    in_window = [
        entry
        for entry in self.res
        if entry.chr == chr and entry.pos > posbeg and entry.pos < posend
    ]
    sys_write("Done.\n")

    sys_write("Plotting histogram... ")
    import matplotlib

    # Select the non-interactive backend before pyplot is imported.
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    window_scores = [entry.score for entry in in_window]
    plt.hist(window_scores, bins=25)
    plt.savefig(savefigname, format="png", dpi=300, bbox_inches="tight")
    plt.cla()
    sys_write("Done.\n")
def find_peaks(self, outfilename="", distance_allow=200000, percentile=1, restricted=False, replace_with_top=False):
    """Pick peak SNPs among the top-scoring result entries.

    ``self.res`` is ranked in place by descending ``score``; the top
    ``percentile`` percent of entries are copied, ordered with ``list.sort()``
    (the original comments call this "by position" -- presumably the entry
    type compares by chromosome/position; confirm against its definition) and
    scanned once to pick peaks.

    NOTE: the ranking/selection prologue duplicates ``debug``; the original
    author already flagged it as a candidate for a shared private helper.

    Parameters:
        outfilename: accepted but not used.
        distance_allow: window width, in the same unit as ``snp.pos``.
        percentile: percentage of the ranked entries to consider.
        restricted: when true, entries whose ``indicator`` equals 1 are
            ignored. They are now filtered out before the scan, so they can no
            longer be reported as a peak through the "previous entry" lookup or
            as the initial peak.
        replace_with_top: selects the peak-picking strategy.
            False -- a region opens at the first SNP lying beyond the previous
            region's right limit and closes once a SNP lies more than
            ``distance_allow`` past the region's left bound, when the
            chromosome changes, or when the list ends; the last SNP seen
            inside the region is reported.
            True -- the highest-scoring SNP of each region is reported; a new
            region starts on a chromosome change, when a SNP lies more than
            ``distance_allow`` past the current peak, or when a higher-scoring
            SNP lies more than ``distance_allow`` past the region's leftmost
            SNP.

    Returns:
        The list of peak entries; empty when no entry is selected (previously
        an empty selection raised IndexError).
    """
    sys_write("Ranking result entries... ")
    self.res.sort(key=lambda d: -d.score)
    sys_write("Done.\n")

    n_top = int(len(self.res) * percentile / 100)
    l_top = list(self.res[:n_top])
    if l_top:
        print("Selected top %s SNPs, with a minimum score of %s" % (len(l_top), l_top[-1].score))
    else:
        print("Selected top 0 SNPs (percentile %s of %s entries)" % (percentile, len(self.res)))

    # Sort the top SNPs by position
    sys_write("Sorting top SNPs by position... ")
    l_top.sort()
    sys_write("Done.\n")

    sys_write("Calculating number of peaks... ")
    # Drop restricted entries up front so that neither strategy can pick one up
    # by accident while looking back at the previous entry.
    if restricted:
        candidates = [snp for snp in l_top if snp.indicator != 1]
    else:
        candidates = l_top

    peaks = []
    if not replace_with_top:
        current_chr = 0
        new_peak_left_bound = None
        # Initialised here so it is defined even if the first SNP does not
        # trigger the chromosome-change branch below.
        last_peak_right_limit = 0
        previous = None
        for snp in candidates:
            if snp.chr > current_chr:
                # Chromosome change: close the region left open on the old one.
                current_chr = snp.chr
                if new_peak_left_bound is not None:
                    peaks.append(previous)
                    new_peak_left_bound = None
                last_peak_right_limit = 0
            if new_peak_left_bound is not None and snp.pos > new_peak_left_bound + distance_allow:
                # This SNP is outside the open region: close it on the previous SNP.
                new_peak_left_bound = None
                peaks.append(previous)
                last_peak_right_limit = previous.pos + distance_allow
            if new_peak_left_bound is None and snp.pos > last_peak_right_limit:
                new_peak_left_bound = snp.pos
            previous = snp
        # process far right: if reach end, simply add the last snp
        if new_peak_left_bound is not None:
            peaks.append(previous)
    elif candidates:
        # Additional step: replace the peaks found with top snps in the peak
        # range. A larger window is allowed in this case
        current_peak = candidates[0]
        current_chr = current_peak.chr
        left_limit = current_peak.pos
        for snp in candidates[1:]:
            if snp.chr > current_chr:
                peaks.append(current_peak)
                current_peak = snp
                current_chr = snp.chr
                left_limit = snp.pos
            elif snp.pos > current_peak.pos + distance_allow:
                peaks.append(current_peak)
                current_peak = snp
                left_limit = snp.pos
            elif snp.score > current_peak.score:
                if snp.pos > left_limit + distance_allow:
                    # a higher peak, but does not cover the leftmost snp
                    peaks.append(current_peak)
                    current_peak = snp
                    left_limit = snp.pos
                else:
                    current_peak = snp
        peaks.append(current_peak)
    sys_write("Done.\n")
    return peaks
def region_plot(
    self,
    chr,
    posbeg,
    posend,
    savefigname="trial.png",
    percentile=2,
    plot_snp_list=None,
    gene_file_name="/Volumes/samaras/Data/TAIR9/TAIR9_GFF3_genes.gff",
):
    """OBSOLETE regional GWA plot of one chromosome window.

    Parameters:
        chr: chromosome to plot; used as a 1-based index into
            ``self.chr_offsets`` (``chr_offsets[chr - 1]``).
        posbeg, posend: first and last position of the window (inclusive).
        savefigname: output PNG file name.
        percentile: the top #% of scores to be included in the final plot
            (forwarded to ``find_threshold``).
        plot_snp_list: optional entries (with ``chr``, ``pos``, ``plotpos``)
            to mark with a vertical blue line when they fall in the window.
        gene_file_name: gene model file handed to ``gene_info.genes`` when
            ``self.G`` is not loaded yet.

    The x-axis limits now use ``chr_offsets[chr - 1]``, the same offset as the
    ticks and the gene-model panel; they previously used ``chr_offsets[chr]``
    and so framed a window shifted by one chromosome.
    """
    import matplotlib

    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    self.find_threshold(percentile)
    chr_offset = self.chr_offsets[chr - 1]
    y_limits = [self.min_score - self.plot_padding, self.max_score + self.plot_padding]

    # initializing plot
    plt.figure(figsize=(12, 7.5))
    plt.axes([0, 0.25, 1, 0.75])

    # plotting data points
    sys_write("Plotting data points... ")
    # Filter the window once instead of once per result set.
    in_region = [res for res in self.res if res.chr == chr and posbeg <= res.pos <= posend]
    for data_indicator in range(1, self.num_dif_results + 1):
        scores = []
        positions = []
        for res in in_region:
            if res.indicator != data_indicator:
                continue
            # NOTE(review): reads self.thresholds (one threshold per result
            # set); find_threshold() as visible here only sets self.threshold
            # -- confirm self.thresholds is populated elsewhere.
            if res.score >= self.thresholds[data_indicator - 1]:
                scores.append(res.score)
                positions.append(res.plotpos)
        plt.plot(positions, scores, ".", markersize=3, alpha=0.7)
    sys_write("Done.\n")

    # plotting axis
    sys_write("Plotting axes and extras... ")
    # Creating a ticklist for the region; there should be separate scale lists
    ticklist = []
    ticklabels = []
    # dynamic labeling of intervals: pick a tick spacing of 0.2, 0.5, 1 or 2
    # times the window's power of ten
    pos_range = posend - posbeg
    log10scale = int(math.log10(pos_range))
    log10residue = float(pos_range) / math.pow(10, log10scale)
    if log10residue < 1.2:
        log10multiplier = 0.2
    elif log10residue < 2.6:
        log10multiplier = 0.5
    elif log10residue < 5.8:
        log10multiplier = 1
    else:
        log10multiplier = 2
    # Windows of only a few bp would otherwise truncate to a spacing of 0 and
    # fail with a division by zero below.
    scale_interval = max(1, int(log10multiplier * math.pow(10, log10scale)))
    if scale_interval >= 1000000:
        label_scale_interval = scale_interval // 1000000
        label_scale_alphabet = "Mb"
    elif scale_interval >= 1000:
        label_scale_interval = scale_interval // 1000
        label_scale_alphabet = "kb"
    else:
        label_scale_interval = scale_interval
        label_scale_alphabet = "bp"
    first_tick = int((posbeg - 1) / scale_interval) + 1
    last_tick = int(posend / scale_interval)
    for i in range(first_tick, last_tick + 1):
        ticklabels.append(i * label_scale_interval)
        ticklist.append(chr_offset + i * scale_interval)
    plt.axis([chr_offset + posbeg, chr_offset + posend] + y_limits)
    plt.xticks(ticklist, ticklabels)
    plt.ylabel("$-log(p-$value$)$", size="large")
    plt.xlabel(label_scale_alphabet, size="large")

    # Plots custom snp list
    if plot_snp_list is not None:
        for custom_snp in plot_snp_list:
            if custom_snp.chr == chr and posbeg <= custom_snp.pos <= posend:
                plt.plot([custom_snp.plotpos, custom_snp.plotpos], y_limits, "b-", linewidth=0.5)
    sys_write("Done\n")

    # plotting genomes
    if not self.G:
        import gene_info

        sys_write("Loading gene model information... ")
        self.G = gene_info.genes(gene_file_name)
        sys_write("Done.\n")
    l_genes = self.G.get_genes_in_range(chr, posbeg, posend)
    if len(l_genes) > 50:
        print("Skipping gene plots: too many genes in range: %s. " % len(l_genes))
    else:
        sys_write("Plotting gene models... ")
        plt.axes([0, 0, 1, 0.18])
        plt.axis([chr_offset + posbeg, chr_offset + posend, -3, 0])
        plt.axis("off")
        broken_barh_xranges = []
        broken_barh_yranges = (-0.5, 0.5)
        annotate_y = -1
        for gene in l_genes:
            gene_start = chr_offset + gene.posbeg
            broken_barh_xranges.append((gene_start, gene.posend - gene.posbeg))
            plt.annotate(gene.id, (gene_start, annotate_y), rotation=270, size="small")
        plt.broken_barh(broken_barh_xranges, broken_barh_yranges, facecolors="yellow", alpha=0.5)
        sys_write("Done.\n")

    # saving figure
    sys_write("Outputting figure to file... ")
    plt.savefig(savefigname, format="png", dpi=300, bbox_inches="tight")
    sys_write("Done.\n")
def manhattan_plot(self, savefigname="trial.png", percentile=2, full_plot=True, plot_snp_list=None, tick_gap=10):
    """GWA plot, Manhattan style.

    Parameters:
        savefigname: output PNG file name.
        percentile: the top #% of scores to be included in the final plot
            (forwarded to ``find_threshold``).
        full_plot: when true, the x-axis spans ``0 .. self.chr_offsets[-1]``
            (every chromosome) with a tick every 5 Mb, regardless of what is
            inside the data. When false, the axis limits and ticks are left to
            matplotlib; read literally, the collapsed source would fail there
            on undefined tick lists.
        plot_snp_list: optional entries (with ``plotpos``) to mark with a
            vertical blue line.
        tick_gap: spacing, in Mb, between labelled ticks.

    Tick labels are dropped within ``0.3 * tick_gap`` Mb of the end of each
    chromosome. That cutoff is now measured against
    ``self.chr_sizes[chr_ind]``; it used to be compared with the cumulative
    ``self.chr_offsets[chr_ind + 1]``, which only worked for the first
    chromosome.
    """
    import matplotlib

    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    self.find_threshold(percentile)
    y_limits = [self.min_score - self.plot_padding, self.max_score + self.plot_padding]

    # initializing plot
    plt.figure(figsize=(12, 2.8))
    plt.axes([0.045, 0.15, 0.95, 0.71])

    # plotting data points
    sys_write("Plotting data points... ")
    for r in self.res:
        indices_to_plot = r["score"] >= self.threshold
        plt.plot(r["plotpos"][indices_to_plot], r["score"][indices_to_plot], ".", markersize=3, alpha=0.7)
    sys_write("Done.\n")

    # plotting axis
    sys_write("Plotting axes and extras... ")
    # Vertical separators at the start of every chromosome but the first.
    for chr_ind in self.chr_range[1:]:
        boundary = self.chr_offsets[chr_ind]
        plt.plot([boundary, boundary], y_limits, "k-", linewidth=0.5)

    # bonferroni
    bonf = -np.log10(0.05 / sum([len(r) for r in self.res]))
    if bonf < self.max_score + self.plot_padding:  # only plot if relevant
        plt.plot([0, self.chr_offsets[-1]], [bonf, bonf], "r--")

    # ticks
    if full_plot:
        # This part is independent of data
        # Mostly copied from Bjarni's
        ticklist = []
        ticklabels = []
        label_step = tick_gap * 1000000
        for chr_ind in self.chr_range:
            ticklist.extend(range(self.chr_offsets[chr_ind], self.chr_offsets[chr_ind + 1], 5000000))
            chr_size = self.chr_sizes[chr_ind]
            # Labels too close to the chromosome end would collide with the
            # first labels of the next chromosome.
            label_cutoff = chr_size - tick_gap * 300000
            for chr_ticklabel in range(0, chr_size, 5000000):
                # no tick for 0
                if chr_ticklabel % label_step == 0 and 0 < chr_ticklabel < label_cutoff:
                    ticklabels.append(chr_ticklabel // 1000000)
                else:
                    ticklabels.append("")
        plt.axis([0, self.chr_offsets[-1]] + y_limits)
        plt.xticks(ticklist, ticklabels)
    plt.ylabel("$-log(p-$value$)$", size="large")

    # Plots custom snp list
    if plot_snp_list is not None:
        for custom_snp in plot_snp_list:
            plt.plot([custom_snp.plotpos, custom_snp.plotpos], y_limits, "b-", linewidth=0.5)
    sys_write("Done\n")

    # saving figure
    sys_write("Outputting figure to file... ")
    plt.savefig(savefigname, format="png", dpi=300, bbox_inches="tight")
    sys_write("Done.\n")
def sort(self, whichcolumn=2):
    """Sort the result rows by one column once all results are loaded.

    ``self.res`` is replaced by its rows reordered by ascending values of
    column ``whichcolumn``. The parameter used to be ignored in favour of a
    hard-coded column 2, which remains the default.

    The original author flagged this method as "not working": it requires
    ``self.res`` to support 2-D NumPy-style indexing (``self.res[:, col]``)
    -- TODO confirm against the current layout of ``self.res``.
    """
    sys_write("Sorting through result entries... ")
    order = self.res[:, whichcolumn].argsort()
    self.res = self.res[order]
    sys_write("Done.\n")