def __init__(self, EMpath, title, annotation, organism, center, extend, rs, bs, ss, df, dft, fields, test, sense):
    """Load an experimental matrix and store the plotting parameters.

    Args:
        EMpath: path of the experimental-matrix file passed to ExperimentalMatrix.read().
        title: plot title, kept in self.title.
        annotation/organism/fields: accepted but not used in this body (the
            annotation branch below is commented out); self.annotation is forced to None.
        center, extend, rs, bs, ss, df, dft, sense: stored verbatim for later steps
            (centering mode, extension, read size, bin size, step size, difference
            flags, strand-sense flag) — semantics defined by the methods that read them.
        test: forwarded to ExperimentalMatrix.read()/match_ms_tags().
    """
    # Read the Experimental Matrix
    self.title = title
    self.exps = ExperimentalMatrix()
    self.exps.read(EMpath, test=test)
    # Any extra (non-reserved) column is treated as a mass-spec tag column.
    for f in self.exps.fields:
        if f not in ['name', 'type', 'file', "reads", "regions", "factors"]:
            self.exps.match_ms_tags(f, test=test)
    self.exps.remove_name()
    # if annotation:
    #     self.beds, self.bednames, self.annotation = annotation_dump(organism)
    # else:
    self.beds = self.exps.get_regionsets()  # A list of GenomicRegionSets
    self.bednames = self.exps.get_regionsnames()
    self.annotation = None
    self.reads = self.exps.get_readsfiles()
    self.readsnames = self.exps.get_readsnames()
    self.fieldsDict = self.exps.fieldsDict
    self.parameter = []  # collected lines for the parameter report
    self.center = center
    self.extend = extend
    self.rs = rs
    self.bs = bs
    self.ss = ss
    self.df = df
    self.dft = dft
    self.sense = sense
def __init__(self, reference_path, query_path):
    """Load the reference and query experimental matrices.

    Each path is read into its own ExperimentalMatrix; empty region sets are
    dropped, and the region sets / names are cached on the instance.
    """
    def _load(path):
        # Shared loader: read one matrix and prune empty region sets.
        em = ExperimentalMatrix()
        em.read(path)
        em.remove_empty_regionset()
        return em

    # Reference
    self.rEM = _load(reference_path)
    self.references = self.rEM.get_regionsets()
    self.referencenames = self.rEM.get_regionsnames()
    # Query
    self.qEM = _load(query_path)
    self.query = self.qEM.get_regionsets()
    self.querynames = self.qEM.get_regionsnames()
    self.parameter = []
    self.background = None
def __init__(self, EMpath, title, annotation, organism, center, extend, rs, bs, ss, df, dft, fields, test, sense):
    """Read the experimental matrix at *EMpath* and record plotting parameters.

    annotation/organism/fields are currently unused here (the annotation path is
    commented out and self.annotation is set to None). The remaining arguments
    are stored as-is for the downstream relocate/coverage/plot steps.
    """
    # Read the Experimental Matrix
    self.title = title
    self.exps = ExperimentalMatrix()
    self.exps.read(EMpath, test=test)
    # Columns other than the reserved ones are matched as mass-spec tag fields.
    for f in self.exps.fields:
        if f not in ['name', 'type', 'file', "reads", "regions", "factors"]:
            self.exps.match_ms_tags(f, test=test)
    self.exps.remove_name()
    # if annotation:
    #     self.beds, self.bednames, self.annotation = annotation_dump(organism)
    # else:
    self.beds = self.exps.get_regionsets()  # A list of GenomicRegionSets
    self.bednames = self.exps.get_regionsnames()
    self.annotation = None
    self.reads = self.exps.get_readsfiles()
    self.readsnames = self.exps.get_readsnames()
    self.fieldsDict = self.exps.fieldsDict
    self.parameter = []  # accumulates report lines
    self.center = center
    self.extend = extend
    self.rs = rs
    self.bs = bs
    self.ss = ss
    self.df = df
    self.dft = dft
    self.sense = sense
import sys
import os.path
from rgt.GenomicRegionSet import *
from rgt.ExperimentalMatrix import *
from fisher import pvalue

# Script: load a design (experimental matrix) plus an annotation directory and
# collect the universe of gene names for an association / enrichment analysis.
# Usage: script.py <designFile> <annotationPath> [<backgroundPeaks>]
back = False
designFile = sys.argv[1]
anotationPath = sys.argv[2]
# Annotation directory is expected to contain these two fixed files.
# NOTE(review): paths are joined by plain concatenation — anotationPath must
# end with a separator; confirm callers pass a trailing "/".
genomeFile = anotationPath + "chrom.sizes"
geneFile = anotationPath + "association_file.bed"
exps = ExperimentalMatrix()
exps.read(designFile)
beds = []
geneLists = []
# this should be improved
bedGenes = GenomicRegionSet(geneFile)
bedGenes.read_bed(geneFile)
# Unique set of all gene names present in the association BED file.
allgenes = []
for r in bedGenes:
    allgenes.append(r.name)
allgenes = list(set(allgenes))
genesets = exps.get_genesets()
# Optional third argument: a background peak file enables background mode.
if len(sys.argv) > 3:
    back = True
    backGroundPeaks = sys.argv[3]
def test_projection_test(self):
    """Smoke test: run projection_test on two matrices read from the same file.

    NOTE(review): both matrices read *path_input2* — matrix1 presumably should
    read a different input (path_input1?); confirm whether the self-comparison
    is intentional. No assertion is made; the test only checks it runs.
    """
    matrix1 = ExperimentalMatrix()
    matrix2 = ExperimentalMatrix()
    matrix1.read(path_input2)
    matrix2.read(path_input2)
    AssociationAnalysis.projection_test(matrix1, matrix2)
def test_jaccard_test(self):
    """Set up two matrices for the jaccard test.

    NOTE(review): this test reads both matrices (from the same file) but never
    calls the jaccard test nor asserts anything — it looks unfinished; a call
    like AssociationAnalysis.jaccard_test(matrix1, matrix2) is presumably
    missing. Confirm against the analogous projection test.
    """
    matrix1 = ExperimentalMatrix()
    matrix2 = ExperimentalMatrix()
    matrix1.read(path_input2)
    matrix2.read(path_input2)
# Validate positional arguments: four are required (the previous message said
# "three" while the check and usage below clearly consume four).
if len(args) != 4:
    parser.error("Exactly four parameters are needed: experimental matrix, gene expression, annotation path and prefix for output")

# map arguments
experimental_matrix_file = args[0]
gene_exp = args[1]
annotation_path = args[2]
outputdir = args[3]

# experimental_matrix_file = "/home/manuel/workspace/cluster_p/THOR/exp/exp23_macs2_payton/1"
# gene_exp = "/home/manuel/workspace/cluster_p/allhoff/project_THOR/data/payton/gene_expression/CCmean.data"
# annotation_path = "/home/manuel/workspace/cluster_h/rgtdata/hg19/"
# outputdir = "/home/manuel/test/"

exps = ExperimentalMatrix()
exps.read(experimental_matrix_file)
regionsets = exps.get_regionsets()

# Fixed file names inside the annotation directory.
genome_file = annotation_path + "/chrom.sizes"
gene_file = annotation_path + "/association_file.bed"

genes = GeneSet("Expression")
genes.read_expression(gene_exp)

# For each region set: associate peaks with genes, then average expression
# per association cluster.
for region in regionsets:
    bedNew = GenomicRegionSet("")
    [degenes, de_peak_genes, mappedGenes, totalPeaks, regionsToGenes] \
        = bedNew.filter_by_gene_association_old(region.fileName, genes.genes, gene_file, genome_file)
    [ct, labels] = averageExpression(region, genes, regionsToGenes)
#cov.normRPM() c.append(cov.coverage) return numpy.transpose(c) def printTable(namesCol, namesLines, table, fileName): f = open(fileName, "w") f.write("\t" + ("\t".join(namesCol)) + "\n") for i, line in enumerate(table): f.write(namesLines[i] + "\t" + ("\t".join([str(j) for j in line])) + "\n") out = "" experimentalFile = sys.argv[1] exps = ExperimentalMatrix() exps.read(experimentalFile) beds = exps.get_regionsets() reads = exps.get_readsfiles() readsnames = exps.get_readsnames() outputDir = sys.argv[2] if len(sys.argv) > 3: experimentalFile2 = sys.argv[3] exps2 = ExperimentalMatrix() exps2.read(experimentalFile2) reads = exps2.get_readsfiles() readsnames = exps2.get_readsnames() out = outputDir for bed in beds: bednames = [
class Lineplot:
    """Build line plots / heatmaps of read coverage around region sets.

    Workflow (driven by the caller): __init__ -> relocate_bed -> group_tags ->
    gen_cues -> coverage -> colormap/plot (or hmsort/hmcmlist/heatmap) ->
    gen_html/gen_htmlhm.

    NOTE(review): several methods index dict views (`self.data.values()[0]`),
    which only works on Python 2 — confirm the target interpreter.
    """

    def __init__(self, EMpath, title, annotation, organism, center, extend, rs, bs, ss, df, dft, fields, test, sense):
        """Read the experimental matrix and store plotting parameters.

        annotation/organism/fields are currently unused (the annotation branch
        is commented out and self.annotation is forced to None).
        """
        # Read the Experimental Matrix
        self.title = title
        self.exps = ExperimentalMatrix()
        self.exps.read(EMpath, test=test)
        # Non-reserved columns are matched as mass-spec tag fields.
        for f in self.exps.fields:
            if f not in ['name', 'type', 'file', "reads", "regions", "factors"]:
                self.exps.match_ms_tags(f, test=test)
        self.exps.remove_name()
        # if annotation:
        #     self.beds, self.bednames, self.annotation = annotation_dump(organism)
        # else:
        self.beds = self.exps.get_regionsets()  # A list of GenomicRegionSets
        self.bednames = self.exps.get_regionsnames()
        self.annotation = None
        self.reads = self.exps.get_readsfiles()
        self.readsnames = self.exps.get_readsnames()
        self.fieldsDict = self.exps.fieldsDict
        self.parameter = []  # report lines accumulated via print2()
        self.center = center
        self.extend = extend
        self.rs = rs    # read/extension size
        self.bs = bs    # bin size
        self.ss = ss    # step size
        self.df = df    # plot difference between first two series
        self.dft = dft  # field used to pair series for the difference
        self.sense = sense  # strand-sense-specific coverage

    def relocate_bed(self):
        """Recenter every region set according to self.center.

        Produces self.processed_beds and, for 'bothends'/'upstream'/
        'downstream', self.processed_bedsF with the regions whose coverage is
        later flipped and concatenated.
        """
        self.processed_beds = []
        self.processed_bedsF = []  # Processed beds to be flapped
        for bed in self.beds:
            if self.center == 'bothends':
                newbed = bed.relocate_regions(center='leftend',
                                              left_length=self.extend + self.bs,
                                              right_length=self.extend + self.bs)
                self.processed_beds.append(newbed)
                newbedF = bed.relocate_regions(center='rightend',
                                               left_length=self.extend + self.bs,
                                               right_length=self.extend + self.bs)
                self.processed_bedsF.append(newbedF)
            elif self.center == 'upstream' or self.center == 'downstream':
                # Split by strand: '+' regions are plotted as-is, '-' regions
                # go to the flip list.
                allbed = bed.relocate_regions(center=self.center,
                                              left_length=self.extend + self.bs,
                                              right_length=self.extend + self.bs)
                newbed = allbed.filter_strand(strand="+")
                self.processed_beds.append(newbed)
                newbedF = allbed.filter_strand(strand="-")
                self.processed_bedsF.append(newbedF)
            else:
                # Extra half-bin + 2 steps of margin; trimmed again in
                # coverage() via cut_end.
                newbed = bed.relocate_regions(center=self.center,
                                              left_length=self.extend + int(0.5 * self.bs) + 2 * self.ss,
                                              right_length=self.extend + int(0.5 * self.bs) + 2 * self.ss)
                self.processed_beds.append(newbed)

    def group_tags(self, groupby, sortby, colorby):
        """Generate the tags for the grouping of plot
        Parameters:
            groupby = 'reads','regions','cell',or 'factor'
            colorby = 'reads','regions','cell',or 'factor'
            sortby = 'reads','regions','cell',or 'factor'
        """
        # NOTE(review): list.remove() drops only the FIRST "None"; with two
        # "None" selections one remains in tag_type — confirm intended.
        self.tag_type = [sortby, groupby, colorby, self.dft]
        if "None" in self.tag_type:
            self.tag_type.remove("None")

        if groupby == "None":
            self.group_tags = [""]
        elif groupby == "regions" and self.annotation:
            self.group_tags = self.bednames
        else:
            self.group_tags = gen_tags(self.exps, groupby)

        if sortby == "None":
            self.sort_tags = [""]
        elif sortby == "regions" and self.annotation:
            self.sort_tags = self.bednames
        else:
            self.sort_tags = gen_tags(self.exps, sortby)

        if colorby == "None":
            self.color_tags = [""]
        elif colorby == "regions" and self.annotation:
            self.color_tags = self.bednames
        else:
            self.color_tags = gen_tags(self.exps, colorby)

        print("\tColumn labels:\t" + ",".join(self.group_tags))
        print("\tRow labels:\t" + ",".join(self.sort_tags))
        print("\tColor labels:\t" + ",".join(self.color_tags))

    def gen_cues(self):
        """Map each bed/bam name to the set of tags it carries (used by
        coverage() to decide which bed x bam pairs belong to each cell)."""
        self.cuebed = OrderedDict()
        self.cuebam = OrderedDict()
        # if self.annotation:
        #     #all_tags = []
        #     #for dictt in self.exps.fieldsDict.values():
        #     #    for tag in dictt.keys():
        #     #        all_tags.append(tag)
        #     for bed in self.bednames:
        #         # self.cuebed[bed] = set([bed]+all_tags)
        #         self.cuebed[bed] = set([bed])
        # else:
        for bed in self.bednames:
            self.cuebed[bed] = set(tag_from_r(self.exps, self.tag_type, bed))
            try:
                self.cuebed[bed].remove("None")
            except:
                pass
        for bam in self.readsnames:
            self.cuebam[bam] = set(tag_from_r(self.exps, self.tag_type, bam))

    def coverage(self, sortby, heatmap=False, logt=False, mp=0, log=False):
        """Compute coverage arrays for every (sort, group, color, dft) cell.

        Result is stored in self.data as nested OrderedDicts; leaves are
        defaultdict(list) with keys "all" (and "sense_1"/"sense_2" when
        self.sense, "df" when self.df) — except in heatmap mode, where the
        leaf is the stacked per-region matrix. mp > 0 switches to
        multiprocessing via MyPool/compute_coverage.
        """
        def annot_ind(bednames, tags):
            """Find the index for annotation tag"""
            for ind, a in enumerate(bednames):
                if a in tags:
                    return ind

        if mp > 0:
            ts = time.time()
        normRPM = False
        # Calculate for coverage
        mp_input = []
        data = OrderedDict()
        bi = 0
        for s in self.sort_tags:
            data[s] = OrderedDict()
            for g in self.group_tags:
                data[s][g] = OrderedDict()
                for c in self.color_tags:
                    # if self.df: data[s][g][c] = []
                    data[s][g][c] = OrderedDict()
                    if not self.dft:
                        dfs = [c]
                    else:
                        dfs = self.exps.fieldsDict[self.dft].keys()
                    for d in dfs:
                        data[s][g][c][d] = defaultdict(list)
                        for bed in self.cuebed.keys():
                            # print(self.cuebed[bed])
                            # print(set([s,g,c,d]))
                            # print(self.cuebed[bed].issubset(set([s,g,c,d])))
                            # A bed belongs to this cell if it shares >2 tags
                            # with (s,g,c,d) or all its tags are among them.
                            if len(self.cuebed[bed].intersection(set([s, g, c, d]))) > 2 or self.cuebed[bed].issubset(set([s, g, c, d])):
                                # if self.cuebed[bed] <= set([s,g,c]):
                                for bam in self.cuebam.keys():
                                    # print(self.cuebam[bam])
                                    # print(set([s,g,c]))
                                    if self.cuebam[bam] <= set([s, g, c, d]):
                                        i = self.bednames.index(bed)
                                        j = self.readsnames.index(bam)
                                        # print(bed + "." + bam)
                                        # if len(self.processed_beds[i]) == 0:
                                        #     try:
                                        #         data[s][g][c][d].append(numpy.empty(1, dtype=object))
                                        #     except:
                                        #         data[s][g][c][d] = [numpy.empty(1, dtype=object)]
                                        #     continue
                                        #########################################################
                                        if mp > 0:  # Multiple processing
                                            mp_input.append([self.processed_beds[i], self.reads[j],
                                                             self.rs, self.bs, self.ss, self.center,
                                                             heatmap, logt, s, g, c, d])
                                            data[s][g][c][d] = None
                                        #########################################################
                                        else:  # Single thread
                                            ts = time.time()
                                            cov = CoverageSet(bed + "." + bam, self.processed_beds[i])
                                            # print(len(self.processed_beds[i]))
                                            # Signal source: conservation track,
                                            # bigwig, or BAM (strand-aware if
                                            # self.sense).
                                            if "Conservation" in [s, g, c, d]:
                                                cov.phastCons46way_score(stepsize=self.ss)
                                            elif ".bigwig" in self.reads[j].lower() or ".bw" in self.reads[j].lower():
                                                cov.coverage_from_bigwig(bigwig_file=self.reads[j], stepsize=self.ss)
                                            else:
                                                if not self.sense:
                                                    cov.coverage_from_bam(bam_file=self.reads[j], extension_size=self.rs,
                                                                          binsize=self.bs, stepsize=self.ss)
                                                    if normRPM:
                                                        cov.normRPM()
                                                else:  # Sense specific
                                                    cov.coverage_from_bam(bam_file=self.reads[j], extension_size=self.rs,
                                                                          binsize=self.bs, stepsize=self.ss,
                                                                          get_sense_info=True, paired_reads=True)
                                                    cov.array_transpose()
                                                    if normRPM:
                                                        cov.normRPM()
                                            # When bothends, consider the fliping end
                                            if self.center == 'bothends' or self.center == 'upstream' or self.center == 'downstream':
                                                if "Conservation" in [s, g, c, d]:
                                                    flap = CoverageSet("for flap", self.processed_bedsF[i])
                                                    flap.phastCons46way_score(stepsize=self.ss)
                                                    ffcoverage = numpy.fliplr(flap.coverage)
                                                    cov.coverage = numpy.concatenate((cov.coverage, ffcoverage), axis=0)
                                                elif ".bigwig" in self.reads[j].lower() or ".bw" in self.reads[j].lower():
                                                    flap = CoverageSet("for flap", self.processed_bedsF[i])
                                                    flap.coverage_from_bigwig(bigwig_file=self.reads[j], stepsize=self.ss)
                                                    ffcoverage = numpy.fliplr(flap.coverage)
                                                    cov.coverage = numpy.concatenate((cov.coverage, ffcoverage), axis=0)
                                                else:
                                                    flap = CoverageSet("for flap", self.processed_bedsF[i])
                                                    if not self.sense:
                                                        flap.coverage_from_bam(self.reads[j], extension_size=self.rs,
                                                                               binsize=self.bs, stepsize=self.ss)
                                                        if normRPM:
                                                            flap.normRPM()
                                                    else:  # Sense specific
                                                        flap.coverage_from_bam(bam_file=self.reads[j], extension_size=self.rs,
                                                                               binsize=self.bs, stepsize=self.ss,
                                                                               get_sense_info=True, paired_reads=True)
                                                        flap.array_transpose(flip=True)
                                                        if normRPM:
                                                            flap.normRPM()
                                                    ffcoverage = numpy.fliplr(flap.coverage)
                                                    try:
                                                        cov.coverage = numpy.concatenate((cov.coverage, ffcoverage), axis=0)
                                                    except:
                                                        pass
                                                    if self.sense:
                                                        cov.transpose_cov1 = numpy.concatenate((cov.transpose_cov1,
                                                                                                flap.transpose_cov1), axis=0)
                                                        cov.transpose_cov2 = numpy.concatenate((cov.transpose_cov2,
                                                                                                flap.transpose_cov2), axis=0)
                                            # Averaging the coverage of all regions of each bed file
                                            if heatmap:
                                                if logt:
                                                    data[s][g][c][d] = numpy.log10(numpy.vstack(cov.coverage) + 1)  # Store the array into data list
                                                else:
                                                    data[s][g][c][d] = numpy.vstack(cov.coverage)  # Store the array into data list
                                            else:
                                                if len(cov.coverage) == 0:
                                                    data[s][g][c][d] = None
                                                    print("** Warning: Cannot open " + self.reads[j])
                                                    continue
                                                else:
                                                    # NOTE(review): this loop reuses `i`, shadowing the
                                                    # bednames index above (harmless only because i is
                                                    # not read again afterwards).
                                                    for i, car in enumerate(cov.coverage):
                                                        if i == 0:
                                                            avearr = np.array(car, ndmin=2)
                                                        else:
                                                            # avearr = numpy.vstack((avearr, np.array(car, ndmin=2)))
                                                            try:
                                                                avearr = numpy.vstack((avearr, np.array(car, ndmin=2)))
                                                            except:
                                                                print(bed + "." + bam + "." + str(i))
                                                    if log:
                                                        avearr = numpy.log2(avearr + 1)
                                                    avearr = numpy.average(avearr, axis=0)
                                                    if self.sense:
                                                        if log:
                                                            sense_1 = numpy.average(numpy.log2(cov.transpose_cov1 + 1), axis=0)
                                                            sense_2 = numpy.average(numpy.log2(cov.transpose_cov2 + 1), axis=0)
                                                        else:
                                                            sense_1 = numpy.average(cov.transpose_cov1, axis=0)
                                                            sense_2 = numpy.average(cov.transpose_cov2, axis=0)
                                                    # Trim the half-bin margin added in relocate_bed.
                                                    cut_end = int(self.bs / self.ss)
                                                    avearr = avearr[cut_end:-cut_end]
                                                    data[s][g][c][d]["all"].append(avearr)
                                                    if self.sense:
                                                        sense_1 = sense_1[cut_end:-cut_end]
                                                        sense_2 = sense_2[cut_end:-cut_end]
                                                        data[s][g][c][d]["sense_1"].append(sense_1)
                                                        data[s][g][c][d]["sense_2"].append(sense_2)
                                            bi += 1
                                            te = time.time()
                                            # NOTE(review): ts - te is negative (elapsed time sign flipped).
                                            print2(self.parameter, "\t" + str(bi) + "\t" + "{0:30}\t--{1:<5.1f}s".format(
                                                bed + "." + bam, ts - te))

        if mp > 0:
            # Fan out the collected jobs and merge results back by their
            # (s, g, c, d) key carried in out[0..3]; payload is out[4].
            pool = MyPool(mp)
            mp_output = pool.map(compute_coverage, mp_input)
            pool.close()
            pool.join()
            for s in data.keys():
                for g in data[s].keys():
                    for c in data[s][g].keys():
                        for d in data[s][g][c].keys():
                            for out in mp_output:
                                if out[0] == s and out[1] == g and out[2] == c and out[3] == d:
                                    if self.df:
                                        try:
                                            data[s][g][c][d][-1].append(out[4])
                                        except:
                                            data[s][g][c][d] = [[out[4]]]
                                    else:
                                        try:
                                            data[s][g][c][d].append(out[4])
                                        except:
                                            data[s][g][c][d] = [out[4]]

        if self.df:
            # Difference mode: subtract the second series from the first.
            for s in data.keys():
                for g in data[s].keys():
                    for c in data[s][g].keys():
                        for d in data[s][g][c].keys():
                            if isinstance(data[s][g][c][d]["all"], list) and len(data[s][g][c][d]["all"]) > 1:
                                diff = numpy.subtract(data[s][g][c][d]["all"][0], data[s][g][c][d]["all"][1])
                                data[s][g][c][d]["df"].append(diff.tolist())
                            else:
                                print("Warning: There is no repetitive reads for calculating difference.\n"
                                      " Please add one more entry in experimental matrix.")
        self.data = data

    def colormap(self, colorby, definedinEM):
        """Assign one color per color tag (delegates to the module-level
        colormap() helper, which this method intentionally shadows)."""
        colors = colormap(self.exps, colorby, definedinEM, annotation=self.annotation)
        self.colors = {}
        for i, c in enumerate(self.color_tags):
            self.colors[c] = colors[i]

    def plot(self, groupby, colorby, output, printtable=False, scol=False, srow=False, w=2, h=2):
        """Draw the grid of line plots (rows = sort tags, cols = group tags).

        scol/srow share y-limits per column/row; printtable also dumps the
        plotted values via output_array(). The figure is stored in self.fig.
        """
        rot = 50
        if len(self.data.values()[0].keys()) < 2:
            ticklabelsize = w * 1.5
        else:
            ticklabelsize = w * 3
        tw = len(self.data.values()[0].keys()) * w
        th = len(self.data.keys()) * (h * 0.8)
        f, axs = plt.subplots(len(self.data.keys()), len(self.data.values()[0].keys()), dpi=300, figsize=(tw, th))
        yaxmax = [0] * len(self.data.values()[0])
        sx_ymax = [0] * len(self.data.keys())
        if self.df:
            yaxmin = [0] * len(self.data.values()[0])
            sx_ymin = [0] * len(self.data.keys())
        if printtable:
            bott = self.extend - int(0.5 * self.ss)
            pArr = [["Group_tag", "Sort_tag", "Color_tag"] + [str(x) for x in range(-bott, bott, self.ss)]]  # Header
        nit = len(self.data.keys())
        for it, s in enumerate(self.data.keys()):
            for i, g in enumerate(self.data[s].keys()):
                # plt.subplots returns a scalar/1-D/2-D axes object depending
                # on the grid shape; the except branch handles the non-2-D cases.
                try:
                    ax = axs[it, i]
                except:
                    if len(self.data.keys()) == 1 and len(self.data[s].keys()) == 1:
                        ax = axs
                    elif len(self.data.keys()) == 1 and len(self.data[s].keys()) > 1:
                        ax = axs[i]
                    else:
                        ax = axs[it]
                if it == 0:
                    if self.df:
                        ax.set_title(g + "_df", fontsize=ticklabelsize + 2)
                    else:
                        ax.set_title(g, fontsize=ticklabelsize + 2)
                # Processing for future output
                for j, c in enumerate(self.data[s][g].keys()):
                    for k, d in enumerate(self.data[s][g][c].keys()):
                        if not self.data[s][g][c][d]:
                            continue
                        else:
                            if not self.sense:
                                if self.df:
                                    pt = self.data[s][g][c][d]["df"]
                                else:
                                    pt = self.data[s][g][c][d]["all"]
                                for l, y in enumerate(pt):
                                    # print(y)
                                    yaxmax[i] = max(numpy.amax(y), yaxmax[i])
                                    sx_ymax[it] = max(numpy.amax(y), sx_ymax[it])
                                    if self.df:
                                        yaxmin[i] = min(numpy.amin(y), yaxmin[i])
                                        sx_ymin[it] = min(numpy.amin(y), sx_ymin[it])
                                    x = numpy.linspace(-self.extend, self.extend, len(y))
                                    ax.plot(x, y, color=self.colors[c], lw=1, label=c)
                                    if it < nit - 1:
                                        ax.set_xticklabels([])
                                    # Processing for future output
                                    if printtable:
                                        pArr.append([g, s, c, d] + list(y))
                            else:
                                # Sense mode: plot '+' strand up, '-' strand
                                # mirrored below the zero line.
                                plt.text(0.5, 0.51, 'sense', transform=ax.transAxes, fontsize=ticklabelsize,
                                         horizontalalignment='center', verticalalignment='bottom')
                                plt.text(0.5, 0.49, 'anti-sense', transform=ax.transAxes, fontsize=ticklabelsize,
                                         horizontalalignment='center', verticalalignment='top')
                                plt.plot((-self.extend, self.extend), (0, 0), '0.1', linewidth=0.2)
                                print(self.data[s][g][c][d])
                                for l, y in enumerate(self.data[s][g][c][d]["sense_1"]):
                                    # print(y)
                                    ymax1 = numpy.amax(y)
                                    yaxmax[i] = max(ymax1, yaxmax[i])
                                    sx_ymax[it] = max(ymax1, sx_ymax[it])
                                    x = numpy.linspace(-self.extend, self.extend, y.shape[0])
                                    ax.plot(x, y, color=self.colors[c], lw=1, label=c)
                                    if it < nit - 1:
                                        ax.set_xticklabels([])
                                    # Processing for future output
                                    if printtable:
                                        pArr.append([g, s, c, d, "+"] + list(y))
                                for l, y in enumerate(self.data[s][g][c][d]["sense_2"]):
                                    # print(y)
                                    ymax2 = numpy.amax(y)
                                    yaxmax[i] = max(ymax2, yaxmax[i])
                                    sx_ymax[it] = max(ymax2, sx_ymax[it])
                                    x = numpy.linspace(-self.extend, self.extend, y.shape[0])
                                    ax.plot(x, -y, color=self.colors[c], lw=1, label=c)
                                    if it < nit - 1:
                                        ax.set_xticklabels([])
                                    # Processing for future output
                                    if printtable:
                                        pArr.append([g, s, c, d, "-"] + list(y))
                                ym = 1.2 * max(max(yaxmax), max(sx_ymax))
                                ax.set_ylim([-ym, ym])
                ax.get_yaxis().set_label_coords(-0.1, 0.5)
                ax.set_xlim([-self.extend, self.extend])
                plt.setp(ax.get_xticklabels(), fontsize=ticklabelsize, rotation=rot)
                plt.setp(ax.get_yticklabels(), fontsize=ticklabelsize)
                ax.locator_params(axis='x', nbins=4)
                ax.locator_params(axis='y', nbins=2)
                # try:
                #
                # except:
                #     ax.locator_params(axis='y', nbins=2)
                #     pass
        if printtable:
            output_array(pArr, directory=output, folder=self.title, filename="plot_table.txt")
        for it, ty in enumerate(self.data.keys()):
            try:
                axs[it, 0].set_ylabel("{}".format(ty), fontsize=ticklabelsize + 1)
            except:
                try:
                    axs[it].set_ylabel("{}".format(ty), fontsize=ticklabelsize + 1)
                except:
                    axs.set_ylabel("{}".format(ty), fontsize=ticklabelsize + 1)
            for i, g in enumerate(self.data[ty].keys()):
                try:
                    axx = axs[it, i]
                except:
                    try:
                        if len(self.data.keys()) == 1:
                            axx = axs[i]
                        else:
                            axx = axs[it]
                    except:
                        axx = axs
                if self.df:
                    # NOTE(review): in df mode without scol/srow, ymin/ymax are
                    # never assigned — the NameError is swallowed by the
                    # try/except around set_ylim below.
                    if scol:
                        ymin = yaxmin[i] - abs(yaxmin[i] * 0.2)
                        ymax = yaxmax[i] + abs(yaxmax[i] * 0.2)
                    elif srow:
                        ymin = sx_ymin[it] - abs(sx_ymin[it] * 0.2)
                        ymax = sx_ymax[it] + abs(sx_ymax[it] * 0.2)
                else:
                    if scol:
                        ymax = yaxmax[i] * 1.2
                    elif srow:
                        ymax = sx_ymax[it] * 1.2
                    else:
                        ymax = axx.get_ylim()[1]
                    if self.sense:
                        ymin = -ymax
                    else:
                        ymin = 0
                try:
                    axx.set_ylim([ymin, ymax])
                except:
                    pass
        handles, labels = ax.get_legend_handles_labels()
        uniq_labels = unique(labels)
        plt.legend([handles[labels.index(l)] for l in uniq_labels], uniq_labels, loc='center left',
                   handlelength=1, handletextpad=1, columnspacing=2, borderaxespad=0.,
                   prop={'size': ticklabelsize}, bbox_to_anchor=(1.05, 0.5))
        f.tight_layout()
        self.fig = f

    def gen_html(self, directory, title, align=50):
        """Write index.html (figure) and parameters.html (settings table)
        under directory/title/ using the project's Html helper."""
        dir_name = os.path.basename(directory)
        # check_dir(directory)
        html_header = dir_name + " / " + title
        link_d = OrderedDict()
        link_d["Lineplot"] = "index.html"
        link_d["Parameters"] = "parameters.html"
        html = Html(name=html_header, links_dict=link_d, fig_rpath="../style", RGT_header=False,
                    other_logo="viz", homepage="../index.html")
        html.add_figure("lineplot.png", align="center", width="80%")
        html.write(os.path.join(directory, title, "index.html"))

        ## Parameters
        html = Html(name=html_header, links_dict=link_d, fig_rpath="../style", RGT_header=False,
                    other_logo="viz", homepage="../index.html")
        type_list = 'ssssssssss'
        col_size_list = [20, 20, 20, 20, 20, 20, 20, 20, 20]
        header_list = ["Assumptions and hypothesis"]
        data_table = []
        if self.annotation:
            data_table.append(
                ["Genomic annotation: TSS - Transcription Start Site; TTS - Transcription Termination Site."])
        data_table.append(["Directory: " + directory.rpartition("/")[2]])
        data_table.append(["Title: " + title])
        data_table.append(["Extend length: " + str(self.extend)])
        data_table.append(["Read size: " + str(self.rs)])
        data_table.append(["Bin size: " + str(self.bs)])
        data_table.append(["Step size: " + str(self.ss)])
        data_table.append(["Center mode: " + self.center])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left")
        html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See parameters</a>'])
        html.add_free_content(['<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'])
        html.write(os.path.join(directory, title, "parameters.html"))

    def hmsort(self, sort):
        """Reorder heatmap rows in place.

        sort == None: leave as-is; sort == 0: rank rows by the summed signal
        across all color series; otherwise rank by the (sort-1)-th series only.
        Rows are written back in descending rank order (d[-ranki, :]).
        """
        if sort == None:
            pass
        elif sort == 0:
            for t in self.data.keys():
                for i, g in enumerate(self.data[t].keys()):
                    # print(numpy.sum(data[t][bed].values()[0], axis=1))
                    # print(len(numpy.sum(data[t][bed].values()[0], axis=1)))
                    sumarr = numpy.sum([numpy.sum(d, axis=1) for d in self.data[t][g].values()], axis=0)
                    # print(sumarr)
                    # sumarr = numpy.sum(sumarr, axis=1)
                    ind = stats.rankdata(sumarr, method='ordinal')  # The index for further sorting
                    # numpy.fliplr(ind)
                    for j, c in enumerate(self.data[t][g].keys()):
                        d = numpy.empty(shape=(self.data[t][g][c].shape))
                        for k, ranki in enumerate(ind):
                            d[-ranki, :] = self.data[t][g][c][k, :]
                        self.data[t][g][c] = d
        else:
            for t in self.data.keys():
                for i, g in enumerate(self.data[t].keys()):
                    sumarr = numpy.sum(self.data[t][g].values()[sort - 1], axis=1)
                    # print(sumarr)
                    # sumarr = numpy.sum(sumarr, axis=1)
                    ind = stats.rankdata(sumarr, method='ordinal')  # The index for further sorting
                    # list(ind)
                    # print(ind)
                    for j, c in enumerate(self.data[t][g].keys()):
                        d = numpy.empty(shape=(self.data[t][g][c].shape))
                        for k, ranki in enumerate(ind):
                            d[-ranki, :] = self.data[t][g][c][k, :]
                        self.data[t][g][c] = d
            # print(data[t][bed].values()[0])

    def hmcmlist(self, colorby, definedinEM):
        """Pick heatmap colormaps. Currently hard-coded; the EM-driven choice
        is disabled (arguments are accepted but ignored)."""
        # self.colors = colormaps(self.exps, colorby, definedinEM)
        self.colors = ["Reds", "Blues", "Oranges", "Greens", "Purples"]

    def heatmap(self, logt):
        """Render one heatmap figure per sort tag (grid: group x color), with
        a horizontal colorbar under the last row. Figures are collected in
        self.figs and their file stems in self.hmfiles."""
        tickfontsize = 6
        ratio = 10
        self.hmfiles = []
        self.figs = []
        for ti, t in enumerate(self.data.keys()):
            # fig.append(plt.figure())
            # rows = len(data[t].keys())
            columns = len(self.data[t].values()[0].keys())
            # fig, axs = plt.subplots(rows,columns, sharey=True, dpi=300)
            # matplotlib.pyplot.subplots_adjust(left=1, right=2, top=2, bottom=1)
            fig = plt.figure(t)
            plt.suptitle("Heatmap: " + t, y=1.05)
            rows = len(self.data[t].keys())
            # gs = gridspec.GridSpec(rows*ratio,columns)
            axs = numpy.empty(shape=(rows + 1, columns), dtype=object)
            for bi, g in enumerate(self.data[t].keys()):
                for bj, c in enumerate(self.data[t][g].keys()):
                    max_value = numpy.amax(self.data[t][g][c])
                    max_value = int(max_value)
                    axs[bi, bj] = plt.subplot2grid(shape=(rows * ratio + 1, columns), loc=(bi * ratio, bj), rowspan=ratio)
                    if bi == 0:
                        axs[bi, bj].set_title(c, fontsize=7)
                    # print(self.data[t][g][c])
                    # print(self.colors)
                    # print(bj)
                    # im = axs[bi, bj].imshow(self.data[t][g][c], extent=[-self.extend, self.extend, 0,1], aspect='auto',
                    #                         vmin=0, vmax=max_value, interpolation='nearest', cmap=self.colors[bj])
                    # NOTE(review): the colormap is hard-coded to "Blues" here,
                    # ignoring self.colors set by hmcmlist — confirm intended.
                    im = axs[bi, bj].imshow(self.data[t][g][c], extent=[-self.extend, self.extend, 0, 1], aspect='auto',
                                            vmin=0, vmax=max_value, interpolation='nearest', cmap=plt.get_cmap("Blues"))
                    # for bi, g in enumerate(self.data[t].keys()):
                    #     for bj, c in enumerate(self.data[t][g].keys()):
                    #         im = axs[bi, bj].imshow(self.data[t][g][c], extent=[-self.extend, self.extend, 0,1], aspect='auto',
                    #                                 vmin=0, vmax=max_value, interpolation='nearest', cmap=cm.coolwarm)
                    axs[bi, bj].set_xlim([-self.extend, self.extend])
                    axs[bi, bj].set_xticks([-self.extend, 0, self.extend])
                    # axs[bi, bj].set_xticklabels([-args.e, 0, args.e]
                    plt.setp(axs[bi, bj].get_xticklabels(), fontsize=tickfontsize, rotation=0)
                    # plt.setp(axs[bi, bj].get_yticklabels(), fontsize=10)
                    # axs[bi, bj].locator_params(axis = 'x', nbins = 2)
                    # axs[bi, bj].locator_params(axis = 'y', nbins = 4)
                    for spine in ['top', 'right', 'left', 'bottom']:
                        axs[bi, bj].spines[spine].set_visible(False)
                    axs[bi, bj].tick_params(axis='x', which='both', bottom='off', top='off', labelbottom='on')
                    axs[bi, bj].tick_params(axis='y', which='both', left='off', right='off', labelleft='off')
                    # if bj > 0:
                    #     plt.setp(axs[bi, bj].get_yticklabels(),visible=False)
                    # plt.setp(axarr[i].get_yticks(),visible=False)
                    axs[bi, bj].minorticks_off()
                    if bj == 0:
                        # nregion = len(self.exps.objectsDict[g])
                        # axs[bi, bj].set_ylabel(self.exps.get_type(g,'factor')+" ("+str(nregion) + ")", fontsize=7)
                        axs[bi, bj].set_ylabel(g, fontsize=7)
                    if bi == rows - 1:
                        # Last row: attach a horizontal colorbar below the column.
                        # divider = make_axes_locatable(axs[bi,bj])
                        # cax = divider.append_axes("bottom", size="5%", pad=0.5)
                        cbar_ax = plt.subplot2grid((rows * ratio + 4, columns), (rows * ratio + 3, bj))
                        # axs[rows,bj].tick_params(axis='y', which='both', left='off', right='off', labelleft='off')
                        # cbar = grid.cbar_axes[i//2].colorbar(im)
                        # cbar = plt.colorbar(im, cax = axs[rows,bj], ticks=[0, max_value], orientation='horizontal')
                        # cbar = axs[rows,bj].imshow(range(int(max_value)), extent=[0, int(max_value),0,0], aspect=10,
                        #                            extent=[-self.extend, self.extend,0,0]
                        #                            vmin=0, vmax=max_value, interpolation='nearest', cmap=self.colors[bj])
                        # cbar = axs[rows,bj].imshow(self.data[t][g][c], extent=[-self.extend, self.extend, 0,1], aspect='auto',
                        #                            vmin=0, vmax=max_value, interpolation='nearest', cmap=self.colors[bj])
                        # cbar = axs[rows,bj].imshow([range(2*self.extend),range(2*self.extend),range(2*self.extend)],
                        #                            aspect='auto', vmin=0, vmax=max_value, interpolation='nearest', cmap=self.colors[bj] )
                        # cbar.outline.set_linewidth(0.5)
                        # axs[rows,bj].set_ticks_position('none')
                        # axs[rows,bj].tick_params(axis='x', which='both', bottom='off', top='off', labelbottom='off')
                        # axs[rows,bj].tick_params(axis='y', which='both', left='off', right='off', labelleft='off')
                        # cbar.set_label('Amplitute of signal')
                        max_value = int(max_value)
                        # width = 0.4/rows
                        # cbar_ax = fig.add_axes([0.01 + bj/columns, 0, width, 0.01])
                        cbar = plt.colorbar(im, cax=cbar_ax, ticks=[0, max_value], orientation='horizontal')
                        cbar.ax.set_xticklabels([0, int(max_value)])
                        if logt:
                            cbar.ax.set_xticklabels(['0', '{:1.1f}'.format(max_value)], fontsize=tickfontsize)  # horizontal colorbar
                            cbar.set_label('log10', fontsize=tickfontsize)
                        # else:
                        #     cbar.ax.set_xticklabels(['0', int(max_value)], fontsize=tickfontsize)  # horizontal colorbar
                        #     pass
                        # cbar.outline.set_linewidth(0.1)
            # fig.tight_layout()
            # fig.tight_layout(pad=1.08, h_pad=None, w_pad=None)
            # fig.tight_layout(pad=1, h_pad=1, w_pad=1)
            self.figs.append(fig)
            self.hmfiles.append("heatmap" + "_" + t)

    def gen_htmlhm(self, outputname, title, align=50):
        """Write index.html (all heatmap figures) and parameters.html links
        under outputname/title/."""
        dir_name = os.path.basename(outputname)
        # check_dir(directory)
        html_header = title
        link_d = OrderedDict()
        link_d["Lineplot"] = "index.html"
        link_d["Parameters"] = "parameters.html"
        html = Html(name=html_header, links_dict=link_d, fig_rpath="../style", RGT_header=False,
                    other_logo="viz", homepage="../index.html")

        # Each row is a plot with its data
        for name in self.hmfiles:
            html.add_figure(name + ".png", align="center")
        html.write(os.path.join(outputname, title, "index.html"))

        ## Parameters
        html = Html(name=html_header, links_dict=link_d, fig_rpath="../style", RGT_header=False,
                    other_logo="viz", homepage="../index.html")
        html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See parameters</a>'])
        html.add_free_content(['<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'])
        html.write(os.path.join(outputname, title, "parameters.html"))
parser.add_option("--distance", "-d", dest="distance", default=50000, help="distance from peak to gene", type="int")
parser.add_option("--type", "-t", dest="type", default="bed", help="type of bed file (<bed>, <THOR>)", type="str")
parser.add_option("--metric", dest="metric", default="max", help="metric to merge peaks' scores (mean, max)", type="str")

(options, args) = parser.parse_args()

# Modes 1-3 need two positional args; mode 4 additionally needs the gene
# expression file (args[2]) — hence only an upper bound is enforced here.
i = 3
if len(args) > i:
    parser.error("Exactly %s parameters are needed" % i)

path_exp_matrix = args[0]
path_annotation = args[1]
genome_file = os.path.join(path_annotation, "chrom.sizes")
gene_file = os.path.join(path_annotation, "association_file.bed")

exp_matrix = ExperimentalMatrix()
exp_matrix.read(path_exp_matrix, is_bedgraph=False)

print("Use metric %s to merge peaks' score." % options.metric, file=sys.stderr)

# Dispatch on the requested mode. Bug fix: the original used `options.mode is 1`
# etc. — identity comparison against int literals only works by accident
# (CPython small-int caching) and raises SyntaxWarning on Python 3.8+.
if options.mode == 1:
    mode_1(exp_matrix, options.distance)
elif options.mode == 2:
    mode_2(exp_matrix, options.distance)
elif options.mode == 3:
    mode_3(exp_matrix, options.distance, options.type)
elif options.mode == 4:
    geneexp_file = args[2]
    mode_4(exp_matrix, options.distance, options.type, geneexp_file)
class Lineplot:
    """Compute and plot read-coverage line plots (and heatmaps) over region sets.

    Regions and read files are taken from an RGT ExperimentalMatrix; coverage
    is grouped/sorted/colored by matrix tags and rendered with matplotlib.
    NOTE(review): `self.data.values()[0]`-style indexing below is Python-2
    dict behavior — this class predates a py3 port; verify before running on 3.
    """

    def __init__(self, EMpath, title, annotation, organism, center, extend,
                 rs, bs, ss, df, dft, fields, test, sense):
        # Read the Experimental Matrix
        self.title = title
        self.exps = ExperimentalMatrix()
        self.exps.read(EMpath, test=test)
        for f in self.exps.fields:
            if f not in ['name', 'type', 'file', "reads", "regions", "factors"]:
                self.exps.match_ms_tags(f, test=test)
                self.exps.remove_name()
        self.beds = self.exps.get_regionsets()  # A list of GenomicRegionSets
        self.bednames = self.exps.get_regionsnames()
        self.annotation = None
        self.reads = self.exps.get_readsfiles()
        self.readsnames = self.exps.get_readsnames()
        self.fieldsDict = self.exps.fieldsDict
        self.parameter = []          # accumulates log lines for print2()
        self.center = center         # region anchoring mode (e.g. 'midpoint', 'bothends')
        self.extend = extend         # bp extension on each side of the anchor
        self.rs = rs                 # read extension size
        self.bs = bs                 # bin size
        self.ss = ss                 # step size
        self.df = df                 # True -> plot difference between two replicates
        self.dft = dft               # tag field used for the difference pairing
        self.sense = sense           # True -> strand (sense/anti-sense) specific coverage

    def relocate_bed(self):
        """Re-anchor every region set around `self.center`, extended by `self.extend`.

        Fills `self.processed_beds`; for orientation-dependent modes also fills
        `self.processed_bedsF` with the regions whose coverage must be flipped.
        """
        self.processed_beds = []
        self.processed_bedsF = []  # Processed beds to be flapped
        for bed in self.beds:
            if self.center == 'bothends':
                # One window on each end; the right-end window is flipped later.
                newbed = bed.relocate_regions(center='leftend',
                                              left_length=self.extend + self.bs,
                                              right_length=self.extend + self.bs)
                self.processed_beds.append(newbed)
                newbedF = bed.relocate_regions(center='rightend',
                                               left_length=self.extend + self.bs,
                                               right_length=self.extend + self.bs)
                self.processed_bedsF.append(newbedF)
            elif self.center == 'upstream' or self.center == 'downstream':
                # Split by strand: minus-strand regions go to the flip list.
                allbed = bed.relocate_regions(center=self.center,
                                              left_length=self.extend + self.bs,
                                              right_length=self.extend + self.bs)
                newbed = allbed.filter_strand(strand="+")
                self.processed_beds.append(newbed)
                newbedF = allbed.filter_strand(strand="-")
                self.processed_bedsF.append(newbedF)
            else:
                # Extra half-bin + 2 steps of margin; trimmed off again in coverage().
                newbed = bed.relocate_regions(
                    center=self.center,
                    left_length=self.extend + int(0.5 * self.bs) + 2 * self.ss,
                    right_length=self.extend + int(0.5 * self.bs) + 2 * self.ss)
                self.processed_beds.append(newbed)

    def group_tags(self, groupby, sortby, colorby):
        """Generate the tags for the grouping of plot

        Parameters:
            groupby = 'reads','regions','cell',or 'factor'
            colorby = 'reads','regions','cell',or 'factor'
            sortby  = 'reads','regions','cell',or 'factor'
        """
        self.tag_type = [sortby, groupby, colorby, self.dft]
        if "None" in self.tag_type:
            self.tag_type.remove("None")
        if groupby == "None":
            self.group_tags = [""]
        elif groupby == "regions" and self.annotation:
            self.group_tags = self.bednames
        else:
            self.group_tags = gen_tags(self.exps, groupby)
        if sortby == "None":
            self.sort_tags = [""]
        elif sortby == "regions" and self.annotation:
            self.sort_tags = self.bednames
        else:
            self.sort_tags = gen_tags(self.exps, sortby)
        if colorby == "None":
            self.color_tags = [""]
        elif colorby == "regions" and self.annotation:
            self.color_tags = self.bednames
        else:
            self.color_tags = gen_tags(self.exps, colorby)
        print("\tColumn labels:\t" + ",".join(self.group_tags))
        print("\tRow labels:\t" + ",".join(self.sort_tags))
        print("\tColor labels:\t" + ",".join(self.color_tags))

    def gen_cues(self):
        """Map each region/reads name to the set of matrix tags attached to it.

        These cue sets decide, in coverage(), which (sort, group, color, df)
        cell a given bed/bam pair contributes to.
        """
        self.cuebed = OrderedDict()
        self.cuebam = OrderedDict()
        for bed in self.bednames:
            self.cuebed[bed] = set(tag_from_r(self.exps, self.tag_type, bed))
            try:
                self.cuebed[bed].remove("None")
            except:
                pass
        for bam in self.readsnames:
            self.cuebam[bam] = set(tag_from_r(self.exps, self.tag_type, bam))

    def coverage(self, sortby, heatmap=False, logt=False, mp=0, log=False):
        """Compute coverage for every matching (bed, bam) pair.

        Result is stored in self.data[sort][group][color][df-tag]:
        a 2-D array per region for heatmaps, or a dict of averaged profiles
        ("all", and "sense_1"/"sense_2" in strand-specific mode) for line plots.
        mp > 0 switches to multiprocessing via compute_coverage.
        """
        def annot_ind(bednames, tags):
            """Find the index for annotation tag"""
            for ind, a in enumerate(bednames):
                if a in tags:
                    return ind

        if mp > 0:
            ts = time.time()
        normRPM = False  # RPM normalisation disabled here by design
        # Calculate for coverage
        mp_input = []
        data = OrderedDict()
        bi = 0
        for s in self.sort_tags:
            data[s] = OrderedDict()
            for g in self.group_tags:
                data[s][g] = OrderedDict()
                for c in self.color_tags:
                    data[s][g][c] = OrderedDict()
                    if not self.dft:
                        dfs = [c]
                    else:
                        dfs = self.exps.fieldsDict[self.dft].keys()
                    for d in dfs:
                        data[s][g][c][d] = defaultdict(list)
                        for bed in self.cuebed.keys():
                            # A bed belongs to this cell when its tags are a
                            # subset of {s,g,c,d} (or at least 3 tags overlap).
                            if len(self.cuebed[bed].intersection(set([s, g, c, d]))) > 2 or \
                                    self.cuebed[bed].issubset(set([s, g, c, d])):
                                for bam in self.cuebam.keys():
                                    if self.cuebam[bam] <= set([s, g, c, d]):
                                        i = self.bednames.index(bed)
                                        j = self.readsnames.index(bam)
                                        if mp > 0:  # Multiple processing
                                            mp_input.append([self.processed_beds[i], self.reads[j],
                                                             self.rs, self.bs, self.ss, self.center,
                                                             heatmap, logt, s, g, c, d])
                                            data[s][g][c][d] = None
                                        else:  # Single thread
                                            ts = time.time()
                                            cov = CoverageSet(bed + "." + bam, self.processed_beds[i])
                                            if "Conservation" in [s, g, c, d]:
                                                cov.phastCons46way_score(stepsize=self.ss)
                                            elif ".bigwig" in self.reads[j].lower() or ".bw" in self.reads[j].lower():
                                                cov.coverage_from_bigwig(bigwig_file=self.reads[j],
                                                                         stepsize=self.ss)
                                            else:
                                                if not self.sense:
                                                    cov.coverage_from_bam(bam_file=self.reads[j],
                                                                          extension_size=self.rs,
                                                                          binsize=self.bs,
                                                                          stepsize=self.ss)
                                                    if normRPM:
                                                        cov.normRPM()
                                                else:  # Sense specific
                                                    cov.coverage_from_bam(bam_file=self.reads[j],
                                                                          extension_size=self.rs,
                                                                          binsize=self.bs,
                                                                          stepsize=self.ss,
                                                                          get_sense_info=True,
                                                                          paired_reads=True)
                                                    cov.array_transpose()
                                                    if normRPM:
                                                        cov.normRPM()
                                            # When bothends, consider the fliping end
                                            if self.center == 'bothends' or self.center == 'upstream' or self.center == 'downstream':
                                                if "Conservation" in [s, g, c, d]:
                                                    flap = CoverageSet("for flap", self.processed_bedsF[i])
                                                    flap.phastCons46way_score(stepsize=self.ss)
                                                    ffcoverage = numpy.fliplr(flap.coverage)
                                                    cov.coverage = numpy.concatenate((cov.coverage, ffcoverage), axis=0)
                                                elif ".bigwig" in self.reads[j].lower() or ".bw" in self.reads[j].lower():
                                                    flap = CoverageSet("for flap", self.processed_bedsF[i])
                                                    flap.coverage_from_bigwig(bigwig_file=self.reads[j],
                                                                              stepsize=self.ss)
                                                    ffcoverage = numpy.fliplr(flap.coverage)
                                                    cov.coverage = numpy.concatenate((cov.coverage, ffcoverage), axis=0)
                                                else:
                                                    flap = CoverageSet("for flap", self.processed_bedsF[i])
                                                    if not self.sense:
                                                        flap.coverage_from_bam(self.reads[j],
                                                                               extension_size=self.rs,
                                                                               binsize=self.bs,
                                                                               stepsize=self.ss)
                                                        if normRPM:
                                                            flap.normRPM()
                                                    else:  # Sense specific
                                                        flap.coverage_from_bam(bam_file=self.reads[j],
                                                                               extension_size=self.rs,
                                                                               binsize=self.bs,
                                                                               stepsize=self.ss,
                                                                               get_sense_info=True,
                                                                               paired_reads=True)
                                                        flap.array_transpose(flip=True)
                                                        if normRPM:
                                                            flap.normRPM()
                                                    ffcoverage = numpy.fliplr(flap.coverage)
                                                    try:
                                                        cov.coverage = numpy.concatenate((cov.coverage, ffcoverage), axis=0)
                                                    except:
                                                        pass
                                                    if self.sense:
                                                        cov.transpose_cov1 = numpy.concatenate((cov.transpose_cov1,
                                                                                                flap.transpose_cov1), axis=0)
                                                        cov.transpose_cov2 = numpy.concatenate((cov.transpose_cov2,
                                                                                                flap.transpose_cov2), axis=0)
                                            # Averaging the coverage of all regions of each bed file
                                            if heatmap:
                                                if logt:
                                                    data[s][g][c][d] = numpy.log10(numpy.vstack(cov.coverage) + 1)  # Store the array into data list
                                                else:
                                                    data[s][g][c][d] = numpy.vstack(cov.coverage)  # Store the array into data list
                                            else:
                                                if len(cov.coverage) == 0:
                                                    data[s][g][c][d] = None
                                                    print("** Warning: Cannot open " + self.reads[j])
                                                    continue
                                                else:
                                                    for i, car in enumerate(cov.coverage):
                                                        if i == 0:
                                                            avearr = np.array(car, ndmin=2)
                                                        else:
                                                            try:
                                                                avearr = numpy.vstack((avearr, np.array(car, ndmin=2)))
                                                            except:
                                                                # ragged row: report which region failed
                                                                print(bed + "." + bam + "." + str(i))
                                                    if log:
                                                        avearr = numpy.log2(avearr + 1)
                                                    avearr = numpy.average(avearr, axis=0)
                                                    if self.sense:
                                                        if log:
                                                            sense_1 = numpy.average(numpy.log2(cov.transpose_cov1 + 1), axis=0)
                                                            sense_2 = numpy.average(numpy.log2(cov.transpose_cov2 + 1), axis=0)
                                                        else:
                                                            sense_1 = numpy.average(cov.transpose_cov1, axis=0)
                                                            sense_2 = numpy.average(cov.transpose_cov2, axis=0)
                                                    # Trim the half-bin margin added in relocate_bed()
                                                    cut_end = int(self.bs / self.ss)
                                                    avearr = avearr[cut_end:-cut_end]
                                                    data[s][g][c][d]["all"].append(avearr)
                                                    if self.sense:
                                                        sense_1 = sense_1[cut_end:-cut_end]
                                                        sense_2 = sense_2[cut_end:-cut_end]
                                                        data[s][g][c][d]["sense_1"].append(sense_1)
                                                        data[s][g][c][d]["sense_2"].append(sense_2)
                                                    bi += 1
                                                    te = time.time()
                                                    # NOTE(review): ts - te is negative (start minus end) — looks
                                                    # like the operands are swapped; confirm intended log format.
                                                    print2(self.parameter,
                                                           "\t" + str(bi) + "\t" +
                                                           "{0:30}\t--{1:<5.1f}s".format(bed + "." + bam, ts - te))
        if mp > 0:
            # Fan the collected jobs out to a worker pool, then slot results
            # back into the matching (s, g, c, d) cells.
            pool = MyPool(mp)
            mp_output = pool.map(compute_coverage, mp_input)
            pool.close()
            pool.join()
            for s in data.keys():
                for g in data[s].keys():
                    for c in data[s][g].keys():
                        for d in data[s][g][c].keys():
                            for out in mp_output:
                                if out[0] == s and out[1] == g and out[2] == c and out[3] == d:
                                    if self.df:
                                        try:
                                            data[s][g][c][d][-1].append(out[4])
                                        except:
                                            data[s][g][c][d] = [[out[4]]]
                                    else:
                                        try:
                                            data[s][g][c][d].append(out[4])
                                        except:
                                            data[s][g][c][d] = [out[4]]
        if self.df:
            # Difference mode: needs exactly two replicate profiles per cell.
            for s in data.keys():
                for g in data[s].keys():
                    for c in data[s][g].keys():
                        for d in data[s][g][c].keys():
                            if isinstance(data[s][g][c][d]["all"], list) and len(data[s][g][c][d]["all"]) > 1:
                                diff = numpy.subtract(data[s][g][c][d]["all"][0],
                                                      data[s][g][c][d]["all"][1])
                                data[s][g][c][d]["df"].append(diff.tolist())
                            else:
                                print("Warning: There is no repetitive reads for calculating difference.\n"
                                      "         Please add one more entry in experimental matrix.")
        self.data = data

    def colormap(self, colorby, definedinEM):
        """Assign one color per color tag (shadows the module-level colormap())."""
        colors = colormap(self.exps, colorby, definedinEM, annotation=self.annotation)
        self.colors = {}
        for i, c in enumerate(self.color_tags):
            self.colors[c] = colors[i]

    def plot(self, groupby, colorby, output, printtable=False, scol=False, srow=False, w=2, h=2):
        """Render the line-plot grid (rows = sort tags, columns = group tags).

        scol/srow share y-limits per column/row; printtable also dumps the
        plotted values via output_array(). Stores the figure in self.fig.
        """
        rot = 50
        if len(self.data.values()[0].keys()) < 2:
            ticklabelsize = w * 1.5
        else:
            ticklabelsize = w * 3
        tw = len(self.data.values()[0].keys()) * w
        th = len(self.data.keys()) * (h * 0.8)
        f, axs = plt.subplots(len(self.data.keys()),
                              len(self.data.values()[0].keys()),
                              dpi=300, figsize=(tw, th))
        yaxmax = [0] * len(self.data.values()[0])
        sx_ymax = [0] * len(self.data.keys())
        if self.df:
            yaxmin = [0] * len(self.data.values()[0])
            sx_ymin = [0] * len(self.data.keys())
        if printtable:
            bott = self.extend - int(0.5 * self.ss)
            pArr = [["Group_tag", "Sort_tag", "Color_tag"] +
                    [str(x) for x in range(-bott, bott, self.ss)]]  # Header
        nit = len(self.data.keys())
        for it, s in enumerate(self.data.keys()):
            for i, g in enumerate(self.data[s].keys()):
                # plt.subplots returns a scalar/1-D/2-D axes object depending
                # on the grid shape; fall back accordingly.
                try:
                    ax = axs[it, i]
                except:
                    if len(self.data.keys()) == 1 and len(self.data[s].keys()) == 1:
                        ax = axs
                    elif len(self.data.keys()) == 1 and len(self.data[s].keys()) > 1:
                        ax = axs[i]
                    else:
                        ax = axs[it]
                if it == 0:
                    if self.df:
                        ax.set_title(g + "_df", fontsize=ticklabelsize + 2)
                    else:
                        ax.set_title(g, fontsize=ticklabelsize + 2)
                # Processing for future output
                for j, c in enumerate(self.data[s][g].keys()):
                    for k, d in enumerate(self.data[s][g][c].keys()):
                        if not self.data[s][g][c][d]:
                            continue
                        else:
                            if not self.sense:
                                if self.df:
                                    pt = self.data[s][g][c][d]["df"]
                                else:
                                    pt = self.data[s][g][c][d]["all"]
                                for l, y in enumerate(pt):
                                    yaxmax[i] = max(numpy.amax(y), yaxmax[i])
                                    sx_ymax[it] = max(numpy.amax(y), sx_ymax[it])
                                    if self.df:
                                        yaxmin[i] = min(numpy.amin(y), yaxmin[i])
                                        sx_ymin[it] = min(numpy.amin(y), sx_ymin[it])
                                    x = numpy.linspace(-self.extend, self.extend, len(y))
                                    ax.plot(x, y, color=self.colors[c], lw=1, label=c)
                                    if it < nit - 1:
                                        ax.set_xticklabels([])
                                    # Processing for future output
                                    if printtable:
                                        pArr.append([g, s, c, d] + list(y))
                            else:
                                # Strand-specific: sense above the axis,
                                # anti-sense mirrored below it.
                                plt.text(0.5, 0.51, 'sense', transform=ax.transAxes,
                                         fontsize=ticklabelsize,
                                         horizontalalignment='center',
                                         verticalalignment='bottom')
                                plt.text(0.5, 0.49, 'anti-sense', transform=ax.transAxes,
                                         fontsize=ticklabelsize,
                                         horizontalalignment='center',
                                         verticalalignment='top')
                                plt.plot((-self.extend, self.extend), (0, 0), '0.1', linewidth=0.2)
                                print(self.data[s][g][c][d])
                                for l, y in enumerate(self.data[s][g][c][d]["sense_1"]):
                                    ymax1 = numpy.amax(y)
                                    yaxmax[i] = max(ymax1, yaxmax[i])
                                    sx_ymax[it] = max(ymax1, sx_ymax[it])
                                    x = numpy.linspace(-self.extend, self.extend, y.shape[0])
                                    ax.plot(x, y, color=self.colors[c], lw=1, label=c)
                                    if it < nit - 1:
                                        ax.set_xticklabels([])
                                    # Processing for future output
                                    if printtable:
                                        pArr.append([g, s, c, d, "+"] + list(y))
                                for l, y in enumerate(self.data[s][g][c][d]["sense_2"]):
                                    ymax2 = numpy.amax(y)
                                    yaxmax[i] = max(ymax2, yaxmax[i])
                                    sx_ymax[it] = max(ymax2, sx_ymax[it])
                                    x = numpy.linspace(-self.extend, self.extend, y.shape[0])
                                    ax.plot(x, -y, color=self.colors[c], lw=1, label=c)
                                    if it < nit - 1:
                                        ax.set_xticklabels([])
                                    # Processing for future output
                                    if printtable:
                                        pArr.append([g, s, c, d, "-"] + list(y))
                                ym = 1.2 * max(max(yaxmax), max(sx_ymax))
                                ax.set_ylim([-ym, ym])
                ax.get_yaxis().set_label_coords(-0.1, 0.5)
                ax.set_xlim([-self.extend, self.extend])
                plt.setp(ax.get_xticklabels(), fontsize=ticklabelsize, rotation=rot)
                plt.setp(ax.get_yticklabels(), fontsize=ticklabelsize)
                ax.locator_params(axis='x', nbins=4)
                ax.locator_params(axis='y', nbins=2)
        if printtable:
            output_array(pArr, directory=output, folder=self.title, filename="plot_table.txt")
        # Second pass: row labels and shared y-limits.
        for it, ty in enumerate(self.data.keys()):
            try:
                axs[it, 0].set_ylabel("{}".format(ty), fontsize=ticklabelsize + 1)
            except:
                try:
                    axs[it].set_ylabel("{}".format(ty), fontsize=ticklabelsize + 1)
                except:
                    axs.set_ylabel("{}".format(ty), fontsize=ticklabelsize + 1)
            for i, g in enumerate(self.data[ty].keys()):
                try:
                    axx = axs[it, i]
                except:
                    try:
                        if len(self.data.keys()) == 1:
                            axx = axs[i]
                        else:
                            axx = axs[it]
                    except:
                        axx = axs
                if self.df:
                    # NOTE(review): with df and neither scol nor srow set,
                    # ymin/ymax are unbound here and set_ylim is skipped via
                    # the except below — confirm this is intended.
                    if scol:
                        ymin = yaxmin[i] - abs(yaxmin[i] * 0.2)
                        ymax = yaxmax[i] + abs(yaxmax[i] * 0.2)
                    elif srow:
                        ymin = sx_ymin[it] - abs(sx_ymin[it] * 0.2)
                        ymax = sx_ymax[it] + abs(sx_ymax[it] * 0.2)
                else:
                    if scol:
                        ymax = yaxmax[i] * 1.2
                    elif srow:
                        ymax = sx_ymax[it] * 1.2
                    else:
                        ymax = axx.get_ylim()[1]
                    if self.sense:
                        ymin = -ymax
                    else:
                        ymin = 0
                try:
                    axx.set_ylim([ymin, ymax])
                except:
                    pass
        handles, labels = ax.get_legend_handles_labels()
        uniq_labels = unique(labels)
        plt.legend([handles[labels.index(l)] for l in uniq_labels], uniq_labels,
                   loc='center left', handlelength=1, handletextpad=1,
                   columnspacing=2, borderaxespad=0.,
                   prop={'size': ticklabelsize}, bbox_to_anchor=(1.05, 0.5))
        f.tight_layout()
        self.fig = f

    def gen_html(self, directory, title, align=50):
        """Write the line-plot HTML report (index.html + parameters.html)."""
        dir_name = os.path.basename(directory)
        html_header = dir_name + " / " + title
        link_d = OrderedDict()
        link_d["Lineplot"] = "index.html"
        link_d["Parameters"] = "parameters.html"
        html = Html(name=html_header, links_dict=link_d, fig_rpath="../style",
                    RGT_header=False, other_logo="viz", homepage="../index.html")
        html.add_figure("lineplot.png", align="center", width="80%")
        html.write(os.path.join(directory, title, "index.html"))

        ## Parameters
        html = Html(name=html_header, links_dict=link_d, fig_rpath="../style",
                    RGT_header=False, other_logo="viz", homepage="../index.html")
        type_list = 'ssssssssss'
        col_size_list = [20, 20, 20, 20, 20, 20, 20, 20, 20]
        header_list = ["Assumptions and hypothesis"]
        data_table = []
        if self.annotation:
            data_table.append([
                "Genomic annotation: TSS - Transcription Start Site; TTS - Transcription Termination Site."
            ])
        data_table.append(["Directory: " + directory.rpartition("/")[2]])
        data_table.append(["Title: " + title])
        data_table.append(["Extend length: " + str(self.extend)])
        data_table.append(["Read size: " + str(self.rs)])
        data_table.append(["Bin size: " + str(self.bs)])
        data_table.append(["Step size: " + str(self.ss)])
        data_table.append(["Center mode: " + self.center])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                             align=align, cell_align="left")
        html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See parameters</a>'])
        html.add_free_content(['<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'])
        html.write(os.path.join(directory, title, "parameters.html"))

    def hmsort(self, sort):
        """Sort heatmap rows: None = no sort, 0 = by total signal across all
        color columns, k > 0 = by the k-th color column's signal."""
        if sort == None:
            pass
        elif sort == 0:
            for t in self.data.keys():
                for i, g in enumerate(self.data[t].keys()):
                    sumarr = numpy.sum([numpy.sum(d, axis=1)
                                        for d in self.data[t][g].values()], axis=0)
                    ind = stats.rankdata(sumarr, method='ordinal')  # The index for further sorting
                    for j, c in enumerate(self.data[t][g].keys()):
                        d = numpy.empty(shape=(self.data[t][g][c].shape))
                        for k, ranki in enumerate(ind):
                            # negative index -> descending order
                            d[-ranki, :] = self.data[t][g][c][k, :]
                        self.data[t][g][c] = d
        else:
            for t in self.data.keys():
                for i, g in enumerate(self.data[t].keys()):
                    sumarr = numpy.sum(self.data[t][g].values()[sort - 1], axis=1)
                    ind = stats.rankdata(sumarr, method='ordinal')  # The index for further sorting
                    for j, c in enumerate(self.data[t][g].keys()):
                        d = numpy.empty(shape=(self.data[t][g][c].shape))
                        for k, ranki in enumerate(ind):
                            d[-ranki, :] = self.data[t][g][c][k, :]
                        self.data[t][g][c] = d

    def hmcmlist(self, colorby, definedinEM):
        """Fixed palette of colormap names for heatmap columns.
        NOTE(review): colorby/definedinEM are currently ignored."""
        self.colors = ["Reds", "Blues", "Oranges", "Greens", "Purples"]

    def heatmap(self, logt):
        """Draw one heatmap figure per sort tag; rows = groups, columns = colors.
        Appends figures to self.figs and file stems to self.hmfiles."""
        tickfontsize = 6
        ratio = 10  # subplot rows per heatmap panel (leaves 1 row for colorbars)
        self.hmfiles = []
        self.figs = []
        for ti, t in enumerate(self.data.keys()):
            columns = len(self.data[t].values()[0].keys())
            fig = plt.figure(t)
            plt.suptitle("Heatmap: " + t, y=1.05)
            rows = len(self.data[t].keys())
            axs = numpy.empty(shape=(rows + 1, columns), dtype=object)
            for bi, g in enumerate(self.data[t].keys()):
                for bj, c in enumerate(self.data[t][g].keys()):
                    max_value = numpy.amax(self.data[t][g][c])
                    max_value = int(max_value)
                    axs[bi, bj] = plt.subplot2grid(shape=(rows * ratio + 1, columns),
                                                   loc=(bi * ratio, bj), rowspan=ratio)
                    if bi == 0:
                        axs[bi, bj].set_title(c, fontsize=7)
                    im = axs[bi, bj].imshow(self.data[t][g][c],
                                            extent=[-self.extend, self.extend, 0, 1],
                                            aspect='auto', vmin=0, vmax=max_value,
                                            interpolation='nearest',
                                            cmap=plt.get_cmap("Blues"))
                    axs[bi, bj].set_xlim([-self.extend, self.extend])
                    axs[bi, bj].set_xticks([-self.extend, 0, self.extend])
                    plt.setp(axs[bi, bj].get_xticklabels(), fontsize=tickfontsize, rotation=0)
                    for spine in ['top', 'right', 'left', 'bottom']:
                        axs[bi, bj].spines[spine].set_visible(False)
                    axs[bi, bj].tick_params(axis='x', which='both',
                                            bottom='off', top='off', labelbottom='on')
                    axs[bi, bj].tick_params(axis='y', which='both',
                                            left='off', right='off', labelleft='off')
                    axs[bi, bj].minorticks_off()
                    if bj == 0:
                        axs[bi, bj].set_ylabel(g, fontsize=7)
                    if bi == rows - 1:
                        # Bottom row: attach a horizontal colorbar under the column.
                        cbar_ax = plt.subplot2grid((rows * ratio + 4, columns),
                                                   (rows * ratio + 3, bj))
                        max_value = int(max_value)
                        cbar = plt.colorbar(im, cax=cbar_ax, ticks=[0, max_value],
                                            orientation='horizontal')
                        cbar.ax.set_xticklabels([0, int(max_value)])
                        if logt:
                            cbar.ax.set_xticklabels(['0', '{:1.1f}'.format(max_value)],
                                                    fontsize=tickfontsize)  # horizontal colorbar
                            cbar.set_label('log10', fontsize=tickfontsize)
            self.figs.append(fig)
            self.hmfiles.append("heatmap" + "_" + t)

    def gen_htmlhm(self, outputname, title, align=50):
        """Write the heatmap HTML report (index.html + parameters.html)."""
        dir_name = os.path.basename(outputname)
        html_header = title
        link_d = OrderedDict()
        link_d["Lineplot"] = "index.html"
        link_d["Parameters"] = "parameters.html"
        html = Html(name=html_header, links_dict=link_d, fig_rpath="../style",
                    RGT_header=False, other_logo="viz", homepage="../index.html")
        # Each row is a plot with its data
        for name in self.hmfiles:
            html.add_figure(name + ".png", align="center")
        html.write(os.path.join(outputname, title, "index.html"))

        ## Parameters
        html = Html(name=html_header, links_dict=link_d, fig_rpath="../style",
                    RGT_header=False, other_logo="viz", homepage="../index.html")
        html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See parameters</a>'])
        html.add_free_content(['<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'])
        html.write(os.path.join(outputname, title, "parameters.html"))
if __name__ == '__main__':
    # Command-line entry point: read an experimental matrix (bedgraph mode)
    # and dispatch to one of the mode_* analyses.
    parser = HelpfulOptionParser(usage=__doc__)
    parser.add_option("--mode", "-m", dest="mode", default=1,
                      help="choose mode", type="int")
    (options, args) = parser.parse_args()

    i = 2
    if len(args) != i:
        parser.error("Exactly %s parameters are needed" % i)

    path_exp_matrix = args[0]
    path_annotation = args[1]

    # Derived annotation files (kept for parity with the other entry points;
    # not every mode uses them).
    genome_file = os.path.join(path_annotation, "chrom.sizes")
    gene_file = os.path.join(path_annotation, "association_file.bed")

    exp_matrix = ExperimentalMatrix()
    exp_matrix.read(path_exp_matrix, is_bedgraph=True)

    # BUG FIX: the original compared with "options.mode is 1" etc.  Identity
    # comparison against an int literal only works by accident of CPython's
    # small-integer caching and is a SyntaxWarning on Python >= 3.8; the
    # correct operator for value comparison is "==".
    if options.mode == 1:
        mode_1(exp_matrix)
    elif options.mode == 2:
        mode_2(exp_matrix)
    elif options.mode == 3:
        mode_3(exp_matrix)
if len(args) != 4: parser.error("Exactly three parameters are needed: experimental matrix, gene expression, annotation path and prefix for output") #map arguments experimental_matrix_file = args[0] gene_exp = args[1] annotation_path = args[2] outputdir = args[3] # experimental_matrix_file = "/home/manuel/workspace/cluster_p/THOR/exp/exp23_macs2_payton/1" # gene_exp = "/home/manuel/workspace/cluster_p/allhoff/project_THOR/data/payton/gene_expression/CCmean.data" # annotation_path = "/home/manuel/workspace/cluster_h/rgtdata/hg19/" # outputdir = "/home/manuel/test/" exps = ExperimentalMatrix() exps.read(experimental_matrix_file) regionsets = exps.get_regionsets() genome_file = annotation_path + "/chrom.sizes" gene_file = annotation_path + "/association_file.bed" genes = GeneSet("Expression") genes.read_expression(gene_exp) for region in regionsets: bedNew = GenomicRegionSet("") [degenes, de_peak_genes, mappedGenes, totalPeaks, regionsToGenes] \ = bedNew.filter_by_gene_association_old(region.fileName, genes.genes, gene_file, genome_file, threshDist=options.dist) [ct, labels] = averageExpression(region, genes, regionsToGenes)
backGroundPeaksName = sys.argv[6] backBed = GenomicRegionSet("BACK") backBed.read_bed(backGroundPeaksName) backGroundPeaks = True distance = 50000 if len(sys.argv) > 6: distance = len(sys.argv[6]) if len(sys.argv) > 7: outdir = sys.argv[7] #genomeFile=anotationPath+"chrom.sizes" #geneFile=anotationPath+"association_file.bed" exps = ExperimentalMatrix() exps.read(designFile) beds = [] geneLists = [] #this should be improved bedGenes = GenomicRegionSet(geneFile) bedGenes.read_bed(geneFile) allgenes = [] for r in bedGenes: allgenes.append(r.name) allgenes = list(set(allgenes)) genesets = exps.get_genesets()
resprop2 = res.transpose() / numpy.array(totalLine) resprop2 = resprop2.transpose() return namesCol, namesLine, res, resprop1, resprop2 def printTable(namesCol, namesLines, table, fileName): f = open(fileName, "w") f.write("\t" + ("\t".join(namesCol)) + "\n") for i, line in enumerate(table): f.write(namesLines[i] + "\t" + ("\t".join([str(j) for j in line])) + "\n") out = "" experimentalFile = sys.argv[1] exps = ExperimentalMatrix() exps.read(experimentalFile) beds = exps.get_regionsets() beds2 = beds outputDir = sys.argv[2] if len(sys.argv) > 3: experimentalFile2 = sys.argv[3] exps2 = ExperimentalMatrix() exps2.read(experimentalFile2) beds2 = exps2.get_regionsets() out = outputDir [namesCol, namesLine, res, resprop1, resprop2] = bedOverllap(beds, beds2, outPath=out) printTable(namesCol, namesLine, res, outputDir + "/count.table")
class Projection:
    """Projection test between reference and query genomic region sets.

    Loads two experimental matrices (reference and query), groups their
    region sets by a tag, runs the projection test for every
    reference/query pair, and renders the results as bar plots, tables
    and HTML reports.
    """

    def __init__(self, reference_path, query_path):
        """Read the reference and query experimental matrices.

        reference_path -- path to the reference experimental matrix file
        query_path     -- path to the query experimental matrix file
        """
        # Reference
        self.rEM = ExperimentalMatrix()
        self.rEM.read(reference_path)
        self.rEM.remove_empty_regionset()
        self.references = self.rEM.get_regionsets()
        self.referencenames = self.rEM.get_regionsnames()
        # Query
        self.qEM = ExperimentalMatrix()
        self.qEM.read(query_path)
        self.qEM.remove_empty_regionset()
        self.query = self.qEM.get_regionsets()
        self.querynames = self.qEM.get_regionsnames()
        self.parameter = []
        self.background = None

    def group_refque(self, groupby=False):
        """Group reference and query region sets by the given tag."""
        self.groupedreference, self.groupedquery = group_refque(self.rEM, self.qEM, groupby)

    def colors(self, colorby, definedinEM):
        """Assign a color per grouped query region set; background is fixed grey."""
        self.color_list = color_groupded_region(self.qEM, self.groupedquery, colorby, definedinEM)
        self.color_list['Background'] = '0.70'

    def ref_union(self):
        """Build, per group, the union of all references as the background,
        then trim both references and queries down to that background."""
        self.background = OrderedDict()
        for ty in self.groupedreference.keys():
            self.background[ty] = GenomicRegionSet("union of references")
            for r in self.groupedreference[ty]:
                self.background[ty].combine(r)
            self.background[ty].merge()
        for ty in self.groupedreference.keys():
            rlist = [r.trim_by(background=self.background[ty]) for r in self.groupedreference[ty]]
            self.groupedreference[ty] = rlist
            qlist = [q.trim_by(background=self.background[ty]) for q in self.groupedquery[ty]]
            self.groupedquery[ty] = qlist

    def set_background(self, bed_path):
        """Use the BED file at bed_path as the background for every group
        and trim references and queries to it."""
        bg = GenomicRegionSet("background")
        bg.read_bed(bed_path)
        self.background = OrderedDict()
        for ty in self.groupedreference.keys():
            self.background[ty] = bg
            rlist = [r.trim_by(background=bg) for r in self.groupedreference[ty]]
            self.groupedreference[ty] = rlist
            qlist = [q.trim_by(background=bg) for q in self.groupedquery[ty]]
            self.groupedquery[ty] = qlist

    def projection_test(self, organism):
        """Run the projection test for every reference/query pair.

        Fills, keyed [group][reference name][query name]:
          self.bglist      -- background proportion
          self.qlist       -- query proportion
          self.plist       -- p-value (multiple-test corrected in place)
          self.interq_list -- intersected query regions
        and self.lenlist with the region-set sizes. Pairs where the
        reference and query share the same name are skipped.
        """
        self.bglist = OrderedDict()
        self.qlist = OrderedDict()
        self.plist = OrderedDict()
        self.interq_list = OrderedDict()
        self.lenlist = {}
        for ty in self.groupedquery.keys():
            self.bglist[ty] = OrderedDict()
            self.qlist[ty] = OrderedDict()
            self.plist[ty] = OrderedDict()
            self.interq_list[ty] = OrderedDict()
            if self.background:
                bgset = self.background[ty]
            else:
                bgset = None
            for i, r in enumerate(self.groupedreference[ty]):
                self.bglist[ty][r.name] = OrderedDict()
                self.qlist[ty][r.name] = OrderedDict()
                self.plist[ty][r.name] = OrderedDict()
                self.interq_list[ty][r.name] = OrderedDict()
                self.lenlist[r.name] = len(r)
                for j, q in enumerate(self.groupedquery[ty]):
                    if r.name == q.name:
                        continue
                    else:
                        bg, ratio, p, interq = r.projection_test(q, organism, extra=True, background=bgset)
                        self.bglist[ty][r.name][q.name] = bg
                        self.qlist[ty][r.name][q.name] = ratio
                        self.plist[ty][r.name][q.name] = p
                        self.interq_list[ty][r.name][q.name] = interq
                        self.lenlist[q.name] = len(q)
        # Multiple test correction (modifies self.plist in place).
        multiple_correction(self.plist)
        # Record the background proportion once per reference (the inner
        # query loop is redundant but kept so that the 'Background' key is
        # only added when at least one query was tested for that reference).
        for ty in self.groupedquery.keys():
            for i, r in enumerate(self.groupedreference[ty]):
                for j, q in enumerate(self.groupedquery[ty]):
                    # list(...) so this works on both py2 and py3 dict views
                    self.qlist[ty][r.name]['Background'] = list(self.bglist[ty][r.name].values())[0]

    def output_interq(self, directory):
        """Output the intersected query to the reference in BED format."""
        # Create the parent directory and the target directory if missing.
        try:
            os.stat(os.path.dirname(directory))
        except OSError:
            os.mkdir(os.path.dirname(directory))
        try:
            os.stat(directory)
        except OSError:
            os.mkdir(directory)
        for ty in self.interq_list.keys():
            if ty:
                g = ty + "_"
            else:
                g = ""
            for r in self.interq_list[ty].keys():
                for q in self.interq_list[ty][r].keys():
                    self.interq_list[ty][r][q].write_bed(
                        os.path.join(directory, g + q + "_intersected_" + r + ".bed"))

    def plot(self, logt=None, pw=3, ph=3):
        """Draw one bar subplot per group showing the proportion of each
        query intersected with each reference; stores the figure in self.fig.

        logt -- if truthy, use a log-scaled y axis (zero bars drawn as 1e-6)
        pw, ph -- width and per-group height of the figure in inches
        """
        tw = pw
        th = len(self.qlist.keys()) * ph
        f, ax = plt.subplots(len(self.qlist.keys()), 1, dpi=300, figsize=(tw, th))
        # A single subplot comes back as a bare Axes with no .reshape.
        try:
            ax = ax.reshape(-1)
        except AttributeError:
            ax = [ax]
        for ind_ty, ty in enumerate(self.qlist.keys()):
            r_label = []
            for ind_r, r in enumerate(self.qlist[ty].keys()):
                r_label.append(r)
                width = 0.8 / (len(self.qlist[ty][r].keys()) + 1)  # Plus one background
                for ind_q, q in enumerate(self.qlist[ty][r].keys()):
                    x = ind_r + ind_q * width + 0.1
                    y = self.qlist[ty][r][q]
                    if y == 0 and logt:
                        y = 0.000001  # log scale cannot draw a zero-height bar
                    ax[ind_ty].bar(x, y, width=width, color=self.color_list[q],
                                   edgecolor="none", align='edge', log=logt, label=q)
            if logt:
                ax[ind_ty].set_yscale('log')
            else:
                ax[ind_ty].locator_params(axis='y', nbins=2)
            ax[ind_ty].set_title(ty)
            ax[ind_ty].yaxis.tick_left()
            ax[ind_ty].set_ylabel('Percentage of intersected regions', fontsize=8)
            # NOTE: width retains its value from the last reference iterated.
            ax[ind_ty].set_xticks([i + 0.5 - 0.5 * width for i in range(len(r_label))])
            ax[ind_ty].set_xticklabels(r_label, rotation=30, ha="right", fontsize=8)
            ax[ind_ty].tick_params(axis='x', which='both', top='off', bottom='off', labelbottom='on')
            handles, labels = ax[ind_ty].get_legend_handles_labels()
            uniq_labels = [q.name for q in self.groupedquery[ty]] + ["Background"]
            ax[ind_ty].legend([handles[labels.index(l)] for l in uniq_labels], uniq_labels,
                              loc='center left', handlelength=1, handletextpad=1,
                              columnspacing=2, borderaxespad=0., prop={'size': 10},
                              bbox_to_anchor=(1.05, 0.5))
            for spine in ['top', 'right']:
                ax[ind_ty].spines[spine].set_visible(False)
        f.tight_layout()
        self.fig = f

    def heatmap(self):
        """Unfinished heatmap of p-values; currently only sets up the axes.

        NOTE(review): the data list is never filled (the inner loop is a
        stub), so the resulting array is empty — kept as in the original.
        """
        f, ax = plt.subplots(1, len(self.plist.keys()))
        try:
            ax = ax.reshape(-1)
        except AttributeError:
            ax = [ax]
        for ind_ty, ty in enumerate(self.plist.keys()):
            data = []
            for ind_r, r in enumerate(self.plist[ty].keys()):
                for ind_q, q in enumerate(self.plist[ty][r].keys()):
                    pass  # TODO: collect p-values into data
            da = numpy.array(data)
            da = da.transpose()

    def gen_html(self, directory, title, args, align=50):
        """Write the projection-test HTML report and statistics tables.

        directory -- output root directory
        title     -- subfolder (experiment title) for the report pages
        args      -- parsed CLI arguments (uses r, q, o, t, organism, cfp)
        align     -- table alignment passed through to the HTML builder
        """
        dir_name = os.path.basename(directory)
        statistic_table = []
        html_header = "Projection Test: " + dir_name
        link_d = OrderedDict()
        link_d["Projection test"] = "index.html"
        link_d["Parameters"] = "parameters.html"
        html = Html(name=html_header, links_dict=link_d, fig_rpath="../style",
                    RGT_header=False, other_logo="viz", homepage="../index.html")
        html.add_figure("projection_test.png", align="center")
        header_list = ["No.", "Reference<br>name", "Query<br>name", "Reference<br>number",
                       "Query<br>number", "Proportion", "Background<br>proportion",
                       "Positive<br>association<br>p-value", "Negative<br>association<br>p-value"]
        statistic_table.append(["Reference_name", "Query_name", "Reference_number",
                                "Query_number", "Proportion", "Background_proportion",
                                "Positive_association_p-value", "Negative_association_p-value"])
        type_list = 'ssssssssssssssss'
        col_size_list = [5, 10, 10, 10, 10, 10, 10, 15, 15]
        nalist = []  # references with zero-length regions ("na" p-values)
        for ind_ty, ty in enumerate(self.plist.keys()):
            html.add_heading(ty, size=4, bold=False)
            data_table = []
            for ind_r, r in enumerate(self.plist[ty].keys()):
                rlen = str(self.lenlist[r])
                for ind_q, q in enumerate(self.plist[ty][r].keys()):
                    qlen = str(self.lenlist[q])
                    backv = value2str(self.qlist[ty][r]['Background'])
                    propor = value2str(self.qlist[ty][r][q])
                    pv = self.plist[ty][r][q]
                    if pv == "na":
                        nalist.append(r)
                        continue
                    elif self.qlist[ty][r][q] < args.cfp:
                        # Below the proportion cutoff: omit from the report.
                        continue
                    else:
                        pvn = 1 - pv
                        if self.plist[ty][r][q] < 0.05:
                            # Significant: highlight the relevant p-value in red.
                            if self.qlist[ty][r]['Background'] < self.qlist[ty][r][q]:
                                # Positive association: proportion > background.
                                data_table.append([str(ind_ty), r, q, rlen, qlen, propor, backv,
                                                   "<font color=\"red\">" + value2str(pv) + "</font>",
                                                   value2str(pvn)])
                                statistic_table.append([r, q, rlen, qlen, propor, backv,
                                                        value2str(pv), value2str(pvn)])
                            else:
                                # Negative association: proportion < background.
                                data_table.append([str(ind_ty), r, q, rlen, qlen, propor, backv,
                                                   value2str(pvn),
                                                   "<font color=\"red\">" + value2str(pv) + "</font>"])
                                statistic_table.append([r, q, rlen, qlen, propor, backv,
                                                        value2str(pvn), value2str(pv)])
                        else:
                            data_table.append(
                                [str(ind_ty), r, q, rlen, qlen, propor, backv,
                                 value2str(pv), value2str(pvn)])
                            statistic_table.append([r, q, rlen, qlen, propor, backv,
                                                    value2str(pv), value2str(pvn)])
            html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                                 align=align, sortable=True)
            output_array(statistic_table, directory=directory, folder=title,
                         filename="statistics" + ty + ".txt")
        header_list = ["Assumptions and hypothesis"]
        data_table = [['If the background proportion is too small, it may cause bias in p value.'],
                      ['For projection test, the reference GenomicRegionSet should have non-zero length in order to calculate its background proportion.'],
                      ['P values are corrected by multiple test correction.'],
                      ['Positive association is defined by: Proportion > Background.'],
                      ['Negative association is defined by: Proportion < Background.']]
        nalist = set(nalist)
        if len(nalist) > 0:
            data_table.append([
                'The following references contain zero-length region which cause error in proportion calculation, please check it:<br>' +
                ' <font color=\"red\">' + ', '.join([s for s in nalist]) + '</font></p>'])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                             align=align, cell_align="left")
        html.add_fixed_rank_sortable()
        html.write(os.path.join(directory, os.path.join(title, "index.html")))

        # Parameters page
        html = Html(name=html_header, links_dict=link_d, fig_rpath="../style",
                    RGT_header=False, other_logo="viz", homepage="../index.html")
        header_list = ["Description", "Argument", "Value"]
        data_table = [["Reference", "-r", args.r],
                      ["Query", "-q", args.q],
                      ["Output directory", "-o", os.path.basename(args.o)],
                      ["Experiment title", "-t", args.t],
                      ["Organism", "-organism", args.organism],
                      ["Cutoff of proportion", "-cfp", str(args.cfp)]]
        html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                             align=align, cell_align="left")
        html.add_free_content([
            '<a href="reference_experimental_matrix.txt" style="margin-left:100">See reference experimental matrix</a>'])
        html.add_free_content(
            ['<a href="query_experimental_matrix.txt" style="margin-left:100">See query experimental matrix</a>'])
        html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See details</a>'])
        html.write(os.path.join(directory, os.path.join(title, "parameters.html")))

    def table(self, directory, folder):
        """Dump reference/query/background/proportion/p-value rows to
        output_table.txt in the given directory/folder."""
        arr = numpy.array([["#reference", "query", "background", "proportion", "p-value"]])
        for ty in self.plist.keys():
            for r in self.plist[ty].keys():
                for q in self.plist[ty][r].keys():
                    ar = numpy.array(
                        [[r, q, self.qlist[ty][r]['Background'], self.qlist[ty][r][q], self.plist[ty][r][q]]])
                    arr = numpy.vstack((arr, ar))
        output_array(arr, directory, folder, filename="output_table.txt")

    def distribution(self, organism):
        """Compute the per-chromosome coverage fraction of every reference,
        query and the genome itself; stores the result in self.disperDict,
        keyed [group][set name] -> list aligned with self.chrom_list."""
        genome = GenomicRegionSet("genome")
        genome.get_genome_data(organism)
        all_cov = genome.total_coverage()
        self.chrom_list = []
        for ss in genome:
            self.chrom_list.append(ss.chrom)
        self.chrom_list.sort()
        self.disperDict = OrderedDict()
        for ty in self.groupedreference.keys():
            self.disperDict[ty] = OrderedDict()
            # Reference
            for r in self.groupedreference[ty]:
                r.merge()
                len_r = r.total_coverage()
                self.disperDict[ty][r.name] = []
                for ch in self.chrom_list:
                    rc = r.any_chrom(chrom=ch)
                    nr = sum([len(s) for s in rc])
                    self.disperDict[ty][r.name].append(nr / len_r)
            # Query
            for q in self.groupedquery[ty]:
                q.merge()
                len_q = q.total_coverage()
                self.disperDict[ty][q.name] = []
                for ch in self.chrom_list:
                    # BUGFIX: the original passed the builtin `chr` instead of
                    # the loop variable `ch`.
                    qc = q.any_chrom(chrom=ch)
                    nq = sum([len(s) for s in qc])
                    self.disperDict[ty][q.name].append(nq / len_q)
            # Genome
            self.disperDict[ty]["Genome"] = [len(genome.any_chrom(chrom=ch)[0]) / all_cov
                                             for ch in self.chrom_list]

    def plot_distribution(self):
        """Draw one horizontal-bar figure per group showing the chromosomal
        distribution computed by distribution(); appends figures to self.fig."""
        def to_percentage(x, pos=0):
            return '{:.2f} %'.format(100 * x)

        self.fig = []
        for ty in self.disperDict.keys():
            colors = plt.cm.Set1(numpy.linspace(0.1, 0.9, len(self.disperDict[ty].keys()))).tolist()
            f, ax = plt.subplots()
            f.set_size_inches(10.5, 30)
            width = 0.9 / len(self.disperDict[ty].keys())
            # Use numpy (imported at module level) rather than the undefined `np`.
            ind = numpy.arange(len(self.chrom_list))
            for ind_r, r in enumerate(self.disperDict[ty].keys()):
                ax.barh(ind + width * ind_r, self.disperDict[ty][r], width, color=colors[ind_r])
            plt.xlabel('Percentage')
            ax.xaxis.set_major_formatter(mtick.FuncFormatter(to_percentage))
            ax.minorticks_off()
            ax.set_yticks([x + 0.5 for x in range(len(self.chrom_list))])
            ax.set_yticklabels(self.chrom_list, rotation=0, ha="right")
            ax.tick_params(axis='y', which='both', top='off', bottom='off', labelbottom='on')
            ax.legend(self.disperDict[ty].keys(), loc='center left', handlelength=1,
                      handletextpad=1, columnspacing=2, borderaxespad=0.,
                      prop={'size': 10}, bbox_to_anchor=(1.05, 0.5))
            for spine in ['top', 'right', 'left', 'bottom']:
                ax.spines[spine].set_visible(False)
            f.tight_layout(pad=1.08, h_pad=None, w_pad=None)
            self.fig.append(f)

    def gen_html_distribution(self, outputname, title, align=50):
        """Write the chromosomal-distribution HTML report under
        outputname/title/distribution.html."""
        # BUGFIX: the original joined with the builtin `dir`, which raised a
        # TypeError at runtime; build the paths from outputname directly.
        fp = os.path.join(outputname, title)
        link_d = {title: "distribution.html"}
        html = Html(name="Viz", links_dict=link_d,
                    fig_dir=os.path.join(outputname, "fig"),
                    other_logo="viz", homepage="../index.html")
        for i, f in enumerate(self.fig):
            html.add_figure("distribution_test_" + str(i) + ".png", align="center")
        html.add_free_content(['<p style=\"margin-left: ' + str(align + 150) + '">' + '** </p>'])
        type_list = 'ssssssssssssssssssssssssssssssssssssssssssssss'
        col_size_list = [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                         10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
        data_table = []
        for ind_ty, ty in enumerate(self.disperDict.keys()):
            # list(...) so the concatenation works on py3 dict views too.
            header_list = ["Chromosome"] + list(self.disperDict[ty].keys())
            html.add_heading(ty, size=4, bold=False)
            for i, ch in enumerate(self.chrom_list):
                data_table.append(
                    [ch] + ["{:.3f} %".format(100 * self.disperDict[ty][r][i])
                            for r in self.disperDict[ty].keys()])
            html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align)
        html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See parameters</a>'])
        html.add_free_content([
            '<a href="reference_experimental_matrix.txt" style="margin-left:100">See reference experimental matrix</a>'])
        html.add_free_content(
            ['<a href="query_experimental_matrix.txt" style="margin-left:100">See query experimental matrix</a>'])
        html.write(os.path.join(fp, "distribution.html"))
cov.coverage_from_genomicset(r) #cov.normRPM() c.append(cov.coverage) return numpy.transpose(c) def printTable(namesCol,namesLines,table,fileName): f=open(fileName,"w") f.write("\t"+("\t".join(namesCol))+"\n") for i,line in enumerate(table): f.write(namesLines[i]+"\t"+("\t".join([str(j) for j in line]))+"\n") out="" experimentalFile = sys.argv[1] exps=ExperimentalMatrix() exps.read(experimentalFile) beds = exps.get_regionsets() reads = exps.get_readsfiles() readsnames = exps.get_readsnames() outputDir = sys.argv[2] if len(sys.argv) > 3: experimentalFile2 = sys.argv[3] exps2=ExperimentalMatrix() exps2.read(experimentalFile2) reads = exps2.get_readsfiles() readsnames = exps2.get_readsnames() out=outputDir for bed in beds: bednames=[r.chrom+":"+str(r.initial)+"-"+str(r.final) for r in bed]
#resprop1=numpy.divide(res,numpy.array([totalLine]).transpose()) resprop2=res.transpose()/numpy.array(totalLine) resprop2=resprop2.transpose() return namesCol, namesLine,res,resprop1,resprop2 def printTable(namesCol,namesLines,table,fileName): f=open(fileName,"w") f.write("\t"+("\t".join(namesCol))+"\n") for i,line in enumerate(table): f.write(namesLines[i]+"\t"+("\t".join([str(j) for j in line]))+"\n") out="" experimentalFile = sys.argv[1] exps=ExperimentalMatrix() exps.read(experimentalFile) beds = exps.get_regionsets() beds2=beds outputDir = sys.argv[2] if len(sys.argv) > 3: experimentalFile2 = sys.argv[3] exps2=ExperimentalMatrix() exps2.read(experimentalFile2) beds2 = exps2.get_regionsets() out=outputDir [namesCol,namesLine,res,resprop1,resprop2]=bedOverllap(beds,beds2,outPath=out) printTable(namesCol,namesLine,res,outputDir+"/count.table") printTable(namesCol,namesLine,resprop1,outputDir+"/propline.table")