def histogram_runner(data):
    """Print mean ``compare`` scores for real-vs-generated and real-vs-real data.

    Phase 1 ("fake"): five times, the real rows and the generated rows are
    reshuffled and the first half of each is passed to ``compare``; the mean
    of the returned scores is printed per round, followed by the mean over
    all rounds.

    Phase 2 ("Real"): five times, the real rows are reshuffled, split into
    two equal halves and the halves are passed to ``compare`` against each
    other; means are printed the same way.

    Args:
        data: array of real samples, indexed by sample along axis 0. It is
            copied before shuffling, so the caller's array keeps its order.

    Returns:
        None. Results are only printed.
    """
    num_rounds = 5

    # np.random.shuffle permutes in place, and the data[:-1] trim below is
    # only a view, so without a copy the caller's array would be reordered.
    data = np.array(data, copy=True, subok=True)

    np.random.shuffle(data)
    if data.shape[0] % 2 != 0:
        # An even row count is needed for the equal split; since the rows
        # were just shuffled, the dropped row is a random one.
        data = data[:-1]
    half = data.shape[0] // 2

    fake = GAN_sampler(data.shape[0]).squeeze(axis=2)
    # Redundant with the per-round shuffle below, but kept so that seeded
    # runs consume the random stream exactly as before.
    np.random.shuffle(fake)

    fake_means = []
    for _ in range(num_rounds):
        np.random.shuffle(fake)
        np.random.shuffle(data)
        scores = compare(data[:half], fake[:half])
        round_mean = sum(scores) / len(scores)
        print(round_mean, "fake")
        fake_means.append(round_mean)
    print(sum(fake_means) / len(fake_means), "Average Fake\n")

    real_means = []
    for _ in range(num_rounds):
        np.random.shuffle(data)
        first_half, second_half = np.split(data, 2)
        scores = compare(second_half, first_half)
        round_mean = sum(scores) / len(scores)
        print(round_mean, "Real")
        real_means.append(round_mean)
    print(sum(real_means) / len(real_means), "Average Real\n")
def findWindowSize(peakbed):
    """Plot how the number of peaks per window varies with window size.

    For chromosome 'chr1' and window sizes 1e3, 1e4, 1e5, 1e6 and 1e7, the
    first element returned by ``findPeaksPerWindow`` is collected. Two plots
    are then drawn: the mean of each collected vector against window size
    (log-scaled x axis), and a ``histogram.compare`` of the vectors labelled
    by window size.

    Args:
        peakbed: peak table, passed unchanged to ``findPeaksPerWindow``.

    Returns:
        None. The function only draws with matplotlib.
    """
    chrm = 'chr1'
    windowSizes = [1E3, 1E4, 1E5, 1E6, 1E7]
    vecs = [findPeaksPerWindow(peakbed, chrm, windowSize)[0]
            for windowSize in windowSizes]

    # plot average number of peaks versus window size
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(windowSizes, [np.mean(vec) for vec in vecs], 'o')
    ax.set_xscale('log')
    ax.set_xlabel('window size')
    ax.set_ylabel('average number of peaks')

    # plot distributions of peaks for different window sizes
    # A plain list has no .astype, so convert to an array to build the
    # string labels.
    # NOTE(review): no new figure is opened here; if histogram.compare draws
    # on the current axes it will reuse the plot above -- confirm intent.
    histogram.compare(vecs, labels=np.array(windowSizes).astype(str))
    ax = plt.gca()
    ax.set_xlabel('number of locations per window')
    ax.set_ylabel('normalized number of windows')
    ax.set_title(chrm)
    return
# Finish the figure started above this span: label the y axis, attach a
# colorbar for `im`, and tighten the layout.
ax.set_ylabel('number of NFI motifs')
plt.colorbar(im)
plt.tight_layout()

## plot
# One label per column of num_nfi that is used below (columns 0..4).
names = np.array(['NF1-fullsite', 'FOXA1:NF1-halfsite' , 'NF1-halfsite','de novo NFI-halfsite', 'Tlx'])

# plot histograms
# One vector per column: log2 fold change restricted to the rows where that
# column of num_nfi is > 0. The unrestricted log2 fold change is appended
# last and labelled 'all peaks'.
vecs = [np.log2(fold_change[num_nfi[:, col]>0]) for col in range(len(names))]
vecs.append(np.log2(fold_change))
binwidth = 0.3
# Edges start half a bin below -5, so -5 is the centre of the first bin.
xbins=np.arange(-5-binwidth*0.5, 6+binwidth*0.5, binwidth)
fig = plt.figure(figsize=(7.2,4.5))
histogram.compare(vecs, xbins=xbins, labels=np.append(names, 'all peaks'))
ax = plt.gca()
ax.grid()
ax.set_ylabel('fraction of peaks')
ax.set_xlabel('log2(fold change)')
plt.legend(loc='upper left')
plt.tight_layout()
plt.savefig('peakScores.per_NFI_motif.histogram.pdf')

# plot CDF
# Placeholder lists, filled in by index with the two values returned by
# seqfun.getCDF for each vector.
xvalues = ['']*len(vecs)
yvalues = ['']*len(vecs)
for i, vec in enumerate(vecs):
    xvalues[i], yvalues[i] = seqfun.getCDF(vec)
fig = plt.figure(figsize=(7.2,4.5))
# Same labels as the histogram above, one line per vector.
plotfun.plot_manylines(yvalues, x=xvalues, labels=np.append(names, 'all peaks'))
# Flag peaks farther than cutoff_distance from a TSS as distal.
cutoff_distance = 5E3
peakBed.distal = peakBed.distancetoTss > cutoff_distance

# get Intergenic indicator from homer annotate peaks
peakIndxName = os.path.join(wd, 'scoring/140815_peaks.coverageCorr.all.ann.noheader.intergenic.peakIndx')
peakIndx = np.loadtxt(peakIndxName, dtype=bool)

# find the number of chip peaks that fall into differentially accessible peaks
# Permutation test: shuffle the significant_up flags among the selected peaks
# and count how many shuffled-up peaks also have a chip peak. The masked
# arrays do not change between iterations, so they are selected once here.
numIterations = 5000
hasChipPeakSelected = peakBed.hasChipPeak[peakIndx]
significantUpSelected = peakBed.significant_up[peakIndx]
numExpected = np.empty(numIterations)
for i in range(numIterations):
    significant_up_random = np.random.permutation(significantUpSelected)
    # np.all over the stacked pair is an element-wise AND of the two flags.
    numExpected[i] = np.sum(np.all((hasChipPeakSelected, significant_up_random), axis=0))
numActual = np.sum(np.all((hasChipPeakSelected, significantUpSelected), axis=0))

# Unit-wide bins centred on the integers, starting at 0.
binwidth = 1
minbin = 0
maxbin = 100
xbins = np.arange(minbin-binwidth*0.5, maxbin+binwidth*0.5, binwidth)
plt.figure(figsize=(4,4))
histogram.compare([numExpected], xbins=xbins, labels=['simulated'])
plt.legend(loc='upper left')
ax = plt.gca()
ax.set_xlabel('number of chip seq peaks in our peaks')
# Mark the observed count with a red dot just above the x axis.
ax.plot([numActual], [0.001], 'ro', label='actual')
# Rebuild the legend so that it also lists the 'actual' marker.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)
plt.tight_layout()
plt.savefig('numberOfChipSeqPeaks.differentiallyAccessible.intergenic.histogram.pdf')
# NOTE(review): the `if` that this `else` belongs to is above the visible
# span; the nesting below was reconstructed from a whitespace-collapsed
# source and should be confirmed against the original file.
else:
    outfile = options.o
print "saving to: %s"%outfile

# load genome size
# Falls back to a fixed mm9 path when no -g option was given.
# NOTE(review): genomeSize is not used anywhere in the visible span.
if options.g is None:
    options.g = '/raid/gSizes/mm9.genomsize'
genomeSize = filefun.getGenomeSize(options.g)

# find signal around each motif sites
# signals has one row per entry of bws and one column per site.
signals = np.zeros((numSamples, numSites))
for i, bw in enumerate(bws):
    print "Extracting signal for %s"%(os.path.basename(bw))
    # A new worker pool is created for every file; findInsertions is called
    # with (bw, bedFile, site_index) for each site index.
    # NOTE(review): the pool is closed but never joined -- confirm that is
    # intended.
    pool = Pool(processes=options.p)
    sigArray = pool.map(functools.partial(findInsertions, bw, bedFile), range(0, numSites))
    pool.close()
    signals[i] = np.array(sigArray)
np.save(outfile+'.npy', signals)

# Label each track by its file name without extension, dropping the
# 'wgEncodeFsuRepliChip' substring and turning 'WaveSignal' into '_'.
labels = np.array([os.path.splitext(os.path.basename(bw))[0].replace('wgEncodeFsuRepliChip', '').replace('WaveSignal', '_') for bw in bws])
# Keep only the tracks whose label does not contain 'Diff'.
indx = np.array([label.find('Diff') == -1 for label in labels])
plt.figure(figsize=(10,6))
binsize = 0.2
binmin = -2
binmax = 2
# Bin edges start half a bin below binmin.
xbins = np.arange(binmin-binsize*0.5, binmax+binsize,binsize)
histogram.compare(signals[indx], labels=labels[indx], xbins=xbins, cmap='set1')
ax = plt.gca()
ax.set_xlabel('replication timing')
plt.savefig('%s.replication_timing.pdf'%outfile)
# Persist the signal array next to the other outputs.
np.save(outfile+'.npy', signals)

""" for signal tracks, make plot """
# Sample every `span`-th position of the window [-options.l, options.r).
span = 1E4
indx = np.arange(0, options.l+options.r, span, dtype=int)
xvalues = np.arange(-options.l, options.r, span)
fig = plt.figure(figsize=(5, 4))
ax = fig.add_subplot(111)
# One faint blue line per row of signals[0], then a black line for the
# NaN-ignoring mean.
for signal in signals[0]:
    ax.plot(xvalues, signal[indx], 'b', alpha=0.1)
# NOTE(review): assuming signals is a 3-D ndarray, indexing with the integer
# 0 and the array indx separated by a slice puts the indx axis first, so
# axis 1 is the row axis of signals[0] and the mean is taken across rows at
# each sampled position -- confirm.
ax.plot(xvalues, np.nanmean(signals[0, :, indx], 1), 'k')
plt.savefig('%s.%s.pdf'%(outfile, options.interval))

""" Now, for conservation, etc, plot the conservation in distal sites. Dista, """
locBed = filefun.loadBedwScores(options.a)
# Runs `bedtools closest -d -t first` in a shell and keeps the last field of
# every output line, parsed as int.
# NOTE(review): bedFileName and tssBedFileName are not defined in the visible
# span, and the file names are interpolated into a shell=True command
# unquoted -- confirm they are trusted paths.
locBed.distanceToTss = np.array(subprocess.check_output("bedtools closest -d -t first -a %s -b %s | awk '{print $NF}'"%(bedFileName, tssBedFileName), shell=True).split(), dtype=int)
locBed.distal = locBed.distanceToTss > 5E3
# NOTE(review): the next three lines are bare expressions whose results are
# discarded; they look like leftovers from interactive exploration.
signals[0, np.all((locBed.distal, locBed.significant_up), axis=0)]
signals[0, locBed.distal]
signals[0, np.logical_not(locBed.distal)]
# Compare three row subsets of signals[0]: distal and significant_up,
# distal and no_change, and everything that is not distal.
histogram.compare([signals[0, np.all((locBed.distal, locBed.significant_up), axis=0)], signals[0, np.all((locBed.distal, locBed.no_change), axis=0)], signals[0, np.logical_not(locBed.distal)]], labels=['up, distal', 'no change, distal', 'all promoter'])
# NOTE(review): plot_barplot is called here without any data arguments;
# this looks unfinished -- confirm.
plotfun.plot_barplot()
# cluster motifs
nummotifs_to_look_at = nummotifs

# by correlation of distances to start
# Pairwise matrix over the first nummotifs_to_look_at columns of matnew.
distanceCorr = np.array([[getDistanceSpearmanr(matnew[:, i], matnew[:, j])
                          for i in range(nummotifs_to_look_at)]
                         for j in range(nummotifs_to_look_at)])
plotHeatMap(distanceCorr, rowlabels=motifNames, columnlabels=motifNames, fontSize=6, cmap='RdGy_r', vmin=0, vmax=1)
plt.savefig('%s.heatmap.correlation_of_distance.pdf'%outfile)

# by whether motif is present or not
# A finite entry in matnew is treated as "present". The mask is computed once
# here instead of twice for every cell of the pairwise matrix.
motifPresent = np.isfinite(matnew)
# 1 - Jaccard distance, i.e. Jaccard similarity of the presence masks.
distanceCorr = np.array([[1-scipy.spatial.distance.jaccard(motifPresent[:, i], motifPresent[:, j])
                          for i in range(nummotifs_to_look_at)]
                         for j in range(nummotifs_to_look_at)])
plotHeatMap(distanceCorr, rowlabels=motifNames, columnlabels=motifNames, fontSize=6, cmap='RdGy_r', vmin=0, vmax=1)
plt.savefig('%s.heatmap.jaccard.pdf'%outfile)

# histogram distances to call
# 5-wide bins from -100 to 100, shifted down by half a bin.
plt.figure(figsize=(6,4))
xbins, hists = histogram.compare(np.transpose(matnew), labels=motifNames, xbins=np.arange(-100, 100, 5)-2.5, normalize=True)
plt.savefig('%s.distance.histogram.pdf'%outfile)

plt.figure(figsize=(8,8))
heatmapfun.plotCoverageHeatMap(np.transpose(hists[:nummotifs_to_look_at]), rowlabels=motifNames[:nummotifs_to_look_at], colorbar=False)
plt.savefig('%s.distance.heatmap.pdf'%outfile)

# percentage of sites
# Fraction of sites with a finite entry, per column. The mask is recomputed
# here in case matnew was changed by the calls above.
percent = np.sum(np.isfinite(matnew), axis=0)/float(numsites)
plt.figure(figsize=(10,6))
plotfun.plot_barplot([percent[:nummotifs_to_look_at]], samples = motifNames[:nummotifs_to_look_at])
ax = plt.gca()
# Drop the legend reference that plot_barplot left on the axes.
ax.legend_ = None
# A numeric angle works across matplotlib versions; the string '90' is
# rejected by current releases, which accept only a number, 'vertical' or
# 'horizontal'.
plt.xticks(rotation=90)