Example #1
0
def histogram_runner(data):
    """Print compare() scores for real-vs-GAN and real-vs-real halves of ``data``.

    ``data`` is shuffled in place with ``np.random.shuffle``.  When it has an
    odd number of rows, the last row (after that first shuffle) is dropped so
    the array can be split into two equal halves.

    Two experiments of five rounds each are run:

    * "fake": the first half of the reshuffled real data is compared against
      the same number of rows drawn from ``GAN_sampler``.
    * "Real": the reshuffled real data is split in two and the halves are
      compared against each other.

    Every round prints the mean of the values returned by ``compare``; after
    each experiment the mean over its five rounds is printed.  Nothing is
    returned.
    """
    rounds = 5

    np.random.shuffle(data)
    odd_row_count = data.shape[0] % 2 != 0
    if odd_row_count:
        # np.split below needs an even number of rows
        data = data[:-1]
    first_half, _ = np.split(data, 2)
    half_size = first_half.shape[0]

    # NOTE(review): GAN_sampler is defined elsewhere; its output evidently has
    # a length-1 third axis, which is squeezed away here.
    fake = GAN_sampler(data.shape[0]).squeeze(axis=2)
    np.random.shuffle(fake)

    # real data against generated data
    fake_scores = []
    for _ in range(rounds):
        np.random.shuffle(fake)
        np.random.shuffle(data)
        boxes = compare(data[:half_size], fake[:half_size])
        round_score = sum(boxes) / len(boxes)
        print(round_score, "fake")
        fake_scores.append(round_score)
    print(sum(fake_scores) / len(fake_scores), "Average Fake\n")

    # real data against real data (baseline)
    real_scores = []
    for _ in range(rounds):
        np.random.shuffle(data)
        lower_half, upper_half = np.split(data, 2)
        boxes = compare(upper_half, lower_half)
        round_score = sum(boxes) / len(boxes)
        print(round_score, "Real")
        real_scores.append(round_score)
    print(sum(real_scores) / len(real_scores), "Average Real\n")
def findWindowSize(peakbed, chrm='chr1', windowSizes=(1E3, 1E4, 1E5, 1E6, 1E7)):
    """
    given a chromosome and a range of window sizes,
    find the average number of peaks per window, and plot.

    Parameters
    ----------
    peakbed :
        Passed unchanged to ``findPeaksPerWindow``.
    chrm : str, optional
        Chromosome passed to ``findPeaksPerWindow`` and used as the title of
        the distribution plot.  Defaults to 'chr1'.
    windowSizes : sequence of numbers, optional
        Window sizes to evaluate.  Defaults to 1E3 ... 1E7 in decades.

    Returns
    -------
    None.  Two plots are drawn with matplotlib: mean count versus window size
    (log-scaled x axis) and the per-window-size distributions via
    ``histogram.compare``.
    """
    windowSizes = list(windowSizes)

    # first element of findPeaksPerWindow's result is the per-window vector
    vecs = [findPeaksPerWindow(peakbed, chrm, windowSize)[0]
            for windowSize in windowSizes]

    # plot average number of peaks versus window size
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(windowSizes, [np.mean(vec) for vec in vecs], 'o')
    ax.set_xscale('log')
    ax.set_xlabel('window size')
    ax.set_ylabel('average number of peaks')

    # plot distributions of peaks for different window sizes
    # (a plain list has no .astype, so convert to an array to build the labels)
    histogram.compare(vecs, labels=np.asarray(windowSizes).astype(str))
    # label whatever axes are current after histogram.compare has drawn
    ax = plt.gca()
    ax.set_xlabel('number of locations per window')
    ax.set_ylabel('normalized number of windows')
    ax.set_title(chrm)
# Finish the current figure: label the y axis, add a colorbar for `im`,
# and tighten the layout.
# NOTE(review): `ax` and `im` are created before the visible part of this
# script; presumably `im` is the image/heatmap artist drawn on `ax` - confirm.
ax.set_ylabel('number of NFI motifs')
plt.colorbar(im)
plt.tight_layout()


## plot
# One label per column of num_nfi, in column order.
names = np.array(['NF1-fullsite', 'FOXA1:NF1-halfsite' , 'NF1-halfsite','de novo NFI-halfsite', 'Tlx'])

# plot histograms
# One vector of log2 fold changes per motif column (entries whose count in
# that column is positive), followed by one vector covering every entry.
vecs = []
for col in range(len(names)):
    has_motif = num_nfi[:, col] > 0
    vecs.append(np.log2(fold_change[has_motif]))
vecs.append(np.log2(fold_change))

# bin edges are offset by half a bin relative to -5 and 6
binwidth = 0.3
half_bin = binwidth*0.5
xbins = np.arange(-5 - half_bin, 6 + half_bin, binwidth)

fig = plt.figure(figsize=(7.2,4.5))
histogram.compare(vecs, xbins=xbins, labels=np.append(names, 'all peaks'))
ax = plt.gca()
ax.grid()
ax.set_ylabel('fraction of peaks')
ax.set_xlabel('log2(fold change)')
plt.legend(loc='upper left')
plt.tight_layout()
plt.savefig('peakScores.per_NFI_motif.histogram.pdf')

# plot CDF
# seqfun.getCDF returns an (x, y) pair for each vector; collect them separately
xvalues = []
yvalues = []
for vec in vecs:
    cdf_x, cdf_y = seqfun.getCDF(vec)
    xvalues.append(cdf_x)
    yvalues.append(cdf_y)
fig = plt.figure(figsize=(7.2,4.5))
plotfun.plot_manylines(yvalues, x=xvalues, labels=np.append(names, 'all peaks'))
# Flag peaks farther than the cutoff from a TSS as distal.
cutoff_distance = 5E3
peakBed.distal = peakBed.distancetoTss > cutoff_distance

# get Intergenic indicator from homer annotate peaks
peakIndxName = os.path.join(wd, 'scoring/140815_peaks.coverageCorr.all.ann.noheader.intergenic.peakIndx')
peakIndx = np.loadtxt(peakIndxName, dtype=bool)

# find the number of chip peaks that fall into differentially accessible peaks
# Null distribution: permute the "significant up" flags among the selected
# peaks and count how many still coincide with a chip peak.
numIterations = 5000
numExpected = np.empty(numIterations)

chipPeakSelected = peakBed.hasChipPeak[peakIndx]
significantUpSelected = peakBed.significant_up[peakIndx]

for i in range(numIterations):
    significant_up_random = np.random.permutation(significantUpSelected)
    numExpected[i] = np.sum(np.logical_and(chipPeakSelected, significant_up_random))

# observed overlap, without permutation
numActual = np.sum(np.logical_and(chipPeakSelected, significantUpSelected))

# unit-width bins, offset by half a bin, starting at minbin
binwidth = 1
minbin = 0
maxbin = 100
halfBin = binwidth*0.5
xbins = np.arange(minbin - halfBin, maxbin + halfBin, binwidth)

plt.figure(figsize=(4,4))
histogram.compare([numExpected], xbins=xbins, labels=['simulated'])
plt.legend(loc='upper left')
ax = plt.gca()
ax.set_xlabel('number of chip seq peaks in our peaks')
# mark the observed count just above the x axis
ax.plot([numActual], [0.001], 'ro', label='actual')
# rebuild the legend so it also lists the 'actual' marker
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)
plt.tight_layout()
plt.savefig('numberOfChipSeqPeaks.differentiallyAccessible.intergenic.histogram.pdf')
Example #5
0
    else: outfile = options.o
    print "saving to: %s"%outfile
        
    # load genome size
    if options.g is None:
        options.g = '/raid/gSizes/mm9.genomsize'
    genomeSize = filefun.getGenomeSize(options.g)
        
    # find signal around each motif sites
    signals = np.zeros((numSamples, numSites))
    for i, bw in enumerate(bws):
        print "Extracting signal for %s"%(os.path.basename(bw))
        pool = Pool(processes=options.p)
        sigArray = pool.map(functools.partial(findInsertions, bw, bedFile), range(0, numSites))
        pool.close()
        signals[i] = np.array(sigArray)

    np.save(outfile+'.npy', signals)
    
    labels = np.array([os.path.splitext(os.path.basename(bw))[0].replace('wgEncodeFsuRepliChip', '').replace('WaveSignal', '_') for bw in bws])
    indx = np.array([label.find('Diff') == -1 for label in labels])
    
    plt.figure(figsize=(10,6))
    binsize = 0.2
    binmin = -2
    binmax = 2
    xbins = np.arange(binmin-binsize*0.5, binmax+binsize,binsize)
    histogram.compare(signals[indx], labels=labels[indx], xbins=xbins, cmap='set1')
    ax = plt.gca()
    ax.set_xlabel('replication timing')
    plt.savefig('%s.replication_timing.pdf'%outfile)
    np.save(outfile+'.npy', signals)
    
    """
    for signal tracks, make plot
    """
    span = 1E4
    indx = np.arange(0, options.l+options.r, span, dtype=int)
    xvalues = np.arange(-options.l, options.r, span)
    fig = plt.figure(figsize=(5, 4))
    ax = fig.add_subplot(111)
    for signal in signals[0]:
        ax.plot(xvalues, signal[indx], 'b', alpha=0.1)
    ax.plot(xvalues, np.nanmean(signals[0, :, indx], 1), 'k')
    plt.savefig('%s.%s.pdf'%(outfile, options.interval))

    """
    Now, for conservation, etc., plot the conservation in distal sites. Distal
    sites are those more than 5 kb from the nearest TSS.
    """

    locBed = filefun.loadBedwScores(options.a)
    locBed.distanceToTss = np.array(subprocess.check_output("bedtools closest -d -t first -a %s -b %s | awk '{print $NF}'"%(bedFileName, tssBedFileName), shell=True).split(), dtype=int)
    locBed.distal = locBed.distanceToTss > 5E3
    
    signals[0, np.all((locBed.distal, locBed.significant_up), axis=0)]
    signals[0, locBed.distal]
    signals[0, np.logical_not(locBed.distal)]
    histogram.compare([signals[0, np.all((locBed.distal, locBed.significant_up), axis=0)],
                       signals[0, np.all((locBed.distal, locBed.no_change), axis=0)],
                       signals[0, np.logical_not(locBed.distal)]], labels=['up, distal', 'no change, distal', 'all promoter'])
    plotfun.plot_barplot()
# cluster motifs
# Number of motifs (columns of matnew) included in the pairwise comparisons.
nummotifs_to_look_at = nummotifs

# by correlation of distances to start
distanceCorr = np.array([[getDistanceSpearmanr(matnew[:, i], matnew[:, j]) for i in range(nummotifs_to_look_at)] for j in range(nummotifs_to_look_at)])
plotHeatMap(distanceCorr, rowlabels=motifNames, columnlabels=motifNames, fontSize=6, cmap='RdGy_r', vmin=0, vmax=1)
plt.savefig('%s.heatmap.correlation_of_distance.pdf'%outfile)

# by whether motif is present or not
# A motif counts as present where matnew is finite.  Compute the mask once
# rather than twice per (i, j) pair inside the comprehension.
motifPresent = np.isfinite(matnew)
# 1 - Jaccard distance, i.e. the similarity of the two presence masks
distanceCorr = np.array([[1-scipy.spatial.distance.jaccard(motifPresent[:, i], motifPresent[:, j]) for i in range(nummotifs_to_look_at)] for j in range(nummotifs_to_look_at)])
plotHeatMap(distanceCorr, rowlabels=motifNames, columnlabels=motifNames, fontSize=6, cmap='RdGy_r', vmin=0, vmax=1)
plt.savefig('%s.heatmap.jaccard.pdf'%outfile)

# histogram distances to call
plt.figure(figsize=(6,4))
# bin edges every 5 units, shifted by half a bin
xbins, hists = histogram.compare(np.transpose(matnew), labels=motifNames, xbins=np.arange(-100, 100, 5)-2.5, normalize=True)
plt.savefig('%s.distance.histogram.pdf'%outfile)
plt.figure(figsize=(8,8))
heatmapfun.plotCoverageHeatMap(np.transpose(hists[:nummotifs_to_look_at]), rowlabels=motifNames[:nummotifs_to_look_at], colorbar=False)
plt.savefig('%s.distance.heatmap.pdf'%outfile)



# percentage of sites
# Per-motif count of finite entries divided by numsites; float() keeps the
# division non-integer under Python 2 as well.
percent = np.sum(np.isfinite(matnew), axis=0)/float(numsites)

plt.figure(figsize=(10,6))
plotfun.plot_barplot([percent[:nummotifs_to_look_at]], samples = motifNames[:nummotifs_to_look_at])
ax = plt.gca()
# drop the legend created by the bar plot helper
ax.legend_ = None
# rotation must be a number (or 'vertical'/'horizontal'); the string '90'
# is rejected by current matplotlib
plt.xticks(rotation=90)