# Annotate the scored accessible peaks with three per-peak attributes
# (NFIB ChIP overlap, distance to the nearest TSS, distal flag) and load the
# intergenic indicator written out from the HOMER annotation.
from heatmapfun import plotHeatMap
import filefun
import globalvars

parameters = globalvars.Parameters()
wd = '/home/sarah/GreenDragonRaid1/lab/sarah/SCLC_ATAC/140908_bams/mm9/peakProcessing'

# load peak beds with scores
bedFileName = os.path.join(wd, 'scoring/140815_peaks.coverageCorr.all.bed')
peakBed = filefun.loadBedwScores(bedFileName)

# load nfib peak summits
chipBedFileName = '/home/sarah/GreenDragonRaid1/shr/Downloaded_data/mm9/nfib_chip/nfib_peak_summits.bed'
chipBed = filefun.loadBed(chipBedFileName)

# TSS annotation (only the file name is needed here; bedtools reads the file)
tssBedFileName = '/raid/Downloaded_data/mm9_data/TSS/refSeqmm9.TSS.fix.bed'

# find the number of chip peaks that fall into accessible peaks:
# the last column of `bedtools intersect -c` is kept for every accessible
# peak and any non-zero value is flagged as True.
chipOverlapCmd = (
    "bedtools intersect -c -b %s -a %s | awk '{print $NF}'"
    % (chipBedFileName, bedFileName)
)
chipOverlapCounts = subprocess.check_output(chipOverlapCmd, shell=True).split()
peakBed.hasChipPeak = np.array(chipOverlapCounts, dtype=int).astype(bool)

# get peak distance to TSS: last column of `bedtools closest -d`
tssDistanceCmd = (
    "bedtools closest -d -t first -a %s -b %s | awk '{print $NF}'"
    % (bedFileName, tssBedFileName)
)
tssDistances = subprocess.check_output(tssDistanceCmd, shell=True).split()
peakBed.distancetoTss = np.array(tssDistances, dtype=int)

# peaks farther than the cutoff from the closest TSS are called distal
cutoff_distance = 5E3
peakBed.distal = peakBed.distancetoTss > cutoff_distance

# get Intergenic indicator from homer annotate peaks
peakIndxName = os.path.join(
    wd,
    'scoring/140815_peaks.coverageCorr.all.ann.noheader.intergenic.peakIndx',
)
peakIndx = np.loadtxt(peakIndxName, dtype=bool)
ax.set_xlabel('distance across chromosome %s (Mb)'%chrm) ax.yaxis.tick_left() return #### SCRIPT ##### print '%s\n%s\n%s'%(options.b, options.g, options.o) # lets look at this subset of chromosomes chrms = np.array(['chr%d'%i for i in np.linspace(1, 19, 19)]) # import bedfile stepSize = 5E5 # distance between points windowSize = 1E6 # distance over which to average genomeSize = filefun.getGenomeSize(options.g) locBed = filefun.loadBed(options.b) values = filefun.Peaks(np.loadtxt(options.c)) smoothedValues, windowedLocs = smooth_windowed_genome.main(locBed, values,genomeSize, windowSize, stepSize ) # normalize coverage by total number of reads on chromosome 1. Should you do that? # Can't normalize with chromosome 4 because variable amplifications lead to craziness smoothedValuesNorm = {} for chrm in chrms: #smoothedValuesNorm[chrm] = smoothedValues[chrm]/np.mean(smoothedValues['chr1'], 0) * np.mean(smoothedValues['chr1']) # NOTE: not actually normalizing at all smoothedValuesNorm[chrm] = smoothedValues[chrm] # what should max plotted value be? allValues = smoothedValuesNorm[chrms[0]] for chrm in chrms[1:]:
# given enrichment scores of how spatiall distirbuted the peaks # are, what are the intervals of 'boundaries' i.e. places where enrichment crosses zero? ##### IMPORT MODULES ##### # import necessary for python import os import sys import numpy as np import subprocess import matplotlib.pyplot as plt import filefun import histogram ##### enrichment = np.loadtxt('counts_in_peaks.distal.hypervsnot.enrichment_values.iterations_500.mat') locBed = filefun.loadBed('counts_in_peaks.distal.windowed_locs.bed') """ define boundary regions as those close to zero, where regions to either side change signs range of 'zero' points is within 0.1 std deviations of overall distribution from zero """ stdev = np.nanstd(enrichment) mx = 0.1 # possible boundaries are those that are within 0.1 standard deviations from 0 possible_boundaries = np.all((enrichment < mx*stdev, enrichment > -mx*stdev), axis=0) # cycle through and ask if the region before that boundary region actual_boundaries = np.zeros(possible_boundaries.shape, dtype=bool) for indx in np.ravel(np.where(possible_boundaries)):
y[np.argsort(y)] = meannorm.astype(int) return x, y wd = "/home/sarah/GreenDragonRaid1/lab/sarah/SCLC_ATAC/140908_bams/mm9/peakProcessing" readcountNormFile = os.path.join(wd, "140815_peaks.coverageCorr.normalize.norepicates.peakCount") bedFileName = os.path.join(wd, "scoring/140815_peaks.coverageCorr.all.bed") tssBedFileName = "/raid/Downloaded_data/mm9_data/TSS/refSeqmm9.TSS.fix.bed" noExprChangeFile = os.path.join(wd, "expression/diff_TvsM.filtered.noChange.tss.noExprChange") enrichedBedFile = os.path.join(wd, "spatialCorrelation/significant_up_peaks.enriched.merged.bed") depletedBedFile = os.path.join(wd, "spatialCorrelation/significant_up_peaks.depleted.merged.bed") # load files readCountNorm = np.loadtxt(readcountNormFile) peakBed = filefun.loadBedwScores(bedFileName) tssBed = filefun.loadBed(tssBedFileName) enrichedBed = filefun.loadBed(enrichedBedFile) depletedBed = filefun.loadBed(depletedBedFile) noExprChange = np.loadtxt(noExprChangeFile, dtype=bool) # get non promoter proximal peaks peakBed.distancetoTss = np.array( subprocess.check_output( "bedtools closest -d -t first -a %s -b %s | cut -f14" % (bedFileName, tssBedFileName), shell=True ).split(), dtype=int, ) cutoff_distance = 5e3 peakBed.distal = peakBed.distancetoTss > cutoff_distance # get region of peaks
# Per-chromosome enrichment: log2 ratio of the observed signal density over
# the mean (along axis 0) of the randomized signal densities.
for chrm in chrms:
    foldChange = signalDensityDict[chrm]
    foldChangeRandom = np.mean(signalDensityRandomDict[chrm], 0)
    enrichment = np.log2(foldChange/foldChangeRandom)
    # NOTE(review): the original indentation was lost; both plotting calls are
    # assumed to belong to the chr8-only branch -- confirm.
    if chrm == 'chr8':
        plt.figure(figsize=(20, 5))
        heatmapfun.plotCoverageHeatMap(
            enrichment[:, reorder],
            cluster=False,
            rowlabels=parameters.headers_noreplicates[reorder],
        )

# 11/7/14
# Nucleosome positions on chr1: compare z-scores of all positions against
# those overlapping NFI (full site) motifs and CTCF motifs.
import filefun
import subprocess

motifBed = '../NF1_CTF.bed'
nucBedFile = 'hyperaccessible.chr1.nucpos.bed'
nucBed = filefun.loadBed(nucBedFile)
nucBed.zscore = np.loadtxt(nucBedFile, usecols=(3,))

# The last column of `bedtools intersect -c` is read for every nucleosome
# position.  The text fields are parsed as integers BEFORE the conversion to
# bool: casting the raw strings straight to bool depends on NumPy's string
# truthiness rules, and from NumPy 2.0 on the string "0" is True, which would
# flag every position as overlapping a motif.
nucBed.is_nfi_full = np.array(
    subprocess.check_output(
        "bedtools intersect -c -b %s -a %s | awk '{print $NF}'"
        % (motifBed, nucBedFile),
        shell=True,
    ).split(),
    dtype=int,
).astype(bool)
nucBed.is_ctcf = np.array(
    subprocess.check_output(
        "bedtools intersect -c -b %s -a %s | awk '{print $NF}'"
        % ('../../CTCF_Zf/CTCF_Zf.noheader.bed', nucBedFile),
        shell=True,
    ).split(),
    dtype=int,
).astype(bool)

# z-score histogram for the three groups, unit-width bins from -0.5 to 20.5
xbins = np.arange(-0.5, 21, 1)
histogram.compare(
    [
        nucBed.zscore,
        nucBed.zscore[nucBed.is_nfi_full],
        nucBed.zscore[nucBed.is_ctcf],
    ],
    labels=['all', 'NFI full', 'CTCF'],
    xbins=xbins,
)
ax = plt.gca()
ax.set_xlabel('zscore')
ax.set_ylabel('fraction')
plt.savefig('hyperaccessible.chr1.nucpos.bed.zscore.histogram.pdf')

# Mean footprint over the CTCF-overlapping positions, plotted against
# offsets -100 .. 100 (first entry of the saved array is used).
footprints = np.load('hyperaccessible.insertions.npy')[0]
fig = plt.figure()
ax = fig.add_subplot(111)
xvalues = np.arange(-100, 101)
ax.plot(
    xvalues,
    np.mean(footprints[nucBed.is_ctcf], axis=0),
    label='CTCF sites',
)
#### SCRIPT #####
# Set-up for the per-chromosome signal density computation: echo the options,
# load the inputs, and create the empty per-chromosome result containers.
# The parenthesised single-argument print form outputs the same text on
# Python 2 and is also valid Python 3 syntax.
print('%s\n%s\n%s' % (options.b, options.g, options.o))
bedFileName = options.b
countFile = options.c
genomeSizeFile = options.g

print('loading files...')
# hard-coded alternatives to the command-line options, kept for reference:
# genomeSizeFile = '/raid/gSizes/mm9.genomsize'
# bedFileName = '../scoring/140815_peaks.coverageCorr.all.bed'
# countFile = '../140815_peaks.coverageCorr.normalize.norepicates.peakCount'
tssBedFileName = '/raid/Downloaded_data/mm9_data/TSS/refSeqmm9.TSS.fix.bed'
genomeSize = filefun.getGenomeSize(genomeSizeFile)
locBed = filefun.loadBed(bedFileName)
values = filefun.Peaks(np.loadtxt(countFile))
numPeaks = values.numPeaks

print('initializing...')
# chromosomes chr1 .. chr19
chrms = np.array(['chr%d' % i for i in range(1, 20)])
# chrms = np.array([chrms[0]])  # single-chromosome run

# go through each chromosome and find signal density
windowSize = 1E6
stepSize = 1E5
signalDensityDict = {}
signalDensityRandomDict = {}
windowedLocsDict = {}
qvaluesDict = {}