import os
import subprocess

import numpy as np

from heatmapfun import plotHeatMap
import filefun
import globalvars
# Shared analysis parameters from the project-local globalvars module.
parameters = globalvars.Parameters()

# Base directory for the peak-processing files used below.
wd = '/home/sarah/GreenDragonRaid1/lab/sarah/SCLC_ATAC/140908_bams/mm9/peakProcessing'

# Accessible peaks, loaded together with their scores.
bedFileName = os.path.join(wd, 'scoring/140815_peaks.coverageCorr.all.bed')
peakBed = filefun.loadBedwScores(bedFileName)

# NFIB ChIP peak summits.
chipBedFileName = '/home/sarah/GreenDragonRaid1/shr/Downloaded_data/mm9/nfib_chip/nfib_peak_summits.bed'
chipBed = filefun.loadBed(chipBedFileName)

# TSS annotation; in this section only its path is used (passed to bedtools).
tssBedFileName = '/raid/Downloaded_data/mm9_data/TSS/refSeqmm9.TSS.fix.bed'

# Count the ChIP summits that fall into each accessible peak: awk keeps the
# last output column, which is parsed as an integer, and any non-zero count
# becomes True.
peakBed.hasChipPeak = np.array(
    subprocess.check_output(
        "bedtools intersect -c -b %s -a %s | awk '{print $NF}'"
        % (chipBedFileName, bedFileName),
        shell=True,
    ).split(),
    dtype=int,
).astype(bool)

# Distance from each peak to its closest TSS, taken from the last column of
# the `bedtools closest -d -t first` output.
peakBed.distancetoTss = np.array(
    subprocess.check_output(
        "bedtools closest -d -t first -a %s -b %s | awk '{print $NF}'"
        % (bedFileName, tssBedFileName),
        shell=True,
    ).split(),
    dtype=int,
)

# Peaks farther than cutoff_distance from the nearest TSS count as distal.
cutoff_distance = 5e3
peakBed.distal = peakBed.distancetoTss > cutoff_distance

# Intergenic indicator derived from HOMER annotatePeaks output.
peakIndxName = os.path.join(
    wd,
    'scoring/140815_peaks.coverageCorr.all.ann.noheader.intergenic.peakIndx',
)
peakIndx = np.loadtxt(peakIndxName, dtype=bool)
    ax.set_xlabel('distance across chromosome %s (Mb)'%chrm)
    ax.yaxis.tick_left()

    return

#### SCRIPT #####
# Echo the command-line inputs.  The parenthesised single-argument form prints
# the same text under Python 2 and is also valid syntax under Python 3.
print('%s\n%s\n%s' % (options.b, options.g, options.o))

# Only this subset of chromosomes (chr1 .. chr19) is examined.
chrms = np.array(['chr%d' % i for i in range(1, 20)])

# Smoothing parameters.
stepSize = 5E5      # distance between points
windowSize = 1E6    # distance over which to average
genomeSize = filefun.getGenomeSize(options.g)

# Load the peak locations and their values, then smooth them along the genome.
locBed = filefun.loadBed(options.b)
values = filefun.Peaks(np.loadtxt(options.c))
smoothedValues, windowedLocs = smooth_windowed_genome.main(
    locBed, values, genomeSize, windowSize, stepSize)

# NOTE: no normalization is actually applied; values are passed through as-is.
# The disabled alternative normalized by chromosome 1:
#   smoothedValues[chrm]/np.mean(smoothedValues['chr1'], 0) * np.mean(smoothedValues['chr1'])
# Chromosome 4 cannot be used as the reference because its variable
# amplifications distort the result.
smoothedValuesNorm = {}
for chrm in chrms:
    smoothedValuesNorm[chrm] = smoothedValues[chrm]

# what should max plotted value be?
allValues = smoothedValuesNorm[chrms[0]]
for chrm in chrms[1:]:
# given enrichment scores of how spatially distributed the peaks
# are, what are the intervals of 'boundaries', i.e. places where enrichment crosses zero?

##### IMPORT MODULES #####
# import necessary for python
import os
import sys
import numpy as np
import subprocess
import matplotlib.pyplot as plt
import filefun
import histogram

#####
# Enrichment values and the windowed locations they correspond to.
enrichment = np.loadtxt('counts_in_peaks.distal.hypervsnot.enrichment_values.iterations_500.mat')
locBed = filefun.loadBed('counts_in_peaks.distal.windowed_locs.bed')

# Boundary regions are defined as those close to zero where the regions on
# either side change sign.  "Close to zero" means within mx (0.1) standard
# deviations of the overall enrichment distribution.
mx = 0.1
stdev = np.nanstd(enrichment)

# Candidate boundaries lie strictly inside (-mx*stdev, +mx*stdev).
possible_boundaries = np.logical_and(enrichment > -mx*stdev, enrichment < mx*stdev)

# Nothing is confirmed yet; each candidate is checked against the region
# before it.
actual_boundaries = np.zeros(possible_boundaries.shape, dtype=bool)
for indx in np.ravel(np.where(possible_boundaries)):
    y[np.argsort(y)] = meannorm.astype(int)
    return x, y


# Input locations for the peak / expression / spatial-correlation analysis.
wd = "/home/sarah/GreenDragonRaid1/lab/sarah/SCLC_ATAC/140908_bams/mm9/peakProcessing"
readcountNormFile = os.path.join(wd, "140815_peaks.coverageCorr.normalize.norepicates.peakCount")
bedFileName = os.path.join(wd, "scoring/140815_peaks.coverageCorr.all.bed")
tssBedFileName = "/raid/Downloaded_data/mm9_data/TSS/refSeqmm9.TSS.fix.bed"
noExprChangeFile = os.path.join(wd, "expression/diff_TvsM.filtered.noChange.tss.noExprChange")
enrichedBedFile = os.path.join(wd, "spatialCorrelation/significant_up_peaks.enriched.merged.bed")
depletedBedFile = os.path.join(wd, "spatialCorrelation/significant_up_peaks.depleted.merged.bed")

# load files
readCountNorm = np.loadtxt(readcountNormFile)
peakBed = filefun.loadBedwScores(bedFileName)
tssBed = filefun.loadBed(tssBedFileName)
enrichedBed = filefun.loadBed(enrichedBedFile)
depletedBed = filefun.loadBed(depletedBedFile)
noExprChange = np.loadtxt(noExprChangeFile, dtype=bool)

# get non promoter proximal peaks
# `bedtools closest -d` reports the distance as the final output column, so it
# is selected with awk's $NF rather than a hard-coded column number; a fixed
# index silently picks the wrong field if either BED file changes width.
peakBed.distancetoTss = np.array(
    subprocess.check_output(
        "bedtools closest -d -t first -a %s -b %s | awk '{print $NF}'" % (bedFileName, tssBedFileName),
        shell=True,
    ).split(),
    dtype=int,
)
# Peaks farther than cutoff_distance from the nearest TSS count as distal.
cutoff_distance = 5e3
peakBed.distal = peakBed.distancetoTss > cutoff_distance
# get region of peaks
# Example #5
# 0
# For every chromosome compute the log2 ratio of the signal density to the
# mean (over axis 0) of the randomized densities; only chr8 is plotted.
for chrm in chrms:
    foldChange = signalDensityDict[chrm]
    foldChangeRandom = np.mean(signalDensityRandomDict[chrm], axis=0)
    enrichment = np.log2(foldChange/foldChangeRandom)

    if chrm != 'chr8':
        continue

    plt.figure(figsize=(20, 5))
    heatmapfun.plotCoverageHeatMap(
        enrichment[:, reorder],
        cluster=False,
        rowlabels=parameters.headers_noreplicates[reorder],
    )
        
# 11/7/14
import filefun
import subprocess
# Positions from the BED file plus the z-score stored in its fourth column.
motifBed = '../NF1_CTF.bed'
nucBedFile = 'hyperaccessible.chr1.nucpos.bed'
nucBed = filefun.loadBed(nucBedFile)
nucBed.zscore = np.loadtxt(nucBedFile, usecols=(3,))

# Flag positions that overlap at least one motif / CTCF site.
# The counts printed by awk are parsed as integers BEFORE the bool cast:
# casting the raw strings directly is NumPy-version dependent (from NumPy 2.0
# every non-empty string, including "0", becomes True), which would mark every
# position as overlapping.
nucBed.is_nfi_full = np.array(
    subprocess.check_output(
        "bedtools intersect -c -b %s -a %s | awk '{print $NF}'" % (motifBed, nucBedFile),
        shell=True,
    ).split(),
    dtype=int,
).astype(bool)
nucBed.is_ctcf = np.array(
    subprocess.check_output(
        "bedtools intersect -c -b %s -a %s | awk '{print $NF}'" % ('../../CTCF_Zf/CTCF_Zf.noheader.bed', nucBedFile),
        shell=True,
    ).split(),
    dtype=int,
).astype(bool)

# Compare the z-score distributions of all positions and of the two subsets.
xbins = np.arange(-0.5, 21, 1)
histogram.compare([nucBed.zscore, nucBed.zscore[nucBed.is_nfi_full], nucBed.zscore[nucBed.is_ctcf]], labels=['all', 'NFI full', 'CTCF'], xbins=xbins)
ax = plt.gca()
ax.set_xlabel('zscore')
ax.set_ylabel('fraction')
plt.savefig('hyperaccessible.chr1.nucpos.bed.zscore.histogram.pdf')

# Mean profile of the first array in the .npy file over the CTCF-overlapping
# rows, plotted against offsets -100 .. +100.
footprints = np.load('hyperaccessible.insertions.npy')[0]
fig = plt.figure()
ax = fig.add_subplot(111)
xvalues = np.arange(-100, 101)
ax.plot(xvalues, np.mean(footprints[nucBed.is_ctcf], axis=0), label='CTCF sites')
#### SCRIPT #####
# Echo the command-line inputs.  The parenthesised single-argument form prints
# the same text under Python 2 and is also valid syntax under Python 3.
print('%s\n%s\n%s' % (options.b, options.g, options.o))
bedFileName = options.b
countFile = options.c
genomeSizeFile = options.g

print('loading files...')
# Hard-coded alternatives to the command-line inputs (disabled):
#genomeSizeFile = '/raid/gSizes/mm9.genomsize'
#bedFileName = '../scoring/140815_peaks.coverageCorr.all.bed'
#countFile = '../140815_peaks.coverageCorr.normalize.norepicates.peakCount'

tssBedFileName = '/raid/Downloaded_data/mm9_data/TSS/refSeqmm9.TSS.fix.bed'

genomeSize = filefun.getGenomeSize(genomeSizeFile)
locBed = filefun.loadBed(bedFileName)
values = filefun.Peaks(np.loadtxt(countFile))
numPeaks = values.numPeaks

print('initializing...')
# chr1 .. chr19, built from integers directly instead of formatting floats.
chrms = np.array(['chr%d' % i for i in range(1, 20)])
#chrms = np.array([chrms[0]])

# go through each chromosome and find signal density
windowSize = 1E6
stepSize = 1E5
# Per-chromosome result containers, filled in by the code that follows.
signalDensityDict = {}
signalDensityRandomDict = {}
windowedLocsDict = {}
qvaluesDict = {}