-
Notifications
You must be signed in to change notification settings - Fork 2
/
phastconsmeta_aroundkmer.py
208 lines (177 loc) · 9.55 KB
/
phastconsmeta_aroundkmer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#Given a gff of regions you are interested in, a fasta file of the genome sequence, and a .bed file
#of phastcons values (sorted.phastcons.mm9.bed.gz), look for kmers within those regions and
#get phastcons values from kmerstart - 100 to kmerend + 100
from Bio import SeqIO
import sys
import tabix
import pysam
from numpy import mean, std, sqrt
import argparse
def indexgenome(genomefasta):
sys.stderr.write('Indexing genome sequence...\n')
seq_dict = SeqIO.to_dict(SeqIO.parse(genomefasta, 'fasta'))
sys.stderr.write('{0} chromosomes indexed.\n'.format(len(seq_dict)))
return seq_dict
def getSequence(seq_dict, region):
#region = chrm;start;stop;strand
chrm = region.split(';')[0]
start = int(region.split(';')[1])
stop = int(region.split(';')[2])
strand = region.split(';')[3]
if strand == '+':
seq = seq_dict[chrm].seq[start:stop].upper().transcribe()
elif strand == '-':
seq = seq_dict[chrm].seq[start:stop].upper().reverse_complement().transcribe()
seq = str(seq)
return seq
def getkmerpos(gff, seq_dict, kmers):
kmerpos = {} # {age : {location : [[chrm, kmerstart, kmerstop, strand]]}}
k = len(kmers[0])
gfffh = open(gff, 'r')
for line in gfffh:
line = line.strip().split('\t')
chrm = line[0]
age = line[1]
location = line[2]
start = int(line[3])
stop = int(line[4])
strand = line[6]
ID = line[8][3:]
region = (';').join([chrm, str(start), str(stop), strand])
seq = getSequence(seq_dict, region)
for i in range(len(seq) - k + 1):
kmer = seq[i:i+k]
if kmer in kmers:
if strand == '+':
kmerstart = start + i + 1
kmerstop = start + i + k
elif strand == '-':
kmerstart = stop - i - k + 1
kmerstop = stop - i
if kmerpos.has_key(age) == False:
kmerpos[age] = {}
kmerpos[age][location] = [[chrm, kmerstart, kmerstop, strand]]
elif kmerpos.has_key(age):
if kmerpos[age].has_key(location) == False:
kmerpos[age][location] = [[chrm, kmerstart, kmerstop, strand]]
elif kmerpos[age].has_key(location):
kmerpos[age][location].append([chrm, kmerstart, kmerstop, strand])
gfffh.close()
return kmerpos
def getphastcons(kmerpos, phastconsbed):
#kmerpos = {} # {age : {location : [[chrm, kmerstart, kmerstop, strand]]}}
phastconsdict = {} # {age : {location : {binnumber : [phastconsvalue1, phastconsvalue2, ...]}}}
#Bin1 is 100 bp upstream of kmerstart. For a 4mer, the kmer would be bins 101-104, and bins 105-205 would be 100 bp downstream of kmerstop.
phastconstabix = pysam.Tabixfile(phastconsbed)
for age in kmerpos:
phastconsdict[age] = {}
for location in kmerpos[age]:
phastconsdict[age][location] = {}
for kmer in kmerpos[age][location]:
chrm = kmer[0]
kmerstart = int(kmer[1])
kmerstop = int(kmer[2])
strand = kmer[3]
phastconsscores = {} # {windowbin : score}
if strand == '+':
windowstart = kmerstart - 100
windowend = kmerstop + 100
try:
for bed in phastconstabix.fetch(chrm, windowstart, windowend, parser = pysam.asBed()):
windowbin = str(int(bed.start) - windowstart)
phastconsscore = float(bed.name)
phastconsscores[windowbin] = phastconsscore
except ValueError:
print 'WARNING: problem with {0}:{1}-{2}:{3}.'.format(str(chrm), str(kmerstart), str(kmerstop), strand)
elif strand == '-':
windowstart = kmerstart - 100
windowend = kmerstop + 100
try:
for bed in phastconstabix.fetch(chrm, windowstart, windowend, parser = pysam.asBed()):
windowbin = str(windowend - int(bed.start))
phastconsscore = float(bed.name)
phastconsscores[windowbin] = phastconsscore
except ValueError:
print 'WARNING: problem with {0}:{1}-{2}:{3}.'.format(str(chrm), str(kmerstart), str(kmerstop), strand)
if len(phastconsscores) > 0: #if there were any bases in the UTR that had phastcons scores
for windowbin in phastconsscores:
if phastconsdict[age][location].has_key(windowbin) == False:
phastconsdict[age][location][windowbin] = [phastconsscores[windowbin]]
elif phastconsdict[age][location].has_key(windowbin):
phastconsdict[age][location][windowbin].append(phastconsscores[windowbin])
return phastconsdict
def summarizephastconsdict(phastconsdict):
#phastconsdict = {age : {location : {binnumber : [phastconsvalue1, phastconsvalue2, ...]}}}
meanphastconsdict = {} # {age : {location : {binnumber : [meanphastcons, stdev]}}}
for age in phastconsdict:
meanphastconsdict[age] = {}
for location in phastconsdict[age]:
meanphastconsdict[age][location] = {}
for binnumber in phastconsdict[age][location]:
meanphastconsscore = round(mean(phastconsdict[age][location][binnumber]), 4)
stdev = round(std(phastconsdict[age][location][binnumber]), 4)
stderr = round(stdev / sqrt(len(phastconsdict[age][location][binnumber])), 4)
meanphastconsdict[age][location][binnumber] = [meanphastconsscore, stdev, stderr]
return meanphastconsdict
def smoothscores(meanphastconsdict):
#meanphastconsdict = {age : {location : {binnumber : [meanphastcons, stdev]}}}
smoothdict = {} # {age : {location : {binnumber : smoothedscore}}}
smoothwindow = 2 #width of window to use when smoothing...this is the bin and +/- 2 nt
#Bins range from 1 (100 bp upstream of kmer start) to 200 + k. Kmer starts at 100
for age in meanphastconsdict:
if smoothdict.has_key(age) == False:
smoothdict[age] = {}
for location in meanphastconsdict[age]:
if smoothdict[age].has_key(location) == False:
smoothdict[age][location] = {}
for binnumber in meanphastconsdict[age][location]:
binnumber = int(binnumber)
nearbyvalues = []
if binnumber >=3 and binnumber <= 200: #cant look at the edges because there is no window
for i in range(binnumber - smoothwindow, binnumber + smoothwindow + 1):
if str(i) in meanphastconsdict[age][location]:
nearbyvalues.append(meanphastconsdict[age][location][str(i)][0])
else: #sometimes there is no value for a bin in a particular age/location
continue
smoothedvalue = mean(nearbyvalues)
smoothdict[age][location][binnumber] = smoothedvalue
return smoothdict
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--genomefasta', type = str, help = 'Genome sequence in fasta format.')
parser.add_argument('--gff', type = str, help = 'Gff file of regions to consider.')
parser.add_argument('--kmers', type = str, help = 'Comma separated list of kmers to look for.')
parser.add_argument('--phastcons', type = str, help = 'Gzipped and tabix indexed bed of phastcons scores. sorted.phastcons.mm9.bed.gz works well. Requires sorted.phastcons.mm9.bed.gz.tbi in the same directory.')
parser.add_argument('--outfile', type = str, help = 'Output file.')
parser.add_argument('--smoothedoutfile', type = str, help = 'Optional. If you want the output smoothed, provide a filename here.')
args = parser.parse_args()
seq_dict = indexgenome(args.genomefasta)
kmers = args.kmers.split(',')
kmerpos = getkmerpos(args.gff, seq_dict, kmers)
phastconsdict = getphastcons(kmerpos, args.phastcons)
meanphastconsdict = summarizephastconsdict(phastconsdict)
smoothdict = smoothscores(meanphastconsdict)
def asint(s):
try:
return int(s), ''
except ValueError:
return sys.maxint, s
outfh = open(args.outfile, 'w')
outfh.write('Age' + '\t' + 'location' + '\t' + 'BNS' + '\t' + 'bin' + '\t' + 'meanphastcons' + '\t' + 'stdev' + '\t' + 'stderr' + '\t' + 'Protein' + '\t' + 'BNS' + '\n')
for age in meanphastconsdict:
for location in meanphastconsdict[age]:
for binnumber in sorted(meanphastconsdict[age][location], key = asint):
phastconsscore = meanphastconsdict[age][location][binnumber][0]
stdev = meanphastconsdict[age][location][binnumber][1]
stderr = meanphastconsdict[age][location][binnumber][2]
outfh.write(age + '\t' + location + '\t' + 'foxunbound' + '\t' + str(binnumber) + '\t' + str(phastconsscore) + '\t' + str(stdev) + '\t' + str(stderr) + '\t' + 'Fox' + '\t' + 'unbound' + '\n')
outfh.close()
if args.smoothedoutfile:
outfh = open(args.smoothedoutfile, 'w')
outfh.write(('\t').join(['Age', 'location','Class', 'bin', 'smoothedphastcons','Protein','BNS']) + '\n')
for age in smoothdict:
for location in smoothdict[age]:
for binnumber in sorted(smoothdict[age][location], key = asint):
smoothedscore = smoothdict[age][location][binnumber]
outfh.write(('\t').join([age, location, 'msibound', str(binnumber), str(smoothedscore), 'Msi', 'bound']) + '\n')
outfh.close()