/
trfStats.py
executable file
·319 lines (279 loc) · 15.3 KB
/
trfStats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
#!/usr/bin/python
#Author: Jacob Zieve
import sys,argparse,textwrap,os,re,pdb
from pickler import sendToPickleJar, getFromPickleJar
from classes import trfHit
from collections import defaultdict
from operator import itemgetter, attrgetter
def main():
args = parseArgs()
if args.filter_monomers:
outFileBase = args.trf_output+".multimers"
elif args.filter_multimers:
outFileBase = args.trf_output+".monomers"
else:
outFileBase = args.trf_output
trfDict = getTRFDict(args)
getStats(args,trfDict,outFileBase)
def parseArgs():
myDescription = '''
Processes TRF output. It does nothing if no parameters are supplied.
Example usage (if pathed):
trfStats.py -s trf_output.dat
Sample line from TRF .dat file:
start end period_length copy_num consensus_length %matches %indels score %a %c %g %t entropy period sequence
33493 33544 22 2.3 22 83 3 59 34 3 1 59 1.27 ATTTTTAAAACATTTTTAATTC ATTTTAAAAATATTTTTAATTTATTTTTAAAACATTTTTTATTGCATTTTTA
'''
argParser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent(myDescription))
argParser.add_argument('trf_output')
argParser.add_argument('-ngs', '--next_gen_seq', help='Input file was generated with TRF(v4.07+) option -ngs, a .ngs file', action='store_true')
argParser.add_argument('-csv', '--csv', help='Prints trf_output as a csv', action='store_true')
argParser.add_argument('-sql', '--sql', help='Prints trf_output as a MySQL dump for further querying', action='store_true')
###Deprecated###
#argParser.add_argument('-pis', '--send_to_pickle', nargs =1, help='Send the main dictionary to a pickle file')
#argParser.add_argument('-pil', '--load_from_pickle', nargs=1, help='Load the main dictionary to a pickle file')
#argParser.add_argument('-g', '--greedy_filter', help='Filter selects monomers over multimers if overlapping (i.e. http://en.wikipedia.org/wiki/Activity_selection_problem)', action='store_true')
argParser.add_argument('-fmo', '--filter_monomers', help='Filter selects multimers over monomers if overlapping (i.e. selects the highest scoring loci), WARNING: still some overlap' , action='store_true')
argParser.add_argument('-fmu', '--filter_multimers', help='Should be default to remover redundancy, it filters selects monomers over multimers if overlapping (i.e. selects the lowest scoring loci)' , action='store_true')
argParser.add_argument('-pf', '--period_frequencies', help='Outputs a tab-delimited .period_freqs file with number of occurences of a period as a tab-delimeted file with the first field being the period and the second being the frequecy', action='store_true')
argParser.add_argument('-pl', '--period_length', help='Outputs a tab-delimited .period_lengths file with the first field being the period and the second being the cumulative bp', action='store_true')
argParser.add_argument('-pt', '--period_table', help='Outputs a tab-delimited .period_tbl file with the period and total number of loci,copies,variants and cumulative bp', action='store_true')
argParser.add_argument('-mf', '--motif_frequencies', help='Outputs tab-delimited .motif_freqs file with the first field being the motif and the second being the frequency', action='store_true')
argParser.add_argument('-ml', '--motif_length', help='Outputs tab-delimited .motif_lengths file with the first field being the motif and the second being the cumulative bp', action='store_true')
argParser.add_argument('-fa', '--fasta', help='Outputs a fasta file with all the tandem repeats for further processing', action='store_true')
argParser.add_argument('-tel', '--telomeres', help='Asseses potential telomeres and interstitial telomeric sequences', action='store_true')
argParser.add_argument('-s', '--stats', help = 'Performs all the above output operations', action='store_true')
return argParser.parse_args()
def getTRFDict(args):
if args.next_gen_seq:
return parseNGS(args)
else:
return parseDat(args)
def getStats(args,trfDict,outFileBase):
trfList = flattenTRFDict(trfDict)
if args.csv:
writeCSV(trfList,outFileBase)
if args.sql:
writeSQL(trfList,outFileBase)
if args.period_frequencies:
writePeriodFreqs(trfList,outFileBase)
if args.period_length:
writePeriodLengths(trfList,outFileBase)
if args.period_table:
writePeriodTable(trfList,outFileBase)
if args.motif_frequencies:
writeMotifFreqs(trfList,outFileBase)
if args.motif_length:
writeMotifLengths(trfList,outFileBase)
if args.fasta:
writeFasta(trfList,outFileBase)
if args.telomeres:
writeTelomeres(trfList,outFileBase)
if args.stats:
writeCSV(trfList,outFileBase)
writeSQL(trfList,outFileBase)
writePeriodFreqs(trfList,outFileBase)
writePeriodLengths(trfList,outFileBase)
writePeriodTable(trfList,outFileBase)
writeMotifFreqs(trfList,outFileBase)
writeMotifLengths(trfList,outFileBase)
writeFasta(trfList,outFileBase)
writeTelomeres(trfList,outFileBase)
def parseNGS(args):
trfDict = defaultdict(list)
seq_name = ''
count = 0
for line in open(args.trf_output).read().split('\n'):
if line.startswith('@'):
seq_name = line[1:]
elif line != '':
fields = line.split()
fields.insert(0,seq_name)
newTandem = trfHit(fields)
if args.filter_monomers:
filterTRFDict(newTandem,trfDict,"monomers")
elif args.filter_multimers:
filterTRFDict(newTandem,trfDict,"multimers")
else:
trfDict[seq_name].append(newTandem)
return trfDict
def overlap(t1,t2):
if ( (t1.start >= t2.end) or (t2.start >= t1.end) ):
return False
else:
return True
def filterTRFDict(newTandem,trfDict,discard):
key = newTandem.seq_name
overlapFlag = False
if trfDict.has_key(key):
for i,tandem in enumerate(trfDict[key][:]):
if overlap(newTandem,tandem):
overlapFlag = True
if (discard == "monomers") and (newTandem.score >= tandem.score):
trfDict[key].pop(i)
trfDict[key].append(newTandem)
if (discard == "multimers") and (newTandem.score <= tandem.score):
trfDict[key].pop(i)
trfDict[key].append(newTandem)
if(not overlapFlag):
trfDict[key].append(newTandem)
def parseDat(args):
trfDict = defaultdict(list)
seq_name = ''
for line in open(args.trf_output).read().split('\n'):
if line.startswith('Seq'):
seq_name = line[len('Sequence: '):]
if line != '' and line.split()[0].isdigit():
fields = line.split()
fields.insert(0,seq_name)
newTandem = trfHit(fields)
if args.filter_monomers:
filterTRFDict(newTandem,trfDict,"monomers")
elif args.filter_multimers:
filterTRFDict(newTandem,trfDict,"multimers")
else:
trfDict[seq_name].append(newTandem)
return trfDict
def flattenTRFDict(trfDict):
trfList = []
for v in trfDict.values():
trfList.extend(v)
return trfList
#greedy algorithm, can be faster
#def selectMonomers(trfDict):#greedy
# trf = []
# for seq_name in trfDict.keys():
# seqs=sorted(trfDict[seq_name],key=attrgetter('end'))
# trf.append(seqs[0])
# f = seqs[0].end
# for i in range(1,len(seqs)):
# if seqs[i].start >= f:
# trf.append(seqs[i])
# f = seqs[i].end
# return trf
#
def writeCSV(trfList,outFileBase):#mostly for visualization in R script
out = open(outFileBase+'.csv','w')
motifFreqs = getMotifFreqs(trfList)
periodFreqs = getPeriodFreqs(trfList)
header = "seq_name,start,end,period,copies,consensus,%matches,%indels,score,%a,%c,%g,%t,entropy,motif,length,motif_freq,period_freq\n"
out.write(header)
for t in trfList:
motifFreq = motifFreqs[t.motif]
periodFreq = periodFreqs[t.period]
out.write(str(t)+','+str(motifFreq)+','+str(periodFreq)+'\n');
def writeSQL(trfList,outFileBase):
table = outFileBase[:outFileBase.find('.')]+'_trf'#tables can't have . in them, quick hack
out = open(table+'.dmp','w')
motifFreqs = getMotifFreqs(trfList)
periodFreqs = getPeriodFreqs(trfList)
out.write('BEGIN;\n')
out.write('DROP TABLE IF EXISTS `'+table+'`;\n')
out.write('CREATE TABLE '+table+'(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, seq_name VARCHAR(200), start INT, end INT, period INT, copies FLOAT, consensus INT, matches FLOAT,\
indels FLOAT, score INT, A FLOAT, C FLOAT, G FLOAT, T FLOAT, entropy FLOAT, motif VARCHAR(2001),len INT, motif_freq INT, period_freq INT);\n')
for t in trfList:
motifFreq = motifFreqs[t.motif]
periodFreq = periodFreqs[t.period]
out.write('INSERT INTO '+table+'(seq_name,start,end,period,copies,consensus,matches,indels,score,A,C,G,T,entropy,motif,len,motif_freq,period_freq) VALUES('+str(t)+','+str(motifFreq)+','+str(periodFreq)+');\n')
out.write('COMMIT;')
def getPeriodFreqs(trfList):
periodDict = defaultdict(int)
for t in trfList:
periodDict[t.period]+=1
return periodDict
def writePeriodFreqs(trfList,outFileBase):
out = open(outFileBase+'.period_freqs','w')
for k,v in sorted(getPeriodFreqs(trfList).items()):
out.write('{:d}\t{:d}\n'.format(k,v))
out.close()
def getCumulativeBpByPeriodRange(trfList,prange):
return sum([t.length for t in filter(lambda x: x.period in range(prange[0],prange[1]),trfList)])
def getCumulativePeriodLengths(trfList):
periods = defaultdict(int)
for t in trfList:
periods[t.period] += t.length
return periods
def writePeriodLengths(trfList,outFileBase):
out = open(outFileBase+'.period_lengths','w')
for k,v in sorted(getCumulativePeriodLengths(trfList).items()):
out.write('{:d}\t{:d}\n'.format(k,v))
out.close()
def getMaxPeriod(trfList):
return max([t.period for t in trfList])
'''def writePeriodRange(trfList,outFileBase):
with open(outFileBase+'.period_range.txt','w') as out:
out = writeCumulativeBpByPeriodRange(trfList,out,(2,3))
out = writeCumulativeBpByPeriodRange(trfList,out,(3,4))
out = writeCumulativeBpByPeriodRange(trfList,out,(4,5))
out = writeCumulativeBpByPeriodRange(trfList,out,(5,6))
out = writeCumulativeBpByPeriodRange(trfList,out,(6,7))
out = writeCumulativeBpByPeriodRange(trfList,out,(7,8))
out = writeCumulativeBpByPeriodRange(trfList,out,(8,9))#micros
out = writeCumulativeBpByPeriodRange(trfList,out,(9,101))#minis
out = writeCumulativeBpByPeriodRange(trfList,out,(101,getMaxPeriod(trfList)))#sats
'''
def getLociByPeriodRange(trfList,prange):
return sum([1 for t in filter(lambda x:x.period in range(prange[0],prange[1]),trfList)])
def getCopiesByPeriodRange(trfList,prange):
return sum([t.copy_num for t in filter(lambda x:x.period in range(prange[0],prange[1]),trfList)])
def getVariantsByPeriodRange(trfList,prange):
return len(set([t.motif for t in filter(lambda x:x.period in range(prange[0],prange[1]),trfList)]))
def getCumulativeBpByPeriodRange(trfList,prange):
return sum([t.length for t in filter(lambda x:x.period in range(prange[0],prange[1]),trfList)])
def writePeriodTable(trfList,outFileBase):
with open(outFileBase+'.period_tbl','w') as out:
maxPeriod = getMaxPeriod(trfList)
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('Dinucleotide',getLociByPeriodRange(trfList,(2,3)),getCopiesByPeriodRange(trfList,(2,3)),getVariantsByPeriodRange(trfList,(2,3)),getCumulativeBpByPeriodRange(trfList,(2,3))))
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('Trinucleotide',getLociByPeriodRange(trfList,(3,4)),getCopiesByPeriodRange(trfList,(3,4)),getVariantsByPeriodRange(trfList,(3,4)),getCumulativeBpByPeriodRange(trfList,(3,4))))
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('Tetranucleotide',getLociByPeriodRange(trfList,(4,5)),getCopiesByPeriodRange(trfList,(4,5)),getVariantsByPeriodRange(trfList,(4,5)),getCumulativeBpByPeriodRange(trfList,(4,5))))
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('Pentanucleotide',getLociByPeriodRange(trfList,(5,6)),getCopiesByPeriodRange(trfList,(5,6)),getVariantsByPeriodRange(trfList,(5,6)),getCumulativeBpByPeriodRange(trfList,(5,6))))
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('Hexanucleotide',getLociByPeriodRange(trfList,(6,7)),getCopiesByPeriodRange(trfList,(6,7)),getVariantsByPeriodRange(trfList,(6,7)),getCumulativeBpByPeriodRange(trfList,(6,7))))
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('Heptanucleotide',getLociByPeriodRange(trfList,(7,8)),getCopiesByPeriodRange(trfList,(7,8)),getVariantsByPeriodRange(trfList,(7,8)),getCumulativeBpByPeriodRange(trfList,(7,8))))
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('Octanucleotide',getLociByPeriodRange(trfList,(8,9)),getCopiesByPeriodRange(trfList,(8,9)),getVariantsByPeriodRange(trfList,(8,9)),getCumulativeBpByPeriodRange(trfList,(8,9))))
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('9-30 bp',getLociByPeriodRange(trfList,(9,31)),getCopiesByPeriodRange(trfList,(9,31)),getVariantsByPeriodRange(trfList,(9,31)),getCumulativeBpByPeriodRange(trfList,(9,31))))
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('31-50 bp',getLociByPeriodRange(trfList,(31,51)),getCopiesByPeriodRange(trfList,(31,51)),getVariantsByPeriodRange(trfList,(31,51)),getCumulativeBpByPeriodRange(trfList,(31,51))))
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('51-70 bp',getLociByPeriodRange(trfList,(51,71)),getCopiesByPeriodRange(trfList,(51,71)),getVariantsByPeriodRange(trfList,(51,71)),getCumulativeBpByPeriodRange(trfList,(51,71))))
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('71-100 bp',getLociByPeriodRange(trfList,(71,101)),getCopiesByPeriodRange(trfList,(71,101)),getVariantsByPeriodRange(trfList,(71,101)),getCumulativeBpByPeriodRange(trfList,(71,101))))
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('101-200 bp',getLociByPeriodRange(trfList,(101,201)),getCopiesByPeriodRange(trfList,(101,201)),getVariantsByPeriodRange(trfList,(101,201)),getCumulativeBpByPeriodRange(trfList,(101,201))))
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('201-300 bp',getLociByPeriodRange(trfList,(201,301)),getCopiesByPeriodRange(trfList,(201,301)),getVariantsByPeriodRange(trfList,(201,301)),getCumulativeBpByPeriodRange(trfList,(201,301))))
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('301-400 bp',getLociByPeriodRange(trfList,(301,401)),getCopiesByPeriodRange(trfList,(301,401)),getVariantsByPeriodRange(trfList,(301,401)),getCumulativeBpByPeriodRange(trfList,(301,401))))
out.write('{:s}\t{:d}\t{:.1f}\t{:d}\t{:d}\n'.format('>400 bp',getLociByPeriodRange(trfList,(401,maxPeriod)),getCopiesByPeriodRange(trfList,(401,maxPeriod)),getVariantsByPeriodRange(trfList,(401,maxPeriod)),getCumulativeBpByPeriodRange(trfList,(401,maxPeriod))))
def getMotifFreqs(trfList):
motifDict = defaultdict(int)
for t in trfList:
motifDict[t.motif]+=1
return motifDict
def writeMotifFreqs(trfList,outFileBase):
out = open(outFileBase+'.motif_freqs','w')
for k,v in sorted(getMotifFreqs(trfList).items(),key=itemgetter(1),reverse=True):
out.write('{:s}\t{:d}\n'.format(k,v))
out.close()
def getCumulativeMotifLengths(trfList):
motifs = defaultdict(int)
for t in trfList:
motifs[t.motif] += t.length
return motifs
def writeMotifLengths(trfList,outFileBase):
out = open(outFileBase+'.motif_lengths','w')
for k,v in sorted(getCumulativeMotifLengths(trfList).items(),key=itemgetter(1),reverse=True):
out.write('{:s}\t{:d}\n'.format(k,v))
out.close()
def writeFasta(trfList,outFileBase):
out = open(outFileBase+'.fa','w')
for t in trfList:
header = '>'+t.seq_name+'|tandem repeat|'+str(t.start)+':'+str(t.end)+'|length='+str(t.length)+'|period='+str(t.period)
out.write(header+'\n'+t.sequence+'\n')
out.close()
def isTelomere(tandem):
telomere = re.search('(TTTAGGG){3,}',tandem.sequence)
if telomere is not None and tandem.length > 1000:
return True
else:
return False
def writeTelomeres(trfList,outFileBase):
out = open(outFileBase+'.telomeres','w')
for t in trfList:
if isTelomere(t):
out.write(str(t)+"\n")
if __name__=='__main__':
main()