/
extract_hairpins.py
executable file
·111 lines (99 loc) · 3.75 KB
/
extract_hairpins.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from subprocess import call
import threading
import sys
import getopt
import os.path
import os
import classes.FastaOperations as FastaOps
import classes.FoldOperations as FoldOps
import classes.FileConversion as FileConversion
from classes.SequenceList import *
import random
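# Parameters:
# -i: Input fasta file name; the file is read from data/<inpath>
# -n: Number of threads for multi-threaded use
# -l: Hairpin length passed to RNALfold as -L (default 100)
# -m: MFE threshold passed to FoldOps.filter_hairpins (default -15.0)
# -p: Base-pair threshold passed to FoldOps.filter_hairpins (default 18)
# -h: If non-zero, also write a random sample of this many hairpins to
#     data/<inpath>.nr.hairpins.<count> (default 0)
# -c: If greater than 0, run cdhit-est on the hairpins instead of copying
#     them unchanged (default 0.0)
# -t: Folding temperature passed to RNALfold as -T (default 37.0)
#
# Output is a non-redundant hairpins file, data/<inpath>.nr.hairpins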
opts, extraparams = getopt.getopt(sys.argv[1:], 'i:n:l:m:p:h:c:t:')
hairpinLength = 100
basePairs = 18
minMFE = -15.00
numHairpins = 0
clusterSim = 0.0
foldTemp = 37.0
for o,p in opts:
if o == '-i':
inPath = p
if o == '-n':
numThreads = int(p)
if o == '-l':
hairpinLength = int(p)
if o == '-m':
minMFE = float(p)
if o == '-p':
basePairs = float(p)
if o == '-h':
numHairpins = int(p)
if o == '-c':
clusterSim = float(p)
if o == '-t':
foldTemp = float(p)
class myThread(threading.Thread):
    """Worker thread that folds one fasta chunk and exports its hairpins.

    inPath is the chunk's file name relative to data/tmp/. The run settings
    foldTemp, hairpinLength, minMFE and basePairs are read from the
    module-level variables of the same names when the thread runs.
    """

    def __init__(self, inPath):
        threading.Thread.__init__(self)
        self.inPath = inPath

    @staticmethod
    def _rnalfold_argv(temperature, maxSpan):
        """Return the RNALfold argument vector for the given -T and -L values."""
        return ['RNALfold', '-T', str(temperature), '-d2', '--noLP',
                '-L', str(maxSpan)]

    def run(self):
        """Fold the chunk, filter the folds and write the hairpins as fasta.

        Files written under data/tmp/: <inPath>.folds, <inPath>.hairpins,
        folds_from_<inPath> and <inPath>nrhairpins.
        """
        chunkPath = 'data/tmp/' + self.inPath
        foldsPath = chunkPath + '.folds'
        hairpinsPath = chunkPath + '.hairpins'
        fastaPath = 'data/tmp/folds_from_' + self.inPath

        # Use newer local version of RNAfold. The command is passed as an
        # argument list with the chunk on stdin and the folds file on stdout,
        # so no shell is involved and file names containing spaces or shell
        # metacharacters cannot break or alter the command.
        # (The older bundled build, progs/ViennaRNA-1.8.5/Progs/RNALfold,
        # was invoked with '-noLP' instead of '--noLP'.)
        with open(chunkPath, 'rb') as chunkFile:
            with open(foldsPath, 'wb') as foldsFile:
                call(self._rnalfold_argv(foldTemp, hairpinLength),
                     stdin=chunkFile, stdout=foldsFile)

        FoldOps.filter_hairpins(foldsPath, hairpinsPath, minMFE, basePairs)
        FileConversion.RNAL_to_fasta(hairpinsPath, fastaPath)

        sl = SequenceList()
        sl.load_fasta(fastaPath)
        # Redundancy removal is currently disabled, so the exported file may
        # still contain duplicates despite its "nrhairpins" name.
        # sl.remove_all_redundant()
        sl.export_fasta(chunkPath + 'nrhairpins')
# Step one: turn the fasta into something that RNALfold will work with
tmpBase = 'data/tmp/' + inPath
FastaOps.remove_newlines('data/' + inPath, tmpBase + '.fixed')
FastaOps.convert_DNA_to_RNA(tmpBase + '.fixed', tmpBase + '.rna')

# Step two: split the fasta for multi-threaded processing
FastaOps.split_fasta(tmpBase + '.rna', numThreads)

# Step three: launch one worker thread per chunk and wait for all of them.
# Chunk files are addressed as <inPath>.<index>.rna.
threadPath = inPath
threadExt = 'rna'
threads = [myThread(threadPath + '.' + str(i) + '.' + threadExt)
           for i in range(numThreads)]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()

# Step four: merge the per-thread results and post-process them.
mergedPath = tmpBase + '.rnanrhairpins'
noAUPath = tmpBase + '.hairpins.noAU'
finalPath = 'data/' + inPath + '.nr.hairpins'
FastaOps.merge_fasta(mergedPath, numThreads)
# NOTE(review): the meaning of the trailing 5 is defined by
# FastaOps.remove_AU and is not visible here - confirm before changing.
FastaOps.remove_AU(mergedPath, noAUPath, 5)

# Commands are passed as argument lists: a single command string without
# shell=True is treated as the program name on POSIX and fails to launch,
# which previously made the clustering branch unusable.
if clusterSim > 0.0:
    # NOTE(review): clusterSim only switches clustering on; its value is not
    # forwarded to cdhit-est, which therefore runs with its own default
    # threshold - confirm whether it should be passed along.
    call(['cdhit-est', '-i', noAUPath, '-o', finalPath])
else:
    call(['cp', noAUPath, finalPath])
# Optionally write a random subset of numHairpins records to
# data/<inPath>.nr.hairpins.<numHairpins>.
if numHairpins != 0:
    hairpinsPath = 'data/' + inPath + '.nr.hairpins'
    with open(hairpinsPath, 'r') as inFile:
        inLines = inFile.readlines()

    # Records are taken as consecutive line pairs (presumably a fasta header
    # followed by a single-line sequence - confirm against the writer).
    # Pairing even-indexed lines with odd-indexed ones keeps every complete
    # pair, including the last one, and ignores an unpaired trailing line.
    inData = [first + second
              for first, second in zip(inLines[0::2], inLines[1::2])]

    # Sample without replacement; if fewer records exist than requested,
    # write all of them in their original order.
    if numHairpins < len(inData):
        outData = random.sample(inData, numHairpins)
    else:
        outData = inData

    with open(hairpinsPath + '.' + str(numHairpins), 'w') as outFile:
        outFile.writelines(outData)
# print "Finding all folds in "+self.inPath+" with RNALfold."
# call('progs/ViennaRNA-1.8.5/Progs/RNALfold -d2 -noLP -L 120 < data/tmp/'+inPath+'.fixed > data/tmp/'+inPath+'.folds', shell=True)
# print "Filtering folds in "+self.inPath+" down to hairpins."
# FoldOps.filter_hairpins('data/tmp/'+inPath+'.folds', 'data/tmp/'+inPath+'.hairpins')
# FileConversion.RNAL_to_fasta('data/tmp/'+inPath+'.hairpins', 'data/tmp/folds_from_'+inPath)
# print "Removing redundant hairpins from "+self.inPath+"."
# sl = SequenceList()
# sl.load_fasta('data/tmp/folds_from_'+inPath)
# sl.remove_all_redundant()
# sl.export_fasta('data/'+inPath+'.nr.hairpins')
# print "Done!"