/
Genewiz.py
64 lines (49 loc) · 1.58 KB
/
Genewiz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import src.DNASeq as libDNA
import src.AASeq as libAA
import string, sys, random, zipfile, os
from string import lower
from optparse import OptionParser
# Command-line interface: -f/--file_path names the sequencing zip archive and
# is required; --sub is optional and is passed unchanged to DNASeq.auto_phase.
optionparser = OptionParser()
optionparser.add_option( '-f', '--file_path', help='path to sequencing zip file' )
optionparser.add_option( '--sub', default=None,
                         help='optional value passed to DNASeq.auto_phase (default: None)' )
(opt,args) = optionparser.parse_args()
if not opt.file_path:
    # Fail with a usage message instead of crashing later when the missing
    # path (None) reaches open().
    optionparser.error('a sequencing zip file is required: use -f/--file_path')
def fixBadZipfile(zipFile):
    """Truncate a zip file right after its end-of-central-directory record.

    The whole file is read and searched for the first occurrence of the
    end-of-central-directory signature (bytes 50 4B 05 06). When it is
    found, the file is cut 22 bytes past the start of the signature, which
    is the size of that record without an archive comment. A file that does
    not contain the signature is left unchanged.

    Args:
        zipFile: path of the archive to repair; it is modified in place.
    """
    eocd_signature = b'\x50\x4b\x05\x06'  # End of central directory signature
    eocd_size = 22  # size of 'ZIP end of central directory record'
    # The context manager closes the handle even if a read/seek/truncate fails.
    with open(zipFile, 'r+b') as f:
        data = f.read()
        pos = data.find(eocd_signature)
        # find() reports a miss as -1, so offset 0 is a genuine match.
        if pos >= 0:
            f.seek(pos + eocd_size)
            f.truncate()
def read_assembly(lines):
    """Extract the record name and sequence from the lines of a sequence file.

    Args:
        lines: iterable of lines. Byte-string lines (as returned by a zip
            member's readlines() on Python 3) are decoded as UTF-8 first;
            text lines are used as they are.

    Returns:
        A (name, seq) tuple. name is the last line containing ">" with its
        first character removed; its line ending is kept. seq is every
        other line, right-stripped, concatenated and lower-cased. Both are
        empty strings when no matching lines are present.
    """
    name = ""
    chunks = []
    for line in lines:
        # On Python 2 bytes is str, so this branch only runs on Python 3.
        if isinstance(line, bytes) and not isinstance(line, str):
            line = line.decode("utf-8", "replace")
        if ">" in line:
            # Header line: drop the leading character, keep the line ending.
            name = line[1:]
            continue
        chunks.append(line.rstrip())
    # Join once instead of growing a string inside the loop.
    return (name, "".join(chunks).lower())
# Trim the archive in place before opening it.
fixBadZipfile(opt.file_path)

# Output files are named after the archive's base name up to its first dot.
out_name = os.path.basename(opt.file_path).split('.')[0]
dna_path = "./" + out_name + '_dna.fasta'
aa_path = "./" + out_name + '_aa.fasta'

# The with-blocks close the archive and both output files even when a read
# fails part-way through.
with zipfile.ZipFile(opt.file_path, 'r') as zf:
    files = zf.namelist()
    with open(dna_path, 'w') as dna, open(aa_path, 'w') as out:
        print(aa_path)
        for f in files:
            # Only archive members whose name contains ".seq" are processed.
            if ".seq" not in f:
                continue
            with zf.open(f, 'r') as member:
                name, seq = read_assembly(member.readlines())
            # Members with 'Term' in their name are reverse-complemented
            # (presumably reverse-primer reads -- confirm with the lab naming).
            if 'Term' in f:
                seq = libDNA.reverse_complement(seq)
            # Skip reads shorter than 20 bases.
            if len(seq) < 20:
                continue
            DnaSeqObj = libDNA.DNASeq(seq)
            _rf, found = DnaSeqObj.auto_phase(opt.sub)
            protein = DnaSeqObj.translate().replace(' ', '')
            if found:
                # When auto_phase reports success, the translation is passed
                # through AASeq and its .seq attribute is written instead.
                protein = libAA.AASeq(protein).seq
            # name still carries its line ending from read_assembly, so the
            # header and the sequence end up on separate lines.
            out.write(">" + name + protein + "\n\n")
            dna.write(">" + name + DnaSeqObj.seq + "\n\n")