/
subsetFasta.py
executable file
·86 lines (79 loc) · 3.33 KB
/
subsetFasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
## Imports
import argparse
import random
import re
# Command-line interface. Arguments are registered in the order listed below,
# which is also the order they appear in the generated --help output.
parser = argparse.ArgumentParser(prog='python subsetFasta.py')
for _flags, _options in (
    # Required positional paths.
    (('fasta_file',),
     {'type': str, 'help': "Input FASTA File"}),
    (('output',),
     {'type': str, 'help': "Output FASTA File"}),
    # Optional subset selection.
    (('-s', '--subset_file'),
     {'type': str, 'help': "Optional file of headers for subset"}),
    (('-d', '--disjunction'),
     {'action': "store_true", 'default': None,
      'help': "Take the opposite of the subset"}),
    # Optional sequence clean-up switches.
    (('-a', '--ambiguous'),
     {'type': str, 'default': '',
      'help': "Replace ambiguous nucleotides with random choice or N [Y/N]"}),
    (('-n', '--screen'),
     {'action': "store_true", 'default': None,
      'help': "Screen the file for protein sequences and skip if true"}),
    (('-u', '--dedup'),
     {'action': "store_true", 'default': None,
      'help': "Remove duplicated entries"}),
):
    parser.add_argument(*_flags, **_options)
# Do not leave the loop temporaries behind in the module namespace.
del _flags, _options
# IUPAC nucleotide ambiguity codes, each mapped to the tuple of concrete bases
# it may stand for. Only upper-case codes are listed.
ambig = {
    code: tuple(bases)
    for code, bases in (
        ('R', 'AG'),
        ('Y', 'CT'),
        ('S', 'GC'),
        ('W', 'AT'),
        ('K', 'GT'),
        ('M', 'AC'),
        ('B', 'CGT'),
        ('D', 'AGT'),
        ('H', 'ACT'),
        ('V', 'ACG'),
        ('N', 'ACGT'),
    )
}

# Matches any of E, F, I, L, P, Q: one-letter amino-acid codes that are not
# IUPAC nucleotide codes, so a hit marks a sequence as protein rather than DNA.
aa = re.compile(r'[EFILPQ]')
# Parse the process command line once, at module level; the rest of the script
# reads its options from this `args` namespace.
args = parser.parse_args()
def _joinSequenceLines(seqLines):
    """Concatenate the lines of one record, dropping spaces and stray carriage returns."""
    return "".join(seqLines).replace(" ", "").replace("\r", "")


def fastaParse(infile):
    """Iterate over the records of a FASTA file.

    Anything before the first line starting with '>' is ignored. Each record
    runs from its '>' line up to the next '>' line or the end of the file.

    Args:
        infile: Path of the FASTA file to read.

    Yields:
        (header, sequence) tuples. ``header`` is the text after '>' with
        trailing whitespace removed. ``sequence`` is all following lines
        joined together with spaces and carriage returns removed; it is the
        empty string for a header that has no sequence lines.

    A file that is empty or contains no '>' line yields nothing.
    """
    with open(infile, 'r') as fastaFile:
        header = None  # None until the first '>' line has been seen
        seqLines = []
        for line in fastaFile:
            # Compare by value: identity checks against string literals rely on
            # interpreter interning and emit SyntaxWarning on Python 3.8+.
            if line.startswith(">"):
                if header is not None:
                    # A new header closes the record collected so far.
                    yield header, _joinSequenceLines(seqLines)
                header = line[1:].rstrip()
                seqLines = []
            elif header is not None:
                seqLines.append(line.rstrip())
            # else: still before the first record; skip the line.
        if header is not None:
            # Flush the final record, which has no following '>' to close it.
            yield header, _joinSequenceLines(seqLines)
def _resolve_ambiguous(seq, mode):
    """Return `seq` with IUPAC ambiguity codes handled according to `mode`.

    mode == 'N'         -> every code listed in `ambig` becomes 'N'
    any other non-empty -> every code is replaced by one of its possible
                           bases, chosen at random
    empty / falsy       -> `seq` is returned unchanged
    Characters that are not keys of `ambig` are always left as they are.
    """
    if mode == 'N':
        return ''.join('N' if base in ambig else base for base in seq)
    if mode:
        return ''.join(random.choice(ambig[base]) if base in ambig else base
                       for base in seq)
    return seq


def _read_subset_headers(path):
    """Read one header per line from `path` into a set.

    Trailing whitespace is stripped (FASTA headers are right-stripped when
    parsed, so this keeps the two comparable) and blank lines are ignored.
    """
    with open(path, 'r') as handle:
        stripped = (line.rstrip() for line in handle)
        return {name for name in stripped if name}


# Load every record up front so de-duplication can see the whole file.
data = list(fastaParse(args.fasta_file))
if args.dedup:
    # dict.fromkeys keeps the first occurrence of each (header, sequence)
    # pair in input order, so the output order is reproducible between runs.
    data = list(dict.fromkeys(data))

# None means "no subset requested"; otherwise a set for O(1) membership tests.
subset_headers = _read_subset_headers(args.subset_file) if args.subset_file else None

# Protein screening has always been applied when no subset file is given, and
# that is kept. With a subset file it is applied only when --screen is passed
# (the flag was previously accepted but never read).
screen_proteins = bool(args.screen) or subset_headers is None

with open(args.output, 'w') as out:
    for header, seq in data:
        if subset_headers is not None:
            # Keep listed headers normally, unlisted ones with --disjunction.
            if (header in subset_headers) == bool(args.disjunction):
                continue
        if screen_proteins and aa.search(seq):
            continue
        # Only transform records that are actually written.
        seq = _resolve_ambiguous(seq, args.ambiguous)
        out.write(">" + header + "\n" + seq + "\n")