forked from jmeppley/py-metagenomics
-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter_blast_m8.py
executable file
·120 lines (101 loc) · 3.83 KB
/
filter_blast_m8.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python
"""
"""
import sys
import logging
import argparse
from edl.blastm8 import filterM8, add_hit_table_arguments, FilterParams
from edl.util import add_universal_arguments, setup_logging
def main():
    """
    Command-line entry point: filter one or more hit tables.

    Builds the argument parser (hit-table filter options from
    ``add_hit_table_arguments`` plus the output options defined here),
    then runs ``filterM8`` on every input table.

    Output destination:
      * ``-o OUTFILENAME``: all filtered hits go to that single file.
      * ``-O``: one output file per input, named by ``getOutputFile``.
        As the ``-O`` help text states, ``-o`` takes precedence when both
        are given, and ``-O`` is rejected when any input is STDIN.
      * neither: all filtered hits go to STDOUT.
    """
    description = """
    Take a blast result table and output a subset of hits based on the
    chosen filtering options. If more than one blast file given, use -O
    to get multiple output files, otherwise all output data will be
    concatenated into one output.
    """

    # command line arguments
    parser = argparse.ArgumentParser(
        description=description,
        conflict_handler='resolve')
    add_hit_table_arguments(parser, flags='all')
    # NOTE: the long option is spelled "--outfilenome" (sic). The spelling is
    # kept as-is so existing command lines, including argparse prefix
    # abbreviations such as "--outfile", keep resolving unambiguously.
    parser.add_argument(
        "-o",
        "--outfilenome",
        dest="outfilename",
        default=None,
        metavar="OUTFILENAME",
        help="Write filtered hits to OUTFILENAME.")
    parser.add_argument(
        '-O',
        '--autoOutName',
        default=False,
        action='store_true',
        help="Automatically generate output file name from input name "
             "and options. Overridden by -o, cannot be used with data "
             "from STDIN.")
    parser.add_argument('-G', '--gff', default=False, action='store_true',
                        help="output GFF format instead of input format")
    # Plain 'r' mode: the 'U' flag is rejected by open() on Python >= 3.11,
    # and text mode already applies universal newlines on Python 3.
    parser.add_argument('hit_table', nargs='*',
                        type=argparse.FileType('r'), default=[sys.stdin, ],
                        help="Table of search results to be filtered. "
                             "If absent, data will be read from STDIN")
    add_universal_arguments(parser)
    arguments = parser.parse_args()

    setup_logging(arguments)

    # An explicit -o overrides -O, as promised by the -O help text.
    auto_name = arguments.autoOutName and arguments.outfilename is None

    # Auto-naming derives the output name from the input file name, which
    # does not exist for STDIN.
    if auto_name and any(handle is sys.stdin
                         for handle in arguments.hit_table):
        parser.error("-O/--autoOutName cannot be used with data from STDIN")

    # if we're not doing auto file names, write all outputs to same file
    shared_handle = None
    if not auto_name:
        if arguments.outfilename is not None:
            logging.info("Writing data to %s", arguments.outfilename)
            shared_handle = open(arguments.outfilename, 'w')
        else:
            logging.info("writing data to STDOUT")
            shared_handle = sys.stdout

    if arguments.gff:
        logging.info("Converting to GFF")

    try:
        # loop over inputs
        for infile_handle in arguments.hit_table:
            logging.info("reading data from %s", infile_handle.name)
            try:
                if auto_name:
                    outfile_handle = open(
                        getOutputFile(infile_handle.name, arguments), 'w')
                else:
                    outfile_handle = shared_handle

                try:
                    # filter
                    params = FilterParams.create_from_arguments(arguments)
                    filterM8(infile_handle, outfile_handle, params,
                             to_gff=arguments.gff)
                finally:
                    # per-input outputs are owned by this iteration
                    if auto_name:
                        outfile_handle.close()
            finally:
                infile_handle.close()
    finally:
        # close the file opened for -o; never close STDOUT
        if shared_handle is not None and shared_handle is not sys.stdout:
            shared_handle.close()
#############
# Functions #
#############
def getOutputFile(infile, arguments):
    """
    Build an output file name by appending one suffix per active filter.

    Each filter option on ``arguments`` that is set to a non-default value
    contributes a short tag (for example ``.i95`` for percent identity or
    ``.u`` for non-overlapping) to the end of ``infile``. The tags are
    appended in a fixed order.

    Exits the program via ``sys.exit`` if no tag was added, because the
    output name would then equal the input name.
    """
    tags = []

    pctid = arguments.filterPctid
    if pctid > 0:
        tags.append(".i%g" % pctid)

    min_length = arguments.filterLength
    if min_length > 0:
        tags.append(".l%d" % min_length)

    bits = arguments.filterBits
    if bits > 0:
        tags.append(".b%g" % bits)

    evalue = arguments.filterEvalue
    if evalue is not None:
        tags.append(".e%g" % evalue)

    aln = arguments.filterAln
    if aln is not None and aln > 0:
        tags.append(".a%g" % aln)

    hsps_per_hit = arguments.filterHspsPerHit
    if hsps_per_hit != 1:
        tags.append(".h%d" % hsps_per_hit)

    top_pct = arguments.filterTopPct
    if top_pct >= 0:
        tags.append(".p%g" % top_pct)

    if arguments.filterNonoverlapping:
        tags.append(".u")

    hits_per_read = arguments.filterHitsPerRead
    if hits_per_read > 0:
        tags.append(".n%d" % hits_per_read)

    # No active filter means the name is unchanged; refuse to clobber input.
    if not tags:
        sys.exit("outfile and infile are the same!!\n%s" % infile)

    return infile + "".join(tags)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()