-
Notifications
You must be signed in to change notification settings - Fork 1
/
metpipe.py
250 lines (225 loc) · 11.9 KB
/
metpipe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
#!/usr/bin/env python
#@author: Philipp Sehnert
#@contact: philipp.sehnert[a]gmail.com
# import of standard modules
import os, sys, time, multiprocessing
import argparse
import traceback
# Import of pipeline modules
from src.preprocess import Preprocess
from src.assembly import Assembly
from src.annotation import Annotation
from src.analysis import Analysis
from src.settings import *
from src.utils import file_exists, to_string
from src.log_functions import skip_msg, print_verbose, print_running_time
from src.file_functions import create_outputdir, parse_parameter, absolute_path
from src.exceptions import InputNotFound, ParamFileNotFound
def main(argv = None):
    """Run the metpipe metagenomic pipeline.

    Parses the command line, validates the parameter file and the input
    reads, then drives the four pipeline stages in order --
    preprocessing, assembly, annotation, analysis -- skipping whichever
    single stage the user selected with ``-s``.

    Raises:
        ParamFileNotFound: neither the user-supplied nor the bundled
            default parameter file exists.
        InputNotFound: one or more input read files do not exist.
    """
    # hardcoded defaults, relative to the script location
    RESULT_DIR = '%s%sresult' % (sys.path[0], os.sep)
    PARAM_FILE = '%s%sparameter.conf' % (sys.path[0], os.sep)

    # record the starting time for the final runtime report
    starting_time = time.time()

    # set up the argument parser for command line arguments
    parser = argparse.ArgumentParser(add_help = True)
    parser.add_argument('input', nargs = '+', action = 'store',
                        help = 'single or paired input files in <fastq> format')
    parser.add_argument('--version', action = 'version', version = '%(prog)s 0.5')
    parser.add_argument('-v', dest = 'verbose', action = 'store_true', default = False,
                        help = 'more detailed output (default = False)')
    parser.add_argument('-t', dest = 'threads', type = int, action = 'store',
                        default = multiprocessing.cpu_count() - 1,
                        help = 'number of threads to use (default = %d)'
                               % (multiprocessing.cpu_count() - 1))
    parser.add_argument('-p', dest = 'param', action = 'store', default = PARAM_FILE,
                        help = 'use alternative config file (default = parameter.conf)')
    parser.add_argument('-s', dest = 'skip', action = 'store', default = '',
                        choices = ['preprocessing', 'assembly', 'annotation', 'analysis'],
                        help = 'skip steps in the pipeline (default = None)')
    parser.add_argument('-o', dest = 'output', action = 'store', default = RESULT_DIR,
                        help = 'use alternative output folder')
    parser.add_argument('-a', dest = 'assembler', default = 'MetaVelvet',
                        choices = ['metavelvet', 'flash', 'both'],
                        help = 'assembling program to use (default = MetaVelvet)')
    parser.add_argument('-c', dest = 'annotation', default = 'both',
                        choices = ['metacv', 'blastn', 'both'],
                        help = 'classifier to use for annotation (default = both)')
    # BUGFIX: default was the string 'False', which is truthy, so the flag
    # behaved as permanently enabled; it must be the boolean False.
    parser.add_argument('--use_contigs', dest = 'use_contigs', action = 'store_true',
                        default = False,
                        help = 'should MetaCV use assembled Reads or RAW Reads (default = RAW)')
    parser.add_argument('--notrimming', dest = 'trim', action = 'store_false', default = True,
                        help = 'trim and filter input reads? (default = True)')
    parser.add_argument('--noquality', dest = 'quality', action = 'store_false', default = True,
                        help = 'create no quality report (default = True)')
    parser.add_argument('--noreport', dest = 'krona', action = 'store_false', default = True,
                        help = 'create no pie chart with the annotated taxonomical data (default = True)')
    parser.add_argument('--merge', dest = 'merge_uncombined', action = 'store_true', default = False,
                        help = 'merge concatinated reads with not concatinated (default = False)')
    args = parser.parse_args()

    # let the user override the result directory
    RESULT_DIR = args.output if args.output else RESULT_DIR

    # validate the parameter file; fall back to the bundled default when the
    # user-supplied path does not exist but the default does
    if os.path.isfile(args.param):
        PARAM_FILE = args.param
    elif os.path.isfile(PARAM_FILE):
        sys.stderr.write('ERROR 3: Parameter File could not be found!\n')
        sys.stderr.write('Use standard Parameter File:\n%s\n\n' % (PARAM_FILE))
    else:
        raise ParamFileNotFound(args.param)

    # every input read file must exist before the pipeline starts
    if not all(os.path.isfile(f) for f in args.input):
        raise InputNotFound(to_string(args.input))

    # create the output dir and the log folder inside it
    create_outputdir(RESULT_DIR)
    create_outputdir(RESULT_DIR + os.sep + 'log')

    # create the global settings object shared by all stages
    settings = General(args.threads, args.verbose, args.skip, starting_time, args.trim,
                       args.quality, args.krona, args.use_contigs, args.merge_uncombined,
                       args.assembler, args.annotation, 1)
    # set up the input, outputs and important files
    files = FileSettings(absolute_path(args.input), os.path.normpath(RESULT_DIR), PARAM_FILE)
    exe = Executables(PARAM_FILE)
    # the single stage (or '') the user asked to skip
    skip = to_string(settings.get_skip())

    try:
        # START the modules of the pipeline and wait until completion
        if skip == 'preprocessing':
            skip_msg(skip)
        else:
            # init the preprocessing module
            pre = Preprocess(settings.get_threads(),
                             settings.get_step_number(),
                             settings.get_verbose(),
                             settings.get_actual_time(),
                             files.get_input(),
                             files.get_logdir(),
                             exe.get_FastQC(),
                             settings.get_quality(),
                             files.get_quality_dir(),
                             parse_parameter(FastQC_Parameter(PARAM_FILE)),
                             exe.get_TrimGalore(),
                             settings.get_trim(),
                             files.get_trim_dir(),
                             parse_parameter(TrimGalore_Parameter(PARAM_FILE)))
            # run preprocessing functions
            results = pre.manage_preprocessing()
            # update pipeline variables with results
            settings.set_step_number(results[0])
            if len(results) > 1:
                files.set_input(absolute_path(results[1]))
                files.set_preprocessed_output(absolute_path(results[1]))

        if skip == 'assembly':
            skip_msg(skip)
        else:
            # init the assembly module
            assembly = Assembly(settings.get_threads(),
                                settings.get_step_number(),
                                settings.get_verbose(),
                                settings.get_actual_time(),
                                files.get_logdir(),
                                files.get_input(),
                                settings.get_assembler(),
                                exe.get_Flash(),
                                files.get_concat_dir(),
                                parse_parameter(FLASH_Parameter(PARAM_FILE)),
                                settings.get_merge_uncombined(),
                                exe.get_Velveth(),
                                exe.get_Velvetg(),
                                exe.get_MetaVelvet(),
                                files.get_assembly_dir(),
                                Velveth_Parameter(PARAM_FILE).get_kmer(PARAM_FILE),
                                parse_parameter(Velveth_Parameter(PARAM_FILE)),
                                parse_parameter(Velvetg_Parameter(PARAM_FILE)),
                                parse_parameter(MetaVelvet_Parameter(PARAM_FILE)))
            # run assembly functions
            results = assembly.manage_assembly()
            # update pipeline variables with results
            settings.set_step_number(results[0])
            files.set_input(absolute_path(results[1]))
            files.set_concatinated_output(absolute_path(results[2]))
            files.set_assembled_output(absolute_path(results[3]))

        if skip == 'annotation':
            skip_msg(skip)
        else:
            # init the annotation module
            anno = Annotation(settings.get_threads(),
                              settings.get_step_number(),
                              settings.get_verbose(),
                              settings.get_actual_time(),
                              files.get_logdir(),
                              files.get_input(),
                              files.get_raw(),
                              settings.get_annotation(),
                              settings.get_use_contigs(),
                              exe.get_Blastn(),
                              exe.get_Blastn_DB(),
                              exe.get_Converter(),
                              files.get_blastn_dir(),
                              Blastn_Parameter(PARAM_FILE).outfmt,
                              parse_parameter(Blastn_Parameter(PARAM_FILE)),
                              exe.get_MetaCV(),
                              exe.get_MetaCV_DB(),
                              files.get_metacv_dir(),
                              MetaCV_Parameter(PARAM_FILE).get_seq(),
                              MetaCV_Parameter(PARAM_FILE).get_mode(),
                              MetaCV_Parameter(PARAM_FILE).get_orf(),
                              MetaCV_Parameter(PARAM_FILE).get_total_reads(),
                              MetaCV_Parameter(PARAM_FILE).get_min_qual(),
                              MetaCV_Parameter(PARAM_FILE).get_taxon(),
                              MetaCV_Parameter(PARAM_FILE).get_name())
            # run the annotation functions
            results = anno.manage_annotation()
            settings.set_step_number(results[0])
            files.set_blastn_output(absolute_path(results[1]))
            files.set_metacv_output(absolute_path(results[2]))

        if skip == 'analysis':
            skip_msg(skip)
        else:
            # init the analysis module
            analysis = Analysis(settings.get_threads(),
                                settings.get_step_number(),
                                settings.get_verbose(),
                                settings.get_actual_time(),
                                files.get_logdir(),
                                settings.get_annotation(),
                                files.get_output(),
                                files.get_parsed_db_dir(),
                                files.get_annotated_db_dir(),
                                files.get_subseted_db_dir(),
                                files.get_krona_report_dir(),
                                files.get_blastn_output(),
                                files.get_metacv_output(),
                                exe.get_Parser(),
                                parse_parameter(blastParser_Parameter(PARAM_FILE)),
                                blastParser_Parameter(PARAM_FILE).get_name(),
                                exe.get_Annotate(),
                                parse_parameter(Rannotate_Parameter(PARAM_FILE)),
                                Rannotate_Parameter(PARAM_FILE).get_name(),
                                Rannotate_Parameter(PARAM_FILE).get_taxon_db(),
                                exe.get_Subset(),
                                subsetDB_Parameter(PARAM_FILE).get_bitscore(),
                                subsetDB_Parameter(PARAM_FILE).get_classifier(),
                                subsetDB_Parameter(PARAM_FILE).get_rank(),
                                subsetDB_Parameter(PARAM_FILE).get_taxon_db(),
                                exe.get_Krona_Blast(),
                                parse_parameter(Krona_Parameter(PARAM_FILE)),
                                Krona_Parameter(PARAM_FILE).get_name(),
                                settings.get_krona(),
                                exe.get_Perl_lib())
            # run the analysis function
            results = analysis.manage_analysis()
            files.set_parser_output(absolute_path(results[0]))
            files.set_annotated_output(absolute_path(results[1]))
    except KeyboardInterrupt:
        sys.stdout.write('\nERROR 1 : Operation cancelled by User!\n')
        sys.exit(1)

    # print ending message and total running time
    print_verbose('\nPIPELINE COMPLETE!\n\n')
    print_running_time(settings.get_actual_time())


# BUGFIX: the entry call was unguarded (and the __name__ guard sat in the
# middle of main()), so merely importing this module ran sys.exit(main()).
if __name__ == '__main__':
    sys.exit(main())