def get_sample_args_fastq(fastq_files_list, outdir, pairEnd_filesSeparation_list):
    """Stage the given fastq files under ``<outdir>/reads/`` and resolve samples.

    A fresh ``reads`` directory is created inside ``outdir`` and one symbolic
    link per input file is placed in it, named after the file's basename.
    The directory is then handed to ``utils.checkSetInputDirectory``.

    Parameters
    ----------
    fastq_files_list : iterable of str
        Paths to the fastq files to stage.
    outdir : str
        Existing output directory; ``reads`` is created directly inside it.
    pairEnd_filesSeparation_list
        Forwarded unchanged to ``utils.checkSetInputDirectory``.

    Returns
    -------
    tuple
        ``(new_indir, samples, removeCreatedSamplesDirectories,
        indir_same_outdir)`` where the last three values are whatever
        ``utils.checkSetInputDirectory`` returned.

    Raises
    ------
    OSError
        From ``os.mkdir`` / ``os.symlink`` (e.g. two inputs sharing the same
        basename would collide on the link name).
    """
    new_indir = os.path.join(outdir, 'reads', '')
    # Presumably clears a stale directory from a previous run so that mkdir
    # below starts clean -- TODO confirm against utils.removeDirectory.
    utils.removeDirectory(new_indir)
    os.mkdir(new_indir)

    for fastq in fastq_files_list:
        fastq_link = os.path.join(new_indir, os.path.basename(fastq))
        # A symlink stores its target verbatim and a relative target is
        # resolved against the directory containing the link, not the current
        # working directory.  Linking to the absolute path keeps the link
        # valid when the caller passed a relative fastq path.
        os.symlink(os.path.abspath(fastq), fastq_link)

    samples, removeCreatedSamplesDirectories, indir_same_outdir = utils.checkSetInputDirectory(
        new_indir, outdir, pairEnd_filesSeparation_list)
    return new_indir, samples, removeCreatedSamplesDirectories, indir_same_outdir
def get_sample_args_fastq(fastq_files_list, outdir, pairEnd_filesSeparation_list):
    """Link the input fastq files into ``<outdir>/reads/`` and detect samples.

    The ``reads`` directory is recreated inside ``outdir``, each fastq file is
    symlinked into it under its own basename, and the directory is passed to
    ``utils.checkSetInputDirectory`` to obtain the sample information.

    Parameters
    ----------
    fastq_files_list : iterable of str
        Paths of the fastq files to link.
    outdir : str
        Existing output directory that will contain ``reads``.
    pairEnd_filesSeparation_list
        Passed through to ``utils.checkSetInputDirectory``.

    Returns
    -------
    tuple
        ``(new_indir, samples, removeCreatedSamplesDirectories,
        indir_same_outdir)``; the last three come straight from
        ``utils.checkSetInputDirectory``.

    Raises
    ------
    OSError
        Propagated from ``os.mkdir`` or ``os.symlink`` (for instance when two
        inputs have the same basename).
    """
    new_indir = os.path.join(outdir, 'reads', '')
    # Presumably removes any leftover directory before it is recreated --
    # TODO confirm utils.removeDirectory tolerates a missing path.
    utils.removeDirectory(new_indir)
    os.mkdir(new_indir)

    for fastq in fastq_files_list:
        link_path = os.path.join(new_indir, os.path.basename(fastq))
        # Relative symlink targets are interpreted relative to the link's own
        # directory, so a relative ``fastq`` would yield a dangling link.
        # Absolute paths are unaffected by abspath().
        os.symlink(os.path.abspath(fastq), link_path)

    samples, removeCreatedSamplesDirectories, indir_same_outdir = utils.checkSetInputDirectory(
        new_indir, outdir, pairEnd_filesSeparation_list)
    return new_indir, samples, removeCreatedSamplesDirectories, indir_same_outdir
def main():
    """Command-line entry point.

    Parses the arguments, prepares the output directory and logger, runs the
    sub-command bound to ``args.func``, parses its results and, unless
    ``--debug`` was given, removes the temporary folders.
    """
    script_name = 'seq_typing.py'

    # Bail out early when not running under Python 3
    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 {}"'.format(script_name))

    parser, _, _, _, _ = python_arguments(script_name, __version__)
    args = parser.parse_args()

    began_at = time.time()

    # Make sure the (absolute) output directory exists
    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start the logger and print the general run information; the value
    # returned by general_information is not needed here
    logfile, time_str = utils.start_logger(args.outdir)
    utils.general_information(script_name=script_name,
                              logfile=logfile,
                              version=__version__,
                              outdir=args.outdir,
                              time_str=time_str)
    print('\n')

    # Folder holding the modules pickles; always scheduled for removal
    pickles_folder = os.path.join(args.outdir, 'pickles', '')
    if not os.path.isdir(pickles_folder):
        os.makedirs(pickles_folder)
    temporary_folders = [pickles_folder]

    # Run the selected sub-command
    extra_folders, references_results, reference, references_headers = args.func(args)
    temporary_folders.extend(extra_folders)

    # Parse results (the five returned values are not used here)
    _, _, _, _, _ = parse_results.parse_results(references_results,
                                                reference,
                                                references_headers,
                                                args.outdir,
                                                args.minGeneCoverage,
                                                args.minDepthCoverage,
                                                args.typeSeparator)

    # Temporary folders are kept only in debug mode
    if not args.debug:
        for temporary_folder in temporary_folders:
            utils.removeDirectory(temporary_folder)

    utils.runTime(began_at)
def sequence_data(sample, reference_file, bam_file, outdir, threads, length_extra_seq, minimum_depth_presence,
                  minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, rematch):
    """Analyse every reference sequence of a BAM file in a worker pool.

    A ``sequence_data`` directory is recreated under ``outdir``; for each
    entry returned by ``utils.get_sequence_information`` a dedicated
    sub-directory is created and ``rematch.analyse_sequence_data`` is
    scheduled on a ``multiprocessing.Pool`` of ``threads`` processes.  Once
    all workers have finished, ``rematch.gather_data_together`` collects the
    results.

    Returns
    -------
    tuple
        ``(run_successfully, sample_data, consensus_files,
        consensus_sequences)`` as produced by ``rematch.gather_data_together``.
    """
    work_dir = os.path.join(outdir, 'sequence_data', '')
    utils.removeDirectory(work_dir)
    os.mkdir(work_dir)

    sequences, headers = utils.get_sequence_information(reference_file, length_extra_seq)

    workers = multiprocessing.Pool(processes=threads)
    for counter in sequences:
        # One private directory per sequence
        per_sequence_dir = os.path.join(work_dir, str(counter), '')
        utils.removeDirectory(per_sequence_dir)
        os.makedirs(per_sequence_dir)

        job_args = (bam_file,
                    sequences[counter],
                    per_sequence_dir,
                    counter,
                    reference_file,
                    length_extra_seq,
                    minimum_depth_presence,
                    minimum_depth_call,
                    minimum_depth_frequency_dominant_allele,)
        # The AsyncResult handle is not kept, so worker return values and
        # exceptions are never inspected here
        workers.apply_async(rematch.analyse_sequence_data, args=job_args)
    workers.close()
    workers.join()

    # Everything before the second-to-last '/' of outdir
    gather_outdir = outdir.rsplit('/', 2)[0]
    gathered = rematch.gather_data_together(sample, work_dir, sequences, gather_outdir, debug_mode_true,
                                            length_extra_seq, False)
    run_successfully, sample_data, consensus_files, consensus_sequences = gathered
    return run_successfully, sample_data, consensus_files, consensus_sequences
def rematch_for_different_references(fastq, references_files, threads, outdir, extraSeq, minCovPresence, minCovCall,
                                     minFrequencyDominantAllele, minGeneCoverage, debug, minGeneIdentity,
                                     rematch_module, doNotRemoveConsensus, bowtie_algorithm, clean_run_rematch=False):
    """Run ReMatCh once per reference file and pickle each per-gene result.

    For every reference a dedicated directory ``<basename>_<index>`` is
    created under ``outdir``, the companion ``<reference>.pkl`` is loaded and
    ``rematch_module.run_rematch_module`` is executed.  The per-gene data of a
    successful run is saved to ``<outdir>/<basename>_<index>.pkl``.

    Returns
    -------
    dict
        Maps each reference path to the pickle file holding its results.

    Raises
    ------
    SystemExit
        As soon as ReMatCh reports an unsuccessful run for a reference.
    """
    # Same condition as "not debug and not doNotRemoveConsensus", inverted
    keep_reference_dir = debug or doNotRemoveConsensus

    pickle_by_reference = {}
    for index, reference in enumerate(references_files):
        run_name = os.path.basename(reference) + '_' + str(index)
        reference_dir = os.path.join(outdir, run_name, '')
        os.makedirs(reference_dir)

        header_gene_list, seq_reference_dict = utils.extractVariableFromPickle(reference + '.pkl')

        rematch_output = rematch_module.run_rematch_module('sample', fastq, reference, threads, reference_dir,
                                                           extraSeq, minCovPresence, minCovCall,
                                                           minFrequencyDominantAllele, minGeneCoverage, debug, 1,
                                                           minGeneIdentity, 'first', 7, 'none', seq_reference_dict,
                                                           'X', bowtie_algorithm, None, header_gene_list,
                                                           not doNotRemoveConsensus, clean_run=clean_run_rematch)
        (time_taken, run_successfully, data_by_gene, sample_data_general, consensus_files,
         consensus_sequences) = rematch_output

        # Abort the whole program on the first failing reference
        if not run_successfully:
            sys.exit(
                'Something went wrong while running ReMatCh for reference {reference}'
                .format(reference=reference))

        pickle_path = os.path.join(outdir, str(run_name + '.pkl'))
        utils.saveVariableToPickle(data_by_gene, pickle_path)
        pickle_by_reference[reference] = pickle_path

        clean_rematch_folder(consensus_files, reference, reference_dir, doNotRemoveConsensus, debug)
        if not keep_reference_dir:
            utils.removeDirectory(reference_dir)

    return pickle_by_reference
def run_rematch(rematch_script, outdir, references_files, fastq, threads, extraSeq, minCovPresence, minCovCall,
                minFrequencyDominantAllele, minGeneCoverage, minGeneIdentity, debug, doNotRemoveConsensus,
                bowtie_algorithm, clean_run_rematch=False):
    """Prepare the ``rematch`` working folder and run ReMatCh on every reference.

    The ``modules`` directory located next to ``rematch_script`` is appended
    to ``sys.path`` so that ``rematch_module`` can be imported, and the work
    is delegated to ``rematch_for_different_references``.

    Returns
    -------
    tuple
        ``(references_results, module_dir)``: the value returned by
        ``rematch_for_different_references`` and the path of the working
        folder ``<outdir>/rematch/``.
    """
    rematch_dir = os.path.join(outdir, 'rematch', '')
    utils.removeDirectory(rematch_dir)
    os.makedirs(rematch_dir)

    # Make ReMatCh's own modules importable before importing rematch_module
    rematch_modules_dir = os.path.join(os.path.dirname(rematch_script), 'modules')
    sys.path.append(rematch_modules_dir)
    import rematch_module

    results = rematch_for_different_references(fastq,
                                               references_files,
                                               threads,
                                               rematch_dir,
                                               extraSeq,
                                               minCovPresence,
                                               minCovCall,
                                               minFrequencyDominantAllele,
                                               minGeneCoverage,
                                               debug,
                                               minGeneIdentity,
                                               rematch_module,
                                               doNotRemoveConsensus,
                                               bowtie_algorithm,
                                               clean_run_rematch=clean_run_rematch)
    return results, rematch_dir
def run_rematch(rematch, outdir, reference_file, bam_file, threads, length_extra_seq, minimum_depth_presence,
                minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage,
                minimum_gene_identity, debug_mode_true, doNotRemoveConsensus):
    """Analyse an alignment (BAM) with ReMatCh and summarise the sample.

    ``rematch`` is the path of the ReMatCh script on entry; its sibling
    ``modules`` directory is appended to ``sys.path`` so ``rematch_module``
    can be imported.

    Returns
    -------
    tuple
        ``(run_successfully, statistics, sample_data)`` where ``statistics``
        is a dict with the keys ``number_absent_genes``,
        ``number_genes_multiple_alleles`` and ``mean_sample_coverage``
        (rounded to two decimals).  All three values are ``None`` when the
        run was not successful.
    """
    module_dir = os.path.join(outdir, 'rematch', '')
    utils.removeDirectory(module_dir)
    os.makedirs(module_dir)

    sys.path.append(os.path.join(os.path.dirname(rematch), 'modules'))
    # From this point on the name ``rematch`` refers to the imported module,
    # no longer to the script path received as argument
    import rematch_module as rematch

    print('Analysing alignment data')
    run_successfully, sample_data, consensus_files, consensus_sequences = sequence_data(
        'sample', reference_file, bam_file, module_dir, threads, length_extra_seq, minimum_depth_presence,
        minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, rematch)

    # The general statistics only exist for successful runs
    statistics_available = False
    number_absent_genes = number_genes_multiple_alleles = mean_sample_coverage = None
    if run_successfully:
        number_absent_genes, number_genes_multiple_alleles, mean_sample_coverage = \
            determine_general_statistics(outdir,
                                         sample_data=sample_data,
                                         minimum_gene_coverage=minimum_gene_coverage,
                                         minimum_gene_identity=minimum_gene_identity)
        statistics_available = True

    if not debug_mode_true:
        utils.removeDirectory(module_dir)

    clean_rematch_folder(consensus_files, bam_file, reference_file, outdir, doNotRemoveConsensus,
                         debug_mode_true)

    statistics = {
        'number_absent_genes': number_absent_genes if statistics_available else None,
        'number_genes_multiple_alleles': number_genes_multiple_alleles if statistics_available else None,
        'mean_sample_coverage': round(mean_sample_coverage, 2) if statistics_available else None,
    }
    return run_successfully, statistics, sample_data
def main():
    """Command-line entry point of INNUca.py (Python 2 syntax).

    Steps, in order: parse the arguments, create the output directory and
    (unless ``--noLog``) redirect ``sys.stdout`` to ``utils.Logger``, print
    the run header, build ``programs_version_dictionary`` with the external
    programs required by the enabled modules and verify them with
    ``utils.checkPrograms``, then run ``run_INNUca`` on every sample, write
    the per-sample and combined reports and print the final summary.

    Exits through ``sys.exit`` when required programs are missing or when no
    sample ran successfully.
    """
    version = '3.1'
    args = utils.parseArguments(version)

    general_start_time = time.time()
    time_str = time.strftime("%Y%m%d-%H%M%S")

    # Check if output directory exists
    outdir = os.path.abspath(os.path.join(args.outdir, ''))
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Start logger (replaces sys.stdout for the rest of the run)
    if not args.noLog:
        sys.stdout = utils.Logger(outdir, time_str)

    print '\n' + '==========> INNUca.py <=========='
    print '\n' + 'Program start: ' + time.ctime()

    # Tells where the logfile will be stored
    if not args.noLog:
        print '\n' + 'LOGFILE:'
        print sys.stdout.getLogFile()

    # Print command
    print '\n' + 'COMMAND:'
    script_path = os.path.abspath(sys.argv[0])
    print sys.executable + ' ' + script_path + ' ' + ' '.join(sys.argv[1:])

    # Print the directory the program was launched from
    print '\n' + 'PRESENT DIRECTORY:'
    print os.getcwd()

    # Print program version
    print '\n' + 'VERSION INNUca.py:'
    utils.scriptVersionGit(version, os.getcwd(), script_path, args.noGitInfo)

    # Get CPU information
    utils.get_cpu_information(outdir, time_str)

    # Get trueCoverage_ReMatCh settings
    trueCoverage_config = get_trueCoverage_config(
        args.skipTrueCoverage,
        args.trueConfigFile.name if args.trueConfigFile is not None else None,
        args.speciesExpected, script_path)

    # Check programs.
    # Each entry is [version flag, comparison operator, version]; after
    # utils.checkPrograms the element at index 3 is read below as the
    # program path (presumably appended by checkPrograms -- TODO confirm).
    programs_version_dictionary = {}
    programs_version_dictionary['gunzip'] = ['--version', '>=', '1.6']

    # Java is checked first; the programs that depend on it are checked next
    if not (args.skipFastQC and args.skipTrimmomatic and (args.skipPilon or args.skipSPAdes)):
        # programs_version_dictionary['java'] = ['-version', '>=', '1.8']
        programs_version_dictionary['java'] = [None, '>=', '1.8']  # For OpenJDK compatibility
    missingPrograms, programs_version_dictionary = utils.checkPrograms(programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    # Remaining requirements depend on which modules are enabled
    if not args.skipTrueCoverage or trueCoverage_config is not None:
        include_rematch_dependencies_path(args.doNotUseProvidedSoftware)
        programs_version_dictionary['rematch.py'] = ['--version', '>=', '3.2']
        programs_version_dictionary['bcftools'] = ['--version', '==', '1.3.1']
    if not (args.skipTrueCoverage and ((args.skipAssemblyMapping and args.skipPilon) or args.skipSPAdes)):
        programs_version_dictionary['bowtie2'] = ['--version', '>=', '2.2.9']
        programs_version_dictionary['samtools'] = ['--version', '==', '1.3.1']
    if not args.skipFastQC:
        programs_version_dictionary['fastqc'] = ['--version', '==', '0.11.5']
    if not args.skipTrimmomatic:
        programs_version_dictionary['trimmomatic-0.36.jar'] = ['-version', '==', '0.36']
    if args.runPear:
        programs_version_dictionary['pear'] = ['--version', '>=', '0.9.10']
    if not args.skipSPAdes:
        programs_version_dictionary['spades.py'] = ['--version', '>=', '3.9.0']
    if not (args.skipPilon or args.skipSPAdes):
        programs_version_dictionary['pilon-1.18.jar'] = ['--version', '==', '1.18']
    if not (args.skipMLST or args.skipSPAdes):
        programs_version_dictionary['mlst'] = ['--version', '>=', '2.4']

    # Set and print PATH variable
    utils.setPATHvariable(args, script_path)

    missingPrograms, programs_version_dictionary = utils.checkPrograms(programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    # .jar paths
    jar_path_trimmomatic = None
    if not args.skipTrimmomatic:
        jar_path_trimmomatic = programs_version_dictionary['trimmomatic-0.36.jar'][3]

    jar_path_pilon = None
    if not args.skipPilon and not args.skipSPAdes:
        jar_path_pilon = programs_version_dictionary['pilon-1.18.jar'][3]

    rematch_script = None
    # ReMatCh path
    if not args.skipTrueCoverage:
        rematch_script = programs_version_dictionary['rematch.py'][3]

    # pairEnd_filesSeparation_list = args.pairEnd_filesSeparation
    pairEnd_filesSeparation_list = None
    samples, inputDirectory, removeCreatedSamplesDirectories, indir_same_outdir = get_samples(
        args.inputDirectory, args.fastq, outdir, pairEnd_filesSeparation_list)

    # Start running the analysis
    print '\n' + 'RUNNING INNUca.py'

    # Prepare run report file
    samples_report_path = os.path.join(outdir, 'samples_report.' + time_str + '.tab')
    utils.start_sample_report_file(samples_report_path)

    # Counters used for the final summary
    number_samples_successfully = 0
    number_samples_pass = 0
    number_samples_warning = 0

    # Get MLST scheme to use
    scheme = 'unknown'
    species_genus, mlst_scheme_genus = None, None
    if not args.skipMLST and not args.skipSPAdes:
        scheme, species_genus, mlst_scheme_genus = mlst.getScheme(args.speciesExpected)
        # Print path to blastn
        mlst.getBlastPath()

    # Memory
    # NOTE(review): dividing by 1024 ** 2 suggests get_free_memory() reports
    # kilobytes -- confirm against utils.
    available_memory_GB = utils.get_free_memory() / (1024.0 ** 2)
    # Determine SPAdes maximum memory
    spadesMaxMemory = None
    if not args.skipSPAdes:
        print ''
        spadesMaxMemory = spades.define_memory(args.spadesMaxMemory, args.threads, available_memory_GB)
    # Determine .jar maximum memory
    jarMaxMemory = 'off'
    if not (args.skipTrimmomatic and (args.skipSPAdes or args.skipPilon)):
        print ''
        jarMaxMemory = utils.define_jar_max_memory(args.jarMaxMemory, args.threads, available_memory_GB)

    # Run INNUca for each sample
    sample_report_json = {}
    for sample in samples:
        sample_start_time = time.time()

        print '\n' + 'Sample: ' + sample + '\n'

        # Create sample outdir
        sample_outdir = os.path.abspath(os.path.join(outdir, sample, ''))
        if not os.path.isdir(sample_outdir):
            os.makedirs(sample_outdir)

        # Get fastq files; samples without at least two files are skipped
        # before any report is written for them
        fastq_files = utils.searchFastqFiles(os.path.join(inputDirectory, sample, ''),
                                             pairEnd_filesSeparation_list, False)
        if len(fastq_files) == 1:
            print 'Only one fastq file was found: ' + str(fastq_files)
            print 'Pair-End sequencing is required. Moving to the next sample'
            continue
        elif len(fastq_files) == 0:
            print 'No compressed fastq files were found. Continue to the next sample'
            continue

        print 'The following files will be used:'
        print str(fastq_files) + '\n'

        # Run INNUca.py analysis
        run_successfully, pass_qc, run_report = run_INNUca(
            sample, sample_outdir, fastq_files, args, script_path, scheme, spadesMaxMemory,
            jar_path_trimmomatic, jar_path_pilon, jarMaxMemory, trueCoverage_config, rematch_script,
            species_genus, mlst_scheme_genus)

        # Save sample fail report
        utils.write_fail_report(os.path.join(sample_outdir, 'fail_report.txt'), run_report)

        # Save warning report
        write_warning_report(os.path.join(sample_outdir, 'warning_report.txt'), run_report)

        # Get raw reads files size
        fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files)

        # Remove sample directory if it was created during the process
        if removeCreatedSamplesDirectories and not indir_same_outdir:
            utils.removeDirectory(os.path.join(inputDirectory, sample, ''))

        print 'END ' + sample + ' analysis'
        time_taken = utils.runTime(sample_start_time)

        # Save run report
        warning, json_pass_qc = utils.write_sample_report(samples_report_path, sample, run_successfully,
                                                          pass_qc, time_taken, fileSize, run_report)

        # Save runs statistics
        if run_successfully:
            number_samples_successfully += 1
        if pass_qc:
            if warning:
                number_samples_warning += 1
            else:
                number_samples_pass += 1

        sample_report_json[sample] = {'run_successfully': run_successfully,
                                      'pass_qc': json_pass_qc,
                                      'modules_run_report': run_report}

    # Save combine_samples_reports
    combine_reports.combine_reports(outdir, outdir, args.json, time_str, len(samples))

    # Save sample_report in json
    if args.json:
        import json
        with open(os.path.join(outdir, 'samples_report.' + time_str + '.json'), 'wt') as writer:
            json.dump(sample_report_json, writer)

    # Remove temporary folder with symlink to fastq files in case of --fastq use
    if args.inputDirectory is None and args.fastq is not None:
        utils.removeDirectory(os.path.join(inputDirectory, ''))

    # Run report.
    # FAIL is computed as every sample that is neither PASS nor WARNING, so it
    # also counts the samples skipped by the ``continue`` statements above.
    print '\n' + 'END INNUca.py'
    print '\n' + 'Pipeline problems: {not_run_successfully} samples'.format(
        not_run_successfully=(len(samples) - number_samples_successfully))
    print '\n' + 'FAIL: {number_samples_fail} samples'.format(
        number_samples_fail=(len(samples) - number_samples_pass - number_samples_warning))
    print '\n' + 'WARNING: {number_samples_warning} samples'.format(number_samples_warning=number_samples_warning)
    print '\n' + 'PASS: {number_samples_pass} samples'.format(number_samples_pass=number_samples_pass)
    time_taken = utils.runTime(general_start_time)
    del time_taken

    # Check whether INNUca.py ran at least one sample successfully
    if number_samples_successfully == 0:
        sys.exit('No samples run successfully!')
def run_INNUca(sampleName, outdir, fastq_files, args, script_path, scheme, spadesMaxMemory, jar_path_trimmomatic, jar_path_pilon, jarMaxMemory, trueCoverage_config, rematch_script, species_genus, mlst_scheme_genus):
    """Run the INNUca modules on one sample and collect their outcomes.

    Modules run in this order, each one gated by the result of the previous
    ones and by the ``args.skip*`` flags: FastQ integrity, first estimated
    coverage, true coverage (ReMatCh), first FastQC, Trimmomatic, second
    estimated coverage, second FastQC, Pear, SPAdes, Assembly Mapping, Pilon
    and MLST.

    Every module gets an entry in ``runs`` shaped
    ``[run_successfully, pass_qc, time_taken, failing]``, with a fifth element
    (a warning, a file size or ``'NA'``) for some modules.  Modules that are
    skipped or never reached get the ``skipped`` / ``not_run`` placeholders.

    Returns
    -------
    tuple
        ``(run_successfully, pass_qc, runs)``: whether no module reported a
        falsy, non-``None`` run status, whether every QC check considered at
        the end of the function passed, and the per-module ``runs`` dict.
    """
    threads = args.threads
    adaptersFasta = args.adapters
    if adaptersFasta is not None:
        adaptersFasta = os.path.abspath(adaptersFasta.name)
    genomeSize = args.genomeSizeExpectedMb

    # Placeholder entries.  When assigned without ``+ ['NA']`` the very same
    # list object is stored under several keys of ``runs``.
    skipped = [None, None, 0, {'sample': 'Skipped'}]
    not_run = [None, None, 0, {'sample': 'Not run'}]

    runs = {}

    # Run FastQ integrity check
    not_corruption_found, pass_qc, time_taken, failing, fastq_encoding, min_reads_length, max_reads_length = fastQintegrity.runFastQintegrity(fastq_files, threads, outdir)
    runs['FastQ_Integrity'] = [not_corruption_found, pass_qc, time_taken, failing]

    if not_corruption_found:
        # Run first Estimated Coverage
        run_successfully_estimatedCoverage = False
        estimatedCoverage = None
        run_successfully_trueCoverage = False
        pass_qc_trueCoverage = False
        if not args.skipEstimatedCoverage:
            # Check whether the Estimated Coverage output is already present
            report_file = os.path.join(outdir, 'coverage_report.txt')
            if os.path.isfile(report_file):
                os.remove(report_file)
            # Run getEstimatedCoverage
            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir, threads, args.estimatedMinimumCoverage)
            runs['first_Coverage'] = [run_successfully_estimatedCoverage, pass_qc, time_taken, failing]
        else:
            print '--skipEstimatedCoverage set. Skipping First Estimated Coverage analysis'
            runs['first_Coverage'] = skipped

        trimmomatic_run_successfully = False

        # Proceed only when the coverage check was skipped or was satisfied
        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and not estimatedCoverage < args.estimatedMinimumCoverage):
            if not args.skipTrueCoverage and trueCoverage_config is not None:
                # Run True Coverage
                run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken, failing = trueCoverage.runTrueCoverage(sampleName, fastq_files, trueCoverage_config['reference_file'], threads, outdir, trueCoverage_config['length_extra_seq'], trueCoverage_config['minimum_depth_presence'], trueCoverage_config['minimum_depth_call'], trueCoverage_config['minimum_depth_frequency_dominant_allele'], trueCoverage_config['minimum_gene_coverage'], False, False, 1, trueCoverage_config['minimum_gene_identity'], trueCoverage_config, rematch_script)
                runs['trueCoverage_ReMatCh'] = [run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken, failing]
            else:
                # Also reached when no trueCoverage configuration is available
                print '\n' + '--skipTrueCoverage set. Skipping True coverage analysis'
                runs['trueCoverage_ReMatCh'] = skipped

            if args.skipTrueCoverage or trueCoverage_config is None or (run_successfully_trueCoverage and pass_qc_trueCoverage):
                # Run first FastQC
                nts2clip_based_ntsContent = None
                if not args.skipFastQC:
                    run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length, nts2clip_based_ntsContent = fastqc.runFastQCanalysis(outdir, threads, adaptersFasta, fastq_files, args.fastQCkeepFiles, 'first_run')
                    runs['first_FastQC'] = [run_successfully, pass_qc, time_taken, failing, warning]
                else:
                    print '--skipFastQC set. Skipping First FastQC analysis'
                    runs['first_FastQC'] = skipped + ['NA']

                # Run Trimmomatic
                if not args.skipTrimmomatic:
                    run_successfully, not_empty_fastq, time_taken, failing, paired_reads, trimmomatic_folder, fileSize = trimmomatic.runTrimmomatic(jar_path_trimmomatic, sampleName, outdir, threads, adaptersFasta, script_path, args.doNotSearchAdapters, fastq_files, max_reads_length, args.doNotTrimCrops, args.trimCrop, args.trimHeadCrop, args.trimLeading, args.trimTrailing, args.trimSlidingWindow, args.trimMinLength, nts2clip_based_ntsContent, jarMaxMemory, fastq_encoding)
                    runs['Trimmomatic'] = [run_successfully, None, time_taken, failing, fileSize]
                    trimmomatic_run_successfully = run_successfully

                    if run_successfully and not_empty_fastq:
                        # From here on the trimmed reads replace the raw ones
                        fastq_files = paired_reads
                        min_reads_length = args.trimMinLength

                        # Run second Estimated Coverage
                        # (rebinds the coverage variables tested again further down)
                        if not args.skipEstimatedCoverage:
                            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir, threads, args.estimatedMinimumCoverage)
                            runs['second_Coverage'] = [run_successfully_estimatedCoverage, pass_qc, time_taken, failing]
                        else:
                            print '--skipEstimatedCoverage set. Skipping Second Estimated Coverage analysis'
                            runs['second_Coverage'] = skipped

                        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and not estimatedCoverage < args.estimatedMinimumCoverage):
                            # Run second FastQC
                            if not args.skipFastQC:
                                run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length, nts2clip_based_ntsContent = fastqc.runFastQCanalysis(outdir, threads, adaptersFasta, fastq_files, args.fastQCkeepFiles, 'second_run')
                                runs['second_FastQC'] = [run_successfully, pass_qc, time_taken, failing, warning]
                                if run_successfully:
                                    max_reads_length = maximum_reads_length
                            else:
                                print '--skipFastQC set. Skipping Second FastQC analysis'
                                runs['second_FastQC'] = skipped + ['NA']
                        else:
                            print '\n' + 'Estimated coverage is too lower (< ' + str(args.estimatedMinimumCoverage) + 'x). This sample will not proceed with INNUca pipeline'
                            runs['second_FastQC'] = not_run + ['NA']
                            runs['Pear'] = not_run + ['NA']
                            runs['SPAdes'] = not_run + ['NA']
                            runs['Assembly_Mapping'] = not_run + ['NA']
                            runs['Pilon'] = not_run
                            runs['MLST'] = not_run + ['NA']
                    else:
                        print 'Trimmomatic did not run successfully or return zero reads! Skipping Second Estimated Coverage analysis and FastQC analysis'
                        runs['second_Coverage'] = skipped
                        runs['second_FastQC'] = skipped + ['NA']

                else:
                    print '--skipTrimmomatic set. Skipping Trimmomatic, but also Second FastQC analysis and Second Estimated Coverage analysis'
                    runs['Trimmomatic'] = skipped + ['NA']
                    runs['second_Coverage'] = skipped
                    runs['second_FastQC'] = skipped + ['NA']

                # FastQC verdict: the second run's pass_qc is used when truthy,
                # the first run's when the second is None; only an explicit
                # False stops the sample (unless --fastQCproceed)
                if not args.skipFastQC and (runs['second_FastQC'][1] or (runs['second_FastQC'][1] is None and runs['first_FastQC'][1])) is False and not args.fastQCproceed:
                    print '\n' + 'This sample does not pass FastQC module QA/QC. It will not proceed with INNUca pipeline'
                    runs['Pear'] = not_run + ['NA']
                    runs['SPAdes'] = not_run + ['NA']
                    runs['Assembly_Mapping'] = not_run + ['NA']
                    runs['Pilon'] = not_run
                    runs['MLST'] = not_run + ['NA']
            else:
                print '\n' + 'This sample does not pass True Coverage module QA/QC. This sample will not proceed with INNUca pipeline'
                runs['first_FastQC'] = not_run + ['NA']
                runs['Trimmomatic'] = not_run + ['NA']
                runs['second_Coverage'] = not_run
                runs['second_FastQC'] = not_run + ['NA']
                runs['Pear'] = not_run + ['NA']
                runs['SPAdes'] = not_run + ['NA']
                runs['Assembly_Mapping'] = not_run + ['NA']
                runs['Pilon'] = not_run
                runs['MLST'] = not_run + ['NA']
        else:
            print '\n' + 'Estimated coverage is too lower (< ' + str(args.estimatedMinimumCoverage) + 'x). This sample will not proceed with INNUca pipeline'
            runs['trueCoverage_ReMatCh'] = not_run
            runs['first_FastQC'] = not_run + ['NA']
            runs['Trimmomatic'] = not_run + ['NA']
            runs['second_Coverage'] = not_run
            runs['second_FastQC'] = not_run + ['NA']
            runs['Pear'] = not_run + ['NA']
            runs['SPAdes'] = not_run + ['NA']
            runs['Assembly_Mapping'] = not_run + ['NA']
            runs['Pilon'] = not_run
            runs['MLST'] = not_run + ['NA']

        # Assembly stage: the same three gates are evaluated again because the
        # coverage variables may have been rebound by the second coverage run
        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and not estimatedCoverage < args.estimatedMinimumCoverage):
            if args.skipTrueCoverage or trueCoverage_config is None or (run_successfully_trueCoverage and pass_qc_trueCoverage):
                if args.skipFastQC or (runs['second_FastQC'][1] or (runs['second_FastQC'][1] is None and runs['first_FastQC'][1])) is not False or args.fastQCproceed:
                    unassembled_pe_reads = None
                    assembled_se_reads = None
                    # Run Pear
                    if args.runPear:
                        print '--runPear set. Running Pear'
                        pearMinOverlap = pear.determine_minimum_overlap(args.pearMinOverlap, min_reads_length, max_reads_length)
                        run_successfully, pass_qc, time_taken, failing, unassembled_pe_reads, assembled_se_reads, pear_folder, warning = pear.runPear(fastq_files, threads, outdir, sampleName, fastq_encoding, trimmomatic_run_successfully, pearMinOverlap)
                        runs['Pear'] = [run_successfully, pass_qc, time_taken, failing, warning]
                    else:
                        runs['Pear'] = not_run + ['NA']

                    # Run SPAdes
                    if not args.skipSPAdes:
                        run_successfully, pass_qc, time_taken, failing, contigs_spades, warning = spades.runSpades(sampleName, outdir, threads, unassembled_pe_reads if unassembled_pe_reads is not None else fastq_files, args.spadesNotUseCareful, spadesMaxMemory, args.spadesMinCoverageAssembly, args.spadesMinContigsLength, genomeSize, args.spadesKmers, max_reads_length, args.spadesDefaultKmers, args.spadesMinKmerCovContigs, assembled_se_reads, args.saveExcludedContigs, args.maxNumberContigs)
                        runs['SPAdes'] = [run_successfully, pass_qc, time_taken, failing, warning]

                        if run_successfully:
                            # ``contigs`` always names the most recent assembly
                            contigs = contigs_spades

                            # Run Assembly Mapping check
                            bam_file = None
                            if not args.skipAssemblyMapping:
                                run_successfully, pass_qc, time_taken, failing, assembly_filtered, bam_file, assemblyMapping_folder, warning = assembly_mapping.runAssemblyMapping(fastq_files, contigs, threads, outdir, args.assemblyMinCoverageContigs, genomeSize, args.saveExcludedContigs, args.maxNumberContigs)
                                runs['Assembly_Mapping'] = [run_successfully, pass_qc, time_taken, failing, warning]

                                if run_successfully:
                                    contigs = assembly_filtered
                                    if not args.keepIntermediateAssemblies and os.path.isfile(contigs_spades) and contigs != contigs_spades:
                                        os.remove(contigs_spades)
                            else:
                                print '--skipAssemblyMapping set. Skipping Assembly Mapping check'
                                runs['Assembly_Mapping'] = skipped + ['NA']

                            # Run Pilon
                            if not args.skipPilon:
                                run_successfully, _, time_taken, failing, assembly_polished, pilon_folder = pilon.runPilon(jar_path_pilon, contigs, fastq_files, threads, outdir, jarMaxMemory, bam_file)
                                runs['Pilon'] = [run_successfully, None, time_taken, failing]

                                if run_successfully:
                                    contigs = assembly_polished
                                    # assembly_filtered exists only if Assembly Mapping ran
                                    if not args.keepIntermediateAssemblies and 'assembly_filtered' in locals() and os.path.isfile(assembly_filtered):
                                        os.remove(assembly_filtered)

                                if not args.pilonKeepFiles:
                                    utils.removeDirectory(pilon_folder)

                            else:
                                print '--skipPilon set. Skipping Pilon correction'
                                runs['Pilon'] = skipped

                            # assemblyMapping_folder exists only if Assembly Mapping ran
                            if 'assemblyMapping_folder' in locals():
                                utils.removeDirectory(assemblyMapping_folder)

                            print '\n' + 'Final assembly: ' + contigs
                            with open(os.path.join(outdir, 'final_assembly.txt'), 'wt') as writer:
                                writer.write(contigs + '\n')

                            # Run MLST
                            if not args.skipMLST:
                                run_successfully, pass_qc, time_taken, failing, warning = mlst.runMlst(contigs, scheme, outdir, species_genus, mlst_scheme_genus)
                                runs['MLST'] = [run_successfully, pass_qc, time_taken, failing, warning]
                            else:
                                print '--skipMLST set. Skipping MLST analysis'
                                runs['MLST'] = skipped + ['NA']
                        else:
                            print 'SPAdes did not run successfully! Skipping Pilon correction, Assembly Mapping check and MLST analysis'
                            runs['Assembly_Mapping'] = skipped + ['NA']
                            runs['Pilon'] = skipped
                            runs['MLST'] = skipped + ['NA']

                    else:
                        print '--skipSPAdes set. Skipping SPAdes, Pilon correction, Assembly Mapping check and MLST analysis'
                        runs['SPAdes'] = skipped + ['NA']
                        runs['Assembly_Mapping'] = skipped + ['NA']
                        runs['Pilon'] = skipped
                        runs['MLST'] = skipped + ['NA']
    else:
        # FastQ integrity check failed: mark every later module as not run
        print 'Moving to the next sample'
        for step in ('first_Coverage', 'trueCoverage_ReMatCh', 'first_FastQC', 'Trimmomatic', 'second_Coverage', 'second_FastQC', 'Pear', 'SPAdes', 'Assembly_Mapping', 'Pilon', 'MLST'):
            if step in ('Trimmomatic', 'first_FastQC', 'second_FastQC', 'Pear', 'SPAdes', 'Assembly_Mapping', 'MLST'):
                runs[step] = not_run + ['NA']
            else:
                runs[step] = not_run

    # Remove Pear directory (pear_folder exists only if Pear ran)
    if not args.pearKeepFiles and 'pear_folder' in locals():
        utils.removeDirectory(pear_folder)

    # Remove Trimmomatic directory with cleaned reads
    # (trimmomatic_folder exists only if Trimmomatic ran)
    if not args.trimKeepFiles and 'trimmomatic_folder' in locals():
        utils.removeDirectory(trimmomatic_folder)

    # Check run: a None status (skipped / not run) does not count as a failure
    run_successfully = all(runs[step][0] or runs[step][0] is None for step in runs)

    # QC verdicts: for coverage and FastQC the second run wins when truthy and
    # the first run is used when the second is None; ``is not False`` lets
    # None (skipped / not run) count as passing
    pass_fastqIntegrity = runs['FastQ_Integrity'][0]
    pass_cov = (runs['second_Coverage'][1] or (runs['second_Coverage'][1] is None and runs['first_Coverage'][1])) is not False
    pass_trueCov = runs['trueCoverage_ReMatCh'][1] is not False
    pass_fastqc = (runs['second_FastQC'][1] or (runs['second_FastQC'][1] is None and runs['first_FastQC'][1])) is not False
    # pass_trimmomatic = runs['Trimmomatic'][1] is not False
    # pass_pear = runs['Pear'][1] is not False
    # pass_spades = runs['SPAdes'][1] is not False or runs['Assembly_Mapping'][1] is True
    pass_spades = runs['SPAdes'][1] is not False
    pass_assemblyMapping = runs['Assembly_Mapping'][1] is not False
    # Pilon stores None as pass_qc, so its run status (index 0) is used instead
    pass_pilon = runs['Pilon'][0] is not False
    pass_mlst = runs['MLST'][1] is not False
    pass_qc = all([pass_fastqIntegrity, pass_cov, pass_trueCov, pass_fastqc, pass_spades, pass_assemblyMapping, pass_pilon, pass_mlst])

    return run_successfully, pass_qc, runs
def main(): version = '3.1' args = utils.parseArguments(version) general_start_time = time.time() time_str = time.strftime("%Y%m%d-%H%M%S") # Check if output directory exists outdir = os.path.abspath(os.path.join(args.outdir, '')) if not os.path.isdir(outdir): os.makedirs(outdir) # Start logger if not args.noLog: sys.stdout = utils.Logger(outdir, time_str) print '\n' + '==========> INNUca.py <==========' print '\n' + 'Program start: ' + time.ctime() # Tells where the logfile will be stored if not args.noLog: print '\n' + 'LOGFILE:' print sys.stdout.getLogFile() # Print command print '\n' + 'COMMAND:' script_path = os.path.abspath(sys.argv[0]) print sys.executable + ' ' + script_path + ' ' + ' '.join(sys.argv[1:]) # Print directory where programme was lunch print '\n' + 'PRESENT DIRECTORY:' print os.getcwd() # Print program version print '\n' + 'VERSION INNUca.py:' utils.scriptVersionGit(version, os.getcwd(), script_path, args.noGitInfo) # Get CPU information utils.get_cpu_information(outdir, time_str) # Get trueCoverage_ReMatCh settings trueCoverage_config = get_trueCoverage_config( args.skipTrueCoverage, args.trueConfigFile.name if args.trueConfigFile is not None else None, args.speciesExpected, script_path) # Check programms programs_version_dictionary = {} programs_version_dictionary['gunzip'] = ['--version', '>=', '1.6'] # Java check first for java dependents check next if not (args.skipFastQC and args.skipTrimmomatic and (args.skipPilon or args.skipSPAdes)): # programs_version_dictionary['java'] = ['-version', '>=', '1.8'] programs_version_dictionary['java'] = [None, '>=', '1.8' ] # For OpenJDK compatibility missingPrograms, programs_version_dictionary = utils.checkPrograms( programs_version_dictionary) if len(missingPrograms) > 0: sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms)) if not args.skipTrueCoverage or trueCoverage_config is not None: include_rematch_dependencies_path(args.doNotUseProvidedSoftware) 
programs_version_dictionary['rematch.py'] = ['--version', '>=', '3.2'] programs_version_dictionary['bcftools'] = ['--version', '==', '1.3.1'] if not (args.skipTrueCoverage and ( (args.skipAssemblyMapping and args.skipPilon) or args.skipSPAdes)): programs_version_dictionary['bowtie2'] = ['--version', '>=', '2.2.9'] programs_version_dictionary['samtools'] = ['--version', '==', '1.3.1'] if not args.skipFastQC: programs_version_dictionary['fastqc'] = ['--version', '==', '0.11.5'] if not args.skipTrimmomatic: programs_version_dictionary['trimmomatic-0.36.jar'] = [ '-version', '==', '0.36' ] if args.runPear: programs_version_dictionary['pear'] = ['--version', '>=', '0.9.10'] if not args.skipSPAdes: programs_version_dictionary['spades.py'] = ['--version', '>=', '3.9.0'] if not (args.skipPilon or args.skipSPAdes): programs_version_dictionary['pilon-1.18.jar'] = [ '--version', '==', '1.18' ] if not (args.skipMLST or args.skipSPAdes): programs_version_dictionary['mlst'] = ['--version', '>=', '2.4'] # Set and print PATH variable utils.setPATHvariable(args, script_path) missingPrograms, programs_version_dictionary = utils.checkPrograms( programs_version_dictionary) if len(missingPrograms) > 0: sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms)) # .jar paths jar_path_trimmomatic = None if not args.skipTrimmomatic: jar_path_trimmomatic = programs_version_dictionary[ 'trimmomatic-0.36.jar'][3] jar_path_pilon = None if not args.skipPilon and not args.skipSPAdes: jar_path_pilon = programs_version_dictionary['pilon-1.18.jar'][3] rematch_script = None # ReMatCh path if not args.skipTrueCoverage: rematch_script = programs_version_dictionary['rematch.py'][3] # pairEnd_filesSeparation_list = args.pairEnd_filesSeparation pairEnd_filesSeparation_list = None samples, inputDirectory, removeCreatedSamplesDirectories, indir_same_outdir = get_samples( args.inputDirectory, args.fastq, outdir, pairEnd_filesSeparation_list) # Start running the analysis print '\n' + 'RUNNING INNUca.py' 
# Prepare run report file samples_report_path = os.path.join(outdir, 'samples_report.' + time_str + '.tab') utils.start_sample_report_file(samples_report_path) number_samples_successfully = 0 number_samples_pass = 0 number_samples_warning = 0 # Get MLST scheme to use scheme = 'unknown' species_genus, mlst_scheme_genus = None, None if not args.skipMLST and not args.skipSPAdes: scheme, species_genus, mlst_scheme_genus = mlst.getScheme( args.speciesExpected) # Print path to blastn mlst.getBlastPath() # Memory available_memory_GB = utils.get_free_memory() / (1024.0**2) # Determine SPAdes maximum memory spadesMaxMemory = None if not args.skipSPAdes: print '' spadesMaxMemory = spades.define_memory(args.spadesMaxMemory, args.threads, available_memory_GB) # Determine .jar maximum memory jarMaxMemory = 'off' if not (args.skipTrimmomatic and (args.skipSPAdes or args.skipPilon)): print '' jarMaxMemory = utils.define_jar_max_memory(args.jarMaxMemory, args.threads, available_memory_GB) # Run INNUca for each sample sample_report_json = {} for sample in samples: sample_start_time = time.time() print '\n' + 'Sample: ' + sample + '\n' # Create sample outdir sample_outdir = os.path.abspath(os.path.join(outdir, sample, '')) if not os.path.isdir(sample_outdir): os.makedirs(sample_outdir) # Get fastq files fastq_files = utils.searchFastqFiles( os.path.join(inputDirectory, sample, ''), pairEnd_filesSeparation_list, False) if len(fastq_files) == 1: print 'Only one fastq file was found: ' + str(fastq_files) print 'Pair-End sequencing is required. Moving to the next sample' continue elif len(fastq_files) == 0: print 'No compressed fastq files were found. 
Continue to the next sample' continue print 'The following files will be used:' print str(fastq_files) + '\n' # Run INNUca.py analysis run_successfully, pass_qc, run_report = run_INNUca( sample, sample_outdir, fastq_files, args, script_path, scheme, spadesMaxMemory, jar_path_trimmomatic, jar_path_pilon, jarMaxMemory, trueCoverage_config, rematch_script, species_genus, mlst_scheme_genus) # Save sample fail report utils.write_fail_report(os.path.join(sample_outdir, 'fail_report.txt'), run_report) # Save warning report write_warning_report(os.path.join(sample_outdir, 'warning_report.txt'), run_report) # Get raw reads files size fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files) # Remove sample directory if it was created during the process if removeCreatedSamplesDirectories and not indir_same_outdir: utils.removeDirectory(os.path.join(inputDirectory, sample, '')) print 'END ' + sample + ' analysis' time_taken = utils.runTime(sample_start_time) # Save run report warning, json_pass_qc = utils.write_sample_report( samples_report_path, sample, run_successfully, pass_qc, time_taken, fileSize, run_report) # Save runs statistics if run_successfully: number_samples_successfully += 1 if pass_qc: if warning: number_samples_warning += 1 else: number_samples_pass += 1 sample_report_json[sample] = { 'run_successfully': run_successfully, 'pass_qc': json_pass_qc, 'modules_run_report': run_report } # Save combine_samples_reports combine_reports.combine_reports(outdir, outdir, args.json, time_str, len(samples)) # Save sample_report in json if args.json: import json with open(os.path.join(outdir, 'samples_report.' 
+ time_str + '.json'), 'wt') as writer: json.dump(sample_report_json, writer) # Remove temporary folder with symlink to fastq files in case of --fastq use if args.inputDirectory is None and args.fastq is not None: utils.removeDirectory(os.path.join(inputDirectory, '')) # Run report print '\n' + 'END INNUca.py' print '\n' + 'Pipeline problems: {not_run_successfully} samples'.format( not_run_successfully=(len(samples) - number_samples_successfully)) print '\n' + 'FAIL: {number_samples_fail} samples'.format( number_samples_fail=(len(samples) - number_samples_pass - number_samples_warning)) print '\n' + 'WARNING: {number_samples_warning} samples'.format( number_samples_warning=number_samples_warning) print '\n' + 'PASS: {number_samples_pass} samples'.format( number_samples_pass=number_samples_pass) time_taken = utils.runTime(general_start_time) del time_taken # Check whether INNUca.py run at least one sample successfully if number_samples_successfully == 0: sys.exit('No samples run successfully!')
def run_INNUca(sampleName, outdir, fastq_files, args, script_path, scheme,
               spadesMaxMemory, jar_path_trimmomatic, jar_path_pilon,
               jarMaxMemory, trueCoverage_config, rematch_script,
               species_genus, mlst_scheme_genus):
    """Run the INNUca modules for a single sample and collect their reports.

    The modules are attempted in this order: FastQ integrity, first
    estimated coverage, trueCoverage_ReMatCh, first FastQC, Trimmomatic,
    second estimated coverage, second FastQC, Pear, SPAdes, Assembly
    Mapping, Pilon and MLST. Each one stores a list in ``runs`` whose first
    four items are ``[run_successfully, pass_qc, time_taken, failing]``;
    some modules append a fifth item (a warning, the Trimmomatic file size,
    or ``'NA'``). Modules that are skipped or not reached get a copy of the
    ``skipped`` / ``not_run`` placeholders instead.

    When SPAdes succeeds, the path of the final assembly is written to
    ``final_assembly.txt`` inside ``outdir``.

    Parameters
    ----------
    sampleName
        Sample identifier forwarded to the modules.
    outdir
        Sample output directory.
    fastq_files
        Fastq files of the sample; replaced internally by the Trimmomatic
        paired reads when Trimmomatic succeeds and returns reads.
    args
        Parsed command-line options (module switches and thresholds).
    script_path
        Forwarded to ``trimmomatic.runTrimmomatic``.
    scheme, species_genus, mlst_scheme_genus
        Forwarded to ``mlst.runMlst``.
    spadesMaxMemory
        Forwarded to ``spades.runSpades``.
    jar_path_trimmomatic, jar_path_pilon
        Paths forwarded to the Trimmomatic and Pilon runners.
    jarMaxMemory
        Forwarded to the Trimmomatic and Pilon runners.
    trueCoverage_config
        Mapping with the trueCoverage_ReMatCh settings, or None, in which
        case the module is not run.
    rematch_script
        Forwarded to ``trueCoverage.runTrueCoverage``.

    Returns
    -------
    tuple
        ``(run_successfully, pass_qc, runs)``: whether every module ran
        (or was skipped / not run), whether the QA/QC checks passed, and
        the per-module report dictionary.
    """
    threads = args.threads
    adaptersFasta = args.adapters
    if adaptersFasta is not None:
        adaptersFasta = os.path.abspath(adaptersFasta.name)
    genomeSize = args.genomeSizeExpectedMb

    # Placeholder reports: None in the first two positions marks a module
    # that neither failed nor passed
    skipped = [None, None, 0, {'sample': 'Skipped'}]
    not_run = [None, None, 0, {'sample': 'Not run'}]

    runs = {}

    # Run FastQ integrity check
    not_corruption_found, pass_qc, time_taken, failing, fastq_encoding, min_reads_length, max_reads_length = fastQintegrity.runFastQintegrity(
        fastq_files, threads, outdir)
    runs['FastQ_Integrity'] = [
        not_corruption_found, pass_qc, time_taken, failing
    ]

    if not_corruption_found:
        # Run first Estimated Coverage
        run_successfully_estimatedCoverage = False
        estimatedCoverage = None
        run_successfully_trueCoverage = False
        pass_qc_trueCoverage = False
        if not args.skipEstimatedCoverage:
            # Check whether the Estimated Coverage output is already present
            report_file = os.path.join(outdir, 'coverage_report.txt')
            if os.path.isfile(report_file):
                os.remove(report_file)
            # Run getEstimatedCoverage
            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = coverage.getEstimatedCoverage(
                fastq_files, genomeSize, outdir, threads,
                args.estimatedMinimumCoverage)
            runs['first_Coverage'] = [
                run_successfully_estimatedCoverage, pass_qc, time_taken,
                failing
            ]
        else:
            print '--skipEstimatedCoverage set. Skipping First Estimated Coverage analysis'
            runs['first_Coverage'] = skipped

        trimmomatic_run_successfully = False

        # Continue only if coverage was skipped or is not below the minimum
        if args.skipEstimatedCoverage or (
                run_successfully_estimatedCoverage
                and not estimatedCoverage < args.estimatedMinimumCoverage):
            if not args.skipTrueCoverage and trueCoverage_config is not None:
                # Run True Coverage
                run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken, failing = trueCoverage.runTrueCoverage(
                    sampleName, fastq_files,
                    trueCoverage_config['reference_file'], threads, outdir,
                    trueCoverage_config['length_extra_seq'],
                    trueCoverage_config['minimum_depth_presence'],
                    trueCoverage_config['minimum_depth_call'],
                    trueCoverage_config[
                        'minimum_depth_frequency_dominant_allele'],
                    trueCoverage_config['minimum_gene_coverage'], False,
                    False, 1, trueCoverage_config['minimum_gene_identity'],
                    trueCoverage_config, rematch_script)
                runs['trueCoverage_ReMatCh'] = [
                    run_successfully_trueCoverage, pass_qc_trueCoverage,
                    time_taken, failing
                ]
            else:
                # Also reached when no trueCoverage config is available
                print '\n' + '--skipTrueCoverage set. Skipping True coverage analysis'
                runs['trueCoverage_ReMatCh'] = skipped

            if args.skipTrueCoverage or trueCoverage_config is None or (
                    run_successfully_trueCoverage and pass_qc_trueCoverage):
                # Run first FastQC
                nts2clip_based_ntsContent = None
                if not args.skipFastQC:
                    run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length, nts2clip_based_ntsContent = fastqc.runFastQCanalysis(
                        outdir, threads, adaptersFasta, fastq_files,
                        args.fastQCkeepFiles, 'first_run')
                    runs['first_FastQC'] = [
                        run_successfully, pass_qc, time_taken, failing,
                        warning
                    ]
                else:
                    print '--skipFastQC set. Skipping First FastQC analysis'
                    runs['first_FastQC'] = skipped + ['NA']

                # Run Trimmomatic
                if not args.skipTrimmomatic:
                    run_successfully, not_empty_fastq, time_taken, failing, paired_reads, trimmomatic_folder, fileSize = trimmomatic.runTrimmomatic(
                        jar_path_trimmomatic, sampleName, outdir, threads,
                        adaptersFasta, script_path, args.doNotSearchAdapters,
                        fastq_files, max_reads_length, args.doNotTrimCrops,
                        args.trimCrop, args.trimHeadCrop, args.trimLeading,
                        args.trimTrailing, args.trimSlidingWindow,
                        args.trimMinLength, nts2clip_based_ntsContent,
                        jarMaxMemory, fastq_encoding)
                    runs['Trimmomatic'] = [
                        run_successfully, None, time_taken, failing, fileSize
                    ]
                    trimmomatic_run_successfully = run_successfully

                    if run_successfully and not_empty_fastq:
                        # From here on the trimmed reads are the input
                        fastq_files = paired_reads
                        min_reads_length = args.trimMinLength

                        # Run second Estimated Coverage
                        if not args.skipEstimatedCoverage:
                            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = coverage.getEstimatedCoverage(
                                fastq_files, genomeSize, outdir, threads,
                                args.estimatedMinimumCoverage)
                            runs['second_Coverage'] = [
                                run_successfully_estimatedCoverage, pass_qc,
                                time_taken, failing
                            ]
                        else:
                            print '--skipEstimatedCoverage set. Skipping Second Estimated Coverage analysis'
                            runs['second_Coverage'] = skipped

                        if args.skipEstimatedCoverage or (
                                run_successfully_estimatedCoverage
                                and not estimatedCoverage <
                                args.estimatedMinimumCoverage):
                            # Run second FastQC
                            if not args.skipFastQC:
                                run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length, nts2clip_based_ntsContent = fastqc.runFastQCanalysis(
                                    outdir, threads, adaptersFasta,
                                    fastq_files, args.fastQCkeepFiles,
                                    'second_run')
                                runs['second_FastQC'] = [
                                    run_successfully, pass_qc, time_taken,
                                    failing, warning
                                ]
                                if run_successfully:
                                    max_reads_length = maximum_reads_length
                            else:
                                print '--skipFastQC set. Skipping Second FastQC analysis'
                                runs['second_FastQC'] = skipped + ['NA']
                        else:
                            print '\n' + 'Estimated coverage is too lower (< ' + str(
                                args.estimatedMinimumCoverage
                            ) + 'x). This sample will not proceed with INNUca pipeline'
                            runs['second_FastQC'] = not_run + ['NA']
                            runs['Pear'] = not_run + ['NA']
                            runs['SPAdes'] = not_run + ['NA']
                            runs['Assembly_Mapping'] = not_run + ['NA']
                            runs['Pilon'] = not_run
                            runs['MLST'] = not_run + ['NA']
                    else:
                        print 'Trimmomatic did not run successfully or return zero reads! Skipping Second Estimated Coverage analysis and FastQC analysis'
                        runs['second_Coverage'] = skipped
                        runs['second_FastQC'] = skipped + ['NA']

                else:
                    print '--skipTrimmomatic set. Skipping Trimmomatic, but also Second FastQC analysis and Second Estimated Coverage analysis'
                    runs['Trimmomatic'] = skipped + ['NA']
                    runs['second_Coverage'] = skipped
                    runs['second_FastQC'] = skipped + ['NA']

                # FastQC verdict: the second run result is used, falling
                # back to the first run when the second one is None
                if not args.skipFastQC and (
                        runs['second_FastQC'][1] or
                    (runs['second_FastQC'][1] is None
                     and runs['first_FastQC'][1])
                ) is False and not args.fastQCproceed:
                    print '\n' + 'This sample does not pass FastQC module QA/QC. It will not proceed with INNUca pipeline'
                    runs['Pear'] = not_run + ['NA']
                    runs['SPAdes'] = not_run + ['NA']
                    runs['Assembly_Mapping'] = not_run + ['NA']
                    runs['Pilon'] = not_run
                    runs['MLST'] = not_run + ['NA']
            else:
                print '\n' + 'This sample does not pass True Coverage module QA/QC. This sample will not proceed with INNUca pipeline'
                runs['first_FastQC'] = not_run + ['NA']
                runs['Trimmomatic'] = not_run + ['NA']
                runs['second_Coverage'] = not_run
                runs['second_FastQC'] = not_run + ['NA']
                runs['Pear'] = not_run + ['NA']
                runs['SPAdes'] = not_run + ['NA']
                runs['Assembly_Mapping'] = not_run + ['NA']
                runs['Pilon'] = not_run
                runs['MLST'] = not_run + ['NA']
        else:
            print '\n' + 'Estimated coverage is too lower (< ' + str(
                args.estimatedMinimumCoverage
            ) + 'x). This sample will not proceed with INNUca pipeline'
            runs['trueCoverage_ReMatCh'] = not_run
            runs['first_FastQC'] = not_run + ['NA']
            runs['Trimmomatic'] = not_run + ['NA']
            runs['second_Coverage'] = not_run
            runs['second_FastQC'] = not_run + ['NA']
            runs['Pear'] = not_run + ['NA']
            runs['SPAdes'] = not_run + ['NA']
            runs['Assembly_Mapping'] = not_run + ['NA']
            runs['Pilon'] = not_run
            runs['MLST'] = not_run + ['NA']

        # The three gates below repeat the conditions checked above, now
        # evaluated with the values left by the second coverage/FastQC runs
        if args.skipEstimatedCoverage or (
                run_successfully_estimatedCoverage
                and not estimatedCoverage < args.estimatedMinimumCoverage):
            if args.skipTrueCoverage or trueCoverage_config is None or (
                    run_successfully_trueCoverage and pass_qc_trueCoverage):
                if args.skipFastQC or (runs['second_FastQC'][1] or
                                       (runs['second_FastQC'][1] is None
                                        and runs['first_FastQC'][1])
                                       ) is not False or args.fastQCproceed:
                    unassembled_pe_reads = None
                    assembled_se_reads = None
                    # Run Pear
                    if args.runPear:
                        print '--runPear set. Running Pear'
                        pearMinOverlap = pear.determine_minimum_overlap(
                            args.pearMinOverlap, min_reads_length,
                            max_reads_length)
                        run_successfully, pass_qc, time_taken, failing, unassembled_pe_reads, assembled_se_reads, pear_folder, warning = pear.runPear(
                            fastq_files, threads, outdir, sampleName,
                            fastq_encoding, trimmomatic_run_successfully,
                            pearMinOverlap)
                        runs['Pear'] = [
                            run_successfully, pass_qc, time_taken, failing,
                            warning
                        ]
                    else:
                        runs['Pear'] = not_run + ['NA']

                    # Run SPAdes
                    if not args.skipSPAdes:
                        # Pear unassembled reads are used when available,
                        # otherwise the current fastq files
                        run_successfully, pass_qc, time_taken, failing, contigs_spades, warning = spades.runSpades(
                            sampleName, outdir, threads, unassembled_pe_reads
                            if unassembled_pe_reads is not None else
                            fastq_files, args.spadesNotUseCareful,
                            spadesMaxMemory, args.spadesMinCoverageAssembly,
                            args.spadesMinContigsLength, genomeSize,
                            args.spadesKmers, max_reads_length,
                            args.spadesDefaultKmers,
                            args.spadesMinKmerCovContigs, assembled_se_reads,
                            args.saveExcludedContigs, args.maxNumberContigs)
                        runs['SPAdes'] = [
                            run_successfully, pass_qc, time_taken, failing,
                            warning
                        ]

                        if run_successfully:
                            # contigs tracks the most recent assembly file
                            contigs = contigs_spades

                            # Run Assembly Mapping check
                            bam_file = None
                            if not args.skipAssemblyMapping:
                                run_successfully, pass_qc, time_taken, failing, assembly_filtered, bam_file, assemblyMapping_folder, warning = assembly_mapping.runAssemblyMapping(
                                    fastq_files, contigs, threads, outdir,
                                    args.assemblyMinCoverageContigs,
                                    genomeSize, args.saveExcludedContigs,
                                    args.maxNumberContigs)
                                runs['Assembly_Mapping'] = [
                                    run_successfully, pass_qc, time_taken,
                                    failing, warning
                                ]

                                if run_successfully:
                                    contigs = assembly_filtered
                                    if not args.keepIntermediateAssemblies and os.path.isfile(
                                            contigs_spades
                                    ) and contigs != contigs_spades:
                                        os.remove(contigs_spades)
                            else:
                                print '--skipAssemblyMapping set. Skipping Assembly Mapping check'
                                runs['Assembly_Mapping'] = skipped + ['NA']

                            # Run Pilon
                            if not args.skipPilon:
                                run_successfully, _, time_taken, failing, assembly_polished, pilon_folder = pilon.runPilon(
                                    jar_path_pilon, contigs, fastq_files,
                                    threads, outdir, jarMaxMemory, bam_file)
                                runs['Pilon'] = [
                                    run_successfully, None, time_taken,
                                    failing
                                ]

                                if run_successfully:
                                    contigs = assembly_polished
                                    # assembly_filtered is only a local when
                                    # Assembly Mapping was run
                                    if not args.keepIntermediateAssemblies and 'assembly_filtered' in locals(
                                    ) and os.path.isfile(assembly_filtered):
                                        os.remove(assembly_filtered)

                                if not args.pilonKeepFiles:
                                    utils.removeDirectory(pilon_folder)

                            else:
                                print '--skipPilon set. Skipping Pilon correction'
                                runs['Pilon'] = skipped

                            # assemblyMapping_folder is only a local when
                            # Assembly Mapping was run
                            if 'assemblyMapping_folder' in locals():
                                utils.removeDirectory(assemblyMapping_folder)

                            print '\n' + 'Final assembly: ' + contigs
                            with open(
                                    os.path.join(outdir,
                                                 'final_assembly.txt'),
                                    'wt') as writer:
                                writer.write(contigs + '\n')

                            # Run MLST
                            if not args.skipMLST:
                                run_successfully, pass_qc, time_taken, failing, warning = mlst.runMlst(
                                    contigs, scheme, outdir, species_genus,
                                    mlst_scheme_genus)
                                runs['MLST'] = [
                                    run_successfully, pass_qc, time_taken,
                                    failing, warning
                                ]
                            else:
                                print '--skipMLST set. Skipping MLST analysis'
                                runs['MLST'] = skipped + ['NA']
                        else:
                            print 'SPAdes did not run successfully! Skipping Pilon correction, Assembly Mapping check and MLST analysis'
                            runs['Assembly_Mapping'] = skipped + ['NA']
                            runs['Pilon'] = skipped
                            runs['MLST'] = skipped + ['NA']

                    else:
                        print '--skipSPAdes set. Skipping SPAdes, Pilon correction, Assembly Mapping check and MLST analysis'
                        runs['SPAdes'] = skipped + ['NA']
                        runs['Assembly_Mapping'] = skipped + ['NA']
                        runs['Pilon'] = skipped
                        runs['MLST'] = skipped + ['NA']
    else:
        # Corrupted fastq files: every remaining module is marked as not run
        print 'Moving to the next sample'
        for step in ('first_Coverage', 'trueCoverage_ReMatCh', 'first_FastQC',
                     'Trimmomatic', 'second_Coverage', 'second_FastQC',
                     'Pear', 'SPAdes', 'Assembly_Mapping', 'Pilon', 'MLST'):
            if step in ('Trimmomatic', 'first_FastQC', 'second_FastQC',
                        'Pear', 'SPAdes', 'Assembly_Mapping', 'MLST'):
                runs[step] = not_run + ['NA']
            else:
                runs[step] = not_run

    # Remove Pear directory
    # (pear_folder is only a local when Pear was run)
    if not args.pearKeepFiles and 'pear_folder' in locals():
        utils.removeDirectory(pear_folder)

    # Remove Trimmomatic directory with cleaned reads
    # (trimmomatic_folder is only a local when Trimmomatic was run)
    if not args.trimKeepFiles and 'trimmomatic_folder' in locals():
        utils.removeDirectory(trimmomatic_folder)

    # Check run
    # A module counts as successful when its first item is truthy or None
    run_successfully = all(runs[step][0] or runs[step][0] is None
                           for step in runs)

    # For the QA/QC flags only an explicit False fails; None is accepted
    pass_fastqIntegrity = runs['FastQ_Integrity'][0]
    pass_cov = (runs['second_Coverage'][1] or
                (runs['second_Coverage'][1] is None
                 and runs['first_Coverage'][1])) is not False
    pass_trueCov = runs['trueCoverage_ReMatCh'][1] is not False
    pass_fastqc = (runs['second_FastQC'][1] or
                   (runs['second_FastQC'][1] is None
                    and runs['first_FastQC'][1])) is not False
    # pass_trimmomatic = runs['Trimmomatic'][1] is not False
    # pass_pear = runs['Pear'][1] is not False
    # pass_spades = runs['SPAdes'][1] is not False or runs['Assembly_Mapping'][1] is True
    pass_spades = runs['SPAdes'][1] is not False
    pass_assemblyMapping = runs['Assembly_Mapping'][1] is not False
    # Pilon is judged on its run status (item 0), not on a QA/QC flag
    pass_pilon = runs['Pilon'][0] is not False
    pass_mlst = runs['MLST'][1] is not False
    pass_qc = all([
        pass_fastqIntegrity, pass_cov, pass_trueCov, pass_fastqc, pass_spades,
        pass_assemblyMapping, pass_pilon, pass_mlst
    ])

    return run_successfully, pass_qc, runs
def main(): version = '2.0' args = utils.parseArguments(version) general_start_time = time.time() time_str = time.strftime("%Y%m%d-%H%M%S") # Check if output directory exists outdir = os.path.abspath(os.path.join(args.outdir, '')) if not os.path.isdir(outdir): os.makedirs(outdir) # Start logger sys.stdout = utils.Logger(outdir, time_str) print '\n' + '==========> INNUca.py <==========' print '\n' + 'Program start: ' + time.ctime() # Tells where the logfile will be stored print '\n' + 'LOGFILE:' print sys.stdout.getLogFile() # Print command print '\n' + 'COMMAND:' script_path = os.path.abspath(sys.argv[0]) print sys.executable + ' ' + script_path + ' ' + ' '.join(sys.argv[1:]) # Print directory where programme was lunch print '\n' + 'PRESENT DIRECTORY :' print os.getcwd() # Print program version print '\n' + 'VERSION INNUca.py:' utils.scriptVersionGit(version, os.getcwd(), script_path) # Get CPU information utils.get_cpu_information(outdir, time_str) # Set and print PATH variable utils.setPATHvariable(args.doNotUseProvidedSoftware, script_path) # Check programms programs_version_dictionary = {} programs_version_dictionary['gunzip'] = ['--version', '>=', '1.6'] if (not args.skipTrueCoverage or (not args.skipPilon and not args.skipSPAdes)): programs_version_dictionary['bowtie2'] = ['--version', '>=', '2.2.9'] programs_version_dictionary['samtools'] = ['--version', '==', '1.3.1'] if not (args.skipFastQC and args.skipTrimmomatic and (args.skipPilon or args.skipSPAdes)): programs_version_dictionary['java'] = ['-version', '>=', '1.8'] if not args.skipFastQC: programs_version_dictionary['fastqc'] = ['--version', '==', '0.11.5'] if not args.skipTrimmomatic: programs_version_dictionary['trimmomatic-0.36.jar'] = [ '-version', '==', '0.36' ] if not args.skipSPAdes: programs_version_dictionary['spades.py'] = ['--version', '>=', '3.9.0'] if not args.skipPilon and not args.skipSPAdes: programs_version_dictionary['pilon-1.18.jar'] = [ '--version', '==', '1.18' ] if not 
args.skipMLST and not args.skipSPAdes: programs_version_dictionary['mlst'] = ['--version', '>=', '2.4'] missingPrograms, programs_version_dictionary = utils.checkPrograms( programs_version_dictionary) if len(missingPrograms) > 0: sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms)) # .jar paths jar_path_trimmomatic = None if not args.skipTrimmomatic: jar_path_trimmomatic = programs_version_dictionary[ 'trimmomatic-0.36.jar'][3] jar_path_pilon = None if not args.skipPilon and not args.skipSPAdes: jar_path_pilon = programs_version_dictionary['pilon-1.18.jar'][3] # Check if input directory exists with fastq files and store samples name that have fastq files inputDirectory = os.path.abspath(os.path.join(args.inputDirectory, '')) # pairEnd_filesSeparation_list = args.pairEnd_filesSeparation pairEnd_filesSeparation_list = None print '' samples, removeCreatedSamplesDirectories, indir_same_outdir = utils.checkSetInputDirectory( inputDirectory, outdir, pairEnd_filesSeparation_list) # Start running the analysis print '\n' + 'RUNNING INNUca.py' # Prepare run report file samples_report_path = os.path.join(outdir, 'samples_report.' + time_str + '.tab') utils.start_sample_report_file(samples_report_path) number_samples_successfully = 0 number_samples_pass = 0 # Get MLST scheme to use scheme = 'unknown' if not args.skipMLST and not args.skipSPAdes: scheme = mlst.getScheme(args.speciesExpected) # Get path to blastn mlst.getBlastPath() # Get trueCoverage_ReMatCh settings trueCoverage_config = None if not args.skipTrueCoverage: trueCoverage_reference = None trueCoverage_config_file = None trueCoverage_config = None if args.trueConfigFile is None: print 'No trueCoverage_ReMatCh config file was provided. 
Search for default files' trueCoverage_config_file, trueCoverage_reference = trueCoverage.check_existing_default_config( args.speciesExpected, script_path) else: trueCoverage_config_file = args.trueConfigFile.name if trueCoverage_config_file is not None: trueCoverage_config = trueCoverage.parse_config( trueCoverage_config_file) if args.trueConfigFile is None and trueCoverage_config is not None: trueCoverage_config['reference_file'] = trueCoverage_reference if trueCoverage_config is not None: print 'The following trueCoverage_ReMatCh config file will be used: ' + trueCoverage_config_file print 'The following trueCoverage_ReMatCh reference file will be used: ' + trueCoverage_config[ 'reference_file'] + '\n' else: print 'No trueCoverage_ReMatCh config file was found' # Memory available_memory_GB = utils.get_free_memory() / (1024.0**2) # Determine SPAdes maximum memory spadesMaxMemory = None if not args.skipSPAdes: print '' spadesMaxMemory = spades.define_memory(args.spadesMaxMemory, args.threads, available_memory_GB) # Determine .jar maximum memory jarMaxMemory = 'off' if not (args.skipTrimmomatic and (args.skipSPAdes or args.skipPilon)): print '' jarMaxMemory = utils.define_jar_max_memory(args.jarMaxMemory, args.threads, available_memory_GB) # Run INNUca for each sample for sample in samples: sample_start_time = time.time() print '\n' + 'Sample: ' + sample + '\n' # Create sample outdir sample_outdir = os.path.abspath(os.path.join(outdir, sample, '')) if not os.path.isdir(sample_outdir): os.makedirs(sample_outdir) # Get fastq files fastq_files = utils.searchFastqFiles( os.path.join(inputDirectory, sample, ''), pairEnd_filesSeparation_list, False) if len(fastq_files) == 1: print 'Only one fastq file was found: ' + str(fastq_files) print 'Pair-End sequencing is required. 
Moving to the next sample' continue print 'The following files will be used:' print str(fastq_files) + '\n' # Run INNUca.py analysis run_successfully, pass_qc, run_report = run_INNUca( sample, sample_outdir, fastq_files, args, script_path, scheme, spadesMaxMemory, jar_path_trimmomatic, jar_path_pilon, jarMaxMemory, trueCoverage_config) # Save sample fail report fail_report_path = os.path.join(sample_outdir, 'fail_report.txt') utils.write_fail_report(fail_report_path, run_report) # Save runs statistics if run_successfully: number_samples_successfully += 1 if pass_qc: number_samples_pass += 1 # Get raw reads files size fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files) # Remove sample directory if it was created during the process if removeCreatedSamplesDirectories and not indir_same_outdir: utils.removeDirectory(os.path.join(inputDirectory, sample, '')) print 'END ' + sample + ' analysis' time_taken = utils.runTime(sample_start_time) # Save run report utils.write_sample_report(samples_report_path, sample, run_successfully, pass_qc, time_taken, fileSize, run_report) # Run report print '\n' + 'END INNUca.py' print '\n' + str(number_samples_successfully) + ' samples out of ' + str( len(samples)) + ' run successfully' print '\n' + str(number_samples_pass) + ' samples out of ' + str( number_samples_successfully ) + ' (run successfully) PASS INNUca.py analysis' time_taken = utils.runTime(general_start_time) del time_taken # Check whether INNUca.py run at least one sample successfully if number_samples_successfully == 0: sys.exit('No samples run successfully!')
def run_INNUca(sampleName, outdir, fastq_files, args, script_path, scheme,
               spadesMaxMemory, jar_path_trimmomatic, jar_path_pilon,
               jarMaxMemory, trueCoverage_config):
    """Run the INNUca pipeline modules for one sample and summarise the result.

    Every module outcome is stored in ``runs`` under its step name as a list
    ``[run_successfully, pass_qc, time_taken, failing]`` (the Trimmomatic
    entry carries one extra trailing item, the file size or ``'NA'``).
    Modules bypassed by a ``--skip*`` option are recorded as "Skipped";
    modules that could not be reached are recorded as "Not run".

    Parameters
    ----------
    sampleName : sample identifier forwarded to Trimmomatic and SPAdes.
    outdir : sample output directory; ``final_assembly.txt`` is written here.
    fastq_files : input reads; replaced by the Trimmomatic paired reads when
        Trimmomatic succeeds and returns non-empty files.
    args : parsed command-line options (the ``skip*``, ``trim*``, ``spades*``,
        ``estimatedMinimumCoverage``... attributes read below).
    script_path : forwarded to Trimmomatic.
    scheme : forwarded to the MLST module.
    spadesMaxMemory, jarMaxMemory : memory limits forwarded to SPAdes and to
        the Trimmomatic / Pilon calls.
    jar_path_trimmomatic, jar_path_pilon : forwarded to the respective module.
    trueCoverage_config : dict of trueCoverage settings, or ``None`` to skip
        the True Coverage module.

    Returns
    -------
    tuple
        ``(run_successfully, pass_qc, runs)``: whether every executed module
        ran, whether every QA/QC check passed, and the per-step report.
    """
    threads = args.threads
    adaptersFasta = args.adapters
    if adaptersFasta is not None:
        adaptersFasta = os.path.abspath(adaptersFasta.name)
    genomeSize = args.genomeSizeExpectedMb
    maximumReadsLength = None
    # Only bound to a real path if Trimmomatic is actually executed; the final
    # clean-up tests this explicitly instead of relying on a NameError.
    trimmomatic_folder = None

    skipped = [None, None, 0, {'sample': 'Skipped'}]
    not_run = [None, None, 0, {'sample': 'Not run'}]

    runs = {}

    # Run FastQ integrity check
    not_corruption_found, _, time_taken, failing = fastQintegrity.runFastQintegrity(
        fastq_files, threads, outdir)
    runs['FastQ_Integrity'] = [not_corruption_found, None, time_taken, failing]

    if not_corruption_found:
        # Run first Estimated Coverage
        run_successfully_estimatedCoverage = False
        estimatedCoverage = None
        run_successfully_trueCoverage = False
        pass_qc_trueCoverage = False
        if not args.skipEstimatedCoverage:
            # Check whether the Estimated Coverage output is already present
            report_file = os.path.join(outdir, 'coverage_report.txt')
            if os.path.isfile(report_file):
                os.remove(report_file)
            # Run getEstimatedCoverage
            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = \
                coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir, threads)
            runs['first_Coverage'] = [run_successfully_estimatedCoverage, pass_qc, time_taken, failing]
        else:
            print('--skipEstimatedCoverage set. Skipping First Estimated Coverage analysis')
            runs['first_Coverage'] = skipped

        if args.skipEstimatedCoverage or (
                run_successfully_estimatedCoverage and
                not estimatedCoverage < args.estimatedMinimumCoverage):
            if not args.skipTrueCoverage and trueCoverage_config is not None:
                # Run True Coverage
                run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken, failing = \
                    trueCoverage.runTrueCoverage(
                        fastq_files,
                        trueCoverage_config['reference_file'],
                        threads,
                        outdir,
                        trueCoverage_config['length_extra_seq'],
                        trueCoverage_config['minimum_depth_presence'],
                        trueCoverage_config['minimum_depth_call'],
                        trueCoverage_config['minimum_depth_frequency_dominant_allele'],
                        trueCoverage_config['minimum_gene_coverage'],
                        trueCoverage_config['maximum_number_absent_genes'],
                        trueCoverage_config['maximum_number_genes_multiple_alleles'],
                        trueCoverage_config['minimum_read_coverage'])
                runs['trueCoverage_ReMatCh'] = [
                    run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken, failing
                ]
            else:
                print('\n' + '--skipTrueCoverage set. Skipping True coverage analysis')
                runs['trueCoverage_ReMatCh'] = skipped

            if args.skipTrueCoverage or trueCoverage_config is None or (
                    run_successfully_trueCoverage and pass_qc_trueCoverage):
                # Run first FastQC
                nts2clip_based_ntsContent = None
                if not args.skipFastQC:
                    run_successfully, pass_qc, time_taken, failing, maximumReadsLength, nts2clip_based_ntsContent = \
                        fastqc.runFastQCanalysis(outdir, threads, adaptersFasta, fastq_files)
                    runs['first_FastQC'] = [run_successfully, pass_qc, time_taken, failing]
                else:
                    print('--skipFastQC set. Skipping First FastQC analysis')
                    runs['first_FastQC'] = skipped

                # Run Trimmomatic
                if not args.skipTrimmomatic:
                    run_successfully, not_empty_fastq, time_taken, failing, paired_reads, trimmomatic_folder, fileSize = \
                        trimmomatic.runTrimmomatic(
                            jar_path_trimmomatic, sampleName, outdir, threads,
                            adaptersFasta, script_path,
                            args.doNotSearchAdapters, fastq_files,
                            maximumReadsLength, args.doNotTrimCrops,
                            args.trimCrop, args.trimHeadCrop,
                            args.trimLeading, args.trimTrailing,
                            args.trimSlidingWindow, args.trimMinLength,
                            nts2clip_based_ntsContent, jarMaxMemory)
                    runs['Trimmomatic'] = [run_successfully, not_empty_fastq, time_taken, failing, fileSize]

                    if run_successfully and not_empty_fastq:
                        # From here on the cleaned reads are the pipeline input
                        fastq_files = paired_reads

                        # Run second Estimated Coverage
                        if not args.skipEstimatedCoverage:
                            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = \
                                coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir, threads)
                            runs['second_Coverage'] = [
                                run_successfully_estimatedCoverage, pass_qc, time_taken, failing
                            ]
                        else:
                            print('--skipEstimatedCoverage set. Skipping Second Estimated Coverage analysis')
                            runs['second_Coverage'] = skipped

                        if args.skipEstimatedCoverage or (
                                run_successfully_estimatedCoverage and
                                not estimatedCoverage < args.estimatedMinimumCoverage):
                            # Run second FastQC
                            if not args.skipFastQC:
                                run_successfully, pass_qc, time_taken, failing, maximumReadsLength, nts2clip_based_ntsContent = \
                                    fastqc.runFastQCanalysis(outdir, threads, adaptersFasta, fastq_files)
                                runs['second_FastQC'] = [run_successfully, pass_qc, time_taken, failing]
                            else:
                                print('--skipFastQC set. Skipping Second FastQC analysis')
                                runs['second_FastQC'] = skipped
                        else:
                            print('\n' + 'Estimated coverage is too lower (< ' +
                                  str(args.estimatedMinimumCoverage) +
                                  'x). This sample will not proceed with INNUca pipeline')
                            runs['second_FastQC'] = not_run
                            runs['SPAdes'] = not_run
                            runs['Pilon'] = not_run
                            runs['Assembly_Mapping'] = not_run
                            runs['MLST'] = not_run
                    else:
                        print('Trimmomatic did not run successfully or return zero reads! Skipping Second Estimated Coverage analysis and FastQC analysis')
                        runs['second_Coverage'] = skipped
                        runs['second_FastQC'] = skipped
                else:
                    print('--skipTrimmomatic set. Skipping Trimmomatic, but also Second FastQC analysis and Second Estimated Coverage analysis')
                    runs['Trimmomatic'] = skipped + ['NA']
                    runs['second_Coverage'] = skipped
                    runs['second_FastQC'] = skipped
            else:
                print('\n' + 'This sample does not pass True Coverage module QA/QC. This sample will not proceed with INNUca pipeline')
                runs['first_FastQC'] = not_run
                runs['Trimmomatic'] = not_run + ['NA']
                runs['second_Coverage'] = not_run
                runs['second_FastQC'] = not_run
                runs['SPAdes'] = not_run
                runs['Pilon'] = not_run
                runs['Assembly_Mapping'] = not_run
                runs['MLST'] = not_run
        else:
            print('\n' + 'Estimated coverage is too lower (< ' +
                  str(args.estimatedMinimumCoverage) +
                  'x). This sample will not proceed with INNUca pipeline')
            runs['trueCoverage_ReMatCh'] = not_run
            runs['first_FastQC'] = not_run
            runs['Trimmomatic'] = not_run + ['NA']
            runs['second_Coverage'] = not_run
            runs['second_FastQC'] = not_run
            runs['SPAdes'] = not_run
            runs['Pilon'] = not_run
            runs['Assembly_Mapping'] = not_run
            runs['MLST'] = not_run

        # Assembly phase: the same coverage / True Coverage gates are evaluated
        # again because the second coverage estimation may have updated them.
        if args.skipEstimatedCoverage or (
                run_successfully_estimatedCoverage and
                not estimatedCoverage < args.estimatedMinimumCoverage):
            if args.skipTrueCoverage or trueCoverage_config is None or (
                    run_successfully_trueCoverage and pass_qc_trueCoverage):
                # Run SPAdes
                if not args.skipSPAdes:
                    run_successfully, pass_qc, time_taken, failing, contigs_spades = spades.runSpades(
                        sampleName, outdir, threads, fastq_files,
                        args.spadesNotUseCareful, spadesMaxMemory,
                        args.spadesMinCoverageAssembly,
                        args.spadesMinContigsLength, genomeSize,
                        args.spadesKmers, maximumReadsLength,
                        args.spadesDefaultKmers,
                        args.spadesMinKmerCovContigs)
                    runs['SPAdes'] = [run_successfully, pass_qc, time_taken, failing]

                    if run_successfully:
                        # Run Pilon
                        contigs = contigs_spades
                        if not args.skipPilon:
                            run_successfully, _, time_taken, failing, assembly_polished, bam_file, pilon_folder = \
                                pilon.runPilon(jar_path_pilon, contigs_spades, fastq_files, threads, outdir,
                                               jarMaxMemory)
                            runs['Pilon'] = [run_successfully, None, time_taken, failing]

                            if run_successfully:
                                contigs = assembly_polished

                            # Run Assembly Mapping check
                            if bam_file is not None:
                                if not args.skipAssemblyMapping:
                                    run_successfully, pass_qc, time_taken, failing, assembly_filtered = \
                                        assembly_mapping.runAssemblyMapping(
                                            bam_file, contigs_spades, threads, outdir,
                                            args.assemblyMinCoverageContigs,
                                            assembly_polished, genomeSize)
                                    runs['Assembly_Mapping'] = [run_successfully, pass_qc, time_taken, failing]

                                    if run_successfully:
                                        contigs = assembly_filtered
                                else:
                                    print('--skipAssemblyMapping set. Skipping Assembly Mapping check')
                                    runs['Assembly_Mapping'] = skipped
                            else:
                                print('Pilon did not produce the bam file! Assembly Mapping check')
                                runs['Assembly_Mapping'] = skipped

                            if not args.pilonKeepFiles:
                                utils.removeDirectory(pilon_folder)
                        else:
                            print('--skipPilon set. Skipping Pilon correction and Assembly Mapping check')
                            runs['Pilon'] = skipped
                            runs['Assembly_Mapping'] = skipped

                        print('\n' + 'Final assembly: ' + contigs)
                        with open(os.path.join(outdir, 'final_assembly.txt'), 'wt') as writer:
                            writer.write(contigs + '\n')

                        # Run MLST
                        if not args.skipMLST:
                            runs['MLST'] = mlst.runMlst(contigs, scheme, outdir)
                        else:
                            print('--skipMLST set. Skipping MLST analysis')
                            runs['MLST'] = skipped
                    else:
                        print('SPAdes did not run successfully! Skipping Pilon correction, Assembly Mapping check and MLST analysis')
                        runs['Pilon'] = skipped
                        runs['Assembly_Mapping'] = skipped
                        runs['MLST'] = skipped
                else:
                    print('--skipSPAdes set. Skipping SPAdes Pilon correction, Assembly Mapping check and MLST analysis')
                    runs['SPAdes'] = skipped
                    runs['Pilon'] = skipped
                    runs['Assembly_Mapping'] = skipped
                    runs['MLST'] = skipped
    else:
        print('Moving to the next sample')
        for step in ('first_Coverage', 'trueCoverage_ReMatCh', 'first_FastQC',
                     'Trimmomatic', 'second_Coverage', 'second_FastQC',
                     'SPAdes', 'Pilon', 'Assembly_Mapping', 'MLST'):
            if step == 'Trimmomatic':
                runs[step] = not_run + ['NA']
            else:
                runs[step] = not_run

    # Remove Trimmomatic directory with cleaned reads
    if not args.trimKeepFiles:
        if trimmomatic_folder is None:
            print('It is not possible to remove Trimmomatic directory because Trimmomatic did not run')
        else:
            # Clean-up stays best-effort, but only file-system errors are
            # tolerated (and reported as such) rather than every exception.
            try:
                utils.removeDirectory(trimmomatic_folder)
            except OSError as error:
                print('It is not possible to remove Trimmomatic directory: ' + str(error))

    # Check run
    # A step counts as successful when it ran (True) or was not executed (None)
    run_successfully = all(runs[step][0] or runs[step][0] is None for step in runs)

    # QA/QC: only an explicit False fails a module; the second-run result is
    # preferred and the first-run result is used when the second one is None.
    pass_fastqIntegrity = runs['FastQ_Integrity'][0]
    pass_cov = (runs['second_Coverage'][1] or
                (runs['second_Coverage'][1] is None and runs['first_Coverage'][1])) is not False
    pass_trueCov = runs['trueCoverage_ReMatCh'][1] is not False
    pass_fastqc = (runs['second_FastQC'][1] or
                   (runs['second_FastQC'][1] is None and runs['first_FastQC'][1])) is not False
    pass_trimmomatic = runs['Trimmomatic'][1] is not False
    pass_spades = runs['SPAdes'][1] is not False
    pass_assemblyMapping = runs['Assembly_Mapping'][1] is not False
    pass_mlst = runs['MLST'][1] is not False
    pass_qc = all([
        pass_fastqIntegrity, pass_cov, pass_trueCov, pass_fastqc,
        pass_trimmomatic, pass_spades, pass_assemblyMapping, pass_mlst
    ])

    return run_successfully, pass_qc, runs
def run_innuca(sample_name, outdir, fastq_files, args, script_path, scheme, spades_max_memory, jar_path_trimmomatic,
               jar_path_pilon, jar_max_memory, true_coverage_config, rematch_script, species_genus, mlst_scheme_genus,
               spades_version=None):
    """Run every INNUca pipeline module for one sample and summarise the outcome.

    Each module result is stored in ``runs`` under its step name as a six-item
    list ``[run_successfully, pass_qc, time_taken, failing, warning,
    file_size]``. Modules bypassed by a command-line option are recorded with
    the shared ``skipped`` list; any step still absent from ``runs`` at the end
    is filled with the shared ``not_run`` list.

    The function works in two phases: a reads phase (Kraken on reads, coverage
    estimation, True Coverage, FastQC, Trimmomatic) and, only if the gates
    re-evaluated into ``continue_second_part`` allow it, an assembly phase
    (Pear, SPAdes, Assembly Mapping, Pilon, MLST, Kraken on the assembly and
    insert size).

    :param sample_name: sample identifier forwarded to the modules
    :param outdir: sample output directory; ``final_assembly.txt`` is written here
    :param fastq_files: input reads; replaced by the Trimmomatic paired reads
        when Trimmomatic succeeds and returns non-empty files
    :param args: parsed command-line options (attributes are read directly)
    :param script_path: forwarded to Trimmomatic
    :param scheme: forwarded to the MLST module
    :param spades_max_memory: forwarded to SPAdes
    :param jar_path_trimmomatic: forwarded to Trimmomatic
    :param jar_path_pilon: forwarded to Pilon
    :param jar_max_memory: forwarded to Trimmomatic and Pilon
    :param true_coverage_config: dict of True Coverage settings, or None to
        skip that module
    :param rematch_script: forwarded to the True Coverage module
    :param species_genus: forwarded to the MLST module
    :param mlst_scheme_genus: forwarded to the MLST module
    :param spades_version: forwarded to SPAdes (default None)
    :return: tuple ``(run_successfully, pass_qc, runs)``
    """
    threads = args.threads
    adapters_fasta = args.adapters
    if adapters_fasta is not None:
        adapters_fasta = os.path.abspath(adapters_fasta.name)
    genome_size = args.genomeSizeExpectedMb

    # run_successfully, pass_qc, time_taken, failing, warning, file_size
    skipped = [None, None, 0, {'sample': 'Skipped'}, {}, 'NA']
    not_run = [None, None, 0, {'sample': 'Not run'}, {}, 'NA']

    runs = {}

    # Run FastQ integrity check
    not_corruption_found, pass_qc, time_taken, failing, fastq_encoding, min_reads_length, max_reads_length = \
        fastQintegrity.runFastQintegrity(fastq_files, threads, outdir)
    runs['FastQ_Integrity'] = [not_corruption_found, pass_qc, time_taken, failing, {}, 'NA']

    # Kept as None unless the corresponding module runs, so the final clean-up
    # can tell whether there is a directory to remove.
    pear_folder = None
    trimmomatic_folder = None

    if not_corruption_found:
        # Run Kraken
        # most_abundant_taxon_percent = None
        # Defaults used by the gates below when a module is skipped
        run_successfully_kraken = False
        run_successfully_estimated_coverage = False
        estimated_coverage = None
        run_successfully_true_coverage = False
        pass_qc_true_coverage = False
        trimmomatic_run_successfully = False
        if args.runKraken:
            print('\n'
                  '--runKraken set. Running Kraken for reads')
            run_successfully_kraken, pass_qc, time_taken, failing, warning, _ = \
                kraken(species=args.speciesExpected, files_to_classify=fastq_files, kraken_db=args.krakenDB,
                       files_type='fastq', outdir=outdir, version_kraken=version_kraken_global,
                       db_mem=args.krakenMemory, quick=args.krakenQuick, min_percent_covered=args.krakenMinCov,
                       max_unclassified_frag=args.krakenMaxUnclass, min_base_quality=args.krakenMinQual,
                       threads=threads)
            runs['reads_Kraken'] = [run_successfully_kraken, pass_qc, time_taken, failing, warning, 'NA']
        else:
            runs['reads_Kraken'] = skipped

        # Stop the reads phase only when Kraken ran, failed QA/QC and neither
        # --krakenProceed nor --krakenIgnoreQC was given.
        if args.runKraken and \
                (run_successfully_kraken and not pass_qc) and \
                not args.krakenProceed and \
                not args.krakenIgnoreQC:
            print('\n'
                  'This sample does not pass Kraken module QA/QC. It will not proceed with INNUca pipeline')
        else:
            # Run first Estimated Coverage
            if not args.skipEstimatedCoverage:
                # Check whether the Estimated Coverage output is already present
                report_file = os.path.join(outdir, 'coverage_report.txt')
                if os.path.isfile(report_file):
                    os.remove(report_file)
                # Run getEstimatedCoverage
                run_successfully_estimated_coverage, pass_qc, time_taken, failing, estimated_coverage = \
                    coverage.getEstimatedCoverage(fastq_files, genome_size, outdir, threads,
                                                  args.estimatedMinimumCoverage)
                runs['first_Coverage'] = [run_successfully_estimated_coverage, pass_qc, time_taken, failing, {}, 'NA']
            else:
                print('--skipEstimatedCoverage set. Skipping First Estimated Coverage analysis')
                runs['first_Coverage'] = skipped

            # # Correct first estimation coverage with Kraken percentage
            # # Does not seem to be a good idea (at least for Streptococcus agalactiae)
            # if args.runKraken and \
            #         (runs['Kraken'][0] and runs['Kraken'][1]) and \
            #         most_abundant_taxon_percent is not None and \
            #         estimated_coverage is not None:
            #     new_estimation = estimated_coverage * (most_abundant_taxon_percent / 100)
            #     print('\n'
            #           'Correct estimated coverage ({estimated}x) with Kraken taxon percentage'
            #           ' coverage ({percent}%): {new_estimation}x'.format(estimated=estimated_coverage,
            #                                                               percent=most_abundant_taxon_percent,
            #                                                               new_estimation=new_estimation))
            #     estimated_coverage = new_estimation

            # Coverage gate: passes when estimation is skipped, or when it ran
            # and the estimate is not below the configured minimum.
            if args.skipEstimatedCoverage or (
                    run_successfully_estimated_coverage and
                    not estimated_coverage < args.estimatedMinimumCoverage):
                if not args.skipTrueCoverage and true_coverage_config is not None:
                    # Run True Coverage
                    run_successfully_true_coverage, pass_qc_true_coverage, time_taken, failing, _ = \
                        trueCoverage.run_true_coverage(sample_name, fastq_files,
                                                       true_coverage_config['reference_file'], threads, outdir,
                                                       true_coverage_config['length_extra_seq'],
                                                       true_coverage_config['minimum_depth_presence'],
                                                       true_coverage_config['minimum_depth_call'],
                                                       true_coverage_config['minimum_depth_frequency_dominant_allele'],
                                                       true_coverage_config['minimum_gene_coverage'], False,
                                                       true_coverage_config['minimum_gene_identity'],
                                                       true_coverage_config, rematch_script, num_map_loc=1,
                                                       bowtie_algorithm=args.trueCoverageBowtieAlgo,
                                                       clean_run_rematch=True)
                    runs['trueCoverage_ReMatCh'] = [run_successfully_true_coverage, pass_qc_true_coverage, time_taken,
                                                    failing, {}, 'NA']
                else:
                    print('\n' + '--skipTrueCoverage set. Skipping True coverage analysis')
                    runs['trueCoverage_ReMatCh'] = skipped

                # True Coverage gate: skipped, not configured, overridden by
                # --trueCoverageProceed, or ran and passed QA/QC.
                if args.skipTrueCoverage or true_coverage_config is None or args.trueCoverageProceed or \
                        (run_successfully_true_coverage and pass_qc_true_coverage):
                    # Run first FastQC
                    nts2clip_based_nts_content = None
                    if not args.skipFastQC:
                        run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length, \
                            nts2clip_based_nts_content = fastqc.runFastQCanalysis(outdir, threads, adapters_fasta,
                                                                                  fastq_files, args.fastQCkeepFiles,
                                                                                  'first_run')
                        runs['first_FastQC'] = [run_successfully, pass_qc, time_taken, failing, warning, 'NA']
                    else:
                        print('--skipFastQC set. Skipping First FastQC analysis')
                        runs['first_FastQC'] = skipped

                    # Run Trimmomatic
                    not_empty_fastq = True
                    if not args.skipTrimmomatic:
                        run_successfully, not_empty_fastq, time_taken, failing, paired_reads, trimmomatic_folder, \
                            file_size, warning = trimmomatic.runTrimmomatic(jar_path_trimmomatic, sample_name, outdir,
                                                                            threads, adapters_fasta, script_path,
                                                                            args.doNotSearchAdapters, fastq_files,
                                                                            max_reads_length, args.doNotTrimCrops,
                                                                            args.trimCrop, args.trimHeadCrop,
                                                                            args.trimLeading, args.trimTrailing,
                                                                            args.trimSlidingWindow, args.trimMinLength,
                                                                            nts2clip_based_nts_content, jar_max_memory,
                                                                            fastq_encoding)
                        runs['Trimmomatic'] = [run_successfully, None, time_taken, failing, warning, file_size]
                        # Remembered separately because run_successfully is
                        # overwritten by later modules; Pear receives this flag.
                        trimmomatic_run_successfully = run_successfully

                        if run_successfully and not_empty_fastq:
                            # From here on the cleaned reads are the pipeline input
                            fastq_files = paired_reads
                            min_reads_length = args.trimMinLength

                            # Run second Estimated Coverage
                            if not args.skipEstimatedCoverage:
                                run_successfully_estimated_coverage, pass_qc, time_run, failing, estimated_coverage = \
                                    coverage.getEstimatedCoverage(fastq_files, genome_size, outdir, threads,
                                                                  args.estimatedMinimumCoverage)
                                runs['second_Coverage'] = [run_successfully_estimated_coverage, pass_qc, time_run,
                                                           failing, {}, 'NA']
                            else:
                                print('--skipEstimatedCoverage set. Skipping Second Estimated Coverage analysis')
                                runs['second_Coverage'] = skipped

                            if args.skipEstimatedCoverage or (
                                    run_successfully_estimated_coverage and
                                    not estimated_coverage < args.estimatedMinimumCoverage):
                                # Run second FastQC
                                if not args.skipFastQC:
                                    run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length, \
                                        nts2clip_based_nts_content = fastqc.runFastQCanalysis(outdir, threads,
                                                                                              adapters_fasta,
                                                                                              fastq_files,
                                                                                              args.fastQCkeepFiles,
                                                                                              'second_run')
                                    runs['second_FastQC'] = [run_successfully, pass_qc, time_taken, failing, warning,
                                                             'NA']
                                    # Only the second FastQC run updates the
                                    # maximum read length used by Pear and SPAdes.
                                    if run_successfully:
                                        max_reads_length = maximum_reads_length
                                else:
                                    print('--skipFastQC set. Skipping Second FastQC analysis')
                                    runs['second_FastQC'] = skipped
                            else:
                                print('\n'
                                      'Estimated coverage is too lower (< {estimatedMinimumCoverage}x). This sample'
                                      ' will not proceed with INNUca'
                                      ' pipeline'.format(estimatedMinimumCoverage=args.estimatedMinimumCoverage))
                                runs['second_FastQC'] = skipped
                        else:
                            print('Trimmomatic did not run successfully or return zero reads! Skipping Second Estimated'
                                  ' Coverage analysis and FastQC analysis')
                            runs['second_Coverage'] = skipped
                            runs['second_FastQC'] = skipped
                    else:
                        print('--skipTrimmomatic set. Skipping Trimmomatic, but also Second FastQC analysis and Second'
                              ' Estimated Coverage analysis')
                        runs['Trimmomatic'] = skipped
                        runs['second_Coverage'] = skipped
                        runs['second_FastQC'] = skipped

                    # Message only; the decision to continue is taken again in
                    # the continue_second_part gates below.
                    if not args.skipFastQC and \
                            (runs['second_FastQC'][1] or
                             (runs['second_FastQC'][1] is None and runs['first_FastQC'][1])) is False and \
                            not not_empty_fastq and not args.fastQCproceed:
                        print('\n'
                              'This sample does not pass FastQC module QA/QC. It will not proceed with INNUca pipeline')
                else:
                    print('\n'
                          'This sample does not pass True Coverage module QA/QC. This sample will not proceed with'
                          ' INNUca pipeline')
            else:
                print('\n'
                      'Estimated coverage is too lower (< {estimatedMinimumCoverage}x). This sample will not proceed'
                      ' with INNUca pipeline'.format(estimatedMinimumCoverage=args.estimatedMinimumCoverage))

        # Re-evaluate the Kraken, coverage, True Coverage and FastQC gates with
        # the state left by the reads phase; all four must hold to assemble.
        continue_second_part = False
        if not args.runKraken or \
                (runs['reads_Kraken'][0] is True and runs['reads_Kraken'][1] is True) or \
                args.krakenProceed or \
                args.krakenIgnoreQC:
            if args.skipEstimatedCoverage or (
                    run_successfully_estimated_coverage and
                    not estimated_coverage < args.estimatedMinimumCoverage):
                if args.skipTrueCoverage or true_coverage_config is None or args.trueCoverageProceed or \
                        (run_successfully_true_coverage and pass_qc_true_coverage):
                    if args.skipFastQC or (runs['second_FastQC'][1] or
                                           (runs['second_FastQC'][1] is None and
                                            runs['first_FastQC'][1])) is not False or \
                            args.fastQCproceed:
                        continue_second_part = True

        if continue_second_part:
            unassembled_pe_reads = None
            assembled_se_reads = None
            # Run Pear
            if args.runPear:
                print('--runPear set. Running Pear')
                pear_min_overlap = pear.determine_minimum_overlap(args.pearMinOverlap, min_reads_length,
                                                                  max_reads_length)
                run_successfully, pass_qc, time_taken, failing, unassembled_pe_reads, assembled_se_reads, \
                    pear_folder, warning = pear.runPear(fastq_files, threads, outdir, sample_name, fastq_encoding,
                                                        trimmomatic_run_successfully, pear_min_overlap)
                runs['Pear'] = [run_successfully, pass_qc, time_taken, failing, warning, 'NA']
            else:
                runs['Pear'] = skipped

            # Run SPAdes
            if not args.skipSPAdes:
                # Pear's unassembled pairs replace the reads when available
                run_successfully, pass_qc, time_taken, failing, contigs_spades, warning = \
                    spades.run_spades(sample_name, outdir, threads,
                                      unassembled_pe_reads if unassembled_pe_reads is not None else fastq_files,
                                      args.spadesNotUseCareful, spades_max_memory, args.spadesMinCoverageAssembly,
                                      args.spadesMinContigsLength, genome_size, args.spadesKmers, max_reads_length,
                                      args.spadesDefaultKmers, args.spadesMinKmerCovContigs, assembled_se_reads,
                                      args.saveExcludedContigs, args.maxNumberContigs, args.keepSPAdesScaffolds,
                                      spades_version=spades_version, estimated_coverage=estimated_coverage,
                                      spades_not_use_isolate=args.spadesNotUseIsolate)
                runs['SPAdes'] = [run_successfully, pass_qc, time_taken, failing, warning, 'NA']

                if run_successfully:
                    # "contigs" always points to the most refined assembly so far
                    contigs = contigs_spades

                    # Run Assembly Mapping check
                    bam_file = None
                    original_bam = None
                    assembly_mapping_folder = None
                    # Intermediate assemblies whose deletion is postponed to the
                    # BAM clean-up below (used when --keepBAM is set)
                    possible_assemblies_bam_remove = {}
                    if not args.skipAssemblyMapping:
                        run_successfully, pass_qc, time_taken, failing, assembly_filtered, bam_file, \
                            assembly_mapping_folder, warning, original_bam = \
                            assembly_mapping.run_assembly_mapping(fastq_files=fastq_files, reference_file=contigs,
                                                                  outdir=outdir, estimated_genome_size_mb=genome_size,
                                                                  max_number_contigs=args.maxNumberContigs,
                                                                  save_excluded_contigs=args.saveExcludedContigs,
                                                                  min_coverage_assembly=args.assemblyMinCoverageContigs,
                                                                  keep_bam=args.keepBAM, threads=threads)
                        runs['Assembly_Mapping'] = [run_successfully, pass_qc, time_taken, failing, warning, 'NA']

                        if run_successfully:
                            # Assembly to remove
                            if not args.keepIntermediateAssemblies:
                                if os.path.isfile(contigs_spades) and \
                                        assembly_filtered is not None and \
                                        assembly_filtered != contigs_spades:
                                    if not args.keepBAM:
                                        os.remove(contigs_spades)
                                    else:
                                        possible_assemblies_bam_remove['assembly_mapping'] = contigs_spades

                            if assembly_filtered is not None and \
                                    assembly_filtered != contigs_spades and \
                                    os.path.isfile(assembly_filtered):
                                contigs = assembly_filtered
                    else:
                        print('--skipAssemblyMapping set. Skipping Assembly Mapping check')
                        runs['Assembly_Mapping'] = skipped

                    # Run Pilon
                    pilon_new_bam = False
                    pilon_bam = None
                    if not args.skipPilon:
                        run_successfully, _, time_taken, failing, assembly_polished, pilon_folder, pilon_new_bam, \
                            pilon_bam = pilon.run_pilon(jar_path_pilon=jar_path_pilon, assembly=contigs,
                                                        fastq_files=fastq_files, outdir=outdir,
                                                        jar_max_memory=jar_max_memory, alignment_file=bam_file,
                                                        keep_bam=args.keepBAM, threads=threads)
                        runs['Pilon'] = [run_successfully, None, time_taken, failing, {}, 'NA']

                        if run_successfully:
                            if not args.keepIntermediateAssemblies:
                                if os.path.isfile(contigs) and \
                                        assembly_polished is not None and \
                                        os.path.isfile(assembly_polished):
                                    if not args.keepBAM:
                                        os.remove(contigs)
                                    else:
                                        if not pilon_new_bam:
                                            possible_assemblies_bam_remove['pilon'] = contigs

                            if assembly_polished is not None and \
                                    os.path.isfile(assembly_polished):
                                contigs = assembly_polished

                        if not args.pilonKeepFiles and os.path.isdir(pilon_folder):
                            utils.removeDirectory(pilon_folder)
                    else:
                        print('--skipPilon set. Skipping Pilon correction')
                        runs['Pilon'] = skipped

                    # BAM / intermediate assembly clean-up
                    if not args.keepBAM:
                        # Nothing is kept: remove every alignment file and every
                        # postponed intermediate assembly.
                        if bam_file is not None:
                            if os.path.isfile(bam_file):
                                os.remove(bam_file)
                            if os.path.isfile(bam_file + '.bai'):
                                os.remove(bam_file + '.bai')
                        if original_bam is not None and os.path.isfile(original_bam):
                            os.remove(original_bam)
                        if pilon_bam is not None and os.path.isfile(pilon_bam):
                            os.remove(pilon_bam)
                        if 'assembly_mapping' in possible_assemblies_bam_remove and \
                                os.path.isfile(possible_assemblies_bam_remove['assembly_mapping']):
                            os.remove(possible_assemblies_bam_remove['assembly_mapping'])
                        if 'pilon' in possible_assemblies_bam_remove and \
                                os.path.isfile(possible_assemblies_bam_remove['pilon']):
                            os.remove(possible_assemblies_bam_remove['pilon'])
                    else:
                        if pilon_new_bam:
                            # Pilon reported a new BAM: remove the Assembly
                            # Mapping alignment files and its postponed assembly.
                            if bam_file is not None:
                                if os.path.isfile(bam_file):
                                    os.remove(bam_file)
                                if os.path.isfile(bam_file + '.bai'):
                                    os.remove(bam_file + '.bai')
                            if original_bam is not None and os.path.isfile(original_bam):
                                os.remove(original_bam)
                            if 'assembly_mapping' in possible_assemblies_bam_remove and \
                                    os.path.isfile(possible_assemblies_bam_remove['assembly_mapping']):
                                os.remove(possible_assemblies_bam_remove['assembly_mapping'])
                        else:
                            # No new Pilon BAM: when both files exist only
                            # original_bam is kept, and the assembly postponed
                            # at the Pilon step is removed.
                            if original_bam is not None and os.path.isfile(original_bam) and \
                                    bam_file is not None and os.path.isfile(bam_file):
                                os.remove(bam_file)
                            if 'pilon' in possible_assemblies_bam_remove and \
                                    os.path.isfile(possible_assemblies_bam_remove['pilon']):
                                os.remove(possible_assemblies_bam_remove['pilon'])

                    if not args.skipAssemblyMapping:
                        utils.removeDirectory(assembly_mapping_folder)

                    print('\n' + 'Final assembly: ' + contigs)
                    with open(os.path.join(outdir, 'final_assembly.txt'), 'wt') as writer:
                        writer.write(contigs + '\n')

                    # Run MLST
                    if not args.skipMLST:
                        run_successfully, pass_qc, time_taken, failing, warning = \
                            mlst.runMlst(contigs, scheme, outdir, species_genus, mlst_scheme_genus)
                        runs['MLST'] = [run_successfully, pass_qc, time_taken, failing, warning, 'NA']
                    else:
                        print('--skipMLST set. Skipping MLST analysis')
                        runs['MLST'] = skipped

                    # Run Kraken
                    if args.runKraken:
                        print('\n'
                              '--runKraken set. Running Kraken for assembly')
                        run_successfully, pass_qc, time_taken, failing, warning, _ = \
                            kraken(species=args.speciesExpected, files_to_classify=[contigs], kraken_db=args.krakenDB,
                                   files_type='fasta', outdir=outdir, version_kraken=version_kraken_global,
                                   db_mem=args.krakenMemory, quick=args.krakenQuick,
                                   min_percent_covered=args.krakenMinCov, max_unclassified_frag=args.krakenMaxUnclass,
                                   min_base_quality=args.krakenMinQual, threads=threads)
                        runs['assembly_Kraken'] = [run_successfully, pass_qc, time_taken, failing, warning, 'NA']
                    else:
                        runs['assembly_Kraken'] = skipped

                    # Run insert_size
                    if args.runInsertSize:
                        print('\n'
                              '--runInsertSize set. Running insert_size')
                        run_successfully, _, time_taken, failing = \
                            insert_size(sample_name=sample_name, reference=contigs, fastq=fastq_files, outdir=outdir,
                                        threads=threads, dist=args.insertSizeDist)
                        runs['insert_size'] = [run_successfully, None, time_taken, failing, {}, 'NA']
                    else:
                        runs['insert_size'] = skipped
                else:
                    # The downstream steps are left out of "runs" here and are
                    # filled with not_run by the loop after this if/else.
                    print('SPAdes did not run successfully! Skipping Pilon correction, Assembly Mapping check,'
                          ' MLST and Kraken (assembly) analysis and insert size determination')
            else:
                print('--skipSPAdes set. Skipping SPAdes, Pilon correction, Assembly Mapping check and MLST and Kraken'
                      ' (assembly) analysis and insert size determination')
                runs['SPAdes'] = skipped
                runs['Assembly_Mapping'] = skipped
                runs['Pilon'] = skipped
                runs['MLST'] = skipped
                runs['assembly_Kraken'] = skipped
                runs['insert_size'] = skipped
    else:
        print('Moving to the next sample')

    # Any step that no branch above recorded is reported as "Not run", so the
    # lookups in the QA/QC summary below never hit a missing key.
    for step in ('reads_Kraken', 'first_Coverage', 'trueCoverage_ReMatCh', 'first_FastQC', 'Trimmomatic',
                 'second_Coverage', 'second_FastQC', 'Pear', 'SPAdes', 'Assembly_Mapping', 'Pilon', 'MLST',
                 'assembly_Kraken', 'insert_size'):
        if step not in runs:
            runs[step] = not_run

    # Remove Pear directory
    if not args.pearKeepFiles and pear_folder is not None:
        utils.removeDirectory(pear_folder)

    # Remove Trimmomatic directory with cleaned reads
    if not args.trimKeepFiles and trimmomatic_folder is not None:
        utils.removeDirectory(trimmomatic_folder)

    # Check run
    # A step counts as successful when it ran (truthy) or was not executed (None)
    run_successfully = all(runs[step][0] or runs[step][0] is None for step in runs)

    # QA/QC summary: only an explicit False fails a module. For coverage and
    # FastQC the second-run result is preferred and the first-run result is
    # used when the second one is None.
    pass_fastq_integrity = runs['FastQ_Integrity'][0]
    pass_reads_kraken = runs['reads_Kraken'][1] is not False or args.krakenIgnoreQC
    pass_cov = (runs['second_Coverage'][1] or
                (runs['second_Coverage'][1] is None and runs['first_Coverage'][1])) is not False
    pass_true_cov = runs['trueCoverage_ReMatCh'][1] is not False or args.trueCoverageIgnoreQC
    pass_fastqc = (runs['second_FastQC'][1] or
                   (runs['second_FastQC'][1] is None and runs['first_FastQC'][1])) is not False
    # pass_trimmomatic = runs['Trimmomatic'][1] is not False
    # pass_pear = runs['Pear'][1] is not False
    # pass_spades = runs['SPAdes'][1] is not False or runs['Assembly_Mapping'][1] is True
    pass_spades = runs['SPAdes'][1] is not False
    pass_assembly_mapping = runs['Assembly_Mapping'][1] is not False
    # Pilon has no QA/QC flag (index 1 is None), so its run status is used
    pass_pilon = runs['Pilon'][0] is not False
    pass_mlst = runs['MLST'][1] is not False or args.mlstIgnoreQC
    pass_assembly_kraken = runs['assembly_Kraken'][1] is not False or args.krakenIgnoreQC
    pass_qc = all([pass_fastq_integrity, pass_reads_kraken, pass_cov, pass_true_cov, pass_fastqc, pass_spades,
                   pass_assembly_mapping, pass_pilon, pass_mlst, pass_assembly_kraken])

    return run_successfully, pass_qc, runs
def blast_subcommand(args):
    """Create the Blast DB(s) for the ``blast`` subcommand and exit.

    The reference fasta files come either from ``args.fasta`` (in which case
    ``args.type`` must be provided) or from the bundled configuration for
    ``args.org``, whose DB type is forced to ``nucl``. One Blast DB per fasta
    is created inside ``args.outdir``, named after the fasta file.

    This function never returns: it ends with ``sys.exit(0)`` when every DB
    was created, or ``sys.exit(<messages>)`` listing every fasta that failed.
    Invalid option combinations are reported through
    ``argparse.ArgumentParser.error``, which also exits.

    :param args: parsed command-line options; ``fasta``, ``type``, ``org`` and
        ``outdir`` are read, ``fasta`` and ``type`` are overwritten
    """
    msg = []
    if args.fasta is not None and args.type is None:
        msg.append('With --fasta option you must provide the --type')
    # if args.fasta is None and args.org is None:
    #     msg.append('--fasta or --org must be provided')

    if msg:
        argparse.ArgumentParser(prog='blast subcommand options').error('\n'.join(msg))

    utils.required_programs({'makeblastdb': ['-version', '>=', '2.6.0']})

    if args.fasta is not None:
        # argparse file objects -> absolute paths
        args.fasta = [os.path.abspath(fasta.name) for fasta in args.fasta]
    else:
        args.fasta, _ = get_fasta_config(args.org)
        if args.type != 'nucl':
            print('\n'
                  'ATTENTION: Blast DB type provided was not "nucl"\n'
                  'It was changed to "nucl"'
                  '\n')
        args.type = 'nucl'

    # Report the type that is actually handed to makeblastdb below rather than
    # a hard-coded value.
    print('\n'
          'Settings that will be used:\n'
          '    fasta: {reference}\n'
          '    Blast DB type: {db_type}\n'
          '\n'.format(reference=args.fasta, db_type=args.type))

    utils.removeDirectory(os.path.join(args.outdir, 'pickles', ''))

    error_msg = []
    for fasta in args.fasta:
        # Create DB
        blast_db = os.path.join(args.outdir, os.path.basename(fasta))
        db_exists, original_file = run_blast.check_db_exists(blast_db)
        if not db_exists and not original_file:
            db_exists = run_blast.create_blast_db(fasta, blast_db, args.type)
            if db_exists:
                print('Blast DB created for {file} in {outdir}'.format(file=fasta, outdir=args.outdir))
            else:
                error_msg.append('It was not possible to create Blast DB for {}'.format(fasta))
        elif db_exists and original_file:
            error_msg.append('Blast DB already found for {file} in {outdir} as {blast_db}'.format(
                file=fasta, outdir=args.outdir, blast_db=blast_db))
        else:
            # Only one of the two (DB files / original fasta) is present
            error_msg.append('It was found only Blast DB files or the original fasta file from which the Blast DB'
                             ' should be produced ({file}). Either include the missing files or remove the ones present'
                             ' (usually the original fasta file)'.format(file=fasta))

    if not error_msg:
        sys.exit(0)
    sys.exit('\n'.join(error_msg))
def main():
    """Command-line entry point of ecoli_stx_subtyping.py.

    Extends the parsers returned by ``python_arguments`` with the stx2
    specific thresholds, validates the percentage options, runs the selected
    subcommand (``args.func``), parses its results and writes the
    ``stx1:stx2`` subtype to ``seq_typing.ecoli_stx_subtyping.txt`` in the
    output directory. Temporary folders are removed unless ``--debug`` is set.
    """
    program_name = 'ecoli_stx_subtyping.py'

    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 {}"'.format(
            program_name))

    parser, parser_reads, _, parser_assembly, _ = python_arguments(
        program_name=program_name, version=version)
    parser.description = 'Gets E. coli stx subtypes'

    # Add specific arguments (the same two options on both subcommands)
    for subparser in (parser_reads, parser_assembly):
        subparser.add_argument(
            '--stx2covered', type=float, metavar='N',
            help='Minimal percentage of sequence covered to consider extra stx2'
                 ' subtypes (value between [0, 100]) (default: 100)',
            required=False, default=100)
        subparser.add_argument(
            '--stx2identity', type=float, metavar='N',
            help='Minimal sequence identity to consider extra stx2'
                 ' subtypes (value between [0, 100]) (default: 99.5)',
            required=False, default=99.5)

    args = parser.parse_args()

    # Validate every percentage-like option, then the organism selection
    problems = []
    for option in ('minGeneCoverage', 'minGeneIdentity',
                   'stx2covered', 'stx2identity'):
        value = getattr(args, option)
        if value < 0 or value > 100:
            problems.append(
                '--{} should be a value between [0, 100]'.format(option))
    if args.org != ['stx', 'subtyping']:
        problems.append(
            'Use "--org stx subtyping" with {}'.format(program_name))

    if problems:
        argparse.ArgumentParser(prog='{} options'.format(program_name)).error(
            '\n'.join(problems))

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)
    utils.general_information(script_name=program_name,
                              logfile=logfile,
                              version=version,
                              outdir=args.outdir,
                              time_str=time_str)
    print('\n')

    # Create modules pickles folder (always scheduled for removal)
    pickles_folder = os.path.join(args.outdir, 'pickles', '')
    if not os.path.isdir(pickles_folder):
        os.makedirs(pickles_folder)
    folders_2_remove = [pickles_folder]

    # Run functions
    (folders_2_remove_func, references_results, reference,
     references_headers) = args.func(args)
    folders_2_remove.extend(folders_2_remove_func)

    # Parse results
    _, _, _, _, _ = parse_results.parse_results(
        references_results, reference, references_headers, args.outdir,
        args.minGeneCoverage, args.minDepthCoverage, args.typeSeparator)

    types_report = os.path.join(args.outdir, 'seq_typing.report_types.tab')

    def first_reference(tag):
        # First reference file whose basename contains the tag (case-insensitive)
        matches = [ref_file for ref_file in reference
                   if tag in os.path.basename(ref_file).lower()]
        return matches[0]

    stx1_reference = first_reference('stx1')
    stx2_reference = first_reference('stx2')
    stx1_result, stx2_result = stx_subtype_parser(
        types_report, stx1_reference, stx2_reference,
        args.stx2covered, args.stx2identity)

    # Rename the file to keep ecoli_stx_subtyping stamp
    if os.path.isfile(types_report):
        os.rename(
            types_report,
            os.path.join(args.outdir,
                         'seq_typing.ecoli_stx_subtyping.report_types.tab'))

    # Remove the file to only keep the ecoli_stx_subtyping one
    general_report = os.path.join(args.outdir, 'seq_typing.report.txt')
    if os.path.isfile(general_report):
        os.remove(general_report)

    print('\n'
          'E. coli stx_subtyping - {stx1_result}:{stx2_result}\n'
          '\n'.format(stx1_result=stx1_result, stx2_result=stx2_result))

    subtype_file = os.path.join(args.outdir,
                                'seq_typing.ecoli_stx_subtyping.txt')
    with open(subtype_file, 'wt') as writer:
        writer.write(':'.join((stx1_result, stx2_result)))

    if not args.debug:
        for folder in folders_2_remove:
            utils.removeDirectory(folder)

    utils.runTime(start_time)