def runTrimmomatic(jar_path_trimmomatic, sampleName, outdir, threads, adaptersFasta, script_path,
                   doNotSearchAdapters, fastq_files, maxReadsLength, doNotTrimCrops, crop, headCrop, leading,
                   trailing, slidingWindow, minLength, nts2clip_based_ntsContent, jarMaxMemory, fastq_encoding):
    """Run Trimmomatic in a freshly created ``<outdir>/trimmomatic/`` folder.

    Any existing folder with that name is removed first; every trimming
    argument is forwarded unchanged to ``trimmomatic``.

    Returns:
        tuple: ``(run_successfully, not_empty_fastq, failing, paired_reads,
        trimmomatic_folder, fileSize, warnings)``. When the run fails,
        ``failing['sample']`` is ``'Did not run'``, ``paired_reads`` is
        ``None`` and ``fileSize`` is ``'NA'``. When it succeeds, ``fileSize``
        is the summed ``os.path.getsize`` of the files returned by
        ``getTrimmomaticPairedReads`` and ``warnings['sample']`` is set if
        ``controlForZeroReads`` reports no reads.
    """
    # Create Trimmomatic output directory (always starting from an empty one)
    trimmomatic_folder = os.path.join(outdir, 'trimmomatic', '')
    utils.removeDirectory(trimmomatic_folder)
    os.mkdir(trimmomatic_folder)

    run_successfully = trimmomatic(jar_path_trimmomatic, sampleName, trimmomatic_folder, threads, adaptersFasta,
                                   script_path, doNotSearchAdapters, fastq_files, maxReadsLength, doNotTrimCrops,
                                   crop, headCrop, leading, trailing, slidingWindow, minLength,
                                   nts2clip_based_ntsContent, jarMaxMemory, fastq_encoding)

    if not run_successfully:
        failing = {'sample': 'Did not run'}
        print(failing['sample'])
        return run_successfully, False, failing, None, trimmomatic_folder, 'NA', {}

    paired_reads = getTrimmomaticPairedReads(trimmomatic_folder)
    not_empty_fastq = controlForZeroReads(paired_reads)

    # Total on-disk size of the paired reads files found in the output folder
    fileSize = sum(os.path.getsize(fastq) for fastq in paired_reads)

    warnings = {}
    if not not_empty_fastq:
        # Zero reads is reported as a warning here, not as a failure
        warnings['sample'] = 'Zero reads after Trimmomatic'
        print(warnings['sample'])

    return run_successfully, not_empty_fastq, {'sample': False}, paired_reads, trimmomatic_folder, fileSize, warnings
def runTrimmomatic(jar_path_trimmomatic, sampleName, outdir, threads, adaptersFasta, script_path,
                   doNotSearchAdapters, fastq_files, maxReadsLength, doNotTrimCrops, crop, headCrop, leading,
                   trailing, slidingWindow, minLength, nts2clip_based_ntsContent, jarMaxMemory):
    """Run Trimmomatic in a freshly created ``<outdir>/trimmomatic/`` folder.

    Any existing folder with that name is removed first; every trimming
    argument is forwarded unchanged to ``trimmomatic``.

    Returns:
        tuple: ``(run_successfully, not_empty_fastq, failing, paired_reads,
        trimmomatic_folder, fileSize)``. ``failing['sample']`` is ``False``
        on a clean run, ``'Zero reads after Trimmomatic'`` when
        ``controlForZeroReads`` reports no reads, and ``'Did not run'`` when
        Trimmomatic itself failed (in which case ``paired_reads`` is ``None``
        and ``fileSize`` is ``'NA'``).
    """
    failing = {'sample': False}

    # Create Trimmomatic output directory (always starting from an empty one)
    trimmomatic_folder = os.path.join(outdir, 'trimmomatic', '')
    utils.removeDirectory(trimmomatic_folder)
    os.mkdir(trimmomatic_folder)

    run_successfully = trimmomatic(jar_path_trimmomatic, sampleName, trimmomatic_folder, threads, adaptersFasta,
                                   script_path, doNotSearchAdapters, fastq_files, maxReadsLength, doNotTrimCrops,
                                   crop, headCrop, leading, trailing, slidingWindow, minLength,
                                   nts2clip_based_ntsContent, jarMaxMemory)

    if not run_successfully:
        failing['sample'] = 'Did not run'
        print(failing['sample'])
        return run_successfully, False, failing, None, trimmomatic_folder, 'NA'

    paired_reads = getTrimmomaticPairedReads(trimmomatic_folder)
    not_empty_fastq = controlForZeroReads(paired_reads)

    # Total on-disk size of the paired reads files found in the output folder
    fileSize = sum(os.path.getsize(fastq) for fastq in paired_reads)

    if not not_empty_fastq:
        # In this variant an empty output counts as a failure of the sample
        failing['sample'] = 'Zero reads after Trimmomatic'
        print(failing['sample'])

    return run_successfully, not_empty_fastq, failing, paired_reads, trimmomatic_folder, fileSize
def runSpades(sampleName, outdir, threads, fastq_files, notUseCareful, maxMemory, minCoverageAssembly,
              minContigsLength, estimatedGenomeSizeMb, kmers, maximumReadsLength, defaultKmers, minCoverageContigs,
              assembled_se_reads, saveExcludedContigs, maxNumberContigs):
    """Assemble reads with SPAdes and write the (optionally filtered) contigs to ``outdir``.

    The assembly runs inside a fresh ``<outdir>/spades/`` folder, which is
    removed again before returning. On success the raw assembly is copied to
    ``<outdir>/SPAdes_original_assembly.contigs.fasta`` and a second FASTA is
    written by ``write_filtered_sequences_and_stats``; its name ends in
    ``.<suffix>.fasta`` when ``decide_filter_parameters`` returns a suffix and
    in ``.original.fasta`` otherwise.

    Returns:
        tuple: ``(run_successfully, pass_qc, failing, contigs, warnings)``.
        ``contigs`` is the path of the written FASTA, or ``None`` when SPAdes
        did not run; ``pass_qc`` is ``False`` only in that failure case.
    """
    failing = {'sample': False}
    warnings = {}

    # Create SPAdes output directory
    spades_folder = os.path.join(outdir, 'spades', '')
    utils.removeDirectory(spades_folder)
    os.mkdir(spades_folder)

    # Determine k-mers to run (an empty list lets SPAdes pick its defaults)
    if defaultKmers:
        kmers = []
    else:
        kmers = define_kmers(kmers, maximumReadsLength)
        if len(kmers) == 0:
            print('SPAdes will use its default k-mers')
        else:
            print('SPAdes will use the following k-mers: ' + str(kmers))

    run_successfully, contigs = spades(spades_folder, threads, fastq_files, notUseCareful, maxMemory,
                                       minCoverageAssembly, kmers, assembled_se_reads)

    if run_successfully:
        pass_qc = True

        # Keep a copy of the untouched assembly next to the final results
        original_copy = os.path.join(outdir, str('SPAdes_original_assembly.contigs.fasta'))
        shutil.copyfile(contigs, original_copy)

        # Temporary sample-named link, so derived file names carry the sample name
        contigs_link = os.path.join(outdir, str(sampleName + '.contigs.fasta'))
        os.symlink(contigs, contigs_link)

        minContigsLength = define_minContigsLength(maximumReadsLength, minContigsLength)
        sequence_dict = get_SPAdes_sequence_information(contigs_link)

        warnings, sequence_dict, filtered_sequences_sufix, spades_report_general = decide_filter_parameters(
            sequence_dict, minContigsLength, minCoverageContigs, estimatedGenomeSizeMb, maxNumberContigs)

        link_stem = os.path.splitext(contigs_link)[0]
        if filtered_sequences_sufix is not None:
            filtered_sequence_file = link_stem + '.' + filtered_sequences_sufix + '.fasta'
            write_filtered_sequences_and_stats(sequence_dict, spades_report_general, contigs_link,
                                               filtered_sequence_file, sampleName, False, saveExcludedContigs)
        else:
            filtered_sequence_file = link_stem + '.original.fasta'
            write_filtered_sequences_and_stats(sequence_dict, spades_report_general, contigs_link,
                                               filtered_sequence_file, sampleName, True, False)
        contigs = filtered_sequence_file

        os.remove(contigs_link)
    else:
        failing['sample'] = 'Did not run'
        print(failing['sample'])
        contigs = None
        pass_qc = False

    # The working folder is dropped whatever the outcome
    utils.removeDirectory(spades_folder)

    return run_successfully, pass_qc, failing, contigs, warnings
def runTrueCoverage(sample, fastq, reference, threads, outdir, extra_seq, min_cov_presence, min_cov_call,
                    min_frequency_dominant_allele, min_gene_coverage, debug, min_gene_identity,
                    true_coverage_config, rematch_script, conserved_true=True, num_map_loc=1):
    """Run the ReMatCh module against ``reference`` and assess the trueCoverage QC.

    Work happens in a fresh ``<outdir>/trueCoverage/`` folder, removed at the
    end unless ``debug`` is truthy. ``rematch_module`` is imported from the
    ``modules`` folder sitting next to ``rematch_script`` (appended to
    ``sys.path``).

    Returns:
        tuple: ``(run_successfully, pass_qc, failing)``. ``pass_qc`` is
        ``True`` only when the ``failing`` mapping came back empty, in which
        case ``failing['sample']`` is set to ``False``.
    """
    true_coverage_folder = os.path.join(outdir, 'trueCoverage', '')
    utils.removeDirectory(true_coverage_folder)
    os.mkdir(true_coverage_folder)

    # ReMatCh ships its module alongside the script, so make it importable
    sys.path.append(os.path.join(os.path.dirname(rematch_script), 'modules', ''))
    import rematch_module

    # Run ReMatCh
    reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(
        reference, true_coverage_folder, extra_seq, rematch_module)
    time_taken, run_successfully, data_by_gene, sample_data_general, consensus_files, consensus_sequences = \
        rematch_module.runRematchModule(sample, fastq, reference_file, threads, true_coverage_folder, extra_seq,
                                        min_cov_presence, min_cov_call, min_frequency_dominant_allele,
                                        min_gene_coverage, conserved_true, debug, num_map_loc, min_gene_identity,
                                        'first', 7, 'none', reference_dict, 'X', None, gene_list_reference, True)

    if run_successfully:
        failing = rematch_report_assess_failing(outdir, None, true_coverage_folder, sample_data_general,
                                                true_coverage_config)
    else:
        failing = {'sample': 'Did not run'}

    pass_qc = len(failing) == 0
    if pass_qc:
        failing['sample'] = False
    else:
        print(failing)

    if not debug:
        utils.removeDirectory(true_coverage_folder)

    return run_successfully, pass_qc, failing
def sequence_data(reference_file, bam_file, outdir, threads, length_extra_seq, minimum_depth_presence,
                  minimum_depth_call, minimum_depth_frequency_dominant_allele):
    """Analyse every reference sequence in parallel and gather the per-gene results.

    A fresh ``<outdir>/sequence_data/`` folder is created with one sub-folder
    per key of the object returned by ``get_sequence_information``; each key
    is submitted as one ``analyse_sequence_data`` task to a pool of
    ``threads`` worker processes.

    Returns:
        tuple: ``(run_successfully, sample_data)`` as produced by
        ``gather_gene_data_together``.
    """
    sequence_data_outdir = os.path.join(outdir, 'sequence_data', '')
    utils.removeDirectory(sequence_data_outdir)
    os.makedirs(sequence_data_outdir)

    sequences = get_sequence_information(reference_file)

    # Trailing arguments that are identical for every submitted task
    shared_args = (reference_file, length_extra_seq, minimum_depth_presence, minimum_depth_call,
                   minimum_depth_frequency_dominant_allele,)

    pool = multiprocessing.Pool(processes=threads)
    for sequence_counter in sequences:
        sequence_dir = os.path.join(sequence_data_outdir, str(sequence_counter), '')
        utils.removeDirectory(sequence_dir)
        os.makedirs(sequence_dir)
        task_args = (bam_file, sequences[sequence_counter], sequence_dir, sequence_counter,) + shared_args
        pool.apply_async(analyse_sequence_data, args=task_args)
    pool.close()
    pool.join()

    run_successfully, sample_data = gather_gene_data_together(sequence_data_outdir, sequences)

    return run_successfully, sample_data
def sample_coverage(referenceFile, alignment_file, outdir, threads):
    """Compute the mean coverage of each reference sequence in parallel.

    One ``get_sequence_coverage`` task per header returned by
    ``sequenceHeaders`` is run on a pool of ``threads`` processes, writing
    into a fresh ``<outdir>/samtools_depth/`` folder. The
    ``coverage.sequence_*.pkl`` files found there afterwards are read and
    deleted.

    Returns:
        tuple: ``(sample_coverage_no_problems, mean_coverage_data)``. The
        dict maps each sequence to its ``position``, ``coverage`` and
        ``mean_coverage`` (``coverage / position`` rounded to 2 decimals).
        Once one pickle reports a problem the flag turns ``False`` and the
        remaining pickles are deleted without being read.
    """
    coverage_outdir = os.path.join(outdir, 'samtools_depth', '')
    utils.removeDirectory(coverage_outdir)
    os.makedirs(coverage_outdir)

    sequences = sequenceHeaders(referenceFile)

    pool = multiprocessing.Pool(processes=threads)
    for counter, sequence in enumerate(sequences):
        pool.apply_async(get_sequence_coverage, args=(alignment_file, sequence, coverage_outdir, counter,))
    pool.close()
    pool.join()

    sample_coverage_no_problems = True
    mean_coverage_data = {}
    for file_found in os.listdir(coverage_outdir):
        file_path = os.path.join(coverage_outdir, file_found)
        if file_found.startswith('.') or not os.path.isfile(file_path):
            continue
        if not (file_found.startswith('coverage.sequence_') and file_found.endswith('.pkl')):
            continue

        if sample_coverage_no_problems:
            sequence_to_analyse, run_successfully, problems_found, position, coverage = \
                utils.extractVariableFromPickle(file_path)
            if run_successfully and not problems_found:
                mean_coverage_data[sequence_to_analyse] = {
                    'position': position,
                    'coverage': coverage,
                    'mean_coverage': round((float(coverage) / float(position)), 2)
                }
            else:
                print('WARNING: it was not possible to compute coverage information for sequence ' +
                      sequence_to_analyse)
                sample_coverage_no_problems = False

        # Result pickles are consumed even after a problem has been found
        os.remove(file_path)

    return sample_coverage_no_problems, mean_coverage_data
def runDownload(ena_id, download_paired_type, asperaKey, outdir, download_cram_bam_True, threads,
                instrument_platform):
    """Download the read files of one ENA run into ``outdir``.

    Files are fetched into a temporary ``<outdir>/download/`` folder (removed
    before returning) and then handed to ``rename_move_files``. The download
    is attempted only when the run's ``instrument_platform`` matches
    ``instrument_platform`` (or that is ``'all'``) and its ``library_layout``
    matches ``download_paired_type`` (or that is ``'both'``); both
    comparisons are case-insensitive.

    Returns:
        tuple: ``(run_successfully, downloaded_files,
        sequencingInformation)``. If ``getReadRunInfo`` returns ``None`` the
        result is ``(False, None, <dict whose values are all None>)``.
    """
    download_dir = os.path.join(outdir, 'download', '')
    utils.removeDirectory(download_dir)
    os.mkdir(download_dir)

    run_successfully = False
    downloaded_files = None
    info_fields = ('run_accession', 'instrument_platform', 'instrument_model', 'library_layout',
                   'library_source', 'extra_run_accession', 'date_download')
    sequencingInformation = dict((field, None) for field in info_fields)

    readRunInfo = getReadRunInfo(ena_id)
    if readRunInfo is not None:
        downloadInformation = check_correct_links(getDownloadInformation(readRunInfo))
        sequencingInformation = getSequencingInformation(readRunInfo)
        sequencingInformation['date_download'] = time.strftime("%Y-%m-%d")

        requested_platform = instrument_platform.lower()
        if requested_platform == 'all' or \
                sequencingInformation['instrument_platform'].lower() == requested_platform:
            requested_layout = download_paired_type.lower()
            if requested_layout == 'both' or sequencingInformation['library_layout'].lower() == requested_layout:
                run_successfully, cram_index_run_successfully = downloadFiles(
                    downloadInformation, sequencingInformation, download_paired_type, asperaKey, download_dir,
                    download_cram_bam_True)
                if run_successfully:
                    run_successfully, downloaded_files = get_fastq_files(
                        download_dir, cram_index_run_successfully, threads,
                        sequencingInformation['library_layout'])
                if run_successfully and downloaded_files is not None:
                    run_successfully, downloaded_files = rename_move_files(
                        downloaded_files, sequencingInformation['run_accession'], outdir,
                        sequencingInformation['library_layout'])

    utils.removeDirectory(download_dir)

    return run_successfully, downloaded_files, sequencingInformation
def runFastQCanalysis(outdir, threads, adaptersFasta, fastq_files, keepFiles, fastQC_run_name):
    """Run FastQC on ``fastq_files`` and summarise which reads pass.

    FastQC writes into a fresh ``<outdir>/fastqc_<fastQC_run_name>/`` folder,
    which is removed before returning unless ``keepFiles`` is truthy.

    A run is considered failed either when ``fastQC`` reports failure or when
    ``check_FastQC_runSuccessfully`` rejects its output. Both cases go
    through the same failure path, so the 'Did not run' message is printed
    and the ``keepFiles`` cleanup is applied for a rejected run too, instead
    of returning early and leaving the folder behind.

    Returns:
        tuple: ``(run_successfully, pass_qc, failing, warnings,
        maximumReadsLength, nts2clip_based_ntsContent)``. On failure
        ``failing['sample']`` is ``'Did not run'`` and the last two items are
        ``None``; ``pass_qc`` is ``True`` only when ``parseFastQC`` reports no
        bad reads.
    """
    pass_qc = False
    failing = {'sample': False}
    warnings = {}
    maximumReadsLength = None
    nts2clip_based_ntsContent = None

    # Create FastQC output directory
    fastqc_folder = os.path.join(outdir, str('fastqc_' + fastQC_run_name), '')
    utils.removeDirectory(fastqc_folder)
    os.mkdir(fastqc_folder)

    # Run FastQC
    run_successfully = fastQC(fastqc_folder, threads, adaptersFasta, fastq_files)
    if run_successfully:
        # Check whether FastQC really run_successfully
        run_successfully = check_FastQC_runSuccessfully(fastqc_folder, fastq_files)

    if run_successfully:
        # Check which reads pass FastQC
        goodReads, badReads, failing, warnings = parseFastQC(fastqc_folder, fastq_files)
        # Get reads information
        maximumReadsLength, moreFrequentReadsLength, numberReads, ntsContent_biasStatus = getReadsInformation(
            fastqc_folder, fastq_files)
        # Get number nucleotides to clip based on nucleotide content bias
        nts2clip_based_ntsContent = nts2clip(ntsContent_biasStatus)

        print("Number of reads found: " + str(numberReads))
        print("Maximum reads length found for both fastq files: " + str(maximumReadsLength) + " nts")
        print("Reads length class more frequently found in fastq files: " + str(moreFrequentReadsLength))
        if len(badReads) == 0:
            pass_qc = True
        elif len(badReads) > 0:
            print("Reads files FAILING FastQC control: " + str(badReads))
        if len(goodReads) > 0:
            print("Reads files passing FastQC control: " + str(goodReads))

        print('To improve reads quality, consider clipping the next number of nucleotides in the fastq files at 5 end and 3 end, respectively: ' + str(nts2clip_based_ntsContent))
    else:
        failing['sample'] = 'Did not run'
        print(failing['sample'])

    # Single exit point: cleanup must also happen for a rejected FastQC run
    if not keepFiles:
        utils.removeDirectory(fastqc_folder)

    return run_successfully, pass_qc, failing, warnings, maximumReadsLength, nts2clip_based_ntsContent
def runPilon(jar_path_pilon, assembly, fastq_files, threads, outdir, jarMaxMemory, alignment_file):
    """Polish ``assembly`` with Pilon, mapping the reads first when needed.

    Work happens in a fresh ``<outdir>/pilon/`` folder containing a symlink
    to the assembly. When ``alignment_file`` is ``None`` the reads are mapped
    with Bowtie2 and the alignment is sorted and indexed; otherwise the given
    alignment is used as is. On success the polished assembly is copied into
    ``outdir``. The alignment file, if it exists, is deleted before
    returning (this includes an alignment supplied by the caller).

    Returns:
        tuple: ``(run_successfully, None, failing, assembly_polished,
        pilon_folder)``. The second item is always ``None``;
        ``assembly_polished`` is the path inside ``outdir`` on success,
        otherwise whatever ``pilon`` returned, or ``None`` if it never ran.
    """
    failing = {'sample': False}

    pilon_folder = os.path.join(outdir, 'pilon', '')
    utils.removeDirectory(pilon_folder)
    os.mkdir(pilon_folder)

    # Create a symbolic link to the assembly
    assembly_link = os.path.join(pilon_folder, os.path.basename(assembly))
    os.symlink(assembly, assembly_link)

    run_successfully = True

    if alignment_file is None:
        # Index assembly using Bowtie2
        run_successfully = indexSequenceBowtie2(assembly_link, threads)

        if run_successfully:
            run_successfully, sam_file = mappingBowtie2(fastq_files, assembly_link, threads, pilon_folder)

            if run_successfully:
                alignment_file = os.path.splitext(sam_file)[0] + '.bam'
                run_successfully, alignment_file = sortAlignment(sam_file, alignment_file, False, threads)

                if run_successfully:
                    os.remove(sam_file)
                    run_successfully = indexAlignment(alignment_file)

    assembly_polished = None

    if run_successfully:
        run_successfully, assembly_polished = pilon(jar_path_pilon, assembly_link, alignment_file, pilon_folder,
                                                    jarMaxMemory)

        if run_successfully:
            parsePilonResult(assembly_polished, outdir)
            polished_in_outdir = os.path.join(outdir, os.path.basename(assembly_polished))
            shutil.copyfile(assembly_polished, polished_in_outdir)
            assembly_polished = polished_in_outdir

    # alignment_file is still None when Bowtie2 indexing or mapping failed;
    # os.path.isfile(None) raises TypeError, which would hide the real failure
    # from the caller, so only probe the path when there is one.
    if alignment_file is not None and os.path.isfile(alignment_file):
        os.remove(alignment_file)

    if not run_successfully:
        failing['sample'] = 'Did not run'
        print(failing['sample'])

    return run_successfully, None, failing, assembly_polished, pilon_folder
def run_true_coverage(sample, fastq, reference, threads, outdir, extra_seq, min_cov_presence, min_cov_call,
                      min_frequency_dominant_allele, min_gene_coverage, debug, min_gene_identity,
                      true_coverage_config, rematch_script, num_map_loc=1,
                      bowtie_algorithm='--very-sensitive-local', clean_run_rematch=True):
    """Run the ReMatCh module against ``reference`` and assess the trueCoverage QC.

    Work happens in a fresh ``<outdir>/trueCoverage/`` folder, removed at the
    end unless ``debug`` is truthy. ``rematch_module`` is imported from the
    ``modules`` folder sitting next to ``rematch_script`` (appended to
    ``sys.path``).

    Returns:
        tuple: ``(run_successfully, pass_qc, failing, sample_data_general)``.
        ``pass_qc`` is ``True`` only when the ``failing`` mapping came back
        empty, in which case ``failing['sample']`` is set to ``False``.
    """
    true_coverage_folder = os.path.join(outdir, 'trueCoverage', '')
    utils.removeDirectory(true_coverage_folder)
    os.mkdir(true_coverage_folder)

    # ReMatCh ships its module alongside the script, so make it importable
    rematch_modules_dir = os.path.join(os.path.dirname(rematch_script), 'modules')
    sys.path.append(rematch_modules_dir)
    import rematch_module

    # Run ReMatCh
    reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(
        reference, true_coverage_folder, extra_seq, rematch_module)
    rematch_results = rematch_module.run_rematch_module(
        sample, fastq, reference_file, threads, true_coverage_folder, extra_seq, min_cov_presence, min_cov_call,
        min_frequency_dominant_allele, min_gene_coverage, debug, num_map_loc, min_gene_identity, 'first', 7,
        'none', reference_dict, 'X', bowtie_algorithm, None, gene_list_reference, True,
        clean_run=clean_run_rematch)
    time_taken, run_successfully, data_by_gene, sample_data_general, consensus_files, consensus_sequences = \
        rematch_results

    if run_successfully:
        failing = rematch_report_assess_failing(outdir, None, true_coverage_folder, sample_data_general,
                                                true_coverage_config)
    else:
        failing = {'sample': 'Did not run'}

    pass_qc = len(failing) == 0
    if pass_qc:
        failing['sample'] = False
    else:
        print(failing)

    if not debug:
        utils.removeDirectory(true_coverage_folder)

    return run_successfully, pass_qc, failing, sample_data_general
def runFastQintegrity(fastq_files, threads, outdir):
    """Check FASTQ integrity in parallel and summarise encoding and read lengths.

    One ``fastQintegrity`` task per file runs on a pool of ``threads``
    processes inside a fresh ``<outdir>/fastQintegrity/`` folder (removed
    before returning). Each ``.pkl`` result found there either contributes
    its encoding and read-length data or marks the file as possibly corrupt.

    Returns:
        tuple: ``(not_corruption_found, pass_qc, failing, encoding,
        min_reads_length_found, max_reads_length_found)``. The first two are
        ``False`` when any file was flagged, in which case ``failing`` holds
        one entry per flagged file and no ``'sample'`` key. ``encoding`` is
        the single encoding shared by all files, or ``None`` when there is
        no result or the files disagree.
    """
    failing = {'sample': False}

    fastQintegrity_folder = os.path.join(outdir, 'fastQintegrity', '')
    utils.removeDirectory(fastQintegrity_folder)
    os.mkdir(fastQintegrity_folder)

    pool = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        pool.apply_async(fastQintegrity, args=(fastq, fastQintegrity_folder,))
    pool.close()
    pool.join()

    encoding = {}
    for file_found in os.listdir(fastQintegrity_folder):
        file_path = os.path.join(fastQintegrity_folder, file_found)
        if file_found.startswith('.') or not os.path.isfile(file_path):
            continue
        if file_found.endswith('.pkl'):
            file_run_successfully, file_encoding, min_reads_length, max_reads_length = \
                utils.extractVariableFromPickle(file_path)
            if file_run_successfully:
                encoding[file_found] = {'file_encoding': file_encoding,
                                        'min_reads_length': min_reads_length,
                                        'max_reads_length': max_reads_length}
            else:
                fastq_name = os.path.splitext(file_found)[0]
                failing[fastq_name] = ['The file is possibly corrupt']
                print(fastq_name + ': the file is possibly corrupt')
        os.remove(file_path)

    # Any key besides the initial 'sample' one means a flagged file
    corruption_found = len(failing) > 1
    if corruption_found:
        failing.pop('sample')

    min_reads_length_found, max_reads_length_found = None, None

    if len(encoding) == 0:
        encoding = None
        print('It was no possible to determine the FASTQ encodings')
    else:
        min_reads_length_found, max_reads_length_found, min_reads_length_each_fastq, max_reads_length_each_fastq = \
            guess_encoding.determine_min_max_reads_length(encoding)
        report_reads_length(min_reads_length_each_fastq, max_reads_length_each_fastq, outdir)

        encodings_found = [x['file_encoding'][0] for x in encoding.values() if x['file_encoding'] is not None]
        if len(set(encodings_found)) == 1:
            encoding = encodings_found[0]
            print('Fastq quality encoding: {0}'.format(str(encoding)))
        else:
            print('It was no possible to determine the FASTQ encodings')
            print('This was what has been found: {0}'.format(str(encoding)))
            encoding = None

    utils.removeDirectory(fastQintegrity_folder)

    return (not corruption_found, not corruption_found, failing, encoding, min_reads_length_found,
            max_reads_length_found)
def sample_coverage(referenceFile, alignment_file, outdir, threads):
    """Compute the mean coverage of each reference sequence in parallel.

    One ``get_sequence_coverage`` task per header returned by
    ``sequenceHeaders`` is run on a pool of ``threads`` processes, writing
    into a fresh ``<outdir>/samtools_depth/`` folder. The
    ``coverage.sequence_*.pkl`` files found there afterwards are read and
    deleted.

    Returns:
        tuple: ``(sample_coverage_no_problems, mean_coverage_data)``. The
        dict maps each sequence to its ``position``, ``coverage`` and
        ``mean_coverage`` (``coverage / position`` rounded to 2 decimals).
        Once one pickle reports a problem the flag turns ``False`` and the
        remaining pickles are deleted without being read.
    """
    coverage_outdir = os.path.join(outdir, 'samtools_depth', '')
    utils.removeDirectory(coverage_outdir)
    os.makedirs(coverage_outdir)

    sequences = sequenceHeaders(referenceFile)

    pool = multiprocessing.Pool(processes=threads)
    for task_number, sequence in enumerate(sequences):
        pool.apply_async(get_sequence_coverage, args=(alignment_file, sequence, coverage_outdir, task_number,))
    pool.close()
    pool.join()

    visible_files = [
        name for name in os.listdir(coverage_outdir)
        if not name.startswith('.') and os.path.isfile(os.path.join(coverage_outdir, name))
    ]
    result_pickles = [
        name for name in visible_files
        if name.startswith('coverage.sequence_') and name.endswith('.pkl')
    ]

    sample_coverage_no_problems = True
    mean_coverage_data = {}
    for pickle_name in result_pickles:
        file_path = os.path.join(coverage_outdir, pickle_name)

        if sample_coverage_no_problems:
            sequence_to_analyse, run_successfully, problems_found, position, coverage = \
                utils.extractVariableFromPickle(file_path)
            if run_successfully and not problems_found:
                mean_coverage = round((float(coverage) / float(position)), 2)
                mean_coverage_data[sequence_to_analyse] = {
                    'position': position,
                    'coverage': coverage,
                    'mean_coverage': mean_coverage
                }
            else:
                print('WARNING: it was not possible to compute coverage information for'
                      ' sequence ' + sequence_to_analyse)
                sample_coverage_no_problems = False

        # Result pickles are consumed even after a problem has been found
        os.remove(file_path)

    return sample_coverage_no_problems, mean_coverage_data
def runGetSeqENA(args):
    """Download every ENA run listed in ``args.listENAids`` and write a report.

    For each ID a sub-folder of ``args.outdir`` is created and
    ``download.run_download`` is called; the folder is removed again when
    the download fails. One tab-separated line per attempted ID is written
    to ``<outdir>/getSeqENA.report.txt``. When ``args.maximumSamples`` is not
    ``None``, processing stops once that many downloads have succeeded.

    Raises:
        SystemExit: when no ID could be downloaded.
    """
    start_time = time.time()

    listENA_IDs = utils.getListIDs(os.path.abspath(args.listENAids.name))
    outdir = os.path.abspath(args.outdir)
    utils.check_create_directory(outdir)
    asperaKey = args.asperaKey
    if asperaKey is not None:
        asperaKey = os.path.abspath(asperaKey.name)

    # Start logger
    logfile = utils.start_logger(outdir)

    # Get general information
    utils.general_information(logfile, version)

    # Check programms
    requiredPrograms(args)

    header_sequencing = ['run_accession', 'instrument_platform', 'instrument_model', 'library_layout',
                         'library_source', 'extra_run_accession', 'nominal_length', 'read_count', 'base_count',
                         'date_download']

    runs_successfully = 0
    with open(os.path.join(outdir, 'getSeqENA.report.txt'), 'wt') as writer:
        writer.write('#sample' + '\t' + '\t'.join(header_sequencing) + '\n')
        for ena_id in listENA_IDs:
            # No limit configured means every listed ID is attempted
            sample_limit = args.maximumSamples
            if sample_limit is not None and not runs_successfully < sample_limit:
                break

            print('\n' + 'Download ENA_ID ' + ena_id)

            ena_id_folder = os.path.join(outdir, ena_id)
            utils.check_create_directory(ena_id_folder)

            time_taken, run_successfully, fastq_files, sequencingInformation = download.run_download(
                ena_id, args.downloadLibrariesType, asperaKey, ena_id_folder, args.downloadCramBam, args.threads,
                args.downloadInstrumentPlatform, args.SRA, args.SRAopt)

            if run_successfully:
                runs_successfully += 1
            else:
                utils.removeDirectory(ena_id_folder)
                print(ena_id + ' was not downloaded')

            report_fields = [str(sequencingInformation[i]) for i in header_sequencing]
            writer.write(ena_id + '\t' + '\t'.join(report_fields) + '\n')

    # Called for whatever utils.runTime does itself; the returned value is not used
    utils.runTime(start_time)

    if runs_successfully == 0:
        sys.exit('No ENA_IDs were successfully downloaded!')
def runFastQintegrity(fastq_files, threads, outdir):
    """Check FASTQ integrity in parallel and summarise encoding and read lengths.

    One ``fastQintegrity`` task per file runs on a pool of ``threads``
    processes inside a fresh ``<outdir>/fastQintegrity/`` folder (removed
    before returning). Each ``.pkl`` result found there either contributes
    its encoding and read-length data or marks the file as possibly corrupt.

    Returns:
        tuple: ``(not_corruption_found, pass_qc, failing, encoding,
        min_reads_length, max_reads_length)``. The first two are ``False``
        when any file was flagged, in which case ``failing`` holds one entry
        per flagged file and no ``'sample'`` key. ``encoding`` is the single
        encoding shared by all files, or ``None`` when there is no result or
        the files disagree; the read lengths are ``None`` when there is no
        result.
    """
    failing = {'sample': False}

    fastQintegrity_folder = os.path.join(outdir, 'fastQintegrity', '')
    utils.removeDirectory(fastQintegrity_folder)
    os.mkdir(fastQintegrity_folder)

    pool = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        pool.apply_async(fastQintegrity, args=(fastq, fastQintegrity_folder,))
    pool.close()
    pool.join()

    encoding = {}
    for file_found in os.listdir(fastQintegrity_folder):
        file_path = os.path.join(fastQintegrity_folder, file_found)
        if file_found.startswith('.') or not os.path.isfile(file_path):
            continue
        if file_found.endswith('.pkl'):
            file_run_successfully, file_encoding, file_min_length, file_max_length = \
                utils.extractVariableFromPickle(file_path)
            if file_run_successfully:
                encoding[file_found] = {'file_encoding': file_encoding,
                                        'min_reads_length': file_min_length,
                                        'max_reads_length': file_max_length}
            else:
                fastq_name = os.path.splitext(file_found)[0]
                failing[fastq_name] = ['The file is possibly corrupt']
                print(fastq_name + ': the file is possibly corrupt')
        os.remove(file_path)

    # Any key besides the initial 'sample' one means a flagged file
    corruption_found = len(failing) > 1
    if corruption_found:
        failing.pop('sample')

    min_reads_length, max_reads_length = None, None

    if len(encoding) == 0:
        encoding = None
        print('It was no possible to determine the FASTQ encodings')
    else:
        min_reads_length, max_reads_length = guess_encoding.determine_min_max_reads_length(encoding)

        encodings_found = [x['file_encoding'][0] for x in encoding.values() if x['file_encoding'] is not None]
        if len(set(encodings_found)) == 1:
            encoding = encodings_found[0]
            print('Fastq quality encoding: ' + str(encoding))
        else:
            print('It was no possible to determine the FASTQ encodings')
            print('This was what has been found: ' + str(encoding))
            encoding = None

    utils.removeDirectory(fastQintegrity_folder)

    return not corruption_found, not corruption_found, failing, encoding, min_reads_length, max_reads_length
def run_guess_encoding_single_thread(fastq_file, number_reads_access_None_all, outdir):
    """Guess the encoding and read-length range of a single FASTQ file.

    ``guess_encoding`` works in a scratch folder named after the FASTQ file
    (base name without its last extension) under ``outdir``; the folder is
    created empty and removed again before returning.

    Returns:
        tuple: ``(final_encoding, min_reads_length, max_reads_length)``; the
        two extra values returned by ``determine_min_max_reads_length`` are
        discarded.
    """
    fastq_stem = os.path.splitext(os.path.basename(fastq_file))[0]
    scratch_dir = os.path.join(outdir, fastq_stem)
    utils.removeDirectory(scratch_dir)
    os.mkdir(scratch_dir)

    guess_encoding.guess_encoding(fastq_file, number_reads_access_None_all, scratch_dir)
    encoding_data = guess_encoding.gather_data_together(scratch_dir)

    final_encoding = guess_encoding.get_final_encoding(encoding_data)
    length_summary = guess_encoding.determine_min_max_reads_length(encoding_data)
    min_reads_length, max_reads_length, _, _ = length_summary

    utils.removeDirectory(scratch_dir)

    return final_encoding, min_reads_length, max_reads_length
def run_guess_encoding_single_thread(fastq_file, number_reads_access_None_all, outdir):
    """Guess the encoding and read-length range of a single FASTQ file.

    ``guess_encoding`` works in a scratch folder named after the FASTQ file
    (base name without its last extension) under ``outdir``; the folder is
    created empty and removed again before returning.

    Returns:
        tuple: ``(final_encoding, min_reads_length, max_reads_length)``.
    """
    fastq_stem = os.path.splitext(os.path.basename(fastq_file))[0]
    scratch_dir = os.path.join(outdir, fastq_stem)
    utils.removeDirectory(scratch_dir)
    os.mkdir(scratch_dir)

    guess_encoding.guess_encoding(fastq_file, number_reads_access_None_all, scratch_dir)
    encoding_data = guess_encoding.gather_data_together(scratch_dir)

    final_encoding = guess_encoding.get_final_encoding(encoding_data)
    length_summary = guess_encoding.determine_min_max_reads_length(encoding_data)
    min_reads_length, max_reads_length = length_summary

    utils.removeDirectory(scratch_dir)

    return final_encoding, min_reads_length, max_reads_length
def downloadAndINNUca(outdir, run_ID, asperaKey, threads):
    """Download one run with getSeqENA.py and, if that works, run INNUca.py on it.

    The run ID is passed to ``getSeqENA.py`` through a temporary one-line
    list file, deleted right after the download command returns. When the
    download succeeds, ``INNUca.py`` is run on ``<outdir>/<run_ID>/`` and the
    files of its ``<run_ID>`` sub-folder are moved one level up. Files
    matching ``.gz``, ``.log`` and ``.cpu.txt`` are then handed to
    ``removeFiles``.

    Nothing is returned; the INNUca success flag is pickled as
    ``<run_ID>_run_successfully`` and, on success, the elapsed time as
    ``<run_ID>_downloadAndINNUca_time``.
    """
    start_time = time.time()

    # getSeqENA.py takes its IDs from a list file, so write a one-line one
    temp_file = os.path.join(outdir, run_ID + '.temp.runID_fileList.txt')
    with open(temp_file, 'wt') as writer:
        writer.write(run_ID + '\n')

    download_command = ['getSeqENA.py', '-l', temp_file, '-o', outdir, '-a', asperaKey,
                        '--downloadLibrariesType', 'PE']
    getSeqENA_run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(download_command, False, None)

    os.remove(temp_file)

    sample_directory = os.path.join(outdir, run_ID, '')

    innuca_run_successfully = False
    if getSeqENA_run_successfully:
        innuca_command = ['INNUca.py', '-i', sample_directory, '-s', '"Campylobacter jejuni"', '-g', '1.6',
                          '-o', sample_directory, '-j', str(threads), '--jarMaxMemory', 'auto']
        innuca_run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(innuca_command, False, None)

        # Flatten the INNUca results: move its files up into the sample folder
        innuca_dir = os.path.join(sample_directory, run_ID, '')
        for file_innuca in os.listdir(innuca_dir):
            source_path = os.path.join(innuca_dir, file_innuca)
            if file_innuca.startswith('.') or not os.path.isfile(source_path):
                continue
            shutil.move(source_path, os.path.join(sample_directory, file_innuca))
        utils.removeDirectory(innuca_dir)

    for suffix in ('.gz', '.log', '.cpu.txt'):
        removeFiles(sample_directory, suffix)

    if innuca_run_successfully:
        time_taken = utils.runTime(start_time)
        utils.saveVariableToPickle(time_taken, sample_directory, run_ID + '_downloadAndINNUca_time')

    utils.saveVariableToPickle(innuca_run_successfully, sample_directory, run_ID + '_run_successfully')
def runFastQCanalysis(outdir, threads, adaptersFasta, fastq_files, keepFiles, fastQC_run_name):
    """Run FastQC on ``fastq_files`` and summarise which reads pass.

    FastQC writes into a fresh ``<outdir>/fastqc_<fastQC_run_name>/`` folder,
    which is removed before returning unless ``keepFiles`` is truthy.

    A run is considered failed either when ``fastQC`` reports failure or when
    ``check_FastQC_runSuccessfully`` rejects its output. Both cases go
    through the same failure path, so the 'Did not run' message is printed
    and the ``keepFiles`` cleanup is applied for a rejected run too, instead
    of returning early and leaving the folder behind.

    Returns:
        tuple: ``(run_successfully, pass_qc, failing, warnings,
        maximumReadsLength, nts2clip_based_ntsContent)``. On failure
        ``failing['sample']`` is ``'Did not run'`` and the last two items are
        ``None``; ``pass_qc`` is ``True`` only when ``parseFastQC`` reports no
        bad reads.
    """
    pass_qc = False
    failing = {'sample': False}
    warnings = {}
    maximumReadsLength = None
    nts2clip_based_ntsContent = None

    # Create FastQC output directory
    fastqc_folder = os.path.join(outdir, str('fastqc_' + fastQC_run_name), '')
    utils.removeDirectory(fastqc_folder)
    os.mkdir(fastqc_folder)

    # Run FastQC, then check whether it really produced usable output
    run_successfully = fastQC(fastqc_folder, threads, adaptersFasta, fastq_files)
    if run_successfully:
        run_successfully = check_FastQC_runSuccessfully(fastqc_folder, fastq_files)

    if not run_successfully:
        failing['sample'] = 'Did not run'
        print(failing['sample'])
    else:
        # Check which reads pass FastQC
        goodReads, badReads, failing, warnings = parseFastQC(fastqc_folder, fastq_files)
        # Get reads information
        maximumReadsLength, moreFrequentReadsLength, numberReads, ntsContent_biasStatus = getReadsInformation(
            fastqc_folder, fastq_files)
        # Get number nucleotides to clip based on nucleotide content bias
        nts2clip_based_ntsContent = nts2clip(ntsContent_biasStatus)

        print("Number of reads found: " + str(numberReads))
        print("Maximum reads length found for both fastq files: " + str(maximumReadsLength) + " nts")
        print("Reads length class more frequently found in fastq files: " + str(moreFrequentReadsLength))
        if len(badReads) == 0:
            pass_qc = True
        elif len(badReads) > 0:
            print("Reads files FAILING FastQC control: " + str(badReads))
        if len(goodReads) > 0:
            print("Reads files passing FastQC control: " + str(goodReads))

        print('To improve reads quality, consider clipping the next number of nucleotides in the fastq files at 5 end and 3 end, respectively: ' + str(nts2clip_based_ntsContent))

    # Single exit point: cleanup must also happen for a rejected FastQC run
    if not keepFiles:
        utils.removeDirectory(fastqc_folder)

    return run_successfully, pass_qc, failing, warnings, maximumReadsLength, nts2clip_based_ntsContent
def runPilon(jar_path_pilon, assembly, fastq_files, threads, outdir, jarMaxMemory, alignment_file):
    """Polish ``assembly`` with Pilon, mapping the reads first when needed.

    Work happens in a fresh ``<outdir>/pilon/`` folder containing a symlink
    to the assembly. When ``alignment_file`` is ``None`` the reads are mapped
    with Bowtie2 and the alignment is sorted and indexed; otherwise the given
    alignment is used as is. On success the polished assembly is copied into
    ``outdir``. The alignment file, if it exists, is deleted before
    returning (this includes an alignment supplied by the caller).

    Returns:
        tuple: ``(run_successfully, None, failing, assembly_polished,
        pilon_folder)``. The second item is always ``None``;
        ``assembly_polished`` is the path inside ``outdir`` on success,
        otherwise whatever ``pilon`` returned, or ``None`` if it never ran.
    """
    failing = {'sample': False}

    pilon_folder = os.path.join(outdir, 'pilon', '')
    utils.removeDirectory(pilon_folder)
    os.mkdir(pilon_folder)

    # Create a symbolic link to the assembly
    assembly_link = os.path.join(pilon_folder, os.path.basename(assembly))
    os.symlink(assembly, assembly_link)

    run_successfully = True

    if alignment_file is None:
        # Index assembly using Bowtie2
        run_successfully = indexSequenceBowtie2(assembly_link, threads)

        if run_successfully:
            run_successfully, sam_file = mappingBowtie2(fastq_files, assembly_link, threads, pilon_folder)

            if run_successfully:
                alignment_file = os.path.splitext(sam_file)[0] + '.bam'
                run_successfully, alignment_file = sortAlignment(sam_file, alignment_file, False, threads)

                if run_successfully:
                    os.remove(sam_file)
                    run_successfully = indexAlignment(alignment_file)

    assembly_polished = None

    if run_successfully:
        run_successfully, assembly_polished = pilon(jar_path_pilon, assembly_link, alignment_file, pilon_folder,
                                                    jarMaxMemory)

        if run_successfully:
            parsePilonResult(assembly_polished, outdir)
            final_assembly = os.path.join(outdir, os.path.basename(assembly_polished))
            shutil.copyfile(assembly_polished, final_assembly)
            assembly_polished = final_assembly

    # When Bowtie2 indexing or mapping fails no alignment path was ever set;
    # os.path.isfile(None) raises TypeError and would mask that failure, so
    # the existence check is skipped when there is no path.
    has_alignment = alignment_file is not None and os.path.isfile(alignment_file)
    if has_alignment:
        os.remove(alignment_file)

    if not run_successfully:
        failing['sample'] = 'Did not run'
        print(failing['sample'])

    return run_successfully, None, failing, assembly_polished, pilon_folder
def sequence_data(sample, reference_file, bam_file, outdir, threads, length_extra_seq, minimum_depth_presence,
                  minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, rematch):
    """Analyse every reference sequence in parallel using the ``rematch`` module.

    A fresh ``<outdir>/sequence_data/`` folder is created with one sub-folder
    per key of the ``sequences`` object returned by
    ``utils.get_sequence_information``; each key is submitted as one
    ``rematch.analyse_sequence_data`` task to a pool of ``threads`` worker
    processes. Each task also receives the value computed by
    ``rematch.determine_threads_2_use``.

    Returns:
        tuple: ``(run_successfully, sample_data, consensus_files,
        consensus_sequences)`` as produced by ``rematch.gather_data_together``.
    """
    import multiprocessing

    sequence_data_outdir = os.path.join(outdir, 'sequence_data', '')
    utils.removeDirectory(sequence_data_outdir)
    os.mkdir(sequence_data_outdir)

    # Only the sequences are needed here; the headers are not used
    sequences, _headers = utils.get_sequence_information(reference_file, length_extra_seq)

    threads_2_use = rematch.determine_threads_2_use(len(sequences), threads)

    # Trailing arguments that are identical for every submitted task
    shared_args = (reference_file, length_extra_seq, minimum_depth_presence, minimum_depth_call,
                   minimum_depth_frequency_dominant_allele, threads_2_use,)

    pool = multiprocessing.Pool(processes=threads)
    for sequence_counter in sequences:
        sequence_dir = os.path.join(sequence_data_outdir, str(sequence_counter), '')
        utils.removeDirectory(sequence_dir)
        os.makedirs(sequence_dir)
        task_args = (bam_file, sequences[sequence_counter], sequence_dir, sequence_counter,) + shared_args
        pool.apply_async(rematch.analyse_sequence_data, args=task_args)
    pool.close()
    pool.join()

    # Fourth argument: outdir with its last two '/'-separated components stripped
    gather_outdir = outdir.rsplit('/', 2)[0]
    run_successfully, sample_data, consensus_files, consensus_sequences = rematch.gather_data_together(
        sample, sequence_data_outdir, sequences, gather_outdir, debug_mode_true, length_extra_seq, False)

    return run_successfully, sample_data, consensus_files, consensus_sequences
def runFastQintegrity(fastq_files, threads, outdir):
    """Check the integrity of each FASTQ file in parallel.

    One ``fastQintegrity`` task per file runs on a pool of ``threads``
    processes inside a fresh ``<outdir>/fastQintegrity/`` folder (removed
    before returning). Each ``.pkl`` result holding a falsy value marks the
    corresponding file as possibly corrupt.

    Returns:
        tuple: ``(not_corruption_found, None, failing)``. The middle ``None``
        keeps the shape consistent with other steps. When any file was
        flagged, ``failing`` holds one entry per flagged file and no
        ``'sample'`` key.
    """
    failing = {'sample': False}

    fastQintegrity_folder = os.path.join(outdir, 'fastQintegrity', '')
    utils.removeDirectory(fastQintegrity_folder)
    os.mkdir(fastQintegrity_folder)

    pool = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        pool.apply_async(fastQintegrity, args=(fastq, fastQintegrity_folder,))
    pool.close()
    pool.join()

    for file_found in os.listdir(fastQintegrity_folder):
        file_path = os.path.join(fastQintegrity_folder, file_found)
        if file_found.startswith('.') or not os.path.isfile(file_path):
            continue
        if file_found.endswith('.pkl'):
            file_run_successfully = utils.extractVariableFromPickle(file_path)
            if not file_run_successfully:
                fastq_name = os.path.splitext(file_found)[0]
                failing[fastq_name] = ['The file is possibly corrupt']
                print(fastq_name + ': the file is possibly corrupt')
        os.remove(file_path)

    # Any key besides the initial 'sample' one means a flagged file
    corruption_found = len(failing) > 1
    if corruption_found:
        failing.pop('sample')

    utils.removeDirectory(fastQintegrity_folder)

    return not corruption_found, None, failing  # None added for consistency with other steps
def gather_gene_data_together(data_directory, sequences_information):
    """
    Collect the per-gene coverage information stored in the sub-directories of data_directory.

    Every visible sub-directory is scanned for 'coverage_info.*.pkl' files. Each pickle is unpacked into the
    run status and the gene statistics, which are stored under the gene counter. Once a pickle reports a
    failed run, the remaining pickles are skipped. Each sub-directory is removed after being scanned.

    Parameters
    ----------
    data_directory : str
        Directory holding one sub-directory per gene
    sequences_information : dict
        Gene counter -> dictionary with at least the 'header' key

    Returns
    -------
    run_successfully : bool
        False if any pickle reported a failure or if the number of gathered genes differs from
        len(sequences_information)
    sample_data : dict
        Gene counter -> dictionary with the keys 'header', 'gene_coverage', 'gene_low_coverage',
        'gene_number_positions_multiple_alleles' and 'gene_mean_read_coverage'
    """
    run_successfully = True
    genes_gathered = 0
    sample_data = {}

    for entry in os.listdir(data_directory):
        gene_dir_path = os.path.join(data_directory, entry, '')
        if entry.startswith('.') or not os.path.isdir(gene_dir_path):
            continue

        for file_found in os.listdir(gene_dir_path):
            file_path = os.path.join(gene_dir_path, file_found)
            if file_found.startswith('.') or not os.path.isfile(file_path):
                continue
            if not (file_found.startswith('coverage_info.') and file_found.endswith('.pkl')):
                continue
            # After the first reported failure no further pickle is read
            if not run_successfully:
                continue

            pickled_values = utils.extractVariableFromPickle(file_path)
            run_successfully, sequence_counter, multiple_alleles_found, percentage_absent, \
                percentage_lowCoverage, meanCoverage = pickled_values

            gene_data = {}
            gene_data['header'] = sequences_information[sequence_counter]['header']
            gene_data['gene_coverage'] = 100 - percentage_absent
            gene_data['gene_low_coverage'] = percentage_lowCoverage
            gene_data['gene_number_positions_multiple_alleles'] = multiple_alleles_found
            gene_data['gene_mean_read_coverage'] = meanCoverage
            sample_data[sequence_counter] = gene_data
            genes_gathered += 1

        utils.removeDirectory(gene_dir_path)

    if genes_gathered != len(sequences_information):
        run_successfully = False

    return run_successfully, sample_data
def runPear(fastq_files, threads, outdir, sampleName, fastq_encoding, trimmomatic_run_successfully,
            minimum_overlap_reads):
    """
    Decompress the reads, run run_pear on them and write the 'pear_report.txt' file.

    The fastq files are processed in parallel with compress_decompress into a fresh 'pear' directory.
    run_pear is only called when exactly two decompressed files are obtained. The decompressed files are
    removed afterwards.

    Parameters
    ----------
    fastq_files : list
        Paths of the fastq files
    threads : int
        Number of processes of the pool (also forwarded to run_pear)
    outdir : str
        Directory where the 'pear' directory and 'pear_report.txt' are created
    sampleName : str
        Forwarded to run_pear
    fastq_encoding
        Forwarded to run_pear
    trimmomatic_run_successfully
        Forwarded to run_pear
    minimum_overlap_reads
        Forwarded to run_pear

    Returns
    -------
    tuple
        (run_successfully, True, failing, unassembled_pe_reads, assembled_se_reads, pear_folder, warnings).
        The second value is always True and failing is always {'sample': False}; problems are reported
        through warnings, which gets {'sample': 'Did not run'} when the step did not run successfully.
    """
    failing = {'sample': False}
    warnings = {}
    assembled_se_reads = None
    unassembled_pe_reads = None

    pear_folder = os.path.join(outdir, 'pear', '')
    utils.removeDirectory(pear_folder)
    os.mkdir(pear_folder)

    workers = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        temp_name = str('temp.' + os.path.splitext(os.path.basename(fastq))[0])
        workers.apply_async(compress_decompress, args=(fastq, os.path.join(pear_folder, temp_name), False,))
    workers.close()
    workers.join()

    run_successfully, decompressed_reads = get_compressed_decompressed_reads(pear_folder)

    if run_successfully:
        if len(decompressed_reads) != 2:
            # run_pear is only used with a pair of files
            run_successfully = False
        else:
            run_successfully, pass_qc, warnings, assembled_se_reads, unassembled_pe_reads, assembled_reads, \
                unassembled_reads, discarded_reads = run_pear(decompressed_reads, sampleName, threads,
                                                              pear_folder, fastq_encoding,
                                                              trimmomatic_run_successfully,
                                                              minimum_overlap_reads)
            if warnings['sample'] is False:
                warnings = {}

            if run_successfully:
                report_entries = (('#assembled_reads', assembled_reads),
                                  ('#unassembled_reads', unassembled_reads),
                                  ('#discarded_reads', discarded_reads))
                with open(os.path.join(outdir, str('pear_report.txt')), 'wt') as writer:
                    for label, value in report_entries:
                        writer.write(label + '\n' + str(value) + '\n')

    for decompressed in decompressed_reads:
        os.remove(decompressed)

    if not run_successfully:
        warnings['sample'] = 'Did not run'
        print(warnings)

    return run_successfully, True, failing, unassembled_pe_reads, assembled_se_reads, pear_folder, warnings
def runTrueCoverage(sample, fastq, reference, threads, outdir, extraSeq, minCovPresence, minCovCall,
                    minFrequencyDominantAllele, minGeneCoverage, conserved_True, debug, numMapLoc,
                    minGeneIdentity, trueCoverage_config, rematch_script):
    """
    Run the ReMatCh module against the reference and QA/QC the sample with the trueCoverage thresholds.

    Parameters
    ----------
    sample : str
        Sample name forwarded to rematch_module.runRematchModule
    fastq : list
        Fastq files forwarded to rematch_module.runRematchModule
    reference : str
        Path to the reference file; it is first processed by clean_headers_reference_file
    threads : int
        Number of threads forwarded to rematch_module.runRematchModule
    outdir : str
        Directory where the 'trueCoverage' working directory and 'trueCoverage_report.txt' are created
    extraSeq, minCovPresence, minCovCall, minFrequencyDominantAllele, minGeneCoverage, minGeneIdentity
        Forwarded untouched to rematch_module.runRematchModule
    conserved_True
        Not referenced in the body (the literal True is passed to runRematchModule instead)
    debug : bool
        Forwarded to runRematchModule; when true the working directory is kept
    numMapLoc
        Not referenced in the body (the literal 1 is passed to runRematchModule instead)
    trueCoverage_config : dict
        Must provide 'maximum_number_absent_genes', 'maximum_number_genes_multiple_alleles' and
        'minimum_read_coverage'
    rematch_script : str
        Path to the ReMatCh script; its 'modules' sibling directory is made importable

    Returns
    -------
    run_successfully : bool
        Status returned by rematch_module.runRematchModule
    pass_qc : bool
        True only when no failing reason was recorded
    failing : dict
        {'sample': False} when QA/QC passed; otherwise the failing reasons keyed by 'absent_genes',
        'multiple_alleles', 'read_coverage' or 'sample'
    """
    pass_qc = False
    failing = {}

    trueCoverage_folder = os.path.join(outdir, 'trueCoverage', '')
    utils.removeDirectory(trueCoverage_folder)
    os.mkdir(trueCoverage_folder)

    # Register the ReMatCh modules directory only once, so repeated calls do not grow sys.path
    rematch_modules_dir = os.path.join(os.path.dirname(rematch_script), 'modules', '')
    if rematch_modules_dir not in sys.path:
        sys.path.append(rematch_modules_dir)
    import rematch_module

    # Run ReMatCh
    reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(
        reference, trueCoverage_folder, extraSeq, rematch_module)
    time_taken, run_successfully, data_by_gene, sample_data_general, consensus_files, consensus_sequences = \
        rematch_module.runRematchModule(sample, fastq, reference_file, threads, trueCoverage_folder, extraSeq,
                                        minCovPresence, minCovCall, minFrequencyDominantAllele,
                                        minGeneCoverage, True, debug, 1, minGeneIdentity, 'first', 7, 'none',
                                        reference_dict, 'X', None, gene_list_reference, True)

    if run_successfully:
        print('Writing report file')
        os.rename(os.path.join(trueCoverage_folder, 'rematchModule_report.txt'),
                  os.path.join(outdir, 'trueCoverage_report.txt'))

        absent_genes = sample_data_general['number_absent_genes']
        if absent_genes > trueCoverage_config['maximum_number_absent_genes']:
            failing['absent_genes'] = 'The number of absent genes (' + str(absent_genes) + \
                ') exceeds the maximum allowed (' + \
                str(trueCoverage_config['maximum_number_absent_genes']) + ')'

        multiple_alleles = sample_data_general['number_genes_multiple_alleles']
        if multiple_alleles > trueCoverage_config['maximum_number_genes_multiple_alleles']:
            failing['multiple_alleles'] = 'The number of genes with multiple alleles (' + \
                str(multiple_alleles) + ') exceeds the maximum allowed (' + \
                str(trueCoverage_config['maximum_number_genes_multiple_alleles']) + ')'

        mean_coverage = sample_data_general['mean_sample_coverage']
        if mean_coverage < trueCoverage_config['minimum_read_coverage']:
            # Message typo fixed ("dit" -> "did")
            failing['read_coverage'] = 'The mean read coverage for genes present (' + str(mean_coverage) + \
                ') did not meet the minimum required (' + \
                str(trueCoverage_config['minimum_read_coverage']) + ')'
    else:
        failing['sample'] = 'Did not run'

    if not failing:
        pass_qc = True
        failing['sample'] = False
    else:
        print(failing)

    if not debug:
        utils.removeDirectory(trueCoverage_folder)

    return run_successfully, pass_qc, failing
def run_rematch(rematch, outdir, reference_file, bam_file, threads, length_extra_seq, minimum_depth_presence,
                minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage,
                minimum_gene_identity, debug_mode_true, doNotRemoveConsensus):
    """
    Analyse an alignment with the ReMatCh module, write the report and clean the working files.

    Parameters
    ----------
    rematch : str
        Path to the ReMatCh script; its 'modules' sibling directory is made importable
    outdir : str
        Directory where the 'rematch' working directory is created and the report is written
    reference_file : str
        Path to the reference sequences file
    bam_file : str
        Path to the alignment file
    threads : int
        Number of threads forwarded to sequence_data
    length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele
        Forwarded untouched to sequence_data
    minimum_gene_coverage, minimum_gene_identity
        Forwarded untouched to write_report
    debug_mode_true : bool
        When true the working directory is kept
    doNotRemoveConsensus : bool
        Forwarded to clean_rematch_folder

    Returns
    -------
    run_successfully : bool
        Status returned by sequence_data
    sample_data_general : dict
        Keys 'number_absent_genes', 'number_genes_multiple_alleles' and 'mean_sample_coverage' (rounded to
        two decimals). All values are None when the analysis did not run successfully
    sample_data
        Per-sequence data returned by sequence_data
    """
    module_dir = os.path.join(outdir, 'rematch', '')
    utils.removeDirectory(module_dir)
    os.makedirs(module_dir)

    # Register the ReMatCh modules directory only once, so repeated calls do not grow sys.path
    rematch_modules_dir = os.path.join(os.path.dirname(rematch), 'modules', '')
    if rematch_modules_dir not in sys.path:
        sys.path.append(rematch_modules_dir)
    import rematch_module

    # Report values stay None unless the report is actually written (replaces the `in locals()` checks)
    number_absent_genes = None
    number_genes_multiple_alleles = None
    mean_sample_coverage = None

    print('Analysing alignment data')
    run_successfully, sample_data, consensus_files, consensus_sequences = sequence_data(
        'sample', reference_file, bam_file, module_dir, threads, length_extra_seq, minimum_depth_presence,
        minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, rematch_module)

    if run_successfully:
        number_absent_genes, number_genes_multiple_alleles, mean_sample_coverage = write_report(
            outdir, sample_data, minimum_gene_coverage, minimum_gene_identity)

    if not debug_mode_true:
        utils.removeDirectory(module_dir)

    clean_rematch_folder(consensus_files, bam_file, reference_file, outdir, doNotRemoveConsensus,
                         debug_mode_true)

    sample_data_general = {
        'number_absent_genes': number_absent_genes,
        'number_genes_multiple_alleles': number_genes_multiple_alleles,
        'mean_sample_coverage': round(mean_sample_coverage, 2) if mean_sample_coverage is not None else None,
    }

    return run_successfully, sample_data_general, sample_data
def runAssemblyMapping(fastq_files, reference_file, threads, outdir, minCoverageAssembly, estimatedGenomeSizeMb, saveExcludedContigs, maxNumberContigs):
    """
    Map the reads back to the assembly, filter contigs by coverage and QA/QC the results.

    Parameters
    ----------
    fastq_files : list
        Fastq files mapped with mappingBowtie2
    reference_file : str
        Path to the assembly used as mapping reference
    threads : int
        Number of threads forwarded to the mapping/alignment helpers
    outdir : str
        Directory where the 'assemblyMapping' working directory is created and where the report helpers
        are told to write
    minCoverageAssembly
        Forwarded to save_assembly_coverage_report
    estimatedGenomeSizeMb
        Forwarded to spades.qc_assembly
    saveExcludedContigs
        Forwarded to write_filtered_sequences_and_stats
    maxNumberContigs
        Forwarded to spades.qc_assembly

    Returns
    -------
    run_successfully : bool
        True only if every executed step succeeded and the coverage analysis had no problems
    pass_qc : bool
        False whenever a failing reason was recorded
    failing : dict
        {'sample': False} when nothing failed; otherwise reasons keyed by 'Coverage' and/or 'sample'
        (values are lists of strings)
    assembly_filtered : str or None
        Path to the '.mappingCov.fasta' assembly, or reference_file when the filtered sequences did not
        pass spades.qc_assembly with enough bases; None if the coverage analysis was not reached
    bam_file : str or None
        Path to the BAM file (the subset one when a subset was produced)
    assemblyMapping_folder : str
        Path to the working directory
    warnings : dict
        Warning reasons keyed by 'Sequences_filtered' and/or 'Mapping' (values are lists of strings)
    """
    pass_qc = True
    pass_qc_coverage = False
    pass_qc_mapping = False

    failing = {}
    warnings = {}

    assemblyMapping_folder = os.path.join(outdir, 'assemblyMapping', '')
    utils.removeDirectory(assemblyMapping_folder)
    os.mkdir(assemblyMapping_folder)

    assembly_filtered = None

    # Create a symbolic link to the assembly
    assembly_link = os.path.join(assemblyMapping_folder, os.path.basename(reference_file))
    os.symlink(reference_file, assembly_link)

    bam_file = None
    # Index assembly using Bowtie2
    run_successfully = indexSequenceBowtie2(assembly_link, threads)

    sample_coverage_no_problems = False
    sample_mapping_statistics_no_problems = False

    # Each step below only runs when the previous one succeeded
    if run_successfully:
        run_successfully, sam_file = mappingBowtie2(fastq_files, assembly_link, threads, assemblyMapping_folder)

        if run_successfully:
            bam_file = os.path.splitext(sam_file)[0] + '.bam'
            run_successfully, bam_file = sortAlignment(sam_file, bam_file, False, threads)

            if run_successfully:
                # The SAM file is no longer needed once the sorted BAM exists
                os.remove(sam_file)
                run_successfully = indexAlignment(bam_file, True)

                if run_successfully:
                    sequences_2_keep = []
                    # Get assembly coverage
                    sample_coverage_no_problems, mean_coverage_data = sample_coverage(reference_file, bam_file, assemblyMapping_folder, threads)
                    if sample_coverage_no_problems:
                        pass_qc_coverage, failing_reason, sequences_2_keep = save_assembly_coverage_report(mean_coverage_data, outdir, minCoverageAssembly)
                        if not pass_qc_coverage:
                            failing['Coverage'] = [failing_reason]

                        assembly_filtered = os.path.splitext(reference_file)[0] + '.mappingCov.fasta'

                        sequence_dict, ignore = utils.get_sequence_information(reference_file, 0)
                        sequence_dict, sequence_report_general = determine_sequences_to_filter(sequence_dict, sequences_2_keep, False)
                        failing_sequences_filtered, minimumBP = spades.qc_assembly(sequence_report_general, estimatedGenomeSizeMb, maxNumberContigs)
                        if failing_sequences_filtered['sample'] is not False:
                            # A problem with the filtered sequences is only a warning
                            warnings['Sequences_filtered'] = [failing_sequences_filtered['sample']]
                            if not minimumBP:
                                # Fall back to the unfiltered assembly
                                assembly_filtered = reference_file
                            else:
                                write_filtered_sequences_and_stats(sequence_dict, sequence_report_general, assembly_filtered, saveExcludedContigs)
                        else:
                            write_filtered_sequences_and_stats(sequence_dict, sequence_report_general, assembly_filtered, saveExcludedContigs)
                    else:
                        failing['Coverage'] = ['Did not run']

                    # Save mapping statistics
                    sample_mapping_statistics_no_problems, dict_mapping_statistics = getting_mapping_statistics(bam_file)
                    if sample_mapping_statistics_no_problems:
                        pass_qc_mapping, failing_reason = save_mapping_statistics(dict_mapping_statistics, outdir)
                        if not pass_qc_mapping:
                            warnings['Mapping'] = [failing_reason]
                    else:
                        warnings['Mapping'] = ['Did not run']

                    # A BAM subset is only needed when some sequences were actually filtered out
                    if assembly_filtered is not None and assembly_filtered != reference_file and len(sequences_2_keep) > 0:
                        print 'Producing bam subset for sequences to keep'
                        run_successfully, bam_subset = get_bam_subset(bam_file, sequences_2_keep, threads)
                        if run_successfully:
                            # Replace the full BAM (and its index) with the subset
                            os.remove(bam_file)
                            os.remove(bam_file + '.bai')
                            bam_file = bam_subset
                            run_successfully = indexAlignment(bam_file, False)

    if not run_successfully:
        failing['sample'] = ['Did not run']

    # Mapping statistics problems do not affect run_successfully; only coverage problems do
    run_successfully = all([run_successfully, sample_coverage_no_problems])

    if len(failing) == 0:
        failing = {'sample': False}
    else:
        print 'Failing:', failing
        pass_qc = False

    return run_successfully, pass_qc, failing, assembly_filtered, bam_file, assemblyMapping_folder, warnings
def gather_data_together(sample, data_directory, sequences_information, outdir, debug_mode_true):
    """
    Collect the per-gene coverage and consensus information stored in the sub-directories of data_directory.

    Every visible sub-directory is scanned for 'coverage_info.*.pkl' files. Each pickle is unpacked into the
    run status, the gene statistics and the consensus sequence. Before the first consensus is written, any
    existing '<sample>.<type>.fasta' consensus file in outdir is removed. Once a pickle reports a failed
    run, the remaining pickles are skipped.

    Parameters
    ----------
    sample : str
        Sample name, used for the consensus file names
    data_directory : str
        Directory holding one sub-directory per gene
    sequences_information : dict
        Gene counter -> dictionary with at least the 'header' and 'length' keys
    outdir : str
        Directory where the consensus files are written
    debug_mode_true : bool
        When true the gene sub-directories are kept

    Returns
    -------
    run_successfully : bool
        False if any pickle reported a failure or if the number of gathered genes differs from
        len(sequences_information)
    sample_data : dict
        Gene counter -> dictionary with the keys 'header', 'gene_coverage', 'gene_low_coverage',
        'gene_number_positions_multiple_alleles', 'gene_mean_read_coverage' and 'gene_identity'
    consensus_files
        Value returned by the last write_consensus call, or None if no consensus was written
    """
    run_successfully = True
    counter = 0
    sample_data = {}

    consensus_files = None
    write_consensus_first_time = True

    genes_directories = [d for d in os.listdir(data_directory)
                         if not d.startswith('.') and os.path.isdir(os.path.join(data_directory, d, ''))]
    for gene_dir in genes_directories:
        gene_dir_path = os.path.join(data_directory, gene_dir, '')

        files = [f for f in os.listdir(gene_dir_path)
                 if not f.startswith('.') and os.path.isfile(os.path.join(gene_dir_path, f))]
        for file_found in files:
            if not (file_found.startswith('coverage_info.') and file_found.endswith('.pkl')):
                continue
            # After the first reported failure no further pickle is read
            if not run_successfully:
                continue

            file_path = os.path.join(gene_dir_path, file_found)
            run_successfully, sequence_counter, multiple_alleles_found, percentage_absent, \
                percentage_lowCoverage, meanCoverage, consensus_sequence, number_diferences = \
                utils.extractVariableFromPickle(file_path)

            if write_consensus_first_time:
                # Start from clean consensus files
                for consensus_type in ['correct', 'noMatter', 'alignment']:
                    file_to_remove = os.path.join(outdir, str(sample + '.' + consensus_type + '.fasta'))
                    if os.path.isfile(file_to_remove):
                        os.remove(file_to_remove)
                write_consensus_first_time = False
            consensus_files = write_consensus(outdir, sample, consensus_sequence)

            sequence_length = sequences_information[sequence_counter]['length']
            # The fraction of differing positions is scaled to a percentage, so that gene_identity is on
            # the same 0-100 scale as gene_coverage (the unscaled fraction kept it always around 99-100)
            gene_identity = 100 - (float(number_diferences) / sequence_length) * 100

            sample_data[sequence_counter] = {
                'header': sequences_information[sequence_counter]['header'],
                'gene_coverage': 100 - percentage_absent,
                'gene_low_coverage': percentage_lowCoverage,
                'gene_number_positions_multiple_alleles': multiple_alleles_found,
                'gene_mean_read_coverage': meanCoverage,
                'gene_identity': gene_identity,
            }
            counter += 1

        if not debug_mode_true:
            utils.removeDirectory(gene_dir_path)

    if counter != len(sequences_information):
        run_successfully = False

    return run_successfully, sample_data, consensus_files
def run_assembly_mapping(fastq_files, reference_file, outdir, estimated_genome_size_mb, max_number_contigs=100,
                         save_excluded_contigs=False, min_coverage_assembly=None, keep_bam=False, threads=1):
    """
    Runs Assembly_Mapping for INNUca and QA/QC the results

    Parameters
    ----------
    fastq_files : list
        List of fastq files
    reference_file : str
        Path to the reference file (the assembly)
    outdir : str
        Path to the output directory
    estimated_genome_size_mb : float
        Expected genome size in Mb
    max_number_contigs : int, default 100
        Maximum number of contigs per 1.5 Mb of expected genome size
    save_excluded_contigs : bool, default False
        True if want to save excluded contigs
    min_coverage_assembly : int or None, default None
        Minimum contigs average coverage. After mapping reads back to the contigs, only keep contigs with at
        least this average coverage. If None is provided, 1/3 of the assembly mean coverage or 10x will be used
    keep_bam : bool, default False
        True if want to keep the BAM file produced (with mapped and unmapped reads)
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        Boolean stating if INNUca Assembly_Mapping module ran successfully or not
    pass_qc : bool
        Boolean stating if sample pass QA/QC or not
    failing : dict
        Dictionary with the failing reasons. If sample did not fail, it is only {'sample': False}. If it failed,
        keys will be the level of failing, and values list of strings
    assembly_filtered : str or None
        Path to the filtered assembly (or original one if nothing was filtered). If something went wrong, None
        is returned
    bam_file : str or None
        Path to the BAM file to be used in subsequent steps. If something went wrong, None is returned
    assembly_mapping_folder : str
        Path to Assembly_Mapping working directory
    warnings : dict
        Dictionary with the warning reasons. If no warnings were raised, it is empty. If warnings were raised,
        keys will be the level of warnings, and values list of strings
    original_bam : str or None
        Path to the BAM file produced with reference_file if a new BAM file for subset of sequences is produced,
        else None.

    Notes
    -----
    NOTE(review): earlier documentation also listed a `time_taken : float` value after `pass_qc`. This body
    does not compute or return it; presumably it is added by a timing decorator not visible here - confirm.
    """

    pass_qc = True

    failing = {}
    warnings = {}

    assembly_mapping_folder = os.path.join(outdir, 'assemblyMapping', '')
    utils.removeDirectory(assembly_mapping_folder)
    os.mkdir(assembly_mapping_folder)

    assembly_filtered = None

    # Create a symbolic link to the assembly
    assembly_link = os.path.join(assembly_mapping_folder, os.path.basename(reference_file))
    os.symlink(reference_file, assembly_link)

    bam_file = None
    original_bam = None
    # Index assembly using Bowtie2
    run_successfully = indexSequenceBowtie2(assembly_link, threads)

    sample_coverage_no_problems = False

    # Each step below only runs when the previous one succeeded
    if run_successfully:
        run_successfully, sam_file = mappingBowtie2(fastq_files, assembly_link, threads, assembly_mapping_folder)

        if run_successfully:
            bam_file = os.path.splitext(sam_file)[0] + '.bam'
            run_successfully, bam_file = sortAlignment(sam_file, bam_file, False, threads)

            if run_successfully:
                # The SAM file is no longer needed once the sorted BAM exists
                os.remove(sam_file)
                run_successfully = indexAlignment(bam_file, True)

                if run_successfully:
                    sequences_2_keep = []
                    # Get assembly coverage
                    sample_coverage_no_problems, mean_coverage_data = sample_coverage(reference_file, bam_file,
                                                                                      assembly_mapping_folder,
                                                                                      threads)
                    if sample_coverage_no_problems:
                        pass_qc_coverage, failing_reason, sequences_2_keep = \
                            save_assembly_coverage_report(mean_coverage_data, outdir, min_coverage_assembly)
                        if not pass_qc_coverage:
                            failing['Coverage'] = [failing_reason]

                        assembly_filtered = os.path.splitext(reference_file)[0] + '.mappingCov.fasta'

                        sequence_dict, ignore = utils.get_sequence_information(reference_file, 0)
                        sequence_dict, sequence_report_general = determine_sequences_to_filter(sequence_dict,
                                                                                               sequences_2_keep,
                                                                                               False)
                        failing_sequences_filtered, minimumBP = spades.qc_assembly(sequence_report_general,
                                                                                   estimated_genome_size_mb,
                                                                                   max_number_contigs)
                        if failing_sequences_filtered['sample'] is not False:
                            # A problem with the filtered sequences is only a warning
                            warnings['Sequences_filtered'] = [failing_sequences_filtered['sample']]
                            if not minimumBP:
                                # Fall back to the unfiltered assembly
                                assembly_filtered = reference_file
                            else:
                                write_filtered_sequences_and_stats(sequence_dict, sequence_report_general,
                                                                   assembly_filtered, save_excluded_contigs)
                        else:
                            write_filtered_sequences_and_stats(sequence_dict, sequence_report_general,
                                                               assembly_filtered, save_excluded_contigs)
                    else:
                        failing['Coverage'] = ['Did not run']

                    # Save mapping statistics
                    sample_mapping_statistics_no_problems, dict_mapping_statistics = getting_mapping_statistics(
                        bam_file)
                    if sample_mapping_statistics_no_problems:
                        pass_qc_mapping, failing_reason = save_mapping_statistics(dict_mapping_statistics, outdir)
                        if not pass_qc_mapping:
                            warnings['Mapping'] = [failing_reason]
                    else:
                        warnings['Mapping'] = ['Did not run']

                    # A BAM subset is only needed when some sequences were actually filtered out
                    if assembly_filtered is not None and \
                            assembly_filtered != reference_file and \
                            len(sequences_2_keep) > 0:
                        print('Producing bam subset for sequences to keep')
                        run_successfully, bam_subset = get_bam_subset(bam_file, sequences_2_keep, threads)
                        if run_successfully:
                            if not keep_bam:
                                os.remove(bam_file)
                            else:
                                # Keep the full BAM in outdir, named after the reference file
                                original_bam = os.path.join(outdir, '{}.bam'.format(os.path.basename(reference_file)))
                                os.rename(bam_file, original_bam)
                            # The index of the full BAM is removed in both cases
                            os.remove(bam_file + '.bai')
                            bam_file = bam_subset
                            run_successfully = indexAlignment(bam_file, False)
                    else:
                        if keep_bam:
                            # No subset produced: move the full BAM and its index to outdir and use them
                            os.rename(bam_file, os.path.join(outdir, '{}.bam'.format(os.path.basename(reference_file))))
                            os.rename(bam_file + '.bai',
                                      os.path.join(outdir, '{}.bam.bai'.format(os.path.basename(reference_file))))
                            bam_file = os.path.join(outdir, '{}.bam'.format(os.path.basename(reference_file)))

    if not run_successfully:
        failing['sample'] = ['Did not run']

    # Mapping statistics problems do not affect run_successfully; only coverage problems do
    run_successfully = all([run_successfully, sample_coverage_no_problems])

    if len(failing) == 0:
        failing = {'sample': False}
    else:
        print('Failing:', failing)
        pass_qc = False

    return run_successfully, pass_qc, failing, assembly_filtered, bam_file, assembly_mapping_folder, warnings, \
        original_bam
def runSpades(sampleName, outdir, threads, fastq_files, notUseCareful, maxMemory, minCoverageAssembly,
              minContigsLength, estimatedGenomeSizeMb, kmers, maximumReadsLength, defaultKmers,
              minCoverageContigs, assembled_se_reads, saveExcludedContigs, maxNumberContigs):
    """
    Run SPAdes, keep a copy of the original contigs and write the filtered assembly.

    Parameters
    ----------
    sampleName : str
        Sample name, used for the names of the produced files
    outdir : str
        Directory where the 'spades' working directory and the final files are created
    threads, notUseCareful, maxMemory, minCoverageAssembly, assembled_se_reads
        Forwarded untouched to spades
    fastq_files : list
        Fastq files forwarded to spades
    minContigsLength
        Forwarded to define_minContigsLength together with maximumReadsLength
    estimatedGenomeSizeMb, minCoverageContigs, maxNumberContigs
        Forwarded to decide_filter_parameters
    kmers
        K-mers forwarded to define_kmers (ignored when defaultKmers is true)
    maximumReadsLength
        Forwarded to define_kmers and define_minContigsLength
    defaultKmers : bool
        When true an empty k-mers list is passed to spades
    saveExcludedContigs
        Forwarded to write_filtered_sequences_and_stats when a filter is applied

    Returns
    -------
    run_successfully : bool
        False if spades failed or the contigs file was not found
    pass_qc : bool
        Same outcome as run_successfully
    failing : dict
        {'sample': False} on success, otherwise {'sample': <reason>}
    contigs
        Path to the final assembly file on success, otherwise the value returned by spades
    warnings : dict
        Warnings returned by decide_filter_parameters (empty if it was not reached)
    """
    pass_qc = True
    failing = {'sample': False}
    warnings = {}

    # Create SPAdes output directory
    spades_folder = os.path.join(outdir, 'spades', '')
    utils.removeDirectory(spades_folder)
    os.mkdir(spades_folder)

    # Determine k-mers to run
    if not defaultKmers:
        kmers = define_kmers(kmers, maximumReadsLength)
        if len(kmers) == 0:
            print('SPAdes will use its default k-mers')
        else:
            print('SPAdes will use the following k-mers: ' + str(kmers))
    else:
        kmers = []

    run_successfully, contigs = spades(spades_folder, threads, fastq_files, notUseCareful, maxMemory,
                                       minCoverageAssembly, kmers, assembled_se_reads)

    if not run_successfully:
        failing['sample'] = 'Did not run'
    elif not os.path.isfile(contigs):
        run_successfully = False
        failing['sample'] = 'Assembly was not produced'
    else:
        original_copy = os.path.join(outdir, str('SPAdes_original_assembly.contigs.fasta'))
        shutil.copyfile(contigs, original_copy)

        # Temporary link named after the sample; the output file names derive from it
        contigs_link = os.path.join(outdir, str(sampleName + '.contigs.fasta'))
        os.symlink(contigs, contigs_link)

        minContigsLength = define_minContigsLength(maximumReadsLength, minContigsLength)

        sequence_dict = get_SPAdes_sequence_information(contigs_link)

        warnings, sequence_dict, filtered_sequences_sufix, spades_report_general = \
            decide_filter_parameters(sequence_dict, minContigsLength, minCoverageContigs,
                                     estimatedGenomeSizeMb, maxNumberContigs)

        # Without a filter suffix the sequences are written as the '.original.fasta' file
        nothing_filtered = filtered_sequences_sufix is None
        sufix = 'original' if nothing_filtered else filtered_sequences_sufix
        filtered_sequence_file = os.path.splitext(contigs_link)[0] + '.' + sufix + '.fasta'
        write_filtered_sequences_and_stats(sequence_dict, spades_report_general, contigs_link,
                                           filtered_sequence_file, sampleName, nothing_filtered,
                                           False if nothing_filtered else saveExcludedContigs)
        contigs = filtered_sequence_file

        os.remove(contigs_link)

    if not run_successfully:
        print(failing['sample'])
        pass_qc = False

    utils.removeDirectory(spades_folder)

    return run_successfully, pass_qc, failing, contigs, warnings
def run_spades(sample_name, outdir, threads, fastq_files, not_use_careful, max_memory, min_coverage_assembly,
               min_contigs_length, estimated_genome_size_mb, kmers, maximum_reads_length, default_kmers,
               min_coverage_contigs, assembled_se_reads, save_excluded_contigs, max_number_contigs,
               keep_scaffolds=False, spades_version=None, estimated_coverage=None, spades_not_use_isolate=False):
    """
    Run SPAdes, keep a copy of the original contigs (and optionally scaffolds) and write the filtered assembly.

    Parameters
    ----------
    sample_name : str
        Sample name, used for the names of the produced files
    outdir : str
        Directory where the 'spades' working directory and the final files are created
    threads, not_use_careful, max_memory, min_coverage_assembly, assembled_se_reads
        Forwarded untouched to spades
    fastq_files : list
        Fastq files forwarded to spades
    min_contigs_length
        Forwarded to define_minContigsLength together with maximum_reads_length
    estimated_genome_size_mb, min_coverage_contigs, max_number_contigs
        Forwarded to decide_filter_parameters
    kmers
        K-mers forwarded to define_kmers (ignored when default_kmers is true)
    maximum_reads_length
        Forwarded to define_kmers and define_minContigsLength
    default_kmers : bool
        When true an empty k-mers list is passed to spades
    save_excluded_contigs
        Forwarded to write_filtered_sequences_and_stats when a filter is applied
    keep_scaffolds : bool, default False
        When true 'scaffolds.fasta' is copied from the working directory to outdir, if found
    spades_version, estimated_coverage, spades_not_use_isolate
        Forwarded as keyword arguments to spades

    Returns
    -------
    run_successfully : bool
        False if spades failed or the contigs file was not found
    pass_qc : bool
        Same outcome as run_successfully
    failing : dict
        {'sample': False} on success, otherwise {'sample': <reason>}
    contigs
        Path to the final assembly file on success, otherwise the value returned by spades
    warnings : dict
        Warnings returned by decide_filter_parameters (empty if it was not reached)
    """
    pass_qc = True
    failing = {'sample': False}
    warnings = {}

    # Create SPAdes output directory
    spades_folder = os.path.join(outdir, 'spades', '')
    utils.removeDirectory(spades_folder)
    os.mkdir(spades_folder)

    # Determine k-mers to run
    if not default_kmers:
        kmers = define_kmers(kmers, maximum_reads_length)
        if len(kmers) == 0:
            print('SPAdes will use its default k-mers')
        else:
            print('SPAdes will use the following k-mers: ' + str(kmers))
    else:
        kmers = []

    run_successfully, contigs = spades(spades_folder, threads, fastq_files, not_use_careful, max_memory,
                                       min_coverage_assembly, kmers, assembled_se_reads,
                                       spades_version=spades_version, estimated_coverage=estimated_coverage,
                                       spades_not_use_isolate=spades_not_use_isolate)

    if not run_successfully:
        failing['sample'] = 'Did not run'
    else:
        if keep_scaffolds:
            scaffolds = os.path.join(spades_folder, 'scaffolds.fasta')
            if not os.path.isfile(scaffolds):
                print('The scaffolds file was not found!')
            else:
                shutil.copyfile(scaffolds, os.path.join(outdir, str('SPAdes_original_assembly.scaffolds.fasta')))

        if not os.path.isfile(contigs):
            run_successfully = False
            failing['sample'] = 'Assembly was not produced'
        else:
            shutil.copyfile(contigs, os.path.join(outdir, str('SPAdes_original_assembly.contigs.fasta')))

            # Temporary link named after the sample; the output file names derive from it
            contigs_link = os.path.join(outdir, str(sample_name + '.contigs.fasta'))
            os.symlink(contigs, contigs_link)

            min_contigs_length = define_minContigsLength(maximum_reads_length, min_contigs_length)

            sequence_dict = get_SPAdes_sequence_information(contigs_link)

            warnings, sequence_dict, filtered_sequences_sufix, spades_report_general = \
                decide_filter_parameters(sequence_dict, min_contigs_length, min_coverage_contigs,
                                         estimated_genome_size_mb, max_number_contigs)

            # Without a filter suffix the sequences are written as the '.original.fasta' file
            nothing_filtered = filtered_sequences_sufix is None
            sufix = 'original' if nothing_filtered else filtered_sequences_sufix
            filtered_sequence_file = os.path.splitext(contigs_link)[0] + '.' + sufix + '.fasta'
            write_filtered_sequences_and_stats(sequence_dict, spades_report_general, contigs_link,
                                               filtered_sequence_file, sample_name, nothing_filtered,
                                               False if nothing_filtered else save_excluded_contigs)
            contigs = filtered_sequence_file

            os.remove(contigs_link)

    if not run_successfully:
        print(failing['sample'])
        pass_qc = False

    utils.removeDirectory(spades_folder)

    return run_successfully, pass_qc, failing, contigs, warnings
def runAssemblyMapping(alignment_file, reference_file, threads, outdir, minCoverageAssembly, assembly_pilon,
                       estimatedGenomeSizeMb):
    """
    Evaluate the assembly coverage and mapping statistics of an existing alignment and filter the assembly.

    Parameters
    ----------
    alignment_file : str
        Alignment file analysed by sample_coverage and getting_mapping_statistics
    reference_file : str
        Path to the assembly used as mapping reference
    threads : int
        Number of threads forwarded to sample_coverage
    outdir : str
        Directory where the temporary 'assemblyMapping' directory is created and where the report helpers
        are told to write
    minCoverageAssembly
        Forwarded to save_assembly_coverage_report
    assembly_pilon : str or None
        When not None it is the assembly that gets filtered, instead of reference_file
    estimatedGenomeSizeMb
        Forwarded to spades.qc_assembly

    Returns
    -------
    run_successfully
        Result of `sample_coverage_no_problems and sample_mapping_statistics_no_problems`
    pass_qc : bool
        True only if the coverage, mapping and filtered sequences checks all passed
    failing : dict
        Failing reasons keyed by 'Coverage', 'Sequences_filtered' and/or 'Mapping' (values are lists of
        strings); empty when nothing failed
    assembly_filtered : str or None
        Path to the '.mappingCov.fasta' assembly, the unfiltered assembly when the filtered sequences were
        rejected, or None if the coverage analysis had problems
    """
    pass_qc_coverage = False
    pass_qc_mapping = False
    pass_qc_sequences = False
    failing = {}
    assembly_filtered = None

    assemblyMapping_folder = os.path.join(outdir, 'assemblyMapping', '')
    utils.removeDirectory(assemblyMapping_folder)
    os.mkdir(assemblyMapping_folder)

    pilon_run_successfuly = assembly_pilon is not None

    # Get assembly coverage
    sample_coverage_no_problems, mean_coverage_data = sample_coverage(reference_file, alignment_file,
                                                                      assemblyMapping_folder, threads)
    if not sample_coverage_no_problems:
        failing['Coverage'] = ['Did not run']
    else:
        pass_qc_coverage, failing_reason, sequences_2_keep = save_assembly_coverage_report(
            mean_coverage_data, outdir, minCoverageAssembly)
        if not pass_qc_coverage:
            failing['Coverage'] = [failing_reason]

        assembly = assembly_pilon if assembly_pilon is not None else reference_file
        assembly_filtered = os.path.splitext(assembly)[0] + '.mappingCov.fasta'

        sequence_dict = get_sequence_information(assembly)
        sequence_dict, sequence_report_general = determine_sequences_to_filter(sequence_dict, sequences_2_keep,
                                                                               pilon_run_successfuly)
        failing_sequences_filtered, minimumBP = spades.qc_assembly(sequence_report_general,
                                                                   estimatedGenomeSizeMb)

        sequences_problem = failing_sequences_filtered['sample']
        if sequences_problem is not False and not minimumBP:
            # Filtered sequences rejected: report it and fall back to the unfiltered assembly
            failing['Sequences_filtered'] = [sequences_problem]
            assembly_filtered = assembly
        else:
            write_filtered_sequences_and_stats(sequence_dict, sequence_report_general, assembly_filtered)
            pass_qc_sequences = True
            if sequences_problem is not False:
                print(failing_sequences_filtered)

    # Save mapping statistics
    sample_mapping_statistics_no_problems, dict_mapping_statistics = getting_mapping_statistics(alignment_file)
    if not sample_mapping_statistics_no_problems:
        failing['Mapping'] = ['Did not run']
    else:
        pass_qc_mapping, failing_reason = save_mapping_statistics(dict_mapping_statistics, outdir)
        if not pass_qc_mapping:
            failing['Mapping'] = [failing_reason]

    run_successfully = sample_coverage_no_problems and sample_mapping_statistics_no_problems
    pass_qc = all((pass_qc_coverage, pass_qc_mapping, pass_qc_sequences))

    if not pass_qc:
        print('Sample FAILS Assembly Mapping check with: ' + str(failing))

    utils.removeDirectory(assemblyMapping_folder)

    return run_successfully, pass_qc, failing, assembly_filtered
def runTrueCoverage(sample, fastq, reference, threads, outdir, extraSeq, minCovPresence, minCovCall,
                    minFrequencyDominantAllele, minGeneCoverage, conserved_True, debug, numMapLoc,
                    minGeneIdentity, trueCoverage_config, rematch_script):
    """
    Run the ReMatCh module against the reference and QA/QC the sample with the trueCoverage thresholds.

    Parameters
    ----------
    sample : str
        Sample name forwarded to rematch_module.runRematchModule
    fastq : list
        Fastq files forwarded to rematch_module.runRematchModule
    reference : str
        Path to the reference file; it is first processed by clean_headers_reference_file
    threads : int
        Number of threads forwarded to rematch_module.runRematchModule
    outdir : str
        Directory where the 'trueCoverage' working directory and 'trueCoverage_report.txt' are created
    extraSeq, minCovPresence, minCovCall, minFrequencyDominantAllele, minGeneCoverage, minGeneIdentity
        Forwarded untouched to rematch_module.runRematchModule
    conserved_True
        Not referenced in the body (the literal True is passed to runRematchModule instead)
    debug : bool
        Forwarded to runRematchModule; when true the working directory is kept
    numMapLoc
        Not referenced in the body (the literal 1 is passed to runRematchModule instead)
    trueCoverage_config : dict
        Must provide 'maximum_number_absent_genes', 'maximum_number_genes_multiple_alleles' and
        'minimum_read_coverage'
    rematch_script : str
        Path to the ReMatCh script; its 'modules' sibling directory is made importable

    Returns
    -------
    run_successfully : bool
        Status returned by rematch_module.runRematchModule
    pass_qc : bool
        True only when no failing reason was recorded
    failing : dict
        {'sample': False} when QA/QC passed; otherwise the failing reasons keyed by 'absent_genes',
        'multiple_alleles', 'read_coverage' or 'sample'
    """
    pass_qc = False
    failing = {}

    trueCoverage_folder = os.path.join(outdir, 'trueCoverage', '')
    utils.removeDirectory(trueCoverage_folder)
    os.mkdir(trueCoverage_folder)

    # Register the ReMatCh modules directory only once, so repeated calls do not grow sys.path
    rematch_modules_dir = os.path.join(os.path.dirname(rematch_script), 'modules', '')
    if rematch_modules_dir not in sys.path:
        sys.path.append(rematch_modules_dir)
    import rematch_module

    # Run ReMatCh
    reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(
        reference, trueCoverage_folder, extraSeq, rematch_module)
    time_taken, run_successfully, data_by_gene, sample_data_general, consensus_files, consensus_sequences = \
        rematch_module.runRematchModule(sample, fastq, reference_file, threads, trueCoverage_folder, extraSeq,
                                        minCovPresence, minCovCall, minFrequencyDominantAllele,
                                        minGeneCoverage, True, debug, 1, minGeneIdentity, 'first', 7, 'none',
                                        reference_dict, 'X', None, gene_list_reference, True)

    if run_successfully:
        print('Writing report file')
        os.rename(os.path.join(trueCoverage_folder, 'rematchModule_report.txt'),
                  os.path.join(outdir, 'trueCoverage_report.txt'))

        absent_genes = sample_data_general['number_absent_genes']
        if absent_genes > trueCoverage_config['maximum_number_absent_genes']:
            failing['absent_genes'] = 'The number of absent genes (' + str(absent_genes) + \
                ') exceeds the maximum allowed (' + \
                str(trueCoverage_config['maximum_number_absent_genes']) + ')'

        multiple_alleles = sample_data_general['number_genes_multiple_alleles']
        if multiple_alleles > trueCoverage_config['maximum_number_genes_multiple_alleles']:
            failing['multiple_alleles'] = 'The number of genes with multiple alleles (' + \
                str(multiple_alleles) + ') exceeds the maximum allowed (' + \
                str(trueCoverage_config['maximum_number_genes_multiple_alleles']) + ')'

        mean_coverage = sample_data_general['mean_sample_coverage']
        if mean_coverage < trueCoverage_config['minimum_read_coverage']:
            # Message typo fixed ("dit" -> "did")
            failing['read_coverage'] = 'The mean read coverage for genes present (' + str(mean_coverage) + \
                ') did not meet the minimum required (' + \
                str(trueCoverage_config['minimum_read_coverage']) + ')'
    else:
        failing['sample'] = 'Did not run'

    if not failing:
        pass_qc = True
        failing['sample'] = False
    else:
        print(failing)

    if not debug:
        utils.removeDirectory(trueCoverage_folder)

    return run_successfully, pass_qc, failing
def run_assembly_mapping(fastq_files, reference_file, outdir, estimated_genome_size_mb, max_number_contigs=100,
                         save_excluded_contigs=False, min_coverage_assembly=None, keep_bam=False, threads=1):
    """
    Runs Assembly_Mapping for INNUca and QA/QC the results

    Reads are mapped back to the assembly with Bowtie2, the per-sequence coverage is used to decide
    which sequences to keep, and mapping statistics are saved.

    Parameters
    ----------
    fastq_files : list
        List of fastq files
    reference_file : str
        Path to the reference file (the assembly)
    outdir : str
        Path to the output directory
    estimated_genome_size_mb : float
        Expected genome size in Mb
    max_number_contigs : int, default 100
        Maximum number of contigs per 1.5 Mb of expected genome size
    save_excluded_contigs : bool, default False
        True if want to save excluded contigs
    min_coverage_assembly : int or None, default None
        Minimum contigs average coverage. After mapping reads back to the contigs, only keep contigs with at
        least this average coverage. If None is provided, 1/3 of the assembly mean coverage or 10x will be used
    keep_bam : bool, default False
        True if want to keep the BAM file produced (with mapped and unmapped reads)
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        Boolean stating if INNUca Assembly_Mapping module ran successfully or not
    pass_qc : bool
        Boolean stating if sample pass QA/QC or not
    failing : dict
        Dictionary with the failing reasons. If sample did not fail, it is only {'sample': False}. If it failed,
        keys will be the level of failing, and values list of strings
    assembly_filtered : str or None
        Path to the filtered assembly (or original one if nothing was filtered). If something went wrong, None
        is returned
    bam_file : str or None
        Path to the BAM file to be used in subsequent steps. If something went wrong, None is returned
    assembly_mapping_folder : str
        Path to Assembly_Mapping working directory
    warnings : dict
        Dictionary with the warning reasons. If no warnings were raised, it is empty. If warnings were raised,
        keys will be the level of warnings, and values list of strings
    original_bam : str or None
        Path to the BAM file produced with reference_file if a new BAM file for subset of sequences is
        produced, else None.

    Notes
    -----
    NOTE(review): earlier documentation also listed a time_taken return value. This body does not return it;
    presumably it is added by a wrapper that is not visible here - confirm.
    """
    pass_qc = True
    failing = {}
    warnings = {}
    assembly_filtered = None
    bam_file = None
    original_bam = None
    sample_coverage_no_problems = False

    assembly_mapping_folder = os.path.join(outdir, 'assemblyMapping', '')
    utils.removeDirectory(assembly_mapping_folder)
    os.mkdir(assembly_mapping_folder)

    # The assembly is symlinked into the working directory so the Bowtie2 index is created there
    assembly_link = os.path.join(assembly_mapping_folder, os.path.basename(reference_file))
    os.symlink(reference_file, assembly_link)

    # Index -> map -> sort -> index alignment. Each step only runs if every previous one succeeded
    run_successfully = indexSequenceBowtie2(assembly_link, threads)

    if run_successfully:
        run_successfully, sam_file = mappingBowtie2(fastq_files, assembly_link, threads, assembly_mapping_folder)

    if run_successfully:
        bam_file = os.path.splitext(sam_file)[0] + '.bam'
        run_successfully, bam_file = sortAlignment(sam_file, bam_file, False, threads)

    if run_successfully:
        os.remove(sam_file)
        run_successfully = indexAlignment(bam_file, True)

    if run_successfully:
        sequences_2_keep = []

        # Get assembly coverage
        sample_coverage_no_problems, mean_coverage_data = sample_coverage(reference_file, bam_file,
                                                                          assembly_mapping_folder, threads)
        if not sample_coverage_no_problems:
            failing['Coverage'] = ['Did not run']
        else:
            pass_qc_coverage, failing_reason, sequences_2_keep = save_assembly_coverage_report(
                mean_coverage_data, outdir, min_coverage_assembly)
            if not pass_qc_coverage:
                failing['Coverage'] = [failing_reason]

            assembly_filtered = os.path.splitext(reference_file)[0] + '.mappingCov.fasta'

            sequence_dict, _ = utils.get_sequence_information(reference_file, 0)
            sequence_dict, sequence_report_general = determine_sequences_to_filter(sequence_dict,
                                                                                   sequences_2_keep, False)
            failing_sequences_filtered, minimumBP = spades.qc_assembly(sequence_report_general,
                                                                       estimated_genome_size_mb,
                                                                       max_number_contigs)

            filtered_problem = failing_sequences_filtered['sample']
            if filtered_problem is not False:
                warnings['Sequences_filtered'] = [filtered_problem]

            if filtered_problem is not False and not minimumBP:
                # The filtered assembly is not used: fall back to the original one
                assembly_filtered = reference_file
            else:
                write_filtered_sequences_and_stats(sequence_dict, sequence_report_general, assembly_filtered,
                                                   save_excluded_contigs)

        # Save mapping statistics
        sample_mapping_statistics_no_problems, dict_mapping_statistics = getting_mapping_statistics(bam_file)
        if not sample_mapping_statistics_no_problems:
            warnings['Mapping'] = ['Did not run']
        else:
            pass_qc_mapping, failing_reason = save_mapping_statistics(dict_mapping_statistics, outdir)
            if not pass_qc_mapping:
                warnings['Mapping'] = [failing_reason]

        # Final location of the full BAM file when it is to be kept
        kept_bam = os.path.join(outdir, '{}.bam'.format(os.path.basename(reference_file)))

        if assembly_filtered is not None and \
                assembly_filtered != reference_file and \
                len(sequences_2_keep) > 0:
            print('Producing bam subset for sequences to keep')
            run_successfully, bam_subset = get_bam_subset(bam_file, sequences_2_keep, threads)
            if run_successfully:
                if keep_bam:
                    original_bam = kept_bam
                    os.rename(bam_file, original_bam)
                else:
                    os.remove(bam_file)
                # The index of the full BAM is dropped; the subset gets its own index
                os.remove(bam_file + '.bai')
                bam_file = bam_subset
                run_successfully = indexAlignment(bam_file, False)
        elif keep_bam:
            os.rename(bam_file, kept_bam)
            os.rename(bam_file + '.bai',
                      os.path.join(outdir, '{}.bam.bai'.format(os.path.basename(reference_file))))
            bam_file = kept_bam

    if not run_successfully:
        failing['sample'] = ['Did not run']

    # The module only counts as successful if the coverage could also be determined
    run_successfully = all([run_successfully, sample_coverage_no_problems])

    if failing:
        print('Failing:', failing)
        pass_qc = False
    else:
        failing = {'sample': False}

    return run_successfully, pass_qc, failing, assembly_filtered, bam_file, assembly_mapping_folder, warnings, \
        original_bam
def runTrueCoverage(fastq_files, reference_file, threads, outdir, length_extra_seq, minimum_depth_presence,
                    minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage,
                    maximum_number_absent_genes, maximum_number_genes_multiple_alleles, minimum_read_coverage):
    """
    Maps reads against a reference, writes the trueCoverage report and QA/QC the sample

    The working directory <outdir>/trueCoverage/ is (re)created and always removed before returning.
    When every step succeeds, <outdir>/trueCoverage_report.txt is written with one line per gene
    followed by a '#general' section.

    Parameters
    ----------
    fastq_files, reference_file, threads
        Forwarded to mapping_reads (reference_file is replaced by the one mapping_reads returns)
    outdir : str
        Path to the output directory
    length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele
        Forwarded to sequence_data
    minimum_gene_coverage
        A gene whose 'gene_coverage' is below this value is counted as absent
    maximum_number_absent_genes
        The sample fails if more genes than this are absent
    maximum_number_genes_multiple_alleles
        The sample fails if more present genes than this have positions with multiple alleles
    minimum_read_coverage
        The sample fails if the mean read coverage of the present genes is below this value

    Returns
    -------
    run_successfully
        Success flag of the last step executed
    pass_qc : bool
        True only if every step ran and no threshold was violated
    failing : dict
        {'sample': False} when the sample passes. Otherwise the keys are the failing levels
        ('absent_genes', 'multiple_alleles', 'read_coverage', or 'sample' when a step did not run)
        and the values are the failing reasons
    """
    pass_qc = False
    failing = {}

    trueCoverage_folder = os.path.join(outdir, 'trueCoverage', '')
    utils.removeDirectory(trueCoverage_folder)
    os.mkdir(trueCoverage_folder)

    # Map reads
    run_successfully, bam_file, reference_file = mapping_reads(fastq_files, reference_file, threads,
                                                               trueCoverage_folder)

    # Index reference file (only if mapping succeeded)
    if run_successfully:
        run_successfully = index_fasta_samtools(reference_file)

    # Analyse the alignment (only if everything so far succeeded)
    if run_successfully:
        run_successfully, sample_data = sequence_data(reference_file, bam_file, trueCoverage_folder, threads,
                                                      length_extra_seq, minimum_depth_presence,
                                                      minimum_depth_call,
                                                      minimum_depth_frequency_dominant_allele)

    if not run_successfully:
        failing['sample'] = 'Did not run'
    else:
        number_absent_genes = 0
        number_genes_multiple_alleles = 0
        mean_sample_coverage = 0

        with open(os.path.join(outdir, 'trueCoverage_report.txt'), 'wt') as writer:
            header = ['#gene', 'percentage_gene_coverage', 'gene_mean_read_coverage',
                      'percentage_gene_low_coverage', 'number_positions_multiple_alleles']
            writer.write('\t'.join(header) + '\n')

            # sample_data is indexed from 1 to the number of genes
            for i in range(1, len(sample_data) + 1):
                gene = sample_data[i]
                fields = [gene['header'],
                          str(round(gene['gene_coverage'], 2)),
                          str(round(gene['gene_mean_read_coverage'], 2)),
                          str(round(gene['gene_low_coverage'], 2)),
                          str(gene['gene_number_positions_multiple_alleles'])]
                writer.write('\t'.join(fields) + '\n')

                if gene['gene_coverage'] < minimum_gene_coverage:
                    number_absent_genes += 1
                else:
                    # Only present genes contribute to the mean coverage and multiple alleles count
                    mean_sample_coverage += gene['gene_mean_read_coverage']
                    if gene['gene_number_positions_multiple_alleles'] > 0:
                        number_genes_multiple_alleles += 1

            number_present_genes = len(sample_data) - number_absent_genes
            if number_present_genes > 0:
                mean_sample_coverage = float(mean_sample_coverage) / float(number_present_genes)
            else:
                mean_sample_coverage = 0

            general = ['#general',
                       '>number_absent_genes', str(number_absent_genes),
                       '>number_genes_multiple_alleles', str(number_genes_multiple_alleles),
                       '>mean_sample_coverage', str(round(mean_sample_coverage, 2))]
            writer.write('\n'.join(general) + '\n')

            print('\n'.join(['number_absent_genes: ' + str(number_absent_genes),
                             'number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles),
                             'mean_sample_coverage: ' + str(round(mean_sample_coverage, 2))]))

        if number_absent_genes > maximum_number_absent_genes:
            failing['absent_genes'] = 'The number of absent genes (' + str(number_absent_genes) + \
                ') exceeds the maximum allowed (' + str(maximum_number_absent_genes) + ')'
        if number_genes_multiple_alleles > maximum_number_genes_multiple_alleles:
            failing['multiple_alleles'] = 'The number of genes with multiple alleles (' + \
                str(number_genes_multiple_alleles) + ') exceeds the maximum allowed (' + \
                str(maximum_number_genes_multiple_alleles) + ')'
        if mean_sample_coverage < minimum_read_coverage:
            failing['read_coverage'] = 'The mean read coverage for genes present (' + \
                str(mean_sample_coverage) + ') dit not meet the minimum required (' + \
                str(minimum_read_coverage) + ')'

    if failing:
        print(failing)
    else:
        pass_qc = True
        failing['sample'] = False

    utils.removeDirectory(trueCoverage_folder)

    return run_successfully, pass_qc, failing
def run_pilon(jar_path_pilon, assembly, fastq_files, outdir, jar_max_memory, alignment_file, keep_bam=False,
              threads=1):
    """
    Runs Pilon for INNUca to correct an assembly (no QA/QC is performed)

    Parameters
    ----------
    jar_path_pilon
        Forwarded to pilon() as its first argument
    assembly : str
        Path to the assembly to correct
    fastq_files : list
        List of fastq files
    outdir : str
        Path to the output directory
    jar_max_memory : int or 'off'
        If not 'off' is provided, sets the maximum RAM Gb usage by jar files
    alignment_file : str or None
        Path to the BAM file to be used. If None is provided, new alignment reads will be performed
    keep_bam : bool, default False
        True if want to keep the BAM file produced (with mapped and unmapped reads)
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        Boolean stating if INNUca Pilon module ran successfully or not
    pass_qc : None
        QA/QC not performed
    failing : dict
        {'sample': False} if the module ran, {'sample': 'Did not run'} otherwise
    assembly_polished : str or None
        Path to the polished assembly. If something went wrong, None is returned
    pilon_folder : str
        Path to Pilon working directory
    new_bam : bool
        True if new alignment reads was performed
    alignment_file : str or None
        Path to the BAM file used to correct the assembly. None if no BAM was given and the new
        alignment could not be sorted. When keep_bam is False the file itself is deleted before
        returning (also when it was provided by the caller), although its path is still returned.
    """
    failing = {'sample': False}
    run_successfully = True
    new_bam = False
    assembly_polished = None

    pilon_folder = os.path.join(outdir, 'pilon', '')
    utils.removeDirectory(pilon_folder)
    os.mkdir(pilon_folder)

    # Create a symbolic link to the assembly
    assembly_link = os.path.join(pilon_folder, os.path.basename(assembly))
    os.symlink(assembly, assembly_link)

    if alignment_file is None:
        # No BAM provided: index the assembly with Bowtie2, map the reads and sort the alignment
        run_successfully = indexSequenceBowtie2(assembly_link, threads)

        if run_successfully:
            run_successfully, sam_file = mapping_bowtie2(fastq_files=fastq_files, reference_file=assembly_link,
                                                         outdir=pilon_folder, keep_bam=keep_bam,
                                                         threads=threads)

        if run_successfully:
            unsorted_name = os.path.splitext(sam_file)[0] + '.bam'
            run_successfully, alignment_file = sortAlignment(sam_file, unsorted_name, False, threads)

            if not run_successfully:
                alignment_file = None
            else:
                os.remove(sam_file)
                run_successfully = indexAlignment(alignment_file)
                new_bam = True

    if run_successfully:
        run_successfully, assembly_polished = pilon(jar_path_pilon, assembly_link, alignment_file, pilon_folder,
                                                    jar_max_memory)

        if run_successfully:
            parsePilonResult(assembly_polished, outdir)
            # Move the polished assembly out of the working directory
            polished_destination = os.path.join(outdir, os.path.basename(assembly_polished))
            os.rename(assembly_polished, polished_destination)
            assembly_polished = polished_destination

    if keep_bam and new_bam:
        bam_destination = os.path.join(outdir, '{}.bam'.format(os.path.basename(assembly)))
        os.rename(alignment_file, bam_destination)
        alignment_file = bam_destination

    if not keep_bam and alignment_file is not None and os.path.isfile(str(alignment_file)):
        os.remove(alignment_file)

    if not run_successfully:
        failing['sample'] = 'Did not run'
        print(failing['sample'])

    return run_successfully, None, failing, assembly_polished, pilon_folder, new_bam, alignment_file
def runRematchModule(sample, fastq_files, reference_file, threads, outdir, length_extra_seq,
                     minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele,
                     minimum_gene_coverage, conserved_True, debug_mode_true, numMapLoc,
                     minimum_gene_identity):
    """
    Maps reads against a reference, analyses the alignment and writes the ReMatCh module report

    The working directory <outdir>/rematch_module/ is (re)created and removed before returning
    unless debug_mode_true is true. When every step succeeds, <outdir>/rematchModule_report.txt is
    written with one line per gene followed by a '#general' section.

    Parameters
    ----------
    sample
        Forwarded to sequence_data
    fastq_files, reference_file, threads, conserved_True, numMapLoc
        Forwarded to mapping_reads (reference_file is replaced by the one mapping_reads returns)
    outdir : str
        Path to the output directory
    length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele
        Forwarded to sequence_data
    minimum_gene_coverage, minimum_gene_identity
        A gene whose 'gene_coverage' or 'gene_identity' is below the respective value is counted as absent
    debug_mode_true : bool
        Forwarded to sequence_data; if true the working directory is kept

    Returns
    -------
    run_successfully
        Success flag of the last step executed
    sample_data
        Per-gene data returned by sequence_data, or None if sequence_data was not reached
    sample_data_general : dict or None
        {'number_absent_genes', 'number_genes_multiple_alleles', 'mean_sample_coverage'} (the mean
        rounded to 2 decimals), or None if the report was not written
    consensus_files
        Value returned by sequence_data, or None if sequence_data was not reached
    """
    # Explicit defaults for the values that are only produced when the corresponding step is
    # reached (instead of probing locals() at return time)
    sample_data = None
    consensus_files = None
    sample_data_general = None

    rematch_folder = os.path.join(outdir, 'rematch_module', '')
    utils.removeDirectory(rematch_folder)
    os.mkdir(rematch_folder)

    # Map reads
    run_successfully, bam_file, reference_file = mapping_reads(fastq_files, reference_file, threads,
                                                               rematch_folder, conserved_True, numMapLoc)

    # Index reference file (only if mapping succeeded)
    if run_successfully:
        run_successfully, stdout = index_fasta_samtools(reference_file, None, None, True)

    if run_successfully:
        print('Analysing alignment data')
        run_successfully, sample_data, consensus_files = sequence_data(
            sample, reference_file, bam_file, rematch_folder, threads, length_extra_seq,
            minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele,
            debug_mode_true)

    if run_successfully:
        print('Writing report file')
        number_absent_genes = 0
        number_genes_multiple_alleles = 0
        mean_sample_coverage = 0

        with open(os.path.join(outdir, 'rematchModule_report.txt'), 'wt') as writer:
            writer.write('\t'.join(['#gene', 'percentage_gene_coverage', 'gene_mean_read_coverage',
                                    'percentage_gene_low_coverage', 'number_positions_multiple_alleles',
                                    'percentage_gene_identity']) + '\n')

            # sample_data is indexed from 1 to the number of genes
            for i in range(1, len(sample_data) + 1):
                gene = sample_data[i]
                writer.write('\t'.join([gene['header'],
                                        str(round(gene['gene_coverage'], 2)),
                                        str(round(gene['gene_mean_read_coverage'], 2)),
                                        str(round(gene['gene_low_coverage'], 2)),
                                        str(gene['gene_number_positions_multiple_alleles']),
                                        str(round(gene['gene_identity'], 2))]) + '\n')

                if gene['gene_coverage'] < minimum_gene_coverage or \
                        gene['gene_identity'] < minimum_gene_identity:
                    number_absent_genes += 1
                else:
                    # Only present genes contribute to the mean coverage and multiple alleles count
                    mean_sample_coverage += gene['gene_mean_read_coverage']
                    if gene['gene_number_positions_multiple_alleles'] > 0:
                        number_genes_multiple_alleles += 1

            number_present_genes = len(sample_data) - number_absent_genes
            if number_present_genes > 0:
                mean_sample_coverage = float(mean_sample_coverage) / float(number_present_genes)
            else:
                mean_sample_coverage = 0

            writer.write('\n'.join(['#general',
                                    '>number_absent_genes', str(number_absent_genes),
                                    '>number_genes_multiple_alleles', str(number_genes_multiple_alleles),
                                    '>mean_sample_coverage', str(round(mean_sample_coverage, 2))]) + '\n')

            print('\n'.join(['number_absent_genes: ' + str(number_absent_genes),
                             'number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles),
                             'mean_sample_coverage: ' + str(round(mean_sample_coverage, 2))]))

        sample_data_general = {'number_absent_genes': number_absent_genes,
                               'number_genes_multiple_alleles': number_genes_multiple_alleles,
                               'mean_sample_coverage': round(mean_sample_coverage, 2)}

    if not debug_mode_true:
        utils.removeDirectory(rematch_folder)

    return run_successfully, sample_data, sample_data_general, consensus_files
def run_spades(sample_name, outdir, threads, fastq_files, not_use_careful, max_memory, min_coverage_assembly,
               min_contigs_length, estimated_genome_size_mb, kmers, maximum_reads_length, default_kmers,
               min_coverage_contigs, assembled_se_reads, save_excluded_contigs, max_number_contigs,
               keep_scaffolds=False):
    """
    Runs SPAdes, copies the original assembly to outdir and writes the (filtered) contigs file

    The working directory <outdir>/spades/ is (re)created and always removed before returning.

    Parameters
    ----------
    sample_name : str
        Used to name the contigs files written to outdir
    outdir : str
        Path to the output directory
    threads, fastq_files, not_use_careful, max_memory, min_coverage_assembly, assembled_se_reads
        Forwarded to spades()
    min_contigs_length, maximum_reads_length
        Forwarded to define_minContigsLength to set the minimum contigs length used for filtering
    estimated_genome_size_mb, min_coverage_contigs, max_number_contigs
        Forwarded to decide_filter_parameters
    kmers
        k-mers given to define_kmers (together with maximum_reads_length) when default_kmers is false
    default_kmers : bool
        If true an empty k-mers list is passed to spades()
    save_excluded_contigs
        Forwarded to write_filtered_sequences_and_stats when sequences are filtered
    keep_scaffolds : bool, default False
        If true and the scaffolds file exists, it is copied to outdir

    Returns
    -------
    run_successfully : bool
        False if spades() failed or the contigs file it reported does not exist
    pass_qc : bool
        Same as run_successfully for this module
    failing : dict
        {'sample': False} on success, otherwise {'sample': <reason>}
    contigs
        Path to the contigs file written to outdir. On failure it is whatever spades() returned
    warnings : dict
        Warnings returned by decide_filter_parameters (empty if that step was not reached)
    """
    pass_qc = True
    failing = {'sample': False}
    warnings = {}

    # Create SPAdes output directory
    spades_folder = os.path.join(outdir, 'spades', '')
    utils.removeDirectory(spades_folder)
    os.mkdir(spades_folder)

    # Determine k-mers to run
    if not default_kmers:
        kmers = define_kmers(kmers, maximum_reads_length)
        if len(kmers) == 0:
            print('SPAdes will use its default k-mers')
        else:
            print('SPAdes will use the following k-mers: ' + str(kmers))
    else:
        kmers = []

    run_successfully, contigs = spades(spades_folder, threads, fastq_files, not_use_careful, max_memory,
                                       min_coverage_assembly, kmers, assembled_se_reads)

    if not run_successfully:
        failing['sample'] = 'Did not run'
    else:
        if keep_scaffolds:
            scaffolds = os.path.join(spades_folder, 'scaffolds.fasta')
            if not os.path.isfile(scaffolds):
                print('The scaffolds file was not found!')
            else:
                shutil.copyfile(scaffolds,
                                os.path.join(outdir, str('SPAdes_original_assembly.scaffolds.fasta')))

        if not os.path.isfile(contigs):
            run_successfully = False
            failing['sample'] = 'Assembly was not produced'
        else:
            # Keep a copy of the untouched assembly before the working directory is removed
            shutil.copyfile(contigs, os.path.join(outdir, str('SPAdes_original_assembly.contigs.fasta')))

            # Temporary link named after the sample; removed once the final file is written
            contigs_link = os.path.join(outdir, str(sample_name + '.contigs.fasta'))
            os.symlink(contigs, contigs_link)
            contigs = contigs_link

            min_contigs_length = define_minContigsLength(maximum_reads_length, min_contigs_length)

            sequence_dict = get_SPAdes_sequence_information(contigs)

            warnings, sequence_dict, filtered_sequences_sufix, spades_report_general = \
                decide_filter_parameters(sequence_dict, min_contigs_length, min_coverage_contigs,
                                         estimated_genome_size_mb, max_number_contigs)

            contigs_prefix = os.path.splitext(contigs)[0]
            if filtered_sequences_sufix is None:
                filtered_sequence_file = contigs_prefix + '.original.fasta'
                write_filtered_sequences_and_stats(sequence_dict, spades_report_general, contigs,
                                                   filtered_sequence_file, sample_name, True, False)
            else:
                filtered_sequence_file = contigs_prefix + '.' + filtered_sequences_sufix + '.fasta'
                write_filtered_sequences_and_stats(sequence_dict, spades_report_general, contigs,
                                                   filtered_sequence_file, sample_name, False,
                                                   save_excluded_contigs)
            contigs = filtered_sequence_file

            os.remove(contigs_link)

    if not run_successfully:
        print(failing['sample'])
        pass_qc = False

    utils.removeDirectory(spades_folder)

    return run_successfully, pass_qc, failing, contigs, warnings
def run_download(ena_id, download_paired_type, aspera_key, outdir, download_cram_bam_true, threads,
                 instrument_platform, sra, sra_opt):
    """
    Downloads the reads of a run accession and moves the resulting fastq files to outdir

    The working directory <outdir>/download/ is (re)created and always removed before returning.
    If get_read_run_info returns information for ena_id, files are only downloaded when the run
    matches the requested instrument platform and library layout. If it returns None and sra or
    sra_opt is set, a download is attempted anyway and, since the library layout is then unknown,
    the files are handled as 'paired' first and as 'single' if that fails.

    Parameters
    ----------
    ena_id : str
        Run accession. Used to get the run information, forwarded to download_files and sra_2_fastq,
        and used to name the files when no run information is available
    download_paired_type : str
        'both' or the library layout the run must have (compared case-insensitively)
    aspera_key, download_cram_bam_true, sra, sra_opt
        Forwarded to download_files. sra / sra_opt also enable the download attempt when no run
        information is available
    outdir : str
        Path to the output directory
    threads
        Forwarded to get_fastq_files
    instrument_platform : str
        'all' or the instrument platform the run must have (compared case-insensitively)

    Returns
    -------
    run_successfully : bool
        False if nothing was downloaded or any step failed
    downloaded_files
        Value returned by the last get_fastq_files / rename_move_files call, or None if no
        files were obtained
    sequencing_information : dict
        Information returned by get_sequencing_information, or a dictionary with the same keys set
        to None (except 'date_download') when no run information is available
    """
    download_dir = os.path.join(outdir, 'download', '')
    utils.removeDirectory(download_dir)
    os.mkdir(download_dir)

    run_successfully = False
    downloaded_files = None
    sequencing_information = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None,
                              'library_layout': None, 'library_source': None, 'extra_run_accession': None,
                              'nominal_length': None, 'read_count': None, 'base_count': None,
                              'date_download': time.strftime("%Y-%m-%d")}

    read_run_info = get_read_run_info(ena_id)
    if read_run_info is not None:
        download_information = get_download_information(read_run_info)
        download_information = check_correct_links(download_information)
        sequencing_information = get_sequencing_information(read_run_info)

        if instrument_platform.lower() == 'all' or \
                (sequencing_information['instrument_platform'] is not None and
                 sequencing_information['instrument_platform'].lower() == instrument_platform.lower()):
            if download_paired_type.lower() == 'both' or \
                    (sequencing_information['library_layout'] is not None and
                     sequencing_information['library_layout'].lower() == download_paired_type.lower()):
                run_successfully, cram_index_run_successfully, download_sra = download_files(
                    download_information, aspera_key, download_dir, download_cram_bam_true, sra, sra_opt,
                    ena_id)
                if download_sra:
                    run_successfully = sra_2_fastq(download_dir, ena_id)
                if run_successfully:
                    run_successfully, downloaded_files = get_fastq_files(
                        download_dir, cram_index_run_successfully, threads,
                        sequencing_information['library_layout'])
                if run_successfully and downloaded_files is not None:
                    run_successfully, downloaded_files = rename_move_files(
                        downloaded_files, sequencing_information['run_accession'], outdir,
                        sequencing_information['library_layout'])
    elif sra or sra_opt:
        # No run information available: try the download without any known link
        run_successfully, cram_index_run_successfully, download_sra = download_files(
            {'fastq': None, 'submitted': None, 'cram_index': None}, aspera_key, download_dir,
            download_cram_bam_true, sra, sra_opt, ena_id)
        if download_sra:
            run_successfully = sra_2_fastq(download_dir, ena_id)
        if run_successfully:
            # Library layout is unknown here: try paired first, then single
            run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully,
                                                                 threads, 'paired')
            if not run_successfully:
                run_successfully, downloaded_files = get_fastq_files(download_dir,
                                                                     cram_index_run_successfully, threads,
                                                                     'single')
        if run_successfully and downloaded_files is not None:
            # Keep the list of fastq files: the 'single' retry must work on the same files, not on
            # whatever the failed 'paired' attempt returned
            fastq_to_rename = downloaded_files
            run_successfully, downloaded_files = rename_move_files(fastq_to_rename, ena_id, outdir, 'paired')
            if not run_successfully:
                run_successfully, downloaded_files = rename_move_files(fastq_to_rename, ena_id, outdir,
                                                                       'single')

    utils.removeDirectory(download_dir)

    return run_successfully, downloaded_files, sequencing_information
def run_pilon(jar_path_pilon, assembly, fastq_files, outdir, jar_max_memory, alignment_file, keep_bam=False,
              threads=1):
    """
    Runs Pilon for INNUca to correct an assembly (no QA/QC is performed)

    Parameters
    ----------
    jar_path_pilon : str
        Path to the Pilon jar file that will be executed
    assembly : str
        Path to the assembly to correct
    fastq_files : list
        List of fastq files
    outdir : str
        Path to the output directory
    jar_max_memory : int or 'off'
        If not 'off' is provided, sets the maximum RAM Gb usage by jar files
    alignment_file : str or None
        Path to the BAM file to be used. If None is provided, new alignment reads will be performed
    keep_bam : bool, default False
        True if want to keep the BAM file produced (with mapped and unmapped reads)
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        Boolean stating if INNUca Pilon module ran successfully or not
    pass_qc : None
        QA/QC not performed
    failing : dict
        {'sample': False} if the module ran, {'sample': 'Did not run'} otherwise
    assembly_polished : str or None
        Path to the polished assembly. If something went wrong, None is returned
    pilon_folder : str
        Path to Pilon working directory
    new_bam : bool
        True if new alignment reads was performed
    alignment_file : str or None
        Path to the BAM file used to correct the assembly. None if no BAM was given and the new
        alignment could not be sorted. When keep_bam is False the file itself is deleted before
        returning (also when it was provided by the caller), although its path is still returned.
    """
    failing = {'sample': False}
    run_successfully = True
    new_bam = False
    assembly_polished = None

    pilon_folder = os.path.join(outdir, 'pilon', '')
    utils.removeDirectory(pilon_folder)
    os.mkdir(pilon_folder)

    # Create a symbolic link to the assembly
    assembly_link = os.path.join(pilon_folder, os.path.basename(assembly))
    os.symlink(assembly, assembly_link)

    if alignment_file is None:
        # No BAM provided: index the assembly with Bowtie2, map the reads and sort the alignment
        run_successfully = indexSequenceBowtie2(assembly_link, threads)

        if run_successfully:
            run_successfully, sam_file = mapping_bowtie2(fastq_files=fastq_files, reference_file=assembly_link,
                                                         outdir=pilon_folder, keep_bam=keep_bam,
                                                         threads=threads)

        if run_successfully:
            unsorted_name = os.path.splitext(sam_file)[0] + '.bam'
            run_successfully, alignment_file = sortAlignment(sam_file, unsorted_name, False, threads)

            if not run_successfully:
                alignment_file = None
            else:
                os.remove(sam_file)
                run_successfully = indexAlignment(alignment_file)
                new_bam = True

    if run_successfully:
        run_successfully, assembly_polished = pilon(jar_path_pilon, assembly_link, alignment_file, pilon_folder,
                                                    jar_max_memory)

        if run_successfully:
            parsePilonResult(assembly_polished, outdir)
            # Move the polished assembly out of the working directory before computing its statistics
            polished_destination = os.path.join(outdir, os.path.basename(assembly_polished))
            os.rename(assembly_polished, polished_destination)
            assembly_polished = polished_destination
            write_assembly_statistics(assembly=assembly_polished, outdir=outdir)

    if keep_bam and new_bam:
        bam_destination = os.path.join(outdir, '{}.bam'.format(os.path.basename(assembly)))
        os.rename(alignment_file, bam_destination)
        alignment_file = bam_destination

    if not keep_bam and alignment_file is not None and os.path.isfile(str(alignment_file)):
        os.remove(alignment_file)

    if not run_successfully:
        failing['sample'] = 'Did not run'
        print(failing['sample'])

    return run_successfully, None, failing, assembly_polished, pilon_folder, new_bam, alignment_file