def analyze_single_sam(mode_code, sam_file, reference_file, reads_file,
                       simulated_sam_path='', consensus_coverage_threshold=20):
    """Write a human-readable analysis summary for one SAM file.

    Every section is echoed to stderr and written to
    ``<sam dir>/analysis-intermediate/summary_sam_analysis-<dataset>.txt``,
    where ``<dataset>`` is the SAM file name without its extension. The
    sections are: output paths, SAM headers, input read statistics and,
    depending on ``mode_code``, consensus statistics and per-base CIGAR
    error rates.

    Args:
        mode_code: Bit mask tested with ``&`` against the MODE_CODE_*
            constants to decide which statistics are calculated/collected.
        sam_file: Path to the SAM file to analyze.
        reference_file: Path to the reference sequence file.
        reads_file: Path to the reads file the alignments were produced from.
        simulated_sam_path: Accepted for interface compatibility; not used
            by this function.
        consensus_coverage_threshold: Base coverage threshold handed to the
            consensus caller and used in the variant summary file name.

    Returns:
        None. Results are only written to the summary file and to stderr.

    Raises:
        SystemExit: With code 1 if the summary file cannot be opened.
    """
    if sam_file == '' or reference_file == '':
        print('[analyze_single_sam] ERROR: No input files given.')
        print_usage_and_exit()
        return

    dir_name = os.path.dirname(sam_file)
    if dir_name == '':
        dir_name = '.'

    # Create the output folders if they don't exist yet.
    output_folder_intermediate = dir_name + '/analysis-intermediate'
    output_folder_final = dir_name + '/analysis-final'
    for folder in (output_folder_intermediate, output_folder_final):
        if not os.path.exists(folder):
            os.makedirs(folder)

    dataset_name = os.path.splitext(os.path.basename(sam_file))[0]

    # Output filenames.
    out_accuracy_counts_path = '%s/error_rates-individualbases-%s.csv' % (
        output_folder_intermediate, dataset_name)
    consensus_prefix = '%s/consensus-%s' % (output_folder_intermediate, dataset_name)
    out_summary_path = '%s/summary_sam_analysis-%s.txt' % (
        output_folder_intermediate, dataset_name)
    out_count_mapped_reads_prefix = '%s/count_reads-%s' % (
        output_folder_intermediate, dataset_name)

    try:
        fp = open(out_summary_path, 'w')
    except IOError:
        sys.stderr.write('ERROR: Could not open summary path "%s" for writing!\n' % out_summary_path)
        # The os module has no exit(); sys.exit() is what terminates the run.
        sys.exit(1)

    def report(title, body):
        """Echo one titled section to stderr and to the summary file."""
        for stream in (sys.stderr, fp):
            stream.write(title)
            stream.write(body)

    try:
        # The filenames used for the output.
        summary_file_paths = ''.join([
            'Analyzing SAM file %s...\n' % (sam_file),
            'Reference file: %s\n' % (reference_file),
            'Accuracy counts: %s\n' % (out_accuracy_counts_path),
            'Consensus prefix: %s\n' % (consensus_prefix),
            'Count mapped reads: %s\n' % (out_count_mapped_reads_prefix),
        ])
        report('[Paths]\n', summary_file_paths + '\n')

        # Headers from the SAM file (if they exist). Useful if the command line was stored.
        sam_headers = utility_sam.LoadOnlySAMHeaders(sam_file, False)
        summary_file_sam_headers = '%s\n' % ('\n'.join(sam_headers))
        report('[SAM headers]\n', summary_file_sam_headers + '\n')

        # Input reads statistics.
        (_, fastqinfo_num_seqs, fastqinfo_total_seq_len,
         fastqinfo_average_seq_len, _) = fastqparser.count_seq_length(reads_file)
        summary_reads_file = ''.join([
            'Number of reads in the input file: %d\n' % (fastqinfo_num_seqs),
            'Total number of bases in the input reads file: %d\n' % (fastqinfo_total_seq_len),
            'Average read length in the input file: %d\n' % (fastqinfo_average_seq_len),
        ])
        report('[Input reads file]\n', summary_reads_file + '\n')

        if mode_code & MODE_CODE_CALC_CONSENSUS:
            sys.stderr.write(
                '[main_sam_analysis.py] Calculating consensus statistics, base coverage threshold %d...\n'
                % (consensus_coverage_threshold))
            consensus.main(sam_file, reference_file, consensus_coverage_threshold,
                           consensus_prefix, thread_id=0)
            sys.stderr.write('\n')

        if (mode_code & MODE_CODE_CALC_CONSENSUS) or (mode_code & MODE_CODE_COLLECT_CONSENSUS):
            # Collect the stats written by the consensus caller.
            variant_summary_path = '%s-cov_%d.variant.sum' % (
                consensus_prefix, consensus_coverage_threshold)
            # Only the formatted text lines (last returned field) are reported here.
            variant_lines = ParseVariantStats(variant_summary_path)[-1]
            summary_consensus = '\n'.join(variant_lines)
            report('[Consensus statistics]\n', summary_consensus + '\n\n')

        if mode_code & MODE_CODE_CALC_ERROR_RATE:
            sys.stderr.write('[analyzesam.py] Calculating error rates - individual indels...\n')
            errorrates.ProcessFromFiles(reference_file, sam_file, out_accuracy_counts_path, False)
            sys.stderr.write('\n')

        if (mode_code & MODE_CODE_CALC_ERROR_RATE) or (mode_code & MODE_CODE_COLLECT_ERROR_RATE):
            # Collect the stats from the error rate estimation.
            error_rates_return = errorrates.CollectAccuracy(sam_file, out_accuracy_counts_path, False)
            try:
                [summary_lines, _, _, _, _, _, _, _, _] = error_rates_return
                cigar_body = summary_lines + '\n'
                fp.write('[CIGAR statistics - individual indels]\n')
                fp.write(cigar_body)
                sys.stderr.write('[CIGAR statistics - individual indels]\n')
                sys.stderr.write(cigar_body)
            except Exception as e:
                # Best effort: a malformed return value is logged, not fatal.
                sys.stderr.write(str(e) + '\n')
                sys.stderr.write('Returned values: %s\n' % (str(error_rates_return)))
    finally:
        # Always release the summary file, even if a collection step raised.
        fp.close()
def run_poa_sequentially_v2(seq_path, out_consensus_file):
    """Build a consensus of the sequences in ``seq_path`` window by window with POA.

    For each coordinate window the subsequences are extracted with
    ``fastqfilter.py subseqs``, aligned with ``poaV2/poa`` into a PIR file,
    and a majority vote is taken over every alignment column. Columns whose
    most frequent symbol is the gap character ``'.'`` are dropped.

    Two files are written: ``out_consensus_file`` (one FASTA record holding
    the concatenated consensus of all windows) and
    ``tmp.consensus.chunks.fasta`` next to it (one record per window).
    Temporary files ``tmp.subseq.fasta`` and ``tmp.subseq.fasta.pir`` are
    created in the same directory.

    Args:
        seq_path: Path to the input sequences.
        out_consensus_file: Path of the consensus FASTA file to write.

    Returns:
        None.
    """
    # A bare file name has an empty dirname; without this fallback the
    # '%s/...' templates below would point at the filesystem root.
    out_dir = os.path.dirname(out_consensus_file) or '.'
    temp_subseq_file = '%s/tmp.subseq.fasta' % (out_dir)
    temp_msa_file = '%s/tmp.subseq.fasta.pir' % (out_dir)
    out_consensus_file_chunks = '%s/tmp.consensus.chunks.fasta' % (out_dir)

    window_len = 5000

    # The context manager closes both outputs even if an external step fails.
    with open(out_consensus_file, 'w') as fp_out_all, \
            open(out_consensus_file_chunks, 'w') as fp_out_chunks:
        timestamp = strftime("%Y/%m/%d %H:%M:%S", gmtime())
        fp_out_all.write('>Consensus_with_POA all %s\n' % (timestamp))

        print('seq_path = "%s"' % (seq_path))

        _, _, _, _, max_seq_len = fastqparser.count_seq_length(seq_path)

        start_coord = 0
        while start_coord < max_seq_len:
            end_coord = start_coord + window_len
            # Fold a short tail into this window instead of emitting a
            # final window shorter than window_len.
            if end_coord > (max_seq_len - window_len):
                end_coord = max_seq_len

            sys.stderr.write('Window: start = %d, end = %d\n' % (start_coord, end_coord))
            execute_command('%s/fastqfilter.py subseqs %s %d %d %s' % (
                SAMSCRIPTS_PATH, seq_path, start_coord, end_coord, temp_subseq_file))
            execute_command(
                '%s/poaV2/poa -do_global -do_progressive -read_fasta %s -pir %s %s/poaV2/blosum80.mat'
                % (TOOLS_PATH, temp_subseq_file, temp_msa_file, TOOLS_PATH))

            timestamp = strftime("%Y/%m/%d %H:%M:%S", gmtime())
            fp_out_chunks.write('>Consensus_with_POA %d-%d %s\n' % (start_coord, end_coord, timestamp))

            [headers, seqs, quals] = fastqparser.read_fastq(temp_msa_file)

            # An empty alignment yields an empty chunk rather than an IndexError.
            num_columns = len(seqs[0]) if seqs else 0
            consensus_bases = []
            for i in range(num_columns):
                base_counts = {'A': 0, 'C': 0, 'T': 0, 'G': 0, '.': 0}
                for aligned_seq in seqs:
                    symbol = aligned_seq[i]
                    # Count unexpected symbols (e.g. 'N') instead of raising KeyError.
                    base_counts[symbol] = base_counts.get(symbol, 0) + 1
                # Ascending stable sort: the last item is the most frequent symbol.
                sorted_base_counts = sorted(base_counts.items(), key=operator.itemgetter(1))
                majority_symbol = sorted_base_counts[-1][0]
                if majority_symbol != '.':
                    consensus_bases.append(majority_symbol)
            cons_seq = ''.join(consensus_bases)

            fp_out_all.write('%s' % (cons_seq))
            fp_out_chunks.write('%s\n' % (cons_seq))

            start_coord = end_coord

        fp_out_all.write('\n')
def analyze_single_sam(mode_code, sam_file, reference_file, reads_file,
                       simulated_sam_path='', consensus_coverage_threshold=20):
    """Write a human-readable analysis summary for one SAM file.

    Every section is echoed to stderr and written to
    ``<sam dir>/analysis-intermediate/summary_sam_analysis-<dataset>.txt``,
    where ``<dataset>`` is the SAM file name without its extension. The
    sections are: output paths, SAM headers, input read statistics and,
    depending on ``mode_code``, consensus statistics and per-base CIGAR
    error rates.

    Args:
        mode_code: Bit mask tested with ``&`` against the MODE_CODE_*
            constants to decide which statistics are calculated/collected.
        sam_file: Path to the SAM file to analyze.
        reference_file: Path to the reference sequence file.
        reads_file: Path to the reads file the alignments were produced from.
        simulated_sam_path: Accepted for interface compatibility; not used
            by this function.
        consensus_coverage_threshold: Base coverage threshold handed to the
            consensus caller and used in the variant summary file name.

    Returns:
        None. Results are only written to the summary file and to stderr.

    Raises:
        SystemExit: With code 1 if the summary file cannot be opened.
    """
    if sam_file == '' or reference_file == '':
        print('[analyze_single_sam] ERROR: No input files given.')
        print_usage_and_exit()
        return

    dir_name = os.path.dirname(sam_file)
    if dir_name == '':
        dir_name = '.'

    # Create the output folders if they don't exist yet.
    output_folder_intermediate = dir_name + '/analysis-intermediate'
    output_folder_final = dir_name + '/analysis-final'
    for folder in (output_folder_intermediate, output_folder_final):
        if not os.path.exists(folder):
            os.makedirs(folder)

    dataset_name = os.path.splitext(os.path.basename(sam_file))[0]

    # Output filenames.
    out_accuracy_counts_path = '%s/error_rates-individualbases-%s.csv' % (
        output_folder_intermediate, dataset_name)
    consensus_prefix = '%s/consensus-%s' % (output_folder_intermediate, dataset_name)
    out_summary_path = '%s/summary_sam_analysis-%s.txt' % (
        output_folder_intermediate, dataset_name)
    out_count_mapped_reads_prefix = '%s/count_reads-%s' % (
        output_folder_intermediate, dataset_name)

    try:
        fp = open(out_summary_path, 'w')
    except IOError:
        sys.stderr.write('ERROR: Could not open summary path "%s" for writing!\n' % out_summary_path)
        # The os module has no exit(); sys.exit() is what terminates the run.
        sys.exit(1)

    def report(title, body):
        """Echo one titled section to stderr and to the summary file."""
        for stream in (sys.stderr, fp):
            stream.write(title)
            stream.write(body)

    try:
        # The filenames used for the output.
        summary_file_paths = ''.join([
            'Analyzing SAM file %s...\n' % (sam_file),
            'Reference file: %s\n' % (reference_file),
            'Accuracy counts: %s\n' % (out_accuracy_counts_path),
            'Consensus prefix: %s\n' % (consensus_prefix),
            'Count mapped reads: %s\n' % (out_count_mapped_reads_prefix),
        ])
        report('[Paths]\n', summary_file_paths + '\n')

        # Headers from the SAM file (if they exist). Useful if the command line was stored.
        sam_headers = utility_sam.LoadOnlySAMHeaders(sam_file, False)
        summary_file_sam_headers = '%s\n' % ('\n'.join(sam_headers))
        report('[SAM headers]\n', summary_file_sam_headers + '\n')

        # Input reads statistics.
        (_, fastqinfo_num_seqs, fastqinfo_total_seq_len,
         fastqinfo_average_seq_len, _) = fastqparser.count_seq_length(reads_file)
        summary_reads_file = ''.join([
            'Number of reads in the input file: %d\n' % (fastqinfo_num_seqs),
            'Total number of bases in the input reads file: %d\n' % (fastqinfo_total_seq_len),
            'Average read length in the input file: %d\n' % (fastqinfo_average_seq_len),
        ])
        report('[Input reads file]\n', summary_reads_file + '\n')

        if mode_code & MODE_CODE_CALC_CONSENSUS:
            sys.stderr.write(
                '[main_sam_analysis.py] Calculating consensus statistics, base coverage threshold %d...\n'
                % (consensus_coverage_threshold))
            consensus.main(sam_file, reference_file, consensus_coverage_threshold,
                           consensus_prefix, thread_id=0)
            sys.stderr.write('\n')

        if (mode_code & MODE_CODE_CALC_CONSENSUS) or (mode_code & MODE_CODE_COLLECT_CONSENSUS):
            # Collect the stats written by the consensus caller.
            variant_summary_path = '%s-cov_%d.variant.sum' % (
                consensus_prefix, consensus_coverage_threshold)
            # Only the formatted text lines (last returned field) are reported here.
            variant_lines = ParseVariantStats(variant_summary_path)[-1]
            summary_consensus = '\n'.join(variant_lines)
            report('[Consensus statistics]\n', summary_consensus + '\n\n')

        if mode_code & MODE_CODE_CALC_ERROR_RATE:
            sys.stderr.write('[analyzesam.py] Calculating error rates - individual indels...\n')
            errorrates.ProcessFromFiles(reference_file, sam_file, out_accuracy_counts_path, False)
            sys.stderr.write('\n')

        if (mode_code & MODE_CODE_CALC_ERROR_RATE) or (mode_code & MODE_CODE_COLLECT_ERROR_RATE):
            # Collect the stats from the error rate estimation.
            error_rates_return = errorrates.CollectAccuracy(sam_file, out_accuracy_counts_path, False)
            try:
                [summary_lines, _, _, _, _, _, _, _, _] = error_rates_return
                cigar_body = summary_lines + '\n'
                fp.write('[CIGAR statistics - individual indels]\n')
                fp.write(cigar_body)
                sys.stderr.write('[CIGAR statistics - individual indels]\n')
                sys.stderr.write(cigar_body)
            except Exception as e:
                # Best effort: a malformed return value is logged, not fatal.
                sys.stderr.write(str(e) + '\n')
                sys.stderr.write('Returned values: %s\n' % (str(error_rates_return)))
    finally:
        # Always release the summary file, even if a collection step raised.
        fp.close()
def run_poa_sequentially_v2(seq_path, out_consensus_file):
    """Build a consensus of the sequences in ``seq_path`` window by window with POA.

    For each coordinate window the subsequences are extracted with
    ``fastqfilter.py subseqs``, aligned with ``poaV2/poa`` into a PIR file,
    and a majority vote is taken over every alignment column. Columns whose
    most frequent symbol is the gap character ``'.'`` are dropped.

    Two files are written: ``out_consensus_file`` (one FASTA record holding
    the concatenated consensus of all windows) and
    ``tmp.consensus.chunks.fasta`` next to it (one record per window).
    Temporary files ``tmp.subseq.fasta`` and ``tmp.subseq.fasta.pir`` are
    created in the same directory.

    Args:
        seq_path: Path to the input sequences.
        out_consensus_file: Path of the consensus FASTA file to write.

    Returns:
        None.
    """
    # A bare file name has an empty dirname; without this fallback the
    # '%s/...' templates below would point at the filesystem root.
    out_dir = os.path.dirname(out_consensus_file) or '.'
    temp_subseq_file = '%s/tmp.subseq.fasta' % (out_dir)
    temp_msa_file = '%s/tmp.subseq.fasta.pir' % (out_dir)
    out_consensus_file_chunks = '%s/tmp.consensus.chunks.fasta' % (out_dir)

    window_len = 5000

    # The context manager closes both outputs even if an external step fails.
    with open(out_consensus_file, 'w') as fp_out_all, \
            open(out_consensus_file_chunks, 'w') as fp_out_chunks:
        timestamp = strftime("%Y/%m/%d %H:%M:%S", gmtime())
        fp_out_all.write('>Consensus_with_POA all %s\n' % (timestamp))

        print('seq_path = "%s"' % (seq_path))

        _, _, _, _, max_seq_len = fastqparser.count_seq_length(seq_path)

        start_coord = 0
        while start_coord < max_seq_len:
            end_coord = start_coord + window_len
            # Fold a short tail into this window instead of emitting a
            # final window shorter than window_len.
            if end_coord > (max_seq_len - window_len):
                end_coord = max_seq_len

            sys.stderr.write('Window: start = %d, end = %d\n' % (start_coord, end_coord))
            execute_command('%s/fastqfilter.py subseqs %s %d %d %s' % (
                SAMSCRIPTS_PATH, seq_path, start_coord, end_coord, temp_subseq_file))
            execute_command(
                '%s/poaV2/poa -do_global -do_progressive -read_fasta %s -pir %s %s/poaV2/blosum80.mat'
                % (TOOLS_PATH, temp_subseq_file, temp_msa_file, TOOLS_PATH))

            timestamp = strftime("%Y/%m/%d %H:%M:%S", gmtime())
            fp_out_chunks.write('>Consensus_with_POA %d-%d %s\n' % (start_coord, end_coord, timestamp))

            [headers, seqs, quals] = fastqparser.read_fastq(temp_msa_file)

            # An empty alignment yields an empty chunk rather than an IndexError.
            num_columns = len(seqs[0]) if seqs else 0
            consensus_bases = []
            for i in range(num_columns):
                base_counts = {'A': 0, 'C': 0, 'T': 0, 'G': 0, '.': 0}
                for aligned_seq in seqs:
                    symbol = aligned_seq[i]
                    # Count unexpected symbols (e.g. 'N') instead of raising KeyError.
                    base_counts[symbol] = base_counts.get(symbol, 0) + 1
                # Ascending stable sort: the last item is the most frequent symbol.
                sorted_base_counts = sorted(base_counts.items(), key=operator.itemgetter(1))
                majority_symbol = sorted_base_counts[-1][0]
                if majority_symbol != '.':
                    consensus_bases.append(majority_symbol)
            cons_seq = ''.join(consensus_bases)

            fp_out_all.write('%s' % (cons_seq))
            fp_out_chunks.write('%s\n' % (cons_seq))

            start_coord = end_coord

        fp_out_all.write('\n')
def analyze_single_sam(
    mode_code,
    sam_file,
    reference_file,
    reads_file,
    simulated_sam_path="",
    consensus_coverage_threshold=20,
):
    """Analyze one SAM file and return its statistics as tab-separated text.

    Every section of the analysis is echoed to stderr and written to
    ``<sam dir>/analysis-intermediate/summary_sam_analysis-<dataset>.txt``,
    where ``<dataset>`` is the SAM file name without its extension. The
    sections are: output paths, SAM headers, input read statistics,
    memory/time measurements and, depending on ``mode_code``, consensus
    statistics, CIGAR error rates (individual indels and indels as events)
    and mapped read counts.

    Args:
        mode_code: Bit mask tested with ``&`` against the MODE_CODE_*
            constants to decide which statistics are calculated/collected
            and whether the CSV header is returned.
        sam_file: Path to the SAM file to analyze.
        reference_file: Path to the reference sequence file.
        reads_file: Path to the reads file the alignments were produced from.
        simulated_sam_path: Accepted for interface compatibility; not used
            by this function.
        consensus_coverage_threshold: Base coverage threshold handed to the
            consensus caller; also the last CSV column.

    Returns:
        The CSV line terminated by a newline, preceded by the header line
        when ``mode_code`` has ``MODE_CODE_HEADER`` set. ``None`` when
        ``sam_file`` or ``reference_file`` is empty.

    Raises:
        SystemExit: With code 1 if the summary file cannot be opened.
    """
    if sam_file == "" or reference_file == "":
        print("[analyze_single_sam] ERROR: No input files given.")
        print_usage_and_exit()
        return

    dir_name = os.path.dirname(sam_file) or "."

    # Create the output folders if they don't exist yet.
    output_folder_intermediate = dir_name + "/analysis-intermediate"
    output_folder_final = dir_name + "/analysis-final"
    for folder in (output_folder_intermediate, output_folder_final):
        if not os.path.exists(folder):
            os.makedirs(folder)

    dataset_name = os.path.splitext(os.path.basename(sam_file))[0]

    # Output filenames.
    out_accuracy_counts_path = "%s/error_rates-individualbases-%s.csv" % (
        output_folder_intermediate,
        dataset_name,
    )
    out_accuracy_counts_indel_events_path = "%s/error_rates-eventsindel-%s.csv" % (
        output_folder_intermediate,
        dataset_name,
    )
    consensus_prefix = "%s/consensus-%s" % (output_folder_intermediate, dataset_name)
    out_summary_path = "%s/summary_sam_analysis-%s.txt" % (
        output_folder_intermediate,
        dataset_name,
    )
    out_count_mapped_reads_prefix = "%s/count_reads-%s" % (
        output_folder_intermediate,
        dataset_name,
    )

    try:
        fp = open(out_summary_path, "w")
    except IOError:
        sys.stderr.write(
            'ERROR: Could not open summary path "%s" for writing!\n' % out_summary_path
        )
        # The os module has no exit(); sys.exit() is what terminates the run.
        sys.exit(1)

    def percent(part, whole):
        """Return part/whole as a percentage; 0.0 when whole is zero."""
        if whole == 0:
            return 0.0
        return (float(part) / float(whole)) * 100.0

    def report(title, body):
        """Echo one titled section to stderr and to the summary file."""
        for stream in (sys.stderr, fp):
            stream.write(title)
            stream.write(body)

    def collect_error_rates(counts_path, title):
        """Report the collected CIGAR statistics; return the text for the final summary."""
        error_rates_return = errorrates.CollectAccuracy(sam_file, counts_path, False)
        try:
            [summary_lines, _, _, _, _, _, _, _, _] = error_rates_return
            body = summary_lines + "\n"
            fp.write(title)
            fp.write(body)
            sys.stderr.write(title)
            sys.stderr.write(body)
        except Exception as e:
            # Best effort: a malformed return value is logged, not fatal.
            sys.stderr.write(str(e) + "\n")
            sys.stderr.write("Returned values: %s\n" % (str(error_rates_return)))
            return ""
        return title + summary_lines + "\n\n"

    summary_parts = []
    csv_line = "%s\t" % (dataset_name)
    csv_header = "mapper\t"

    try:
        # The filenames used for the output.
        summary_file_paths = "".join(
            [
                "Analyzing SAM file %s...\n" % (sam_file),
                "Reference file: %s\n" % (reference_file),
                "Accuracy counts: %s\n" % (out_accuracy_counts_path),
                "Consensus prefix: %s\n" % (consensus_prefix),
                "Count mapped reads: %s\n" % (out_count_mapped_reads_prefix),
            ]
        )
        summary_parts.append("[Paths]\n" + summary_file_paths + "\n")
        report("[Paths]\n", summary_file_paths + "\n")

        # Headers from the SAM file (if they exist). Useful if the command line was stored.
        sam_headers = utility_sam.LoadOnlySAMHeaders(sam_file, False)
        summary_file_sam_headers = "%s\n" % ("\n".join(sam_headers))
        summary_parts.append("[SAM headers]\n" + summary_file_sam_headers + "\n")
        report("[SAM headers]\n", summary_file_sam_headers + "\n")

        # Input reads statistics.
        (
            _,
            fastqinfo_num_seqs,
            fastqinfo_total_seq_len,
            fastqinfo_average_seq_len,
            _,
        ) = fastqparser.count_seq_length(reads_file)
        summary_reads_file = "".join(
            [
                "Number of reads in the input file: %d\n" % (fastqinfo_num_seqs),
                "Total number of bases in the input reads file: %d\n"
                % (fastqinfo_total_seq_len),
                "Average read length in the input file: %d\n"
                % (fastqinfo_average_seq_len),
            ]
        )
        summary_parts.append("[Input reads file]\n" + summary_reads_file + "\n")
        report("[Input reads file]\n", summary_reads_file + "\n")

        if mode_code & MODE_CODE_CALC_CONSENSUS:
            sys.stderr.write(
                "[main_sam_analysis.py] Calculating consensus statistics, base coverage threshold %d...\n"
                % (consensus_coverage_threshold)
            )
            consensus.main(
                sam_file,
                reference_file,
                consensus_coverage_threshold,
                consensus_prefix,
                thread_id=0,
            )
            sys.stderr.write("\n")

        if (mode_code & MODE_CODE_CALC_CONSENSUS) or (
            mode_code & MODE_CODE_COLLECT_CONSENSUS
        ):
            # Collect the stats written by the consensus caller.
            variant_summary_path = "%s-cov_%d.variant.sum" % (
                consensus_prefix,
                consensus_coverage_threshold,
            )
            [
                _,
                _,
                _,
                snp_count,
                insertion_count,
                deletion_count,
                num_undercovered_bases,
                num_called_bases,
                num_correct_bases,
                average_coverage,
                variant_lines,
            ] = ParseVariantStats(variant_summary_path)
            summary_consensus = "\n".join(variant_lines)
            summary_parts.append(
                "[Consensus statistics]\n" + summary_consensus + "\n\n"
            )

            # Column order is kept the same as in figure1.xlsx.
            csv_line += "%d\t%d\t%d\t%d\t%d\t%d\t%f\t" % (
                insertion_count,
                deletion_count,
                snp_count,
                num_undercovered_bases,
                num_called_bases,
                num_correct_bases,
                average_coverage,
            )
            csv_header += "Insertions\tDeletions\tSNPs\tUncalled Bases\tnum_called_bases\tnum_correct_bases\taverage_coverage\t"

            report("[Consensus statistics]\n", summary_consensus + "\n\n")

        if mode_code & MODE_CODE_CALC_ERROR_RATE:
            sys.stderr.write(
                "[analyzesam.py] Calculating error rates - individual indels...\n"
            )
            errorrates.ProcessFromFiles(
                reference_file, sam_file, out_accuracy_counts_path, False
            )
            sys.stderr.write("\n")

        if (mode_code & MODE_CODE_CALC_ERROR_RATE) or (
            mode_code & MODE_CODE_COLLECT_ERROR_RATE
        ):
            summary_parts.append(
                collect_error_rates(
                    out_accuracy_counts_path,
                    "[CIGAR statistics - individual indels]\n",
                )
            )

        if mode_code & MODE_CODE_CALC_ERROR_RATE_INDELS_AS_EVENTS:
            sys.stderr.write(
                "[analyzesam.py] Calculating error rates - indels as events...\n"
            )
            errorrates.ProcessFromFiles(
                reference_file, sam_file, out_accuracy_counts_indel_events_path, True
            )
            sys.stderr.write("\n")

        if (mode_code & MODE_CODE_CALC_ERROR_RATE_INDELS_AS_EVENTS) or (
            mode_code & MODE_CODE_COLLECT_ERROR_RATE_INDELS_AS_EVENTS
        ):
            summary_parts.append(
                collect_error_rates(
                    out_accuracy_counts_indel_events_path,
                    "[CIGAR statistics - indels as events]\n",
                )
            )

        # Defaults used by the throughput figures when read counting is off.
        num_unique_reads = 0
        num_mapped_bases = 0

        if (mode_code & MODE_CODE_CALC_READ_COUNTS) or (
            mode_code & MODE_CODE_COLLECT_READ_COUNTS
        ):
            sys.stderr.write("[main_sam_analysis.py] Counting mapped reads...\n")
            [
                num_alignments,
                num_mapped_alignments,
                num_unique_reads,
                num_mapped_reads,
                num_mapped_bases,
            ] = utility_sam.CountMappedReads(sam_file)
            sys.stderr.write("\n")

            num_unmapped_alignments = num_alignments - num_mapped_alignments
            # percent() yields 0.0 for an empty SAM/reads file instead of
            # raising ZeroDivisionError.
            percent_mapped_reads = percent(num_mapped_reads, fastqinfo_num_seqs)
            percent_mapped_bases = percent(num_mapped_bases, fastqinfo_total_seq_len)

            summary_read_count = "".join(
                [
                    "num_alignments: %d\n" % num_alignments,
                    "num_mapped_alignments: %d (%.2f%%)\n"
                    % (
                        num_mapped_alignments,
                        percent(num_mapped_alignments, num_alignments),
                    ),
                    "num_unmapped_alignments: %d (%.2f%%)\n"
                    % (
                        num_unmapped_alignments,
                        percent(num_unmapped_alignments, num_alignments),
                    ),
                    "num_mapped_reads: %d\n" % num_mapped_reads,
                    "num_uniquely_mapped_reads: %d\n" % num_unique_reads,
                    "num_mapped_bases: %d\n" % num_mapped_bases,
                    "num_read_in_input_reads_file: %d\n" % (fastqinfo_num_seqs),
                    "num_bases_in_input_reads_file: %d\n" % (fastqinfo_total_seq_len),
                    "percent_mapped_reads: %.2f%%\n" % (percent_mapped_reads),
                    "percent_mapped_bases: %.2f%%\n" % (percent_mapped_bases),
                ]
            )
            summary_parts.append("[SAM file statistics]\n")
            summary_parts.append(summary_read_count + "\n\n")

            # Column order is kept the same as in figure1.xlsx.
            csv_line += "%d\t%f\t%d\t%f\t" % (
                num_mapped_bases,
                percent_mapped_bases,
                num_mapped_reads,
                percent_mapped_reads,
            )
            csv_header += "Num mapped bases\tPercent mapped bases\tNum mapped reads\tPercent mapped reads\t"

            report("[SAM file statistics]\n", summary_read_count + "\n")

        # CPU time and memory consumption recorded for the mapping process.
        [
            _,
            _,
            cputime,
            _,
            _,
            maxrss,
            _,
            memtime_lines,
        ] = ParseMemTime(sam_file)

        reads_per_sec = (
            0.0 if (cputime == 0) else (float(num_unique_reads) / float(cputime))
        )
        bases_per_sec = (
            0.0 if (cputime == 0) else (float(num_mapped_bases) / float(cputime))
        )
        # Guard on the actual divisor (maxrss), not on cputime.
        bases_per_mb = (
            0.0 if (maxrss == 0) else (float(num_mapped_bases) / float(maxrss))
        )

        summary_memtime = "\n".join(memtime_lines)
        summary_parts.append("[Memtime]\n" + summary_memtime + "\n\n")

        csv_line += "%f\t%f\t" % (cputime, maxrss)
        csv_line += "%f\t%f\t%f\t" % (reads_per_sec, bases_per_sec, bases_per_mb)
        csv_header += "CPU time [s]\tMemory [MB]\treads/sec\tbases/sec\tbases/MB\t"

        report("[Memtime]\n", summary_memtime + "\n\n")

        # Repeat the complete summary on stderr once everything is collected.
        sys.stderr.write("".join(summary_parts))
        sys.stderr.write("[Done!]\n")
        sys.stderr.write("\n")
    finally:
        # Always release the summary file, even if a collection step raised.
        fp.close()

    csv_header += "Coverage threshold"
    csv_line += "%d\t" % (consensus_coverage_threshold)

    csv_header = csv_header.strip()
    csv_line = csv_line.strip()

    if mode_code & MODE_CODE_HEADER:
        return "%s\n%s\n" % (csv_header, csv_line)

    return "%s\n" % (csv_line)