Code example #1
File: poaconsv2.py Project: isovic/ra-consensus

# Standard-library imports used by this excerpt. fastqparser, execute_command,
# SAMSCRIPTS_PATH and TOOLS_PATH are provided by the surrounding project.
import operator
import os
import sys
from time import gmtime, strftime

def run_poa_sequentially_v2(seq_path, out_consensus_file):
    temp_subseq_file = '%s/tmp.subseq.fasta' % (
        os.path.dirname(out_consensus_file))
    temp_msa_file = '%s/tmp.subseq.fasta.pir' % (
        os.path.dirname(out_consensus_file))
    # out_consensus_file = '%s/consensus-poa.fasta' % (os.path.dirname(seq_path));
    out_consensus_file_chunks = '%s/tmp.consensus.chunks.fasta' % (
        os.path.dirname(out_consensus_file))

    fp_out_all = open(out_consensus_file, 'w')
    fp_out_chunks = open(out_consensus_file_chunks, 'w')

    timestamp = strftime("%Y/%m/%d %H:%M:%S", gmtime())
    fp_out_all.write('>Consensus_with_POA all %s\n' % (timestamp))

    print('seq_path = "%s"' % (seq_path))

    [ret_string, num_seqs, total_seq_len, average_seq_len,
     max_seq_len] = fastqparser.count_seq_length(seq_path)

    window_len = 5000
    # window_len = 1000;
    # window_len = max_seq_len;

    start_coord = 0
    while (start_coord < max_seq_len):
        end_coord = start_coord + window_len
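        # Extend the final window to the end of the longest sequence so the
        # last chunk is never shorter than window_len.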
        if (end_coord > (max_seq_len - window_len)):
            end_coord = max_seq_len

        sys.stderr.write('Window: start = %d, end = %d\n' %
                         (start_coord, end_coord))
        execute_command('%s/fastqfilter.py subseqs %s %d %d %s' %
                        (SAMSCRIPTS_PATH, seq_path, start_coord, end_coord,
                         temp_subseq_file))

        # if (start_coord == 0 or end_coord == max_seq_len):
        # 	execute_command('%s/poaV2/poa -do_progressive -read_fasta %s -pir %s %s/poaV2/blosum80.mat' % (TOOLS_PATH, temp_subseq_file, temp_msa_file, TOOLS_PATH));
        # execute_command('%s/poaV2/poa -do_progressive -read_fasta %s -pir %s %s/poaV2/all1.mat' % (TOOLS_PATH, temp_subseq_file, temp_msa_file, TOOLS_PATH));
        # else:
        execute_command(
            '%s/poaV2/poa -do_global -do_progressive -read_fasta %s -pir %s %s/poaV2/blosum80.mat'
            % (TOOLS_PATH, temp_subseq_file, temp_msa_file, TOOLS_PATH))
        # execute_command('%s/poaV2/poa -do_global -do_progressive -read_fasta %s -pir %s %s/poaV2/all1.mat' % (TOOLS_PATH, temp_subseq_file, temp_msa_file, TOOLS_PATH));

        timestamp = strftime("%Y/%m/%d %H:%M:%S", gmtime())
        fp_out_chunks.write('>Consensus_with_POA %d-%d %s\n' %
                            (start_coord, end_coord, timestamp))
        [headers, seqs, quals] = fastqparser.read_fastq(temp_msa_file)

        # Column-wise majority vote over the rows of the MSA; a column is
        # dropped when the gap symbol '.' is its most frequent character.
        cons_seq = ''
        for i in range(len(seqs[0])):
            base_counts = {'A': 0, 'C': 0, 'T': 0, 'G': 0, '.': 0}
            for j in range(len(seqs)):
                # .get() tolerates symbols outside ACGT/'.' (e.g. 'N').
                base = seqs[j][i]
                base_counts[base] = base_counts.get(base, 0) + 1
            sorted_base_counts = sorted(base_counts.items(),
                                        key=operator.itemgetter(1))
            # print(sorted_base_counts)
            if (sorted_base_counts[-1][0] != '.'):
                cons_seq += sorted_base_counts[-1][0]

        fp_out_all.write('%s' % (cons_seq))
        fp_out_chunks.write('%s\n' % (cons_seq))

        # # print temp_subseq_file;
        # # print headers;
        # i = 0;
        # while (i < len(headers)):
        # 	if ('consensus' in headers[i]):
        # 		# print seqs[i];
        # 		# print seqs[i].replace('.', '');
        # 		chunk_seq = seqs[i].replace('.', '');
        # 		fp_out_all.write('%s' % (chunk_seq));
        # 		fp_out_chunks.write('%s\n' % (chunk_seq));
        # 		break;
        # 	i += 1;

        # break;
        start_coord = end_coord

    fp_out_all.write('\n')
    fp_out_all.close()
    fp_out_chunks.close()
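
The inner loop above builds each chunk's consensus by a column-wise majority
vote over the gap-padded MSA rows, skipping columns where the gap character
wins. A minimal standalone sketch of just that step (the msa_rows input and
the gap_char parameter are illustrative, not part of the original code):

from collections import Counter

def majority_vote_consensus(msa_rows, gap_char='.'):
    # All rows of a POA .pir alignment have the same length; each column
    # contributes its most frequent symbol unless that symbol is the gap.
    # Ties are broken arbitrarily, as in the sort-based original.
    consensus = []
    for column in zip(*msa_rows):
        base, _count = Counter(column).most_common(1)[0]
        if base != gap_char:
            consensus.append(base)
    return ''.join(consensus)

# The gap-majority column (position 2) is dropped:
print(majority_vote_consensus(['A.CGT', 'A.CG.', 'ACCGT']))  # -> 'ACGT'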
Code example #2
File: alignmentstats.py Project: isovic/samscripts

# Standard-library imports used by this excerpt. utility_sam, fastqparser,
# consensus, errorrates, ParseVariantStats, ParseMemTime, print_usage_and_exit
# and the MODE_CODE_* flags are provided by the surrounding project.
import os
import sys

def analyze_single_sam(
    mode_code,
    sam_file,
    reference_file,
    reads_file,
    simulated_sam_path="",
    consensus_coverage_threshold=20,
):

    if sam_file == "" or reference_file == "":
        print("[analyze_single_sam] ERROR: No input files given.")
        print_usage_and_exit()
        return

    dir_name = os.path.dirname(sam_file)
    if dir_name == "":
        dir_name = "."

    # Create the output path if it doesn't exist yet.
    output_folder_intermediate = dir_name + "/analysis-intermediate"
    output_folder_final = dir_name + "/analysis-final"
    if not os.path.exists(output_folder_intermediate):
        os.makedirs(output_folder_intermediate)
    if not os.path.exists(output_folder_final):
        os.makedirs(output_folder_final)

    dataset_name = os.path.splitext(os.path.basename(sam_file))[0]
    summary = ""
    csv_line = "%s\t" % (dataset_name)
    csv_header = "mapper\t"

    # Defining output filenames.
    out_accuracy_counts_path = "%s/error_rates-individualbases-%s.csv" % (
        output_folder_intermediate,
        dataset_name,
    )
    out_accuracy_counts_indel_events_path = "%s/error_rates-eventsindel-%s.csv" % (
        output_folder_intermediate,
        dataset_name,
    )
    consensus_prefix = "%s/consensus-%s" % (output_folder_intermediate, dataset_name)
    out_summary_path = "%s/summary_sam_analysis-%s.txt" % (
        output_folder_intermediate,
        dataset_name,
    )
    out_consensus_plot_lines = consensus_prefix + ".plot"
    out_consensus_plot_png = consensus_prefix + ".plot.png"
    out_count_mapped_reads_prefix = "%s/count_reads-%s" % (
        output_folder_intermediate,
        dataset_name,
    )
    out_results_path = "%s/results-%s.csv" % (output_folder_final, dataset_name)

    try:
        fp = open(out_summary_path, "w")
    except IOError:
        sys.stderr.write(
            'ERROR: Could not open summary path "%s" for writing!\n' % out_summary_path
        )
        sys.exit(1)

    # Report the output file paths.
    summary_file_paths = ""
    summary_file_paths += "Analyzing SAM file %s...\n" % (sam_file)
    summary_file_paths += "Reference file: %s\n" % (reference_file)
    # summary_file_paths += 'Output folder: %s\n' % (output_folder);
    summary_file_paths += "Accuracy counts: %s\n" % (out_accuracy_counts_path)
    summary_file_paths += "Consensus prefix: %s\n" % (consensus_prefix)
    summary_file_paths += "Count mapped reads: %s\n" % (out_count_mapped_reads_prefix)
    summary += "[Paths]\n" + summary_file_paths + "\n"
    # Echo to both the screen (stderr) and the summary file.
    sys.stderr.write("[Paths]\n")
    sys.stderr.write(summary_file_paths + "\n")
    fp.write("[Paths]\n")
    fp.write(summary_file_paths + "\n")

    # Get the headers from the SAM file (if present). Useful when the mapper's command line was stored in the header.
    sam_headers = utility_sam.LoadOnlySAMHeaders(sam_file, False)
    summary_file_sam_headers = "%s\n" % ("\n".join(sam_headers))
    summary += "[SAM headers]\n" + summary_file_sam_headers + "\n"
    # Echo to both the screen (stderr) and the summary file.
    sys.stderr.write("[SAM headers]\n")
    sys.stderr.write(summary_file_sam_headers + "\n")
    fp.write("[SAM headers]\n")
    fp.write(summary_file_sam_headers + "\n")

    # Calculate the input reads statistics.
    [
        fastqinfo_string,
        fastqinfo_num_seqs,
        fastqinfo_total_seq_len,
        fastqinfo_average_seq_len,
        temp_max_seq_len,
    ] = fastqparser.count_seq_length(reads_file)
    summary_reads_file = ""
    summary_reads_file += "Number of reads in the input file: %d\n" % (
        fastqinfo_num_seqs
    )
    summary_reads_file += "Total number of bases in the input reads file: %d\n" % (
        fastqinfo_total_seq_len
    )
    summary_reads_file += "Average read length in the input file: %d\n" % (
        fastqinfo_average_seq_len
    )
    summary += "[Input reads file]\n" + summary_reads_file + "\n"
    ###############################
    # This version is more verbose, but is different from figure1.xlsx!
    # csv_line += '%d\t%d\t' % (fastqinfo_num_seqs, fastqinfo_total_seq_len);
    # csv_header += 'Num input reads\tNum input bases\t';
    ###############################
    # Echo to both the screen (stderr) and the summary file.
    sys.stderr.write("[Input reads file]\n")
    sys.stderr.write(summary_reads_file + "\n")
    fp.write("[Input reads file]\n")
    fp.write(summary_reads_file + "\n")

    # header = 'CPU time\tMax RSS\tNum input reads\tNum input bases\t';

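    # mode_code is a bitmask: each MODE_CODE_* flag independently enables one
    # analysis stage, so several stages can run in a single call.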
    if mode_code & MODE_CODE_CALC_CONSENSUS:
        summary_consensus = ""
        sys.stderr.write(
            "[main_sam_analysis.py] Calculating consensus statistics, base coverage threshold %d...\n"
            % (consensus_coverage_threshold)
        )
        consensus.main(
            sam_file,
            reference_file,
            consensus_coverage_threshold,
            consensus_prefix,
            thread_id=0,
        )
        sys.stderr.write("\n")
    if (mode_code & MODE_CODE_CALC_CONSENSUS) or (
        mode_code & MODE_CODE_COLLECT_CONSENSUS
    ):
        # Collecting stats from the consensus caller.
        # consensus_plot_lines and consensus_statistics have the same information, only the
        # consensus_statistics object is a dict and the consensus_plot_lines is a formatted string
        # meant for outputting to a summary text file.
        variant_summary_path = "%s-cov_%d.variant.sum" % (
            consensus_prefix,
            consensus_coverage_threshold,
        )
        [
            alignments_file,
            mpileup_file,
            coverage_threshold,
            snp_count,
            insertion_count,
            deletion_count,
            num_undercovered_bases,
            num_called_bases,
            num_correct_bases,
            average_coverage,
            variant_lines,
        ] = ParseVariantStats(variant_summary_path)

        summary_consensus = "\n".join(variant_lines)
        summary += "[Consensus statistics]\n" + summary_consensus + "\n\n"

        ###############################
        # This version is more verbose, but is different from figure1.xlsx!
        # csv_line += '%d\t%d\t%d\t%d\t%d\t%d\t%f\t%d\t' % (snp_count, deletion_count, insertion_count,
        #                                                   num_undercovered_bases, num_called_bases,
        #                                                   num_correct_bases, average_coverage, coverage_threshold);
        # csv_header += 'snp_count\tdeletion_count\tinsertion_count\tnum_undercovered_bases\tnum_called_bases\tnum_correct_bases\taverage_coverage\tcoverage_threshold\t';
        ###############################

        ###############################
        # This version is in the same order as figure1.xlsx!
        csv_line += "%d\t%d\t%d\t%d\t%d\t%d\t%f\t" % (
            insertion_count,
            deletion_count,
            snp_count,
            num_undercovered_bases,
            num_called_bases,
            num_correct_bases,
            average_coverage,
        )
        csv_header += "Insertions\tDeletions\tSNPs\tUncalled Bases\tnum_called_bases\tnum_correct_bases\taverage_coverage\t"
        ###############################

        # [summary_consensus, consensus_plot_lines, consensus_statistics] = consensus_stats.CollectConsensus(sam_file, consensus_prefix, consensus_coverage_thresholds, suppress_error_messages);
        sys.stderr.write("[Consensus statistics]\n")
        sys.stderr.write(summary_consensus + "\n\n")
        fp.write("[Consensus statistics]\n")
        fp.write(summary_consensus + "\n\n")
        # fp_consensus_plot_lines = open(out_consensus_plot_lines, 'w');
        # fp_consensus_plot_lines.write(consensus_plot_lines);
        # fp_consensus_plot_lines.close();
        # consensus_stats.PlotConsensusStats(dataset_name, consensus_prefix, consensus_statistics, out_consensus_plot_png);

    if mode_code & MODE_CODE_CALC_ERROR_RATE:
        sys.stderr.write(
            "[analyzesam.py] Calculating error rates - individual indels...\n"
        )
        errorrates.ProcessFromFiles(
            reference_file, sam_file, out_accuracy_counts_path, False
        )
        sys.stderr.write("\n")
    if (mode_code & MODE_CODE_CALC_ERROR_RATE) or (
        mode_code & MODE_CODE_COLLECT_ERROR_RATE
    ):
        # Collecting stats from the error rate estimation.
        error_rates_return = errorrates.CollectAccuracy(
            sam_file, out_accuracy_counts_path, False
        )

        try:
            [
                summary_lines,
                summary_cigar,
                error_rate_hist,
                insertion_hist,
                deletion_hist,
                snp_hist,
                match_hist,
                cigar_dataset_name,
                out_png_path,
            ] = error_rates_return
            fp.write("[CIGAR statistics - individual indels]\n")
            fp.write(summary_lines + "\n")
            sys.stderr.write("[CIGAR statistics - individual indels]\n")
            sys.stderr.write(summary_lines + "\n")
            summary += "[CIGAR statistics - individual indels]\n"
            summary += summary_lines + "\n\n"

        except Exception as e:
            sys.stderr.write(str(e) + "\n")
            sys.stderr.write("Returned values: %s\n" % (str(error_rates_return)))

    if mode_code & MODE_CODE_CALC_ERROR_RATE_INDELS_AS_EVENTS:
        sys.stderr.write(
            "[analyzesam.py] Calculating error rates - indels as events...\n"
        )
        errorrates.ProcessFromFiles(
            reference_file, sam_file, out_accuracy_counts_indel_events_path, True
        )
        sys.stderr.write("\n")
    if (mode_code & MODE_CODE_CALC_ERROR_RATE_INDELS_AS_EVENTS) or (
        mode_code & MODE_CODE_COLLECT_ERROR_RATE_INDELS_AS_EVENTS
    ):
        # Collecting stats from the error rate estimation.
        error_rates_return = errorrates.CollectAccuracy(
            sam_file, out_accuracy_counts_indel_events_path, False
        )

        try:
            [
                summary_lines,
                summary_cigar,
                error_rate_hist,
                insertion_hist,
                deletion_hist,
                snp_hist,
                match_hist,
                cigar_dataset_name,
                out_png_path,
            ] = error_rates_return
            # [summary_cigar, error_rate_hist, insertion_hist, deletion_hist, snp_hist, match_hist, cigar_dataset_name, out_png_path] = errorrates.CollectAccuracy(sam_file, out_accuracy_counts_indel_events_path, False);
            fp.write("[CIGAR statistics - indels as events]\n")
            fp.write(summary_lines + "\n")
            sys.stderr.write("[CIGAR statistics - indels as events]\n")
            sys.stderr.write(summary_lines + "\n")
            summary += "[CIGAR statistics - indels as events]\n"
            summary += summary_lines + "\n\n"
        except Exception as e:
            sys.stderr.write(str(e) + "\n")
            sys.stderr.write("Returned values: %s\n" % (str(error_rates_return)))

    # if (mode_code & MODE_CODE_CALC_READ_COUNTS):
    # 	sys.stderr.write('[main_sam_analysis.py] Counting mapped reads...\n');
    # 	[num_alignments, num_mapped_alignments, num_unique_reads, num_mapped_reads, num_mapped_bases] = utility_sam.CountMappedReads(sam_file);
    # 	sys.stderr.write('\n');
    # 	summary +=

    num_alignments = 0
    num_mapped_alignments = 0
    num_unique_reads = 0
    num_mapped_reads = 0
    num_mapped_bases = 0
    if (mode_code & MODE_CODE_CALC_READ_COUNTS) or (
        mode_code & MODE_CODE_COLLECT_READ_COUNTS
    ):
        sys.stderr.write("[main_sam_analysis.py] Counting mapped reads...\n")
        [
            num_alignments,
            num_mapped_alignments,
            num_unique_reads,
            num_mapped_reads,
            num_mapped_bases,
        ] = utility_sam.CountMappedReads(sam_file)
        sys.stderr.write("\n")
        summary_read_count = ""
        summary_read_count += "num_alignments: %d\n" % num_alignments
        summary_read_count += "num_mapped_alignments: %d (%.2f%%)\n" % (
            num_mapped_alignments,
            (float(num_mapped_alignments) / float(num_alignments)) * 100.0,
        )
        summary_read_count += "num_unmapped_alignments: %d (%.2f%%)\n" % (
            (num_alignments - num_mapped_alignments),
            (float(num_alignments - num_mapped_alignments) / float(num_alignments))
            * 100.0,
        )
        summary_read_count += "num_mapped_reads: %d\n" % num_mapped_reads
        summary_read_count += "num_uniquely_mapped_reads: %d\n" % num_unique_reads
        summary_read_count += "num_mapped_bases: %d\n" % num_mapped_bases
        summary_read_count += "num_read_in_input_reads_file: %d\n" % (
            fastqinfo_num_seqs
        )
        summary_read_count += "num_bases_in_input_reads_file: %d\n" % (
            fastqinfo_total_seq_len
        )
        summary_read_count += "percent_mapped_reads: %.2f%%\n" % (
            (float(num_mapped_reads) / float(fastqinfo_num_seqs)) * 100.0
        )
        summary_read_count += "percent_mapped_bases: %.2f%%\n" % (
            (float(num_mapped_bases) / float(fastqinfo_total_seq_len)) * 100.0
        )

        summary += "[SAM file statistics]\n"
        summary += summary_read_count + "\n\n"

        ###############################
        # This version is more verbose, but is different from figure1.xlsx!
        # csv_line += '%d\t%d\t%f\t%f\t' % (num_mapped_reads, num_mapped_bases, (float(num_mapped_reads)/float(fastqinfo_num_seqs))*100.0, (float(num_mapped_bases)/float(fastqinfo_total_seq_len))*100.0);
        # csv_header += 'num_mapped_reads\tnum_mapped_bases\tpercent_mapped_reads\tpercent_mapped_bases\t';
        ###############################

        ###############################
        # This version is in the same order as figure1.xlsx!
        csv_line += "%d\t%f\t%d\t%f\t" % (
            num_mapped_bases,
            (float(num_mapped_bases) / float(fastqinfo_total_seq_len)) * 100.0,
            num_mapped_reads,
            (float(num_mapped_reads) / float(fastqinfo_num_seqs)) * 100.0,
        )
        csv_header += "Num mapped bases\tPercent mapped bases\tNum mapped reads\tPercent mapped reads\t"
        ###############################

        # Collecting stats from the SAM file.
        # [summary_read_count_readable, read_count_plot_lines] = count_mapped_reads.CollectSAMStats(out_count_mapped_reads_prefix, suppress_error_messages);
        sys.stderr.write("[SAM file statistics]\n")
        sys.stderr.write(summary_read_count + "\n")
        fp.write("[SAM file statistics]\n")
        fp.write(summary_read_count + "\n")

    # Get the CPU time and memory consumption from the process.
    [
        cmdline,
        realtime,
        cputime,
        usertime,
        systemtime,
        maxrss,
        rsscache,
        memtime_lines,
    ] = ParseMemTime(sam_file)
    reads_per_sec = (
        0.0 if (cputime == 0) else (float(num_unique_reads) / float(cputime))
    )
    bases_per_sec = (
        0.0 if (cputime == 0) else (float(num_mapped_bases) / float(cputime))
    )
    # Guard against a zero divisor: maxrss is what we divide by here.
    bases_per_mb = 0.0 if (maxrss == 0) else float(num_mapped_bases) / float(maxrss)

    summary_memtime = "\n".join(memtime_lines)
    summary += "[Memtime]\n" + summary_memtime + "\n\n"
    csv_line += "%f\t%f\t" % (cputime, maxrss)
    csv_line += "%f\t%f\t%f\t" % (reads_per_sec, bases_per_sec, bases_per_mb)
    csv_header += "CPU time [s]\tMemory [MB]\treads/sec\tbases/sec\tbases/MB\t"
    # Echo to both the screen (stderr) and the summary file.
    sys.stderr.write("[Memtime]\n")
    sys.stderr.write(summary_memtime + "\n\n")
    fp.write("[Memtime]\n")
    fp.write(summary_memtime + "\n\n")

    # sys.stderr.write('[SAM headers]\n');
    # sys.stderr.write(summary_file_sam_headers + '\n');
    # sys.stderr.write('[SAM file statistics]:\n');
    # sys.stderr.write(summary_read_count_readable + '\n');
    # sys.stderr.write('[CIGAR statistics]\n');
    # sys.stderr.write(summary_cigar + '\n');
    # sys.stderr.write('[Consensus statistics]\n');
    # sys.stderr.write(summary_consensus + '\n');
    # sys.stderr.write('\n');
    sys.stderr.write(summary)
    sys.stderr.write("[Done!]\n")
    sys.stderr.write("\n")

    fp.close()

    csv_header += "Coverage threshold"
    csv_line += "%d\t" % (consensus_coverage_threshold)

    csv_header = csv_header.strip()
    csv_line = csv_line.strip()

    if mode_code & MODE_CODE_HEADER:
        return "%s\n%s\n" % (csv_header, csv_line)

    return "%s\n" % (csv_line)