def main():
    """Report coverage gaps/dips from a samtools mpileup file.

    Builds a per-base depth vector for each reference molecule from the
    mpileup rows, then hands each finished molecule to print_spans() to
    report regions below --min_coverage_depth.
    """
    parser = argparse.ArgumentParser(
        description='Report coverage gaps/dips from a samtools mpileup file')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input mpileup file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output file to be created')
    parser.add_argument('-f', '--fasta_file', type=str, required=True,
                        help='Reference fasta file, against which reads were aligned. Needed for low 3-prime end coverage')
    parser.add_argument('-mcd', '--min_coverage_depth', type=int, required=True,
                        help='Min coverage depth, below which is reported')
    parser.add_argument('-mcs', '--min_coverage_span', type=int, required=False,
                        help='Coverage window size, the avg of which is calculated for depth cutoff')
    parser.add_argument('-eb', '--end_buffer', type=int, required=False, default=0,
                        help='If specified, gaps this far from either end of the molecule will not be reported.')
    args = parser.parse_args()

    # --min_coverage_span is accepted but intentionally not implemented yet
    if args.min_coverage_span is not None:
        raise Exception("ERROR: Sorry, --min_coverage_span not yet implemented.")

    out_fh = sys.stdout if args.output_file is None else open(args.output_file, 'wt')

    # per-molecule lengths are needed to detect low 3'-end coverage
    lengths = utils.fasta_sizes_from_file(args.fasta_file)
    stats = {'total_molecules': 0, 'total_bases': 0, 'depth_sum': 0}

    # In mpileup gaps are reported either with positions of coverage 0 OR omitted rows.
    depths = list()
    current_seq_id = None

    for line in open(args.input_file):
        contig, this_coord, base, depth = line.split("\t")[0:4]
        this_coord = int(this_coord)

        # new molecule: flush the previous one, then start a fresh depth vector
        if contig != current_seq_id:
            stats['total_molecules'] += 1

            if current_seq_id is not None:
                print_spans(current_seq_id, depths, args.min_coverage_depth,
                            out_fh, stats, lengths, args.end_buffer)

            depths = [0] * lengths[contig]
            current_seq_id = contig

        # BUG FIX: depth came straight from split() as a string; convert to
        # int so numeric comparisons/summing downstream behave correctly.
        depths[this_coord - 1] = int(depth)

    # flush the final molecule (guard: an empty input file previously
    # called print_spans() with current_seq_id=None)
    if current_seq_id is not None:
        print_spans(current_seq_id, depths, args.min_coverage_depth,
                    out_fh, stats, lengths, args.end_buffer)

    if out_fh is not sys.stdout:
        out_fh.close()

    print("INFO: Total molecules: {0}".format(stats['total_molecules']), file=sys.stderr)
    print("INFO: Total bases : {0}".format(stats['total_bases']), file=sys.stderr)
    # guard: avoid ZeroDivisionError on empty input
    if stats['total_bases']:
        print("INFO: Avg cov depth : {0}x".format(
            int(stats['depth_sum'] / stats['total_bases'])), file=sys.stderr)
def main():
    """Parse tabular BLAST output (-m 8/9) and report query coverage.

    Writes two tab-delimited files:
      <output_base>.cov.all.perc.txt     - percent of each query covered by ALL hits
      <output_base>.cov.longest.perc.txt - best percent covered by any single subject

    Assumes the BLAST file is grouped by query ID (standard BLAST ordering).
    """
    parser = argparse.ArgumentParser(
        description='Parse BLAST output and report coverage of queries')
    parser.add_argument('-f', '--fasta_file', type=str, required=True,
                        help='FASTA file of query.fasta (for lengths)')
    parser.add_argument('-b', '--blast_file', type=str, required=True,
                        help='BLAST output using -m 8 or -m 9 options')
    # BUG FIX: argparse %-formats help text, so a bare '%' crashed --help;
    # it must be escaped as '%%'.
    parser.add_argument('-slpc', '--subject_length_percentage_cutoff', type=int, required=False,
                        help='Ignore hits from transcripts of length > N%% relative to the query transcript')
    parser.add_argument('-sf', '--subject_fasta', type=str, required=False,
                        help='Only required if -slpc is passed. FASTA file of the sub')
    parser.add_argument('-o', '--output_base', type=str, required=True,
                        help='base name/path for output. Two files starting with this will be created')
    args = parser.parse_args()

    # -slpc needs subject lengths; fail early instead of a NameError later
    if args.subject_length_percentage_cutoff is not None and args.subject_fasta is None:
        parser.error('--subject_fasta is required when --subject_length_percentage_cutoff is passed')

    qsizes = utils.fasta_sizes_from_file(args.fasta_file)

    if args.subject_fasta is not None:
        ssizes = utils.fasta_sizes_from_file(args.subject_fasta)

    all_fh = open("{0}.cov.all.perc.txt".format(args.output_base), 'wt')
    longest_fh = open("{0}.cov.longest.perc.txt".format(args.output_base), 'wt')

    current_query_id = None
    covs_all = None
    match_segments = dict()

    def report_query(query_id):
        """Write the 'all' and best single-subject coverage rows for query_id."""
        # coverage by all subjects combined
        all_cov_perc = (qsizes[query_id] - covs_all.count(0)) / qsizes[query_id]
        all_fh.write("{0}\t{1:.1f}\n".format(query_id, all_cov_perc * 100))

        # best coverage by any single subject
        longest_cov_transcript_id = None
        longest_cov_perc = None

        for tid in match_segments:
            this_cov = [0] * qsizes[query_id]

            for seg in match_segments[tid]:
                for i in range(seg[0] - 1, seg[1]):
                    this_cov[i] += 1

            this_cov_perc = (len(this_cov) - this_cov.count(0)) / len(this_cov)

            if longest_cov_perc is None or this_cov_perc > longest_cov_perc:
                longest_cov_perc = this_cov_perc
                longest_cov_transcript_id = tid

        print("LOG: transcript {0} covers {1} (len:{3}) best at {2:.1f}%".format(
            longest_cov_transcript_id, query_id, longest_cov_perc * 100, qsizes[query_id]))
        # BUG FIX: the end-of-file report previously omitted the transcript-ID
        # column that every other row had; both paths now share this writer.
        longest_fh.write("{0}\t{1:.1f}\t{2}\n".format(
            query_id, longest_cov_perc * 100, longest_cov_transcript_id))

    for line in open(args.blast_file):
        if line.startswith('#'):
            continue

        cols = line.rstrip().split("\t")
        query_id = cols[0]
        subj_id = cols[1]
        qstart = int(cols[6])
        qend = int(cols[7])

        # optional filter: skip subjects much longer than the query
        if args.subject_length_percentage_cutoff is not None:
            perc_length_diff = (ssizes[subj_id] / qsizes[query_id]) * 100
            if perc_length_diff > args.subject_length_percentage_cutoff:
                continue

        if query_id != current_query_id:
            # report the previous query, then reset per-query state
            if current_query_id is not None:
                report_query(current_query_id)

            current_query_id = query_id
            covs_all = [0] * qsizes[query_id]
            match_segments = dict()

        # record this HSP against both coverage tallies
        match_segments.setdefault(subj_id, list()).append([qstart, qend])
        for i in range(qstart - 1, qend):
            covs_all[i] += 1

    # report the final query (guard: an empty or fully-filtered file
    # previously raised KeyError on qsizes[None])
    if current_query_id is not None:
        report_query(current_query_id)

    all_fh.close()
    longest_fh.close()
def main():
    """Parse tabular BLAST output (-m 8/9) and report query coverage.

    Writes two tab-delimited files:
      <output_base>.cov.all.perc.txt     - percent of each query covered by ALL hits
      <output_base>.cov.longest.perc.txt - best percent covered by any single subject

    Assumes the BLAST file is grouped by query ID (standard BLAST ordering).
    """
    parser = argparse.ArgumentParser(
        description='Parse BLAST output and report coverage of queries')
    parser.add_argument('-f', '--fasta_file', type=str, required=True,
                        help='FASTA file of query.fasta (for lengths)')
    parser.add_argument('-b', '--blast_file', type=str, required=True,
                        help='BLAST output using -m 8 or -m 9 options')
    # BUG FIX: argparse %-formats help text, so a bare '%' crashed --help;
    # it must be escaped as '%%'.
    parser.add_argument('-slpc', '--subject_length_percentage_cutoff', type=int, required=False,
                        help='Ignore hits from transcripts of length > N%% relative to the query transcript')
    parser.add_argument('-sf', '--subject_fasta', type=str, required=False,
                        help='Only required if -slpc is passed. FASTA file of the sub')
    parser.add_argument('-o', '--output_base', type=str, required=True,
                        help='base name/path for output. Two files starting with this will be created')
    args = parser.parse_args()

    # -slpc needs subject lengths; fail early instead of a NameError later
    if args.subject_length_percentage_cutoff is not None and args.subject_fasta is None:
        parser.error('--subject_fasta is required when --subject_length_percentage_cutoff is passed')

    qsizes = utils.fasta_sizes_from_file(args.fasta_file)

    if args.subject_fasta is not None:
        ssizes = utils.fasta_sizes_from_file(args.subject_fasta)

    all_fh = open("{0}.cov.all.perc.txt".format(args.output_base), 'wt')
    longest_fh = open("{0}.cov.longest.perc.txt".format(args.output_base), 'wt')

    current_query_id = None
    covs_all = None
    match_segments = dict()

    def report_query(query_id):
        """Write the 'all' and best single-subject coverage rows for query_id."""
        # coverage by all subjects combined
        all_cov_perc = (qsizes[query_id] - covs_all.count(0)) / qsizes[query_id]
        all_fh.write("{0}\t{1:.1f}\n".format(query_id, all_cov_perc * 100))

        # best coverage by any single subject
        longest_cov_transcript_id = None
        longest_cov_perc = None

        for tid in match_segments:
            this_cov = [0] * qsizes[query_id]

            for seg in match_segments[tid]:
                for i in range(seg[0] - 1, seg[1]):
                    this_cov[i] += 1

            this_cov_perc = (len(this_cov) - this_cov.count(0)) / len(this_cov)

            if longest_cov_perc is None or this_cov_perc > longest_cov_perc:
                longest_cov_perc = this_cov_perc
                longest_cov_transcript_id = tid

        print("LOG: transcript {0} covers {1} (len:{3}) best at {2:.1f}%".format(
            longest_cov_transcript_id, query_id, longest_cov_perc * 100, qsizes[query_id]))
        # BUG FIX: the end-of-file report previously omitted the transcript-ID
        # column that every other row had; both paths now share this writer.
        longest_fh.write("{0}\t{1:.1f}\t{2}\n".format(
            query_id, longest_cov_perc * 100, longest_cov_transcript_id))

    for line in open(args.blast_file):
        if line.startswith('#'):
            continue

        cols = line.rstrip().split("\t")
        query_id = cols[0]
        subj_id = cols[1]
        qstart = int(cols[6])
        qend = int(cols[7])

        # optional filter: skip subjects much longer than the query
        if args.subject_length_percentage_cutoff is not None:
            perc_length_diff = (ssizes[subj_id] / qsizes[query_id]) * 100
            if perc_length_diff > args.subject_length_percentage_cutoff:
                continue

        if query_id != current_query_id:
            # report the previous query, then reset per-query state
            if current_query_id is not None:
                report_query(current_query_id)

            current_query_id = query_id
            covs_all = [0] * qsizes[query_id]
            match_segments = dict()

        # record this HSP against both coverage tallies
        match_segments.setdefault(subj_id, list()).append([qstart, qend])
        for i in range(qstart - 1, qend):
            covs_all[i] += 1

    # report the final query (guard: an empty or fully-filtered file
    # previously raised KeyError on qsizes[None])
    if current_query_id is not None:
        report_query(current_query_id)

    all_fh.close()
    longest_fh.close()
def main():
    """Report coverage gaps/dips from a samtools mpileup file.

    Builds a per-base depth vector for each reference molecule from the
    mpileup rows, then hands each finished molecule to print_spans() to
    report regions below --min_coverage_depth.
    """
    parser = argparse.ArgumentParser(
        description='Report coverage gaps/dips from a samtools mpileup file')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input mpileup file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output file to be created')
    parser.add_argument('-f', '--fasta_file', type=str, required=True,
                        help='Reference fasta file, against which reads were aligned. Needed for low 3-prime end coverage')
    parser.add_argument('-mcd', '--min_coverage_depth', type=int, required=True,
                        help='Min coverage depth, below which is reported')
    parser.add_argument('-mcs', '--min_coverage_span', type=int, required=False,
                        help='Coverage window size, the avg of which is calculated for depth cutoff')
    parser.add_argument('-eb', '--end_buffer', type=int, required=False, default=0,
                        help='If specified, gaps this far from either end of the molecule will not be reported.')
    args = parser.parse_args()

    # --min_coverage_span is accepted but intentionally not implemented yet
    if args.min_coverage_span is not None:
        raise Exception("ERROR: Sorry, --min_coverage_span not yet implemented.")

    out_fh = sys.stdout if args.output_file is None else open(args.output_file, 'wt')

    # per-molecule lengths are needed to detect low 3'-end coverage
    lengths = utils.fasta_sizes_from_file(args.fasta_file)
    stats = {'total_molecules': 0, 'total_bases': 0, 'depth_sum': 0}

    # In mpileup gaps are reported either with positions of coverage 0 OR omitted rows.
    depths = list()
    current_seq_id = None

    for line in open(args.input_file):
        contig, this_coord, base, depth = line.split("\t")[0:4]
        this_coord = int(this_coord)

        # new molecule: flush the previous one, then start a fresh depth vector
        if contig != current_seq_id:
            stats['total_molecules'] += 1

            if current_seq_id is not None:
                print_spans(current_seq_id, depths, args.min_coverage_depth,
                            out_fh, stats, lengths, args.end_buffer)

            depths = [0] * lengths[contig]
            current_seq_id = contig

        # BUG FIX: depth came straight from split() as a string; convert to
        # int so numeric comparisons/summing downstream behave correctly.
        depths[this_coord - 1] = int(depth)

    # flush the final molecule (guard: an empty input file previously
    # called print_spans() with current_seq_id=None)
    if current_seq_id is not None:
        print_spans(current_seq_id, depths, args.min_coverage_depth,
                    out_fh, stats, lengths, args.end_buffer)

    if out_fh is not sys.stdout:
        out_fh.close()

    print("INFO: Total molecules: {0}".format(stats['total_molecules']), file=sys.stderr)
    print("INFO: Total bases : {0}".format(stats['total_bases']), file=sys.stderr)
    # guard: avoid ZeroDivisionError on empty input
    if stats['total_bases']:
        print("INFO: Avg cov depth : {0}x".format(
            int(stats['depth_sum'] / stats['total_bases'])), file=sys.stderr)