def iterator_psl_intervals(options):
    """iterate over a psl file and yield each entry together with
    overlapping intervals.

    Returns tuples of (match, list(query_intervals), list(target_intervals)).
    """

    if options.filename_filter_query:
        intervals_query = readIntervals(
            IOTools.openFile(options.filename_filter_query, "r"), options)
    else:
        intervals_query = None

    if options.filename_filter_target:
        intervals_target = readIntervals(
            IOTools.openFile(options.filename_filter_target, "r"), options)
    else:
        intervals_target = None

    iterator = Blat.BlatIterator(options.stdin)

    ninput = 0

    while 1:

        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if options.loglevel >= 1 and ninput % options.report_step == 0:
            options.stdlog.write("# progress: ninput=%i\n" % (ninput))
            options.stdlog.flush()

        qx, tx = None, None
        if intervals_query:
            try:
                qx = list(
                    intervals_query.get(match.mQueryId,
                                        match.mQueryFrom,
                                        match.mQueryTo))
            except KeyError:
                qx = []

        if intervals_target:
            try:
                tx = list(
                    intervals_target.get(match.mSbjctId,
                                         match.mSbjctFrom,
                                         match.mSbjctTo))
            except KeyError:
                tx = []

        if options.loglevel >= 2:
            options.stdlog.write(
                "###################################################\n")
            options.stdlog.write("# testing match %s\n" % (str(match)))
            options.stdlog.write(
                "###################################################\n")

        yield match, qx, tx
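
# A minimal consumer sketch (an illustration added here, not part of the
# original script; the helper name is hypothetical). It shows the contract
# of iterator_psl_intervals(): each iteration yields the match together
# with the lists of overlapping query/target intervals, where a list is
# None if no filter file was supplied and empty if the contig is unknown.
def _count_filtered_matches(options):
    """return (ntotal, noverlapping) for the psl stream on options.stdin."""
    ntotal, noverlapping = 0, 0
    for match, query_intervals, target_intervals in \
            iterator_psl_intervals(options):
        ntotal += 1
        if query_intervals or target_intervals:
            noverlapping += 1
    return ntotal, noverlapping
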
def pslAddSequence(query_fasta, sbjct_fasta, options):

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    while 1:

        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        new = Blat.MatchPSLX()
        new.fromPSL(match,
                    query_fasta.getSequence(
                        match.mQueryId, "+", match.mQueryFrom,
                        match.mQueryTo),
                    sbjct_fasta.getSequence(
                        match.mSbjctId, "+", match.mSbjctFrom,
                        match.mSbjctTo))

        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
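
# Usage sketch (an illustration, not part of the original script):
# pslAddSequence() reads plain PSL from stdin and writes PSLX, i.e. each
# alignment with its aligned query and sbjct sequences attached. The
# helper name and the two filename parameters are hypothetical.
def _psl_to_pslx(query_filename, sbjct_filename, options):
    """attach sequences from two indexed fasta files to psl input."""
    query_fasta = IndexedFasta.IndexedFasta(query_filename)
    sbjct_fasta = IndexedFasta.IndexedFasta(sbjct_filename)
    pslAddSequence(query_fasta, sbjct_fasta, options)
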
def pslComplement(query_fasta, target_fasta, options):
    """complement psl entries."""

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    border = options.complement_border
    min_length = options.complement_min_length

    while 1:

        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        if match.mNBlocks <= 1:
            nskipped += 1
            continue

        pairs = []
        for qstart, tstart, size in match.getBlocks():

            qend = qstart + size - border
            qstart += border
            if qend - qstart < min_length:
                continue

            tend = tstart + size - border
            tstart += border
            if tend - tstart < min_length:
                continue

            query_sequence = query_fasta.getSequence(
                match.mQueryId, match.strand, qstart, qend)
            sbjct_sequence = target_fasta.getSequence(
                match.mSbjctId, "+", tstart, tend)
            pairs.append(((qstart, qend, query_sequence),
                          (tstart, tend, sbjct_sequence)))

        if not pairs:
            ndiscarded += 1
            continue

        # placeholder (assumption): echo the input match; constructing the
        # complemented entry from ``pairs`` is not implemented here.
        new = match
        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
def pslComplementQuery(options):
    """complement psl entries.

    Fill the regions from a second psl file.
    """

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    border = options.complement_border
    min_length = options.complement_min_length

    while 1:

        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        if match.mNBlocks <= 1:
            nskipped += 1
            continue

        pairs = []
        for qstart, tstart, size in match.getBlocks():

            qend = qstart + size - border
            qstart += border
            if qend - qstart < min_length:
                continue

            tend = tstart + size - border
            tstart += border
            if tend - tstart < min_length:
                continue

            pairs.append(((qstart, qend), (tstart, tend)))

        if not pairs:
            ndiscarded += 1
            continue

        # placeholder (assumption): echo the input match; constructing the
        # complemented entry from ``pairs`` is not implemented here.
        new = match
        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
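
# The border-trimming arithmetic shared by pslComplement() and
# pslComplementQuery(), as a stand-alone helper (illustration; the helper
# name is hypothetical): each block is shrunk by ``border`` on both sides
# and dropped if the remainder is shorter than ``min_length``.
def _trim_block(start, size, border, min_length):
    """return (start, end) of the trimmed block, or None if too short.

    >>> _trim_block(100, 50, 10, 20)
    (110, 140)
    >>> _trim_block(100, 30, 10, 20)
    """
    end = start + size - border
    start += border
    if end - start < min_length:
        return None
    return start, end
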
def chunk_iterator_psl_overlap(infile, args, prefix, use_header=False):
    """iterate over overlapping entries in a psl file."""

    iterator = Blat.BlatIterator(infile)

    processed_contigs = set()

    merge_distance = args[0]
    last_sbjct_id = None
    sbjct_end = 0
    outfile = None
    filename = None
    nchunks = 0

    while 1:

        match = next(iterator)

        if match is None:
            break

        if match.mSbjctId != last_sbjct_id or \
                match.mSbjctFrom >= (sbjct_end + merge_distance):
            if last_sbjct_id:
                outfile.close()
                yield filename

            if last_sbjct_id != match.mSbjctId and \
                    match.mSbjctId in processed_contigs:
                raise ValueError("input not sorted correctly (contig,start): "
                                 "already encountered %s\n%s" %
                                 (match.mSbjctId, str(match)))

            last_sbjct_id = match.mSbjctId
            processed_contigs.add(last_sbjct_id)
            sbjct_start = match.mSbjctFrom
            sbjct_end = match.mSbjctTo

            # assumption: chunk files are named from the supplied prefix,
            # following the scheme of the other chunk iterators.
            filename = "%s%010i" % (prefix, nchunks)
            outfile = IOTools.openFile(filename, "w")
            nchunks += 1

        if match.mSbjctFrom < sbjct_start:
            raise ValueError("input not sorted correctly (contig,start): "
                             "%i < %i\n%s" %
                             (match.mSbjctFrom, sbjct_start, str(match)))

        sbjct_end = max(match.mSbjctTo, sbjct_end)
        outfile.write(str(match) + "\n")

    if outfile:
        outfile.close()
        yield filename
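
# Consumption sketch (an assumption for illustration): like the other
# chunk iterators in this code base, chunk_iterator_psl_overlap() yields
# the filenames of temporary chunk files holding runs of overlapping
# alignments; args[0] is the merge distance and the prefix below is a
# hypothetical example value.
def _split_psl(infile, merge_distance=100):
    """return the list of chunk filenames for a sorted psl stream."""
    return list(chunk_iterator_psl_overlap(
        infile, args=(merge_distance,), prefix="chunk_"))
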
def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle.py 2834 2009-11-24 16:11:23Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-b", "--output-filename-pattern",
                      dest="output_filename", type="string",
                      help="filename for output [default=%default]")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig"),
                      help="output format [default=%default]")

    parser.set_defaults(genome_file=None,
                        typecode=numpy.int16,
                        output_filename=None,
                        output_format="wiggle",
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    typecode = options.typecode
    counts = {}

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes(with_synonyms=False)
        E.info("allocating memory for %i contigs and %i bytes" %
               (len(contig_sizes),
                sum(contig_sizes.values()) * typecode().itemsize))
        for contig, size in contig_sizes.items():
            E.debug("allocating %s: %i bases" % (contig, size))
            counts[contig] = numpy.zeros(size, typecode)

        E.info("allocated memory for %i contigs" % len(contig_sizes))
    else:
        fasta = None
        contig_sizes = {}

    if options.output_format in ("bigwig", "bigbed"):

        if not options.genome_file:
            raise ValueError(
                "please supply a genome file for bigwig/bigbed output.")
        if not options.output_filename:
            raise ValueError(
                "please supply an output filename for bigwig/bigbed output.")

        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        elif options.output_format == "bigbed":
            executable_name = "bedToBigBed"
        else:
            raise ValueError("unknown output format `%s`" %
                             options.output_format)

        executable = IOTools.which(executable_name)

        if not executable:
            raise OSError("could not find %s in path." % executable_name)
        tmpdir = tempfile.mkdtemp()
        E.debug("temporary files are in %s" % tmpdir)

        tmpfile_wig = os.path.join(tmpdir, "wig")
        tmpfile_sizes = os.path.join(tmpdir, "sizes")

        # write contig sizes
        outfile_size = open(tmpfile_sizes, "w")
        for contig, size in contig_sizes.items():
            outfile_size.write("%s\t%s\n" % (contig, size))
        outfile_size.close()

        outfile = open(tmpfile_wig, "w")

    else:
        outfile = options.stdout

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, ncontigs, nskipped = 0, 0, 0

    E.info("started counting")

    while 1:

        if options.test and ninput >= options.test:
            break

        match = next(iterator)

        if match is None:
            break

        ninput += 1

        contig = match.mSbjctId

        for start, length in zip(match.mSbjctBlockStarts,
                                 match.mBlockSizes):
            counts[contig][start:start + length] += 1

    E.info("finished counting")

    if options.output_format in ("wiggle", "bigwig"):
        E.info("starting wiggle output")

        for contig, vals in counts.items():

            E.debug("output for %s" % contig)
            for val, group in itertools.groupby(enumerate(vals),
                                                lambda x: x[1]):
                group = list(group)
                start, end = group[0][0], group[-1][0]
                val = vals[start]
                if val > 0:
                    outfile.write("variableStep chrom=%s span=%i\n" %
                                  (contig, end - start + 1))
                    outfile.write("%i\t%i\n" % (start, val))

            ncontigs += 1

    elif options.output_format in ("bedgraph", "bigbed"):

        E.info("starting bedgraph output")

        for contig, vals in counts.items():
            E.debug("output for %s" % contig)
            for val, group in itertools.groupby(enumerate(vals),
                                                lambda x: x[1]):
                group = list(group)
                start, end = group[0][0], group[-1][0]
                val = vals[start]
                if val > 0:
                    outfile.write("%s\t%i\t%i\t%i\n" %
                                  (contig, start, end + 1, val))

            ncontigs += 1

    E.info("finished output")

    if options.output_format in ("bigwig", "bigbed"):
        outfile.close()

        E.info("starting %s conversion" % options.output_format)
        try:
            retcode = subprocess.call(
                " ".join((executable,
                          tmpfile_wig,
                          tmpfile_sizes,
                          os.path.abspath(options.output_filename))),
                shell=True)
            if retcode < 0:
                E.warn("%s terminated with signal: %i" %
                       (executable_name, -retcode))
                return -retcode
        except OSError as msg:
            E.warn("error while executing %s: %s" % (executable_name, msg))
            return 1

        shutil.rmtree(tmpdir)

        E.info("finished %s conversion" % options.output_format)
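
# The run-length grouping used for the wiggle/bedgraph output above,
# isolated as a helper (illustration only; not called by main()):
# consecutive equal coverage values collapse into (start, end, value) runs.
import itertools


def _runs(vals):
    """yield (start, end, value) for runs of equal values; end is inclusive.

    >>> list(_runs([0, 2, 2, 1]))
    [(0, 0, 0), (1, 2, 2), (3, 3, 1)]
    """
    for value, group in itertools.groupby(enumerate(vals), lambda x: x[1]):
        group = list(group)
        yield group[0][0], group[-1][0], value
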
def pslMerge(options):
    """merge psl alignments."""

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    last_query = None
    last_target = None
    last_strand = None

    def process(matches):

        new = matches[0].copy()

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        graph = networkx.DiGraph()
        graph.add_nodes_from(range(len(matches) + 2))

        matches.sort(key=lambda x: x.mQueryFrom)

        if Genomics.IsPositiveStrand(matches[0].strand):
            f = lambda x, y: x.mSbjctTo < y.mSbjctFrom
        else:
            f = lambda x, y: x.mSbjctFrom > y.mSbjctTo

        for x in range(0, len(matches)):

            xx = matches[x]
            if options.loglevel >= 6:
                options.stdlog.write("# graph: %2i %s\n" % (x, str(xx)))

            for y in range(x + 1, len(matches)):

                yy = matches[y]
                d = min(xx.mQueryTo, yy.mQueryTo) - \
                    max(xx.mQueryFrom, yy.mQueryFrom)
                if d > 0 or not f(xx, yy):
                    continue
                else:
                    graph.add_edge(x, y, weight=-d)

        source = len(matches)
        target = len(matches) + 1
        for x in range(len(matches)):
            xx = matches[x]
            graph.add_edge(source, x, weight=xx.mQueryFrom)
            graph.add_edge(x, target, weight=xx.mQueryLength - xx.mQueryTo)

        if options.loglevel >= 6:
            networkx.write_edgelist(graph, options.stdlog)

        path = networkx.dijkstra_path(graph, source, target)

        if options.loglevel >= 6:
            options.stdlog.write("# path: %s\n" % (str(path)))

        new_matches = [matches[x] for x in path[1:-1]]

        if len(matches) != len(new_matches):
            E.warn(("query=%s, target=%s, strand=%s: "
                    "removed overlapping/out-of-order segments: "
                    "before=%i, after=%i") %
                   (matches[0].mQueryId,
                    matches[0].mSbjctId,
                    matches[0].strand,
                    len(matches),
                    len(new_matches)))

        matches = new_matches

        for match in matches:
            m = match.getMapQuery2Target()
            alignlib_lite.py_addAlignment2Alignment(map_query2target, m)

        new.fromMap(map_query2target, use_strand=True)

        options.stdout.write(str(new) + "\n")
        options.stdout.flush()

        return 1

    while 1:

        match = next(iterator)

        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if options.loglevel >= 10:
            options.stdlog.write("# input: %s\n" % (str(match)))

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        if match.mQueryId != last_query or \
                match.strand != last_strand or \
                match.mSbjctId != last_target:
            if last_query:
                noutput += process(matches)
            matches = []
            last_query, last_target, last_strand = (
                match.mQueryId, match.mSbjctId, match.strand)

        matches.append(match)

    if last_query:
        noutput += process(matches)

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
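
# The chaining step in process() above casts 1-D segment selection as a
# shortest-path problem: nodes are matches, edges connect query-compatible
# pairs, and source/sink edges charge the uncovered residues before the
# first and after the last match. A self-contained toy version (an
# illustration with a hypothetical name; intervals are (qfrom, qto) pairs
# and strand/target colinearity is omitted):
import networkx


def _chain_intervals(intervals, query_length):
    """return non-overlapping intervals chosen to minimize uncovered
    query residues, via a shortest source-to-sink path.

    >>> _chain_intervals([(0, 10), (5, 20), (12, 30)], 30)
    [(0, 10), (12, 30)]
    """
    if not intervals:
        return []
    graph = networkx.DiGraph()
    intervals = sorted(intervals)
    source, target = len(intervals), len(intervals) + 1
    for x, (xfrom, xto) in enumerate(intervals):
        # entering at x costs the residues before it,
        # leaving costs the residues after it
        graph.add_edge(source, x, weight=xfrom)
        graph.add_edge(x, target, weight=query_length - xto)
        for y in range(x + 1, len(intervals)):
            yfrom, yto = intervals[y]
            gap = yfrom - xto
            if gap >= 0:
                # compatible pair: cost is the uncovered gap between them
                graph.add_edge(x, y, weight=gap)
    path = networkx.dijkstra_path(graph, source, target)
    return [intervals[x] for x in path[1:-1]]
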
def main():

    parser = E.OptionParser(
        version="%prog version: $Id: psl2gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf.")

    parser.add_option(
        "-s", "--filename-strand", dest="filename_strand", type="string",
        help="set strand information according to file [default=%default].")

    parser.set_defaults(as_gtf=False,
                        filename_strand=None,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    ####################################
    if options.filename_strand:
        map_id2strand = IOTools.readMap(open(options.filename_strand, "r"))
    else:
        map_id2strand = {}

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    # the same entry type serves for both gff and gtf output
    gff = GTF.Entry()
    gff.source = "psl"
    gff.feature = "exon"

    ids = {}

    while 1:

        if options.test and ninput >= options.test:
            break

        match = next(iterator)

        if match is None:
            break

        ninput += 1

        if match.mQueryId not in ids:
            ids[match.mQueryId] = 1
            id = match.mQueryId
        else:
            id = match.mQueryId + ":%i" % ids[match.mQueryId]
            ids[match.mQueryId] += 1

        if options.as_gtf:
            gff.contig = match.mSbjctId
            gff.gene_id = id
            gff.transcript_id = id
        else:
            gff.contig = match.mSbjctId
            gff.clearAttributes()
            gff.addAttribute("gene_id", id)

        if id in map_id2strand:
            gff.strand = map_id2strand[id]
        else:
            gff.strand = match.strand

        for qstart, sstart, size in match.getBlocks():
            gff.start = sstart
            gff.end = sstart + size
            options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    E.Stop()
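
# The id disambiguation used in the loop above, isolated (illustration;
# the helper name is hypothetical): the first occurrence of a query keeps
# its name, later occurrences are suffixed with ":<n>".
def _unique_id(seen, query_id):
    """return a unique id for query_id, updating ``seen`` in place.

    >>> seen = {}
    >>> [_unique_id(seen, "q") for _ in range(3)]
    ['q', 'q:1', 'q:2']
    """
    if query_id not in seen:
        seen[query_id] = 1
        return query_id
    new_id = "%s:%i" % (query_id, seen[query_id])
    seen[query_id] += 1
    return new_id
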
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2map.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--queries-tsv-file", dest="input_filename_queries",
                      type="string",
                      help="fasta filename with queries - required for "
                      "polyA analysis [%default].")

    parser.add_option("--polyA", dest="polyA", action="store_true",
                      help="detect polyA tails [%default].")

    parser.add_option("-p", "--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="output filename with histogram information "
                      "on aggregate coverages [%default].")

    parser.add_option("--output-filename-empty",
                      dest="output_filename_empty", type="string",
                      help="output filename with queries for which all "
                      "matches have been discarded [%default].")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("map", "psl"),
                      help="output format to choose [%default].")

    parser.add_option("-z", "--from-zipped", dest="from_zipped",
                      action="store_true",
                      help="input is zipped.")

    parser.add_option("--threshold-min-pid", dest="threshold_min_pid",
                      type="float",
                      help="minimum thresholds for pid [%default].")

    parser.add_option("--threshold-min-matches",
                      dest="threshold_min_matches", type="int",
                      help="minimum threshold for number of matching "
                      "residues [%default].")

    parser.add_option("--threshold-max-error-rate",
                      dest="threshold_max_error_rate", type="float",
                      help="maximum threshold for error of aligned part "
                      "[%default].")

    parser.add_option("--threshold-good-query-coverage",
                      dest="threshold_good_query_coverage", type="float",
                      help="minimum query coverage for segments to be "
                      "counted as good [%default].")

    parser.add_option("--threshold-min-query-coverage",
                      dest="threshold_min_query_coverage", type="float",
                      help="minimum query coverage for segments to be "
                      "accepted [%default].")

    parser.add_option("--threshold-max-query-gapchars",
                      dest="threshold_max_query_gapchars", type="int",
                      help="maximum number of gap characters in query "
                      "[%default].")

    parser.add_option("--threshold-max-query-gaps",
                      dest="threshold_max_query_gaps", type="int",
                      help="maximum number of gaps in query [%default].")

    parser.add_option("--threshold-max-sbjct-gapchars",
                      dest="threshold_max_sbjct_gapchars", type="int",
                      help="maximum number of gap characters in sbjct "
                      "[%default].")

    parser.add_option("--keep-unique-matches", dest="keep_unique_matches",
                      action="store_true",
                      help="ignore filters for unique matches [%default].")

    parser.add_option("--keep-all-best", dest="keep_all_best",
                      action="store_true",
                      help="when sorting matches, keep all matches within "
                      "the collection threshold [%default].")

    parser.add_option("--output-best-per-subject", dest="best_per_sbjct",
                      action="store_true",
                      help="keep only the best entry per sbjct (for "
                      "transcript mapping) [%default].")

    parser.add_option("--threshold-max-sbjct-gaps",
                      dest="threshold_max_sbjct_gaps", type="int",
                      help="maximum number of gaps in sbjct [%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing "
                      "[%default].")

    parser.add_option("-m", "--matching-mode", dest="matching_mode",
                      type="choice",
                      choices=("best-coverage",
                               "best-query-coverage",
                               "best-sbjct-coverage",
                               "best-pid",
                               "best-covpid",
                               "best-query-covpid",
                               "best-sbjct-covpid",
                               "best-min-covpid",
                               "best-query-min-covpid",
                               "best-sbjct-min-covpid",
                               "unique",
                               "all"),
                      help="determines how to select the best match "
                      "[%default].")
parser.add_option("--subjctfilter-tsv-file", dest="filename_filter_sbjct", type="string", help="gff file for filtering sbjct matches. Matches overlapping these regions are discarded, but see --keep-forbidden [%default].") parser.add_option("--keep-forbidden", dest="keep_forbidden", action="store_true", help="if set, keep only matches that overlap the regions supplied with --subjctfilter-tsv-file [%default].") parser.add_option("--query-forward-coordinates", dest="query_forward_coordinates", action="store_true", help="use forward coordinates for query, strand will refer to sbjct [%default].") parser.add_option("--ignore-all-random", dest="ignore_all_random", action="store_true", help="if there are multiple best matches, ignore all those to chrUn and _random [%default].") parser.add_option("--collection-threshold", dest="collection_threshold", type="float", help="threshold for collecting matches, percent of best score [%default].") parser.add_option("--collection-distance", dest="collection_distance", type="float", help="threshold for collecting matches, difference to best score [%default].") parser.set_defaults(input_filename_domains=None, input_filename_queries=None, threshold_good_query_coverage=90.0, threshold_min_pid=30.0, threshold_min_matches=0, threshold_max_error_rate=None, output_filename_pattern="%s", keep_unique_matches=False, output_format="map", print_matched=["full", "partial", "good"], from_zipped=False, combine_overlaps=True, min_length_domain=30, threshold_min_query_coverage=50, min_length_singletons=30, new_family_id=10000000, add_singletons=False, matching_mode="best-coverage", best_per_sbjct=False, threshold_max_query_gapchars=None, threshold_max_query_gaps=None, threshold_max_sbjct_gapchars=None, threshold_max_sbjct_gaps=None, filename_filter_sbjct=None, keep_forbidden=False, keep_all_best=False, test=None, query_forward_coordinates=False, output_filename_empty=None, collection_threshold=1.0, collection_distance=0, polyA=False, # max residues missing from non polyA end polyA_max_unaligned=3, # min residues in tail polyA_min_unaligned=10, # min percent residues that are A/T in tail polyA_min_percent=70.0, # ignore duplicate matches if they are on Un or # _random ignore_all_random=False, ) (options, args) = E.Start(parser, add_pipe_options=True) if len(args) == 1: if options.from_zipped or args[0][-3:] == ".gz": import gzip infile = gzip.open(args[0], "r") else: infile = IOTools.openFile(args[0], "r") else: infile = sys.stdin if options.input_filename_queries: queries_fasta = IndexedFasta.IndexedFasta( options.input_filename_queries) else: queries_fasta = None if options.filename_filter_sbjct: try: import bx.intervals.intersection except ImportError: raise ValueError("filtering for intervals requires the bx tools") intervals = GTF.readGFFFromFileAsIntervals( IOTools.openFile(options.filename_filter_sbjct, "r")) intersectors = {} for contig, values in list(intervals.items()): intersector = bx.intervals.intersection.Intersecter() for start, end in values: intersector.add_interval(bx.intervals.Interval(start, end)) intersectors[contig] = intersector if options.loglevel >= 1: options.stdlog.write("# read %i intervals for %i contigs.\n" % (sum([len(x) for x in list(intervals.values())]), len(intersectors))) else: intersectors = None ################################################ ################################################ ################################################ # processing of a chunk (matches of same query) ################################################ 
    ninput, noutput, nskipped = 0, 0, 0

    # number of sequences with full/partial/good matches
    nfull_matches, npartial_matches, ngood_matches = 0, 0, 0
    # number of sequences which are fully/good/partially matched
    # i.e., after combining all aligned regions
    nfully_matched, npartially_matched, nwell_matched = 0, 0, 0

    nremoved_pid, nremoved_query_coverage, nempty = 0, 0, 0
    nremoved_gaps, nremoved_nmatches = 0, 0
    nremoved_regions = 0
    nqueries_removed_region = 0

    aggregate_coverages = []
    mapped_coverages = []
    fully_matched = []
    well_matched = []
    partially_matched = []
    new_family_id = options.new_family_id

    if options.output_filename_empty:
        outfile_empty = IOTools.openFile(options.output_filename_empty, "w")
        outfile_empty.write("read_id\tcomment\n")
    else:
        outfile_empty = None

    if options.polyA:
        options.outfile_polyA = IOTools.openFile(
            options.output_filename_pattern % "polyA", "w")
        options.outfile_polyA.write("query_id\tstart\tend\tpA+N\tpT+N\ttail\n")

    def processChunk(query_id, matches):
        """process a set of matches from query_id"""

        nonlocal ninput, noutput, nskipped
        nonlocal nfull_matches, npartial_matches, ngood_matches
        nonlocal nremoved_pid, nremoved_query_coverage, nempty, \
            nremoved_gaps, nremoved_nmatches
        nonlocal nremoved_regions, nqueries_removed_region

        ninput += 1

        full_matches = []
        good_matches = []
        partial_matches = []

        x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, \
            x_nremoved_nmatches = 0, 0, 0, 0
        nmatches = len(matches)

        new_matches = []

        # absolute filters applicable to non-fragmentory matches
        for match in matches:

            if match.mPid < options.threshold_min_pid:
                nremoved_pid += 1
                continue

            if match.mNMatches < options.threshold_min_matches:
                nremoved_nmatches += 1
                continue

            if options.threshold_max_error_rate:
                r = 100.0 * \
                    math.pow(options.threshold_max_error_rate,
                             match.mNMatches + match.mNMismatches)
                if match.mPid < r:
                    nremoved_pid += 1
                    x_nremoved_pid += 1
                    continue

            new_matches.append(match)

        matches = new_matches

        # filter matches
        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write(
                    "%s\tall matches removed after applying thresholds: "
                    "before=%i, npid=%i, nqcoverage=%i, ngaps=%i, "
                    "nmatches=%i\n" %
                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage,
                     x_nremoved_gaps, x_nremoved_nmatches))
            nskipped += 1
            return

        if options.keep_unique_matches and len(matches) == 1:
            pass
        else:
            new_matches = []

            for match in matches:

                if match.mQueryCoverage < \
                        options.threshold_min_query_coverage:
                    nremoved_query_coverage += 1
                    x_nquery_coverage += 1
                    continue

                if options.threshold_max_query_gaps and \
                        match.mQueryNGapsCounts > \
                        options.threshold_max_query_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_query_gapchars and \
                        match.mQueryNGapsBases > \
                        options.threshold_max_query_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gaps and \
                        match.mSbjctNGapsCounts > \
                        options.threshold_max_sbjct_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gapchars and \
                        match.mSbjctNGapsBases > \
                        options.threshold_max_sbjct_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                new_matches.append(match)

            matches = new_matches

        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write(
                    "%s\tall matches removed after applying thresholds: "
                    "before=%i, npid=%i, nqcoverage=%i, ngaps=%i, "
                    "nmatches=%i\n" %
                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage,
                     x_nremoved_gaps, x_nremoved_nmatches))
            nskipped += 1
            return
        # Remove queries matching to a forbidden region. This section
        # will remove the full query if any of its matches overlaps a
        # forbidden region.
        keep = True
        for match in matches:
            if intersectors and match.mSbjctId in intersectors:
                found = intersectors[match.mSbjctId].find(
                    match.mSbjctFrom, match.mSbjctTo)
                if (found and not options.keep_forbidden) or \
                        (not found and options.keep_forbidden):
                    nremoved_regions += 1
                    keep = False
                    continue

        if not keep:
            nqueries_removed_region += 1
            if outfile_empty:
                outfile_empty.write(
                    "%s\toverlap with forbidden region\n" % query_id)
            return

        # check for full length matches
        for match in matches:
            if match.mQueryCoverage >= 99.9:
                full_matches.append(match)
            if match.mQueryCoverage > options.threshold_good_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)

        if full_matches:
            nfull_matches += 1
        elif good_matches:
            ngood_matches += 1
        elif partial_matches:
            npartial_matches += 1

        # compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append((match.mQueryFrom, match.mQueryTo))

        rest = Intervals.complement(intervals, 0, match.mQueryLength)

        query_coverage = 100.0 * \
            (match.mQueryLength -
             sum([x[1] - x[0] for x in rest])) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append(query_id)
        elif query_coverage > options.threshold_good_query_coverage:
            well_matched.append(query_id)
        else:
            partially_matched.append(query_id)

        aggregate_coverages.append(query_coverage)

        # select matches to output
        matches, msg = selectMatches(query_id, matches, options,
                                     queries_fasta)

        if len(matches) > 0:
            for match in matches:
                if options.query_forward_coordinates:
                    match.convertCoordinates()

                if options.output_format == "map":
                    options.stdout.write("%s\n" % "\t".join(map(str, (
                        match.mQueryId, match.mSbjctId,
                        match.strand,
                        "%5.2f" % match.mQueryCoverage,
                        "%5.2f" % match.mSbjctCoverage,
                        "%5.2f" % match.mPid,
                        match.mQueryLength,
                        match.mSbjctLength,
                        match.mQueryFrom, match.mQueryTo,
                        match.mSbjctFrom, match.mSbjctTo,
                        ",".join(map(str, match.mBlockSizes)),
                        ",".join(map(str, match.mQueryBlockStarts)),
                        ",".join(map(str, match.mSbjctBlockStarts)),
                    ))))
                elif options.output_format == "psl":
                    options.stdout.write(str(match) + "\n")

            noutput += 1
        else:
            if outfile_empty:
                outfile_empty.write(
                    "%s\tno matches selected: %s\n" % (query_id, msg))
            nempty += 1

    if options.output_format == "map":
        options.stdout.write("\t".join(("query_id",
                                        "sbjct_id",
                                        "sstrand",
                                        "qcoverage",
                                        "scoverage",
                                        "pid",
                                        "qlen",
                                        "slen",
                                        "qfrom", "qto",
                                        "sfrom", "sto",
                                        "blocks",
                                        "qstarts",
                                        "sstarts")) + "\n")
    elif options.output_format == "psl":
        options.stdout.write(Blat.Match().getHeader() + "\n")

    ################################################
    ################################################
    ################################################
    # main loop
    ################################################
    nfully_covered = None
    matches = []
    last_query_id = None
    is_complete = True
    ninput_lines = 0

    skip = 0

    iterator = Blat.BlatIterator(infile)

    while 1:

        try:
            match = next(iterator)
        except Blat.ParsingError:
            iterator = Blat.BlatIterator(infile)
            continue

        if match is None:
            break

        ninput_lines += 1

        if options.test and ninput_lines > options.test:
            break

        if match.mQueryId != last_query_id:
            if last_query_id:
                processChunk(last_query_id, matches)
            matches = []
            last_query_id = match.mQueryId

        matches.append(match)

    processChunk(last_query_id, matches)

    printHistogram(aggregate_coverages, "aggregate", options)
    printHistogram(mapped_coverages, "mapped", options)

    if "full" in options.print_matched:
        printMatched(fully_matched, "full", options)

    if "good" in options.print_matched:
        printMatched(well_matched, "good", options)

    if "partial" in options.print_matched:
        printMatched(partially_matched, "partial", options)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# alignments: ninput=%i, is_complete=%s\n" %
            (ninput_lines, str(is_complete)))
        options.stdlog.write(
            "# queries: ninput=%i, noutput=%i\n" % (ninput, noutput))
        options.stdlog.write(
            "# individual coverage: full=%i, good=%i, partial=%i\n" %
            (nfull_matches, ngood_matches, npartial_matches))
        options.stdlog.write(
            "# aggregate coverage: full=%i, good=%i, partial=%i\n" %
            (len(fully_matched), len(well_matched), len(partially_matched)))
        options.stdlog.write(
            "# omitted queries: total=%i, thresholds=%i, regions=%i, "
            "selection=%i\n" %
            (nskipped + nqueries_removed_region + nempty,
             nskipped, nqueries_removed_region, nempty))
        options.stdlog.write(
            "# omitted matches: pid=%i, query_coverage=%i, gaps=%i, "
            "regions=%i, nmatches=%i\n" %
            (nremoved_pid, nremoved_query_coverage, nremoved_gaps,
             nremoved_regions, nremoved_nmatches))

    E.Stop()
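
# The aggregate-coverage computation in processChunk(), reduced to a pure
# function (an illustration with a hypothetical name; a plain boolean mask
# stands in for Intervals.complement(), giving the same result for
# intervals within [0, length)).
def _query_coverage(intervals, length):
    """percent of [0, length) covered by half-open (start, end) intervals.

    >>> _query_coverage([(0, 50), (25, 75)], 100)
    75.0
    """
    covered = [False] * length
    for start, end in intervals:
        for x in range(max(0, start), min(end, length)):
            covered[x] = True
    return 100.0 * sum(covered) / length
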
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle_stats.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--wiggle-files", dest="wiggle_files", type="string",
                      help="glob expression for wiggle files [%default].")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix to add to contig names before lookup "
                      "[%default].")

    parser.add_option("-z", "--from-zipped", dest="from_zipped",
                      action="store_true",
                      help="input is zipped.")

    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing "
                      "[%default].")

    parser.add_option("--with-values", dest="with_values",
                      action="store_true",
                      help="output values in last column [%default].")

    parser.set_defaults(wiggle_files="*.data.bz2",
                        from_zipped=False,
                        prefix="",
                        with_values=False,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    # open indexed access to wiggles
    wiggle_files = glob.glob(options.wiggle_files)
    if not wiggle_files:
        raise IOError("could not find wiggle files with '%s'" %
                      options.wiggle_files)

    index = Wiggle.WiggleMultiIndexedAccess(wiggle_files,
                                            keep_open=True,
                                            use_cache=False)

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    options.stdout.write(
        "query\tnali\t%s" %
        ("\t".join(Stats.DistributionalParameters().getHeaders())))
    if options.with_values:
        options.stdout.write("\tvalues")
    options.stdout.write("\n")

    while 1:

        if options.test and ninput >= options.test:
            break

        match = next(iterator)

        if match is None:
            break

        ninput += 1

        if options.loglevel >= 2:
            options.stdlog.write(str(match) + "\n")

        # psl always matches on the forward strand
        map_genome2query = alignlib_lite.py_makeAlignmentBlocks()
        f = alignlib_lite.py_AlignmentFormatBlat(
            "%i\t%i\t%i\t%i\t%s\t%s\t%s\n" %
            (match.mSbjctFrom,
             match.mSbjctTo,
             match.mQueryFrom,
             match.mQueryTo,
             match.mSbjctBlockStarts,
             match.mQueryBlockStarts,
             match.mBlockSizes))
        f.copy(map_genome2query)

        data = index.get(options.prefix + match.mSbjctId,
                         match.mSbjctFrom,
                         match.mSbjctTo)

        values = []
        for x, vv in data:
            for v in vv:
                if map_genome2query.mapRowToCol(x) >= 0:
                    values.append(v)
                x += 1

        if len(values) == 0:
            nskipped += 1
            continue

        noutput += 1

        if options.loglevel >= 2:
            options.stdlog.write(
                "# %s\n" % ",".join(["%5.3f" % v for v in values]))

        s = Stats.DistributionalParameters(values)
        options.stdout.write(
            "%s\t%i\t%s" % (match.mQueryId,
                            match.mNMismatches + match.mNMatches,
                            str(s)))

        if options.with_values:
            options.stdout.write(
                "\t%s" % ",".join(["%5.3f" % v for v in values]))

        options.stdout.write("\n")

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" %
            (ninput, noutput, nskipped))

    E.Stop()
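
# The value filter in the loop above, reduced to plain python (an
# illustration with a hypothetical name): ``data`` is a list of
# (genome_position, values) blocks from the wiggle index, and a set of
# aligned genomic positions stands in for map_genome2query.mapRowToCol().
def _aligned_values(data, aligned_positions):
    """keep only values whose genomic position is aligned to the query.

    >>> _aligned_values([(10, [0.5, 0.7, 0.9])], {10, 12})
    [0.5, 0.9]
    """
    values = []
    for x, vv in data:
        for v in vv:
            if x in aligned_positions:
                values.append(v)
            x += 1
    return values
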