def pslAddSequence(query_fasta, sbjct_fasta, options):
    """Convert PSL matches read from stdin into PSLX records with sequences.

    Matches are pulled from ``sys.stdin`` through ``Blat.BlatIterator``.
    For each one, the aligned query and subject segments are fetched on
    the "+" strand via ``getSequence`` and attached to a new
    ``Blat.MatchPSLX``, whose string form is written as one line to
    ``options.stdout``.

    Arguments
    ---------
    query_fasta :
        Object providing ``getSequence(id, strand, start, end)`` for
        query sequences.
    sbjct_fasta :
        Object providing ``getSequence(id, strand, start, end)`` for
        subject sequences.
    options :
        Option container; ``test``, ``report_step`` and ``stdout`` are
        read here.
    """
    psl_records = Blat.BlatIterator(sys.stdin)

    # ndiscarded and nskipped are never incremented in this function; they
    # are only reported (as zero) in the final summary line.
    ninput = noutput = ndiscarded = nskipped = 0

    while True:
        record = next(psl_records)
        # A falsy value from the iterator is treated as end of input.
        if not record:
            break

        ninput += 1

        # In test mode, stop as soon as the input counter reaches the limit;
        # the record that hits the limit is not written.
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        extended = Blat.MatchPSLX()
        query_segment = query_fasta.getSequence(
            record.mQueryId, "+", record.mQueryFrom, record.mQueryTo)
        sbjct_segment = sbjct_fasta.getSequence(
            record.mSbjctId, "+", record.mSbjctFrom, record.mSbjctTo)
        extended.fromPSL(record, query_segment, sbjct_segment)

        options.stdout.write(str(extended) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
def main(argv=None):
    """Script main: write PSL/PSLX matches as a table with optional properties.

    Matches are read from ``options.stdin``. Each output row starts with
    the match itself (or only its query id with ``--without-match``),
    followed by the columns of every counter selected with ``--method``.
    Counters working on the aligned sequence pair require PSLX input;
    the ``match`` method works on the match record directly.

    Matches whose query and subject sequences differ in length (after
    masking) are skipped: a warning is logged and no row is written.

    NOTE(review): *argv* is defaulted to ``sys.argv`` here but is not
    forwarded to ``E.Start`` -- confirm whether ``E.Start`` should
    receive it.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--mask-lowercase", dest="mask_lowercase", action="store_true",
        help="mask lowercase characters before computing properties [default=%default]")

    parser.add_option(
        "--with-match", dest="with_match", action="store_true",
        help="echo the match in output [default=%default]")

    parser.add_option(
        "--without-match", dest="with_match", action="store_false",
        help="do not echo the match in output [default=%default]")

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("counts", "baseml", "match", "query-counts", "sbjct-counts"),
        help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    (options, args) = E.Start(parser)

    # counters: called with the (query, sbjct) sequence strings.
    # counters_plain: called with the match object itself.
    counters_plain = []
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    # Sequence-based counters need the PSLX parser and header.
    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write(
        "\t".join([header, ] +
                  ["\t".join(x.getHeaders()) for x in counters] +
                  ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    # Loop-invariant patterns, compiled once.
    non_letter = re.compile("[^a-zA-Z]")
    lowercase = re.compile("[a-z]")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        # Collect the row in memory and emit it only once the match is known
        # to be kept, so that a skipped match never leaves an unterminated
        # partial row in the output. The leading field is captured before
        # any masking so the echoed match shows the unmodified record.
        if options.with_match:
            row = [str(match)]
        else:
            row = [match.mQueryId]

        if counters:
            qseq = match.mQuerySequence
            sseq = match.mSbjctSequence

            # mask non printable characters - sometimes
            # appear after using pslToPslX
            qseq = [non_letter.sub("N", x) for x in qseq]
            sseq = [non_letter.sub("N", x) for x in sseq]

            if options.mask_lowercase:
                qseq = [lowercase.sub("N", x) for x in qseq]
                sseq = [lowercase.sub("N", x) for x in sseq]

            # The masked sequences are stored back on the match, so later
            # consumers of the match see them.
            match.mQuerySequence = qseq
            match.mSbjctSequence = sseq

            qseq = "".join(match.mQuerySequence).upper()
            sseq = "".join(match.mSbjctSequence).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n" % str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)

            row.append("\t".join([str(counter) for counter in counters]))

        if counters_plain:
            for counter in counters_plain:
                counter(match)

            row.append(
                "\t".join([str(counter) for counter in counters_plain]))

        options.stdout.write("\t".join(row) + "\n")
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()