def pslSelectQuery(options):
    """Write one entry per query group, selected by a match statistic.

    ``options.select`` must have the form ``<value>-<field>`` where
    *value* is ``most`` or ``least`` and *field* is ``nmatches`` or
    ``nmismatches``.  For every group yielded by
    ``Blat.iterator_per_query`` the entries are sorted by the chosen
    attribute and the last (``most``) or first (``least``) entry is
    written to ``options.stdout``.

    Raises
    ------
    ValueError
        If ``options.select`` names an unknown value or field.
    """
    value, field = options.select.split("-")

    sort_keys = {
        "nmatches": lambda x: x.mNMatches,
        "nmismatches": lambda x: x.mNMisMatches,
    }
    # Validate up front: previously an unknown field left the sort key
    # unbound (NameError) and an unknown value silently wrote nothing.
    if field not in sort_keys:
        raise ValueError(
            "unknown field '%s' in select option '%s'" % (field, options.select))
    if value not in ("most", "least"):
        raise ValueError(
            "unknown value '%s' in select option '%s'" % (value, options.select))

    f = sort_keys[field]
    # after an ascending sort, "most" is the last element, "least" the first
    index = -1 if value == "most" else 0

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    for data in Blat.iterator_per_query(Blat.iterator(options.stdin)):
        ninput += 1
        # debugging aid: stop once the requested number of groups is reached
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        data.sort(key=f)
        options.stdout.write("%s\n" % str(data[index]))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads PSL entries from ``options.stdin`` and writes one ``chain``
    record per entry to ``options.stdout``: a header line followed by
    the block sizes, with the target/query gaps between consecutive
    blocks.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: psl2chain.py 2901 2010-04-13 14:38:07Z andreas $",
        usage=globals()["__doc__"],
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ninput, nskipped, noutput = 0, 0, 0

    for psl in Blat.iterator(options.stdin):
        ninput += 1

        # on the negative strand the query interval is mirrored
        # against the query length
        if psl.strand == "-":
            qstart, qend = (psl.mQueryLength - psl.mQueryTo,
                            psl.mQueryLength - psl.mQueryFrom)
        else:
            qstart, qend = psl.mQueryFrom, psl.mQueryTo

        # header line; the running input counter doubles as chain id
        options.stdout.write(
            "chain %i %s %i %s %i %i %s %i %s %i %i %i\n" % (
                psl.mNMatches,
                psl.mSbjctId,
                psl.mSbjctLength,
                "+",
                psl.mSbjctFrom,
                psl.mSbjctTo,
                psl.mQueryId,
                psl.mQueryLength,
                psl.strand,
                qstart,
                qend,
                ninput,
            )
        )

        size, tend, qend = 0, None, None
        for qstart, tstart, size in psl.getBlocks():
            # from the second block on, finish the previous line with
            # the gap sizes on target and query
            if tend is not None:
                options.stdout.write(
                    "\t%i\t%i\n" % (tstart - tend, qstart - qend))
            qend, tend = qstart + size, tstart + size
            options.stdout.write("%i" % (size,))
        options.stdout.write("\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Entry point: convert PSL records on stdin into chain records.

    Command line options are taken from *argv*; when it is empty or
    None, ``sys.argv`` is used instead.
    """
    argv = argv or sys.argv

    # build the parser, add the common options and parse
    parser = E.OptionParser(
        version="%prog version: $Id: psl2chain.py 2901 2010-04-13 14:38:07Z andreas $",
        usage=globals()["__doc__"])
    (options, args) = E.Start(parser, argv=argv)

    out = options.stdout
    n_in = 0
    n_out = 0
    n_skip = 0

    for n_in, psl in enumerate(Blat.iterator(options.stdin), 1):
        # query interval, mirrored for negative-strand matches
        if psl.strand == "-":
            q_from = psl.mQueryLength - psl.mQueryTo
            q_to = psl.mQueryLength - psl.mQueryFrom
        else:
            q_from = psl.mQueryFrom
            q_to = psl.mQueryTo

        header_fields = (
            psl.mNMatches,
            psl.mSbjctId, psl.mSbjctLength, "+",
            psl.mSbjctFrom, psl.mSbjctTo,
            psl.mQueryId, psl.mQueryLength, psl.strand,
            q_from, q_to,
            n_in,
        )
        out.write(
            "chain %i %s %i %s %i %i %s %i %s %i %i %i\n" % header_fields)

        # end coordinates of the previously written block
        prev_t_end = None
        prev_q_end = None
        for block_q, block_t, block_size in psl.getBlocks():
            if prev_t_end is not None:
                gaps = (block_t - prev_t_end, block_q - prev_q_end)
                out.write("\t%i\t%i\n" % gaps)
            prev_q_end = block_q + block_size
            prev_t_end = block_t + block_size
            out.write("%i" % (block_size,))
        out.write("\n")

        n_out += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (n_in, n_out, n_skip))

    # footer and benchmark information
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads PSL (or PSLX, when sequence-based counters are requested)
    entries from ``options.stdin`` and writes one tab-separated row
    per entry, consisting of the match (or only its query id) followed
    by the columns of every selected counter.  Entries whose query and
    target sequences differ in length are skipped entirely.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--mask-lowercase", dest="mask_lowercase", action="store_true",
        help="mask lowercase characters before computing properties [default=%default]")

    parser.add_option(
        "--with-match", dest="with_match", action="store_true",
        help="echo the match in output [default=%default]")

    parser.add_option(
        "--without-match", dest="with_match", action="store_false",
        help="do not echo the match in output [default=%default]")

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("counts", "baseml", "match", "query-counts", "sbjct-counts"),
        help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    # pass argv on so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, argv=argv)

    # counters working on the match object itself
    counters_plain = []
    # counters working on the pair of aligned sequences
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    # sequence-based counters read the sequences from the input
    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write(
        "\t".join(
            [header] +
            ["\t".join(x.getHeaders()) for x in counters] +
            ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    # compiled once instead of once per record
    rx_non_letter = re.compile("[^a-zA-Z]")
    rx_lowercase = re.compile("[a-z]")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        # Collect the row and write it in one go at the end.  Writing
        # the match immediately left a partial line without newline in
        # the output whenever the record was skipped below.
        if options.with_match:
            row = [str(match)]
        else:
            row = [match.mQueryId]

        if counters:
            qseq = match.mQuerySequence
            sseq = match.mSbjctSequence

            # mask non printable characters - sometimes
            # appear after using pslToPslX
            qseq = [rx_non_letter.sub("N", x) for x in qseq]
            sseq = [rx_non_letter.sub("N", x) for x in sseq]

            if options.mask_lowercase:
                qseq = [rx_lowercase.sub("N", x) for x in qseq]
                sseq = [rx_lowercase.sub("N", x) for x in sseq]

            match.mQuerySequence = qseq
            match.mSbjctSequence = sseq

            qseq = "".join(match.mQuerySequence).upper()
            sseq = "".join(match.mSbjctSequence).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n" %
                        str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)
            row.extend(str(counter) for counter in counters)

        if counters_plain:
            for counter in counters_plain:
                counter(match)
            row.extend(str(counter) for counter in counters_plain)

        options.stdout.write("\t".join(row) + "\n")
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Applies the methods given with ``--method`` in order.  Methods
    that process the input themselves (map, filter-keep,
    filter-remove, merge, add-sequence, complement, select-query) end
    the method loop; all others wrap the PSL iterator, whose entries
    are written to ``options.stdout`` at the end.

    Raises
    ------
    ValueError
        If ``add-sequence`` is requested without both sequence files.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("--filter-query", dest="filename_filter_query",
                      type="string",
                      help="filename with intervals in the query "
                      "to filter (in gff format) [default=%default].")

    parser.add_option("--filter-target", dest="filename_filter_target",
                      type="string",
                      help="filename with intervals in the target to "
                      "filter (in gff format) [default=%default].")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("map", "merge",
                               "add-sequence", "complement",
                               "select-query", "test",
                               "filter-keep", "filter-remove",
                               "rename-query",
                               "sanitize",
                               "filter-fasta",
                               "remove-overlapping-query",
                               "remove-overlapping-target"),
                      help="""action to perform [default=%default].""")

    parser.add_option("--select", dest="select", type="choice",
                      choices=("most-nmatches", "least-nmatches",
                               "most-nmismatches", "least-nmismatches"),
                      help="entry to select [default=%default].")

    parser.add_option("--header-names", dest="header", type="choice",
                      choices=("none", "table", "full"),
                      help="output psl header [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf"),
                      help="format of intervals [default=%default].")

    parser.add_option("--queries-tsv-file", dest="filename_queries",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_sbjcts",
                      type="string",
                      help="fasta filename with sbjct [default=%default].")

    parser.add_option("--id-format", dest="id_format", type="string",
                      help="format of new identifiers for the rename "
                      "function [default=%default].")

    parser.add_option("--unique", dest="unique", action="store_true",
                      help="in the rename function, make each match "
                      "unique [default=%default].")

    parser.add_option("--output-filename-map", dest="output_filename_map",
                      type="string",
                      help="filename with map of old to new labels for "
                      "rename function [default=%default].")

    parser.add_option("--complement-min-length", dest="complement_min_length",
                      type="int",
                      help="minimum length for complemented blocks "
                      "[default=%default].")

    parser.add_option("--complement-border", dest="complement_border",
                      type="int",
                      help="number of residues to exclude before alignment "
                      "at either end [default=%default].")

    parser.add_option("--complement-aligner", dest="complement_aligner",
                      type="choice",
                      choices=("clustal", "dba", "dialign", "dialign-lgs"),
                      help="aligner for complemented segments "
                      "[default=%default].")

    parser.add_option("--threshold-merge-distance",
                      dest="threshold_merge_distance", type="int",
                      help="distance in nucleotides at which two adjacent "
                      "reads shall be merged even if they are not "
                      "overlapping [%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="for debugging purposes - stop after x "
                      "iterations [default=%default].")

    parser.set_defaults(filename_filter_target=None,
                        filename_filter_query=None,
                        filename_queries=None,
                        filename_sbjcts=None,
                        threshold_merge_distance=0,
                        report_step=100000,
                        min_aligned=100,
                        methods=[],
                        format="gff",
                        select="most-nmatches",
                        id_format="%06i",
                        unique=False,
                        output_filename_map=None,
                        header=None,
                        test=None)

    # pass argv on so that an explicitly supplied argument list is honoured
    (options, args) = E.start(parser, argv=argv, add_pipe_options=True)

    if options.filename_queries:
        query_fasta = IndexedFasta.IndexedFasta(options.filename_queries)
    else:
        query_fasta = None

    if options.filename_sbjcts:
        sbjct_fasta = IndexedFasta.IndexedFasta(options.filename_sbjcts)
    else:
        sbjct_fasta = None

    if "add-sequence" in options.methods and \
            (sbjct_fasta is None or query_fasta is None):
        raise ValueError(
            "please supply both indexed query and "
            "target/genome sequence data.")

    iterator = Blat.iterator(options.stdin)

    # The guard used "or", which is always true; "and" states the
    # intent.  Output is unchanged because only "table" and "full"
    # write anything.
    if options.header is not None and options.header != "none":
        if options.header == "table":
            options.stdout.write("\t".join(Blat.FIELDS) + "\n")
        elif options.header == "full":
            options.stdout.write(Blat.HEADER + "\n")

    for method in options.methods:

        # methods that consume options.stdin themselves end the loop
        if "map" == method:
            pslMap(options)
            break
        elif "filter-keep" == method:
            pslFilter(options, keep=True)
            break
        elif "filter-remove" == method:
            pslFilter(options, keep=False)
            break
        elif "merge" == method:
            pslMerge(options)
            break
        elif "add-sequence" == method:
            pslAddSequence(query_fasta, sbjct_fasta, options)
            break
        elif "complement" == method:
            pslComplement(query_fasta, sbjct_fasta, options)
            break
        elif "select-query" == method:
            pslSelectQuery(options)
            break
        # the remaining methods wrap the iterator and can be chained
        elif "test" == method:
            iterator = Blat.iterator_test(iterator, options.report_step)
        elif "rename-query" == method:
            iterator = iterator_rename_query(iterator, options)
        elif "sanitize" == method:
            iterator = iterator_sanitize(
                iterator, query_fasta, sbjct_fasta, options)
        elif "filter-fasta" == method:
            iterator = iterator_filter_fasta(
                iterator, query_fasta, sbjct_fasta, options)
        elif "remove-overlapping-query" == method:
            iterator = iterator_filter_overlapping_query(iterator, options)
        elif "remove-overlapping-target" == method:
            iterator = iterator_filter_overlapping_target(iterator, options)

    # write whatever the (possibly wrapped) iterator still yields
    for psl in iterator:
        options.stdout.write("%s\n" % str(psl))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The methods given with ``--method`` are applied in order.  A
    method that processes the input itself ends the method loop; the
    other methods wrap the PSL iterator, whose entries are written to
    ``options.stdout`` at the end.

    Raises
    ------
    ValueError
        If ``add-sequence`` is requested without both sequence files.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("--filter-query", dest="filename_filter_query",
                      type="string",
                      help="filename with intervals in the query "
                      "to filter (in gff format) [default=%default].")

    parser.add_option("--filter-target", dest="filename_filter_target",
                      type="string",
                      help="filename with intervals in the target to "
                      "filter (in gff format) [default=%default].")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("map", "merge",
                               "add-sequence", "complement",
                               "select-query", "test",
                               "filter-keep", "filter-remove",
                               "rename-query",
                               "sanitize",
                               "filter-fasta",
                               "remove-overlapping-query",
                               "remove-overlapping-target"),
                      help="""action to perform [default=%default].""")

    parser.add_option("--select", dest="select", type="choice",
                      choices=("most-nmatches", "least-nmatches",
                               "most-nmismatches", "least-nmismatches"),
                      help="entry to select [default=%default].")

    parser.add_option("--header-names", dest="header", type="choice",
                      choices=("none", "table", "full"),
                      help="output psl header [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf"),
                      help="format of intervals [default=%default].")

    parser.add_option("--queries-tsv-file", dest="filename_queries",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_sbjcts",
                      type="string",
                      help="fasta filename with sbjct [default=%default].")

    parser.add_option("--id-format", dest="id_format", type="string",
                      help="format of new identifiers for the rename "
                      "function [default=%default].")

    parser.add_option("--unique", dest="unique", action="store_true",
                      help="in the rename function, make each match "
                      "unique [default=%default].")

    parser.add_option("--output-filename-map", dest="output_filename_map",
                      type="string",
                      help="filename with map of old to new labels for "
                      "rename function [default=%default].")

    parser.add_option("--complement-min-length", dest="complement_min_length",
                      type="int",
                      help="minimum length for complemented blocks "
                      "[default=%default].")

    parser.add_option("--complement-border", dest="complement_border",
                      type="int",
                      help="number of residues to exclude before alignment "
                      "at either end [default=%default].")

    parser.add_option("--complement-aligner", dest="complement_aligner",
                      type="choice",
                      choices=("clustal", "dba", "dialign", "dialign-lgs"),
                      help="aligner for complemented segments "
                      "[default=%default].")

    parser.add_option("--threshold-merge-distance",
                      dest="threshold_merge_distance", type="int",
                      help="distance in nucleotides at which two adjacent "
                      "reads shall be merged even if they are not "
                      "overlapping [%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="for debugging purposes - stop after x "
                      "iterations [default=%default].")

    parser.set_defaults(filename_filter_target=None,
                        filename_filter_query=None,
                        filename_queries=None,
                        filename_sbjcts=None,
                        threshold_merge_distance=0,
                        report_step=100000,
                        min_aligned=100,
                        methods=[],
                        format="gff",
                        select="most-nmatches",
                        id_format="%06i",
                        unique=False,
                        output_filename_map=None,
                        header=None,
                        test=None)

    # pass argv on so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    query_fasta = None
    if options.filename_queries:
        query_fasta = IndexedFasta.IndexedFasta(options.filename_queries)

    sbjct_fasta = None
    if options.filename_sbjcts:
        sbjct_fasta = IndexedFasta.IndexedFasta(options.filename_sbjcts)

    if "add-sequence" in options.methods and \
            (sbjct_fasta is None or query_fasta is None):
        raise ValueError(
            "please supply both indexed query and "
            "target/genome sequence data.")

    iterator = Blat.iterator(options.stdin)

    # The former guard ("is not None or != 'none'") was always true;
    # only "table" and "full" ever produced output, so testing these
    # two values directly is equivalent.
    if options.header == "table":
        options.stdout.write("\t".join(Blat.FIELDS) + "\n")
    elif options.header == "full":
        options.stdout.write(Blat.HEADER + "\n")

    # methods that process options.stdin themselves; the first one
    # encountered ends the method loop
    standalone = {
        "map": lambda: pslMap(options),
        "filter-keep": lambda: pslFilter(options, keep=True),
        "filter-remove": lambda: pslFilter(options, keep=False),
        "merge": lambda: pslMerge(options),
        "add-sequence": lambda: pslAddSequence(
            query_fasta, sbjct_fasta, options),
        "complement": lambda: pslComplement(
            query_fasta, sbjct_fasta, options),
        "select-query": lambda: pslSelectQuery(options),
    }

    # methods that wrap the iterator and can be chained
    wrappers = {
        "test": lambda it: Blat.iterator_test(it, options.report_step),
        "rename-query": lambda it: iterator_rename_query(it, options),
        "sanitize": lambda it: iterator_sanitize(
            it, query_fasta, sbjct_fasta, options),
        "filter-fasta": lambda it: iterator_filter_fasta(
            it, query_fasta, sbjct_fasta, options),
        "remove-overlapping-query": lambda it:
            iterator_filter_overlapping_query(it, options),
        "remove-overlapping-target": lambda it:
            iterator_filter_overlapping_target(it, options),
    }

    for method in options.methods:
        if method in standalone:
            standalone[method]()
            break
        elif method in wrappers:
            iterator = wrappers[method](iterator)

    # write whatever the (possibly wrapped) iterator still yields
    for psl in iterator:
        options.stdout.write("%s\n" % str(psl))

    E.Stop()
if method == "counts": counters.append( SequencePairProperties.SequencePairPropertiesCountsNa() ) elif method == "query-counts": counters.append( QueriesCounter() ) elif method == "sbjct-counts": counters.append( SbjctsCounter() ) elif method == "baseml": counters.append( SequencePairProperties.SequencePairPropertiesBaseML( options ) ) elif method == "match": counters_plain.append( CounterMatch( options ) ) if counters: iterator = Blat.iterator_pslx( options.stdin ) header = "\t".join(Blat.MatchPSLX().getHeaders()) else: iterator = Blat.iterator( options.stdin ) header = "\t".join(Blat.Match().getHeaders()) if not options.with_match: header = "qName" options.stdout.write( "\t".join( [header,] + [ "\t".join(x.getHeaders()) for x in counters] + [ "\t".join(x.getHeaders()) for x in counters_plain] ) + "\n" ) ninput, noutput, nskipped = 0, 0, 0 # ## setup totals # totals = {} # for section in options.sections:
def main():
    """script main.

    Reads a PSL map of genes to genome from ``options.filename_map``
    and quality scores from ``options.quality_file``.  For every
    ``cluster_id<TAB>gene_id<TAB>alignment`` line on ``options.stdin``
    it writes the alignment regions whose mapped genomic positions
    have a quality score below ``options.quality_threshold`` as
    ``cluster_id<TAB>start<TAB>end`` rows.

    Raises
    ------
    ValueError
        If a query id occurs more than once in the map file.
    """
    parser = E.OptionParser(
        version="%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--random-proportion", dest="random_proportion", type="float",
        help="mask randomly columns in multiple alignments [default=%default]")

    parser.add_option(
        "--random", dest="random", action="store_true",
        help="shuffle quality scores before masking [default=%default]")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.Start(parser)

    ##################################################
    # read map
    ##################################################
    # NOTE(review): no option visible here sets filename_map; with the
    # default of None the open() below fails - confirm where it is set.
    map_genes2genome = {}
    # "with" closes the file even if parsing raises
    with open(options.filename_map) as infile:
        for match in Blat.iterator(infile):
            # explicit check: an assert is stripped under "python -O"
            if match.mQueryId in map_genes2genome:
                raise ValueError("duplicate entry %s" % match.mQueryId)
            map_genes2genome[match.mQueryId] = match

    ##################################################
    # get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    ##################################################
    # main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        if line.startswith("cluster_id"):
            continue

        ninput += 1
        # strip only the newline: line[:-1] removed the last alignment
        # character when the final line had no trailing newline
        cluster_id, gene_id, alignment = line.rstrip("\n").split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        quality_scores = quality.getSequence(
            match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)

        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(
            map_mali2genome, map_gene2mali, map_gene2genome,
            alignlib_lite.py_RR)

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp, c in enumerate(alignment):
                if c == "-":
                    continue
                y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
                if y < 0:
                    continue
                positions.append(y)
            scores = [quality_scores[x] for x in positions]
            random.shuffle(scores)
            for p, q in zip(positions, scores):
                quality_scores[p] = q

        to_mask = []
        # rp: position counted from the end of the (reverted)
        # alignment, used for negative strand matches
        rp = len(alignment)
        for fp, c in enumerate(alignment):
            rp -= 1
            if c == "-":
                continue
            y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
            if y < 0:
                continue
            if quality_scores[y] < options.quality_threshold:
                if is_negative:
                    p = rp
                else:
                    p = fp
                E.debug(
                    "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i" %
                    (cluster_id, p, c, match.mSbjctId, match.strand,
                     map_mali2genome.mapRowToCol(fp), quality_scores[y]))
                if options.frame > 1:
                    # mask the complete frame-sized window containing p
                    start = (p // options.frame) * options.frame
                    to_mask.extend(range(start, start + options.frame))
                else:
                    to_mask.append(p)

        regions = Iterators.group_by_distance(sorted(to_mask))

        for start, end in regions:
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed))

    E.Stop()
forward_query = False, ) (options, args) = E.Start( parser ) if options.filename_query: query = IndexedFasta.IndexedFasta( options.filename_query ) if options.filename_target: target = IndexedFasta.IndexedFasta( options.filename_target ) if options.method == "full": getAlignment = getAlignmentFull id = 0 for match in Blat.iterator( options.stdin ): if options.loglevel >= 2: options.stdout.write("# %s\n" % str(match)) m = match.getMapQuery2Target() m.moveAlignment( -min(match.mQueryBlockStarts), -min(match.mSbjctBlockStarts) ) q = query.getSequence( match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo ) t = target.getSequence( match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo ) query_ali, sbjct_ali = getAlignment( m, q, t, options ) if match.strand == "-" and options.forward_query: query_ali = Genomics.complement( query_ali ) sbjct_ali = Genomics.complement( sbjct_ali ) options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" % \ (options.query_prefix,
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For every PSL/PSLX entry on ``options.stdin`` one tab-separated
    row is written to ``options.stdout``: the match (or its query id
    with ``--without-match``) followed by the output of the selected
    counters.  Entries with query and target sequences of unequal
    length are reported to ``options.stdlog`` and produce no row.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--mask-lowercase", dest="mask_lowercase", action="store_true",
        help="mask lowercase characters before computing properties [default=%default]")

    parser.add_option(
        "--with-match", dest="with_match", action="store_true",
        help="echo the match in output [default=%default]")

    parser.add_option(
        "--without-match", dest="with_match", action="store_false",
        help="do not echo the match in output [default=%default]")

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("counts", "baseml", "match", "query-counts", "sbjct-counts"),
        help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    # hand argv to the option parser; it used to be silently ignored
    (options, args) = E.Start(parser, argv=argv)

    counters_plain = []   # called with the match object
    counters = []         # called with the two aligned sequences

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    header_columns = [header]
    header_columns.extend("\t".join(x.getHeaders()) for x in counters)
    header_columns.extend("\t".join(x.getHeaders()) for x in counters_plain)
    options.stdout.write("\t".join(header_columns) + "\n")

    # patterns are loop-invariant, so compile them once
    non_letters = re.compile("[^a-zA-Z]")
    lowercase = re.compile("[a-z]")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        # The row is assembled first and written only once it is
        # complete; writing the match up front produced a dangling
        # line without newline for every skipped entry.
        columns = [str(match) if options.with_match else match.mQueryId]

        if counters:
            # mask non printable characters - sometimes
            # appear after using pslToPslX
            qseq = [non_letters.sub("N", x) for x in match.mQuerySequence]
            sseq = [non_letters.sub("N", x) for x in match.mSbjctSequence]

            if options.mask_lowercase:
                qseq = [lowercase.sub("N", x) for x in qseq]
                sseq = [lowercase.sub("N", x) for x in sseq]

            match.mQuerySequence = qseq
            match.mSbjctSequence = sseq

            qseq = "".join(match.mQuerySequence).upper()
            sseq = "".join(match.mSbjctSequence).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n" %
                        str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)
            columns.extend(str(counter) for counter in counters)

        if counters_plain:
            for counter in counters_plain:
                counter(match)
            columns.extend(str(counter) for counter in counters_plain)

        options.stdout.write("\t".join(columns) + "\n")
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" %
            (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Marks the query and target ranges of every PSL entry on
    ``options.stdin`` in one bitset per sequence and writes two
    tables (query, then target) with the number of covered positions,
    the sequence size and the percentage covered, plus a total row.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2stats.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    # pass argv on so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, argv=argv)

    query_bitsets, target_bitsets = {}, {}

    def addRange(bitset, seq_id, size, iterator):
        """Set all (start, end) ranges from *iterator* in the bitset of *seq_id*."""
        if seq_id not in bitset:
            bitset[seq_id] = bx.bitset.BinnedBitSet(size)
        b = bitset[seq_id]

        for start, end in iterator:
            # set_range takes a start and a length
            b.set_range(start, end - start)

    for psl in Blat.iterator(options.stdin):
        addRange(query_bitsets,
                 psl.mQueryId, psl.mQueryLength,
                 psl.iterator_query_exons())
        addRange(target_bitsets,
                 psl.mSbjctId, psl.mSbjctLength,
                 psl.iterator_sbjct_exons())

    def printBitset(outfile, bitsets):
        """Write the coverage table for *bitsets* to *outfile*."""
        outfile.write("contig\tcovered\tsize\tpcovered\n")
        total, total_len = 0, 0
        for chrom in sorted(bitsets):
            length = bitsets[chrom].size
            covered = bitsets[chrom].count_range(0, length)
            # zero-sized sequences are left out of the table (no
            # division by zero) but still enter the totals
            if length > 0:
                outfile.write("%s\t%i\t%i\t%6.4f\n" %
                              (chrom, covered, length,
                               100.0 * covered / length))
            total += covered
            total_len += length

        if total_len > 0:
            outfile.write("total\t%i\t%i\t%6.4f\n" %
                          (total, total_len, 100.0 * total / total_len))

    options.stdout.write("# query\n")
    printBitset(options.stdout, query_bitsets)
    options.stdout.write("# target\n")
    printBitset(options.stdout, target_bitsets)

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Loads a PSL map of genes to genome (``options.filename_map``) and
    quality scores (``options.quality_file``).  Each
    ``cluster_id<TAB>gene_id<TAB>alignment`` line read from
    ``options.stdin`` is projected onto the genome; alignment
    positions whose quality score is below
    ``options.quality_threshold`` are grouped into regions and
    written as ``cluster_id<TAB>start<TAB>end``.

    Raises
    ------
    ValueError
        If a query id occurs more than once in the map file.
    """
    parser = E.OptionParser(
        version="%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--random-proportion", dest="random_proportion", type="float",
        help="mask randomly columns in multiple alignments [default=%default]")

    parser.add_option(
        "--random", dest="random", action="store_true",
        help="shuffle quality scores before masking [default=%default]")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    # The argv parameter used to be ignored.  It is only forwarded
    # when given so that the default behaviour of E.Start is untouched.
    if argv is None:
        (options, args) = E.Start(parser)
    else:
        (options, args) = E.Start(parser, argv=argv)

    ##################################################
    # read map
    ##################################################
    # NOTE(review): no option visible here sets filename_map; with the
    # default of None the open() below fails - confirm where it is set.
    map_genes2genome = {}
    # context manager: the file is closed even when parsing fails
    with open(options.filename_map) as infile:
        for match in Blat.iterator(infile):
            # raise explicitly; asserts vanish under "python -O"
            if match.mQueryId in map_genes2genome:
                raise ValueError("duplicate entry %s" % match.mQueryId)
            map_genes2genome[match.mQueryId] = match

    ##################################################
    # get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    ##################################################
    # main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        if line.startswith("cluster_id"):
            continue

        ninput += 1
        # remove the newline only; line[:-1] cut off the last
        # alignment column if the input did not end with a newline
        cluster_id, gene_id, alignment = line.rstrip("\n").split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        quality_scores = quality.getSequence(match.mSbjctId, "+",
                                             match.mSbjctFrom,
                                             match.mSbjctTo)

        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(map_mali2genome,
                                          map_gene2mali,
                                          map_gene2genome,
                                          alignlib_lite.py_RR)

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp, c in enumerate(alignment):
                if c == "-":
                    continue
                y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
                if y < 0:
                    continue
                positions.append(y)
            scores = [quality_scores[x] for x in positions]
            random.shuffle(scores)
            for p, q in zip(positions, scores):
                quality_scores[p] = q

        to_mask = []
        # reverse position: index counted from the end of the
        # (reverted) alignment, reported for negative strand matches
        rp = len(alignment)
        for fp, c in enumerate(alignment):
            rp -= 1
            if c == "-":
                continue
            y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
            if y < 0:
                continue
            if quality_scores[y] < options.quality_threshold:
                p = rp if is_negative else fp
                E.debug(
                    "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i" %
                    (cluster_id, p, c, match.mSbjctId, match.strand,
                     map_mali2genome.mapRowToCol(fp), quality_scores[y]))
                if options.frame > 1:
                    # mask the whole frame-sized window around p
                    start = (p // options.frame) * options.frame
                    to_mask.extend(range(start, start + options.frame))
                else:
                    to_mask.append(p)

        regions = Iterators.group_by_distance(sorted(to_mask))

        for start, end in regions:
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Iterates over the matches that ``Blat.iterator`` yields from stdin
    and, for each match, writes the aligned query and target sequences
    to stdout as two fasta-style records.

    Raises:
        ValueError: if ``--query-psl-file`` or ``--target-psl-file`` is
            not supplied (both sequence sources are needed for every
            match).
        NotImplementedError: if ``--method`` is anything other than
            ``full``; the other choices are accepted by the parser but
            have no implementation in this function.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--query-psl-file", dest="filename_query",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_target",
                      type="string",
                      help="fasta filename with target.")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("full", "pileup-query", "pileup-target", "gapless"),
        help="method to use for constructing the alignment [%default].")

    parser.add_option(
        "--forward-query", dest="forward_query", action="store_true",
        help="reverse-complement sequences such that query is always on forward strand [%default]")

    parser.add_option("--target-prefix", dest="target_prefix",
                      type="string",
                      help="prefix to use for target [%default].")

    parser.add_option("--query-prefix", dest="query_prefix",
                      type="string",
                      help="prefix to use for query [%default].")

    parser.add_option("--id", dest="id", type="choice",
                      choices=("numeric", "query"),
                      help="choose type of identifier to use [%default]")

    parser.set_defaults(
        filename_query=None,
        filename_target=None,
        method="full",
        output_format_id="%06i",
        target_prefix="",
        query_prefix="",
        forward_query=False,
    )

    # Forward argv so that an explicitly supplied argument list is
    # honoured instead of silently falling back to sys.argv.
    (options, args) = E.Start(parser, argv=argv)

    # Fail early with a clear message instead of an UnboundLocalError
    # on the first match.
    if not options.filename_query:
        raise ValueError("please supply a query file (--query-psl-file)")
    if not options.filename_target:
        raise ValueError("please supply a target file (--target-psl-file)")

    query = IndexedFasta.IndexedFasta(options.filename_query)
    target = IndexedFasta.IndexedFasta(options.filename_target)

    if options.method == "full":
        getAlignment = getAlignmentFull
    else:
        raise NotImplementedError(
            "method '%s' is not implemented" % options.method)

    # running number used as the numeric identifier of each output pair
    nmatches = 0
    for match in Blat.iterator(options.stdin):
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        m = match.getMapQuery2Target()
        # Shift the map by the smallest block start on each side;
        # presumably this makes it relative to the sequence segments
        # fetched below -- TODO confirm against alignlib semantics.
        m.moveAlignment(-min(match.mQueryBlockStarts),
                        -min(match.mSbjctBlockStarts))

        q = query.getSequence(match.mQueryId, match.strand,
                              match.mQueryFrom, match.mQueryTo)
        t = target.getSequence(match.mSbjctId, "+",
                               match.mSbjctFrom, match.mSbjctTo)

        query_ali, sbjct_ali = getAlignment(m, q, t, options)

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement(query_ali)
            sbjct_ali = Genomics.complement(sbjct_ali)

        options.stdout.write(
            ">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" %
            (options.query_prefix,
             options.output_format_id % nmatches,
             match.mQueryId, match.mQueryFrom, match.mQueryTo,
             query_ali,
             options.target_prefix,
             options.output_format_id % nmatches,
             match.mSbjctId, match.strand,
             match.mSbjctFrom, match.mSbjctTo,
             sbjct_ali))
        nmatches += 1

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Iterates over the records that ``Blat.iterator`` yields from stdin,
    marks the intervals returned by ``iterator_query_exons()`` and
    ``iterator_sbjct_exons()`` in one bitset per query / target id, and
    writes two coverage tables (``# query`` and ``# target``) to stdout.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2stats.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    # Forward argv so that an explicitly supplied argument list is
    # honoured instead of silently falling back to sys.argv.
    (options, args) = E.Start(parser, argv=argv)

    query_bitsets, target_bitsets = {}, {}

    def addRange(bitsets, contig, size, intervals):
        """Mark every (start, end) in *intervals* in the bitset of *contig*.

        The bitset is created with *size* the first time *contig* is seen.
        """
        if contig not in bitsets:
            bitsets[contig] = bx.bitset.BinnedBitSet(size)
        covered = bitsets[contig]
        for start, end in intervals:
            # set_range is called with a start and a length, not an end
            covered.set_range(start, end - start)

    for psl in Blat.iterator(options.stdin):
        addRange(query_bitsets,
                 psl.mQueryId, psl.mQueryLength,
                 psl.iterator_query_exons())
        addRange(target_bitsets,
                 psl.mSbjctId, psl.mSbjctLength,
                 psl.iterator_sbjct_exons())

    def printBitset(outfile, bitsets):
        """Write one coverage row per contig plus a ``total`` row."""
        outfile.write("contig\tcovered\tsize\tpcovered\n")
        total, total_len = 0, 0
        for chrom in sorted(bitsets):
            size = bitsets[chrom].size
            covered = bitsets[chrom].count_range(0, size)
            # skip zero-length contigs to avoid a division by zero
            if size > 0:
                outfile.write("%s\t%i\t%i\t%6.4f\n" %
                              (chrom, covered, size, 100.0 * covered / size))
            total += covered
            total_len += size

        # no total row when nothing was read (or all contigs are empty)
        if total_len > 0:
            outfile.write("total\t%i\t%i\t%6.4f\n" %
                          (total, total_len, 100.0 * total / total_len))

    options.stdout.write("# query\n")
    printBitset(options.stdout, query_bitsets)
    options.stdout.write("# target\n")
    printBitset(options.stdout, target_bitsets)

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Iterates over the matches that ``Blat.iterator`` yields from stdin
    and, for each match, writes the aligned query and target sequences
    to stdout as two fasta-style records.

    Raises:
        ValueError: if ``--query-psl-file`` or ``--target-psl-file`` is
            not supplied (both sequence sources are needed for every
            match).
        NotImplementedError: if ``--method`` is anything other than
            ``full``; the other choices are accepted by the parser but
            have no implementation in this function.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--query-psl-file", dest="filename_query",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_target",
                      type="string",
                      help="fasta filename with target.")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("full", "pileup-query", "pileup-target", "gapless"),
        help="method to use for constructing the alignment [%default].")

    parser.add_option(
        "--forward-query", dest="forward_query", action="store_true",
        help="reverse-complement sequences such that query is always on forward strand [%default]")

    parser.add_option("--target-prefix", dest="target_prefix",
                      type="string",
                      help="prefix to use for target [%default].")

    parser.add_option("--query-prefix", dest="query_prefix",
                      type="string",
                      help="prefix to use for query [%default].")

    parser.add_option("--id", dest="id", type="choice",
                      choices=("numeric", "query"),
                      help="choose type of identifier to use [%default]")

    parser.set_defaults(
        filename_query=None,
        filename_target=None,
        method="full",
        output_format_id="%06i",
        target_prefix="",
        query_prefix="",
        forward_query=False,
    )

    # Forward argv so that an explicitly supplied argument list is
    # honoured instead of silently falling back to sys.argv.
    (options, args) = E.Start(parser, argv=argv)

    # Fail early with a clear message instead of an UnboundLocalError
    # on the first match.
    if not options.filename_query:
        raise ValueError("please supply a query file (--query-psl-file)")
    if not options.filename_target:
        raise ValueError("please supply a target file (--target-psl-file)")

    query = IndexedFasta.IndexedFasta(options.filename_query)
    target = IndexedFasta.IndexedFasta(options.filename_target)

    if options.method == "full":
        getAlignment = getAlignmentFull
    else:
        raise NotImplementedError(
            "method '%s' is not implemented" % options.method)

    # running number used as the numeric identifier of each output pair
    nmatches = 0
    for match in Blat.iterator(options.stdin):
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        m = match.getMapQuery2Target()
        # Shift the map by the smallest block start on each side;
        # presumably this makes it relative to the sequence segments
        # fetched below -- TODO confirm against alignlib semantics.
        m.moveAlignment(-min(match.mQueryBlockStarts),
                        -min(match.mSbjctBlockStarts))

        q = query.getSequence(match.mQueryId, match.strand,
                              match.mQueryFrom, match.mQueryTo)
        t = target.getSequence(match.mSbjctId, "+",
                               match.mSbjctFrom, match.mSbjctTo)

        query_ali, sbjct_ali = getAlignment(m, q, t, options)

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement(query_ali)
            sbjct_ali = Genomics.complement(sbjct_ali)

        options.stdout.write(
            ">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" %
            (options.query_prefix,
             options.output_format_id % nmatches,
             match.mQueryId, match.mQueryFrom, match.mQueryTo,
             query_ali,
             options.target_prefix,
             options.output_format_id % nmatches,
             match.mSbjctId, match.strand,
             match.mSbjctFrom, match.mSbjctTo,
             sbjct_ali))
        nmatches += 1

    E.Stop()