def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--mask-lowercase", dest="mask_lowercase",
                      action="store_true",
                      help="mask lowercase characters before computing properties [default=%default]")

    parser.add_option("--with-match", dest="with_match",
                      action="store_true",
                      help="echo the match in output [default=%default]")

    parser.add_option("--without-match", dest="with_match",
                      action="store_false",
                      help="do not echo the match in output [default=%default]")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("counts", "baseml", "match",
                               "query-counts", "sbjct-counts"),
                      help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    (options, args) = E.Start(parser)

    counters_plain = []
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write("\t".join(
        [header, ] +
        ["\t".join(x.getHeaders()) for x in counters] +
        ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        if options.with_match:
            options.stdout.write(str(match))
        else:
            options.stdout.write(match.mQueryId)

        if counters:

            qseq = match.mQuerySequence
            sseq = match.mSbjctSequence

            # mask non-printable characters - these sometimes
            # appear after using pslToPslX
            qseq = [re.sub("[^a-zA-Z]", "N", x) for x in qseq]
            sseq = [re.sub("[^a-zA-Z]", "N", x) for x in sseq]

            if options.mask_lowercase:
                qseq = [re.sub("[a-z]", "N", x) for x in qseq]
                sseq = [re.sub("[a-z]", "N", x) for x in sseq]

            match.mQuerySequence = qseq
            match.mSbjctSequence = sseq

            qseq = "".join(match.mQuerySequence).upper()
            sseq = "".join(match.mSbjctSequence).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n" % str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)

            options.stdout.write(
                "\t" + "\t".join([str(counter) for counter in counters]))

        if counters_plain:

            for counter in counters_plain:
                counter(match)

            options.stdout.write(
                "\t" + "\t".join([str(counter) for counter in counters_plain]))

        options.stdout.write("\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
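# Hedged usage sketch (file names are hypothetical, not from the source): compute
# per-pair nucleotide counts for alignments in a pslx file read from stdin and write
# a tab-separated table to stdout:
#
#   python psl2table.py --method=counts < in.pslx > counts.tsv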
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header", action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--input-filename-queries",
                      dest="input_filename_queries", type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries. Adjacent exons of a "
                      "transcript will still be merged [default=%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    if options.is_gtf:
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(sys.stdin), feature="exon"),
            strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(sys.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0

        intervals = Intervals.combine([(gff.start, gff.end) for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start
            result.addDiagonal(xstart, xend, start - xstart)
            xstart = xend

        entry = Blat.Match()
        # all exons of a transcript share the same identifiers, so the
        # first entry is representative
        gff = gffs[0]
        entry.mQueryId = gff.transcript_id
        entry.mSbjctId = gff.contig
        entry.strand = gff.strand

        if genome_fasta and entry.mSbjctId in genome_fasta:
            entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
        else:
            entry.mSbjctLength = result.getColTo()

        if queries_fasta and entry.mQueryId in queries_fasta:
            entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))
    E.Stop()
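# Hedged usage sketch (file names are hypothetical): convert transcript models in GTF
# format to PSL, filling in sequence lengths from indexed fasta files when supplied:
#
#   python gff2psl.py --is-gtf --genome-file=genome \
#       --input-filename-queries=transcripts.fa < models.gtf > models.psl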
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-t", "--tablename", dest="tablename",
                      type="string",
                      help="tablename to get variants from (in samtools "
                      "pileup format) [default=%default].")

    parser.add_option("-d", "--database", dest="database",
                      type="string",
                      help="sqlite3 database [default=%default].")

    parser.add_option("-f", "--exons-file", dest="filename_exons",
                      type="string",
                      help="filename with transcript model information "
                      "(gtf formatted file) [default=%default].")

    parser.add_option("-r", "--filename-reference", dest="filename_reference",
                      type="string",
                      help="filename with transcript models of a reference "
                      "gene set. Stop codons that do not overlap any of the "
                      "exons in this file are ignored (gtf-formatted file) "
                      "[default=%default].")

    parser.add_option("--vcf-file", dest="filename_vcf",
                      type="string",
                      help="filename with variants in VCF format. Should be "
                      "indexed by tabix [default=%default].")

    parser.add_option("--pileup-file", dest="filename_pileup",
                      type="string",
                      help="filename with variants in samtools pileup format. "
                      "Should be indexed by tabix [default=%default].")

    parser.add_option("--vcf-sample", dest="vcf_sample",
                      type="string",
                      help="sample id for species of interest in vcf "
                      "formatted file [default=%default].")

    parser.add_option("-s", "--seleno-tsv-file", dest="filename_seleno",
                      type="string",
                      help="filename of a list of transcript ids that are "
                      "selenoproteins [default=%default].")

    parser.add_option("-m", "--module", dest="modules",
                      type="choice", action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")

    parser.add_option("-o", "--output-section", dest="output",
                      type="choice", action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")

    parser.add_option("-k", "--with-knockouts", dest="with_knockouts",
                      action="store_true",
                      help="add alleles that are knocked out to fasta and "
                      "gtf files [default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        seleno = {}

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(
            options.database, options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(
            options.filename_vcf, options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(
            ("gene_id", "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype", ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(
                indexed_variants, all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])
        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id

            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been
                # knocked out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:],
                                allele.cds_starts[1:],
                                allele.frames[1:]):

                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = \
                                    lcontig - gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = \
                                lcontig - gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(
                            ">%s\n%s\n" % (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand
                # coordinates
                if allele.reference_first_stop_start >= 0 \
                        and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id, transcript_id, allele_id,
                                   contig, strand, "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
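# Hedged usage sketch (file and sample names are hypothetical): derive alleles for the
# transcripts in a GTF stream, taking variants from a tabix-indexed VCF; the section
# files (cds.fasta, peptides.fasta, table, gtf, map.psl) are written via E.openOutputFile:
#
#   python gtf2alleles.py --genome-file=genome --vcf-file=calls.vcf.gz \
#       --vcf-sample=NA12878 --output-section=all < transcripts.gtf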
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-q", "--query", dest="query", type="string",
                      help="sequence to use for query [default=%default].")

    parser.add_option("-t", "--target", dest="target", type="string",
                      help="sequence to use for target [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.set_defaults(
        genome_file=None,
        query=None,
        target=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    psl = Blat.Match()

    for bed in Bed.iterator(options.stdin):

        ninput += 1

        start, end = bed.start, bed.end

        if "blockSizes" in bed:
            psl.mQueryId = bed["name"]
            blocksizes = [int(x) for x in bed["blockSizes"].split(",")[:-1]]
            sbjctblockstarts = [int(x) + start
                                for x in bed["blockStarts"].split(",")[:-1]]
            strand = bed["strand"]
        else:
            psl.mQueryId = "%i" % ninput
            blocksizes = [end - start]
            sbjctblockstarts = [start, ]
            strand = "+"

        psl.mSbjctId = bed.contig
        psl.mSbjctFrom, psl.mSbjctTo = start, end
        psl.mQueryFrom, psl.mQueryTo = 0, end - start

        psl.mBlockSizes = blocksizes
        psl.mNBlocks = len(blocksizes)
        psl.strand = strand

        q, qp = [], 0
        for x in blocksizes:
            q.append(qp)
            qp += x
        psl.mQueryBlockStarts = q
        psl.mSbjctBlockStarts = sbjctblockstarts
        psl.mQueryLength = sum(psl.mBlockSizes)
        if fasta:
            psl.mSbjctLength = fasta.getLength(bed.contig)

        options.stdout.write("%s\n" % str(psl))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
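# Hedged usage sketch (file names are hypothetical): convert BED intervals read from
# stdin to PSL, taking target lengths from an indexed genome when available:
#
#   python bed2psl.py --genome-file=genome < in.bed > out.psl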
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: maf2psl.py 2879 2010-04-06 14:44:34Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-q", "--query", dest="query", type="string",
                      help="sequence to use for query [default=%default].")

    parser.add_option("-t", "--target", dest="target", type="string",
                      help="sequence to use for target [default=%default].")

    parser.set_defaults(
        query=None,
        target=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.query is None or options.target is None:
        if len(args) != 2:
            raise ValueError(
                "please supply two sequence identifiers for query and target")
        options.query, options.target = args

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    reader = maf.Reader(options.stdin)

    psl = Blat.Match()
    for cc in threaditer(reader, (options.query, options.target)):
        ninput += 1
        query, target = cc

        # treat identifiers like Hsap.GL000223.1
        try:
            data = query.src.split(".")
            qs, qcontig = data[0], ".".join(data[1:])
        except ValueError as msg:
            raise ValueError(
                "error: could not parse query %s: msg=%s" % (query.src, msg))
        try:
            data = target.src.split(".")
            ts, tcontig = data[0], ".".join(data[1:])
        except ValueError as msg:
            raise ValueError(
                "error: could not parse target %s: msg=%s" % (target.src, msg))

        assert qs == options.query
        assert ts == options.target

        psl.mQueryId = qcontig
        psl.mSbjctId = tcontig

        psl.fromPair(query.start, query.src_size, query.strand,
                     query.text.upper(),
                     target.start, target.src_size, target.strand,
                     target.text.upper())

        E.debug("%s\t%s\t%i\t%i\t%s\t%s" %
                (qs, qcontig, query.start, query.src_size,
                 query.strand, query.text))
        E.debug("%s\t%s\t%i\t%i\t%s\t%s" %
                (ts, tcontig, target.start, target.src_size,
                 target.strand, target.text))

        options.stdout.write("%s\n" % str(psl))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
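# Hedged usage sketch (the species labels are examples only; "Hsap" follows the
# identifier convention noted in the code): extract the pairwise alignment between two
# tracks of a MAF file and write it as PSL:
#
#   python maf2psl.py --query=Hsap --target=Ptro < in.maf > out.psl
#
# The two identifiers may alternatively be given as positional arguments.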
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2map.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--queries-tsv-file", dest="input_filename_queries",
                      type="string",
                      help="fasta filename with queries - required for "
                      "polyA analysis [%default].")

    parser.add_option("--polyA", dest="polyA", action="store_true",
                      help="detect polyA tails [%default].")

    parser.add_option("-p", "--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information on "
                      "aggregate coverages [%default].")

    parser.add_option("--output-filename-empty", dest="output_filename_empty",
                      type="string",
                      help="OUTPUT filename with queries for which all "
                      "matches have been discarded [%default].")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice", choices=("map", "psl"),
                      help="output format to choose [%default].")

    parser.add_option("-z", "--from-zipped", dest="from_zipped",
                      action="store_true",
                      help="input is zipped.")

    parser.add_option("--threshold-min-pid", dest="threshold_min_pid",
                      type="float",
                      help="minimum thresholds for pid [%default].")

    parser.add_option("--threshold-min-matches", dest="threshold_min_matches",
                      type="int",
                      help="minimum threshold for number of matching "
                      "residues [%default].")

    parser.add_option("--threshold-max-error-rate",
                      dest="threshold_max_error_rate", type="float",
                      help="maximum threshold for error of aligned part "
                      "[%default].")

    parser.add_option("--threshold-good-query-coverage",
                      dest="threshold_good_query_coverage", type="float",
                      help="minimum query coverage for segments to be "
                      "counted as good [%default].")

    parser.add_option("--threshold-min-query-coverage",
                      dest="threshold_min_query_coverage", type="float",
                      help="minimum query coverage for segments to be "
                      "accepted [%default].")

    parser.add_option("--threshold-max-query-gapchars",
                      dest="threshold_max_query_gapchars", type="int",
                      help="maximum number of gap characters in query "
                      "[%default].")

    parser.add_option("--threshold-max-query-gaps",
                      dest="threshold_max_query_gaps", type="int",
                      help="maximum number of gaps in query [%default].")

    parser.add_option("--threshold-max-sbjct-gapchars",
                      dest="threshold_max_sbjct_gapchars", type="int",
                      help="maximum number of gap characters in sbjct "
                      "[%default].")

    parser.add_option("--keep-unique-matches", dest="keep_unique_matches",
                      action="store_true",
                      help="ignore filters for unique matches [%default].")

    parser.add_option("--keep-all-best", dest="keep_all_best",
                      action="store_true",
                      help="when sorting matches, keep all matches within "
                      "the collection threshold [%default].")

    parser.add_option("--output-best-per-subject", dest="best_per_sbjct",
                      action="store_true",
                      help="keep only the best entry per sbjct (for "
                      "transcript mapping) [%default].")

    parser.add_option("--threshold-max-sbjct-gaps",
                      dest="threshold_max_sbjct_gaps", type="int",
                      help="maximum number of gaps in sbjct [%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing [%default].")

    parser.add_option("-m", "--matching-mode", dest="matching_mode",
                      type="choice",
                      choices=("best-coverage", "best-query-coverage",
                               "best-sbjct-coverage", "best-pid",
                               "best-covpid", "best-query-covpid",
                               "best-sbjct-covpid", "best-min-covpid",
                               "best-query-min-covpid",
                               "best-sbjct-min-covpid", "unique", "all"),
                      help="determines how to select the best match "
                      "[%default].")
    parser.add_option("--subjctfilter-tsv-file", dest="filename_filter_sbjct",
                      type="string",
                      help="gff file for filtering sbjct matches. Matches "
                      "overlapping these regions are discarded, but see "
                      "--keep-forbidden [%default].")

    parser.add_option("--keep-forbidden", dest="keep_forbidden",
                      action="store_true",
                      help="if set, keep only matches that overlap the "
                      "regions supplied with --subjctfilter-tsv-file "
                      "[%default].")

    parser.add_option("--query-forward-coordinates",
                      dest="query_forward_coordinates", action="store_true",
                      help="use forward coordinates for query, strand will "
                      "refer to sbjct [%default].")

    parser.add_option("--ignore-all-random", dest="ignore_all_random",
                      action="store_true",
                      help="if there are multiple best matches, ignore all "
                      "those to chrUn and _random [%default].")

    parser.add_option("--collection-threshold", dest="collection_threshold",
                      type="float",
                      help="threshold for collecting matches, percent of "
                      "best score [%default].")

    parser.add_option("--collection-distance", dest="collection_distance",
                      type="float",
                      help="threshold for collecting matches, difference to "
                      "best score [%default].")

    parser.set_defaults(input_filename_domains=None,
                        input_filename_queries=None,
                        threshold_good_query_coverage=90.0,
                        threshold_min_pid=30.0,
                        threshold_min_matches=0,
                        threshold_max_error_rate=None,
                        output_filename_pattern="%s",
                        keep_unique_matches=False,
                        output_format="map",
                        print_matched=["full", "partial", "good"],
                        from_zipped=False,
                        combine_overlaps=True,
                        min_length_domain=30,
                        threshold_min_query_coverage=50,
                        min_length_singletons=30,
                        new_family_id=10000000,
                        add_singletons=False,
                        matching_mode="best-coverage",
                        best_per_sbjct=False,
                        threshold_max_query_gapchars=None,
                        threshold_max_query_gaps=None,
                        threshold_max_sbjct_gapchars=None,
                        threshold_max_sbjct_gaps=None,
                        filename_filter_sbjct=None,
                        keep_forbidden=False,
                        keep_all_best=False,
                        test=None,
                        query_forward_coordinates=False,
                        output_filename_empty=None,
                        collection_threshold=1.0,
                        collection_distance=0,
                        polyA=False,
                        # max residues missing from non polyA end
                        polyA_max_unaligned=3,
                        # min residues in tail
                        polyA_min_unaligned=10,
                        # min percent residues that are A/T in tail
                        polyA_min_percent=70.0,
                        # ignore duplicate matches if they are on Un or
                        # _random
                        ignore_all_random=False,
                        )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) == 1:
        if options.from_zipped or args[0][-3:] == ".gz":
            import gzip
            infile = gzip.open(args[0], "r")
        else:
            infile = IOTools.openFile(args[0], "r")
    else:
        infile = sys.stdin

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    if options.filename_filter_sbjct:

        try:
            import bx.intervals.intersection
        except ImportError:
            raise ValueError("filtering for intervals requires the bx tools")

        intervals = GTF.readGFFFromFileAsIntervals(
            IOTools.openFile(options.filename_filter_sbjct, "r"))

        intersectors = {}

        for contig, values in list(intervals.items()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in values:
                intersector.add_interval(bx.intervals.Interval(start, end))
            intersectors[contig] = intersector

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read %i intervals for %i contigs.\n" %
                (sum([len(x) for x in list(intervals.values())]),
                 len(intersectors)))
    else:
        intersectors = None

    ################################################
    # processing of a chunk (matches of same query)
    ################################################
    ninput, noutput, nskipped = 0, 0, 0

    # number of sequences with full/partial/good matches
    nfull_matches, npartial_matches, ngood_matches = 0, 0, 0
    # number of sequences which are fully/good/partially matched
    # i.e., after combining all aligned regions
    nfully_matched, npartially_matched, nwell_matched = 0, 0, 0

    nremoved_pid, nremoved_query_coverage, nempty = 0, 0, 0
    nremoved_gaps, nremoved_nmatches = 0, 0
    nremoved_regions = 0
    nqueries_removed_region = 0

    aggregate_coverages = []
    mapped_coverages = []
    fully_matched = []
    well_matched = []
    partially_matched = []
    new_family_id = options.new_family_id

    if options.output_filename_empty:
        outfile_empty = IOTools.openFile(options.output_filename_empty, "w")
        outfile_empty.write("read_id\tcomment\n")
    else:
        outfile_empty = None

    if options.polyA:
        options.outfile_polyA = IOTools.openFile(
            options.output_filename_pattern % "polyA", "w")
        options.outfile_polyA.write("query_id\tstart\tend\tpA+N\tpT+N\ttail\n")

    def processChunk(query_id, matches):
        """process a set of matches from query_id"""

        nonlocal ninput, noutput, nskipped
        nonlocal nfull_matches, npartial_matches, ngood_matches
        nonlocal nremoved_pid, nremoved_query_coverage, nempty, \
            nremoved_gaps, nremoved_nmatches
        nonlocal nremoved_regions, nqueries_removed_region

        ninput += 1

        full_matches = []
        good_matches = []
        partial_matches = []

        x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches = 0, 0, 0, 0
        nmatches = len(matches)

        new_matches = []

        # absolute filters applicable to non-fragmentary matches
        for match in matches:

            if match.mPid < options.threshold_min_pid:
                nremoved_pid += 1
                continue

            if match.mNMatches < options.threshold_min_matches:
                nremoved_nmatches += 1
                continue

            if options.threshold_max_error_rate:
                r = 100.0 * math.pow(
                    options.threshold_max_error_rate,
                    match.mNMatches + match.mNMismatches)
                if match.mPid < r:
                    nremoved_pid += 1
                    x_nremoved_pid += 1
                    continue

            new_matches.append(match)

        matches = new_matches

        # filter matches
        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write(
                    "%s\tall matches removed after applying thresholds: "
                    "before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage,
                     x_nremoved_gaps, x_nremoved_nmatches))
            nskipped += 1
            return

        if options.keep_unique_matches and len(matches) == 1:
            pass
        else:
            new_matches = []

            for match in matches:

                if match.mQueryCoverage < options.threshold_min_query_coverage:
                    nremoved_query_coverage += 1
                    x_nquery_coverage += 1
                    continue

                if options.threshold_max_query_gaps and \
                        match.mQueryNGapsCounts > options.threshold_max_query_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_query_gapchars and \
                        match.mQueryNGapsBases > options.threshold_max_query_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gaps and \
                        match.mSbjctNGapsCounts > options.threshold_max_sbjct_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gapchars and \
                        match.mSbjctNGapsBases > options.threshold_max_sbjct_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                new_matches.append(match)

            matches = new_matches

        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write(
                    "%s\tall matches removed after applying thresholds: "
                    "before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage,
                     x_nremoved_gaps, x_nremoved_nmatches))
            nskipped += 1
            return

        # Remove queries matching to a forbidden region. This section
        # will remove the full query if any of its matches falls in a
        # forbidden region.
        keep = True
        for match in matches:
            if intersectors and match.mSbjctId in intersectors:
                found = intersectors[match.mSbjctId].find(
                    match.mSbjctFrom, match.mSbjctTo)
                if (found and not options.keep_forbidden) or \
                        (not found and options.keep_forbidden):
                    nremoved_regions += 1
                    keep = False
                    continue

        if not keep:
            nqueries_removed_region += 1
            if outfile_empty:
                outfile_empty.write(
                    "%s\toverlap with forbidden region\n" % query_id)
            return

        # check for full length matches
        for match in matches:
            if match.mQueryCoverage >= 99.9:
                full_matches.append(match)
            if match.mQueryCoverage > options.threshold_good_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)

        if full_matches:
            nfull_matches += 1
        elif good_matches:
            ngood_matches += 1
        elif partial_matches:
            npartial_matches += 1

        # compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append((match.mQueryFrom, match.mQueryTo))

        rest = Intervals.complement(intervals, 0, match.mQueryLength)

        query_coverage = 100.0 * \
            (match.mQueryLength -
             sum([x[1] - x[0] for x in rest])) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append(query_id)
        elif query_coverage > options.threshold_good_query_coverage:
            well_matched.append(query_id)
        else:
            partially_matched.append(query_id)

        aggregate_coverages.append(query_coverage)

        # select matches to output
        matches, msg = selectMatches(query_id, matches, options, queries_fasta)

        if len(matches) > 0:
            for match in matches:
                if options.query_forward_coordinates:
                    match.convertCoordinates()

                if options.output_format == "map":
                    options.stdout.write("%s\n" % "\t".join(map(str, (
                        match.mQueryId, match.mSbjctId,
                        match.strand,
                        "%5.2f" % match.mQueryCoverage,
                        "%5.2f" % match.mSbjctCoverage,
                        "%5.2f" % match.mPid,
                        match.mQueryLength,
                        match.mSbjctLength,
                        match.mQueryFrom, match.mQueryTo,
                        match.mSbjctFrom, match.mSbjctTo,
                        ",".join(map(str, match.mBlockSizes)),
                        ",".join(map(str, match.mQueryBlockStarts)),
                        ",".join(map(str, match.mSbjctBlockStarts)),
                    ))))
                elif options.output_format == "psl":
                    options.stdout.write(str(match) + "\n")

            noutput += 1
        else:
            if outfile_empty:
                outfile_empty.write(
                    "%s\tno matches selected: %s\n" % (query_id, msg))
            nempty += 1

    if options.output_format == "map":
        options.stdout.write("\t".join(
            ("query_id", "sbjct_id", "sstrand",
             "qcoverage", "scoverage", "pid",
             "qlen", "slen",
             "qfrom", "qto", "sfrom", "sto",
             "blocks", "qstarts", "sstarts")) + "\n")
    elif options.output_format == "psl":
        options.stdout.write(Blat.Match().getHeader() + "\n")

    ################################################
    # main loop
    ################################################
    nfully_covered = None
    matches = []
    last_query_id = None
    is_complete = True
    ninput_lines = 0
    skip = 0

    iterator = Blat.BlatIterator(infile)

    while 1:

        try:
            match = next(iterator)
        except Blat.ParsingError:
            iterator = Blat.BlatIterator(infile)
            continue

        if match is None:
            break

        ninput_lines += 1

        if options.test and ninput_lines > options.test:
            break

        if match.mQueryId != last_query_id:
            if last_query_id:
                processChunk(last_query_id, matches)
            matches = []
            last_query_id = match.mQueryId

        matches.append(match)

    processChunk(last_query_id, matches)

    printHistogram(aggregate_coverages, "aggregate", options)
    printHistogram(mapped_coverages, "mapped", options)

    if "full" in options.print_matched:
        printMatched(fully_matched, "full", options)

    if "good" in options.print_matched:
        printMatched(well_matched, "good", options)

    if "partial" in options.print_matched:
        printMatched(partially_matched, "partial", options)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# alignments: ninput=%i, is_complete=%s\n" %
            (ninput_lines, str(is_complete)))
        options.stdlog.write(
            "# queries: ninput=%i, noutput=%i\n" % (ninput, noutput))
        options.stdlog.write(
            "# individual coverage: full=%i, good=%i, partial=%i\n" %
            (nfull_matches, ngood_matches, npartial_matches))
        options.stdlog.write(
            "# aggregate coverage: full=%i, good=%i, partial=%i\n" %
            (len(fully_matched), len(well_matched), len(partially_matched)))
        options.stdlog.write(
            "# omitted queries: total=%i, thresholds=%i, regions=%i, selection=%i\n" %
            (nskipped + nqueries_removed_region + nempty,
             nskipped, nqueries_removed_region, nempty))
        options.stdlog.write(
            "# omitted matches: pid=%i, query_coverage=%i, gaps=%i, regions=%i, nmatches=%i\n" %
            (nremoved_pid, nremoved_query_coverage, nremoved_gaps,
             nremoved_regions, nremoved_nmatches))

    E.Stop()
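# Hedged usage sketch (file names are hypothetical): keep the best match per query from
# BLAT output and emit a tab-separated map, logging discarded queries separately:
#
#   python psl2map.py --matching-mode=best-coverage --output-format=map \
#       --output-filename-empty=empty.tsv < in.psl > best.map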
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: align_pairs.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--skip-statistics", dest="skip_stats",
                      action="store_true",
                      help="do not compute alignment statistics [%default].")

    parser.add_option("--method", dest="methods", type="choice",
                      action="append",
                      choices=("dialign", "clustal", "blastz", "nw", "sw",
                               "dba", "dialignlgs"),
                      help="alignment method [%default].")

    parser.add_option("--anchor-alignment", dest="anchor_alignment",
                      type="int",
                      help="anchor alignment with xxx residues [%default].")

    parser.add_option("--output-format", dest="output_formats", type="choice",
                      action="append",
                      choices=("fasta", "stats", "psl"),
                      help="output formats [%default].")

    parser.add_option("--input-format", dest="input_format", type="choice",
                      choices=("fasta", "list"),
                      help="input format of stdin [%default].")

    parser.add_option("--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="output pattern for multiple files [%default].")

    parser.add_option("--filename-sequences1", dest="filename_sequences1",
                      type="string",
                      help="first indexed input filename with sequences [%default].")

    parser.add_option("--filename-sequences2", dest="filename_sequences2",
                      type="string",
                      help="second indexed input filename with sequences [%default].")

    parser.add_option("--options-blastz", dest="options_blastz", type="string",
                      help="command line options for blastz [%default].")

    parser.set_defaults(
        skip_stats=False,
        methods=[],
        output_formats=[],
        input_format="fasta",
        output_filename_pattern=None,
        filename_sequences1=None,
        filename_sequences2=None,
        anchor_alignment=0,
        options_blastz="C=2 B=1 T=0 W=6 K=2200")

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(options.methods) == 0:
        print(USAGE)
        print("please specify an alignment method.")
        sys.exit(1)

    if len(options.output_formats) == 0:
        print(USAGE)
        print("please specify at least one output format.")
        sys.exit(1)

    if len(args) == 2:
        iterator = iterate_double_fasta(args[0], args[1])
    elif options.filename_sequences1 and options.filename_sequences2:
        if len(args) == 0 or (len(args) == 1 and args[0] == "-"):
            infile = options.stdin
        elif len(args) == 1:
            infile = open(args[0], "r")

        iterator = iterate_list(infile,
                                options.filename_sequences1,
                                options.filename_sequences2)
    else:
        iterator = iterate_single_fasta(options.stdin)

    npairs, ntoken_pairs = 0, 0
    ninput, nskipped, nerrors = 0, 0, 0

    outfile_table = None
    outfile_fasta = None
    outfile_psl = None

    if "table" in options.output_formats:
        outfile_table = getFile("table", options)
        outfile_table.write(
            """# CATEGORY: category [intron|exon]
# METHOD: alignment method
# TOKEN: name
# ID: segment id
# TOTAL: number of segments
# LEN: length of segment
# NALIGNED: number of aligned positions
# PALIGNED: percentage of aligned positions
# IDENT: number of identical positions
# TRANSIT: number of transitions
# TRANSVERS: number of transversions
# MATCHES: number of matching positions
# PIDENT: percentage of identical positions
# PTRANSIT: percentage of transitions
# PTRANSVERS: percentage of transversions
# BLOCKSIZES: alignment, length of blocks
# GAPS: gap sizes in sequence 1/2
CATEGORY\tMETHOD\tTOKEN1\tID1\tTOTAL1\tLEN1\tTOKEN2\tID2\tTOTAL2\tLEN2\tNALIGNED\tPALIGNED\tIDENT\tTRANSIT\tTRANSVER\tMATCHES\tPIDENT\tPTRANSIT\tPTRANSVER\tBLOCKSIZES\tGAPSIZES\tGAPSIZES\tTYPE1\tTYPE2\n""")

    if "fasta" in options.output_formats:
        outfile_fasta = getFile("fasta", options)

    if "psl" in options.output_formats:
        outfile_psl = getFile("psl", options)

    # setup alignment objects
    for unaligned_pair in iterator:

        ninput += 1

        for method in options.methods:

            pair = AlignedPairs.AlignedPair(unaligned_pair)
            pair.mOptionsBlastZ = options.options_blastz

            try:
                pair.Align(method, anchor=options.anchor_alignment)
            except AlignedPairs.AlignmentError as msg:

                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# %s - %s: %s\n" %
                        (msg, unaligned_pair.mToken1, unaligned_pair.mToken2))
                if options.loglevel >= 2:
                    options.stdlog.write("# input=%s\n" % (str(unaligned_pair)))

                nskipped += 1
                continue

            if outfile_table:
                outfile_table.write(str(pair) + "\n")

            if outfile_fasta:
                outfile_fasta.write(">%s\n%s\n>%s\n%s\n" %
                                    (pair.mToken1, pair.mAlignedSequence1,
                                     pair.mToken2, pair.mAlignedSequence2))

            if outfile_psl:
                entry = Blat.Match()
                entry.mQueryId, entry.mSbjctId = pair.mToken1, pair.mToken2
                entry.strand = pair.strand
                entry.fromMap(pair.mAlignment)
                outfile_psl.write(str(entry) + "\n")

            npairs += 1
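# Hedged usage sketch (file names are hypothetical): align corresponding entries of two
# fasta files with the "nw" method and write the alignments as fasta:
#
#   python align_pairs.py --method=nw --output-format=fasta pairs_a.fasta pairs_b.fasta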
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: chain2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    psl = None

    def chain_iterator(infile):
        lines = []
        for line in infile:

            if line.startswith("#"):
                continue
            if line.strip() == "":
                continue
            if line.startswith("chain"):
                if lines:
                    yield lines
                lines = []
            lines.append(line)

        yield lines

    for lines in chain_iterator(options.stdin):

        ninput += 1
        psl = Blat.Match()

        (_, _,
         psl.mSbjctId,
         target_length,
         target_strand,
         target_start,
         target_end,
         psl.mQueryId,
         query_length,
         query_strand,
         query_start,
         query_end,
         alignment_id) = lines[0][:-1].split()

        (psl.mQueryStart, psl.mQueryEnd, psl.mQueryLength,
         psl.mSbjctStart, psl.mSbjctEnd, psl.mSbjctLength) = \
            [int(x) for x in
             (query_start, query_end, query_length,
              target_start, target_end, target_length)]

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        qstart, tstart = psl.mQueryStart, psl.mSbjctStart

        for line in lines[1:-1]:
            size, dt, dq = [int(x) for x in line[:-1].split()]
            map_query2target.addDiagonal(qstart,
                                         qstart + size,
                                         tstart - qstart)
            qstart += size + dq
            tstart += size + dt

        size = int(lines[-1][:-1])

        map_query2target.addDiagonal(qstart,
                                     qstart + size,
                                     tstart - qstart)

        psl.fromMap(map_query2target)

        # sort out strand
        # target_strand is always positive
        assert (target_strand == "+")

        # if query strand is negative
        if query_strand == "-":
            # invert both query and target
            psl.switchTargetStrand()
            # manually invert the query coordinates
            psl.mQueryFrom, psl.mQueryTo = \
                psl.mQueryLength - psl.mQueryTo, \
                psl.mQueryLength - psl.mQueryFrom

        options.stdout.write("%s\n" % psl)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
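# Hedged usage sketch (file names are hypothetical): convert a UCSC liftOver chain file
# read from stdin to PSL; each chain block (header line followed by size/dt/dq triplets)
# becomes one PSL match:
#
#   python chain2psl.py < over.chain > over.psl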
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: maq2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-c", "--filename-coordinates",
                      dest="filename_coordinates", type="string",
                      help="filename with coordinates.")

    parser.add_option("-p", "--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="OUTPUT filename pattern for additional data [%default].")

    parser.set_defaults(
        genome_file="genome",
        filename_coordinates=None,
        segment_length=32,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        genome = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome = None

    ninput, noutput = 0, 0

    if options.filename_coordinates:

        segment_length = options.segment_length

        a = matchby_sequence(
            iterator_segments(open(options.filename_coordinates, "r"),
                              options.segment_length),
            Maq.iterator(options.stdin),
            lambda x: (x.mSegment),
            lambda x: (x.contig))

        for segments, maqs in a:

            pairs = match_smaller(segments, maqs,
                                  lambda x: x.start,
                                  lambda x: x.start)

            for segment, maq in pairs:

                ninput += 1

                assert maq.start >= segment.start, \
                    "maq start < segment start: %i < %i" % \
                    (maq.start, segment.start)
                assert maq.start + maq.mLength <= \
                    segment.start + 2 * segment_length, \
                    "maq end > segment end: %i < %i" % \
                    (maq.start + maq.mLength,
                     segment.start + 2 * segment_length)

                psl = Blat.Match()
                psl.fromMaq(maq)

                match_start = maq.start
                segment_start = segment.start
                contig, left_start, right_start = \
                    segment.contig, segment.mLeftStart, segment.mRightStart

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# mapping: name=%s, match_start=%i, segment=%s\n" %
                        (maq.contig, match_start, str(segment)))

                # build positions of the two blocks
                left_size = segment_length - (match_start - segment_start)
                right_size = segment_length - left_size
                mapped1_start = left_start + match_start - segment_start
                mapped1_end = left_start + segment_length
                mapped2_start = right_start
                mapped2_end = right_start + right_size

                if options.loglevel >= 3:
                    options.stdlog.write(
                        "# mapped: match_start=%i, segment_start=%i, "
                        "left_size=%i, right_size=%i, mapped1=(%i-%i), "
                        "mapped2=(%i-%i)\n" %
                        (match_start, segment_start, left_size, right_size,
                         mapped1_start, mapped1_end,
                         mapped2_start, mapped2_end))

                psl.mSbjctId = contig
                if genome:
                    psl.mSbjctLength = genome.getLength(contig)
                psl.mSbjctFrom = mapped1_start
                psl.mSbjctTo = mapped2_end
                psl.mNBlocks = 2
                psl.mBlockSizes = [left_size, right_size]
                psl.mQueryBlockStarts = [0, left_size]
                psl.mSbjctBlockStarts = [mapped1_start, mapped2_start]
                psl.mSbjctNGapsCounts = 1
                psl.mSbjctNGapsBases = mapped2_start - mapped1_end

                options.stdout.write(str(psl) + "\n")
                noutput += 1
    else:
        for maq in Maq.iterator(options.stdin):
            ninput += 1
            psl = Blat.Match()
            psl.fromMaq(maq)
            options.stdout.write(str(psl) + "\n")
            noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i\n" % (ninput, noutput))

    E.Stop()
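# Hedged usage sketch (file names are hypothetical): convert maq matches read from stdin
# to PSL; with --filename-coordinates, reads spanning a segment junction are split into
# two blocks:
#
#   python maq2psl.py --genome-file=genome --filename-coordinates=segments.txt \
#       < in.maq > out.psl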