def iterate_list(infile, idx1, idx2=None):
    """Yield AlignedPairs.UnalignedPair objects for identifier pairs read
    from *infile*.

    infile : iterable of lines; tab-separated, the first two columns are
        sequence identifiers to look up. Lines starting with '#' are skipped.
    idx1 : filename of the indexed fasta database for the first column.
    idx2 : optional filename for the second column; if None, the first
        database is used for both lookups.

    NOTE(review): Python 2 syntax (``except KeyError, msg``). The first
    lookup failure is silently skipped; any later failure is re-raised as
    a fresh KeyError (the original traceback is lost) -- confirm this
    skip-first behaviour is intended.
    """
    fasta1 = IndexedFasta.IndexedFasta(idx1)
    if idx2 is None:
        # a single database serves both columns
        fasta2 = fasta1
    else:
        fasta2 = IndexedFasta.IndexedFasta(idx2)

    first = True
    for line in infile:
        if line[0] == "#":
            continue
        id1, id2 = line[:-1].split("\t")[:2]
        try:
            yield AlignedPairs.UnalignedPair(
                token1=id1,
                sequence1=fasta1.getSequence(id1),
                token2=id2,
                sequence2=fasta2.getSequence(id2))
        except KeyError, msg:
            # tolerate exactly one missing identifier (e.g. a header row)
            if first:
                first = False
                continue
            raise KeyError(msg)
class CounterOfErrors(ChainCounter):
    """Validate the contig sizes recorded in chains against the indexed
    target and query genome databases, and report any disagreement."""

    header = "Contig size validation report"

    def __init__(self, options):
        # open both genome databases once and cache their contig sizes
        self.tdb = IndexedFasta(options.dbpath + options.targetgenome)
        self.qdb = IndexedFasta(options.dbpath + options.querygenome)
        self.tcontigs = self.tdb.getContigSizes()
        self.qcontigs = self.qdb.getContigSizes()
        # one formatted line per chain attribute set that failed a check
        self.badchains = []

    def add(self, c):
        """Record chain *c* if its target or query size disagrees with
        the database (target checked first, matching original order)."""
        checks = (
            (c.tsize, self.tcontigs[c.tname], " #bad target contigsize"),
            (c.qsize, self.qcontigs[c.qname], " #bad query contigsize"),
        )
        for observed, expected, tag in checks:
            if observed != expected:
                fields = [str(value) for value in c.atts] + [tag]
                self.badchains.append("\t".join(fields))

    def report(self, options):
        """Write the human-readable validation report."""
        lines = self._wrap_header()
        if self.badchains:
            lines = lines + self.badchains
        else:
            lines.append("All chains passed validation")
        self._write_report(options, lines)

    def tabbed_report(self, options, E):
        """Write the failing chains (or a placeholder comment) as a
        tab-separated section."""
        content = self.badchains if self.badchains else ["#no bad chains found"]
        self._write_tabbed("bad_contig_sizes", content, E)
def setUp(self):
    """Build and index a synthetic genome with a regular exon/intron
    structure for testing split-codon bookkeeping.

    Layout on "chr1": mOffset bp of 'C' padding, then mNExons exons of
    mExonSize bp (repeating "123"), separated by introns of mIntronSize
    bp ("GT" + "T"*(mIntronSize-4) + "AG"), then mOffset bp of 'G'
    padding.
    """
    self.mExons = []
    # genome position of a split-codon base -> position of its
    # counterpart across the adjacent intron
    self.mSplitCodonsNext = {}
    self.mSplitCodonsPrev = {}
    self.mSpliceSize = 4
    self.mExonSize = 100
    self.mIntronSize = 900
    self.strand = "+"
    self.mNExons = 9
    self.mOffset = 1000
    length = 0
    self.frame = 0
    # distance between the starts of consecutive exons
    self.mIncrement = self.mIntronSize + self.mExonSize

    # exonic sequence only; intron sequence is spliced in below
    seq = list("123" * int((self.mNExons * self.mExonSize) / 3))

    exon_id = 0

    start = self.mOffset
    for x in range(self.mNExons):
        e = GTF.Entry()
        e.contig, e.strand, e.gene_id, e.transcript_id = \
            "chr1", "+", "gene1", "trans1"
        e.start, e.end = start, start + self.mExonSize
        # frame = number of bases needed to complete the codon begun in
        # the previous exon
        e.frame = (3 - (length % 3)) % 3
        length += e.end - e.start
        self.mExons.append(e)
        if e.frame != 0:
            # bases at the start of this exon that finish the previous
            # exon's codon map back across the intron...
            for y in range(0, e.frame):
                self.mSplitCodonsPrev[start + y] = start - self.mIntronSize
            # ...and bases at the end of the previous exon map forward
            for y in range(0, 3 - e.frame):
                self.mSplitCodonsNext[
                    start - self.mIntronSize - y - 1] = start
        exon_id += 1
        if exon_id < self.mNExons:
            p = exon_id * self.mExonSize + self.mIntronSize * (exon_id - 1)
            # successive insertions at the same index stack, so the
            # final intron reads GT...AG (canonical splice sites)
            seq[p:p] = list("AG")
            seq[p:p] = list("T" * (self.mIntronSize - 4))
            seq[p:p] = list("GT")

        start += self.mIncrement
        # print str(e)

    # print self.mSplitCodonsNext
    # print self.mSplitCodonsPrev

    # flanking padding on either side of the gene
    seq[0:0] = "C" * self.mOffset
    seq.append("G" * self.mOffset)
    tmpfile = tempfile.NamedTemporaryFile()
    tmpfile.close()
    seq = "".join(seq)
    self.mSequence = seq
    self.contigSize = len(seq)
    IndexedFasta.createDatabase(tmpfile.name, iter([("chr1", seq), ]))
    self.mFasta = IndexedFasta.IndexedFasta(tmpfile.name)
def setUp(self):
    """Create a temporary working directory holding a one-contig
    indexed test genome and the filenames used by the tests."""
    self.tmpdir = tempfile.mkdtemp()

    # output paths inside the temporary directory
    self.outfile_genome = os.path.join(self.tmpdir, "genome_in")
    self.outfile_gtf = os.path.join(self.tmpdir, "exons.gtf")
    self.outfile_output = os.path.join(self.tmpdir, "output")

    self.length = 1000

    # a single contig "chr1" of poly-A of the requested length
    contigs = iter([("chr1", "A" * self.length)])
    IndexedFasta.createDatabase(self.outfile_genome, contigs)

    # per-base reference annotation, all flagged 'g'
    self.reference = ["g"] * self.length
def buildContigSizes(infile, outfile):
    '''
    Write a two-column table of contig name and contig size (in
    nucleotides) from an indexed genome :term:`fasta` file.

    Parameters
    ----------
    infile : str
        path to the genome :term:`fasta` file; the index prefix is
        obtained by stripping the ".fasta" suffix.
    outfile : str
        tab-separated output file, sorted by contig name, written
        without header or index column.
    '''
    index_prefix = P.snip(infile, ".fasta")
    genome = IndexedFasta.IndexedFasta(index_prefix)
    sizes = genome.getContigSizes(with_synonyms=False)

    table = pd.DataFrame([[name, length] for name, length in sizes.items()],
                         columns=['contigs', 'size'])
    table.sort_values('contigs', inplace=True)
    table.to_csv(outfile, sep="\t", header=False, index=False)
def extractSequence(infile, outfile):
    '''extract genomic sequence to be aligned against.

    Writes the region defined by the module-level CONTIG, START and END
    (forward strand) from the indexed fasta *infile* (index prefix =
    filename without its ".fasta" suffix) to *outfile* in fasta format.
    '''
    fasta = IndexedFasta.IndexedFasta(infile[:-len(".fasta")])
    # FIX: use a context manager so the output handle is closed even if
    # getSequence raises (the original leaked the handle on error)
    with open(outfile, "w") as outs:
        outs.write(">%s\n%s\n" %
                   (CONTIG, fasta.getSequence(CONTIG, "+", START, END)))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads prediction entries from stdin, re-derives each entry's
    translation from the indexed genome, and writes the patched entries
    to stdout.

    NOTE(review): Python 2 script (uses ``print`` statements).
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/patch_translations.py 1841 2008-05-08 12:07:13Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.set_defaults(genome_file=None, )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    entry = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput = 0, 0
    for line in sys.stdin:
        if line[0] == "#":
            # pass comment lines through unchanged
            print line[:-1]
            continue

        entry.Read(line)
        ninput += 1

        # get genomic sequence
        genomic_sequence = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        # rebuild the peptide-to-genome alignment and translation from
        # the freshly retrieved genomic sequence
        entry.mMapPeptide2Translation, entry.mTranslation = \
            Genomics.Alignment2PeptideAlignment(
                entry.mMapPeptide2Genome, entry.mQueryFrom, 0,
                genomic_sequence)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i\n" % (ninput, noutput))

    E.Stop()
def main(argv=None):
    """script main.

    Samples random fragment coordinates from an indexed genome and
    writes them to stdout in gff (or gtf, with --as-gtf) format.
    """
    parser = E.OptionParser(
        version="%prog version: $Id: fasta2gff.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf.")

    parser.add_option("-f", "--fragment-size", dest="fragment_size",
                      type="int",
                      help="fixed size of fragments [default=%default].")

    parser.add_option("-s", "--sample-size", dest="sample_size", type="int",
                      help="fixed size of fragments.")

    parser.set_defaults(
        as_gtf=False,
        genome_file=None,
        fragment_size=1000,
        sample_size=10000,
        pattern_id="%08i",
    )

    (options, args) = E.Start(parser)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contigs = fasta.getContigSizes()

    # FIX: the original branched on options.as_gtf but both branches
    # created a GTF.Entry (dead conditional); collapsed into a single
    # assignment. NOTE(review): the gff branch was presumably meant to
    # use a different entry class -- confirm against project history.
    entry = GTF.Entry()

    n = 0

    entry.feature = "exon"
    entry.source = "random"

    for x in range(options.sample_size):
        entry.contig, entry.strand, entry.start, entry.end = \
            fasta.getRandomCoordinates(options.fragment_size)
        if entry.strand == "-":
            # convert negative-strand coordinates to forward coordinates
            lcontig = contigs[entry.contig]
            entry.start, entry.end = \
                lcontig - entry.end, lcontig - entry.start
        if options.as_gtf:
            entry.gene_id = options.pattern_id % n
            entry.transcript_id = entry.gene_id

        options.stdout.write(str(entry) + "\n")
        n += 1

    E.Stop()
def writeContigSizes(genome, outfile):
    '''write contig sizes to outfile for UCSC tools.
    '''
    genome_path = os.path.join(PARAMS["genome_dir"], genome)
    fasta = IndexedFasta.IndexedFasta(genome_path)
    sizes = fasta.getContigSizes(with_synonyms=False)

    outf = IOTools.openFile(outfile, "w")
    for name, length in sizes.items():
        outf.write("%s\t%i\n" % (name, length))
    outf.close()
def main(argv=sys.argv):
    """script main: apply interval-manipulation methods to bed input.

    NOTE(review): the default ``argv=sys.argv`` is bound at import time;
    the conventional ``argv=None`` idiom is used elsewhere in this
    codebase.
    """
    parser = E.OptionParser(
        version=
        "%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("merge", "filter-genome", "bins", "block",
                               "sanitize-genome", "shift", "extend"),
                      help="method to apply [default=%default]")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.set_defaults(methods=[],
                        merge_distance=0,
                        binning_method="equal-bases",
                        genome_file=None,
                        bam_file=None,
                        num_bins=5,
                        merge_min_intervals=1,
                        bin_edges=None,
                        offset=10000,
                        test=None,
                        extend_distance=1000)

    (options, args) = E.Start(parser, add_pipe_options=True)

    # contig sizes are required by the genome-aware methods
    contigs = None
    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    # build a processing chain, one wrapper per requested method
    for method in options.methods:
        if method == "filter-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            # NOTE(review): 'processor' is referenced before any visible
            # assignment -- this chunk appears truncated; in the full
            # script it is presumably initialized from the bed input
            # stream before this loop. Confirm against the original.
            processor = filterGenome(processor, contigs)
        elif method == "sanitize-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = sanitizeGenome(options.stdin, options.stdout,
                                       contigs)

    E.Stop()
class CounterPercentIdentify(ChainCounter):
    """Compute percent identity over the ungapped regions of chains."""

    header = "Report on Percent Indentities"

    def __init__(self, tname, qname):
        # indexed genomes for the target and query side of each chain
        self.tfasta = IndexedFasta(tname)
        self.qfasta = IndexedFasta(qname)
        # one percent-identity value per ungapped region seen
        self.pids = []
        # 0 until _get_stats() caches the summary statistics
        self.stats = 0

    def _get_pid(self, x, y):
        """Return the percent identity between two equal-length sequences.

        FIX: the pairing is materialized with list(); zip() is a one-shot
        iterator on Python 3, so the original ``len(z)`` after the list
        comprehension had consumed ``z`` raised a TypeError there.
        """
        pairs = list(zip(x, y))
        nmatches = len([1 for a, b in pairs if a == b])
        return float(nmatches) / float(len(pairs)) * 100

    def add(self, c):
        """Record the percent identity of every ungapped region of *c*."""
        nreg = len(c.tugr)
        for i in range(0, nreg):
            tseq = self.tfasta.getSequence(
                c.tname, "+", c.tugr[i][0], (sum(c.tugr[i])))
            qseq = self.qfasta.getSequence(
                c.qname, c.qstrand, c.qugr[i][0], (sum(c.qugr[i])))
            # compare case-insensitively (soft-masked bases still match)
            pid = self._get_pid(tseq.lower(), qseq.lower())
            self.pids.append(pid)

    def _get_stats(self):
        # compute summary statistics lazily, exactly once
        if self.stats == 0:
            self.stats = self._get_basic_stats(self.pids, string="{:.2f}")

    def report(self, options):
        """Write the human-readable percent identity report."""
        self._get_stats()
        report = self._wrap_header()
        report.append(self._wrap_basic_stats(self.stats))
        self._write_report(options, report)

    def tabbed_report(self, options, E):
        """Write mean/median/max/min as a tab-separated section."""
        self._get_stats()
        lines = ["mean\tmedian\tmax\tmin"]
        lines.append(self._wrap_basic_stats(self.stats, tabbed=True))
        self._write_tabbed("pids", lines, E)
class CounterPercentIdentify(ChainCounter):
    """Compute percent identity over the ungapped regions of chains.

    NOTE(review): an identically-named class appears elsewhere in this
    codebase; consider consolidating.
    """

    header = "Report on Percent Indentities"

    def __init__(self, tname, qname):
        # indexed target and query genome databases
        self.tfasta = IndexedFasta(tname)
        self.qfasta = IndexedFasta(qname)
        self.pids = []
        # sentinel: 0 means the summary statistics are not yet computed
        self.stats = 0

    def _get_pid(self, x, y):
        """Return the percent identity between two equal-length sequences.

        FIX: materialize the zip() pairing; on Python 3 it is a one-shot
        iterator and the original ``len(z)`` after consumption failed.
        """
        paired = list(zip(x, y))
        identical = len([1 for a, b in paired if a == b])
        return float(identical) / float(len(paired)) * 100

    def add(self, c):
        """Record the percent identity of each ungapped region of *c*."""
        nreg = len(c.tugr)
        for i in range(0, nreg):
            tseq = self.tfasta.getSequence(c.tname, "+", c.tugr[i][0],
                                           (sum(c.tugr[i])))
            qseq = self.qfasta.getSequence(c.qname, c.qstrand,
                                           c.qugr[i][0], (sum(c.qugr[i])))
            # lower-case both sides so soft-masking does not break matches
            pid = self._get_pid(tseq.lower(), qseq.lower())
            self.pids.append(pid)

    def _get_stats(self):
        # lazily cache the summary statistics
        if self.stats == 0:
            self.stats = self._get_basic_stats(self.pids, string="{:.2f}")

    def report(self, options):
        """Write the human-readable percent identity report."""
        self._get_stats()
        report = self._wrap_header()
        report.append(self._wrap_basic_stats(self.stats))
        self._write_report(options, report)

    def tabbed_report(self, options, E):
        """Write mean/median/max/min as a tab-separated section."""
        self._get_stats()
        lines = ["mean\tmedian\tmax\tmin"]
        lines.append(self._wrap_basic_stats(self.stats, tabbed=True))
        self._write_tabbed("pids", lines, E)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads transcripts in gtf format from stdin and annotates the
    genome given by --genome-file with them.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2fasta.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-i", "--ignore-missing", dest="ignore_missing",
                      action="store_true",
                      help="Ignore transcripts on contigs that are not in the genome-file [default=%default].")

    parser.add_option("--min-intron-length", dest="min_intron_length",
                      type="int",
                      help="minimum intron length. If the distance between two consecutive exons is smaller, the region will be marked 'unknown' [default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("full", ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        genome_file=None,
        flank=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="full",
        report_step=1000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if not options.genome_file:
        raise ValueError("an indexed genome is required.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    # group gtf records from stdin into transcripts
    iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))

    annotateGenome(iterator, fasta, options)

    # write footer and output benchmark information.
    E.Stop()
class CounterOfErrors(ChainCounter):
    '''class for reporting invalid contig sizes in chains'''
    # NOTE(review): an identically-named class with the same behaviour
    # appears elsewhere in this codebase; consider consolidating.

    header = "Contig size validation report"

    def __init__(self, options):
        # open the target and query genome databases and cache their
        # contig size dictionaries
        self.tdb = IndexedFasta(options.dbpath + options.targetgenome)
        self.qdb = IndexedFasta(options.dbpath + options.querygenome)
        self.tcontigs = self.tdb.getContigSizes()
        self.qcontigs = self.qdb.getContigSizes()
        # one formatted line per failing chain
        self.badchains = []

    def add(self, c):
        # compare the sizes recorded in the chain against the database
        db_tsize = self.tcontigs[c.tname]
        db_qsize = self.qcontigs[c.qname]
        if c.tsize != db_tsize:
            self.badchains.append(
                '\t'.join([str(x) for x in c.atts] +
                          [" #bad target contigsize"]))
        if c.qsize != db_qsize:
            self.badchains.append(
                '\t'.join([str(x) for x in c.atts] +
                          [" #bad query contigsize"]))

    def report(self, options):
        # human-readable summary: either an all-clear line or the
        # offending chains
        report = self._wrap_header()
        if len(self.badchains) == 0:
            report.append("All chains passed validation")
        else:
            report = report + self.badchains
        self._write_report(options, report)

    def tabbed_report(self, options, E):
        # machine-readable section; placeholder comment if nothing failed
        if len(self.badchains) > 0:
            lines = self.badchains
        else:
            lines = ["#no bad chains found"]
        self._write_tabbed("bad_contig_sizes", lines, E)
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on all
    intervals but only the top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak is used and not the
    complete interval.

    * Softmasked sequence is converted to hardmasked sequence to avoid
      the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # NOTE(review): to_cluster is assigned but not referenced locally;
    # presumably picked up from locals() by P.run() -- confirm.
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    # dump the selected interval sequences to a temporary fasta file;
    # returns the number of sequences written
    nseq = writeSequencesForIntervals(
        track, tmpfasta, dbhandle,
        full=False,
        masker=P.asList(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        # nothing to run MEME on; create an empty sentinel output
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
    else:
        # %(var)s placeholders are interpolated from local/PARAMS
        # variables by P.run()
        statement = '''
        meme %(tmpfasta)s -dna -revcomp
        -mod %(meme_model)s
        -nmotifs %(meme_nmotifs)s
        -oc %(tmpdir)s
        -maxsize %(meme_max_size)s
        %(meme_options)s
        > %(outfile)s.log
        '''
        P.run()

        collectMEMEResults(tmpdir, target_path, outfile)
def getContigSizes(infile, outfile):
    """Dump contig name/size pairs of an indexed fasta file to *outfile*."""
    from CGAT import IndexedFasta

    # accept either a ".fasta" or a ".fa" suffix on the input file
    try:
        prefix = P.snip(infile, ".fasta")
    except ValueError:
        prefix = P.snip(infile, ".fa")

    database = IndexedFasta.IndexedFasta(prefix)

    out = IOTools.openFile(outfile, "w")
    for name, size in database.getContigSizes(with_synonyms=False).items():
        out.write("%s\t%i\n" % (name, size))
    out.close()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Writes one gff 'contig' record per contig of the indexed genome.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: index2gff.py 2880 2010-04-07 08:44:13Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.set_defaults(genome_file=None, )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    # template entry re-used for every contig
    entry = GTF.Entry()
    entry.start = 0
    entry.feature = "contig"
    entry.source = "genome"

    # FIX: .iteritems() is Python-2 only; .items() behaves the same here
    # and also works on Python 3, matching the rest of the codebase.
    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        ninput += 1
        entry.contig = contig
        entry.end = int(size)
        options.stdout.write("%s\n" % str(entry))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta` formatted *outfile*
    '''
    track = P.snip(infile, ".bed.gz")

    genome_path = os.path.join(P.get_params()["genome_dir"],
                               P.get_params()["genome"])
    fasta = IndexedFasta.IndexedFasta(genome_path)
    outs = IOTools.open_file(outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator(IOTools.open_file(infile))):
        lcontig = fasta.getLength(bed.contig)

        if mode == "intervals":
            # the interval itself
            ids.append("%s_%s %s:%i..%i" %
                       (track, bed.name, bed.contig, bed.start, bed.end))
            seqs.append(fasta.getSequence(bed.contig, "+",
                                          bed.start, bed.end))
        elif mode == "leftright":
            width = bed.end - bed.start

            # flanking window of the same width to the left, clipped at 0
            left_start = max(0, bed.start - width)
            left_end = bed.end - width
            ids.append("%s_%s_l %s:%i..%i" %
                       (track, bed.name, bed.contig, left_start, left_end))
            seqs.append(fasta.getSequence(bed.contig, "+",
                                          left_start, left_end))

            # flanking window to the right, clipped at the contig end
            right_start = bed.start + width
            right_end = min(lcontig, bed.end + width)
            ids.append("%s_%s_r %s:%i..%i" %
                       (track, bed.name, bed.contig, right_start, right_end))
            seqs.append(fasta.getSequence(bed.contig, "+",
                                          right_start, right_end))

    masked = maskSequences(seqs, masker)
    outs.write("\n".join(">%s\n%s" % pair for pair in zip(ids, masked)))
    outs.close()
def buildContigBed(infile, outfile):
    '''
    Write one :term:`BED` interval per contig of an indexed genome.

    Parameters
    ----------
    infile : str
        path to the genome :term:`fasta` file; the index prefix is
        obtained by stripping the ".fasta" suffix.
    outfile : str
        :term:`BED` output with contig name, 0 and contig size (bp).
    '''
    index_prefix = P.snip(infile, ".fasta")
    genome = IndexedFasta.IndexedFasta(index_prefix)

    with IOTools.open_file(outfile, "w") as bedfile:
        for name, length in genome.getContigSizes(with_synonyms=False).items():
            bedfile.write("%s\t%i\t%i\n" % (name, 0, length))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Annotates windows (gff) with features computed from a second gff/gtf
    data file, proceeding contig by contig.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome (indexed).")

    parser.add_option("-w", "--windows-bed-file", dest="filename_windows",
                      type="string",
                      help="gff file with windows to use.")

    parser.add_option("-d", "--filename-data", dest="filename_data",
                      type="string",
                      help="gff file with data to use.")

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="filename-data is gtf file [default=%default.")

    parser.add_option("-f", "--features", dest="features", type="choice",
                      action="append", choices=("GC", ),
                      help="features to compute.")

    parser.add_option("-c", "--decorator", dest="decorator", type="choice",
                      choices=("counts", "gc", "gc3", "mean-length",
                               "median-length", "percent-coverage",
                               "median-score", "mean-score", "stddev-score",
                               "min-score", "max-score"),
                      help="decorators to use.")

    parser.add_option("-e", "--skip-empty", dest="skip_empty",
                      action="store_true",
                      help="skip empty windows.")

    parser.add_option(
        "-t", "--transform=", dest="transform", type="choice",
        choices=("none", "overlap", "complement", "third_codon"),
        help="transform to use when mapping overlapping regions onto window.")

    parser.set_defaults(
        genome_file=None,
        filename_windows=None,
        filename_data=None,
        features=[],
        skip_empty=False,
        decorator="counts",
        transform="none",
        is_gtf=False,
    )

    (options, args) = E.Start(parser)

    # test_transform_third_codon()

    if not options.filename_windows:
        raise ValueError("please supply a gff file with window information.")

    if options.loglevel >= 1:
        options.stdlog.write("# reading windows...")
        options.stdlog.flush()

    windows = GTF.readAsIntervals(
        GTF.iterator(IOTools.openFile(options.filename_windows, "r")))

    if options.loglevel >= 1:
        options.stdlog.write("done\n")
        options.stdlog.flush()

    if options.filename_data:
        if options.loglevel >= 1:
            options.stdlog.write("# reading data...")
            options.stdlog.flush()

        # FIX: the non-gtf branch used a misspelled "IOTOols", raising a
        # NameError at runtime; apart from that typo both branches were
        # identical, so they are collapsed into a single call.
        gff_data = GTF.readFromFile(
            IOTools.openFile(options.filename_data, "r"))

        if options.loglevel >= 1:
            options.stdlog.write("done\n")
            options.stdlog.flush()

        data_ranges = GTF.SortPerContig(gff_data)
    else:
        # use windows to compute properties
        # by supplying no data and asking for the complement = original window
        gff_data = None
        data_ranges = None
        options.transform = "complement"

    map_contig2size = {}

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        map_contig2size = fasta.getContigSizes()
    else:
        for contig, values in list(windows.items()):
            # FIX: was max(lambda x: x[1], values), which compared the
            # function object against the list instead of taking the
            # largest window end coordinate.
            map_contig2size[contig] = max([x[1] for x in values])
        fasta = None

    contigs = list(map_contig2size.keys())
    contigs.sort()

    # proceed contig wise
    noutput_contigs, ncontigs_skipped_windows, ncontigs_skipped_data = 0, 0, 0

    options.stdout.write("\t".join(
        map(str, ("contig", "start", "end", "ngenes", "ntranscripts",
                  "n1", "l1", "n2", "l2", "score", "extra_info"))) + "\n")

    for contig in contigs:
        # a contig is processed only if it has windows and (when data
        # was supplied) data ranges
        skip = False
        if contig not in windows:
            ncontigs_skipped_windows += 1
            skip = True
        if data_ranges and contig not in data_ranges:
            ncontigs_skipped_data += 1
            skip = True
        if skip:
            continue

        noutput_contigs += 1
        if data_ranges:
            annotateWindows(
                contig, windows[contig],
                gff_data[data_ranges[contig][0]:data_ranges[contig][1]],
                fasta, options)
        else:
            annotateWindows(contig, windows[contig], [], fasta, options)

    E.info(
        "ninput_windows=%i, noutput_contigs=%i, ninput_contigs=%i, nskipped_windows=%i, nskipped_data=%i" %
        (len(windows), noutput_contigs, len(contigs),
         ncontigs_skipped_windows, ncontigs_skipped_data))

    E.Stop()
def main(argv=None):
    """script main: summarize bed intervals read from stdin, optionally
    aggregated by name, contig or track."""
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-a", "--aggregate-by", dest="aggregate",
                      type="choice",
                      choices=("name", "contig", "track", "none"),
                      help="aggregate counts by feature [default=%default].")

    parser.add_option("-p", "--add-percent", dest="add_percent",
                      action="store_true",
                      help="add percentages [default=%default].")

    parser.set_defaults(
        genome_file=None,
        aggregate="none",
        add_percent=False,
    )

    (options, args) = E.Start(parser, argv)

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        if options.add_percent:
            raise ValueError("--add-percent option requires --genome-file")
        fasta = None

    if options.add_percent and not options.aggregate == "contig":
        raise NotImplementedError(
            "--add-percent option requires --aggregate=contig")

    counts = collections.defaultdict(Counter)
    total = Counter()
    output_totals = True

    # choose the key function used to bucket intervals
    if options.aggregate == "track":
        keyf = lambda x: x.track
    elif options.aggregate == "name":
        keyf = lambda x: x.name
    elif options.aggregate == "contig":
        keyf = lambda x: x.contig
    else:
        # no aggregation: everything in one bucket, no totals row
        keyf = lambda x: "all"
        output_totals = False

    for bed in Bed.iterator(options.stdin):
        counts[keyf(bed)].add(bed)
        total.add(bed)

    outf = options.stdout
    key = "track"
    if options.add_percent:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers_percent)))
    else:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers)))

    total_bases = 0
    for key, count in sorted(counts.items()):
        if options.add_percent:
            total_bases += fasta.getLength(key)
            count.setSize(fasta.getLength(key))

        outf.write("%s\t%s\n" % (key, str(count)))

    if output_totals:
        if options.add_percent:
            # NOTE(review): 'count' here is the last loop variable from
            # the loop above -- confirm this is intentional and that
            # setSize on it is what is wanted for the totals row.
            count.setSize(total_bases)
        outf.write("%s\t%s\n" % ("total", str(total)))
    E.Stop()
def writeSequencesForIntervals(track, filename, dbhandle, full=False,
                               halfwidth=None, maxsize=None, proportion=None,
                               masker=None, offset=0, shuffled=False,
                               num_sequences=None, min_sequences=None,
                               order="peakval", shift=None, stranded=False):
    '''build a sequence set for motif discovery. Intervals are taken from
    the table <track>_intervals in the database *dbhandle* and save to
    *filename* in :term:`fasta` format.

    If num_shuffles is set, shuffled copies are created as well with
    the shuffled number appended to the filename.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise
    only the region around the peak given by *halfwidth*

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order to create jobs that take too long.

    If proportion is set, only the top *proportion* intervals are output
    (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order), 'score' (peak
    score, descending order)

    If *shift* is set, intervals will be shifted. ``leftright`` creates
    two intervals on the left and right of the actual interval. The
    intervals will be centered around the mid-point and truncated the
    same way as the main intervals.

    Returns the number of sequences written.
    '''
    # FIX: the default was the mutable literal [] (shared across calls);
    # replaced with a None sentinel, behaviour unchanged for callers.
    if masker is None:
        masker = []

    cc = dbhandle.cursor()

    orderby = ""
    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    elif order != "random":
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, score, strand,
    peakcenter FROM %(tablename)s ''' % locals() + orderby

    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    E.debug("Got %s intervals for track %s" % (len(data), track))
    if len(data) == 0:
        # no intervals: create an empty sentinel output file
        P.touch(filename)
        return

    data = truncateList(data, track, proportion, min_sequences,
                        num_sequences, order == "random")

    beds = bedsFromList(data)

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    # At the moment the pipeline retrieves from the DB the bed regions and they will
    # always be in the positive strand but if this were to change. The regions retrieved from
    # the negative strand will be counted from the end of the chromosome and not the beginning without this.
    # This should be tested.
    fasta.setConverter(IndexedFasta.getConverter("zero-single-open"))

    # modify the ranges
    if shift == "leftright":
        beds = shitfBeds(beds)

    if halfwidth and not full:
        beds = centreAndCrop(beds, halfwidth)

    sequences = getFASTAFromBed(beds, fasta, stranded, offset, maxsize)

    if shuffled:
        sequences = shuffleFasta(sequences)

    c = E.Counter()

    # FIX: the output file was previously opened here and then opened a
    # second time in the 'with' block below, leaking the first handle.

    # FIX: the loop variable used to shadow the 'masker' parameter
    for mask in masker:
        if mask not in ("unmasked", "none", None):
            ids, sequences = zip(*[(x.title, x.sequence) for x in sequences])
            sequences = maskSequences(sequences, mask)
            sequences = (FastaRecord(id, seq)
                         for id, seq in zip(ids, sequences))

    with IOTools.open_file(filename, "w") as outs:
        for sequence in sequences:
            c.input += 1
            if len(sequence.sequence) == 0:
                c.empty += 1
                continue
            if len(sequence.sequence) < 0:
                # NOTE(review): this condition can never be true, so
                # c.too_short always stays 0; presumably a minimum-length
                # threshold was intended here -- confirm.
                c.too_short += 1
                continue
            outs.write(">%s\n%s\n" % (sequence.title, sequence.sequence))
            c.output += 1

    E.info("%s" % c)

    return c.output
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Computes coverage statistics of gff intervals, either as per-contig
    histograms or as genome-wide summaries.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: "
                            "$Id: gff2coverage.py 2781 2009-09-10 11:33:14Z "
                            "andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default]")

    parser.add_option("-f", "--features", dest="features", type="string",
                      action="append", help="features to collect "
                      "[default=%default]")

    parser.add_option("-w", "--window-size", dest="window_size", type="int",
                      help="window size in bp for histogram computation. "
                      "Determines the bin size. "
                      "[default=%default]")

    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="number of bins for histogram computation "
                      "if window size is not given. "
                      "[default=%default]")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("genomic", "histogram", ),
                      help="methods to apply. "
                      "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        window_size=None,
        num_bins=1000,
        value_format="%6.4f",
        features=[],
        method="genomic",
    )

    (options, args) = E.Start(parser, add_output_options=True)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.method == "histogram":

        gff = GTF.readFromFile(options.stdin)

        # sort by contig, then start coordinate (Python 2 comparator)
        gff.sort(lambda x, y: cmp((x.contig, x.start), (y.contig, y.start)))

        chunk = []
        last_contig = None

        # process runs of entries belonging to the same contig
        for entry in gff:
            if last_contig != entry.contig:
                processChunk(last_contig, chunk, options, fasta)
                last_contig = entry.contig
                chunk = []
            chunk.append(entry)

        processChunk(last_contig, chunk, options, fasta)

    elif options.method == "genomic":
        intervals = collections.defaultdict(int)
        bases = collections.defaultdict(int)
        total = 0
        for entry in GTF.iterator(options.stdin):
            intervals[(entry.contig, entry.source, entry.feature)] += 1
            bases[(entry.contig, entry.source, entry.feature)
                  ] += entry.end - entry.start
            total += entry.end - entry.start

        options.stdout.write("contig\tsource\tfeature\tintervals\tbases")
        if fasta:
            options.stdout.write(
                "\tpercent_coverage\ttotal_percent_coverage\n")
        else:
            options.stdout.write("\n")

        # FIX: previously computed unconditionally, which raised an
        # AttributeError when no genome file was supplied; the value is
        # only needed when percentages are written, i.e. when fasta set.
        if fasta:
            total_genome_size = sum(
                fasta.getContigSizes(with_synonyms=False).values())

        for key in sorted(intervals.keys()):
            nbases = bases[key]
            nintervals = intervals[key]
            contig, source, feature = key
            options.stdout.write("\t".join(("\t".join(key),
                                            str(nintervals),
                                            str(nbases))))
            if fasta:
                options.stdout.write(
                    "\t%f" % (100.0 * float(nbases) /
                              fasta.getLength(contig)))
                options.stdout.write(
                    "\t%f\n" % (100.0 * float(nbases) / total_genome_size))
            else:
                options.stdout.write("\n")

    E.Stop()
def main(argv=None):
    """script main for gtf2exons.

    parses command line options in sys.argv, unless *argv* is given.
    Reads exon boundaries from stdin and writes them out, optionally
    converting to forward-strand coordinates.
    """
    if argv is None:  # BUG FIX: identity test, not equality, for None
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gtf2exons.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("--coordinate-format", dest="coordinate_format",
                      type="string",
                      help="input type of coordinates.")

    parser.add_option("--forward-coordinates", dest="forward_coordinates",
                      action="store_true",
                      help="output forward coordinates.")

    parser.add_option("-e", "--extract-id", dest="extract_id", type="string",
                      help="""regular expression to extract id from id column, e.g. 'transcript_id "(\S+)"'.""")

    parser.set_defaults(
        coordinate_format="zero-forward",
        forward_coordinates=False,
        genome_file=None,
        extract_id=None)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    if options.extract_id:
        extract_id = re.compile(options.extract_id)
    else:
        extract_id = None

    converter = IndexedFasta.getConverter(options.coordinate_format)

    exons = Exons.ReadExonBoundaries(sys.stdin,
                                     contig_sizes=contig_sizes,
                                     converter=converter,
                                     do_invert=True,
                                     format="gtf",
                                     gtf_extract_id=extract_id)

    ntranscripts, nexons, nerrors = 0, 0, 0

    # renamed loop variable: `id` shadowed the builtin
    for transcript_id, ee in exons.items():
        ntranscripts += 1
        has_error = False
        for e in ee:
            # flip to forward coordinates for exons on the negative strand
            if options.forward_coordinates and \
                    e.mSbjctToken in contig_sizes and \
                    e.mSbjctStrand == "-":
                l = contig_sizes[e.mSbjctToken]
                e.mGenomeFrom, e.mGenomeTo = l - e.mGenomeTo, l - e.mGenomeFrom

            if e.mGenomeFrom < 0:
                has_error = True
                if options.loglevel >= 1:
                    options.stderr.write("# Error: %s\n" % str(e))
                break

            options.stdout.write(str(e) + "\n")
            nexons += 1

        if has_error:
            nerrors += 1
            continue

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ntranscripts=%i, nexons=%i, nerrors=%i\n" %
            (ntranscripts, nexons, nerrors))

    E.Stop()
def __init__(self, tname, qname): self.tfasta = IndexedFasta(tname) self.qfasta = IndexedFasta(qname) self.pids = [] self.stats = 0
def main(argv=None):
    """script main for extracting fasta sequence from bed intervals.

    parses command line options in sys.argv, unless *argv* is given.
    Reads bed intervals from stdin and writes (optionally masked)
    fasta records to stdout.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only covers "
                      "the segements [%default]")

    parser.add_option("--min-sequence-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand",
        action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    # BUG FIX: `fasta` was only assigned inside this branch but is used
    # unconditionally below, so a missing genome file raised a NameError.
    # Fail early with a clear message instead.
    if not options.genome_file:
        raise ValueError("please supply a genome file (--genome-file)")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contigs = fasta.getContigSizes()
    fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            # bed12: output only the blocks, concatenated
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals" or
              options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            # one flanking sequence of the interval's length on each side
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.Stop()
def main(argv=None):
    """script main for extractseq.

    parses command line options in sys.argv, unless *argv* is given.
    Extracts sequence regions (contig:strand:from:to) from an indexed
    genome and writes them as fasta to stdout.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: extractseq.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="pattern to look for sequence filename.")

    parser.add_option("-d", "--identifier", dest="identifier", type="string",
                      help="identifier(s).")

    parser.add_option(
        "-o", "--output-coordinate-format", dest="output_coordinate_format",
        type="choice",
        choices=("full", "long"),
        help="""output format of coordinates. Output format is contig:strand:from:to in zero based /forward/reverse strand coordinates in open/closed notation. 'long' includes the contig length as fifth field""")

    parser.add_option("--input-format", dest="input_format", type="choice",
                      choices=("list", "id-compressed"),
                      help="input format.")

    parser.add_option("-i", "--input-coordinate-format",
                      dest="input_coordinate_format", type="choice",
                      choices=("zero-both", "zero-forward"),
                      help="coordinate format.")

    parser.add_option("-e", "--extend-region", dest="extend_region",
                      type="int",
                      help="regions are extended by this margin at either end.")

    parser.add_option("-r", "--shorten-region", dest="shorten_region",
                      type="int",
                      help="regions are shortened by this margin at either end.")

    parser.set_defaults(
        genome_file=None,
        identifier=None,
        input_coordinate_format="zero-both",
        output_coordinate_format="full",
        input_format="list",
        extend_region=0,
        shorten_region=0,
    )

    (options, args) = E.Start(parser)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    lines = []
    # regions given on the command line as contig:strand:from:to
    if options.identifier:
        lines += [x.split(":") for x in options.identifier.split(",")]

    if args:
        lines += [x.split(":") for x in args]

    # otherwise read tab-separated regions from stdin, skipping comments
    if len(lines) == 0:
        lines = [x[:-1].split("\t")
                 for x in options.stdin.readlines() if x[0] != "#"]

    ninput, nskipped, noutput = 0, 0, 0

    for data in lines:
        if options.input_format == "list":
            if len(data) < 4:
                # contig only: take the full contig on the forward strand
                sbjct_token = data[0]
                sbjct_from, sbjct_to = "0", "0"
                sbjct_strand = "+"
            else:
                sbjct_token, sbjct_strand, sbjct_from, sbjct_to = data[:4]
            # renamed from `id`, which shadowed the builtin
            entry_id = None
        elif options.input_format == "id-compressed":
            entry_id = data[0]
            sbjct_token, sbjct_strand, sbjct_from, sbjct_to = \
                data[1].split(":")

        ninput += 1
        try:
            sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)
        except ValueError:
            E.warn("skipping line %s" % data)
            nskipped += 1
            continue

        # apply extension/shortening, clipped to the contig boundaries
        sbjct_from -= (options.extend_region - options.shorten_region)
        sbjct_from = max(0, sbjct_from)
        lcontig = fasta.getLength(sbjct_token)
        if sbjct_to != 0:
            sbjct_to += (options.extend_region - options.shorten_region)
            sbjct_to = min(sbjct_to, lcontig)
        else:
            # to == 0 means "until the end of the contig"
            sbjct_to = lcontig

        if sbjct_to - sbjct_from <= 0:
            nskipped += 1
            continue

        sequence = fasta.getSequence(
            sbjct_token, sbjct_strand,
            sbjct_from, sbjct_to,
            converter=IndexedFasta.getConverter(
                options.input_coordinate_format))

        if options.output_coordinate_format == "full":
            coordinates = "%s:%s:%i:%i" % (sbjct_token, sbjct_strand,
                                           sbjct_from, sbjct_to)
        elif options.output_coordinate_format == "long":
            coordinates = "%s:%s:%i:%i:%i" % (sbjct_token, sbjct_strand,
                                              sbjct_from, sbjct_to, lcontig)

        if entry_id:
            options.stdout.write(">%s %s\n%s\n" %
                                 (entry_id, coordinates, sequence))
        else:
            options.stdout.write(">%s\n%s\n" % (coordinates, sequence))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    E.Stop()
def __init__(self, options): self.tdb = IndexedFasta(options.dbpath + options.targetgenome) self.qdb = IndexedFasta(options.dbpath + options.querygenome) self.tcontigs = self.tdb.getContigSizes() self.qcontigs = self.qdb.getContigSizes() self.badchains = []
def main(argv=None):
    """script main for index_fasta.

    parses command line options in sys.argv, unless *argv* is given.
    Either extracts a test region, benchmarks or verifies an existing
    database, compresses an index, or (default) builds a new indexed
    fasta database from the supplied files.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e", "--extract", dest="extract", type="string",
                      help="extract region for testing purposes. Format is "
                      "contig:strand:from:to. "
                      "The default coordinates are 0-based "
                      "open/closed coordinates on both strands. "
                      "For example, chr1:+:10:12 will return "
                      "bases 11 to 12 on chr1.")

    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug")
    parser.add_option("-c", "--compression", dest="compression", type="choice",
                      choices=compression_choices,
                      help="compress database, using specied compression. "
                      "Valid choices are %s. "
                      "[default=%%default]." % ", ".join(compression_choices))

    parser.add_option("--random-access-points", dest="random_access_points",
                      type="int",
                      help="save random access points every # number "
                      "of nucleotides [default=%default].")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=input_format_choices,
                      help="coordinate format of input. Valid choices are "
                      "%s [default=%%default]." %
                      ", ".join(input_format_choices))

    parser.add_option("-s", "--synonyms", dest="synonyms", type="string",
                      help="list of synonyms, comma separated with =, "
                      "for example, chr1=chr1b [default=%default]")

    parser.add_option("-b", "--benchmark", dest="benchmark",
                      action="store_true",
                      help="benchmark time for read access "
                      "[default=%default].")

    parser.add_option("--benchmark-num-iterations",
                      dest="benchmark_num_iterations", type="int",
                      help="number of iterations for benchmark "
                      "[default=%default].")

    parser.add_option("--benchmark-fragment-size",
                      dest="benchmark_fragment_size", type="int",
                      help="benchmark: fragment size [default=%default].")

    parser.add_option("--verify", dest="verify", type="string",
                      help="verify against other database [default=%default].")

    parser.add_option("--verify-iterations", dest="verify_num_iterations",
                      type="int",
                      help="number of iterations for verification "
                      "[default=%default].")

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option("--file-format", dest="file_format", type="choice",
                      choices=file_format_choices,
                      help="file format of input. Supply if data comes "
                      "from stdin "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(file_format_choices))

    parser.add_option("-a", "--clean-sequence", dest="clean_sequence",
                      action="store_true",
                      help="remove X/x from DNA sequences - they cause "
                      "errors in exonerate [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="allow duplicate identifiers. Further occurances "
                      "of an identifier are suffixed by an '_%i' "
                      "[default=%default].")

    parser.add_option("--regex-identifier", dest="regex_identifier",
                      type="string",
                      help="regular expression for extracting the "
                      "identifier from fasta description line "
                      "[default=%default].")

    parser.add_option("--compress-index", dest="compress_index",
                      action="store_true",
                      help="compress index [default=%default].")

    parser.add_option("--force", dest="force", action="store_true",
                      help="force overwriting of existing files "
                      "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option("-t", "--translator", dest="translator", type="choice",
                      choices=translator_choices,
                      help="translate numerical quality scores. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(translator_choices))

    parser.set_defaults(
        extract=None,
        input_format="zero-both-open",
        benchmark_fragment_size=1000,
        benchmark_num_iterations=1000000,
        benchmark=False,
        compression=None,
        random_access_points=0,
        synonyms=None,
        verify=None,
        verify_num_iterations=100000,
        verify_fragment_size=100,
        clean_sequence=False,
        allow_duplicates=False,
        regex_identifier=None,
        compress_index=False,
        file_format="auto",
        force=False,
        translator=None)

    (options, args) = E.Start(parser)

    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            # map a primary identifier to all of its synonyms
            synonyms.setdefault(a.strip(), []).append(b.strip())
    else:
        synonyms = None

    if options.translator:
        # dispatch table: choice name -> translator class
        translators = {
            "phred": IndexedFasta.TranslatorPhred,
            "solexa": IndexedFasta.TranslatorSolexa,
            "bytes": IndexedFasta.TranslatorBytes,
            "range200": IndexedFasta.TranslatorRange200,
        }
        try:
            options.translator = translators[options.translator]()
        except KeyError:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand,
                                     start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" %
                             (options.extract, sequence))
    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment( fasta = fasta, size = %i)" % (
                options.benchmark_fragment_size),
            setup="""from __main__ import IndexedFasta\nfasta=IndexedFasta.IndexedFasta( "%s" )""" % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        # BUG FIX: timeit returns elapsed seconds as a float; formatting
        # it with %i silently truncated sub-second timings to 0.
        options.stdout.write("%i\t%i\t%f\n" % (
            options.benchmark_num_iterations,
            options.benchmark_fragment_size, t))
    elif options.verify:
        # cross-check both databases against each other
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        nerrors1 = IndexedFasta.verify(fasta1, fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))
    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()
    else:
        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        if len(args) < 2:
            # parenthesised print works as a statement (py2) and a
            # function call (py3)
            print(globals()["__doc__"])
            sys.exit(1)

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()
def main(argv=None):
    """script main for writing genomic windows per contig.

    parses command line options in sys.argv, unless *argv* is given.
    Contig sizes are taken from a gff file and/or an indexed genome;
    either whole contigs or fixed-width windows are written as
    tab-separated (contig, start, end) lines.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with genome [default=%default].")

    parser.add_option(
        "--remove-regex", dest="remove_regex",
        type="string",
        help="regular expression of contigs to remove [default=None].")

    parser.add_option(
        "-e", "--gff-file", dest="gff_file", type="string",
        help="gff file to use for getting contig sizes.")

    parser.add_option(
        "-f", "--fixed-width-windows", dest="fixed_width_windows",
        type="string",
        help="fixed width windows. Supply the window size as a "
        "parameter. Optionally supply an offset.")

    parser.set_defaults(
        genome_file=None,
        remove_regex=None,
        # BUG FIX: the default key was misspelt 'fixed_windows' and never
        # matched the option's dest
        fixed_width_windows=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.remove_regex:
        remove_regex = re.compile(options.remove_regex)
    else:
        remove_regex = None

    if options.fixed_width_windows:
        v = [int(x) for x in options.fixed_width_windows.split(",")]
        if len(v) == 2:
            window_size, window_increment = v
        elif len(v) == 1:
            window_size, window_increment = v[0], v[0]
        else:
            raise ValueError(
                "could not parse window size '%s': should be size[,increment]"
                % options.fixed_width_windows)

    # contig -> size; filled from the gff file and/or the genome index
    # BUG FIX: this was never initialised, so running without a genome
    # file raised NameError instead of the intended "no source" error
    map_contig2size = None

    if options.gff_file:
        infile = open(options.gff_file, "r")
        gff = GTF.readFromFile(infile)
        infile.close()
        map_contig2size = {}
        for g in gff:
            # BUG FIX: a missing contig raises KeyError, not the
            # previously-caught ValueError; use .get with a 0 default
            map_contig2size[g.mName] = max(
                map_contig2size.get(g.mName, 0), g.end)
    else:
        gff = None

    if options.genome_file:
        # genome sizes take precedence over gff-derived sizes
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        map_contig2size = fasta.getContigSizes(with_synonyms=False)
    else:
        fasta = None

    if map_contig2size is None:
        raise ValueError("no source of contig sizes supplied")

    # do sth
    counter = E.Counter()

    for contig, size in map_contig2size.items():
        size = int(size)
        counter.input += 1

        if remove_regex and remove_regex.search(contig):
            counter.skipped += 1
            continue

        if options.fixed_width_windows:
            for x in range(0, size, window_increment):
                # drop trailing partial windows
                if x + window_size > size:
                    continue
                options.stdout.write(
                    "%s\t%i\t%i\n" % (contig, x, min(size, x + window_size)))
                counter.windows += 1
        else:
            # one window spanning the whole contig
            options.stdout.write("%s\t%i\t%i\n" % (contig, 0, size))
            counter.windows += 1

        counter.output += 1

    E.info(str(counter))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main for gff2psl.

    parses command line options in sys.argv, unless *argv* is given.
    Converts gff/gtf intervals on stdin into psl alignments on stdout.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header",
                      action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--input-filename-queries",
                      dest="input_filename_queries",
                      type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="""permit duplicate entries. Adjacent exons of a transcript will still be merged [default=%default].""")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    if options.is_gtf:
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(sys.stdin),
                                  feature="exon"),
            strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(sys.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0

        intervals = Intervals.combine([(gff.start, gff.end)
                                       for gff in gffs])

        # map the query (transcript, gapless) onto the genome intervals
        for start, end in intervals:
            xend = xstart + end - start
            result.addDiagonal(xstart, xend,
                               start - xstart)
            xstart = xend

        entry = Blat.Match()
        # BUG FIX: the old code read transcript_id/contig/strand from the
        # variable `gff` leaked by the list comprehension above (i.e. the
        # last feature) - a python-2-only accident. Use the last element
        # explicitly.
        last_gff = gffs[-1]
        entry.mQueryId = last_gff.transcript_id
        entry.mSbjctId = last_gff.contig
        entry.strand = last_gff.strand

        if genome_fasta:
            if entry.mSbjctId in genome_fasta:
                entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
            else:
                # contig not in the genome: fall back to the mapped extent
                entry.mSbjctLength = result.getColTo()

        if queries_fasta:
            if entry.mQueryId in queries_fasta:
                entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            # no query sequences supplied: use the mapped extent
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None):
    """script main for gff2fasta.

    parses command line options in sys.argv, unless *argv* is given.
    Extracts (optionally masked, extended and folded) fasta sequences
    for gff/gtf features read from stdin.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m", "--merge-adjacent", dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e", "--feature", dest="feature", type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks",
                      type="string", metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions", dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length", dest="min_length", type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with", dest="extend_with", type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at", dest="fold_at", type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        merge=False,
                        feature=None,
                        filename_masks=None,
                        remove_masked_regions=False,
                        min_length=0,
                        max_length=0,
                        extend_at=None,
                        extend_by=100,
                        extend_with=None,
                        masker=None,
                        fold_at=None,
                        naming_attribute=False)

    (options, args) = E.Start(parser)

    # BUG FIX: `fasta` and `contigs` were only defined when a genome file
    # was supplied, but are used unconditionally below; fail early with a
    # clear message instead of a NameError.
    if not options.genome_file:
        raise ValueError("please supply a genome file (--genome-file)")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with IOTools.openFile(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID,
    # in case of GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {x.split("=")[0]: x.split("=")[1]
                             for x in chunk[0].attributes.split(";")}
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name, str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        # keep the original intervals for the fasta header; note that
        # `out` aliases `intervals`, so in-place edits below are shared
        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            # flip to negative-strand coordinates; reverse the reported
            # intervals to match
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length or
                (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # BUG FIX: prepend to the first segment (was s[1], which
                # raised IndexError for single-interval features and
                # inserted the extension mid-sequence otherwise)
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(
            ">%s %s:%s:%s\n%s\n" %
            (name, contig, strand,
             ";".join(["%i-%i" % x for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons,
            nskipped_masked, nskipped_length))

    E.Stop()
def __init__(self, tpath, qpath): self.tfasta = IndexedFasta.IndexedFasta(tpath) self.qfasta = IndexedFasta.IndexedFasta(qpath) self.pids = [] self.stats = 0
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version="%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $", usage=globals()["__doc__"], ) parser.add_option("-t", "--trans", dest="trans", help="input is translated DNA.", action="store_true") parser.add_option( "-f", "--format", dest="format", help="input format.", type="choice", choices=("exons", "psl", "gff") ) parser.add_option( "-o", "--output-format", dest="output_format", help="output format", type="choice", choices=("exontable", "exons", "predictions", "cds", "fasta"), ) parser.add_option( "-g", "--genome-file", dest="genome_file", type="string", help="filename with genomic data (indexed)." ) parser.add_option( "--predictions-file", dest="predictions_file", type="string", help="filename with predictions. Use gene structures from this file if available.", ) parser.add_option( "-i", "--gff-field-id", dest="gff_field_id", type="string", help="field for the feature id in the gff info section.", ) parser.add_option( "-p", "--filename-peptides", dest="filename_peptides", type="string", help="Filename with peptide sequences. 
If given, it is used to check the predicted translated sequences.", ) parser.add_option( "--no-realignment", dest="do_realignment", action="store_false", help="do not re-align entries that do not parse correctly.", ) parser.add_option( "--remove-unaligned", dest="remove_unaligned", action="store_true", help="remove entries that have not been aligned correctly.", ) parser.add_option( "--input-coordinates", dest="input_coordinates", type="string", help="specify input format for input coordinates [forward|both-zero|one-closed|open].", ) parser.set_defaults( trans=False, output_format="predictions", format="psl", gff_field_id="id", input_coordinates="both-zero-open", filename_peptides=None, genome_file=None, do_realignment=True, predictions_file=None, remove_unaligned=False, ) (options, args) = E.Start(parser) if not options.genome_file: raise "please specify a genome file." fasta = IndexedFasta.IndexedFasta(options.genome_file) contig_sizes = fasta.getContigSizes() ninput, noutput, nskipped = 0, 0, 0 nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0 if options.filename_peptides: peptide_sequences = Genomics.ReadPeptideSequences(IOTools.openFile(options.filename_peptides, "r")) predictor = Predictor.PredictorExonerate() predictor.mLogLevel = 0 else: peptide_sequences = None predictor = None converter = IndexedFasta.getConverter(options.input_coordinates) predictions = {} if options.predictions_file: parser = PredictionParser.iterator_predictions(IOTools.openFile(options.predictions_file, "r")) for p in parser: predictions[p.mPredictionId] = p if options.output_format == "predictions": if options.format == "psl": if options.trans: parser = PredictionParser.PredictionParserBlatTrans() else: parser = PredictionParser.PredictionParserBlatCDNA() nmatches = 1 for line in sys.stdin: if line[0] == "#": continue if not re.match("^[0-9]", line): continue try: entries = parser.Parse((line,)) except PredictionParser.AlignmentError, e: print "# %s" % 
str(e) print "#", line[:-1] sys.exit(1) for entry in entries: entry.mPredictionId = nmatches nmatches += 1 print str(entries) elif options.format == "exons": parser = PredictionParser.PredictionParserExons(contig_sizes=contig_sizes) else: raise "unknown format %s for output option %s" % (options.format, options.output_format) if options.loglevel >= 2: options.stdlog.write("# parsing.\n") options.stdlog.flush() results = parser.Parse(sys.stdin.readlines()) if options.loglevel >= 2: options.stdlog.write("# parsing finished.\n") options.stdlog.flush() if options.loglevel >= 1: options.stdlog.write( "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" % (parser.GetNumInput(), parser.GetNumOutput(), parser.GetNumErrors()) ) for error, msg in parser.mErrors: options.stdlog.write("# %s : %s\n" % (str(error), msg)) options.stdlog.flush() # if genomes are given: build translation if options.genome_file: results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken)) new_results = PredictionParser.Predictions() for entry in results: ninput += 1 if options.loglevel >= 2: options.stdlog.write( "# processing entry %s:%s on %s:%s %i/%i.\n" % ( entry.mPredictionId, entry.mQueryToken, entry.mSbjctToken, entry.mSbjctStrand, ninput, len(results), ) ) options.stdlog.flush() try: lgenome = fasta.getLength(entry.mSbjctToken) # added 3 residues - was a problem at split codons just before the stop. 
# See for example the chicken sequence ENSGALP00000002741 genomic_sequence = fasta.getSequence( entry.mSbjctToken, entry.mSbjctStrand, entry.mSbjctGenomeFrom, min(entry.mSbjctGenomeTo + 3, lgenome), ) except KeyError: if options.loglevel >= 1: options.stdlog.write( "# did not find entry for %s on %s.\n" % (entry.mPredictionId, entry.mSbjctToken) ) nskipped += 1 continue if predictions and entry.mPredictionId in predictions: if options.loglevel >= 2: options.stdlog.write( "# substituting entry %s on %s:%s.\n" % (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand) ) options.stdlog.flush() entry = predictions[entry.mPredictionId] exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome, 0, entry.mSbjctGenomeFrom) entry.mMapPeptide2Translation, entry.mTranslation = Genomics.Alignment2PeptideAlignment( Genomics.String2Alignment(entry.mAlignmentString), entry.mQueryFrom, 0, genomic_sequence ) entry.score = entry.mMapPeptide2Translation.getColTo() - entry.mMapPeptide2Translation.getColFrom() + 1 ( entry.mNIntrons, entry.mNFrameShifts, entry.mNGaps, entry.mNSplits, entry.mNStopCodons, entry.mNDisruptions, ) = Genomics.CountGeneFeatures(0, entry.mMapPeptide2Genome, genomic_sequence) if peptide_sequences: if str(entry.mPredictionId) in peptide_sequences: reference = peptide_sequences[str(entry.mPredictionId)].upper() translation = entry.mTranslation nfound += 1 is_identical, nmismatches = checkIdentity(reference, translation, options) if is_identical: nidentical += 1 else: nmismatch += 1 if options.do_realignment: if options.loglevel >= 2: options.stdlog.write( "# %s: mismatches..realigning in region %i:%i\n" % (entry.mPredictionId, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo) ) options.stdlog.flush() result = predictor( entry.mPredictionId, reference, entry.mSbjctToken, genomic_sequence, "--subopt FALSE --score '%s'" % str(80), ) # "--exhaustive --subopt FALSE --score '%s'" % str(80) ) if result: translation = result[0].mTranslation is_identical, nmismatches = 
checkIdentity(reference, translation, options) else: if options.loglevel >= 2: options.stdlog.write( "# %s: realignment returned empty result\n" % (entry.mPredictionId) ) options.stdlog.flush() is_identical = False if is_identical: naligned += 1 prediction_id = entry.mPredictionId sbjct_genome_from = entry.mSbjctGenomeFrom entry = result[0] entry.mPredictionId = prediction_id entry.mSbjctGenomeFrom += sbjct_genome_from else: nunaligned += 1 if options.loglevel >= 1: options.stdlog.write( "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n" % ( entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo, reference, entry.mTranslation, translation, ) ) options.stdlog.flush() if options.remove_unaligned: nskipped += 1 continue else: if options.loglevel >= 2: options.stdlog.write( "# %s: mismatches on %s ... no realignment\n" % (entry.mPredictionId, entry.mSbjctToken) ) if options.loglevel >= 3: options.stdlog.write( "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n" % (entry.mPredictionId, reference, translation) ) options.stdlog.flush() if options.remove_unaligned: nskipped += 1 continue else: nnotfound += 1 new_results.append(entry) noutput += 1 results = new_results if results: options.stdout.write(str(results) + "\n")
def main(argv=None):
    """script main.

    Builds allelic transcript sequences from a gene set (GTF on stdin)
    and a source of variants (sqlite table, pileup or VCF file), and
    writes the selected output sections (cds/peptide/table/gtf/map).

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-t", "--tablename", dest="tablename",
                      type="string",
                      help="tablename to get variants from (in samtools pileup format) [default=%default].")
    parser.add_option("-d", "--database", dest="database",
                      type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option("-f", "--exons-file", dest="filename_exons",
                      type="string",
                      help="filename with transcript model information (gtf formatted file)  [default=%default].")
    parser.add_option("-r", "--filename-reference", dest="filename_reference",
                      type="string",
                      help="filename with transcript models of a reference gene set. Stop codons that do not"
                      " overlap any of the exons in this file are ignore (gtf-formatted file) [default=%default].")
    parser.add_option("--vcf-file", dest="filename_vcf",
                      type="string",
                      help="filename with variants in VCF format. Should be indexed by tabix [default=%default].")
    parser.add_option("--pileup-file", dest="filename_pileup",
                      type="string",
                      help="filename with variants in samtools pileup format. Should be indexed by tabix [default=%default].")
    parser.add_option("--vcf-sample", dest="vcf_sample",
                      type="string",
                      help="sample id for species of interest in vcf formatted file [default=%default].")
    parser.add_option("-s", "--seleno-tsv-file", dest="filename_seleno",
                      type="string",
                      help="filename of a list of transcript ids that are selenoproteins [default=%default].")
    parser.add_option("-m", "--module", dest="modules",
                      type="choice", action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")
    parser.add_option("-o", "--output-section", dest="output",
                      type="choice", action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")
    parser.add_option("-k", "--with-knockouts", dest="with_knockouts",
                      action="store_true",
                      help="add alleles that are knocked out to fasta and gtf files [default=%default].")

    # BUGFIX: the original default was misspelled "filename_referenec",
    # leaving the real dest without an explicit default; also add the
    # missing filename_pileup default for completeness.
    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_pileup=None,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    # seleno is only used for membership tests; use a set consistently
    # (the original fell back to an empty dict).
    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLlite database, pileup or VCF - exactly one
    # source must be configured.
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(options.database,
                                             options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(options.filename_vcf,
                                          options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    # no explicit section selection means: output everything
    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(
            ("gene_id", "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype", ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separatar
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id
        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        # collect variants in a window extended by --border on both sides
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]
        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(indexed_variants,
                                                    all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] + variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] + variant_introns[key][1][-30:])
        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id

            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            ##############################################################
            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                # sanity: per-exon attribute lists must agree in length
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:],
                                allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            # internal coordinates are on the transcript
                            # strand; mirror them for the reverse strand
                            if not is_positive_strand:
                                gtf.start, gtf.end = \
                                    lcontig - gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = \
                                lcontig - gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(">%s\n%s\n" %
                                               (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id,
                                   transcript_id,
                                   allele_id,
                                   contig,
                                   strand,
                                   "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=sys.argv):
    """Accumulate per-base coverage from PSL alignments on stdin and
    write it as bedgraph/wiggle, or convert to bigwig/bigbed via the
    UCSC command line tools.
    """

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle.py 2834 2009-11-24 16:11:23Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-b", "--output-filename-pattern", dest="output_filename",
                      type="string",
                      help="filename for output [default=%default]")
    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig"),
                      help="output format [default=%default]")

    parser.set_defaults(genome_file=None,
                        typecode=numpy.int16,
                        output_filename=None,
                        output_format="wiggle",
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    typecode = options.typecode

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        counts = {}
        contig_sizes = fasta.getContigSizes(with_synonyms=False)
        E.info("allocating memory for %i contigs and %i bytes" %
               (len(contig_sizes),
                sum(contig_sizes.values()) * typecode().itemsize))
        # one counter array per contig, sized to the full contig length
        for contig, size in contig_sizes.items():
            E.debug("allocating %s: %i bases" % (contig, size))
            counts[contig] = numpy.zeros(size, typecode)
        E.info("allocated memory for %i contigs" % len(fasta))
    else:
        fasta = None
        contig_sizes = {}

    if options.output_format in ("bigwig", "bigbed"):

        if not options.genome_file:
            raise ValueError(
                "please supply genome file for bigwig/bigbed computation.")

        if not options.output_filename:
            raise ValueError(
                "please output file for bigwig/bigbed computation.")

        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        elif options.output_format == "bigbed":
            executable_name = "bedToBigBed"
        else:
            raise ValueError("unknown output format `%s`" %
                             options.output_format)

        executable = IOTools.which(executable_name)

        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        tmpdir = tempfile.mkdtemp()
        E.debug("temporary files are in %s" % tmpdir)

        tmpfile_wig = os.path.join(tmpdir, "wig")
        tmpfile_sizes = os.path.join(tmpdir, "sizes")

        # write contig sizes
        outfile_size = open(tmpfile_sizes, "w")
        for contig, size in contig_sizes.items():
            outfile_size.write("%s\t%s\n" % (contig, size))
        outfile_size.close()

        outfile = open(tmpfile_wig, "w")
    else:
        outfile = options.stdout

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, ncontigs, nskipped = 0, 0, 0

    E.info("started counting")

    while 1:

        if options.test and ninput >= options.test:
            break

        match = next(iterator)

        if match is None:
            break

        ninput += 1

        contig = match.mSbjctId

        # increment coverage for every aligned block
        for start, length in zip(match.mSbjctBlockStarts, match.mBlockSizes):
            counts[contig][start:start + length] += 1

    E.info("finished counting")

    # BUGFIX: the wiggle branch used to test for "wig", which is not one
    # of the declared choices ("wiggle"), so wiggle output was never
    # produced.
    if options.output_format in ("wiggle", "bigwig"):
        # output in wiggle format
        E.info("starting wig output")

        for contig, vals in counts.items():
            E.debug("output for %s" % contig)
            # group runs of equal coverage values
            for val, grouped in itertools.groupby(enumerate(vals),
                                                  lambda x: x[1]):
                l = list(grouped)
                start, end = l[0][0], l[-1][0]
                val = vals[start]
                if val > 0:
                    # NOTE(review): variableStep positions are 1-based in
                    # the wiggle spec while `start` is 0-based - confirm.
                    outfile.write("variableStep chrom=%s span=%i\n" %
                                  (contig, end - start + 1))
                    outfile.write("%i\t%i\n" % (start, val))

            ncontigs += 1

    elif options.output_format in ("bedgraph", "bigbed"):

        E.info("starting bedgraph output")

        for contig, vals in counts.items():
            E.debug("output for %s" % contig)
            for val, grouped in itertools.groupby(enumerate(vals),
                                                  lambda x: x[1]):
                l = list(grouped)
                start, end = l[0][0], l[-1][0]
                val = vals[start]
                if val > 0:
                    outfile.write("%s\t%i\t%i\t%i\n" %
                                  (contig, start, end + 1, val))

            ncontigs += 1

    E.info("finished output")

    if options.output_format in ("bigwig", "bigbed"):
        outfile.close()

        E.info("starting bigwig conversion")
        try:
            # use an argument list (no shell) so that paths containing
            # spaces are handled correctly
            retcode = subprocess.call(
                [executable, tmpfile_wig, tmpfile_sizes,
                 os.path.abspath(options.output_filename)])
            if retcode < 0:
                # NOTE(review): `warn` is not defined in this block -
                # presumably imported at module level; verify.
                warn("wigToBigWig terminated with signal: %i" % -retcode)
                return -retcode
        except OSError as msg:
            # BUGFIX: the handler used to reference an undefined name `e`
            warn("Error while executing bigwig: %s" % msg)
            return 1

        shutil.rmtree(tmpdir)

        E.info("finished bigwig conversion")
def main(argv=None):
    """Extract (optionally masked) genomic sequences for bed intervals
    read from stdin and write them in fasta format to stdout.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")
    parser.add_option("-m", "--masker", dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")
    parser.add_option("-o", "--mode", dest="mode",
                      type="choice",
                      choices=("intervals", "leftright"),
                      help="what to output [%default]")
    parser.add_option("--min-length", dest="min_length",
                      type="int",
                      help="require a minimum sequence length [%default]")
    parser.add_option("--max-length", dest="max_length",
                      type="int",
                      help="require a maximum sequence length [%default]")
    parser.add_option("--extend-at", dest="extend_at",
                      type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no, 3', 5' or both ends. If 3only or 5only are set, only the added sequence is returned [default=%default]")
    parser.add_option("--extend-by", dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")
    parser.add_option("--use-strand", dest="ignore_strand",
                      action="store_false",
                      help="use strand information and return reverse complement [default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))
    else:
        # BUGFIX: without a genome the loop below would die with a
        # NameError on `fasta`; fail early with a clear message instead.
        raise ValueError("please supply a genome file (--genome-file)")

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1
        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.mode == "intervals":
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.mode == "leftright":
            # flanks of the same length as the interval itself:
            # left flank is [start - l, start), right flank [end, end + l)
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.Stop()
def main(argv=None):
    """script main.

    Converts bed intervals on stdin into Annotator segment/annotation
    listings on stdout.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: bed2annotator2tsv.py 2885 2010-04-07 08:46:50Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")
    parser.add_option("-f", "--features", dest="features",
                      type="string",
                      help="feature to collect [default=None].")
    parser.add_option("-i", "--files", dest="files",
                      action="append",
                      help="use multiple annotations [default=None].")
    parser.add_option("-a", "--annotations", dest="annotations",
                      type="string",
                      help="aggregate name for annotations if only single file is provided from STDIN [default=None].")
    parser.add_option("--input-filename-map", dest="input_filename_map",
                      type="string",
                      help="filename with a map of gene_ids to categories [default=None].")
    # BUGFIX: max_length is compared numerically below, so it must be an
    # int (it was declared type="string" with an int default).
    parser.add_option("-l", "--max-length", dest="max_length",
                      type="int",
                      help="maximum segment length [default=None].")
    parser.add_option("-m", "--merge", dest="merge",
                      action="store_true",
                      help="merge overlapping bed segments [default=%default].")
    parser.add_option("-s", "--section", dest="section",
                      type="choice",
                      choices=("segments", "annotations", "workspace"),
                      help="annotator section [default=None].")
    parser.add_option("--subset", dest="subsets",
                      type="string", action="append",
                      help="add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids [default=None].")

    # NOTE: `feature` (singular) kept for backward compatibility; the
    # option dest above is `features`.
    parser.set_defaults(
        genome_file=None,
        feature=None,
        features=None,
        remove_random=True,
        section="segments",
        annotations="annotations",
        max_length=100000,
        files=[],
        subsets=[],
        input_filename_map=None,
        merge=False,
    )

    (options, args) = E.Start(parser)

    options.files += args
    if len(options.files) == 0:
        options.files.append("-")
    # each --files argument may itself list several filenames
    options.files = list(
        itertools.chain(*[re.split("[,; ]+", x) for x in options.files]))

    if options.subsets:
        subsets = collections.defaultdict(list)
        for s in options.subsets:
            filename_gff, label, filename_ids = s.split(",")
            subsets[filename_gff].append((label, filename_ids))
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section == "annotations":
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)

    if options.max_length:
        max_length = options.max_length
    else:
        max_length = 0

    ninput, ntracks, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0, 0

    # BUGFIX: `options.section in ("annotations")` was a substring test
    # on a plain string, not a tuple membership test.
    if options.section == "annotations":
        contigs = set()
        it = itertools.groupby(Bed.iterator(options.stdin),
                               key=lambda x: x.track["name"])

        map_track2segments = {}
        for track, beds in it:
            ntracks += 1
            map_track2segments[track] = []
            first_segment = nsegments

            beds = list(beds)

            if options.merge:
                beds = Bed.merge(beds)

            for bed in beds:
                contig, start, end = bed.contig, bed.start, bed.end

                if options.remove_random and "random" in contig:
                    continue

                if max_length > 0 and end - start > max_length:
                    ndiscarded += 1
                    continue

                contigs.add(contig)
                map_track2segments[track].append(nsegments)
                options.stdout.write(
                    "%s\t%i\t%s\t(%i,%i)\n" %
                    (prefix, nsegments, contig, start, end))
                nsegments += 1

            options.stdout.write("##Ann\t%s\t%s\n" % (
                track,
                "\t".join(["%i" % x
                           for x in range(first_segment, nsegments)])))
            E.info("track %s: annotated with %i segments" %
                   (track, nsegments - first_segment))

        ncontigs = len(contigs)
    E.info("ninput=%i, ntracks=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" %
           (ninput, ntracks, ncontigs, nsegments, ndiscarded))

    E.Stop()
alphabet = "fastq", encoding = "phred", default_value = None, ) (options, args) = E.Start( parser ) ninput, noutput = 0, 0 if options.format == "fasta": iterator = FromFastaIterator( sys.stdin, alphabet = options.alphabet, default = options.default_value ) if options.output_format == "fasta": if options.build_index: IndexedFasta.createDatabase( options.build_index, iterator ) else: while 1: try: r = iterator.next() except StopIteration: break t,s = r options.stdout.write( ">%s\n%s\n" % (t,s)) elif options.output_format == "fastq": if not options.filename_sequences: raise "please supply a filename with sequences." iterator_sequence = FastaIterator.FastaIterator( open( options.filename_sequences, "r" ) )
def main(argv=None):
    """script main.

    Converts quality scores read from stdin to fasta or fastq output,
    optionally building an IndexedFasta database.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: quality2fasta.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-f", "--format", dest="format",
                      type="choice",
                      choices=("fasta", ),
                      help="input format [%default].")
    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("fasta", "fastq"),
                      help="output format - if fastq is chosen, also supply a sequence file [%default].")
    parser.add_option("-a", "--alphabet", dest="alphabet",
                      type="choice",
                      choices=("fastq", "solexa", "printable"),
                      help="characters to use for quality scores [%default].")
    parser.add_option("-e", "--encoding", dest="encoding",
                      type="choice",
                      choices=("phred", "solexa"),
                      help="encoding of quality scores [%default].")
    parser.add_option("-i", "--build-index", dest="build_index",
                      type="string",
                      help="build an index. Supply the database name [%default].")
    parser.add_option("-s", "--filename-sequences", dest="filename_sequences",
                      type="string",
                      help="input filename with file of sequences in fasta format - sorted in the same way as the quality file [%default].")
    parser.add_option("-d", "--set-to-default", dest="default_value",
                      type="int",
                      help="set all quality codes to the default value. Supply the fasta sequence instead of the quality codes [%default].")

    parser.set_defaults(
        format="fasta",
        output_format="fasta",
        build_index=None,
        filename_sequences=None,
        alphabet="fastq",
        encoding="phred",
        default_value=None,
    )

    (options, args) = E.Start(parser)

    ninput, noutput = 0, 0

    if options.format == "fasta":
        iterator = FromFastaIterator(sys.stdin,
                                     alphabet=options.alphabet,
                                     default=options.default_value)

    if options.output_format == "fasta":
        if options.build_index:
            IndexedFasta.createDatabase(options.build_index, iterator)
        else:
            # stream records to stdout as plain fasta
            while 1:
                try:
                    r = next(iterator)
                except StopIteration:
                    break
                t, s = r
                options.stdout.write(">%s\n%s\n" % (t, s))

    elif options.output_format == "fastq":

        # BUGFIX: string exceptions are not legal - raise ValueError
        if not options.filename_sequences:
            raise ValueError("please supply a filename with sequences.")

        iterator_sequence = FastaIterator.FastaIterator(
            open(options.filename_sequences, "r"))

        # walk quality and sequence streams in lockstep
        while 1:
            qual, seq = None, None
            try:
                qual = next(iterator)
                seq = next(iterator_sequence)
            except StopIteration:
                if qual and not seq:
                    options.stdlog.write("# sequence file incomplete\n")
                elif seq and not qual:
                    options.stdlog.write("# quality file incomplete\n")
                # BUGFIX: without this break the code fell through and
                # tried to unpack qual=None below.
                break

            qt, qs = qual
            st, ss = seq.title, seq.sequence
            assert qt == st, \
                "sequence and quality identifiers incongruent: %s != %s" % (qt, st)
            options.stdout.write("@%s\n%s\n+\n%s\n" % (qt, ss, qs))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverflow=%i, nunderflow=%i\n" %
            (iterator.mNInput, iterator.mNOutput,
             iterator.mNOverFlow, iterator.mNUnderFlow))

    E.Stop()
extract_id = None ) (options, args) = E.Start( parser ) if options.genome_file: fasta = IndexedFasta.IndexedFasta( options.genome_file ) contig_sizes = fasta.getContigSizes() else: contig_sizes = {} if options.extract_id: extract_id = re.compile( options.extract_id ) else: extract_id = None converter = IndexedFasta.getConverter( options.coordinate_format ) exons = Exons.ReadExonBoundaries( sys.stdin, contig_sizes = contig_sizes, converter = converter, do_invert = True, format = "gtf", gtf_extract_id = extract_id ) ntranscripts, nexons, nerrors = 0, 0, 0 for id, ee in exons.items(): ntranscripts += 1 has_error = False for e in ee: if options.forward_coordinates and e.mSbjctToken in contig_sizes and \ e.mSbjctStrand == "-":
def main(argv=None): if argv == None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: IndexedFasta.py 2801 2009-10-22 13:40:39Z andreas $", usage=globals()["__doc__"]) parser.add_option( "-e", "--extract", dest="extract", type="string", help= "extract region for testing purposes. Format is contig:strand:from:to. " "The default coordinates are 0-based open/closed coordinates on both strands. " "For example, chr1:+:10:12 will return bases 11 to 12 on chr1.") parser.add_option("-c", "--compression", dest="compression", type="choice", choices=("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug"), help="compress database [default=%default].") parser.add_option( "--random-access-points", dest="random_access_points", type="int", help= "save random access points every # number of nucleotides [default=%default]." ) parser.add_option("-i", "--input-format", dest="input_format", type="choice", choices=("one-forward-open", "zero-both-open"), help="coordinate format of input [default=%default].") parser.add_option( "-s", "--synonyms", dest="synonyms", type="string", help= "list of synonyms, comma separated with =, for example, chr1=chr1b [default=%default]" ) parser.add_option( "-b", "--benchmark", dest="benchmark", action="store_true", help="benchmark time for read access [default=%default].") parser.add_option( "--benchmark-num-iterations", dest="benchmark_num_iterations", type="int", help="number of iterations for benchmark [default=%default].") parser.add_option("--benchmark-fragment-size", dest="benchmark_fragment_size", type="int", help="benchmark: fragment size [default=%default].") parser.add_option("--verify", dest="verify", type="string", help="verify against other database [default=%default].") parser.add_option( "--file-format", dest="file_format", type="choice", choices=("fasta", "auto", "fasta.gz", "tar", "tar.gz"), help= "file format of input. Supply if data comes from stdin [default=%default]." 
) parser.add_option( "-a", "--clean-sequence", dest="clean_sequence", action="store_true", help= "remove X/x from DNA sequences - they cause errors in exonerate [default=%default]." ) parser.add_option( "--allow-duplicates", dest="allow_duplicates", action="store_true", help= "allow duplicate identifiers. Further occurances of an identifier are suffixed by an '_%i' [default=%default]." ) parser.add_option( "--regex-identifier", dest="regex_identifier", type="string", help= "regular expression for extracting the identifier from fasta description line [default=%default]." ) parser.add_option("--compress-index", dest="compress_index", action="store_true", help="compress index [default=%default].") parser.add_option( "--force", dest="force", action="store_true", help="force overwriting of existing files [default=%default].") parser.add_option( "-t", "--translator", dest="translator", type="choice", choices=("solexa", "phred", "bytes", "range200"), help="translate numerical quality scores [default=%default].") parser.set_defaults(extract=None, input_format="zero-both-open", benchmark_fragment_size=1000, benchmark_num_iterations=1000000, benchmark=False, compression=None, random_access_points=0, synonyms=None, verify=None, verify_num_iterations=100000, verify_fragment_size=100, clean_sequence=False, allow_duplicates=False, regex_identifier=None, compress_index=False, file_format="auto", force=False, translator=None) (options, args) = E.Start(parser) if options.synonyms: synonyms = {} for x in options.synonyms.split(","): a, b = x.split("=") a = a.strip() b = b.strip() if a not in synonyms: synonyms[a] = [] synonyms[a].append(b) else: synonyms = None if options.translator: if options.translator == "phred": options.translator = TranslatorPhred() elif options.translator == "solexa": options.translator = TranslatorSolexa() elif options.translator == "bytes": options.translator = TranslatorBytes() elif options.translator == "range200": options.translator = 
TranslatorRange200() else: raise ValueError("unknown translator %s" % options.translator) if options.extract: fasta = IndexedFasta.IndexedFasta(args[0]) fasta.setTranslator(options.translator) converter = IndexedFasta.getConverter(options.input_format) contig, strand, start, end = IndexedFasta.parseCoordinates( options.extract) sequence = fasta.getSequence(contig, strand, start, end, converter=converter) options.stdout.write( ">%s\n%s\n" % \ ( options.extract, sequence ) ) elif options.benchmark: import timeit timer = timeit.Timer( stmt="benchmarkRandomFragment( fasta = fasta, size = %i)" % (options.benchmark_fragment_size), setup= """from __main__ import benchmarkRandomFragment,IndexedFasta\nfasta=IndexedFasta.IndexedFasta( "%s" )""" % (args[0])) t = timer.timeit(number=options.benchmark_num_iterations) options.stdout.write("iter\tsize\ttime\n") options.stdout.write("%i\t%i\t%i\n" % (options.benchmark_num_iterations, options.benchmark_fragment_size, t)) elif options.verify: fasta1 = IndexedFasta.IndexedFasta(args[0]) fasta2 = IndexedFasta.IndexedFasta(options.verify) nerrors1 = verify(fasta1, fasta2, options.verify_num_iterations, options.verify_fragment_size, stdout=options.stdout) options.stdout.write("errors=%i\n" % (nerrors1)) nerrors2 = IndexedFasta.verify(fasta2, fasta1, options.verify_num_iterations, options.verify_fragment_size, stdout=options.stdout) options.stdout.write("errors=%i\n" % (nerrors2)) elif options.compress_index: fasta = IndexedFasta.IndexedFasta(args[0]) fasta.compressIndex() else: if options.loglevel >= 1: options.stdlog.write("# creating database %s\n" % args[0]) options.stdlog.write("# indexing the following files: \n# %s\n" %\ (" \n# ".join( args[1:] ) )) options.stdlog.flush() if synonyms: options.stdlog.write("# Applying the following synonyms:\n") for k, v in synonyms.items(): options.stdlog.write("# %s=%s\n" % (k, ",".join(v))) options.stdlog.flush() if len(args) < 2: print globals()["__doc__"] sys.exit(1) iterator = 
IndexedFasta.MultipleFastaIterator( args[1:], regex_identifier=options.regex_identifier, format=options.file_format) IndexedFasta.createDatabase( args[0], iterator, synonyms=synonyms, random_access_points=options.random_access_points, compression=options.compression, clean_sequence=options.clean_sequence, allow_duplicates=options.allow_duplicates, translator=options.translator, force=options.force) E.Stop()
# (fragment of a prediction-processing main; the enclosing definition
#  starts above and continues below this excerpt)
fasta = IndexedFasta.IndexedFasta( options.genome_file )
contig_sizes = fasta.getContigSizes()

# summary counters for the processing loop that follows this excerpt
ninput, noutput, nskipped = 0,0,0
nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0,0,0,0,0,0

# optionally load peptide sequences and set up an exonerate-based
# predictor to re-align them
if options.filename_peptides:
    peptide_sequences = Genomics.ReadPeptideSequences(
        IOTools.openFile( options.filename_peptides, "r"))
    predictor = PredictorExonerate()
    predictor.mLogLevel = 0
else:
    peptide_sequences = None
    predictor = None

converter = IndexedFasta.getConverter( options.input_coordinates )

# optionally pre-load existing predictions keyed by prediction id
predictions = {}
if options.predictions_file:
    parser = PredictionParser.iterator_predictions(
        IOTools.openFile( options.predictions_file, "r") )
    for p in parser:
        predictions[p.mPredictionId] = p

# choose the input parser; `parser` is rebound here from the option
# parser to a prediction parser
if options.output_format == "predictions":
    if options.format == "psl":
        if options.trans:
            parser = PredictionParser.PredictionParserBlatTrans()
        else:
            parser = PredictionParser.PredictionParserBlatCDNA()