def readAndGroupTable(infile, options):
    """Read a table from *infile*, group rows by ``options.group_column``
    and write the grouped table to ``options.stdout``.

    The aggregation applied to each group is selected by
    ``options.group_function`` (min, max, sum, mean, cat, uniq, stats).
    Values in ``options.columns`` are converted with ``float`` (or kept
    as strings for "cat"/"uniq"); rows with unconvertible values are
    dropped, while ``options.missing_value`` entries are passed through.
    """
    fields, table = CSV.ReadTable(
        infile, with_header=options.has_headers, as_rows=True)

    options.columns = getColumns(fields, options.columns)
    assert options.group_column not in options.columns

    converter = float
    new_fields = [fields[options.group_column]] + \
        [fields[x] for x in options.columns]

    if options.group_function == "min":
        f = min
    elif options.group_function == "max":
        f = max
    elif options.group_function == "sum":
        # builtin sum gives the same result as the former reduce/lambda
        f = sum
    elif options.group_function == "mean":
        f = scipy.mean
    elif options.group_function == "cat":
        f = lambda x: ";".join([y for y in x if y != ""])
        converter = str
    elif options.group_function == "uniq":
        f = lambda x: ";".join([y for y in set(x) if y != ""])
        converter = str
    elif options.group_function == "stats":
        f = lambda x: str(Stats.DistributionalParameters(x))
        # update headers: one block of distributional parameters per column
        new_fields = [fields[options.group_column]]
        for c in options.columns:
            new_fields += ["%s_%s" % (fields[c], x)
                           for x in Stats.DistributionalParameters().getHeaders()]
    else:
        # previously an unknown function surfaced only as a NameError
        # when grouping started; fail early with a clear message instead.
        raise ValueError("unknown group function '%s'" %
                         options.group_function)

    # convert values to floats (except for group_column)
    # Delete rows with unconvertable values and not in options.columns
    new_table = []
    for row in table:
        skip = False
        new_row = [row[options.group_column]]
        for c in options.columns:
            if row[c] == options.missing_value:
                new_row.append(row[c])
            else:
                try:
                    new_row.append(converter(row[c]))
                except ValueError:
                    skip = True
                    break
        if not skip:
            new_table.append(new_row)
    table = new_table

    new_rows = CSV.GroupTable(table,
                              group_column=0,
                              group_function=f)

    options.stdout.write("\t".join(new_fields) + "\n")
    for row in new_rows:
        options.stdout.write("\t".join(map(str, row)) + "\n")
def printHeightsPerTree(values, section, options, prefix_header, prefix_row):
    """Write a distributional summary of per-tree heights to the section file.

    Does nothing for an empty *values* list. A header line is emitted only
    when the section file is newly created.
    """
    if not values:
        return

    outfile, is_new = TreeReconciliation.getFile(options, section)

    if is_new:
        header_cols = "\t".join(
            Stats.DistributionalParameters().getHeaders())
        outfile.write("%s%s\theights\n" % (prefix_header, header_cols))

    stats = Stats.DistributionalParameters(values)
    stats.setFormat(options.format_branch_length)
    formatted_values = ",".join(
        [options.format_branch_length % v for v in values])
    outfile.write("%s%s\t%s\n" % (prefix_row, str(stats), formatted_values))
def process(self, contig, start, end, reads, qualities):
    """Write one summary line for a genomic window.

    Outputs id, contig, start, end, window size, number of read values,
    number of aligned (> 0) values and a distributional summary of the
    aligned values.

    Uses a list comprehension instead of ``filter`` so that ``len()``
    works under Python 3 as well — consistent with the sibling
    implementation elsewhere in this file.
    """
    aligned = [x for x in reads if x > 0]
    self.mOutFile.write(
        "%s\t%s\t%i\t%i\t%i\t%i\t%i\t%s\n" %
        (self.mOutputId, contig, start, end, end - start,
         len(reads), len(aligned),
         str(Stats.DistributionalParameters(aligned))))
def process(self, contig, start, end, reads, qualities):
    """Write one summary line describing the quality values of a window.

    Outputs id, contig, start, end, window size, number of reads, number
    of quality values and a distributional summary of the qualities.
    """
    summary = str(Stats.DistributionalParameters(qualities))
    line = "%s\t%s\t%i\t%i\t%i\t%i\t%i\t%s\n" % (
        self.mOutputId, contig, start, end, end - start,
        len(reads), len(qualities), summary)
    self.mOutFile.write(line)
def process(self, contig, start, end, reads, qualities):
    """Write one summary line for a genomic window.

    Only read values greater than zero count as aligned; the line carries
    id, contig, coordinates, window size, counts and a distributional
    summary of the aligned values.
    """
    aligned = []
    for depth in reads:
        if depth > 0:
            aligned.append(depth)
    stats = Stats.DistributionalParameters(aligned)
    self.mOutFile.write(
        "%s\t%s\t%i\t%i\t%i\t%i\t%i\t%s\n"
        % (self.mOutputId, contig, start, end, end - start,
           len(reads), len(aligned), str(stats)))
def printHeightsPerSpecies(values, section, options, prefix_header, prefix_row):
    """Write per-species distributional summaries of heights.

    *values* maps species name to a list of heights. Species are written
    in sorted order; nothing is written when *values* is empty. A header
    line is emitted only when the section file is newly created.
    """
    if not values:
        return

    # distributions of distance to node
    outfile, is_new = TreeReconciliation.getFile(options, section)

    if is_new:
        header_cols = "\t".join(
            Stats.DistributionalParameters().getHeaders())
        outfile.write("%sspecies\t%s\theights\n" % (prefix_header, header_cols))

    for species in sorted(values.keys()):
        heights = values[species]
        stats = Stats.DistributionalParameters(heights)
        stats.setFormat(options.format_branch_length)
        formatted = ",".join(
            [options.format_branch_length % h for h in heights])
        outfile.write("%s%s\t%s\t%s\n" %
                      (prefix_row, species, str(stats), formatted))
def decorator_median_length(intervals, start, end, contig, fasta):
    """Return the median interval length and a summary of the distribution."""
    lengths = [interval[1] - interval[0] for interval in intervals]
    dist = Stats.DistributionalParameters(lengths)
    return dist["median"], str(dist)
def decorator_median_score(values, start, end, contig):
    """Return the median of *values* and a summary of their distribution."""
    dist = Stats.DistributionalParameters(values)
    return dist["median"], str(dist)
def decorator_max_score(values, start, end, contig):
    """compute maximum of values."""
    # NOTE: docstring previously said "minumum"; the code returns the maximum.
    d = Stats.DistributionalParameters(values)
    return d['max'], str(d)
def decorator_percent_coverage(intervals, start, end, contig, fasta):
    """Return percent of [start, end) covered by *intervals*, plus a summary.

    Coverage is the summed interval length relative to the window size.
    NOTE(review): a zero-length window (end == start) would divide by
    zero — presumably callers never pass one; confirm upstream.
    """
    covered = Stats.DistributionalParameters(
        [x[1] - x[0] for x in intervals])
    percent = 100.0 * float(covered["sum"]) / (end - start)
    return percent, str(covered)
def decorator_median_length(intervals, start, end, contig, fasta):
    """Return the median interval length and a summary of the distribution.

    Uses a list comprehension instead of ``map``/``lambda`` so the result
    is a list under both Python 2 and 3, and for consistency with the
    identical implementation elsewhere in this file.
    """
    d = Stats.DistributionalParameters([x[1] - x[0] for x in intervals])
    return d['median'], str(d)
def decorator_stddev_score(values, start, end, contig):
    """Return the standard deviation of *values* and a distribution summary."""
    params = Stats.DistributionalParameters(values)
    return params["stddev"], str(params)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads BLAT/psl alignments from stdin, maps each match onto a set of
    indexed wiggle files and writes per-query distributional statistics
    of the wiggle values that fall on aligned positions.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle_stats.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")
    parser.add_option("--wiggle-files", dest="wiggle_files", type="string",
                      help="glob expression for wiggle files [%default].")
    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix to add to contig names before lookup [%default].")
    parser.add_option("-z", "--from-zipped", dest="from_zipped", action="store_true",
                      help="input is zipped.")
    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing [%default].")
    parser.add_option("--with-values", dest="with_values", action="store_true",
                      help="output values in last column [%default].")

    parser.set_defaults(wiggle_files="*.data.bz2",
                        from_zipped=False,
                        prefix="",
                        with_values=False,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    # open indexed access to wiggles
    wiggle_files = glob.glob(options.wiggle_files)
    if not wiggle_files:
        raise IOError("could not find wiggle files with '%s'" %
                      options.wiggle_files)

    index = Wiggle.WiggleMultiIndexedAccess(wiggle_files,
                                            keep_open=True,
                                            use_cache=False)

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    options.stdout.write(
        "query\tnali\t%s" %
        ("\t".join(Stats.DistributionalParameters().getHeaders())))
    if options.with_values:
        options.stdout.write("\tvalues")
    options.stdout.write("\n")

    # NOTE: iterator.next() is Python 2 iterator protocol.
    while 1:

        if options.test and ninput >= options.test:
            break

        match = iterator.next()
        if match is None:
            break

        ninput += 1

        if options.loglevel >= 2:
            options.stdlog.write(str(match) + "\n")

        # psl always matches on the forward strand
        map_genome2query = alignlib_lite.py_makeAlignmentBlocks()
        f = alignlib_lite.py_AlignmentFormatBlat(
            "%i\t%i\t%i\t%i\t%s\t%s\t%s\n" % (
                match.mSbjctFrom, match.mSbjctTo,
                match.mQueryFrom, match.mQueryTo,
                match.mSbjctBlockStarts,
                match.mQueryBlockStarts,
                match.mBlockSizes))
        f.copy(map_genome2query)

        data = index.get(options.prefix + match.mSbjctId,
                         match.mSbjctFrom, match.mSbjctTo)

        # collect wiggle values only for genome positions that map onto
        # the query; x walks along the genome within each value block
        values = []
        for x, vv in data:
            for v in vv:
                if map_genome2query.mapRowToCol(x) >= 0:
                    values.append(v)
                x += 1

        if len(values) == 0:
            nskipped += 1
            continue

        noutput += 1

        if options.loglevel >= 2:
            options.stdlog.write(
                "# %s\n" % ",".join(["%5.3f" % v for v in values]))

        s = Stats.DistributionalParameters(values)
        options.stdout.write("%s\t%i\t%s" % (match.mQueryId,
                                             match.mNMismatches +
                                             match.mNMatches,
                                             str(s)))

        if options.with_values:
            options.stdout.write(
                "\t%s" % ",".join(["%5.3f" % v for v in values]))

        options.stdout.write("\n")

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" %
            (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads Newick trees from stdin and, for the "branchlengths" method,
    writes per-tree distributional statistics of branch lengths.
    """
    # identity comparison is the correct idiom (was "argv == None")
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2stats.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("branchlengths", ),
                      help="methods to apply.")

    parser.set_defaults(
        methods=[],
        filtered_branch_length=-999,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    nexus = TreeTools.Newick2Nexus(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" %
                             len(nexus.trees))

    ninput = len(nexus.trees)
    nskipped = 0

    for method in options.methods:

        outfile = options.stdout

        if method == "branchlengths":

            outfile.write(
                "tree\t%s\n" %
                "\t".join(Stats.DistributionalParameters().getHeaders()))

            for tree in nexus.trees:
                branchlengths = []
                for node in tree.chain.values():
                    # ignore branch length of root if it is zero
                    if not node.prev and node.data.branchlength == 0:
                        continue
                    # skip branches flagged with the sentinel length
                    if node.data.branchlength == options.filtered_branch_length:
                        continue
                    branchlengths.append(node.data.branchlength)
                s = Stats.DistributionalParameters(branchlengths)
                outfile.write("%s\t%s\n" % (tree.name, str(s)))

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, nskipped=%i\n" %
                             (ninput, nskipped))

    E.Stop()
def getHeader(self):
    """Return the tab-separated header line for the per-window output."""
    fixed = ["id", "contig", "start", "end", "size", "nmatches", "ncovered"]
    stats_cols = list(Stats.DistributionalParameters().getHeaders())
    return "\t".join(fixed + stats_cols)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a CSV table from stdin and, per schema, either compares a set
    of "positive" identifier rows against the background distribution of
    the remaining rows (zscore/diff/reldiff/... output fields), or simply
    dumps a ratio-filtered subset of values.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/analyze_ribosomes.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--schemas", dest="schemas", type="string",
                      help="schemas in the set.")
    parser.add_option("-e", "--field-extract", dest="field_extract", type="string",
                      help="pattern for the field to extract.")
    parser.add_option("-c", "--field-compare", dest="field_compare", type="string",
                      help="pattern for the field to compare.")
    parser.add_option("-i", "--filename-identifiers", dest="filename_identifiers", type="string",
                      help="identifiers in the positive set.")
    parser.add_option("-u", "--filename-subset", dest="filename_subset", type="string",
                      help="subset in the positive set.")
    parser.add_option("--filter-min-ratio", dest="filter_min_ratio", type="float",
                      help="minimum boundary for filter.")
    parser.add_option("--filter-max-ratio", dest="filter_max_ratio", type="float",
                      help="maximum boundary for filter.")
    parser.add_option(
        "-o", "--output-fields", dest="output_fields", type="string",
        help="output fields, choices are: zscore, val, nvals, sum, min, max, stddev, mean, median.")
    parser.add_option(
        "--output-pattern", dest="output_pattern", type="string",
        help="pattern for table headers, should contain %s for schema and %s for field anme.")
    parser.add_option(
        "-f", "--output-format", dest="output_format", type="choice",
        choices=("table", "list", "values"),
        help="output format. Tabular form (one row per ortholog) or list form.")
    parser.add_option("--format", dest="format", type="string",
                      help="output format for numbers.")
    parser.add_option("--remove-na", dest="remove_na", action="store_true",
                      help="remove entries with any na values.")

    parser.set_defaults(
        field_extract="%s_length",
        field_compare="%s_length",
        filename_identifiers=None,
        filename_subset=None,
        filter_min_ratio=0.00,
        filter_max_ratio=0.00,
        schemas="",
        output_fields="",
        output_pattern="%s_%s",
        output_format="table",
        format="%6.4f",
        remove_na=False,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    options.schemas = options.schemas.split(",")
    if not options.schemas:
        # NOTE(review): raising a string literal is a bug (TypeError on
        # modern Python) — should be e.g. ValueError; left unchanged here.
        raise "please supply schemas."

    if options.output_fields:
        options.output_fields = options.output_fields.split(",")
    else:
        options.output_fields = ()

    fields, table = CSV.ReadTable(sys.stdin)

    # map each column name to its index
    map_fields2column = {}
    for x in fields:
        map_fields2column[x] = len(map_fields2column)

    if options.loglevel >= 1:
        options.stdlog.write("# read a %i x %i table.\n" %
                             (len(table), len(fields)))

    if options.filename_subset:
        subset, nerrors = IOTools.ReadList(open(options.filename_subset, "r"))
        subset = set(subset)

        # keep only rows whose first column is in the subset
        table = filter(lambda x: x[0] in subset, table)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# subset of %i entries reduced table to a %i x %i table.\n" %
                (len(subset), len(table), len(fields)))

    if options.filename_identifiers:
        identifiers, nerrors = IOTools.ReadList(
            open(options.filename_identifiers, "r"))
    else:
        identifiers = []
    identifiers = set(identifiers)

    # extract rows with positive identifiers
    positive_rows = filter(lambda x: x[0] in identifiers, table)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# subset of %i identifiers gives %i positive entries.\n" %
            (len(identifiers), len(positive_rows)))

    # write the header line
    if options.output_format == "table":
        options.stdout.write("id")
        for schema in options.schemas:
            if options.output_fields:
                for field in options.output_fields:
                    options.stdout.write("\t" + options.output_pattern %
                                         (schema, field))
            else:
                options.stdout.write("\t%s" % (schema))
        options.stdout.write("\n")
    else:
        options.stdout.write("schema\tvalue\n")

    if identifiers:
        # compare each positive row against the background distribution
        for row in positive_rows:

            if options.output_format == "table":
                options.stdout.write(row[0])

            for schema in options.schemas:

                # set fields for extraction
                f_extract = map_fields2column[options.field_extract % schema]
                f_compare = map_fields2column[options.field_compare % schema]

                # get region for extraction
                if row[f_compare] != "na":
                    r = float(row[f_compare])
                    if options.filter_min_ratio or options.filter_max_ratio:
                        mi = r * options.filter_min_ratio
                        ma = r * options.filter_max_ratio
                        # NOTE(review): f is a late-binding closure over
                        # mi/ma/f_compare/f_extract; safe here because it is
                        # consumed before the next loop iteration rebinds them.
                        f = lambda x: x[f_compare] != "na" and float(
                            x[f_compare]) >= mi and float(x[f_compare]) <= ma and x[
                            0] not in identifiers and x[f_extract] != "na"
                    else:
                        f = lambda x: x[0] not in identifiers and x[
                            f_extract] != "na"

                    # extract values: filter by minimum and maximum range and remove
                    # positive identifiers.
                    v = float(row[f_extract])
                    values = map(lambda x: float(x[f_extract]),
                                 filter(f, table))
                    stats = Stats.DistributionalParameters(values)
                else:
                    v = None

                for field in options.output_fields:
                    # note: the name f is reused here for the formatted value
                    if v is not None:
                        if field == "zscore":
                            f = options.format % stats.getZScore(v)
                        elif field == "diff":
                            f = options.format % (v - stats["mean"])
                        elif field == "reldiff":
                            f = options.format % (
                                (v - stats["mean"]) / stats["mean"])
                        elif field == "val":
                            f = options.format % v
                        else:
                            f = options.format % stats[field]
                    else:
                        f = "na"

                    if options.output_format == "table":
                        options.stdout.write("\t%s" % f)
                    elif options.output_format == "list":
                        options.stdout.write("%s\t%s\n" % (schema, f))
                    elif options.output_format == "values":
                        options.stdout.write(
                            "%s\t%s\t%5.2f\t%s\n" %
                            (row[0], schema, v, ",".join(
                                map(lambda x: options.format % x, values))))

            if options.output_format == "table":
                options.stdout.write("\n")

    else:
        # no positive identifiers: simply dump a (filtered) subset of values
        extract_columns = []
        for schema in options.schemas:
            extract_columns.append(map_fields2column[options.field_extract %
                                                     schema])

        # simply dump a subset of values
        for row in table:

            skip = False
            if options.filter_min_ratio or options.filter_max_ratio:

                # the first schema serves as the reference value
                master = options.schemas[0]

                v = row[map_fields2column[options.field_compare % master]]

                if v == "na":
                    continue

                v = float(v)
                mi = v * options.filter_min_ratio
                ma = v * options.filter_max_ratio

                for schema in options.schemas[1:]:

                    r = row[map_fields2column[options.field_compare % schema]]

                    if r == "na":
                        if options.remove_na:
                            skip = True
                        continue

                    r = float(r)

                    if r < mi or r > ma:
                        skip = True
                        if options.loglevel >= 3:
                            # NOTE(review): compares options.format (number
                            # format) against "table" — looks like it was
                            # meant to be options.output_format; confirm.
                            if options.format == "table":
                                options.stdout.write("* ")
                            options.stdout.write("%s\t" % row[0])
                            options.stdout.write("\t".join(
                                [row[y] for y in extract_columns]))
                            options.stdout.write("\n")
                        break

            if skip:
                continue

            if options.output_format == "table":
                options.stdout.write("%s\t" % row[0])
                options.stdout.write("\t".join(
                    [row[y] for y in extract_columns]))
                options.stdout.write("\n")
            elif options.output_format == "list":
                has_na = False
                for x in range(len(options.schemas)):
                    v = row[extract_columns[x]]
                    if v == "na":
                        has_na = True
                if has_na and options.remove_na:
                    continue
                for x in range(len(options.schemas)):
                    options.stdout.write(
                        "%s\t%s\n" %
                        (options.schemas[x], row[extract_columns[x]]))

    E.Stop()
# NOTE(review): fragment of a larger main() — the enclosing function and the
# remainder of the per-chunk loop body are outside this view.
index = Wiggle.WiggleMultiIndexedAccess(wiggle_files,
                                        keep_open=True,
                                        use_cache=False)

# choose the iterator and the label of the first output column:
# genes (grouped GTF entries) or generic query chunks
if options.as_gtf:
    iterator = GTF.flat_gene_iterator(GTF.iterator(sys.stdin))
    id = "gene_id"
else:
    iterator = GTF.chunk_iterator(GTF.iterator(sys.stdin))
    id = "query"

ninput, noutput, nskipped = 0, 0, 0

options.stdout.write(
    "%s\tnali\t%s" %
    (id, "\t".join(Stats.DistributionalParameters().getHeaders())))
if options.with_values:
    options.stdout.write("\tvalues")
options.stdout.write("\n")

for gffs in iterator:

    # --test limits the number of processed chunks
    if options.test and ninput >= options.test:
        break

    ninput += 1

    if options.loglevel >= 2:
        for gff in gffs:
            options.stdlog.write(str(gff) + "\n")
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gene predictions from stdin, tries to extend each prediction to
    the nearest start/stop codon within configurable boundaries, optionally
    fills short in-frame introns, and writes the result in one of several
    output formats.
    """
    if argv == None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/predictions2transcripts.py 1841 2008-05-08 12:07:13Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-o", "--output-filename-summary", dest="output_filename_summary", type="string",
                      help="filename with summary information.")

    parser.add_option("--skip-header", dest="skip_header", action="store_true",
                      help="skip header.")

    parser.add_option("--start-codon-boundary", dest="start_codon_boundary", type="int",
                      help="maximum extension for start codon (make divisible by 3).")

    parser.add_option("--stop-codon-boundary", dest="stop_codon_boundary", type="int",
                      help="maximum extension for stop codon (make divisible by 3).")

    parser.add_option("--left-extension-mode", dest="left_extension_mode", type="choice",
                      choices=("first-start", "first-stop-backtrack"),
                      help="extension mode for 5' end.")

    parser.add_option("--fill-introns", dest="fill_introns", type="int",
                      help="fill intron if divisible by three and no stop codon up to a maximum length of #.")

    parser.add_option("--introns-max-stops", dest="introns_max_stops", type="int",
                      help="maximum number of stop codons to tolerate within an intron.")

    parser.add_option("--output-format", dest="output_format", type="choice",
                      choices=("predictions", "extensions", "filled-introns"),
                      help="output format.")

    parser.set_defaults(
        genome_file="genome",
        # NOTE(review): ("ATG") is a plain string, not a 1-tuple; membership
        # tests below therefore do substring matching — works for the single
        # 3-letter codon "ATG" but would misbehave for multiple codons.
        start_codons=("ATG"),
        stop_codons=("TAG", "TAA", "TGA"),
        start_codon_boundary=9999,
        stop_codon_boundary=9999,
        fill_introns=0,
        introns_max_stops=0,
        left_splice_signals=("GT",),
        right_splice_signals=("AG",),
        output_format="extensions",
        left_extension_mode="first-start",
        skip_header=False,
        output_filename_summary=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    # boundaries are given in nucleotides; convert to codons
    options.start_codon_boundary = int(options.start_codon_boundary / 3)
    options.stop_codon_boundary = int(options.stop_codon_boundary / 3)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        if options.output_format == "predictions":
            options.stdout.write(Prediction.Prediction().getHeader() + "\n")
        elif options.output_format == "filled-introns":
            options.stdout.write("\t".join(("prediction_id",
                                            "intron",
                                            "peptide_sequence",
                                            "genomic_sequence")) + "\n")

    if options.output_filename_summary:
        outfile_summary = open(options.output_filename_summary, "w")
        outfile_summary.write("id\ttype\tnumber\tlength\tfrom\tto\tsequence\n")
    else:
        outfile_summary = None

    for line in options.stdin:

        if line[0] == "#":
            continue

        ninput += 1
        p.Read(line)

        lsequence = fasta.getLength(p.mSbjctToken)

        # genomic region to scan: prediction plus codon boundaries,
        # clipped to the contig
        genome_from = max(0, p.mSbjctGenomeFrom - options.start_codon_boundary)
        genome_to = min(lsequence, p.mSbjctGenomeTo + options.stop_codon_boundary)

        genomic_sequence = fasta.getSequence(p.mSbjctToken, p.mSbjctStrand,
                                             genome_from, genome_to).upper()

        ########################################################################
        # Do extensions
        if options.start_codon_boundary or options.stop_codon_boundary:

            # coordinates of the original prediction within the fragment
            extension_start = p.mSbjctGenomeFrom - genome_from
            extension_stop = genome_to - p.mSbjctGenomeTo

            fragment_to = extension_start + p.mSbjctGenomeTo - p.mSbjctGenomeFrom

            lfragment = len(genomic_sequence)

            ####################################################################
            # find start codon
            start = extension_start
            found_start = False
            if options.left_extension_mode == "first-start":

                found_start, start = findCodonReverse(genomic_sequence,
                                                      start,
                                                      options.start_codons,
                                                      options.stop_codons)

            elif options.left_extension_mode == "first-stop-backtrack":

                if genomic_sequence[start:start + 3] in options.start_codons:
                    found_start = True
                else:
                    found_start, start = findCodonReverse(genomic_sequence,
                                                          start,
                                                          options.stop_codons)

                    if found_start:
                        E.info("prediction %s: stop found at %i (%i) backtracking ..." % (
                            p.mPredictionId, start, extension_start - start))

                        # bracktrack to first start codon
                        found_start = False
                        while start < extension_start:
                            start += 3
                            if genomic_sequence[start:start + 3] in options.start_codons:
                                found_start = True
                                break
                        else:
                            # while/else: no start codon on the way back,
                            # fall back to the original position
                            start = extension_start

                        if found_start:
                            E.info("start codon found at %i (%i)." % (
                                start, extension_start - start))
                        else:
                            E.info("no start codon found.")
                    else:
                        E.info("prediction %s: no stop found ... backtracking to start codon." % (
                            p.mPredictionId))

                        found_start, start = findCodonReverse(genomic_sequence,
                                                              start,
                                                              options.start_codons)
                        # NOTE(review): this message is written unconditionally,
                        # even when the preceding search succeeded — looks like a
                        # missing "if not found_start:"; confirm against history.
                        E.info("prediction %s: no start codon found." % (
                            p.mPredictionId))

            if found_start:
                start += genome_from
            else:
                start = p.mSbjctGenomeFrom

            dstart = p.mSbjctGenomeFrom - start

            ####################################################################
            # find stop codon
            # stop points to the beginning of the codon, thus the stop codon will
            # not be part of the sequence.
            stop = fragment_to
            found_stop = 0
            while stop < lfragment and \
                    genomic_sequence[stop:stop + 3] not in ("NNN", "XXX"):
                if genomic_sequence[stop:stop + 3] in options.stop_codons:
                    found_stop = 1
                    break

                stop += 3

            if found_stop:
                stop += genome_from
            else:
                stop = p.mSbjctGenomeTo

            dstop = stop - p.mSbjctGenomeTo

            ####################################################################
            # build new prediction
            map_peptide2genome = []
            if dstart:
                map_peptide2genome.append(("G", 0, dstart))
            map_peptide2genome += p.mMapPeptide2Genome
            if dstop:
                map_peptide2genome.append(("G", 0, dstop))

            E.info("prediction %s: extension: found_start=%i, found_stop=%i, left=%i, right=%i" % (
                p.mPredictionId, found_start, found_stop, dstart, dstop))

            # save results
            p.mMapPeptide2Genome = map_peptide2genome
            p.mAlignmentString = Genomics.Alignment2String(map_peptide2genome)
            p.mSbjctGenomeFrom -= dstart
            p.mSbjctGenomeTo += dstop
            p.mSbjctFrom += dstart / 3
            p.mSbjctTo += dstart / 3 + dstop / 3

            if dstart or dstop:
                if dstart:
                    left_extensions.append(dstart)
                if dstop:
                    right_extensions.append(dstop)

                nseqs_extended += 1

        # update genomic sequence because borders might have changed.
        genomic_sequence = fasta.getSequence(p.mSbjctToken,
                                             p.mSbjctStrand,
                                             p.mSbjctGenomeFrom,
                                             p.mSbjctGenomeTo).upper()

        if options.fill_introns:

            has_filled = False

            exons = Exons.Alignment2Exons(p.mMapPeptide2Genome,
                                          query_from=0,
                                          sbjct_from=0)

            new_exons = []

            last_e = exons[0]
            nintron = 0

            for e in exons[1:]:

                nintron += 1
                lintron = e.mGenomeFrom - last_e.mGenomeTo

                # skip introns that are too long or not in frame
                if lintron > options.fill_introns or (lintron) % 3 != 0:
                    E.debug("prediction %s: intron %i of size %i discarded." %
                            (p.mPredictionId,
                             nintron, lintron))

                    new_exons.append(last_e)
                    last_e = e
                    continue

                # get sequence, include also residues from split codons
                # when checking for stop codons.
                if e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0

                sequence = genomic_sequence[last_e.mGenomeTo -
                                            offset_left:e.mGenomeFrom + offset_right]

                # check for splice sites (for/else: flag stays False when
                # no signal matched)
                for signal in options.left_splice_signals:
                    if sequence[offset_left:offset_left + len(signal)] == signal:
                        left_signal = True
                        break
                else:
                    left_signal = False

                for signal in options.right_splice_signals:
                    if sequence[-(len(signal) + offset_right):-offset_right] == signal:
                        right_signal = True
                        break
                else:
                    right_signal = False

                # count stop codons and gap-containing codons in the intron
                nstops, ngaps = 0, 0
                for codon in [sequence[x:x + 3] for x in range(0, len(sequence), 3)]:
                    if codon in options.stop_codons:
                        nstops += 1
                    if "N" in codon.upper():
                        ngaps += 1

                E.debug("prediction %s: intron %i of size %i (%i-%i) (%s:%s:%i:%i): stops=%i, gaps=%i, signals=%s,%s." %
                        (p.mPredictionId,
                         nintron, lintron,
                         offset_left, offset_right,
                         p.mSbjctToken, p.mSbjctStrand,
                         p.mSbjctGenomeFrom + last_e.mGenomeTo,
                         p.mSbjctGenomeFrom + e.mGenomeFrom,
                         nstops, ngaps,
                         left_signal, right_signal))

                # too many stops/gaps: keep the intron
                if nstops + ngaps > options.introns_max_stops:
                    new_exons.append(last_e)
                    last_e = e
                    continue

                E.info("prediction %s: filling intron %i of size %i: stops=%i, gaps=%i, signals=%s,%s" %
                       (p.mPredictionId,
                        nintron, lintron,
                        nstops, ngaps,
                        left_signal, right_signal))

                # merge the two exons across the intron
                e.Merge(last_e)
                has_filled = True
                nfilled += 1
                last_e = e

                if options.output_format == "filled-introns":
                    options.stdout.write("\t".join(map(str, (p.mPredictionId,
                                                             nintron,
                                                             Genomics.TranslateDNA2Protein(
                                                                 sequence),
                                                             sequence))) + "\n")

                filled_introns.append(lintron)
                p.mNIntrons -= 1

            new_exons.append(last_e)

            if has_filled:
                nseqs_filled += 1

            Exons.UpdatePeptideCoordinates(new_exons)

            p.mMapPeptide2Genome = Exons.Exons2Alignment(new_exons)
            p.mAlignmentString = Genomics.Alignment2String(
                p.mMapPeptide2Genome)

        # build translated sequence
        p.mMapPeptide2Translation, p.mTranslation = Genomics.Alignment2PeptideAlignment( \
            p.mMapPeptide2Genome, p.mQueryFrom, 0, genomic_sequence)

        # output info
        if options.output_format == "predictions":
            options.stdout.write(str(p) + "\n")
        elif options.output_format == "extensions":
            if found_start:
                found_start = 1
            if found_stop:
                found_stop = 1
            options.stdout.write("\t".join(map(str, (p.mPredictionId,
                                                     found_start, found_stop,
                                                     dstart, dstop,
                                                     p.mTranslation,
                                                     p.mSbjctGenomeFrom, p.mSbjctGenomeTo,
                                                     p.mAlignmentString))) + "\n")

        noutput += 1
        options.stdout.flush()

    E.info("stats : %s" % "\t".join(Stats.DistributionalParameters().getHeaders()))
    E.info("left : %s" % str(Stats.DistributionalParameters(left_extensions)))
    E.info("right : %s" % str(Stats.DistributionalParameters(right_extensions)))
    E.info("introns: %s" % str(Stats.DistributionalParameters(filled_introns)))
    E.info("ninput=%i, noutput=%i, nextended=%i, nfilled=%i, nexons_filled=%i" % (
        ninput, noutput, nseqs_extended, nseqs_filled, nfilled))

    E.Stop()
options.verify_num_iterations, options.verify_fragment_size, quiet = True ) options.stdout.write("%s\t%i\t%i\t%i\n" % (compression, t, nerrors1, nerrors2 )) options.stdout.flush() dbfiles.append( dbfile ) ############################################################################## ############################################################################## ############################################################################## ## random sampling of data points ############################################################################## options.stdout.write("//\n") options.stdout.write( "method\tsize\t%s\tvalues\n" % ("\t".join(Stats.DistributionalParameters().getHeaders()))) options.stdout.flush() for fragment_size in options.fragment_sizes: times = [ [] for x in range(len(options.methods)+1)] for iteration in range(options.num_iterations): for x in range(len(options.methods)): if options.stdlog >= 1: options.stdlog.write("# fragment_size=%i, iteration=%i/%i, method=%s.\n" % (fragment_size, iteration, options.num_iterations,options.methods[x]) ) options.stdlog.flush() timer = timeit.Timer( stmt="benchmarkRandomFragment( fasta = fasta, size = %i)" % (fragment_size),