def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/regions2graph.py 2754 2009-09-04 16:50:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-b", "--benchmark", dest="filename_benchmark",
                      type="string", help="")
    parser.add_option("-y", "--benchmark-synonyms", dest="benchmark_synonyms",
                      type="string", help="")
    parser.add_option("-p", "--peptides", dest="filename_peptides",
                      type="string", help="")
    parser.add_option("-c", "--min-coverage-query", dest="min_coverage_query",
                      type="float", help="")
    parser.add_option("-s", "--min-score", dest="min_total_score",
                      type="float", help="")
    parser.add_option("-i", "--min-percent-identity",
                      dest="min_percent_identity", type="float", help="")
    parser.add_option("-o", "--max-percent-overlap",
                      dest="max_percent_overlap", type="float", help="")
    parser.add_option("--overlap-min-score", dest="overlap_min_score",
                      type="float", help="")
    parser.add_option("--overlap-min-coverage", dest="overlap_min_coverage",
                      type="float", help="")
    parser.add_option("--overlap-min-identity", dest="overlap_min_identity",
                      type="float", help="")
    parser.add_option("--overlap-max-coverage", dest="overlap_max_coverage",
                      type="float", help="")
    parser.add_option("-m", "--max-matches", dest="max_matches",
                      type="int", help="")
    parser.add_option("-j", "--join-regions", dest="join_regions",
                      type="int", help="")
    parser.add_option("--join-regions-max-regions",
                      dest="join_regions_max_regions", type="int", help="")
    parser.add_option("--join-regions-max-coverage",
                      dest="join_regions_max_coverage", type="float", help="")
    parser.add_option("--min-length", dest="min_length", type="int", help="")
    parser.add_option("--test", dest="test", type="int", help="")
    parser.add_option("--filter-queries", dest="filename_filter_queries",
                      type="string", help="")
    parser.add_option("--filter-regions", dest="filter_regions",
                      type="string", help="")
    parser.add_option("--conserve-memory", dest="conserve_memory",
                      action="store_true", help="")
    parser.add_option("--filter-suboptimal", dest="filter_suboptimal",
                      action="store_true", help="")

    parser.set_defaults(
        # overlap allowed for matches on genomic region
        max_percent_overlap=20,
        gop=-10.0,
        gep=-2.0,
        # thresholds for joining regions
        overlap_min_score=80,
        overlap_min_coverage=80,
        overlap_max_coverage=90,
        overlap_min_identity=50,
        # threshold for filtering bad predictions: minimum score
        min_total_score=80,
        # joining regions
        join_regions=0,
        # maximum coverage of query for predictions to be joined
        # (This is to ensure not to join duplications. A range check
        # would be better, but runs into trouble with repeats).
        join_regions_max_coverage=90,
        # minimum coverage of query
        min_coverage_query=10,
        # conserve memory
        conserve_memory=0,
        # minimum percent identity
        min_percent_identity=0,
        # minimum length
        min_length=0,
        max_matches=0,
        filename_peptides=None,
        filename_filter_queries=None,
        # turn on/off various filters
        filter_suboptimal=False,
        filter_regions=False,
        # parameters for filter of suboptimal predictions
        min_relative_coverage=0.5,
        min_relative_score=0.5,
        min_relative_percent_identity=0.5,
        # minimum difference between non-correlated conflicts to keep
        # them both.
        conflicts_min_difference=0.1,
        # benchmarking data
        benchmarks=None,
        benchmark_synonyms=None,
        filename_benchmark=None,
        filename_benchmark_synonyms=None,
        test=None,
        max_intron=50000)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    ##########################################################################
    # read filtering
    filter_queries = {}
    if options.filename_filter_queries:
        for line in open(options.filename_filter_queries, "r"):
            if line[0] == "#":
                continue
            query_token = line[:-1].split("\t")[0]
            filter_queries[query_token] = True

        if options.loglevel >= 1:
            options.stdlog.write("# filtering for %i queries.\n" %
                                 len(filter_queries))

    ##########################################################################
    # read benchmarking regions
    if options.filename_benchmark:
        options.benchmarks = ReadBenchmarkingRegions(
            open(options.filename_benchmark, "r"))
        if options.loglevel >= 1:
            options.stdlog.write(
                "# read benchmarking regions for %i tokens\n" %
                len(options.benchmarks))
            sys.stdout.flush()
        if options.filename_benchmark_synonyms:
            infile = open(options.filename_benchmark_synonyms, "r")
            options.benchmark_synonyms = {}
            for line in infile:
                if line[0] == "#":
                    continue
                value, key = line[:-1].split("\t")
                options.benchmark_synonyms[key] = value
        else:
            options.benchmark_synonyms = {}
    else:
        options.benchmarks = {}
        options.benchmark_synonyms = {}

    ##########################################################################
    # read peptide sequences
    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
    else:
        peptide_sequences = {}

    if options.conserve_memory:
        old_predictions, filename_old_predictions = tempfile.mkstemp()
        os.close(old_predictions)
        old_predictions = PredictionFile.PredictionFile()
        old_predictions.open(filename_old_predictions, "w")
    else:
        # array with final predictions
        old_predictions = []

    if options.loglevel >= 1:
        options.stdlog.write("# reading predictions.\n")
        sys.stdout.flush()

    nread = 0
    ninput = 0
    for line in sys.stdin:
        if line[0] == "#":
            continue

        entry = PredictionParser.PredictionParserEntry(expand=0)
        entry.Read(line)
        nread += 1

        # set prediction id
        if not entry.mPredictionId:
            entry.mPredictionId = nread

        # filter bad predictions right here in order to save memory:
        if entry.score < options.min_total_score:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: score below minimum: removing: %s\n" %
                    str(entry))
            continue
        elif entry.mQueryCoverage < options.min_coverage_query:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: coverage below minimum: removing: %s\n" %
                    str(entry))
            continue
        elif entry.mPercentIdentity < options.min_percent_identity:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: percent identity below minimum: removing: %s\n" %
                    str(entry))
            continue
        elif entry.mSbjctTo - entry.mSbjctFrom < options.min_length:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: length of transcript below minimum: removing: %s\n" %
                    str(entry))
            continue

        ninput += 1

        if options.test and ninput > options.test:
            break

        old_predictions.append(entry)

    if options.loglevel >= 1:
        options.stdlog.write("# predictions after input: %i\n" % ninput)
        sys.stdout.flush()

    if options.loglevel >= 10:
        options.stdlog.write(
            "############## start: predictions after input ###################################\n")
        for x in old_predictions:
            options.stdlog.write("# %s\n" % str(x))
        options.stdlog.write(
            "############## end: predictions after input #####################################\n")
        sys.stdout.flush()

    if ninput == 0:
        options.stdlog.write("# ERROR: no predictions\n")
        sys.exit(1)

    ##########################################################################
    # set up stacks of regions
    if options.conserve_memory:
        old_predictions.close()
        old_predictions.open(mode="r")

        removed_predictions, filename_removed_predictions = tempfile.mkstemp()
        os.close(removed_predictions)
        removed_predictions = PredictionFile.PredictionFile()
        removed_predictions.open(filename_removed_predictions, "w")

        new_predictions, filename_new_predictions = tempfile.mkstemp()
        os.close(new_predictions)
        new_predictions = PredictionFile.PredictionFile()
        new_predictions.open(filename_new_predictions, "w")
    else:
        removed_predictions = []
        new_predictions = []

    if options.benchmarks:
        EvaluateBenchmark(old_predictions)

    ##########################################################################
    # join regions
    if options.join_regions and options.join_regions_max_coverage:
        if options.loglevel >= 1:
            options.stdlog.write(
                "# joining regions: maximum distance between segments = %i "
                "and maximum query coverage = %i\n" %
                (options.join_regions, options.join_regions_max_coverage))
            sys.stdout.flush()

        njoined = JoinRegions(old_predictions, new_predictions)

        if options.conserve_memory:
            ExchangeStreams(old_predictions, new_predictions)
        else:
            old_predictions = new_predictions
            new_predictions = []

        if options.loglevel >= 1:
            options.stdlog.write("# predictions after joining: %i\n" %
                                 njoined)
            sys.stdout.flush()

        if options.loglevel >= 10:
            options.stdlog.write(
                "############## start: predictions after joining ###################################\n")
            for x in old_predictions:
                options.stdlog.write("# %s" % str(x))
            options.stdlog.write(
                "############## end: predictions after joining #####################################\n")
            sys.stdout.flush()
    else:
        if options.loglevel >= 1:
            options.stdlog.write("# joining regions: skipped\n")
            sys.stdout.flush()
        njoined = ninput

    ##########################################################################
    # build map of best predictions
    if options.filter_suboptimal:
        if options.loglevel >= 1:
            options.stdlog.write("# calculating best predictions\n")
            sys.stdout.flush()
        best_predictions = GetBestPredictions(old_predictions)
    else:
        best_predictions = {}

    if options.loglevel >= 1:
        options.stdlog.write("# calculated best predictions: %i\n" %
                             len(best_predictions))
        sys.stdout.flush()

    ##########################################################################
    # get regions to eliminate
    filter_regions = {}
    if options.filter_regions:
        entry = PredictionParser.PredictionParserEntry(expand=0)
        filenames = options.filter_regions.split(",")
        for filename in filenames:
            if options.loglevel >= 1:
                options.stdlog.write("# reading regions to filter from %s.\n" %
                                     (filename))
                sys.stdout.flush()

            if filename.endswith(".gz"):
                infile = gzip.open(filename, "r")
            else:
                infile = open(filename, "r")

            for line in infile:
                if line[0] == "#":
                    continue
                entry.Read(line)
                exons = Exons.Alignment2Exons(
                    Genomics.String2Alignment(entry.mAlignmentString),
                    entry.mQueryFrom, entry.mSbjctGenomeFrom)
                key = "%s-%s" % (entry.mSbjctToken, entry.mSbjctStrand)
                if key not in filter_regions:
                    filter_regions[key] = []
                for exon in exons:
                    filter_regions[key].append(
                        (exon.mGenomeFrom, exon.mGenomeTo))
            infile.close()

        for k in filter_regions.keys():
            filter_regions[k].sort()

    ##########################################################################
    # bipartite graph construction

    ##########################################################################
    # sort predictions by genomic region
    if options.conserve_memory:
        old_predictions.sort(('mSbjctToken', 'mSbjctStrand',
                              'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        old_predictions.sort(lambda x, y: cmp(
            (x.mSbjctToken, x.mSbjctStrand,
             x.mSbjctGenomeFrom, x.mSbjctGenomeTo),
            (y.mSbjctToken, y.mSbjctStrand,
             y.mSbjctGenomeFrom, y.mSbjctGenomeTo)))

    ##########################################################################
    # filter predictions and resolve conflicts based on genomic overlap
    # deleted segments are put in a temporary storage space.
    min_from, max_from = None, None
    min_to, max_to = None, None
    region_id = 0
    noverlaps = 0
    last_prediction = None
    predictions = []
    region = Region()
    nclusters = 0
    neliminated_suboptimal = 0
    neliminated_overlap = 0
    noutput, nfiltered = 0, 0

    for this_prediction in old_predictions:

        # Filter 1: skip suboptimal predictions
        if this_prediction.mQueryToken in best_predictions:

            best_prediction = best_predictions[this_prediction.mQueryToken]

            neliminated_suboptimal += 1
            if float(this_prediction.mQueryCoverage) / \
                    best_prediction.mQueryCoverage < options.min_relative_coverage:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: coverage below best: removing %s\n" %
                        str(this_prediction))
                continue

            if float(this_prediction.score) / \
                    best_prediction.score < options.min_relative_score:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: score below best: removing %s\n" %
                        str(this_prediction))
                continue

            if float(this_prediction.mPercentIdentity) / \
                    best_prediction.mPercentIdentity < options.min_relative_percent_identity:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: percent identity below best: removing %s\n" %
                        str(this_prediction))
                continue

            neliminated_suboptimal -= 1

        # Filter 2: remove predictions overlapping with certain segments
        key = "%s-%s" % (this_prediction.mSbjctToken,
                         this_prediction.mSbjctStrand)

        if key in filter_regions:
            exons = Exons.Alignment2Exons(
                Genomics.String2Alignment(this_prediction.mAlignmentString),
                this_prediction.mQueryFrom,
                this_prediction.mSbjctGenomeFrom)
            if CheckOverlap(map(lambda x: (x.mGenomeFrom, x.mGenomeTo), exons),
                            filter_regions[key]):
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: overlapping with taboo region: removing %s\n" %
                        str(this_prediction))
                neliminated_overlap += 1
                continue

        try:
            this_query_peptide, this_query_status, this_query_gene, \
                this_query_transcript = \
                re.split(r"\s+", this_prediction.mQueryToken)
        except ValueError:
            this_query_gene = None

        # process first entry
        if min_from is None:
            min_from = this_prediction.mSbjctGenomeFrom
            max_from = this_prediction.mSbjctGenomeFrom
            max_to = this_prediction.mSbjctGenomeTo
            min_to = this_prediction.mSbjctGenomeTo
            predictions.append(this_prediction)
            last_prediction = this_prediction
            continue

        overlap = min_to > this_prediction.mSbjctGenomeFrom and \
            last_prediction.mSbjctToken == this_prediction.mSbjctToken and \
            last_prediction.mSbjctStrand == this_prediction.mSbjctStrand

        if options.loglevel >= 4:
            options.stdlog.write("# from=%i, to=%i, working on: %s\n" %
                                 (min_from, max_to, str(this_prediction)))
            options.stdlog.flush()

        # resolve overlap between different genes
        if overlap:
            noverlaps += 1
        else:
            region.mSbjctToken = last_prediction.mSbjctToken
            region.mSbjctStrand = last_prediction.mSbjctStrand
            region.mSbjctGenomeFrom = min_from
            region.mSbjctGenomeTo = max_to

            region_id, nxoutput, nxfiltered = ProcessRegion(
                predictions, region_id, region,
                peptide_sequences, filter_queries)

            noutput += nxoutput
            nfiltered += nxfiltered
            nclusters += 1

            predictions = []
            min_from = this_prediction.mSbjctGenomeFrom
            max_from = this_prediction.mSbjctGenomeFrom
            min_to = this_prediction.mSbjctGenomeTo
            max_to = this_prediction.mSbjctGenomeTo

        predictions.append(this_prediction)

        min_from = min(min_from, this_prediction.mSbjctGenomeFrom)
        max_from = max(max_from, this_prediction.mSbjctGenomeFrom)
        min_to = min(min_to, this_prediction.mSbjctGenomeTo)
        max_to = max(max_to, this_prediction.mSbjctGenomeTo)

        last_prediction = this_prediction

    if last_prediction:
        region.mSbjctToken = last_prediction.mSbjctToken
        region.mSbjctStrand = last_prediction.mSbjctStrand
        region.mSbjctGenomeFrom = min_from
        region.mSbjctGenomeTo = max_to

        region_id, nxoutput, nxfiltered = ProcessRegion(
            predictions, region_id, region,
            peptide_sequences, filter_queries)

        noutput += nxoutput
        nfiltered += nxfiltered
        nclusters += 1

    if options.conserve_memory:
        os.remove(filename_old_predictions)
        os.remove(filename_new_predictions)
        os.remove(filename_removed_predictions)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# pairs: nread=%i, input=%i, joined=%i, clusters=%i, regions=%i, "
            "eliminated_subopt=%i, eliminated_overlap=%i, noutput=%i, "
            "nfiltered=%i\n" %
            (nread, ninput, njoined, nclusters, region_id,
             neliminated_suboptimal, neliminated_overlap, noutput, nfiltered))

    E.Stop()
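
# The main loop above sweeps over predictions sorted by (token, strand,
# start) and grows the current region while a new entry starts before the
# smallest end (min_to) seen so far on the same token/strand. A minimal
# self-contained sketch of that sweep on plain tuples, for illustration
# only (not part of the original script):
def cluster_predictions(entries):
    """yield regions of overlapping (token, strand, start, end) tuples.

    `entries` must be sorted by (token, strand, start), as above.
    """
    region, min_to = [], None
    for token, strand, start, end in entries:
        if region and not (region[-1][0] == token and
                           region[-1][1] == strand and
                           start < min_to):
            yield region
            region, min_to = [], None
        region.append((token, strand, start, end))
        min_to = end if min_to is None else min(min_to, end)
    if region:
        yield region

# usage sketch: for r in cluster_predictions(sorted(entries)): process(r)
# Note the stricter criterion: chaining on min_to rather than the maximum
# end keeps weakly linked predictions from being pulled into one region.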
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--trans", dest="trans",
                      help="input is translated DNA.",
                      action="store_true")

    parser.add_option("-f", "--format", dest="format",
                      help="input format.",
                      type="choice",
                      choices=("exons", "psl", "gff"))

    parser.add_option("-o", "--output-format", dest="output_format",
                      help="output format",
                      type="choice",
                      choices=('exontable', 'exons', 'predictions', 'cds',
                               'fasta'))

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option(
        "--predictions-file", dest="predictions_file", type="string",
        help="filename with predictions. Use gene structures from this file "
        "if available.")

    parser.add_option("-i", "--gff-field-id", dest="gff_field_id",
                      type="string",
                      help="field for the feature id in the gff info section.")

    parser.add_option(
        "-p", "--filename-peptides", dest="filename_peptides", type="string",
        help="filename with peptide sequences. If given, it is used to check "
        "the predicted translated sequences.")

    parser.add_option(
        "--no-realignment", dest="do_realignment", action="store_false",
        help="do not re-align entries that do not parse correctly.")

    parser.add_option(
        "--remove-unaligned", dest="remove_unaligned", action="store_true",
        help="remove entries that have not been aligned correctly.")

    parser.add_option(
        "--input-coordinates", dest="input_coordinates", type="string",
        help="specify input format for input coordinates "
        "[forward|both-zero|one-closed|open].")

    parser.set_defaults(trans=False,
                        output_format="predictions",
                        format="psl",
                        gff_field_id='id',
                        input_coordinates="both-zero-open",
                        filename_peptides=None,
                        genome_file=None,
                        do_realignment=True,
                        predictions_file=None,
                        remove_unaligned=False)

    (options, args) = E.Start(parser)

    if not options.genome_file:
        raise ValueError("please specify a genome file.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = \
        0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    converter = IndexedFasta.getConverter(options.input_coordinates)

    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions(
            IOTools.openFile(options.predictions_file, "r"))
        for p in parser:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line, ))
                except PredictionParser.AlignmentError, e:
                    print "# %s" % str(e)
                    print "#", line[:-1]
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                print str(entries)

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise ValueError("unknown format %s for output option %s" %
                             (options.format, options.output_format))

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" %
                (parser.GetNumInput(), parser.GetNumOutput(),
                 parser.GetNumErrors()))
            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
            options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n" %
                        (entry.mPredictionId, entry.mQueryToken,
                         entry.mSbjctToken, entry.mSbjctStrand,
                         ninput, len(results)))
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons just
                    # before the stop. See for example the chicken sequence
                    # ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome))
                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken))
                    nskipped += 1
                    continue

                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken,
                             entry.mSbjctStrand))
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome,
                                              0,
                                              entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = \
                    Genomics.Alignment2PeptideAlignment(
                        Genomics.String2Alignment(entry.mAlignmentString),
                        entry.mQueryFrom, 0, genomic_sequence)

                entry.score = entry.mMapPeptide2Translation.getColTo() - \
                    entry.mMapPeptide2Translation.getColFrom() + 1

                (entry.mNIntrons, entry.mNFrameShifts, entry.mNGaps,
                 entry.mNSplits, entry.mNStopCodons, entry.mNDisruptions) = \
                    Genomics.CountGeneFeatures(0,
                                               entry.mMapPeptide2Genome,
                                               genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[
                            str(entry.mPredictionId)].upper()
                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(
                            reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n" %
                                        (entry.mPredictionId,
                                         entry.mSbjctGenomeFrom,
                                         entry.mSbjctGenomeTo))
                                    options.stdlog.flush()

                                result = predictor(
                                    entry.mPredictionId, reference,
                                    entry.mSbjctToken, genomic_sequence,
                                    "--subopt FALSE --score '%s'" % str(80))
                                # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                                if result:
                                    translation = result[0].mTranslation
                                    is_identical, nmismatches = checkIdentity(
                                        reference, translation, options)
                                else:
                                    if options.loglevel >= 2:
                                        options.stdlog.write(
                                            "# %s: realignment returned empty result\n" %
                                            (entry.mPredictionId))
                                        options.stdlog.flush()
                                    is_identical = False

                                if is_identical:
                                    naligned += 1
                                    prediction_id = entry.mPredictionId
                                    sbjct_genome_from = entry.mSbjctGenomeFrom
                                    entry = result[0]
                                    entry.mPredictionId = prediction_id
                                    entry.mSbjctGenomeFrom += sbjct_genome_from
                                else:
                                    nunaligned += 1
                                    if options.loglevel >= 1:
                                        options.stdlog.write(
                                            "# %s: mismatch on %s:%s:%i-%i after realignment\n"
                                            "# reference =%s\n"
                                            "# translated=%s\n"
                                            "# realigned =%s\n" %
                                            (entry.mPredictionId,
                                             entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo,
                                             reference,
                                             entry.mTranslation,
                                             translation))
                                        options.stdlog.flush()
                                    if options.remove_unaligned:
                                        nskipped += 1
                                        continue
                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n" %
                                        (entry.mPredictionId,
                                         entry.mSbjctToken,))
                                if options.loglevel >= 3:
                                    options.stdlog.write(
                                        "# %s: mismatch before realignment\n"
                                        "# reference =%s\n"
                                        "# translated=%s\n" %
                                        (entry.mPredictionId,
                                         reference, translation))
                                    options.stdlog.flush()

                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue
                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results

        if results:
            options.stdout.write(str(results) + "\n")
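
# The validation step above translates each aligned CDS and compares it
# with the supplied peptide via checkIdentity(), which is defined elsewhere
# in this script. A hypothetical minimal comparison of the same flavour
# (tolerates a trailing stop, counts residue mismatches; not the actual
# checkIdentity used above):
def compare_translation(reference, translation):
    """return (is_identical, nmismatches) for two peptide sequences."""
    reference = reference.rstrip("*").upper()
    translation = translation.rstrip("*").upper()
    nmismatches = sum(1 for a, b in zip(reference, translation) if a != b)
    # count a length difference as additional mismatches
    nmismatches += abs(len(reference) - len(translation))
    return nmismatches == 0, nmismatches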
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/predictions2introns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o", "--output-filename-summary",
                      dest="output_filename_summary", type="string",
                      help="filename with summary information.")

    parser.add_option("--skip-header", dest="skip_header",
                      action="store_true",
                      help="skip header.")

    parser.add_option(
        "--fill-introns", dest="fill_introns", type="int",
        help="fill intron if divisible by three and no stop codon up to a "
        "maximum length of #.")

    parser.add_option(
        "--introns-max-stops", dest="introns_max_stops", type="int",
        help="maximum number of stop codons to tolerate within an intron.")

    parser.add_option("--output-format", dest="output_format",
                      type="choice",
                      choices=("predictions", "extensions", "filled-introns"),
                      help="output format.")

    parser.set_defaults(
        genome_file="genome",
        start_codons=("ATG",),
        stop_codons=("TAG", "TAA", "TGA"),
        skip_header=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        options.stdout.write("\t".join((
            "prediction_id",
            "intron",
            "contig",
            "strand",
            "start",
            "end",
            "length",
            "nstops",
            "type",
            "prime5",
            "prime3",
        )) + "\n")

    for line in sys.stdin:

        if line[0] == "#":
            continue

        ninput += 1
        p.Read(line)

        lsequence = fasta.getLength(p.mSbjctToken)

        genomic_sequence = fasta.getSequence(p.mSbjctToken, p.mSbjctStrand,
                                             p.mSbjctGenomeFrom,
                                             p.mSbjctGenomeTo).upper()

        exons = Exons.Alignment2Exons(p.mMapPeptide2Genome,
                                      query_from=0,
                                      sbjct_from=0)

        new_exons = []

        last_e = exons[0]
        nintron = 0

        for e in exons[1:]:

            nintron += 1
            lintron = e.mGenomeFrom - last_e.mGenomeTo

            intron_is_l3 = lintron % 3 != 0

            if intron_is_l3:
                # get sequence, include also residues from split codons
                # when checking for stop codons.
                # note that e.mAlignment can sometimes be empty. This might
                # be an exonerate bug. In the alignment string there are two
                # consecutive exons.
                if e.mAlignment and last_e.mAlignment and \
                        e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0

                sequence = genomic_sequence[last_e.mGenomeTo - offset_left:
                                            e.mGenomeFrom + offset_right]

                intron_nstops = 0
                for codon in [sequence[x:x + 3]
                              for x in range(0, len(sequence), 3)]:
                    if codon in options.stop_codons:
                        intron_nstops += 1
            else:
                intron_nstops = 0

            # check for splice signals
            sequence = genomic_sequence[last_e.mGenomeTo:e.mGenomeFrom]

            intron_type, prime5, prime3 = Genomics.GetIntronType(sequence)

            if options.loglevel >= 2:
                options.stdlog.write("\t".join(map(
                    str,
                    (p.mPredictionId,
                     nintron,
                     lintron,
                     intron_nstops,
                     intron_type,
                     genomic_sequence[last_e.mGenomeTo - 6:
                                      last_e.mGenomeTo].lower() +
                     "|" + sequence[:5] + "..." + sequence[-5:] + "|" +
                     genomic_sequence[e.mGenomeFrom:
                                      e.mGenomeFrom + 6].lower()))) + "\n")

            options.stdout.write("\t".join(map(
                str,
                (p.mPredictionId,
                 nintron,
                 p.mSbjctToken,
                 p.mSbjctStrand,
                 last_e.mGenomeTo + p.mSbjctGenomeFrom,
                 e.mGenomeFrom + p.mSbjctGenomeFrom,
                 lintron,
                 intron_nstops,
                 intron_type,
                 prime5,
                 prime3))) + "\n")

            last_e = e

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i.\n" %
                             (ninput, noutput))

    E.Stop()
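
# Genomics.GetIntronType (used above) returns a 3-tuple of intron type and
# the 5'/3' terminal dinucleotides. A minimal sketch of such a classifier,
# assuming the conventional GT..AG / GC..AG / AT..AC splice-site classes
# (the library's actual class names and return values may differ):
def get_intron_type(sequence):
    """return (type, prime5, prime3) for an intron sequence."""
    prime5, prime3 = sequence[:2].upper(), sequence[-2:].upper()
    for name, left, right in (("gt-ag", "GT", "AG"),
                              ("gc-ag", "GC", "AG"),
                              ("at-ac", "AT", "AC")):
        if (prime5, prime3) == (left, right):
            return name, prime5, prime3
    # anything else is reported as unclassified
    return "unknown", prime5, prime3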
                          e.mGenomeFrom, e.mGenomeTo)))

    elif options.output_format == "exons":

        if options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise ValueError("unknown format %s." % options.format)

        results = parser.Parse(sys.stdin.readlines())

        id = 0
        for entry in results:
            exons = Exons.Alignment2Exons(
                entry.mMapPeptide2Genome,
                entry.mQueryFrom,
                entry.mSbjctGenomeFrom,
            )
            for e in exons:
                id += 1
                print "\t".join(
                    map(str, (entry.mQueryToken,
                              entry.mSbjctToken,
                              entry.mSbjctStrand,
                              e.frame,
                              e.mRank,
                              e.mPeptideFrom,
                              e.mPeptideTo,
                              e.mGenomeFrom,
                              e.mGenomeTo)))

    elif options.output_format == "cds":

        if options.format == "exons":
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/predictions2cds.py 1858 2008-05-13 15:07:05Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o", "--forward-coordinates",
                      dest="forward_coordinates",
                      action="store_true",
                      help="input uses forward coordinates.")

    parser.add_option("-f", "--format", dest="format",
                      type="choice",
                      choices=("default", "cds", "cdnas", "map", "gff",
                               "intron-fasta", "exons"),
                      help="output format.")

    parser.add_option("-r", "--reset-to-start", dest="reset_to_start",
                      action="store_true",
                      help="move genomic coordinates to begin from 0.")

    parser.add_option("--reset-query", dest="reset_query",
                      action="store_true",
                      help="move peptide coordinates to begin from 0.")

    parser.set_defaults(genome_file=None,
                        forward_coordinates=False,
                        format="default",
                        reset_to_start=False,
                        reset_query=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    cds_id = 1

    entry = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput, nskipped, nerrors = 0, 0, 0, 0

    for line in sys.stdin:

        if line[0] == "#":
            continue
        if line.startswith("id"):
            continue

        ninput += 1

        try:
            entry.Read(line)
        except ValueError, msg:
            options.stdlog.write("# parsing failed with msg %s in line %s" %
                                 (msg, line))
            nerrors += 1
            continue

        cds = Exons.Alignment2Exons(entry.mMapPeptide2Genome,
                                    query_from=entry.mQueryFrom,
                                    sbjct_from=entry.mSbjctGenomeFrom,
                                    add_stop_codon=0)

        for cd in cds:
            cd.mSbjctToken = entry.mSbjctToken
            cd.mSbjctStrand = entry.mSbjctStrand

        if cds[-1].mGenomeTo != entry.mSbjctGenomeTo:
            options.stdlog.write(
                "# WARNING: discrepancy in exon calculation!!!\n")
            for cd in cds:
                options.stdlog.write("# %s\n" % str(cd))
            options.stdlog.write("# %s\n" % entry)

        lsequence = fasta.getLength(entry.mSbjctToken)
        genomic_sequence = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        # deal with forward coordinates: convert them to negative strand
        # coordinates
        if options.forward_coordinates and entry.mSbjctStrand == "-":
            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = \
                lsequence - entry.mSbjctGenomeTo, \
                lsequence - entry.mSbjctGenomeFrom
            for cd in cds:
                cd.InvertGenomicCoordinates(lsequence)

        # attach sequence to cds
        for cd in cds:
            start = cd.mGenomeFrom - entry.mSbjctGenomeFrom
            end = cd.mGenomeTo - entry.mSbjctGenomeFrom
            cd.mSequence = genomic_sequence[start:end]

        # reset coordinates for query
        if options.reset_to_start:
            offset = entry.mPeptideFrom
            for cd in cds:
                cd.mPeptideFrom -= offset
                cd.mPeptideTo -= offset

        # play with coordinates
        if options.reset_to_start:
            offset = entry.mSbjctGenomeFrom
            for cd in cds:
                cd.mGenomeFrom -= offset
                cd.mGenomeTo -= offset
        else:
            offset = 0

        if options.format == "cds":
            rank = 0
            for cd in cds:
                rank += 1
                cd.mQueryToken = entry.mQueryToken
                cd.mSbjctToken = entry.mSbjctToken
                cd.mSbjctStrand = entry.mSbjctStrand
                cd.mRank = rank
                print str(cd)

        elif options.format == "exons":
            rank = 0
            for cd in cds:
                rank += 1
                options.stdout.write("\t".join(
                    map(str, (entry.mPredictionId,
                              cd.mSbjctToken,
                              cd.mSbjctStrand,
                              rank,
                              cd.frame,
                              cd.mPeptideFrom,
                              cd.mPeptideTo,
                              cd.mGenomeFrom,
                              cd.mGenomeTo))) + "\n")

        elif options.format == "cdnas":
            print string.join(
                map(str, (entry.mPredictionId,
                          entry.mQueryToken,
                          entry.mSbjctToken,
                          entry.mSbjctStrand,
                          entry.mSbjctGenomeFrom - offset,
                          entry.mSbjctGenomeTo - offset,
                          genomic_sequence)), "\t")

        elif options.format == "map":

            map_prediction2genome = alignlib_lite.makeAlignmentSet()

            for cd in cds:
                alignlib_lite.addDiagonal2Alignment(
                    map_prediction2genome,
                    cd.mPeptideFrom + 1,
                    cd.mPeptideTo,
                    (cd.mGenomeFrom - offset) - cd.mPeptideFrom)

            print string.join(
                map(str, (entry.mPredictionId,
                          entry.mSbjctToken,
                          entry.mSbjctStrand,
                          alignlib_lite.AlignmentFormatEmissions(
                              map_prediction2genome))), "\t")

        elif options.format == "intron-fasta":
            rank = 0
            if len(cds) == 1:
                nskipped += 1
                continue

            last = cds[0].mGenomeTo
            for cd in cds[1:]:
                rank += 1
                key = "%s %i %s:%s:%i:%i" % (
                    entry.mPredictionId, rank, entry.mSbjctToken,
                    entry.mSbjctStrand, last, entry.mSbjctGenomeFrom)
                sequence = genomic_sequence[
                    last - entry.mSbjctGenomeFrom:
                    cd.mGenomeFrom - entry.mSbjctGenomeFrom]
                options.stdout.write(">%s\n%s\n" % (key, sequence))
                last = cd.mGenomeTo

        elif options.format == "gff-match":
            print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Introns %i; Frameshifts %i; Stops %i" % \
                (entry.mSbjctToken, "gpipe", "similarity",
                 entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo,
                 entry.mPercentIdentity, entry.mSbjctStrand, ".",
                 entry.mQueryToken, entry.mQueryFrom, entry.mQueryTo,
                 entry.score, entry.mNIntrons, entry.mNFrameShifts,
                 entry.mNStopCodons)

        elif options.format == "gff-exon":
            rank = 0
            for cd in cds:
                rank += 1
                print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Rank %i/%i; Prediction %i" % \
                    (entry.mSbjctToken, "gpipe", "similarity",
                     cd.mGenomeFrom, cd.mGenomeTo,
                     entry.mPercentIdentity, entry.mSbjctStrand, ".",
                     entry.mQueryToken,
                     cd.mPeptideFrom / 3 + 1, cd.mPeptideTo / 3 + 1,
                     entry.score, rank, len(cds), entry.mPredictionId)

        else:
            exon_from = 0
            for cd in cds:
                cd.mPeptideFrom = exon_from
                exon_from += cd.mGenomeTo - cd.mGenomeFrom
                cd.mPeptideTo = exon_from
                print string.join(
                    map(str, (cds_id,
                              entry.mPredictionId,
                              cd.mPeptideFrom,
                              cd.mPeptideTo,
                              cd.frame,
                              cd.mGenomeFrom,
                              cd.mGenomeTo,
                              cd.mSequence)), "\t")
                cds_id += 1

        noutput += 1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/predictions2transcripts.py 1841 2008-05-08 12:07:13Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o", "--output-filename-summary",
                      dest="output_filename_summary", type="string",
                      help="filename with summary information.")

    parser.add_option("--skip-header", dest="skip_header",
                      action="store_true",
                      help="skip header.")

    parser.add_option(
        "--start-codon-boundary", dest="start_codon_boundary", type="int",
        help="maximum extension for start codon (make divisible by 3).")

    parser.add_option(
        "--stop-codon-boundary", dest="stop_codon_boundary", type="int",
        help="maximum extension for stop codon (make divisible by 3).")

    parser.add_option(
        "--left-extension-mode", dest="left_extension_mode", type="choice",
        choices=("first-start", "first-stop-backtrack"),
        help="extension mode for 5' end.")

    parser.add_option(
        "--fill-introns", dest="fill_introns", type="int",
        help="fill intron if divisible by three and no stop codon up to a "
        "maximum length of #.")

    parser.add_option(
        "--introns-max-stops", dest="introns_max_stops", type="int",
        help="maximum number of stop codons to tolerate within an intron.")

    parser.add_option(
        "--output-format", dest="output_format", type="choice",
        choices=("predictions", "extensions", "filled-introns"),
        help="output format.")

    parser.set_defaults(
        genome_file="genome",
        start_codons=("ATG",),
        stop_codons=("TAG", "TAA", "TGA"),
        start_codon_boundary=9999,
        stop_codon_boundary=9999,
        fill_introns=0,
        introns_max_stops=0,
        left_splice_signals=("GT",),
        right_splice_signals=("AG",),
        output_format="extensions",
        left_extension_mode="first-start",
        skip_header=False,
        output_filename_summary=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    options.start_codon_boundary = int(options.start_codon_boundary / 3)
    options.stop_codon_boundary = int(options.stop_codon_boundary / 3)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        if options.output_format == "predictions":
            options.stdout.write(Prediction.Prediction().getHeader() + "\n")
        elif options.output_format == "filled-introns":
            options.stdout.write("\t".join(("prediction_id",
                                            "intron",
                                            "peptide_sequence",
                                            "genomic_sequence")) + "\n")

    if options.output_filename_summary:
        outfile_summary = open(options.output_filename_summary, "w")
        outfile_summary.write("id\ttype\tnumber\tlength\tfrom\tto\tsequence\n")
    else:
        outfile_summary = None

    for line in options.stdin:

        if line[0] == "#":
            continue

        ninput += 1
        p.Read(line)

        lsequence = fasta.getLength(p.mSbjctToken)

        genome_from = max(0, p.mSbjctGenomeFrom - options.start_codon_boundary)
        genome_to = min(lsequence,
                        p.mSbjctGenomeTo + options.stop_codon_boundary)

        genomic_sequence = fasta.getSequence(p.mSbjctToken, p.mSbjctStrand,
                                             genome_from,
                                             genome_to).upper()

        ######################################################################
        # do extensions
        if options.start_codon_boundary or options.stop_codon_boundary:

            extension_start = p.mSbjctGenomeFrom - genome_from
            extension_stop = genome_to - p.mSbjctGenomeTo

            fragment_to = extension_start + \
                p.mSbjctGenomeTo - p.mSbjctGenomeFrom

            lfragment = len(genomic_sequence)

            ##################################################################
            # find start codon
            start = extension_start
            found_start = False
            if options.left_extension_mode == "first-start":

                found_start, start = findCodonReverse(
                    genomic_sequence,
                    start,
                    options.start_codons,
                    options.stop_codons)

            elif options.left_extension_mode == "first-stop-backtrack":

                if genomic_sequence[start:start + 3] in options.start_codons:
                    found_start = True
                else:
                    found_start, start = findCodonReverse(
                        genomic_sequence,
                        start,
                        options.stop_codons)

                    if found_start:
                        E.info("prediction %s: stop found at %i (%i) "
                               "backtracking ..." %
                               (p.mPredictionId, start,
                                extension_start - start))

                        # backtrack to first start codon
                        found_start = False
                        while start < extension_start:
                            start += 3
                            if genomic_sequence[start:start + 3] in \
                                    options.start_codons:
                                found_start = True
                                break
                        else:
                            start = extension_start

                        if found_start:
                            E.info("start codon found at %i (%i)." %
                                   (start, extension_start - start))
                        else:
                            E.info("no start codon found.")
                    else:
                        E.info("prediction %s: no stop found ... "
                               "backtracking to start codon." %
                               (p.mPredictionId))

                        found_start, start = findCodonReverse(
                            genomic_sequence,
                            start,
                            options.start_codons)

                        if not found_start:
                            E.info("prediction %s: no start codon found." %
                                   (p.mPredictionId))

            if found_start:
                start += genome_from
            else:
                start = p.mSbjctGenomeFrom

            dstart = p.mSbjctGenomeFrom - start

            ##################################################################
            # find stop codon
            # stop points to the beginning of the codon, thus the stop codon
            # will not be part of the sequence.
            stop = fragment_to
            found_stop = 0
            while stop < lfragment and \
                    genomic_sequence[stop:stop + 3] not in ("NNN", "XXX"):
                if genomic_sequence[stop:stop + 3] in options.stop_codons:
                    found_stop = 1
                    break
                stop += 3

            if found_stop:
                stop += genome_from
            else:
                stop = p.mSbjctGenomeTo

            dstop = stop - p.mSbjctGenomeTo

            ##################################################################
            # build new prediction
            map_peptide2genome = []
            if dstart:
                map_peptide2genome.append(("G", 0, dstart))
            map_peptide2genome += p.mMapPeptide2Genome
            if dstop:
                map_peptide2genome.append(("G", 0, dstop))

            E.info("prediction %s: extension: found_start=%i, found_stop=%i, "
                   "left=%i, right=%i" %
                   (p.mPredictionId, found_start, found_stop, dstart, dstop))

            # save results
            p.mMapPeptide2Genome = map_peptide2genome
            p.mAlignmentString = Genomics.Alignment2String(map_peptide2genome)
            p.mSbjctGenomeFrom -= dstart
            p.mSbjctGenomeTo += dstop
            p.mSbjctFrom += dstart / 3
            p.mSbjctTo += dstart / 3 + dstop / 3

            if dstart or dstop:
                if dstart:
                    left_extensions.append(dstart)
                if dstop:
                    right_extensions.append(dstop)
                nseqs_extended += 1

        # update genomic sequence because borders might have changed.
        genomic_sequence = fasta.getSequence(p.mSbjctToken,
                                             p.mSbjctStrand,
                                             p.mSbjctGenomeFrom,
                                             p.mSbjctGenomeTo).upper()

        if options.fill_introns:

            has_filled = False

            exons = Exons.Alignment2Exons(p.mMapPeptide2Genome,
                                          query_from=0,
                                          sbjct_from=0)

            new_exons = []

            last_e = exons[0]
            nintron = 0

            for e in exons[1:]:

                nintron += 1
                lintron = e.mGenomeFrom - last_e.mGenomeTo

                if lintron > options.fill_introns or lintron % 3 != 0:
                    E.debug("prediction %s: intron %i of size %i discarded." %
                            (p.mPredictionId, nintron, lintron))
                    new_exons.append(last_e)
                    last_e = e
                    continue

                # get sequence, include also residues from split codons
                # when checking for stop codons.
                if e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0

                sequence = genomic_sequence[last_e.mGenomeTo - offset_left:
                                            e.mGenomeFrom + offset_right]

                # check for splice sites
                for signal in options.left_splice_signals:
                    if sequence[offset_left:
                                offset_left + len(signal)] == signal:
                        left_signal = True
                        break
                else:
                    left_signal = False

                for signal in options.right_splice_signals:
                    # use an explicit end index: a slice ending at
                    # -offset_right would be empty when offset_right is 0.
                    end = len(sequence) - offset_right
                    if sequence[end - len(signal):end] == signal:
                        right_signal = True
                        break
                else:
                    right_signal = False

                nstops, ngaps = 0, 0
                for codon in [sequence[x:x + 3]
                              for x in range(0, len(sequence), 3)]:
                    if codon in options.stop_codons:
                        nstops += 1
                    if "N" in codon.upper():
                        ngaps += 1

                E.debug("prediction %s: intron %i of size %i (%i-%i) "
                        "(%s:%s:%i:%i): stops=%i, gaps=%i, signals=%s,%s." %
                        (p.mPredictionId, nintron, lintron,
                         offset_left, offset_right,
                         p.mSbjctToken, p.mSbjctStrand,
                         p.mSbjctGenomeFrom + last_e.mGenomeTo,
                         p.mSbjctGenomeFrom + e.mGenomeFrom,
                         nstops, ngaps,
                         left_signal, right_signal))

                if nstops + ngaps > options.introns_max_stops:
                    new_exons.append(last_e)
                    last_e = e
                    continue

                E.info("prediction %s: filling intron %i of size %i: "
                       "stops=%i, gaps=%i, signals=%s,%s" %
                       (p.mPredictionId, nintron, lintron,
                        nstops, ngaps, left_signal, right_signal))

                e.Merge(last_e)
                has_filled = True
                nfilled += 1
                last_e = e

                if options.output_format == "filled-introns":
                    options.stdout.write("\t".join(map(
                        str,
                        (p.mPredictionId,
                         nintron,
                         Genomics.TranslateDNA2Protein(sequence),
                         sequence))) + "\n")

                filled_introns.append(lintron)
                p.mNIntrons -= 1

            new_exons.append(last_e)

            if has_filled:
                nseqs_filled += 1

            Exons.UpdatePeptideCoordinates(new_exons)

            p.mMapPeptide2Genome = Exons.Exons2Alignment(new_exons)
            p.mAlignmentString = Genomics.Alignment2String(
                p.mMapPeptide2Genome)

        # build translated sequence
        p.mMapPeptide2Translation, p.mTranslation = \
            Genomics.Alignment2PeptideAlignment(
                p.mMapPeptide2Genome, p.mQueryFrom, 0, genomic_sequence)

        # output info
        if options.output_format == "predictions":
            options.stdout.write(str(p) + "\n")
        elif options.output_format == "extensions":
            if found_start:
                found_start = 1
            if found_stop:
                found_stop = 1
            options.stdout.write("\t".join(map(
                str,
                (p.mPredictionId,
                 found_start, found_stop,
                 dstart, dstop,
                 p.mTranslation,
                 p.mSbjctGenomeFrom, p.mSbjctGenomeTo,
                 p.mAlignmentString))) + "\n")

        noutput += 1
        options.stdout.flush()

    E.info("stats : %s" %
           "\t".join(Stats.DistributionalParameters().getHeaders()))
    E.info("left : %s" % str(Stats.DistributionalParameters(left_extensions)))
    E.info("right : %s" %
           str(Stats.DistributionalParameters(right_extensions)))
    E.info("introns: %s" %
           str(Stats.DistributionalParameters(filled_introns)))
    E.info("ninput=%i, noutput=%i, nextended=%i, nfilled=%i, nexons_filled=%i" %
           (ninput, noutput, nseqs_extended, nseqs_filled, nfilled))

    E.Stop()
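
# The 3' extension above walks downstream of the prediction in codon steps
# until it hits a stop codon or an N/X run. A self-contained sketch of that
# scan (illustration only; findCodonReverse, the 5' counterpart used above,
# is defined elsewhere in this script):
def find_stop_forward(sequence, start, stop_codons=("TAG", "TAA", "TGA")):
    """return (found, position) of the first in-frame stop at or after start.

    Stops scanning at masked codons ("NNN"/"XXX"), mirroring the loop above.
    """
    x = start
    while x < len(sequence) and sequence[x:x + 3] not in ("NNN", "XXX"):
        if sequence[x:x + 3] in stop_codons:
            return True, x
        x += 3
    return False, start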
def ResolveExonOverlaps(gene_id, predictions):
    """resolve overlaps between predictions based on exonic overlap."""

    all_exons = []
    n = 1

    if len(predictions) == 0:
        return gene_id

    for p in predictions:
        exons = Exons.Alignment2Exons(
            Genomics.String2Alignment(p.mAlignmentString),
            query_from=0,
            sbjct_from=p.mSbjctGenomeFrom)
        if exons:
            for exon in exons:
                all_exons.append((exon.mGenomeFrom, exon.mGenomeTo, n))
        else:
            # no exons could be parsed: fall back to the full prediction span
            all_exons.append((p.mSbjctGenomeFrom, p.mSbjctGenomeTo, n))
        n += 1

    map_prediction2gene = range(0, len(predictions) + 1)
    map_gene2predictions = [None]
    for x in range(1, len(predictions) + 1):
        map_gene2predictions.append([x])

    all_exons.sort()
    # print all_exons

    # cluster exons by overlap
    last_exon_from, last_exon_to, last_p = all_exons[0]

    for exon_from, exon_to, p in all_exons[1:]:

        # if overlap
        overlap = min(exon_to, last_exon_to) - max(exon_from, last_exon_from)
        if overlap and param_exon_identity:
            overlap = (exon_from == last_exon_from) and \
                (exon_to == last_exon_to)

        if overlap > 0:
            # print "# overlap between %i and %i" % (p, last_p)
            # rewire pointers to point to gene of previous prediction
            # if they belong to different genes
            new_g = map_prediction2gene[last_p]
            old_g = map_prediction2gene[p]
            if new_g != old_g:
                for x in map_gene2predictions[old_g]:
                    map_gene2predictions[new_g].append(x)
                    map_prediction2gene[x] = new_g
                map_gene2predictions[old_g] = []
        # if no overlap: create new gene, if the prediction has no gene
        # associated with it yet.
        else:
            # print "# no overlap between %i and %i" % (p, last_p)
            if not map_prediction2gene[p]:
                map_prediction2gene[p] = len(map_gene2predictions)
                map_gene2predictions.append([p])

        if param_exon_identity:
            last_exon_to = exon_to
            last_exon_from = exon_from
        else:
            last_exon_to = max(last_exon_to, exon_to)

        last_p = p

    for x in range(1, len(map_gene2predictions)):
        if map_gene2predictions[x]:
            for p in map_gene2predictions[x]:
                print "%i\t%i" % (gene_id, predictions[p - 1].mPredictionId)
            gene_id += 1

    return gene_id
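
# ResolveExonOverlaps assigns predictions to genes by rewiring
# prediction->gene pointers whenever two exons overlap. A tiny
# self-contained demonstration of that pointer-merging step on plain
# integer ids (illustration only, not part of the original script):
def merge_groups(map_item2group, map_group2items, a, b):
    """put items a and b into the same group, merging b's group into a's."""
    new_g, old_g = map_item2group[a], map_item2group[b]
    if new_g != old_g:
        for x in map_group2items[old_g]:
            map_group2items[new_g].append(x)
            map_item2group[x] = new_g
        map_group2items[old_g] = []

# usage sketch: items 1..3 each start in their own group:
#   map_item2group = [0, 1, 2, 3]
#   map_group2items = [None, [1], [2], [3]]
#   merge_groups(map_item2group, map_group2items, 1, 3)
# afterwards group 1 holds items [1, 3] and group 3 is empty.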
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)

    map_reference2target = alignlib_lite.makeAlignmentVector()

    assignment_id = 0

    for line in cr.fetchall():

        reference = PredictionParser.PredictionParserEntry()
        reference.FillFromTable(line)

        ct = dbhandle.cursor()
        ct.execute(statement % (param_tablename_predictions_target,
                                reference.mSbjctToken,
                                reference.mSbjctStrand,
                                reference.mSbjctGenomeFrom,
                                reference.mSbjctGenomeTo))

        reference_exons = Exons.Alignment2Exons(reference.mMapPeptide2Genome,
                                                0,
                                                reference.mSbjctFrom)

        for line2 in ct.fetchall():

            target = PredictionParser.PredictionParserEntry()
            target.FillFromTable(line2)

            target_exons = Exons.Alignment2Exons(target.mMapPeptide2Genome,
                                                 0,
                                                 target.mSbjctFrom)

            # check for exon overlap
            rr, tt = 0, 0
            overlap = 0
            while rr < len(reference_exons) and tt < len(target_exons):
                r = reference_exons[rr]
                t = target_exons[tt]