raise IOError("no exons in exon list.") Exons.SetRankToPositionFlag(exons) if param_use_genome_length: lengths = Exons.GetGenomeLengths(exons) else: lengths = Exons.GetPeptideLengths(exons) if param_min_overlap > 0: map_cluster2transcripts, map_transcript2cluster = ClusterByExonOverlap( exons, lengths, peptide_sequences, loglevel=param_loglevel) else: map_cluster2transcripts, map_transcript2cluster = \ Exons.ClusterByExonIdentity( exons, max_terminal_num_exons = 3, min_terminal_exon_coverage = param_min_terminal_exon_coverage, loglevel = param_loglevel ) map_transcript2strand = {} for k, ee in exons.items(): map_transcript2strand[k] = (ee[0].mSbjctStrand == "+") nnegatives, npositives = 0, 0 ## take longest transcript cluster_id = 1 if peptide_sequences: print "rep\tmem\tlength\tquery_from\tquery_to\tquery_ali\tsbjct_from\tsbjct_to\tsbjct_ali" else: print "rep\tmem\tlength"
def main(argv=None):
    """Script main: pick representative transcripts and report eliminated ones.

    Workflow, in order:

    1. Parse command line options.
    2. Read transcript entries from ``GTF.iterator(sys.stdin)``.
    3. Open the optional peptide, extended peptide and genome indices.
    4. If an exon table is given, read it and cluster transcripts by exon
       identity and by exon overlap.
    5. Optionally eliminate transcripts overlapping a filter set, gene
       spanners, exon swoppers and suboptimal predictions.
    6. Walk through the remaining predictions in sorted order; each
       surviving prediction becomes a representative and eliminates its
       redundant members.

    Representatives are written to ``options.stdout``, membership records to
    the file given by ``--output-members`` (stdout if not given) and summary
    counts to ``options.stdlog``.

    Parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if the exon table is empty, or if ``--remove-exon-swoppers``
            or ``--remove-gene-spanners`` is requested without exon data.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.Start() below, so a caller
    # supplied *argv* appears to have no effect here -- confirm against the
    # signature of E.Start before relying on it.

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/select_transcripts.py 2263 2008-11-17 16:36:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--overlap", dest="overlap_residues", type="int",
                      help="overlap residues.")

    parser.add_option("-t", "--filter-tokens", dest="filename_filter_tokens",
                      type="string",
                      help="filename to filter tokens.")

    parser.add_option("-i", "--exon-identity", dest="exon_identity",
                      action="store_true",
                      help="exon identity.")

    parser.add_option("--exons", dest="filename_exons", type="string",
                      help="filename with exon information.")

    parser.add_option("-m", "--output-members", dest="filename_members",
                      type="string",
                      help="output filename with members.")

    parser.add_option("--overlap-id", dest="overlap_id", action="store_true",
                      help="overlap id.")

    parser.add_option("-s", "--remove-spanning",
                      dest="remove_spanning_predictions", action="store_true",
                      help="remove spanning predictions.")

    parser.add_option("-c", "--remove-complement",
                      dest="remove_complementary_predictions",
                      action="store_true",
                      help="remove complementary predictions.")

    parser.add_option("--remove-exon-swoppers", dest="remove_exon_swoppers",
                      action="store_true",
                      help="remove exon swoppers.")

    parser.add_option("--remove-gene-spanners", dest="remove_gene_spanners",
                      action="store_true",
                      help="remove gene spanners.")

    parser.add_option("--remove-suboptimal", dest="remove_suboptimal",
                      action="store_true",
                      help="remove suboptimal predictions.")

    parser.add_option("-p", "--peptides", dest="filename_peptides",
                      type="string",
                      help="filename with peptide information.")

    parser.add_option("--extended-peptides", dest="filename_extended_peptides",
                      type="string",
                      help="filename with peptide information - after extension.")

    parser.add_option("--test", dest="test_nids", type="string",
                      help="test nids.")

    ## filter options
    parser.add_option("--filter-transcripts",
                      dest="filter_filename_transcripts", type="string",
                      help="filename with transcripts that are used to filter.")

    # NOTE(review): the default for filter_remove_spanning is already True
    # (see set_defaults), so this store_true flag cannot change the value.
    parser.add_option("--filter-remove-spanning", dest="filter_remove_spanning",
                      action="store_true",
                      help="remove all transcripts that span the filter set.")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("--discard-large-clusters", dest="discard_large_clusters",
                      type="int",
                      help="if set discard clusters bigger than this size (patch) [default=%default].")

    parser.set_defaults(
        filename_members=None,
        filename_peptides=None,
        filename_extended_peptides=None,
        filename_exons=None,
        # Quality classes; the position in this tuple is used as the primary
        # sort key for predictions further down.
        quality_hierarchy=("CG", "PG", "SG", "RG", "CP", "PP", "SP", "RP",
                           "CF", "PF", "SF", "UG", "UP", "UF", "BF", "UK"),
        # Classes, where redundancy is removed by similarity. When exon
        # structure is not conserved, alternative splice variants can not be
        # predicted, so remove the redundancy.
        quality_exclude_same=("UG", "UP", "UF", "BF", "UK"),
        quality_genes=("CG", "SG", "PG", "RG", "UG"),
        # class that can be removed in spanning/complementary predictions
        quality_remove_dubious=("UG", "UP", "UF", "BF", "UK"),
        # class that is required for defining exon swopper event
        quality_remove_exon_swopper=("CG", "PG"),
        # class that will be kept, in spite of being an exon swopper
        quality_keep_exon_swopper=(),
        # class that is required for removing gene spanners.
        # The trailing comma matters: ("CG") is just the string "CG",
        # whereas all other quality_* defaults are tuples of class codes.
        quality_remove_gene_spanners=("CG",),
        # class that will be kept, in spite of being a gene spanner
        quality_keep_gene_spanners=(),
        # class that is required for defining suboptimal matches
        quality_remove_suboptimal=("CG", "PG"),
        # class that will be kept, in spite of being a suboptimal match
        quality_keep_suboptimal=(),
        # gap penalties
        gop=-10.0,
        gep=-1.0,
        # maximum number of gaps to allow in alignment
        max_gaps=20,
        # threshold of percent identity that allows to remove a prediction
        # of a lower class. This allows for insertions/deletions.
        min_identity=98,
        # threshold of percent identity that allows to remove a prediction
        # of a non-gene by a gene
        min_identity_non_genes=80,
        # safety threshold: do not remove, if coverage of member is by x
        # better than representative
        safety_pide=10,
        safety_coverage=10,
        overlap_id=False,
        remove_spanning_predictions=False,
        remove_exon_swoppers=False,
        remove_gene_spanners=False,
        remove_suboptimal=False,
        # nids to use for testing
        test_nids=None,
        # remove members with less than maximum coverage
        max_member_coverage=90,
        # maximum allowable exon slippage
        max_slippage=9,
        # minimum difference in identity for suboptimal predictions to be
        # removed.
        suboptimal_min_identity_difference=10,
        # filter options
        filter_filename_transcripts=None,
        filter_remove_spanning=True,
        filter_remove_spanning_both_strands=True,
        genome_file=None,
        discard_large_clusters=None)

    (options, args) = E.Start(parser, add_psql_options=True)

    if options.test_nids:
        options.test_nids = options.test_nids.split(",")

    # Predictions eliminated so far; passed to (and presumably updated by)
    # every elimination helper called below.
    eliminated_predictions = {}

    if options.filename_members:
        outfile_members = open(options.filename_members, "w")
    else:
        outfile_members = sys.stdout

    ######################################################
    # data
    ######################################################
    class Entry:
        """Per-transcript record built from a single GTF entry."""

        def __init__(self, gff):
            self.mPid = float(gff["pid"])
            self.mQueryCoverage = float(gff["qcov"])
            self.gene_id = gff['gene_id']
            self.transcript_id = gff['transcript_id']
            self.mExtendedStart = int(gff['xstart'])
            self.mExtendedEnd = int(gff['xend'])
            self.start = gff.start
            self.contig = gff.contig
            self.strand = gff.strand
            self.end = gff.end
            self.mQuality = gff['class']

    data = [Entry(gff) for gff in GTF.iterator(sys.stdin)]

    if options.loglevel >= 1:
        options.stdlog.write("# read %i transcripts.\n" % len(data))
        options.stdlog.flush()

    ######################################################
    # read peptide sequences
    ######################################################
    if options.loglevel >= 1:
        options.stdlog.write("# loading peptide databases ... ")
        options.stdlog.flush()

    if options.filename_peptides:
        peptides = IndexedFasta.IndexedFasta(options.filename_peptides)
        peptide_lengths = peptides.getContigSizes()
    else:
        peptide_lengths = {}
        peptides = {}

    ######################################################
    # read extended peptide sequences
    ######################################################
    if options.filename_extended_peptides:
        extended_peptides = IndexedFasta.IndexedFasta(
            options.filename_extended_peptides)
    else:
        extended_peptides = {}

    if options.loglevel >= 1:
        options.stdlog.write("finished\n")
        options.stdlog.flush()

    ######################################################
    ## open genome file
    ######################################################
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    ######################################################
    ## reading exons, clustering and formatting them.
    ######################################################
    if options.filename_exons:
        if options.loglevel >= 1:
            options.stdlog.write("# reading exon boundaries ... ")
            options.stdlog.flush()

        # Only exons of transcripts that were read from stdin are needed.
        ids = [entry.transcript_id for entry in data]

        # The context manager closes the exon table once it has been read.
        with open(options.filename_exons, "r") as infile:
            exons = Exons.ReadExonBoundaries(infile,
                                             contig_sizes=contig_sizes,
                                             filter=set(ids))

        if options.loglevel >= 1:
            options.stdlog.write(
                "done - read exons for %i transcripts\n" % (len(exons)))

        if len(exons) == 0:
            raise ValueError("no exons found in table.")

        # flag terminal exons
        Exons.SetRankToPositionFlag(exons)

        identity_map_cluster2transcripts, identity_map_transcript2cluster = \
            Exons.ClusterByExonIdentity(exons,
                                        max_terminal_num_exons=3,
                                        max_slippage=options.max_slippage,
                                        loglevel=options.loglevel)

        overlap_map_cluster2transcripts, overlap_map_transcript2cluster = \
            Exons.ClusterByExonOverlap(exons,
                                       min_overlap=10,
                                       loglevel=options.loglevel)
    else:
        exons = {}

    ######################################################
    nrepresentatives, nmembers, neliminated = 0, 0, 0
    eliminated_by_method = {}

    ######################################################
    ## read filter transcripts and apply filters
    ######################################################
    if options.filter_filename_transcripts:

        if options.loglevel >= 1:
            options.stdlog.write(
                "# reading exon boundaries for filter set ... ")
            options.stdlog.flush()

        with open(options.filter_filename_transcripts, "r") as infile:
            filter_exons = Exons.ReadExonBoundaries(infile,
                                                    delete_missing=True,
                                                    contig_sizes=contig_sizes)

        if options.loglevel >= 1:
            options.stdlog.write(
                "done - read exons for %i transcripts\n" % (len(filter_exons)))

        t = time.time()
        eliminated = FilterEliminateOverlappingTranscripts(
            exons,
            filter_exons,
            eliminated_predictions,
            contig_sizes,
            options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write(
                "# removed %i transcripts overlapping or spanning transcripts in %i seconds.\n" %
                (n, time.time() - t))
            options.stdlog.flush()

    # Both of the following removals work on exon structures, so refuse to
    # run them without an exon table.
    if options.remove_exon_swoppers and not exons:
        raise ValueError(
            "please specify exon table if using --remove-exon-swoppers.")

    if options.remove_gene_spanners and not exons:
        raise ValueError(
            "please specify exon table if using --remove-gene-spanners.")

    ########################################################################
    ## remove predictions spanning other predictions but do not overlap
    ## with them on an exon level.
    if options.remove_gene_spanners and exons:

        if options.loglevel >= 1:
            options.stdlog.write("# removing gene spanners\n")
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateGeneSpanners(data,
                                           eliminated_predictions,
                                           exons,
                                           options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write(
                "# removed %i gene spanners in %i seconds\n" %
                (n, time.time() - t))
            options.stdlog.flush()

    ########################################################################
    ## sort data by quality, length of prediction and coverage * pid
    if options.loglevel >= 1:
        options.stdlog.write("# sorting data\n")
        options.stdlog.flush()

    # rank of each quality class within the hierarchy
    map2pos = dict((quality, rank)
                   for rank, quality in enumerate(options.quality_hierarchy))

    # NOTE(review): the key looks up every transcript in extended_peptides,
    # which is an empty dict when --extended-peptides is not given; that
    # would raise KeyError for any input -- confirm the option is mandatory.
    # NOTE(review): all three key components sort ascending; confirm that
    # shorter / lower scoring predictions are meant to come first within a
    # quality class.
    data.sort(key=lambda x: (map2pos[x.mQuality],
                             len(extended_peptides[x.transcript_id]),
                             x.mQueryCoverage * x.mPid))

    # build map of prediction to quality
    map_prediction2data = dict((d.transcript_id, d) for d in data)

    if options.loglevel >= 1:
        options.stdlog.write("# sorting data finished\n")
        options.stdlog.flush()

    ########################################################################
    ## remove predictions joining two other complete non-overlapping
    ## predictions
    if options.remove_exon_swoppers and exons:

        if options.loglevel >= 1:
            options.stdlog.write("# removing exon swoppers\n")
            options.stdlog.flush()

        eliminated = EliminateExonSwoppers(data,
                                           eliminated_predictions,
                                           identity_map_transcript2cluster,
                                           identity_map_cluster2transcripts,
                                           map_prediction2data,
                                           exons,
                                           options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write("# removed %i exon swoppers\n" % n)
            options.stdlog.flush()

    ########################################################################
    ## remove suboptimal predictions
    if options.remove_suboptimal and exons:

        if options.loglevel >= 1:
            options.stdlog.write("# removing suboptimal predictions\n")
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateSuboptimalPredictions(
            data,
            eliminated_predictions,
            overlap_map_transcript2cluster,
            overlap_map_cluster2transcripts,
            map_prediction2data,
            exons,
            options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write(
                "# removed %i suboptimal predictions in %i seconds\n" %
                (n, time.time() - t))
            options.stdlog.flush()

    ########################################################################
    ## remove redundant predictions
    ntotal = len(data)

    # report progress roughly every percent, but at least on every entry
    options.report_step = max(1, ntotal // 100)

    t2 = time.time()

    last_quality = None
    # quality classes that have been completely processed before the
    # class of the current representative
    qualities = []

    options.stdout.write("%s\t%s\n" % ("rep", "comment"))

    for x, rep in enumerate(data):

        if options.loglevel >= 1 and x % options.report_step == 0:
            options.stdlog.write(
                "# process: %i/%i = %i %%, %i/%i = %i %% in %i seconds\n" %
                (x + 1, ntotal,
                 100 * (x + 1) // ntotal,
                 len(eliminated_predictions), ntotal,
                 100 * len(eliminated_predictions) // ntotal,
                 time.time() - t2))
            options.stdlog.flush()

        rep_id, rep_quality = rep.transcript_id, rep.mQuality

        # already eliminated by an earlier step or representative
        if rep_id in eliminated_predictions:
            continue

        if rep_quality != last_quality:
            if last_quality:
                qualities.append(last_quality)
            last_quality = rep_quality

        if options.loglevel >= 2:
            options.stdlog.write(
                "# processing prediction %s|%s\n" % (rep_id, rep_quality))
            options.stdlog.flush()

        eliminated = []

        if options.overlap_id:
            # only entries sorted after the representative are candidates
            eliminated += EliminateRedundantEntriesByOverlap(
                rep,
                data[x + 1:],
                eliminated_predictions,
                options,
                peptides,
                extended_peptides,
                filter_quality=qualities,
                this_quality=rep_quality)
        else:
            eliminated += EliminateRedundantEntriesByRange(
                rep,
                data,
                eliminated_predictions,
                options,
                peptides,
                extended_peptides,
                filter_quality=qualities,
                this_quality=rep_quality)

        options.stdout.write("%s\t%i\n" % (rep_id, len(eliminated)))

        if outfile_members:
            # a representative is listed as a member of itself
            outfile_members.write(
                "%s\t%s\tm\n" % (str(rep_id), str(rep_id)))

        nrepresentatives += 1
        nmembers += PrintMembers(rep_id, outfile_members,
                                 eliminated, eliminated_by_method)

    # never close stdout, only a file opened by this function
    if outfile_members != sys.stdout:
        outfile_members.close()

    options.stdlog.write(
        "# representatives=%i, members=%i, eliminated=%i, total=%i\n" %
        (nrepresentatives, nmembers, neliminated,
         nrepresentatives + nmembers + neliminated))

    options.stdlog.write("# elimination by method:\n")

    for v, c in eliminated_by_method.items():
        options.stdlog.write("# method=%s, count=%i\n" % (v, c))

    E.Stop()