Beispiel #1
0
        sys.stdout.flush()

    exons = Exons.ReadExonBoundaries(
        sys.stdin,
        contig_sizes=contig_sizes,
        delete_missing=delete_missing,
    )

    if param_loglevel >= 1:
        print "# read exon information for %i transcripts" % len(exons)
        sys.stdout.flush()

    if len(exons) == 0:
        raise IOError("no exons in exon list.")

    Exons.SetRankToPositionFlag(exons)

    if param_use_genome_length:
        lengths = Exons.GetGenomeLengths(exons)
    else:
        lengths = Exons.GetPeptideLengths(exons)

    if param_min_overlap > 0:
        map_cluster2transcripts, map_transcript2cluster = ClusterByExonOverlap(
            exons, lengths, peptide_sequences, loglevel=param_loglevel)
    else:
        map_cluster2transcripts, map_transcript2cluster = \
                                 Exons.ClusterByExonIdentity( exons,
                                                              max_terminal_num_exons = 3,
                                                              min_terminal_exon_coverage = param_min_terminal_exon_coverage,
                                                              loglevel = param_loglevel )
Beispiel #2
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Workflow, as implemented below:

    1. read transcript predictions (GTF) from stdin into ``Entry`` records,
    2. optionally open the peptide, extended peptide and genome databases,
    3. optionally read an exon table and cluster transcripts by exon
       identity and by exon overlap,
    4. optionally eliminate predictions (filter set, gene spanners, exon
       swoppers, suboptimal predictions),
    5. walk through the predictions in sorted order, treat each one that
       has not been eliminated as a representative and eliminate the
       entries that are redundant with it.

    Representatives are written to ``options.stdout``, member assignments
    to the file given by ``--output-members`` (stdout if not given) and a
    summary to ``options.stdlog``.

    Raises ValueError if the exon table is empty, or if
    ``--remove-exon-swoppers``/``--remove-gene-spanners`` are used without
    exon information.

    NOTE(review): *argv* is defaulted here but is not handed on to
    ``E.Start`` in this function - confirm how ``E.Start`` obtains its
    arguments.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/select_transcripts.py 2263 2008-11-17 16:36:29Z andreas $", usage = globals()["__doc__"] )

    parser.add_option( "-o", "--overlap", dest="overlap_residues", type="int",
                       help="overlap residues.")
    parser.add_option( "-t", "--filter-tokens", dest="filename_filter_tokens", type="string",
                       help="filename to filter tokens." )
    parser.add_option( "-i", "--exon-identity", dest="exon_identity", action="store_true",
                       help="exon identity." )
    parser.add_option( "--exons", dest="filename_exons", type="string",
                       help="filename with exon information." )
    parser.add_option( "-m", "--output-members", dest="filename_members", type="string",
                       help="output filename with members." )
    parser.add_option( "--overlap-id", dest="overlap_id", action="store_true",
                       help="overlap id." )
    parser.add_option( "-s", "--remove-spanning", dest="remove_spanning_predictions", action="store_true",
                       help="remove spanning predictions." )
    parser.add_option( "-c", "--remove-complement", dest="remove_complementary_predictions", action="store_true",
                       help="remove complementary predictions." )
    parser.add_option( "--remove-exon-swoppers", dest="remove_exon_swoppers", action="store_true",
                       help="remove exon swoppers." )
    parser.add_option( "--remove-gene-spanners", dest="remove_gene_spanners", action="store_true",
                       help="remove gene spanners." )
    parser.add_option( "--remove-suboptimal", dest="remove_suboptimal", action="store_true",
                       help="remove suboptimal predictions." )
    parser.add_option( "-p", "--peptides", dest="filename_peptides", type="string",
                       help="filename with peptide information." )
    parser.add_option( "--extended-peptides", dest="filename_extended_peptides", type="string",
                       help="filename with peptide information - after extension." )

    parser.add_option( "--test", dest="test_nids", type="string",
                       help="test nids." )
    ## filter options
    parser.add_option( "--filter-transcripts", dest="filter_filename_transcripts", type="string",
                       help="filename with transcripts that are used to filter." )
    parser.add_option( "--filter-remove-spanning", dest="filter_remove_spanning", action="store_true",
                       help="remove all transcripts that span the filter set." )
    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genomic data (indexed)." )
    parser.add_option( "--discard-large-clusters", dest="discard_large_clusters", type="int",
                       help="if set discard clusters bigger than this size (patch) [default=%default]." )


    parser.set_defaults(
        filename_members = None,
        filename_peptides = None,
        filename_extended_peptides = None,
        filename_exons = None,
        ## order of quality classes; the position in this tuple is used as
        ## the primary sort key for the predictions further below.
        quality_hierarchy = ("CG", "PG", "SG", "RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF", "UG", "UP", "UF", "BF", "UK" ),
        ## Classes, where redundancy is removed by similarity. When exon structure
        ## is not conserved, I can't predict alternative splice variants, so remove
        ## the redundancy.
        quality_exclude_same = ( "UG", "UP", "UF", "BF", "UK" ),
        quality_genes = ("CG", "SG", "PG", "RG", "UG"),
        ## class that can be removed in spanning/complementary predictions
        quality_remove_dubious = ( "UG", "UP", "UF", "BF", "UK" ),
        ## class that is required for defining exon swopper event
        quality_remove_exon_swopper = ("CG", "PG"),
        ## class that will be kept, in spite of being an exon swopper.
        quality_keep_exon_swopper = (),
        ## class that is required for removing gene spanners.
        ## The trailing comma is required: ("CG") would be the plain string
        ## "CG" and not a tuple like the other quality_* options.
        quality_remove_gene_spanners = ("CG",),
        ## class that will be kept, in spite of being a gene spanner
        quality_keep_gene_spanners = (),
        ## class that is required for defining suboptimal matches
        quality_remove_suboptimal = ("CG", "PG" ),
        ## class that will be kept, in spite of being a suboptimal match
        quality_keep_suboptimal = (),
        ## gap penalties
        gop = -10.0,
        gep = -1.0,
        ## maximum number of gaps to allow in alignment
        max_gaps = 20,
        ## threshold of percent identity that allows to remove a prediction
        ## of a lower class.
        ## This allows for insertions/deletions
        min_identity = 98,
        ## threshold of percent identity that allows to remove a prediction
        ## of a non-gene by a gene
        min_identity_non_genes = 80,
        ## safety threshold: do not remove, if coverage of member is by x better
        ## than representative
        safety_pide = 10,
        safety_coverage = 10,
        overlap_id = False,
        remove_spanning_predictions = False,
        remove_exon_swoppers = False,
        remove_gene_spanners = False,
        remove_suboptimal = False,
        ## nids to use for testing
        test_nids = None,
        ## remove members with less than maximum coverage
        max_member_coverage = 90,
        ## maximum allowable exon slippage
        max_slippage = 9,
        ## minimum difference in identity for suboptimal predictions to be removed.
        suboptimal_min_identity_difference = 10,
        ## filter options
        filter_filename_transcripts = None,
        filter_remove_spanning = True,
        filter_remove_spanning_both_strands = True,
        genome_file = None,
        discard_large_clusters = None )

    (options, args) = E.Start( parser, add_psql_options = True )

    if options.test_nids: options.test_nids = options.test_nids.split(",")

    # eliminated predictions; handed to every elimination step below and
    # checked by transcript id in the main loop.
    eliminated_predictions = {}

    if options.filename_members:
        outfile_members = open( options.filename_members, "w" )
    else:
        outfile_members = sys.stdout

    ######################################################
    ######################################################
    ######################################################
    # data
    ######################################################
    data = []

    class Entry:
        """Attributes of a single prediction, taken from one GTF record."""
        def __init__(self, gff):
            self.mPid = float(gff["pid"])
            self.mQueryCoverage = float(gff["qcov"])
            self.gene_id = gff['gene_id']
            self.transcript_id = gff['transcript_id']
            self.mExtendedStart = int( gff['xstart'] )
            self.mExtendedEnd = int( gff['xend'] )
            self.start = gff.start
            self.contig = gff.contig
            self.strand = gff.strand
            self.end = gff.end
            self.mQuality = gff['class']

    for gff in GTF.iterator( sys.stdin ):
        data.append( Entry(gff) )

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i transcripts.\n" % len(data) )
        options.stdlog.flush()

    ######################################################
    ######################################################
    ######################################################
    # read peptide sequences
    ######################################################
    if options.loglevel >= 1:
        options.stdlog.write( "# loading peptide databases ... " )
        options.stdlog.flush()

    if options.filename_peptides:
        peptides = IndexedFasta.IndexedFasta( options.filename_peptides )
        peptide_lengths = peptides.getContigSizes()
    else:
        peptide_lengths = {}
        peptides = {}

    ######################################################
    ######################################################
    ######################################################
    # read extended peptide sequences
    ######################################################
    if options.filename_extended_peptides:
        extended_peptides = IndexedFasta.IndexedFasta( options.filename_extended_peptides )
    else:
        extended_peptides = {}

    if options.loglevel >= 1:
        options.stdlog.write( "finished\n" )
        options.stdlog.flush()

    ######################################################
    ######################################################
    ######################################################
    ## open genome file
    ######################################################
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    ######################################################
    ######################################################
    ######################################################
    ## reading exons, clustering and formatting them.
    ######################################################
    if options.filename_exons:
        if options.loglevel >= 1:
            options.stdlog.write( "# reading exon boundaries ... " )
            options.stdlog.flush()

        # only exons of transcripts that were read from stdin are of interest
        ids = set( x.transcript_id for x in data )

        # the context manager closes the exon table once it has been read
        with open( options.filename_exons, "r" ) as infile_exons:
            exons = Exons.ReadExonBoundaries( infile_exons,
                                              contig_sizes = contig_sizes,
                                              filter = ids )

        if options.loglevel >= 1:
            options.stdlog.write( "done - read exons for %i transcripts\n" % (len(exons) ))

        if len(exons) == 0:
            raise ValueError("no exons found in table.")

        # flag terminal exons
        Exons.SetRankToPositionFlag( exons )

        identity_map_cluster2transcripts, identity_map_transcript2cluster =\
                                          Exons.ClusterByExonIdentity( exons,
                                                                       max_terminal_num_exons = 3,
                                                                       max_slippage= options.max_slippage,
                                                                       loglevel = options.loglevel )

        overlap_map_cluster2transcripts, overlap_map_transcript2cluster =\
                                         Exons.ClusterByExonOverlap( exons,
                                                                     min_overlap = 10,
                                                                     loglevel = options.loglevel )
    else:
        # without an exon table the exon based steps below are skipped
        # (or rejected with a ValueError where they were asked for).
        exons = {}

    ######################################################
    nrepresentatives, nmembers, neliminated = 0, 0, 0
    eliminated_by_method = {}

    ######################################################
    ######################################################
    ######################################################
    ## read filter transcripts and apply filters
    ######################################################
    if options.filter_filename_transcripts:

        if options.loglevel >= 1:
            options.stdlog.write( "# reading exon boundaries for filter set ... " )
            options.stdlog.flush()

        # the context manager closes the filter table once it has been read
        with open( options.filter_filename_transcripts, "r" ) as infile_filter:
            filter_exons = Exons.ReadExonBoundaries( infile_filter,
                                                     delete_missing = True,
                                                     contig_sizes = contig_sizes )

        if options.loglevel >= 1:
            options.stdlog.write( "done - read exons for %i transcripts\n" % (len(filter_exons)) )

        t = time.time()
        eliminated = FilterEliminateOverlappingTranscripts( exons,
                                                            filter_exons,
                                                            eliminated_predictions,
                                                            contig_sizes,
                                                            options )

        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n
        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i transcripts overlapping or spanning transcripts in %i seconds.\n" % (n, time.time()-t ))
            options.stdlog.flush()

    if options.remove_exon_swoppers and not exons:
        raise ValueError( "please specify exon table if using --remove-exon-swoppers." )
    if options.remove_gene_spanners and not exons:
        raise ValueError( "please specify exon table if using --remove-gene-spanners." )

    ########################################################################################
    ## remove predictions spanning other predictions but do not overlap with them on an exon level.
    if options.remove_gene_spanners and exons:
        if options.loglevel >= 1:
            options.stdlog.write( "# removing gene spanners\n" )
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateGeneSpanners( data,
                                            eliminated_predictions,
                                            exons,
                                            options )

        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n
        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i gene spanners in %i seconds\n" % (n, time.time()-t ))
            options.stdlog.flush()

    ########################################################################################
    ## sort data by quality, length of prediction and coverage * pid

    if options.loglevel >= 1:
        options.stdlog.write( "# sorting data\n" )
        options.stdlog.flush()

    # rank of each quality class within the hierarchy
    map2pos = dict( (quality, pos) for pos, quality in enumerate( options.quality_hierarchy ) )

    # NOTE(review): the key indexes extended_peptides by transcript id;
    # extended_peptides is an empty dictionary if --extended-peptides was
    # not given, so this lookup would then fail for any prediction -
    # confirm whether the option is meant to be mandatory.
    data.sort( key = lambda x: (map2pos[x.mQuality], len(extended_peptides[x.transcript_id]), x.mQueryCoverage * x.mPid ) )

    # build map of prediction to quality
    map_prediction2data = {}
    for d in data:
        map_prediction2data[d.transcript_id] = d

    if options.loglevel >= 1:
        options.stdlog.write( "# sorting data finished\n" )
        options.stdlog.flush()

    ########################################################################################
    ## remove predictions joining two other complete non-overlapping predictions
    if options.remove_exon_swoppers and exons:

        if options.loglevel >= 1:
            options.stdlog.write( "# removing exon swoppers\n" )
            options.stdlog.flush()

        eliminated = EliminateExonSwoppers( data,
                                            eliminated_predictions,
                                            identity_map_transcript2cluster,
                                            identity_map_cluster2transcripts,
                                            map_prediction2data,
                                            exons,
                                            options )

        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i exon swoppers\n" % n )
            options.stdlog.flush()

    ########################################################################################
    ## remove suboptimal predictions
    if options.remove_suboptimal and exons:

        if options.loglevel >= 1:
            options.stdlog.write( "# removing suboptimal predictions\n" )
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateSuboptimalPredictions( data,
                                                     eliminated_predictions,
                                                     overlap_map_transcript2cluster,
                                                     overlap_map_cluster2transcripts,
                                                     map_prediction2data,
                                                     exons,
                                                     options )

        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i suboptimal predictions in %i seconds\n" % (n, time.time()-t) )
            options.stdlog.flush()

    ########################################################################################
    ## remove redundant predictions
    l = len(data)

    # report progress roughly every percent, but at least at every entry
    options.report_step = max(1, int(l / 100))

    t2= time.time()

    # quality classes that have been completed so far (in sort order);
    # passed to the elimination routines as filter_quality.
    last_quality = None
    qualities = []

    options.stdout.write( "%s\t%s\n" % ("rep", "comment") )

    for x in range(len(data)):

        if options.loglevel >= 1:
            if x % options.report_step  == 0:
                options.stdlog.write( "# process: %i/%i = %i %%, %i/%i = %i %% in %i seconds\n" % \
                                      (x+1, l,
                                       int(100 * (x+1) / l),
                                       len(eliminated_predictions), l,
                                       100 * len(eliminated_predictions) / l,
                                       time.time() - t2 ) )

                options.stdlog.flush()

        rep = data[x]

        rep_id, rep_quality = rep.transcript_id, rep.mQuality

        # predictions eliminated by an earlier step can not be representatives
        if rep_id in eliminated_predictions: continue

        if rep_quality != last_quality:
            if last_quality:
                qualities.append( last_quality )
            last_quality = rep_quality

        if options.loglevel >= 2:
            options.stdlog.write( "# processing prediction %s|%s\n" % (rep_id, rep_quality) )
            options.stdlog.flush()

        eliminated = []

        if options.overlap_id:
            eliminated += EliminateRedundantEntriesByOverlap( rep,
                                                              data[x+1:],
                                                              eliminated_predictions,
                                                              options,
                                                              peptides,
                                                              extended_peptides,
                                                              filter_quality = qualities,
                                                              this_quality = rep_quality )

        else:
            eliminated += EliminateRedundantEntriesByRange( rep,
                                                            data,
                                                            eliminated_predictions,
                                                            options,
                                                            peptides,
                                                            extended_peptides,
                                                            filter_quality = qualities,
                                                            this_quality = rep_quality )

        options.stdout.write( "%s\t%i\n" % (rep_id, len(eliminated)) )

        # outfile_members is either the opened members file or sys.stdout
        if outfile_members:
            outfile_members.write( "%s\t%s\tm\n" % (str(rep_id), str(rep_id)))
            nrepresentatives += 1
            nmembers += PrintMembers( rep_id, outfile_members, eliminated, eliminated_by_method )

    # do not close stdout, it is not owned by this function
    if outfile_members != sys.stdout:
        outfile_members.close()

    options.stdlog.write( "# representatives=%i, members=%i, eliminated=%i, total=%i\n" %\
                          (nrepresentatives, nmembers, neliminated,
                           nrepresentatives+nmembers+neliminated ) )

    options.stdlog.write( "# elimination by method:\n" )

    for v,c in eliminated_by_method.items():
        options.stdlog.write( "# method=%s, count=%i\n" % (v, c) )

    E.Stop()