Python Exons.ClusterByExonIdentity Examples

Programming Language: Python

Namespace/Package Name: CGAT

Class/Type: Exons

Method/Function: ClusterByExonIdentity

Examples at hotexamples.com: 2

Python Exons.ClusterByExonIdentity - 2 examples found. These are the top-rated real-world Python examples of CGAT.Exons.ClusterByExonIdentity extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

ReadExonBoundaries(13)

Alignment2Exons(8)

CheckOverlap(3)

ComparisonResult(3)

SetRankToPositionFlag(2)

CheckCoverage(2)

CheckCoverageAinB(2)

ClusterByExonIdentity(2)

CompareGeneStructures(2)

Alignment2ExonBoundaries(1)

GetExonsRange(1)

MatchExons(1)

GetPeptideLengths(1)

GetGenomeLengths(1)

CountMissedBoundaries(1)

GetExonBoundariesFromTable(1)

Exons2Alignment(1)

Exon(1)

ClusterByExonOverlap(1)

CheckContainedAinB(1)

CalculateStats(1)

UpdatePeptideCoordinates(1)

Example #1

Show file

File: exons2clusters.py Project: yangjl/cgat

        raise IOError("no exons in exon list.")

    Exons.SetRankToPositionFlag(exons)

    if param_use_genome_length:
        lengths = Exons.GetGenomeLengths(exons)
    else:
        lengths = Exons.GetPeptideLengths(exons)

    if param_min_overlap > 0:
        map_cluster2transcripts, map_transcript2cluster = ClusterByExonOverlap(
            exons, lengths, peptide_sequences, loglevel=param_loglevel)
    else:
        map_cluster2transcripts, map_transcript2cluster = \
                                 Exons.ClusterByExonIdentity( exons,
                                                              max_terminal_num_exons = 3,
                                                              min_terminal_exon_coverage = param_min_terminal_exon_coverage,
                                                              loglevel = param_loglevel )

    map_transcript2strand = {}
    for k, ee in exons.items():
        map_transcript2strand[k] = (ee[0].mSbjctStrand == "+")

    nnegatives, npositives = 0, 0
    ## take longest transcript
    cluster_id = 1

    if peptide_sequences:
        print "rep\tmem\tlength\tquery_from\tquery_to\tquery_ali\tsbjct_from\tsbjct_to\tsbjct_ali"
    else:
        print "rep\tmem\tlength"

Example #2

Show file

File: select_transcripts.py Project: yangjl/cgat

def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The function proceeds in the following stages:

    1. build the option parser and start the experiment (``E.Start``),
    2. read transcripts in GTF format from stdin into ``Entry`` records,
    3. optionally open the peptide, extended peptide and genome databases,
    4. optionally read the exon table and cluster transcripts by exon
       identity and by exon overlap,
    5. optionally run the filter / gene spanner / exon swopper /
       suboptimal prediction elimination steps,
    6. walk the sorted transcripts, treating each one that has not been
       eliminated as a representative and collecting its redundant members.

    Representatives are written to ``options.stdout``, members to the
    file given by ``--output-members`` (stdout if not given) and summary
    counts to ``options.stdlog``.

    Raises ValueError if the exon table is empty, or if
    ``--remove-exon-swoppers`` / ``--remove-gene-spanners`` are requested
    without exon information.
    """

    # NOTE(review): argv is defaulted here but never handed on to E.Start()
    # below - confirm whether E.Start() is expected to receive it.
    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/select_transcripts.py 2263 2008-11-17 16:36:29Z andreas $", usage = globals()["__doc__"] )

    parser.add_option( "-o", "--overlap", dest="overlap_residues", type="int",
                       help="overlap residues.")
    parser.add_option( "-t", "--filter-tokens", dest="filename_filter_tokens", type="string",
                       help="filename to filter tokens." )
    parser.add_option( "-i", "--exon-identity", dest="exon_identity", action="store_true",
                       help="exon identity." )
    parser.add_option( "--exons", dest="filename_exons", type="string",
                       help="filename with exon information." )
    parser.add_option( "-m", "--output-members", dest="filename_members", type="string",
                       help="output filename with members." )
    parser.add_option( "--overlap-id", dest="overlap_id", action="store_true",
                       help="overlap id." )
    parser.add_option( "-s", "--remove-spanning", dest="remove_spanning_predictions", action="store_true",
                       help="remove spanning predictions." )
    parser.add_option( "-c", "--remove-complement", dest="remove_complementary_predictions", action="store_true",
                       help="remove complementary predictions." )
    parser.add_option( "--remove-exon-swoppers", dest="remove_exon_swoppers", action="store_true",
                       help="remove exon swoppers." )
    parser.add_option( "--remove-gene-spanners", dest="remove_gene_spanners", action="store_true",
                       help="remove gene spanners." )
    parser.add_option( "--remove-suboptimal", dest="remove_suboptimal", action="store_true",
                       help="remove suboptimal predictions." )
    parser.add_option( "-p", "--peptides", dest="filename_peptides", type="string",
                       help="filename with peptide information." )
    parser.add_option( "--extended-peptides", dest="filename_extended_peptides", type="string",
                       help="filename with peptide information - after extension." )

    parser.add_option( "--test", dest="test_nids", type="string",
                       help="test nids." )
    ## filter options
    parser.add_option( "--filter-transcripts", dest="filter_filename_transcripts", type="string",
                       help="filename with transcripts that are used to filter." )
    parser.add_option( "--filter-remove-spanning", dest="filter_remove_spanning", action="store_true",
                       help="remove all transcripts that span the filter set." )
    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genomic data (indexed)." )
    parser.add_option( "--discard-large-clusters", dest="discard_large_clusters", type="int",
                       help="if set discard clusters bigger than this size (patch) [default=%default]." )

    parser.set_defaults(
        filename_members = None,
        filename_peptides = None,
        filename_extended_peptides = None,
        filename_exons = None,
        ## order of the quality classes; the position in this tuple is the
        ## primary sort key for the transcripts below.
        quality_hierarchy = ("CG", "PG", "SG", "RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF", "UG", "UP", "UF", "BF", "UK" ),
        ## Classes, where redundancy is removed by similarity. When exon structure
        ## is not conserved, I can't predict alternative splice variants, so remove
        ## the redundancy.
        quality_exclude_same = ( "UG", "UP", "UF", "BF", "UK" ),
        quality_genes = ("CG", "SG", "PG", "RG", "UG"),
        ## class that can be removed in spanning/complementary predictions
        quality_remove_dubious = ( "UG", "UP", "UF", "BF", "UK" ),
        ## class that is required for defining exon swopper event
        quality_remove_exon_swopper = ("CG", "PG"),
        ## class that will be kept, in spite of being an exon swopper.
        quality_keep_exon_swopper = (),
        ## class that is required for removing gene spanners.
        ## The trailing comma matters: ("CG") is just the string "CG", while
        ## all sibling options are tuples of class names.
        quality_remove_gene_spanners = ("CG",),
        ## class that will be kept, in spite of being a gene spanner
        quality_keep_gene_spanners = (),
        ## class that is required for defining suboptimal matches
        quality_remove_suboptimal = ("CG", "PG" ),
        ## class that will be kept, in spite of being a suboptimal match
        quality_keep_suboptimal = (),
        ## gap penalties
        gop = -10.0,
        gep = -1.0,
        ## maximum number of gaps to allow in alignment
        max_gaps = 20,
        ## threshold of percent identity that allows to remove a prediction
        ## of a lower class.
        ## This allows for insertions/deletions
        min_identity = 98,
        ## threshold of percent identity that allows to remove a prediction
        ## of a non-gene by a gene
        min_identity_non_genes = 80,
        ## safety threshold: do not remove, if coverage of member is by x better
        ## than representative
        safety_pide = 10,
        safety_coverage = 10,
        overlap_id = False,
        remove_spanning_predictions = False,
        remove_exon_swoppers = False,
        remove_gene_spanners = False,
        remove_suboptimal = False,
        ## nids to use for testing
        test_nids = None,
        ## remove members with less than maximum coverage
        max_member_coverage = 90,
        ## maximum allowable exon slippage
        max_slippage = 9,
        ## minimum difference in identity for suboptimal predictions to be removed.
        suboptimal_min_identity_difference = 10,
        ## filter options
        filter_filename_transcripts = None,
        filter_remove_spanning = True,
        filter_remove_spanning_both_strands = True,
        genome_file = None,
        discard_large_clusters = None )

    (options, args) = E.Start( parser, add_psql_options = True )

    if options.test_nids: options.test_nids = options.test_nids.split(",")

    # predictions eliminated so far; shared with (and passed to) all of the
    # elimination helpers called below
    eliminated_predictions = {}

    if options.filename_members:
        outfile_members = open( options.filename_members, "w" )
    else:
        outfile_members = sys.stdout

    ######################################################
    # data
    ######################################################
    data = []

    class Entry:
        """Per-transcript record built from one GTF entry read from stdin."""
        def __init__(self, gff):
            self.mPid = float(gff["pid"])
            self.mQueryCoverage = float(gff["qcov"])
            self.gene_id = gff['gene_id']
            self.transcript_id = gff['transcript_id']
            self.mExtendedStart = int( gff['xstart'] )
            self.mExtendedEnd = int( gff['xend'] )
            self.start = gff.start
            self.contig = gff.contig
            self.strand = gff.strand
            self.end = gff.end
            self.mQuality = gff['class']

    for gff in GTF.iterator( sys.stdin ):
        data.append( Entry(gff) )

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i transcripts.\n" % len(data) )
        options.stdlog.flush()

    ######################################################
    # read peptide sequences
    ######################################################
    if options.loglevel >= 1:
        options.stdlog.write( "# loading peptide databases ... " )
        options.stdlog.flush()

    if options.filename_peptides:
        peptides = IndexedFasta.IndexedFasta( options.filename_peptides )
        peptide_lengths = peptides.getContigSizes()
    else:
        peptide_lengths = {}
        peptides = {}

    ######################################################
    # read extended peptide sequences
    ######################################################
    if options.filename_extended_peptides:
        extended_peptides = IndexedFasta.IndexedFasta( options.filename_extended_peptides )
    else:
        extended_peptides = {}

    if options.loglevel >= 1:
        options.stdlog.write( "finished\n" )
        options.stdlog.flush()

    ######################################################
    ## open genome file
    ######################################################
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    ######################################################
    ## reading exons, clustering and formatting them.
    ######################################################
    if options.filename_exons:
        if options.loglevel >= 1:
            options.stdlog.write( "# reading exon boundaries ... " )
            options.stdlog.flush()

        ids = [ x.transcript_id for x in data ]

        # use a context manager so that the exon table is closed again
        # once it has been read
        with open( options.filename_exons, "r" ) as infile:
            exons = Exons.ReadExonBoundaries( infile,
                                              contig_sizes = contig_sizes,
                                              filter = set(ids) )

        if options.loglevel >= 1:
            options.stdlog.write( "done - read exons for %i transcripts\n" % (len(exons) ))

        if len(exons) == 0:
            raise ValueError("no exons found in table.")

        # flag terminal exons
        Exons.SetRankToPositionFlag( exons )

        identity_map_cluster2transcripts, identity_map_transcript2cluster =\
                                          Exons.ClusterByExonIdentity( exons,
                                                                       max_terminal_num_exons = 3,
                                                                       max_slippage= options.max_slippage,
                                                                       loglevel = options.loglevel )

        overlap_map_cluster2transcripts, overlap_map_transcript2cluster =\
                                         Exons.ClusterByExonOverlap( exons,
                                                                     min_overlap = 10,
                                                                     loglevel = options.loglevel )
    else:
        # without an exon table the cluster maps above stay undefined; every
        # step that needs them is guarded by a test on ``exons`` below.
        exons = {}

    ######################################################
    nrepresentatives, nmembers, neliminated = 0, 0, 0
    eliminated_by_method = {}

    ######################################################
    ## read filter transcripts and apply filters
    ######################################################
    if options.filter_filename_transcripts:

        if options.loglevel >= 1:
            options.stdlog.write( "# reading exon boundaries for filter set ... " )
            options.stdlog.flush()

        with open( options.filter_filename_transcripts, "r" ) as infile:
            filter_exons = Exons.ReadExonBoundaries( infile,
                                                     delete_missing = True,
                                                     contig_sizes = contig_sizes )

        if options.loglevel >= 1:
            options.stdlog.write( "done - read exons for %i transcripts\n" % (len(filter_exons)) )

        t = time.time()
        eliminated = FilterEliminateOverlappingTranscripts( exons,
                                                            filter_exons,
                                                            eliminated_predictions,
                                                            contig_sizes,
                                                            options )

        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n
        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i transcripts overlapping or spanning transcripts in %i seconds.\n" % (n, time.time()-t ))
            options.stdlog.flush()

    # the option is called --remove-exon-swoppers (see add_option above)
    if options.remove_exon_swoppers and not exons:
        raise ValueError( "please specify exon table if using --remove-exon-swoppers." )
    if options.remove_gene_spanners and not exons:
        raise ValueError( "please specify exon table if using --remove-gene-spanners." )

    ########################################################################################
    ## remove predictions spanning other predictions but do not overlap with them on an exon level.
    if options.remove_gene_spanners and exons:
        if options.loglevel >= 1:
            options.stdlog.write( "# removing gene spanners\n" )
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateGeneSpanners( data,
                                            eliminated_predictions,
                                            exons,
                                            options )

        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n
        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i gene spanners in %i seconds\n" % (n, time.time()-t ))
            options.stdlog.flush()

    ########################################################################################
    ## sort data by quality, length of prediction and coverage * pid

    if options.loglevel >= 1:
        options.stdlog.write( "# sorting data\n" )
        options.stdlog.flush()

    # map each quality class to its rank in the hierarchy
    map2pos = dict( (quality, pos) for pos, quality in enumerate(options.quality_hierarchy) )

    # NOTE(review): extended_peptides is an empty dict when --extended-peptides
    # is not given, in which case this lookup raises KeyError for the first
    # transcript - confirm whether the option is meant to be mandatory.
    data.sort( key = lambda x: (map2pos[x.mQuality], len(extended_peptides[x.transcript_id]), x.mQueryCoverage * x.mPid ) )

    # build map of prediction to quality
    map_prediction2data = {}
    for d in data:
        map_prediction2data[d.transcript_id] = d

    if options.loglevel >= 1:
        options.stdlog.write( "# sorting data finished\n" )
        options.stdlog.flush()

    ########################################################################################
    ## remove predictions joining two other complete non-overlapping predictions
    if options.remove_exon_swoppers and exons:

        if options.loglevel >= 1:
            options.stdlog.write( "# removing exon swoppers\n" )
            options.stdlog.flush()

        eliminated = EliminateExonSwoppers( data,
                                            eliminated_predictions,
                                            identity_map_transcript2cluster,
                                            identity_map_cluster2transcripts,
                                            map_prediction2data,
                                            exons,
                                            options )

        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i exon swoppers\n" % n )
            options.stdlog.flush()

    ########################################################################################
    ## remove suboptimal predictions
    ## (silently skipped if no exon table was given)
    if options.remove_suboptimal and exons:

        if options.loglevel >= 1:
            options.stdlog.write( "# removing suboptimal predictions\n" )
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateSuboptimalPredictions( data,
                                                     eliminated_predictions,
                                                     overlap_map_transcript2cluster,
                                                     overlap_map_cluster2transcripts,
                                                     map_prediction2data,
                                                     exons,
                                                     options )

        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i suboptimal predictions in %i seconds\n" % (n, time.time()-t) )
            options.stdlog.flush()

    ########################################################################################
    ## remove redundant predictions
    l = len(data)

    # report progress roughly every percent, but at least at every entry
    options.report_step = max(1, l // 100)

    t2= time.time()

    # quality classes of the representatives seen before the current class
    last_quality = None
    qualities = []

    options.stdout.write( "%s\t%s\n" % ("rep", "comment") )

    for x, rep in enumerate(data):

        if options.loglevel >= 1:
            if x % options.report_step  == 0:
                options.stdlog.write( "# process: %i/%i = %i %%, %i/%i = %i %% in %i seconds\n" % \
                                      (x+1, l,
                                       int(100 * (x+1) / l),
                                       len(eliminated_predictions), l,
                                       100 * len(eliminated_predictions) / l,
                                       time.time() - t2 ) )

                options.stdlog.flush()

        rep_id, rep_quality = rep.transcript_id, rep.mQuality

        # entries eliminated earlier can not become representatives
        if rep_id in eliminated_predictions: continue

        if rep_quality != last_quality:
            if last_quality:
                qualities.append( last_quality )
            last_quality = rep_quality

        if options.loglevel >= 2:
            options.stdlog.write( "# processing prediction %s|%s\n" % (rep_id, rep_quality) )
            options.stdlog.flush()

        eliminated = []

        if options.overlap_id:
            # only entries sorted after the representative are candidates
            eliminated += EliminateRedundantEntriesByOverlap( rep,
                                                              data[x+1:],
                                                              eliminated_predictions,
                                                              options,
                                                              peptides,
                                                              extended_peptides,
                                                              filter_quality = qualities,
                                                              this_quality = rep_quality )

        else:
            eliminated += EliminateRedundantEntriesByRange( rep,
                                                            data,
                                                            eliminated_predictions,
                                                            options,
                                                            peptides,
                                                            extended_peptides,
                                                            filter_quality = qualities,
                                                            this_quality = rep_quality )

        options.stdout.write( "%s\t%i\n" % (rep_id, len(eliminated)) )

        if outfile_members:
            outfile_members.write( "%s\t%s\tm\n" % (str(rep_id), str(rep_id)))
            nrepresentatives += 1
            nmembers += PrintMembers( rep_id, outfile_members, eliminated, eliminated_by_method )

    # never close stdout, only a members file opened above
    if outfile_members is not sys.stdout:
        outfile_members.close()

    options.stdlog.write( "# representatives=%i, members=%i, eliminated=%i, total=%i\n" %\
                          (nrepresentatives, nmembers, neliminated,
                           nrepresentatives+nmembers+neliminated ) )

    options.stdlog.write( "# elimination by method:\n" )

    for v,c in eliminated_by_method.items():
        options.stdlog.write( "# method=%s, count=%i\n" % (v, c) )

    E.Stop()