Esempio n. 1
0
def PrintCluster( cluster,
                  cluster_id,
                  lengths,
                  peptide_sequences = None,
                  regex_preferred = None):
    """print a cluster.

    Writes one tab-separated row per cluster member to stdout:
    representative, member and member length. If *peptide_sequences* is
    given, a fourth column holds the string form of
    ``alignlib.AlignmentFormatEmissions`` for the alignment between the
    representative and the member.

    Take longest sequence as representative. If preferred is given, only take
    genes matching preferred identifier (a preferred member is only used if
    it has a positive length; otherwise the longest member overall is used).
    If no member has a positive length, the first member of the cluster is
    the representative.

    Arguments:
      cluster           -- iterable of member identifiers (iterated twice).
      cluster_id        -- identifier of the cluster; returned unchanged.
      lengths           -- mapping of member identifier to length; members
                           that are missing count as length 0.
      peptide_sequences -- optional mapping of member identifier to sequence.
      regex_preferred   -- optional regular expression; members whose
                           identifier matches it are preferred.

    Returns *cluster_id*.
    """

    if regex_preferred:
        rx = re.compile(regex_preferred)
    else:
        rx = None

    def length_of( member ):
        # members without a known length are treated as length 0
        if member in lengths:
            return lengths[member]
        return 0

    max_al = 0
    max_pl = 0
    rep_a = None
    rep_p = None
    for c in cluster:
        l = length_of(c)

        # Seeding with the first member guarantees a representative even
        # if every length is 0 or unknown; afterwards only a strictly
        # longer member replaces it, so ties keep the earliest member.
        if rep_a is None or l > max_al:
            max_al = l
            rep_a  = c

        if rx and rx.search(c) and l > max_pl:
            max_pl = l
            rep_p = c

    # a preferred member with positive length wins over the longest overall
    if max_pl > 0:
        rep = rep_p
    else:
        rep = rep_a

    for mem in cluster:
        l = length_of(mem)
        columns = [rep, mem, l]

        if peptide_sequences:
            map_rep2mem = alignlib.makeAlignmentVector()

            if rep == mem and rep in lengths:
                # representative against itself: a single diagonal over
                # its full length, no alignment run needed
                alignlib.addDiagonal2Alignment( map_rep2mem, 1, lengths[rep], 0)
            elif mem in peptide_sequences and \
                     rep in peptide_sequences:
                alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -1.0)
                alignator.align( map_rep2mem,
                                 alignlib.makeSequence( peptide_sequences[rep] ),
                                 alignlib.makeSequence( peptide_sequences[mem] ) )
            # if neither branch applies the alignment stays empty and is
            # still written out

            columns.append( alignlib.AlignmentFormatEmissions( map_rep2mem ) )

        # single-argument print() behaves identically on Python 2 and 3
        print( "\t".join( map(str, columns) ) )

    sys.stdout.flush()

    return cluster_id
Esempio n. 2
0
        elif options.format == "cdnas":
            print string.join( map(str, (entry.mPredictionId,
                                         entry.mQueryToken,
                                         entry.mSbjctToken,
                                         entry.mSbjctStrand,
                                         entry.mSbjctGenomeFrom - offset,
                                         entry.mSbjctGenomeTo - offset,
                                         genomic_sequence )), "\t")

        elif options.format == "map":

            map_prediction2genome = alignlib.makeAlignmentSet()
            
            for cd in cds:
                alignlib.addDiagonal2Alignment( map_prediction2genome,
                                               cd.mPeptideFrom + 1,
                                               cd.mPeptideTo,
                                               (cd.mGenomeFrom - offset) - cd.mPeptideFrom )

            print string.join( map(str, (entry.mPredictionId,
                                         entry.mSbjctToken,
                                         entry.mSbjctStrand,
                                         alignlib.AlignmentFormatEmissions( map_prediction2genome ))), "\t")

        elif options.format == "intron-fasta":
            rank = 0
            if len(cds) == 1:
                nskipped += 1
                continue

            last = cds[0].mGenomeTo
            for cd in cds[1:]:
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads predictions line by line from stdin (lines starting with ``#``
    or ``id`` are skipped), converts each one into exons with
    ``Exons.Alignment2Exons`` and writes them in the format selected with
    ``--format``. Lines that fail to parse are reported on
    ``options.stdlog`` and counted in ``nerrors``; processing continues
    with the next line.
    """

    # NOTE(review): argv is normalised here but never handed to E.Start()
    # below -- confirm whether E.Start should receive it.
    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/predictions2cds.py 1858 2008-05-13 15:07:05Z andreas $",
                                    usage = globals()["__doc__"] )

    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genome."  )

    parser.add_option( "-o", "--forward-coordinates", dest="forward_coordinates", action="store_true",
                       help = "input uses forward coordinates." )

    # "gff-match" and "gff-exon" are handled below and therefore have to be
    # selectable; "gff" is kept for backward compatibility (it has no
    # branch of its own and produces the default output).
    parser.add_option( "-f", "--format", dest="format", type="choice",
                       choices=("default", "cds", "cdnas", "map", "gff",
                                "gff-match", "gff-exon",
                                "intron-fasta", "exons" ),
                       help = "output format." )

    parser.add_option( "-r", "--reset-to-start", dest="reset_to_start", action="store_true",
                       help = "move genomic coordinates to begin from 0." )

    parser.add_option( "--reset-query", dest="reset_query", action="store_true",
                       help = "move peptide coordinates to begin from 0." )

    parser.set_defaults(
        genome_file = None,
        forward_coordinates = False,
        format = "default",
        reset_to_start = False,
        reset_query = False )
    
    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        print( "%s no arguments required." % (USAGE,) )
        sys.exit(2)

    # running identifier for rows of the default output format
    cds_id = 1

    # a single entry object is re-filled for every input line
    entry = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta( options.genome_file )

    ninput, noutput, nskipped, nerrors = 0, 0, 0, 0
    
    for line in sys.stdin:

        if line[0] == "#": continue
        if line.startswith( "id" ): continue

        ninput += 1

        try:
            entry.Read(line)
        except ValueError as msg:
            options.stdlog.write( "# parsing failed with msg %s in line %s" % (msg, line ) )
            nerrors += 1
            continue
            
        cds = Exons.Alignment2Exons( entry.mMapPeptide2Genome,
                                     query_from = entry.mQueryFrom,
                                     sbjct_from = entry.mSbjctGenomeFrom,
                                     add_stop_codon = 0 )

        # without exons there is nothing to output and cds[-1]/cds[0]
        # below would raise an IndexError
        if not cds:
            options.stdlog.write( "# WARNING: no exons for line %s" % line )
            nskipped += 1
            continue

        for cd in cds:
            cd.mSbjctToken = entry.mSbjctToken
            cd.mSbjctStrand = entry.mSbjctStrand

        if cds[-1].mGenomeTo != entry.mSbjctGenomeTo:
            options.stdlog.write( "# WARNING: discrepancy in exon calculation!!!\n")
            for cd in cds:
                options.stdlog.write("# %s\n" % str(cd))
            options.stdlog.write( "# %s\n" % entry) 

        lsequence = fasta.getLength( entry.mSbjctToken )
        genomic_sequence = fasta.getSequence( entry.mSbjctToken,
                                              entry.mSbjctStrand,
                                              entry.mSbjctGenomeFrom,
                                              entry.mSbjctGenomeTo )
        

        ## deal with forward coordinates: convert them to negative strand coordinates
        if options.forward_coordinates and \
               entry.mSbjctStrand == "-":
            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = lsequence - entry.mSbjctGenomeTo, lsequence - entry.mSbjctGenomeFrom
            for cd in cds:
                cd.InvertGenomicCoordinates( lsequence )

        ## attach sequence to cds
        for cd in cds:
            start = cd.mGenomeFrom-entry.mSbjctGenomeFrom
            end = cd.mGenomeTo-entry.mSbjctGenomeFrom
            cd.mSequence = genomic_sequence[start:end]

        ## reset coordinates for query
        ## (this used to test reset_to_start, which left --reset-query
        ## without any effect)
        if options.reset_query:
            # NOTE(review): the entry is otherwise accessed through
            # mQueryFrom/mQueryTo -- confirm that mPeptideFrom exists.
            offset = entry.mPeptideFrom
            for cd in cds:
                cd.mPeptideFrom -= offset
                cd.mPeptideTo -= offset

        ## play with coordinates
        ## offset is what has been subtracted from the exons' genomic
        ## coordinates (0 if they were left untouched)
        if options.reset_to_start:
            offset = entry.mSbjctGenomeFrom
            for cd in cds:
                cd.mGenomeFrom -= offset
                cd.mGenomeTo -= offset
        else:
            offset = 0
                
        if options.format == "cds":
            rank = 0
            for cd in cds:
                rank += 1
                cd.mQueryToken = entry.mQueryToken
                cd.mSbjctToken = entry.mSbjctToken
                cd.mSbjctStrand = entry.mSbjctStrand
                cd.mRank = rank
                print( str(cd) )

        # elif, not if: otherwise "cds" falls through to the final else
        # and the default output is written in addition
        elif options.format == "exons":
            rank = 0
            for cd in cds:
                rank += 1
                options.stdout.write( "\t".join( map(str, (entry.mPredictionId,
                                                           cd.mSbjctToken,
                                                           cd.mSbjctStrand,
                                                           rank,
                                                           cd.frame,
                                                           cd.mPeptideFrom,
                                                           cd.mPeptideTo,
                                                           cd.mGenomeFrom,
                                                           cd.mGenomeTo ) ) ) + "\n" )

        elif options.format == "cdnas":
            print( "\t".join( map(str, (entry.mPredictionId,
                                        entry.mQueryToken,
                                        entry.mSbjctToken,
                                        entry.mSbjctStrand,
                                        entry.mSbjctGenomeFrom - offset,
                                        entry.mSbjctGenomeTo - offset,
                                        genomic_sequence )) ) )

        elif options.format == "map":

            map_prediction2genome = alignlib.makeAlignmentSet()
            
            for cd in cds:
                alignlib.addDiagonal2Alignment( map_prediction2genome,
                                               cd.mPeptideFrom + 1,
                                               cd.mPeptideTo,
                                               (cd.mGenomeFrom - offset) - cd.mPeptideFrom )

            print( "\t".join( map(str, (entry.mPredictionId,
                                        entry.mSbjctToken,
                                        entry.mSbjctStrand,
                                        alignlib.AlignmentFormatEmissions( map_prediction2genome ))) ) )

        elif options.format == "intron-fasta":
            rank = 0
            # a single exon has no introns
            if len(cds) == 1:
                nskipped += 1
                continue

            # genomic_sequence starts at entry.mSbjctGenomeFrom, while the
            # exon coordinates have already been reduced by offset; take
            # that into account when slicing.
            sequence_start = entry.mSbjctGenomeFrom - offset

            last = cds[0].mGenomeTo
            for cd in cds[1:]:
                rank += 1
                # NOTE(review): the key ends with the start of the
                # prediction, not with the end of the intron
                # (cd.mGenomeFrom) -- confirm that this is intended.
                key = "%s %i %s:%s:%i:%i" % (entry.mPredictionId, rank, entry.mSbjctToken, entry.mSbjctStrand, last, entry.mSbjctGenomeFrom)
                sequence = genomic_sequence[last-sequence_start:cd.mGenomeFrom-sequence_start]
                options.stdout.write( ">%s\n%s\n" % (key, sequence) )
                last = cd.mGenomeTo

            
        elif options.format == "gff-match":
            print( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Introns %i; Frameshifts %i; Stops %i" % \
                  ( entry.mSbjctToken,
                    "gpipe", "similarity",
                    entry.mSbjctGenomeFrom,
                    entry.mSbjctGenomeTo,
                    entry.mPercentIdentity,
                    entry.mSbjctStrand,
                    ".",
                    entry.mQueryToken,
                    entry.mQueryFrom,
                    entry.mQueryTo,
                    entry.score,
                    entry.mNIntrons,
                    entry.mNFrameShifts,
                    entry.mNStopCodons) )

        elif options.format == "gff-exon":
            rank = 0
            for cd in cds:
                rank += 1
                # // keeps the integer division of the original on both
                # Python 2 and Python 3
                print( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Rank %i/%i; Prediction %i" % \
                      ( entry.mSbjctToken,
                        "gpipe", "similarity",
                        cd.mGenomeFrom,
                        cd.mGenomeTo,
                        entry.mPercentIdentity,
                        entry.mSbjctStrand,
                        ".",
                        entry.mQueryToken,
                        cd.mPeptideFrom // 3 + 1,
                        cd.mPeptideTo // 3 + 1,
                        entry.score,
                        rank,
                        len(cds),
                        entry.mPredictionId ) )
        else:
            # default output: peptide coordinates are replaced by running
            # positions within the concatenated exons
            exon_from = 0            
            for cd in cds:
                cd.mPeptideFrom = exon_from
                exon_from += cd.mGenomeTo - cd.mGenomeFrom                
                cd.mPeptideTo = exon_from
                print( "\t".join( map(str, (cds_id, entry.mPredictionId,
                                            cd.mPeptideFrom, cd.mPeptideTo,
                                            cd.frame, 
                                            cd.mGenomeFrom, cd.mGenomeTo,
                                            cd.mSequence
                                            )) ) )
                cds_id += 1

        noutput += 1
Esempio n. 4
0
        nexons += 1
        
        if last_exon.mQueryToken != this_exon.mQueryToken:

            if last_exon.mQueryToken:
                f = alignlib.AlignmentFormatEmissions( map_prediction2genome )
                print string.join( map(str, (last_exon.mQueryToken,
                                             last_exon.mSbjctToken,
                                             last_exon.mSbjctStrand,
                                             f)), "\t" )

                npairs += 1                
            map_prediction2genome.clear()
            
        alignlib.addDiagonal2Alignment( map_prediction2genome,
                                       this_exon.mPeptideFrom + 1,
                                       this_exon.mPeptideTo + 1,
                                       this_exon.mGenomeFrom  - this_exon.mPeptideFrom)

        last_exon = this_exon
        
    f = alignlib.AlignmentFormatEmissions( map_prediction2genome )    
    print string.join( map(str, (last_exon.mQueryToken, 
                                 last_exon.mSbjctToken,
                                 last_exon.mSbjctStrand,
                                 f)), "\t" )
    npairs += 1

    print "# nexons=%i, npairs=%i" % (nexons, npairs)
    
    print E.GetFooter()