Ejemplo n.º 1
0
    def applyMethod(self, neighbours):
        """Realign every match of a query and compare it with its BLAST alignment.

        For each entry of ``neighbours.mMatches`` (self matches excluded) the
        matched query and sbjct segments are aligned with the aligner created
        by ``alignlib.makeAlignatorDPFull`` in ``ALIGNMENT_GLOBAL`` mode.
        One tab-separated line is written to ``self.mOutfile`` per match:
        query token, sbjct token, the value returned by
        ``alignlib.getAlignmentIdentity`` for the realignment versus the
        BLAST alignment, the number of aligned positions of the BLAST
        alignment and the number of aligned positions of the realignment.

        ``self.mNIdentical`` is incremented when the identity value equals the
        number of aligned BLAST positions; otherwise ``self.mNDifferent`` is
        incremented.

        :param neighbours: object providing ``mQueryToken`` and ``mMatches``.
        :raises ValueError: if a match has no reference (BLAST) alignment.
        """
        # build multiple alignment
        # NOTE(review): ``mali`` is seeded with the query but not used again in
        # this method -- confirm whether it can be dropped.
        mali = alignlib.makeMultipleAlignment()

        query_nid = neighbours.mQueryToken

        query_sequence = self.mFasta.getSequence(query_nid)

        mali.add(alignlib.makeAlignatum(query_sequence))

        qseq = alignlib.makeSequence(query_sequence)
        alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_GLOBAL,
                                                 -10.0, -1.0, True, True, True,
                                                 True)

        for n in neighbours.mMatches:

            # skip matches of the query against itself
            if n.mSbjctToken == query_nid:
                continue

            sbjct_sequence = self.mFasta.getSequence(n.mSbjctToken)

            blast_query2sbjct = n.getAlignment()

            # identity test: avoids invoking any comparison operator that the
            # alignment object might define
            if blast_query2sbjct is None:
                raise ValueError(
                    "AddaRealignment.py needs a reference alignment.")

            # restrict both sequences to the matched segments
            sseq = alignlib.makeSequence(sbjct_sequence)
            qseq.useSegment(n.mQueryFrom, n.mQueryTo)
            sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)

            # a single fresh container per match is sufficient
            realign_query2sbjct = alignlib.makeAlignmentVector()
            alignator.align(realign_query2sbjct, qseq, sseq)

            nidentical = alignlib.getAlignmentIdentity(realign_query2sbjct,
                                                       blast_query2sbjct,
                                                       alignlib.RR)
            nblast = blast_query2sbjct.getNumAligned()
            nrealigned = realign_query2sbjct.getNumAligned()

            self.mOutfile.write("%s\t%s\t%i\t%i\t%i\n" %
                                (n.mQueryToken, n.mSbjctToken,
                                 nidentical, nblast, nrealigned))

            if nidentical == nblast:
                self.mNIdentical += 1
            else:
                self.mNDifferent += 1
Ejemplo n.º 2
0
    def checkLinkThreshold( self,
                   query_nid, query_from, query_to,
                   sbjct_nid, sbjct_from, sbjct_to):
        """Check whether two domains are homologous.

        The two profiles are obtained through ``self.getAlignandum``,
        restricted to the given segments and aligned. The domains count as
        linked if the alignment score is strictly greater than
        ``self.mMinAlignmentScore``. Both profiles are reset to full length
        before returning.

        Returns a tuple ``(is_linked, alignment, ())``.
        """
        # NOTE(review): ``alignator`` is not defined inside this method; it is
        # presumably a module-level aligner -- confirm.
        row_profile = self.getAlignandum( query_nid )
        row_profile.useSegment( query_from, query_to )

        col_profile = self.getAlignandum( sbjct_nid )
        col_profile.useSegment( sbjct_from, sbjct_to )

        alignment = alignlib.makeAlignmentVector()
        alignator.align( alignment, row_profile, col_profile )

        summary = ( query_nid, sbjct_nid,
                    alignment.getScore(),
                    alignment.getLength(),
                    alignment.getNumGaps(),
                    alignment.getRowFrom(), alignment.getRowTo(),
                    alignment.getColFrom(), alignment.getColTo() )
        self.debug( "--> %i vs %i: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i" % summary )

        # undo the segment restriction on both profiles
        row_profile.useFullLength()
        col_profile.useFullLength()

        is_linked = bool( alignment.getScore() > self.mMinAlignmentScore )
        return is_linked, alignment, ()
Ejemplo n.º 3
0
def CheckAlignments( peptide_sequences, query_token, other_tokens ):
    """Check whether the query aligns to at least one of the other sequences.

    The query is aligned (``ALIGNMENT_LOCAL`` mode) against each token of
    *other_tokens* that has a sequence in *peptide_sequences*. True is
    returned as soon as one alignment scores above the module-level
    ``param_min_alignment_score``; False if none does.

    If the query itself has no sequence in *peptide_sequences*, True is
    returned without aligning anything.
    """

    if param_loglevel >= 3:
        print("# checking query %s and sbjcts %s" % (query_token, str(other_tokens)))
        sys.stdout.flush()

    if query_token not in peptide_sequences:
        return True

    alignment = alignlib.makeAlignmentVector()
    aligner = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL,
                                            -10.0, -1.0 )
    query_seq = alignlib.makeSequence(peptide_sequences[query_token])

    for token in other_tokens:
        if token in peptide_sequences:
            other_seq = alignlib.makeSequence( peptide_sequences[token] )
            # the same alignment container is reused for every comparison
            aligner.align( alignment, query_seq, other_seq )
            if param_loglevel >= 5:
                print("# %s - %s = %f" % (query_token, token, alignment.getScore()))
            if alignment.getScore() > param_min_alignment_score:
                return True

    return False
Ejemplo n.º 4
0
def getMapPeptide2Cds( peptide_sequence, cds_sequence, options ):
    """Build the map between a peptide sequence and a cds sequence.

    Spaces are removed from the peptide; spaces, dots and dashes are removed
    from the cds. The upper-cased peptide is converted into a wobble
    sequence with ``Genomics.Protein2Wobble`` and passed, together with the
    upper-cased cds and the peptide, to ``AlignCodonBased`` which fills the
    alignment.

    NOTE(review): the original docstring states "The returned alignment is
    in nucleotides", but no ``return`` statement is visible in this body --
    confirm whether the function continues elsewhere.

    Raises ValueError ("mapping error for sequence: ...") if
    ``AlignCodonBased`` raises a ValueError.
    """

    ## remove whitespaces from protein sequence
    peptide = peptide_sequence.replace( " ", "" )

    ## remove gaps and whitespaces from cds
    cds = re.sub( "[ .-]", "", cds_sequence )

    wobble = Genomics.Protein2Wobble( peptide.upper() )

    if options.loglevel >= 6:
        log = options.stdlog
        log.write( "# peptide original (%5i): %s\n" % (len(peptide), peptide) )
        log.write( "# cds original     (%5i): %s\n" % (len(cds), cds) )
        log.write( "# wobble sequence  (%5i): %s\n" % (len(wobble), wobble) )
        log.flush()

    seq_wobble = alignlib.makeSequence( wobble )
    seq_cds = alignlib.makeSequence( cds.upper() )
    seq_peptide = alignlib.makeSequence( peptide )

    map_p2c = alignlib.makeAlignmentVector()

    try:
        AlignCodonBased( seq_wobble, seq_cds, seq_peptide, map_p2c, options = options )
    except ValueError as msg:
        raise ValueError( "mapping error for sequence: %s" % (msg) )
Ejemplo n.º 5
0
    def applyMethod(self, neighbours ):
        """Realign every match of a query and compare it with its BLAST alignment.

        For each entry of ``neighbours.mMatches`` (self matches excluded) the
        matched query and sbjct segments are aligned with the aligner created
        by ``alignlib.makeAlignatorDPFull`` in ``ALIGNMENT_GLOBAL`` mode.
        One tab-separated line is written to ``self.mOutfile`` per match:
        query token, sbjct token, the value returned by
        ``alignlib.getAlignmentIdentity`` for the realignment versus the
        BLAST alignment, the number of aligned positions of the BLAST
        alignment and the number of aligned positions of the realignment.

        ``self.mNIdentical`` is incremented when the identity value equals the
        number of aligned BLAST positions; otherwise ``self.mNDifferent`` is
        incremented.

        :param neighbours: object providing ``mQueryToken`` and ``mMatches``.
        :raises ValueError: if a match has no reference (BLAST) alignment.
        """
        # build multiple alignment
        # NOTE(review): ``mali`` is seeded with the query but not used again in
        # this method -- confirm whether it can be dropped.
        mali = alignlib.makeMultipleAlignment()

        query_nid = neighbours.mQueryToken

        query_sequence = self.mFasta.getSequence( query_nid )

        mali.add( alignlib.makeAlignatum( query_sequence ) )

        qseq = alignlib.makeSequence( query_sequence )
        alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_GLOBAL,
                                                  -10.0, -1.0, True, True, True, True)

        for n in neighbours.mMatches:

            # skip matches of the query against itself
            if n.mSbjctToken == query_nid:
                continue

            sbjct_sequence = self.mFasta.getSequence( n.mSbjctToken )

            blast_query2sbjct = n.getAlignment()

            # identity test: avoids invoking any comparison operator that the
            # alignment object might define
            if blast_query2sbjct is None:
                raise ValueError( "AddaRealignment.py needs a reference alignment.")

            # restrict both sequences to the matched segments
            sseq = alignlib.makeSequence( sbjct_sequence )
            qseq.useSegment( n.mQueryFrom, n.mQueryTo )
            sseq.useSegment( n.mSbjctFrom, n.mSbjctTo )

            # a single fresh container per match is sufficient
            realign_query2sbjct = alignlib.makeAlignmentVector()
            alignator.align( realign_query2sbjct, qseq, sseq )

            nidentical = alignlib.getAlignmentIdentity( realign_query2sbjct, blast_query2sbjct, alignlib.RR )
            nblast = blast_query2sbjct.getNumAligned()
            nrealigned = realign_query2sbjct.getNumAligned()

            self.mOutfile.write( "%s\t%s\t%i\t%i\t%i\n" %
                                 (n.mQueryToken, n.mSbjctToken, nidentical, nblast, nrealigned ) )

            if nidentical == nblast:
                self.mNIdentical += 1
            else:
                self.mNDifferent += 1
Ejemplo n.º 6
0
 def GetMap( self ):
     """Return the map between the two segments.

     Returns None when either start coordinate (``mAlignmentFrom1`` or
     ``mAlignmentFrom2``) is falsy, e.g. unset or zero. Otherwise a new
     alignment vector is filled from the emissions-format description of
     both segments and returned.
     """
     if not (self.mAlignmentFrom1 and self.mAlignmentFrom2):
         return None

     mapping = alignlib.makeAlignmentVector()
     emissions = alignlib.AlignmentFormatEmissions(
         self.mAlignmentFrom1, self.mAlignment1,
         self.mAlignmentFrom2, self.mAlignment2 )
     emissions.copy( mapping )
     return mapping
Ejemplo n.º 7
0
    def fillFromTable( self, table_row ):
        """Populate this entry from a table row.

        The first 25 fields of *table_row* are assigned, in order, to the
        prediction attributes (identifier, tokens, strand, rank, score,
        query/sbjct coordinates and alignment strings, length, coverage,
        counters, percent identity/similarity, translation, genomic
        coordinates and the alignment string). If the row has 26 or more
        fields, field 26 is stored in ``mNAssembled``; any further fields are
        ignored. With exactly 25 fields ``mNAssembled`` is left untouched.

        If ``self.mExpand`` is set, the peptide-to-translation alignment is
        rebuilt from the query/sbjct alignment strings (when both are
        non-empty) and the peptide-to-genome map is rebuilt from the
        alignment string.

        :param table_row: sequence with at least 25 fields.
        :raises ValueError: if *table_row* has fewer than 25 fields.
        """
        nfields = len(table_row)

        # reject short rows before touching any attribute
        if nfields < 25:
            raise ValueError( "unknown format: %i fields" % nfields )

        ( self.mPredictionId,
          self.mQueryToken, self.mSbjctToken, self.mSbjctStrand,
          self.mRank, self.score,
          self.mQueryFrom, self.mQueryTo, self.mQueryAli,
          self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli,
          self.mQueryLength, self.mQueryCoverage,
          self.mNGaps, self.mNFrameShifts, self.mNIntrons,
          self.mNSplits, self.mNStopCodons,
          self.mPercentIdentity, self.mPercentSimilarity,
          self.mTranslation,
          self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
          self.mAlignmentString ) = table_row[:25]

        # the optional 26th column carries the number of assembled entries
        if nfields >= 26:
            self.mNAssembled = table_row[25]

        if self.mExpand:
            self.mMapPeptide2Translation = alignlib.makeAlignmentVector()

            if self.mQueryAli != "" and self.mSbjctAli != "":
                alignlib.AlignmentFormatEmissions( self.mQueryFrom, self.mQueryAli,
                                                   self.mSbjctFrom, self.mSbjctAli ).copy( self.mMapPeptide2Translation )

            self.mMapPeptide2Genome = Genomics.String2Alignment( self.mAlignmentString )
Ejemplo n.º 8
0
def AlignPair( pair, anchor = 0 ):
    """Align a pair of introns and store the result on *pair*.

    The aligner is chosen by the module-level ``param_method``. If *anchor*
    is non-zero, both sequences are padded on either side with that many
    ``A`` characters before aligning; afterwards the padded regions are
    removed from the alignment and the alignment is shifted back by
    *anchor*.

    On success the row/column coordinates, the aligned sequences, the method,
    the number of gaps, the alignment length and the number of aligned
    positions (length minus gaps) are written to *pair*.

    :param pair: object with ``mToken1/2``, ``mIntronId1/2`` and
        ``mAlignedSequence1/2``; updated in place.
    :param anchor: number of anchor characters added to both ends.
    :returns: True on success, False if the alignment is empty.
    :raises NotImplementedError: for method ``clusaligned``.
    """

    map_intron_a2b = alignlib.makeAlignmentVector()

    if param_loglevel >= 1:
        print("# aligning %s-%i with %s-%i: lengths %i and %i" % (pair.mToken1, pair.mIntronId1,
                                                                  pair.mToken2, pair.mIntronId2,
                                                                  len(pair.mAlignedSequence1),
                                                                  len(pair.mAlignedSequence2)))
        sys.stdout.flush()

    padding = "A" * anchor
    s1 = padding + pair.mAlignedSequence1 + padding
    s2 = padding + pair.mAlignedSequence2 + padding

    # any other method name leaves the alignment container untouched
    if param_method == "dialigned":
        dialign.Align( s1, s2, map_intron_a2b )
    elif param_method == "dialignedlgs":
        dialignlgs.Align( s1, s2, map_intron_a2b )
    elif param_method == "dbaligned":
        dba.Align( s1, s2, map_intron_a2b )
    elif param_method == "clusaligned":
        raise NotImplementedError("clustalw wrapper not up-to-date")

    if anchor:
        # strip the anchors at both ends of rows and columns, then shift the
        # alignment back into the coordinates of the unpadded sequences
        map_intron_a2b.removeRowRegion( anchor + len(pair.mAlignedSequence1) + 1, map_intron_a2b.getRowTo() )
        map_intron_a2b.removeRowRegion( 1, anchor)
        map_intron_a2b.removeColRegion( anchor + len(pair.mAlignedSequence2) + 1, map_intron_a2b.getColTo() )
        map_intron_a2b.removeColRegion( 1, anchor)
        map_intron_a2b.moveAlignment( -anchor, -anchor )

    if map_intron_a2b.getLength() == 0:
        if param_loglevel >= 1:
            print("# Error: empty intron alignment")
        return False

    seq1 = alignlib.makeSequence( pair.mAlignedSequence1 )
    seq2 = alignlib.makeSequence( pair.mAlignedSequence2 )

    data = alignlib.AlignmentFormatExplicit( map_intron_a2b, seq1, seq2 )

    pair.mFrom1, pair.mAlignedSequence1, pair.mTo1 = data.mRowFrom, data.mRowAlignment, data.mRowTo
    pair.mFrom2, pair.mAlignedSequence2, pair.mTo2 = data.mColFrom, data.mColAlignment, data.mColTo
    pair.mMethod = param_method

    pair.mNumGaps, pair.mLength = map_intron_a2b.getNumGaps(), map_intron_a2b.getLength()
    pair.mAligned = pair.mLength - pair.mNumGaps

    if param_loglevel >= 2:
        print("# alignment success %s %s" % (pair.mAlignedSequence1, pair.mAlignedSequence2))

    return True
Ejemplo n.º 9
0
    def buildAlignment(self):
        """Construct ``self.mMapSbjct2Query`` from the block lists of a match.

        ``mBlockSizes``, ``mQueryBlockStarts`` and ``mSbjctBlockStarts`` are
        comma-separated integer lists whose last field is discarded
        (presumably empty because of a trailing comma -- TODO confirm). One
        diagonal is added to the alignment per block.
        """
        def to_ints(text):
            # drop the last field, convert the rest
            return [int(field) for field in text.split(",")[:-1]]

        sizes = to_ints(self.mBlockSizes)
        q_starts = to_ints(self.mQueryBlockStarts)
        s_starts = to_ints(self.mSbjctBlockStarts)

        sbjct2query = alignlib.makeAlignmentVector()
        self.mMapSbjct2Query = sbjct2query

        for index, size in enumerate(sizes):
            s_start = s_starts[index]
            sbjct2query.addDiagonal(s_start,
                                    s_start + size,
                                    q_starts[index] - s_start)
Ejemplo n.º 10
0
    def buildAlignment( self ):
        """Fill ``self.mMapSbjct2Query`` with one diagonal per alignment block.

        The block sizes and the query/sbjct block starts are read from the
        comma-separated attributes ``mBlockSizes``, ``mQueryBlockStarts`` and
        ``mSbjctBlockStarts``; the last comma-separated field of each is
        ignored (presumably empty due to a trailing comma -- TODO confirm).
        """
        parsed = []
        for attribute in ( "mBlockSizes", "mQueryBlockStarts", "mSbjctBlockStarts" ):
            fields = getattr( self, attribute ).split(",")[:-1]
            parsed.append( [ int(value) for value in fields ] )
        block_sizes, query_starts, sbjct_starts = parsed

        self.mMapSbjct2Query = alignlib.makeAlignmentVector()

        block = 0
        while block < len(block_sizes):
            first = sbjct_starts[block]
            last = first + block_sizes[block]
            # the diagonal is the offset between query and sbjct start
            self.mMapSbjct2Query.addDiagonal( first, last, query_starts[block] - first )
            block += 1
Ejemplo n.º 11
0
    def getCopy( self ):
        """Return a new ``Prediction`` carrying the same field values.

        All scalar fields are copied by plain assignment. If ``mExpand`` is
        set, the peptide-to-translation alignment is duplicated with
        ``alignlib.copyAlignment`` and the peptide-to-genome map is rebuilt
        from the alignment string; otherwise both maps are None on the copy.

        This object itself is not modified.
        """

        new_entry = Prediction()

        for name in ( "mExpand",
                      "mPredictionId",
                      "mQueryToken", "mQueryFrom", "mQueryTo",
                      "mSbjctToken", "mSbjctStrand", "mSbjctFrom", "mSbjctTo",
                      "mRank", "score",
                      "mQueryLength", "mQueryCoverage",
                      "mNGaps", "mNFrameShifts", "mNIntrons",
                      "mNSplits", "mNStopCodons",
                      "mPercentIdentity", "mPercentSimilarity",
                      "mTranslation",
                      "mSbjctGenomeFrom", "mSbjctGenomeTo",
                      "mAlignmentString",
                      "mQueryAli", "mSbjctAli" ):
            setattr( new_entry, name, getattr( self, name ) )

        if self.mExpand:
            new_entry.mMapPeptide2Translation = alignlib.makeAlignmentVector()
            alignlib.copyAlignment( new_entry.mMapPeptide2Translation, self.mMapPeptide2Translation)
            new_entry.mMapPeptide2Genome = Genomics.String2Alignment( new_entry.mAlignmentString)
        else:
            # reset the maps on the copy only; a copy operation must not
            # overwrite attributes of the object being copied
            new_entry.mMapPeptide2Translation = None
            new_entry.mMapPeptide2Genome = None

        return new_entry
Ejemplo n.º 12
0
def getParts(src):
    '''Split a wrap-around alignment into its parts.

    Rows from ``src.getRowFrom()`` up to, but excluding, ``src.getRowTo()``
    are visited in order; rows whose mapped column is negative are skipped.
    A new part is started whenever the mapped column is not larger than the
    previously mapped one (the comparison starts from ``src.getColTo()``).
    Every mapped row is stored in the current part via
    ``addPair(col, row, 0)``.

    Returns the list of parts; it is empty if no row is mapped.
    '''
    parts = []
    current = None
    previous_col = src.getColTo()

    for row in range(src.getRowFrom(), src.getRowTo()):
        col = src.mapRowToCol(row)
        if col >= 0:
            if previous_col >= col:
                # column did not advance: store the running part, open a new one
                if current:
                    parts.append(current)
                current = alignlib.makeAlignmentVector()
            previous_col = col
            current.addPair(col, row, 0)

    if current:
        parts.append(current)
    return parts
Ejemplo n.º 13
0
def getParts( src ):
    '''Cut a wrap-around alignment into separate alignments.

    Each row in ``[src.getRowFrom(), src.getRowTo())`` is mapped to its
    column; unmapped rows (negative column) are ignored. Whenever the column
    fails to increase relative to the last mapped column (initially
    ``src.getColTo()``), the segment collected so far is stored and a fresh
    alignment vector is started. Pairs are added as ``addPair(col, row, 0)``.

    Returns a list with the collected segments, possibly empty.
    '''
    segments = []
    segment = None
    last_col = src.getColTo()

    # lazy generator: rows are mapped one at a time while iterating
    mapped = ( ( row, src.mapRowToCol(row) )
               for row in range( src.getRowFrom(), src.getRowTo() ) )

    for row, col in mapped:
        if col < 0:
            continue

        wraps = last_col >= col
        if wraps and segment:
            segments.append( segment )
        if wraps:
            segment = alignlib.makeAlignmentVector()

        last_col = col
        segment.addPair( col, row, 0 )

    if segment:
        segments.append( segment )
    return segments
Ejemplo n.º 14
0
    def __init__(self, expand = 1):
        """Create an empty entry.

        All numeric fields start at 0 and all textual fields at the empty
        string. If *expand* is true, the peptide-to-translation map is an
        empty alignment vector and the peptide-to-genome map an empty list;
        otherwise both are None.
        """

        self.mExpand = expand

        # identifiers, coordinates, scores and counters
        for name in ( "mPredictionId",
                      "mQueryToken", "mQueryFrom", "mQueryTo",
                      "mSbjctToken", "mSbjctStrand", "mSbjctFrom", "mSbjctTo",
                      "mRank", "score",
                      "mQueryLength", "mQueryCoverage",
                      "mNGaps", "mNFrameShifts", "mNIntrons",
                      "mNSplits", "mNStopCodons",
                      "mPercentIdentity", "mPercentSimilarity",
                      "mSbjctGenomeFrom", "mSbjctGenomeTo" ):
            setattr( self, name, 0 )

        # textual fields
        for name in ( "mTranslation", "mAlignmentString", "mQueryAli", "mSbjctAli" ):
            setattr( self, name, "" )

        if not self.mExpand:
            self.mMapPeptide2Translation = None
            self.mMapPeptide2Genome = None
        else:
            self.mMapPeptide2Translation = alignlib.makeAlignmentVector()
            self.mMapPeptide2Genome = []

        self.mNAssembled = 0
Ejemplo n.º 15
0
    def buildMali(self, query_nid, neighbours):
        """Build a multiple alignment from a set of neighbours.

        The multiple alignment is seeded with the query sequence. At most
        ``self.mMaxNumNeighbours`` neighbours are considered; self matches
        are ignored and neighbours with an e-value above ``self.mMaxEvalue``
        are skipped. A neighbour without a stored alignment is aligned to
        the query on the fly (``ALIGNMENT_LOCAL`` mode) on the matched
        segments. Neighbours with an empty alignment, with an alignment
        exceeding the query or sbjct sequence length, or that cannot be
        added to the multiple alignment are reported via ``self.warn`` and
        skipped.

        NOTE(review): the visible body neither returns the multiple
        alignment nor uses the skip counter -- confirm whether the method
        continues elsewhere.

        :param query_nid: identifier of the query sequence.
        :param neighbours: sliceable sequence of match objects.
        """
        # build multiple alignment
        mali = alignlib.makeMultipleAlignment()

        query_sequence = self.mFasta.getSequence(query_nid)

        mali.add(alignlib.makeAlignatum(query_sequence))

        qseq = alignlib.makeSequence(query_sequence)
        alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_LOCAL, -10,
                                                 -2)

        nskipped = 0

        for n in neighbours[:self.mMaxNumNeighbours]:

            if n.mSbjctToken == query_nid:
                continue
            if n.mEvalue > self.mMaxEvalue:
                nskipped += 1
                continue
            sequence = self.mFasta.getSequence(n.mSbjctToken)

            E.debug("adding %s" % str(n))

            map_query2sbjct = n.getAlignment()

            # no stored alignment: align the matched segments now
            if map_query2sbjct is None:
                sseq = alignlib.makeSequence(sequence)
                qseq.useSegment(n.mQueryFrom, n.mQueryTo)
                sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)
                map_query2sbjct = alignlib.makeAlignmentVector()
                alignator.align(map_query2sbjct, qseq, sseq)

            if map_query2sbjct.getLength() == 0:
                self.warn("empty alignment: %s" % str(n))
                nskipped += 1
                continue

            # sanity checks: the alignment must fit into both sequences
            if map_query2sbjct.getRowTo() > len(query_sequence):
                self.warn("alignment out of bounds for query: %i>%i, line=%s" %
                          (map_query2sbjct.getRowTo(), len(query_sequence), str(n)))
                nskipped += 1
                continue

            if map_query2sbjct.getColTo() > len(sequence):
                self.warn("alignment out of bounds for sbjct: %i>%i, line=%s" %
                          (map_query2sbjct.getColTo(), len(sequence), str(n)))
                nskipped += 1
                continue

            try:
                mali.add(alignlib.makeAlignatum(sequence),
                         map_query2sbjct,
                         mali_is_in_row=True,
                         insert_gaps_mali=False,
                         insert_gaps_alignatum=True,
                         use_end_mali=True,
                         use_end_alignatum=False)
            except RuntimeError as msg:
                self.warn("problem when building alignment for %s: msg=%s" %
                          (str(n), msg))
                nskipped += 1
                continue
Ejemplo n.º 16
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads the two multiple alignments named by the positional arguments,
    builds a profile for each, aligns the two profiles and writes the merged
    multiple alignment to ``options.stdout``.

    Raises ValueError unless exactly two positional arguments are given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                    usage = globals()["__doc__"] )

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default]."  )

    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default]."  )

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices = ("global", "local" ),
                      help="alignment mode, global=nw, local=sw [default=%default]."  )

    parser.set_defaults(
        gop = -12.0,
        gep = -2.0,
        format= "fasta",
        mode = "local",
        )

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start( parser, argv = argv )

    if len(args) != 2: raise ValueError("please supply two multiple alignments in FASTA format.")

    malis = []
    for filename in args:
        mali = Mali.Mali()
        infile = IOTools.openFile( filename, "r" )
        # make sure the handle is released even if parsing fails
        try:
            mali.readFromFile( infile, format=options.format )
        finally:
            infile.close()
        malis.append( mali )
    mali1, mali2 = malis

    E.info( "read 2 multiple alignments" )

    cmali1 = Mali.convertMali2Alignlib( mali1 )
    cmali2 = Mali.convertMali2Alignlib( mali2 )

    # the option parser restricts --mode to these two choices
    if options.mode == "local":
        mode = alignlib.ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib.ALIGNMENT_GLOBAL

    alignator = alignlib.makeAlignatorDPFull( mode,
                                              options.gop, options.gep )

    # library-wide defaults are set before the profiles are built
    alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.Protein20) )
    alignlib.setDefaultLogOddor( alignlib.makeLogOddorDirichlet( 0.3 ) )
    alignlib.setDefaultRegularizor( alignlib.makeRegularizorDirichletPrecomputed() )

    cprofile1 = alignlib.makeProfile( cmali1 )
    cprofile2 = alignlib.makeProfile( cmali2 )

    result = alignlib.makeAlignmentVector()

    alignator.align( result, cprofile1, cprofile2 )

    E.debug( "result=\n%s" % alignlib.AlignmentFormatEmissions( result) )

    # merge the second alignment into the first along the profile alignment
    cmali1.add( cmali2, result )

    outmali = Mali.convertAlignlib2Mali( cmali1,
                                         identifiers = mali1.getIdentifiers() + mali2.getIdentifiers() )

    outmali.writeToFile( options.stdout, format=options.format)

    ## write footer and output benchmark information.
    E.Stop()
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser.add_option("-m", "--filename-map", dest="filename_map", type="string",
                      help="filename with mapping information.")
    parser.add_option("-o", "--pattern-old", dest="pattern_old", type="string",
                      help="pattern for mapping new to old identifiers: extract string from old.")
    parser.add_option("-n", "--pattern-new", dest="pattern_new", type="string",
                      help="pattern for mapping new to old identifiers: put string into new.")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="genome_file.")
    parser.add_option("-p", "--peptides", dest="filename_peptides", type = "string",
                      help="filename with peptide sequences.")
    parser.add_option("-f", "--input-format", dest="input_format", type="choice",
                      help="format of mapping file", choices=("alignment", "offsets") )
    parser.add_option("-i", "--write-missed", dest="write_missed", type="string",
                      help="write missed identifiers to separate file.")
    parser.add_option("-a", "--filename-genes", dest="filename_genes", type="string",
                      help="filename with gene information.")
    parser.add_option("--filename-old-peptides", dest="filename_old_peptides", type="string",
                      help="filename with old peptide information.")
    parser.add_option("--no-renumber", dest="renumber", action="store_false",
                      help="do not renumber predictions.")
    parser.add_option("--contig-sizes-old", dest="contig_sizes_old", type="string",
                      help="contig sizes for old data.")
    parser.add_option("--contig-sizes-new", dest="contig_sizes_new", type="string",
                      help="contig sizes for new data.")
    parser.add_option("--skip-errors", dest="skip_errors", action="store_true",
                      help="skip entries with errors.")
    
    parser.set_defaults(
        filename_map = None,
        pattern_old = "(.+)",
        pattern_new = "%s",
        genome_file = None,
        filename_peptides = None,
        write_missed = None,
        filename_genes = None,
        filename_old_peptides = None,
        renumber = True,
        input_format = "alignment",
        contig_sizes_old = None,
        contig_sizes_new = None,
        skip_errors = None
        )

    (options, args) = E.Start( parser, add_pipe_options = True)

    predictor = PredictorExonerate()

    ## the different mapping criteria
    map_sbjcts = {}
    breakpoints = {}

    ################################################################################################
    map_transcript2gene = {}
    if options.filename_genes:
        infile = open(options.filename_genes, "r")
        for gene, transcript in map( lambda x: x[:-1].split("\t")[:2], filter( lambda x: x[0] != "#", infile.readlines())):
            map_transcript2gene[transcript] = gene
        infile.close()

    ################################################################################################
    peptides = {}
    if options.filename_peptides:
        peptides = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r"))
        options.stdlog.write( "# read %i peptide sequences.\n" % len(peptides))

    ################################################################################################
    ## read old query sequences and compare against new query sequences
    ## this can be used to build a map between old and new queries
    query_map_old2new = {}        
    if options.filename_old_peptides:
        old_peptides = Genomics.ReadPeptideSequences( open(options.filename_old_peptides, "r"))
        options.stdlog.write( "# read %i old peptide sequences.\n" % len(old_peptides))
        query_map_old2new, unmappable, unmapped = Genomics.MapSequences( old_peptides, peptides)
        options.stdlog.write( "# built map: unmappable=%i unmapped=%i.\n" % (len(unmappable), len(unmapped)))
        if options.loglevel >= 2:
            options.stdlog.write( "# unmappable: %s.\n" % ";".join(unmappable))
            options.stdlog.write( "# unmapped: %s.\n" % ";".join(unmapped))            

    ################################################################################################
    ## read old/new contig sizes for mapping positive/negative coordinates
    contig_sizes_old = {}
    contig_sizes_new = {}
    if options.contig_sizes_old:
        contig_sizes_old = Genomics.ReadContigSizes( open(options.contig_sizes_old, "r") )
    if options.contig_sizes_new:
        contig_sizes_new = Genomics.ReadContigSizes( open(options.contig_sizes_new, "r") )
        
    ################################################################################################        
    if options.filename_map:
        
        infile = open(options.filename_map)
        if options.input_format == "alignments":
            for line in infile:
                if line[0] == "#": continue

                x, old_token, old_from, old_to, old_ali, new_from, new_to, new_ali = line[:-1].split("\t")

                map_sbjcts[old_token] = (old_from, old_ali, new_from, new_ali)

            if options.loglevel >= 1:
                options.stdlog.write( "# read %i alignments.\n" % len(map_sbjcts))

        elif options.input_format == "offsets":
            ## input is a list of segments and their offsets.

            breakpoints, endpoints, offsets = ReadOffsets( infile )
            if options.loglevel >= 1:
                options.stdlog.write( "# read breakpoints for %i chromosomes.\n" % len(breakpoints))

        infile.close()
        
    ################################################################################################
    ################################################################################################
    ################################################################################################
    ## end of input section
    ################################################################################################
    ################################################################################################
    ################################################################################################        

    rx = re.compile(options.pattern_old)
    last_sbjct_token = None
    ninput = 0
    nerrors = 0
    nerrors_map = 0
    nerrors_inconsistencies = 0
    nerrors_boundaries = 0
    nerrors_translation = 0
    nerrors_inconsequential = 0
    nerrors_realigned = 0
    nmapped = 0
    nfiltered = 0
    naligned = 0
    noutput = 0
    found_transcripts = {}
    nduplicates = 0
    output = {}
    
    for line in sys.stdin:
        if line[0] == "#": continue
        
        entry = PredictionParser.PredictionParserEntry()

        entry.Read( line )
        
        ninput += 1
        is_positive = entry.mSbjctStrand == "+"
        
        is_error = False
        
        ## check if query token is mappable: using sequence map
        if (query_map_old2new and entry.mQueryToken not in query_map_old2new):
            options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
            nfiltered += 1
            continue
        else:
            ## check if query token is mappable: using filter        
            if (peptides and entry.mQueryToken not in peptides):
                options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
                nfiltered += 1
                continue

        new_sbjct_token = options.pattern_new % rx.search(entry.mSbjctToken).groups()[0]

        ##########################################################################################################
        ## Map via alignments
        if entry.mSbjctToken in map_sbjcts:
            nmapped += 1
            if last_sbjct_token != entry.mSbjctToken:
                old_from, old_ali, new_from, new_ali = map_sbjcts[entry.mSbjctToken]
                map_a2b = alignlib.makeAlignmentVector()
                alignlib.AlignmentFormatExplicit(
                    int(old_from), old_ali,
                    int(new_from), new_ali).copy( map_a2b )
                
            last_sbjct_token = entry.mSbjctToken
            
            if options.loglevel >= 3:
                print "#", str(entry)
                print "#", map_sbjcts[entry.mSbjctToken]
                sys.stdout.flush()

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            ## convert to forward coordinates:
            if is_positive:
                f, t= old_f, old_t
                first_res, last_res = f + 1, t                
            else:
                f, t = map_a2b.getRowTo() - old_f, map_a2b.getRowTo() - old_t 
                first_res, last_res = f, t + 1 
            
            ## map first and last residues
            mfirst_res = map_a2b.mapRowToCol( first_res )
            mlast_res = map_a2b.mapRowToCol( last_res )

            if (mfirst_res == 0 and old_f != 0) or (mlast_res == 0 and old_t != map_a2b.getRowTo() ):
                
                options.stderr.write("# mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      f, t))
                
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# %s\n" % str(entry))                
                options.stderr.flush()                
                nerrors_boundaries += 1
                is_error = True

                ## get extended boundaries for alignment later on
                while mfirst_res == 0 and first_res > 1:
                    first_res -= 1
                    mfirst_res = map_a2b.mapRowToCol(first_res)
                while mlast_res == 0 and last_res < map_a2b.getRowTo():
                    last_res += 1
                    mlast_res = map_a2b.mapRowToCol(last_res)

            ## convert to genomic coordinates            
            ## convert negative strand coordinates
            if is_positive:
                new_f = mfirst_res - 1
                new_t = mlast_res 
            else:
                new_f = mfirst_res
                new_t = mlast_res - 1
                
                new_f = map_a2b.getColTo() - new_f
                new_t = map_a2b.getColTo() - new_t

            ## Now map the alignment.
            try:
                MapAlignment( entry, map_a2b )
                
            except ValueError:
                options.stderr.write("# alignment mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.flush()
                nerrors_map += 1
                is_error= True
            
            if new_f != entry.mSbjctGenomeFrom or new_t != entry.mSbjctGenomeTo:
                options.stderr.write("# mapping inconsistency for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,                                      
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                
                nerrors_inconsistencies += 1
                is_error = True

        ##########################################################################################################
        ## Map via offsets
        if entry.mSbjctToken in breakpoints:

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            ## convert to forward coordinates:
            if is_positive:
                f, t= old_f, old_t
            else:
                f, t = contig_sizes_old[entry.mSbjctToken] - old_t, contig_sizes_old[entry.mSbjctToken] - old_f

            o1 = GetOffset( f,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )
            o2 = GetOffset( t,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )            

            if o1 != o2:
                options.stderr.write("# break within gene %s\n" % str(entry))
                nerrors_map += 1
                is_error = True
                
            f += o1
            t += o2

            if not is_positive:
                f, t = contig_sizes_new[entry.mSbjctToken] - t, contig_sizes_new[entry.mSbjctToken] - f

            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = f, t

            if entry.mSbjctGenomeFrom > entry.mSbjctGenomeTo:
                options.stderr.write("# mapping error: start after end %s\n" % str(entry))
                nerrors_map += 1
                is_error = True
        
        ##########################################################################################################
        ## do translation check, if genome is given
        if options.genome_file:
            genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo,
                                                            options.genome_file,
                                                            loglevel = 0)

            map_peptide2translation, translation = Genomics.Alignment2PeptideAlignment( \
                entry.mMapPeptide2Genome, entry.mQueryFrom, 0, genomic_sequence )

            if re.sub("X", "", translation) != re.sub("X", "", entry.mTranslation):
                options.stderr.write("# translation error for prediction %i on %s %s:%i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                if map_sbjcts:
                    options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mTranslation, translation))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mAlignmentString, Genomics.Alignment2String(entry.mMapPeptide2Genome)))                    
                nerrors_translation += 1
                is_error = True

                if peptides and entry.mQueryToken in peptides:
                    naligned += 1

                    options.stdlog.write( "# aligning: %s versus %s:%s: %i-%i\n" % ( \
                        entry.mQueryToken,
                        new_sbjct_token, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                    
                    # do a quick reprediction
                    if entry.mQueryToken in peptides:
                        genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                                        0, 0,
                                                                        genome_file = options.genome_pattern,                                                                        
                                                                        loglevel = 0)
                        predictor.mLogLevel = 0

                        
                        result =  predictor(entry.mQueryToken, peptides[entry.mQueryToken],
                                            entry.mSbjctToken, genomic_sequence,
                                            "--exhaustive --subopt FALSE --score '%s' " % str(80),
                                            new_f - 10, new_t + 10)
                        prediction_id = entry.mPredictionId
                        if result:
                            entry = result[0]
                            entry.mPredictionId = prediction_id
                            nerrors_realigned += 1
            else:
                if is_error:
                    nerrors_inconsequential += 1
                    
        entry.mSbjctToken = new_sbjct_token

        ## map query tokens
        if query_map_old2new:
            query_tokens = query_map_old2new[entry.mQueryToken]
        else:
            query_tokens = (entry.mQueryToken,)

        if options.skip_errors and is_error:
            continue

        for query_token in query_tokens:

            entry.mQueryToken = query_token
            
            prediction_id = entry.mPredictionId
            entry.mPredictionId = 0
            
            hid = Genomics.GetHID( str(entry) )
            if hid in output:
                nduplicates += 1
                continue
            
            noutput += 1                        
            if options.renumber: prediction_id = noutput

            entry.mPredictionId = prediction_id

            options.stdout.write( str(entry) + "\n")
            options.stdout.flush()
            found_transcripts[entry.mQueryToken] = 1

    ## write out found transcripts and genes
    nmissed_transcripts = 0
    missed_transcripts = []
    found_genes = {}
    if peptides:
        for x in peptides.keys():
            if x not in found_transcripts:
                nmissed_transcripts += 1
                missed_transcripts.append( x )
            else:
                found_genes[map_transcript2gene[x]] = 1

    missed_genes = {}
    nmissed_genes = 0
    if map_transcript2gene:

        for t in missed_transcripts:
            g = map_transcript2gene[t]
            if g not in found_genes:
                missed_genes[g] = 1
        nmissed_genes = len(missed_genes)
    
    if options.write_missed:
        outfile = open(options.write_missed, "w")
        for x in missed_transcripts:
            if x in unmapped:
                status = "unmapped"
            else:
                status = "mapped"
            outfile.write( "%s\t%s\t%s\n" % ("transcript", x, status ))
        for x in missed_genes:
            status = "unknown"
            outfile.write( "%s\t%s\t%s\n" % ("gene", x, status ))
        
        outfile.close()
        
    options.stdlog.write("# input=%i, output=%i, filtered=%i, nduplicates=%i, mapped=%i, errors=%i\n" % (\
         ninput, noutput, nfiltered, nduplicates, nmapped, nerrors ))
    options.stdlog.write("# errors: inconsequental=%i, boundaries=%i, mapping=%i, inconsistencies=%i, translation=%i, realigned=%i\n" % (\
       nerrors_inconsequential, nerrors_boundaries, nerrors_map, nerrors_inconsistencies, nerrors_translation, nerrors_realigned ))
    options.stdlog.write("# peptides: input=%i, found=%i, missed=%i, found_genes=%i, missed_genes=%i\n" % (\
        len(peptides), len(found_transcripts), nmissed_transcripts, len(found_genes), nmissed_genes) )
    
    E.Stop()
Ejemplo n.º 18
0
def main():
    """Command-line entry point for inspecting ADDA neighbours and profiles.

    The positional arguments are interpreted according to ``--method``:

    view
        Every argument is an integer nid.  All neighbours of each nid are
        written to ``options.stdout``, one per line.
    pileup
        ``args[0]`` names the query, either as a plain integer nid or as a
        token containing ``_`` that is parsed by ``AddaIO.toTuple``.  The
        multiple alignment built from its neighbours is written out.  A
        segment given in the token is parsed but not applied here.
    profile
        Like ``pileup``, but a profile is built from the multiple alignment
        and, if a segment was given, restricted to it before being written.
    align
        ``args[0]`` and ``args[1]`` are both parsed by ``AddaIO.toTuple``.
        The profiles of the two segments are aligned with a full dynamic
        programming aligner configured from ``--mode``, ``--gop`` and
        ``--gep``; a summary line and the explicit alignment are written.
    """

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option(
        "--method",
        dest="method",
        type="choice",
        choices=("view", "align", "pileup", "profile"),
        help="method to perform [default=%default].",
    )

    parser.add_option(
        "--mode", dest="mode", type="choice", choices=("global", "local"), help="alignment mode [default=%default]."
    )

    parser.add_option("--gop", dest="gop", type="float", help="gap opening penalty [default=%default].")

    parser.add_option("--gep", dest="gep", type="float", help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph, options.filename_index)

    alignlib.getDefaultToolkit().setEncoder(alignlib.getEncoder(alignlib.Protein20))
    alignlib.getDefaultToolkit().setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    alignlib.getDefaultToolkit().setLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _parseDomain(token):
        """Return (nid, start, end); start/end are None for a bare nid."""
        if "_" in token:
            return AddaIO.toTuple(token)
        return int(token), None, None

    if options.method == "view":
        for arg in args:
            # Bug fix: the loop used to convert args[0] on every iteration,
            # so only the first nid was ever displayed.
            nid = int(arg)

            for n in index.getNeighbours(nid):
                # Write through options.stdout like every other branch
                # (also replaces the Python-2-only print statement).
                options.stdout.write("%s\n" % str(n))

    elif options.method == "pileup":

        # The segment is parsed for symmetry with "profile" but is not
        # applied to the multiple alignment.
        nid, start, end = _parseDomain(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":

        nid, start, end = _parseDomain(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":

        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        align = AddaProfiles.AddaProfiles(config, fasta=fasta)

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

        def _buildProfile(nid, start, end):
            """Return (sequence, prepared profile restricted to start..end)."""
            neighbours = index.getNeighbours(nid)
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()

        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n"
            % (
                nid1,
                nid2,
                result.getScore(),
                result.getLength(),
                result.getNumGaps(),
                result.getRowFrom(),
                result.getRowTo(),
                result.getColFrom(),
                result.getColTo(),
            )
        )

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
Ejemplo n.º 19
0
	nsplits, 
	nstopcodons, 
	pidentity, 
	psimilarity, 
	sequence, 
	sbjct_genome_from, 
	sbjct_genome_to, 
	map_query2genome
    FROM %s AS p 
    WHERE p.sbjct_token = '%s' AND
    p.sbjct_strand = '%s' AND 
    OVERLAP( %i, %i, p.sbjct_genome_from, sbjct_genome_to) > 0 
    """

    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, param_gop, param_gep )
    map_reference2target = alignlib.makeAlignmentVector()    
    assignment_id = 0
    
    for line in cr.fetchall():

        reference = PredictionParser.PredictionParserEntry()
        reference.FillFromTable( line )

        ct = dbhandle.cursor()
        ct.execute( statement % (param_tablename_predictions_target,
                    reference.mSbjctToken, reference.mSbjctStrand,
                    reference.mSbjctGenomeFrom, reference.mSbjctGenomeTo ))

        reference_exons = Exons.Alignment2Exons( reference.mMapPeptide2Genome,
                                                 0, 
                                                 reference.mSbjctFrom)
Ejemplo n.º 20
0
def alignlibCombineVector():
    """Combine ``alignlib_vector`` with itself into a fresh alignment vector.

    ``alignlib_vector`` is defined outside this function.  The combination
    uses mode ``alignlib.RR``; the resulting vector is discarded.
    """
    combined = alignlib.makeAlignmentVector()
    alignlib.combineAlignment(
        combined,
        alignlib_vector,
        alignlib_vector,
        alignlib.RR)
Ejemplo n.º 21
0
    def read( self, line ):
        """Fill this entry from one tab-separated prediction line.

        The layout is chosen by the number of fields:

        * 26: prediction id, 23 core fields, alignment string, nassembled
        * 25: prediction id, 23 core fields, alignment string
        * 24: 23 core fields, alignment string
        * 23: 23 core fields only (``mAlignmentString`` is set to "")

        ``mPredictionId`` and ``mNAssembled`` are only assigned when the
        layout supplies them, but both are always converted with ``int``
        afterwards, so they must already exist on the instance for the
        shorter layouts.

        One trailing newline, if present, is removed before splitting.  If
        ``self.mExpand`` is true, the alignment objects
        ``mMapPeptide2Translation`` and ``mMapPeptide2Genome`` are built too.

        Raises:
            ValueError: if the line has an unsupported number of fields.
        """

        # Strip at most one newline; slicing unconditionally with [:-1] would
        # eat the last data character of a line without a terminator.
        if line.endswith("\n"):
            line = line[:-1]

        data = line.split("\t")
        nfields = len(data)

        if nfields in (25, 26):
            # Layouts with a leading prediction id.
            self.mPredictionId = data[0]
            core, extra = data[1:24], data[24:]
        elif nfields in (23, 24):
            core, extra = data[:23], data[23:]
        else:
            raise ValueError("unknown format: %i fields in line %s" % (nfields, line))

        ( self.mQueryToken, self.mSbjctToken, self.mSbjctStrand,
          self.mRank, self.score,
          self.mQueryFrom, self.mQueryTo, self.mQueryAli,
          self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli,
          self.mQueryLength, self.mQueryCoverage,
          self.mNGaps, self.mNFrameShifts, self.mNIntrons,
          self.mNSplits, self.mNStopCodons,
          self.mPercentIdentity, self.mPercentSimilarity,
          self.mTranslation,
          self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
          ) = core

        # extra holds [alignment string[, nassembled]] depending on layout.
        if extra:
            self.mAlignmentString = extra[0]
        else:
            self.mAlignmentString = ""
        if len(extra) > 1:
            self.mNAssembled = extra[1]

        (self.score, self.mQueryCoverage, self.mPercentIdentity, self.mPercentSimilarity) = map (\
            float, (self.score, self.mQueryCoverage, self.mPercentIdentity, self.mPercentSimilarity))

        (self.mPredictionId,
         self.mQueryFrom, self.mQueryTo, self.mQueryLength,
         self.mSbjctFrom, self.mSbjctTo,
         self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
         self.mNGaps, self.mNIntrons, self.mNSplits, self.mNStopCodons,
         self.mNFrameShifts, self.mNAssembled) = map (\
            int, ( self.mPredictionId,
                   self.mQueryFrom, self.mQueryTo, self.mQueryLength,
                   self.mSbjctFrom, self.mSbjctTo,
                   self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
                   self.mNGaps, self.mNIntrons, self.mNSplits, self.mNStopCodons,
                   self.mNFrameShifts, self.mNAssembled))

        if self.mExpand:
            self.mMapPeptide2Translation = alignlib.makeAlignmentVector()

            # Only build the peptide/translation map when both alignment
            # strings are present.
            if self.mQueryAli != "" and self.mSbjctAli != "":

                alignlib.AlignmentFormatExplicit(
                    self.mQueryFrom, self.mQueryAli,
                    self.mSbjctFrom, self.mSbjctAli).copy( self.mMapPeptide2Translation )

            self.mMapPeptide2Genome = Genomics.String2Alignment( self.mAlignmentString )
def EliminateRedundantEntries( rep, 
                               data,
                               eliminated_predictions,
                               options, 
                               peptides,
                               extended_peptides,
                               filter_quality = None,
                               this_quality = None ):
    """Eliminate members of ``data`` that are redundant with ``rep``.

    Each member not already present in ``eliminated_predictions`` is
    compared with the representative:

    * identical extended sequence: eliminated with code "i"
    * extended sequence contained in the representative's: code "p"
    * otherwise, if the member's quality differs from ``this_quality`` or is
      listed in ``options.quality_exclude_same``, the peptide sequences are
      locally aligned and the member is eliminated with code "h" (percent
      identity >= ``options.min_identity``) or "l" (lower threshold
      ``options.min_identity_non_genes``, representative is of gene quality
      and the member is not), unless a coverage / identity / gap safety
      check says the member may be the better prediction; in that case only
      a warning is logged.

    ``eliminated_predictions`` is updated in place (member id -> rep id).
    ``filter_quality`` is accepted but not used.

    Returns a list of ``(member_id, code)`` tuples for the members
    eliminated by this call.
    """

    removed = []

    rep_id = rep.transcript_id
    rep_coverage = rep.mQueryCoverage
    rep_pid = rep.mPid

    aligner = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, options.gop, options.gep )
    ali = alignlib.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    def _eliminate( member_id, code ):
        # record the elimination both globally and for this call
        eliminated_predictions[member_id] = rep_id
        removed.append( (member_id, code) )

    for member in data:

        mem_id = member.transcript_id
        mem_coverage = member.mQueryCoverage
        mem_pid = member.mPid
        mem_quality = member.mQuality

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write( "# processing: id=%s class=%s\n" % (mem_id, mem_quality))

        # already removed in favour of some representative
        if mem_id in eliminated_predictions:
            continue

        # identical extended sequence
        if mem_extended_seq == rep_extended_seq:
            _eliminate( mem_id, "i" )
            continue

        # extended sequence is a part of the representative's
        if mem_extended_seq in rep_extended_seq:
            _eliminate( mem_id, "p" )
            continue

        # members of the same quality class are only compared by alignment
        # if that class is listed in quality_exclude_same
        if not (mem_quality != this_quality or
                mem_quality in options.quality_exclude_same):
            continue

        seq1 = alignlib.makeSequence( str(rep_seq) )
        seq2 = alignlib.makeSequence( str(mem_seq) )

        aligner.align( ali, seq1, seq2 )

        if options.loglevel >= 5:
            options.stdlog.write( "# ali\n%s\n" % alignlib.AlignmentFormatExplicit( ali, seq1, seq2 ) )

        pidentity = 100 * alignlib.calculatePercentIdentity( ali, seq1, seq2 )
        num_gaps = ali.getNumGaps()

        if options.loglevel >= 4:
            options.stdlog.write( "# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n" %\
                                      ( mem_id, mem_quality, pidentity, rep_coverage, mem_coverage ) )

        if pidentity >= options.min_identity:

            # reason stays None unless one of the safety checks fires
            reason = None
            if rep_coverage < mem_coverage - options.safety_coverage or \
               rep_pid < mem_pid - options.safety_pide:
                reason = "covpid"
            elif num_gaps >= options.max_gaps and \
                 mem_coverage > rep_coverage - options.safety_coverage:
                reason = "gaps"
            elif mem_coverage >= rep_coverage - options.safety_coverage and \
                     100 * (ali.getColTo() - ali.getColFrom()) / len(mem_seq) < options.max_member_coverage:
                reason = "memcov"

            if reason is None:
                _eliminate( mem_id, "h" )
            else:
                options.stdlog.write( "# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %\
                      (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid) )
            continue

        if pidentity >= options.min_identity_non_genes and \
               this_quality in options.quality_genes and \
               mem_quality not in options.quality_genes:

            if rep_coverage < mem_coverage - options.safety_coverage or \
               rep_pid < mem_pid - options.safety_pide:
                options.stdlog.write( "# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %\
                      (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid) )
            else:
                _eliminate( mem_id, "l" )

    return removed
Ejemplo n.º 23
0
    def checkLinkZScore( self,
                         query_nid, query_from, query_to,
                         sbjct_nid, sbjct_from, sbjct_to):
        """Decide via a z-score whether two domains are homologous.

        The profiles of both nids are restricted to the given segments and
        aligned with ``self.mAlignator``.  Empty alignments and scores below
        ``self.mMinAlignmentScore`` are rejected outright; scores above
        ``self.mSafetyThreshold * self.mMinAlignmentScore`` are accepted
        outright.  Anything in between is judged by a z-score computed with
        ``self.mNumIterationsZScore`` iterations and compared against
        ``self.mMinZScore``.

        Returns a tuple ``(is_homologous, alignment, info)`` where ``info``
        is a 1-tuple holding either "na" or the formatted z-score.  If a
        profile is missing, ``self.mNNotFound`` is incremented and the link
        is rejected.
        """

        alignment = alignlib.makeAlignmentVector()

        query_profile = self.getAlignandum( query_nid )
        sbjct_profile = self.getAlignandum( sbjct_nid )

        if not (query_profile and sbjct_profile):
            self.warn( "could not compute link %s_%i_%i - %s_%i_%i\n" % \
                       (query_nid, query_from, query_to,
                        sbjct_nid, sbjct_from, sbjct_to) )
            self.mNNotFound += 1
            return False, alignment, ("na",)

        def _releaseSegments():
            # undo the segment restriction on both profiles
            query_profile.useSegment()
            sbjct_profile.useSegment()

        query_profile.useSegment( query_from, query_to )
        sbjct_profile.useSegment( sbjct_from, sbjct_to )

        self.mAlignator.align( alignment, query_profile, sbjct_profile )

        self.debug( "# --> %s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i" %\
                    (query_nid, sbjct_nid,
                     alignment.getScore(),
                     alignment.getLength(),
                     alignment.getNumGaps(),
                     alignment.getRowFrom(), alignment.getRowTo(),
                     alignment.getColFrom(), alignment.getColTo()))

        # Clear-cut cases are settled without a z-score; None means undecided.
        shortcut = None
        if alignment.getLength() == 0:
            shortcut = False
        elif alignment.getScore() < self.mMinAlignmentScore:
            shortcut = False
        elif alignment.getScore() > self.mSafetyThreshold * self.mMinAlignmentScore:
            shortcut = True

        if shortcut is not None:
            _releaseSegments()
            return shortcut, alignment, ("na",)

        z_params = alignlib.makeNormalDistributionParameters()
        alignlib.calculateZScoreParameters( z_params,
                                            query_profile,
                                            sbjct_profile,
                                            self.mAlignator,
                                            self.mNumIterationsZScore)

        mean = z_params.getMean()
        raw_stddev = z_params.getStandardDeviation()
        # guard against division by zero
        stddev = 1 if raw_stddev == 0 else raw_stddev

        zscore = (alignment.getScore() - mean) / stddev

        self.debug( "--> mean=%f, stdev=%f, zscore=%f" % (mean, stddev, zscore) )

        _releaseSegments()

        return bool(zscore > self.mMinZScore), alignment, ( "%5.2f" % zscore,)
Ejemplo n.º 24
0
 def Expand( self ):
     """Build ``self.mMapOld2New`` from the stored alignment strings.

     A fresh alignment vector is attached to the instance and then filled
     from the emissions-format description given by ``mOldFrom``/``mOldAli``
     and ``mNewFrom``/``mNewAli``.
     """
     old2new = alignlib.makeAlignmentVector()
     self.mMapOld2New = old2new
     emissions = alignlib.AlignmentFormatEmissions(
         self.mOldFrom, self.mOldAli,
         self.mNewFrom, self.mNewAli)
     emissions.copy( old2new )
Ejemplo n.º 25
0
def main():
    """Report alignment regions that are backed by low-quality genomic bases.

    Reads a psl-formatted map (``--filename-map``) from alignment entries to
    the genome, opens an indexed quality file (``--quality-file``) and then
    processes tab-separated lines ``cluster_id, gene_id, alignment`` from
    stdin.  For every aligned character whose mapped genomic position has a
    quality score below ``--quality-threshold`` the alignment position is
    collected; with the default ``frame`` of 3 the whole enclosing triplet
    is collected.  The collected positions are grouped with
    ``Iterators.group_by_distance`` and written to stdout as
    ``cluster_id, start, end``.

    With ``--random`` the quality scores of the aligned positions are
    shuffled before thresholding.

    Raises:
        ValueError: if ``--filename-map`` was not supplied or the map
            contains the same query id twice.

    The counters ``ninput``, ``noutput`` and ``nmissed`` are maintained but
    not reported within this function body.
    """

    parser = E.OptionParser( version = "%prog version: $Id: quality2masks.py 2781 2009-09-10 11:33:14Z andreas $", usage = globals()["__doc__"])

    parser.add_option("--quality-threshold", dest="quality_threshold", type="int",
                      help="quality threshold for masking positions [default=%default]" )

    parser.add_option("--random", dest="random", action="store_true",
                      help="shuffle quality scores before masking [default=%default]" )

    parser.add_option("--filename-map", dest="filename_map", type="string",
                      help="filename in psl format mapping entries in multiple alignment to the genome [default=%default]" )

    parser.add_option("-q", "--quality-file", dest="quality_file", type="string",
                      help="filename with genomic base quality information [default=%default]."  )


    parser.set_defaults(
        quality_threshold = 40,
        quality_file = "quality",
        filename_map = None,
        frame = 3,
        )

    (options, args) = E.Start( parser )

    # fail early with a readable message instead of the TypeError
    # that open(None) would produce
    if not options.filename_map:
        raise ValueError( "please supply a map in psl format with --filename-map" )

    ##################################################
    ## read map
    ##################################################
    map_genes2genome = {}
    # the context manager closes the file even if parsing fails
    with open(options.filename_map) as infile:
        for match in Blat.iterator( infile ):
            # explicit check: an assert would be stripped under -O
            if match.mQueryId in map_genes2genome:
                raise ValueError( "duplicate entry %s" % match.mQueryId )
            map_genes2genome[match.mQueryId] = match

    ##################################################
    ## get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta( options.quality_file )
    quality.setTranslator( IndexedFasta.TranslatorBytes() )

    ##################################################
    ## main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write( "cluster_id\tstart\tend\n" )

    for line in options.stdin:
        # skip the header line
        if line.startswith("cluster_id"): continue
        ninput += 1
        cluster_id, gene_id, alignment = line[:-1].split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn( "gene_id %s not found in map." % gene_id )
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib.makeAlignmentVector()
        fillAlignment( map_gene2mali, alignment )

        # get quality scores
        try:
            quality_scores = quality.getSequence( match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)
        except ValueError as msg:
            nmissed += 1
            E.warn( "could not retrieve quality scores for %s:%i-%i: %s" % (match.mSbjctId, match.mSbjctFrom, match.mSbjctTo, msg) )
            continue

        map_mali2genome = alignlib.makeAlignmentVector()
        alignlib.combineAlignment( map_mali2genome, map_gene2mali, map_gene2genome, alignlib.RR )

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp,c in enumerate(alignment):
                if c == "-": continue
                y = map_mali2genome.mapRowToCol( fp ) - match.mSbjctFrom
                if y < 0: continue
                positions.append( y )
            scores = [ quality_scores[ x ] for x in positions ]
            random.shuffle(scores)
            for p,q in zip( positions,scores): quality_scores[p] = q

        to_mask = []
        # rp counts down from the end of the alignment while fp counts up;
        # it is the position reported for entries on the negative strand,
        # whose alignment string was reversed above.
        rp = len(alignment)
        for fp,c in enumerate(alignment):
            rp -= 1
            if c == "-": continue
            # offset into quality_scores, which was fetched starting at mSbjctFrom
            y = map_mali2genome.mapRowToCol( fp ) - match.mSbjctFrom
            if y < 0: continue
            if quality_scores[y] < options.quality_threshold:
                if is_negative: p = rp
                else: p = fp
                E.debug( "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i" % \
                             (cluster_id, p, c, match.mSbjctId, match.strand, map_mali2genome.mapRowToCol( fp ), quality_scores[y] ) )
                if options.frame > 1:
                    # mask the complete frame-sized window containing p
                    start = (p // options.frame) * options.frame
                    to_mask.extend( range(start, start + options.frame) )
                else:
                    to_mask.append( p )

        regions = Iterators.group_by_distance( sorted(to_mask) )

        for start,end in regions:
            options.stdout.write( "%s\t%i\t%i\n" % (cluster_id, start, end ) )

        noutput += 1
Ejemplo n.º 26
0
    ninput, noutput, nskipped = 0, 0, 0
    
    options.stdout.write( "query\tsbjct\tquery_from\tquery_to\tsbjct_from\tsbjct_to\tquery_starts\tsbjct_starts\tblock_sizes\n" )

    while 1:
        try:
            cur_record = iterator.next()
        except StopIteration:
            break

        ninput += 1
        
        sequence = re.sub( " ", "", cur_record.sequence)
        l = len(sequence)

        map_sequence2mali = alignlib.makeAlignmentVector()        

        alignlib.AlignmentFormatExplicit( 0, sequence,
                                          0, "X" * l ).copy( map_sequence2mali )

        options.stdout.write( "\t".join( (
                cur_record.title,
                "ref",
                str( alignlib.AlignmentFormatBlocks( map_sequence2mali ) ) ) ) + "\n" )
        
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" % (ninput, noutput, nskipped))
        
    E.Stop()
Ejemplo n.º 27
0
def _alignToProfile(infile, outfile, min_score=0,
                    mali_filename="../data/mouse.fasta"):
    '''align sequences in *infile* against a reference multiple alignment.

    The reference alignment is read from *mali_filename* and converted
    into a profile after two pseudo-sequences per nucleotide (``ACGT``)
    have been added.  Each sequence in *infile* and the sequence returned
    by ``Genomics.complement`` for it are aligned against the profile;
    the higher scoring alignment is kept (ties go to the complemented
    sequence).

    Alignments of length 0 or with a score below *min_score* are skipped,
    i.e. alignments scoring at least *min_score* are accepted.

    Output the multiple alignment in fasta format to *outfile* and a
    tab-separated table with one row per accepted sequence to
    :file:`outfile.info`.

    Note that this function changes alignlib's default encoder and
    log-oddor as a side effect.
    '''

    mali = Mali.Mali()
    # close the reference file as soon as it has been parsed
    with open(mali_filename) as mali_file:
        mali.readFromFile(mali_file)
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts: two full-length homopolymer rows per nucleotide
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP

    alignator = alignlib.makeAlignatorDPFull(alignment_mode, -5.0, -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # seed the result alignment with the reference alignment itself
    build_mali = alignlib.makeMultAlignment()
    m = alignlib.makeAlignmentVector()
    m.addDiagonal(0, n, 0)
    build_mali.add(src_mali, m)

    # both output files are closed on every exit path
    with open(outfile, "w") as outf, \
            open(outfile + ".info", "w") as outf_log:

        outf_log.write(
            "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n"
        )

        # ungapped sequences and identifiers, in the order in which rows
        # are added to build_mali
        sequences = alignlib.StringVector()
        ids = []

        for pid in mali.getIdentifiers():
            sequences.append(re.sub("-", "", mali[pid]))
            ids.append(pid)

        c = E.Counter()

        with open(infile) as reads:
            for s in FastaIterator.FastaIterator(reads):

                E.debug("adding %s" % s.title)
                c.input += 1
                rsequence = Genomics.complement(s.sequence)
                seq = alignlib.makeSequence(s.sequence)
                rseq = alignlib.makeSequence(rsequence)

                alignator.align(map_seq2profile, seq, profile)
                alignator.align(map_rseq2profile, rseq, profile)

                # keep the better scoring orientation
                if map_seq2profile.getScore() > map_rseq2profile.getScore():
                    m, sequence = map_seq2profile, s.sequence
                else:
                    m, sequence = map_rseq2profile, rsequence

                if m.getLength() == 0:
                    c.skipped += 1
                    continue

                if m.getScore() < min_score:
                    c.skipped += 1
                    continue

                r = getParts(m)

                # every part becomes its own row in the output alignment
                covered = 0
                for mm in r:
                    build_mali.add(mm)
                    sequences.append(sequence)
                    ids.append(s.title)
                    covered += mm.getLength() - mm.getNumGaps()

                mali_covered = m.getColTo() - m.getColFrom()

                outf_log.write("\t".join(
                    map(str, (s.title, len(s.sequence),
                              m.getRowFrom(), m.getRowTo(),
                              len(r), covered, "%5.2f" %
                              (100.0 * covered / len(s.sequence)),
                              m.getScore(),
                              m.getColFrom(), m.getColTo(),
                              mali_covered, "%5.2f" %
                              ((100.0 * mali_covered) /
                               mali.getNumColumns())))) +
                               "\n")

                c.output += 1

        result = str(
            alignlib.MultAlignmentFormatPlain(build_mali, sequences,
                                              alignlib.UnalignedStacked))

        # each output row is "start<TAB>sequence<TAB>end"
        for pid, data in zip(ids, result.split("\n")):
            start, sequence, end = data.split("\t")
            outf.write(">%s/%i-%i\n%s\n" %
                       (pid, int(start) + 1, int(end), sequence))

    E.info("%s\n" % str(c))
Ejemplo n.º 28
0
def AlignCodonBased( seq_wobble, seq_cds, seq_peptide, map_p2c, options,
                     diag_width = 2, max_advance = 2 ):
    """advance in codons in seq_wobble and match to nucleotides in seq_cds.

    The result is written into map_p2c, which is cleared first and then
    filled with (wobble position, cds position, score) pairs.

    Original note: due to alignlib this is all in one-based coordinates.
    NOTE(review): both cursors below start at 0 - confirm which
    coordinate convention actually applies.

    Takes care of frameshifts: whenever the substitution score of the
    current pair is <= 0, a window around the position is re-aligned with
    AlignExhaustive and the result replaces that region of map_p2c.

    seq_wobble, seq_cds and seq_peptide are used through asString(),
    asResidue(), asChar() and getLength(); options supplies loglevel and
    stdlog.  diag_width and max_advance are not used in this body.

    Raises ValueError if the same re-alignment window is chosen twice in
    a row, if a re-aligned or final pair has a negative score, or if the
    re-alignment ends before the problematic residue and more than the
    last codon is missing.
    """
    
    map_p2c.clear()

    # gop/gep are assigned here but not referenced again in this function
    gop, gep = -1.0, -1.0
    # NOTE(review): meaning of the numeric arguments (1, -10, 1) is defined
    # by alignlib and not visible here - confirm against its documentation
    matrix = alignlib.makeSubstitutionMatrixBackTranslation( 1, -10, 1, alignlib.getDefaultEncoder() )

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()
    
    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    # x: cursor in seq_wobble, y: cursor in seq_cds
    y = 0
    x = 0

    # start of the previous re-alignment window, used to detect an endless loop
    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue( x )
        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X": 
            x += 1
            continue

        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N": 
            x += 1
            continue

        # skip over gaps in wobble 
        if seq_wobble.asChar(x) == "-": 
            x += 1
            continue

        # score of the current wobble/cds residue pair
        s = matrix.getValue( xr, seq_cds.asResidue(y) )

        if options.loglevel >= 6:
            # at the first position of a wobble codon, log the cds codon
            # starting at y together with the peptide residue
            if (x % 3 == 0):
                c = seq_cds.asChar(y) + seq_cds.asChar(y+1) + seq_cds.asChar(y+2)
                options.stdlog.write( "# c=%s, x=%i, y=%i, aa=%s target=%s\n" % (c, x, y,
                                                                                 Genomics.MapCodon2AA( c ),
                                                                                 pep_seq[int(x/3)]) )
                                      
            options.stdlog.write( "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" % \
                                      (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), str(s) ))
            
        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib.makeAlignmentVector()

            ## backtrack to previous three codons and align
            ## three codons for double frameshifts that span two codons and
            ## produce two X's and six WWWWWW.

            ## number of nucleotides to extend (should be multiple of 3)
            ## less than 12 caused failure for some peptides.
            d = 15
            
            # extend by amount dx: back to the codon start plus d nucleotides
            dx = (x % 3) + d
            
            x_start = max(0, x - dx )
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0, map_p2c.mapRowToCol( x_start, alignlib.RIGHT ))

            # the same window twice in a row means no progress was made
            if (x_start, y_start) == last_start:
                raise ValueError( "infinite loop detected" )

            last_start = (x_start, y_start)

            # window of at most 2 * d residues on either sequence
            x_end = min(x_start + 2 * d, len(wobble_seq) )
            y_end = min(y_start + 2 * d, len(cds_seq) )

            wobble_fragment = alignlib.makeSequence(wobble_seq[x_start:x_end])
            cds_fragment = alignlib.makeSequence(cds_seq[y_start:y_end])
            
            AlignExhaustive( wobble_fragment, cds_fragment, "", tmp_map_p2c, options )

            if options.loglevel >= 10:
                 options.stdlog.write("# fragmented alignment from %i-%i, %i-%i:\n%s\n" % (x_start, x_end,
                                                                                           y_start, y_end,
                                                                                           str(alignlib.AlignmentFormatExplicit( tmp_map_p2c,
                                                                                                                                 wobble_fragment, 
                                                                                                                                 cds_fragment ))))
                 
                 options.stdlog.flush()

            ## clear alignment
            map_p2c.removeRowRegion( x_start, x_end )
            # ngap counts consecutive window rows without an aligned column
            ngap = 0
            last_x, last_y = None, None
            # copy the window alignment back, shifting window-local
            # coordinates by (x_start, y_start); x, y and xr are left at
            # the last pair that was copied
            for xxx in range( tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo() ):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue( seq_wobble.asResidue(x), seq_cds.asResidue(y) )
                    if s < 0:
                        raise ValueError("mismatched residue wobble: %i (%s), cds: %i (%s)" % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))
                    
                    map_p2c.addPair( x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write( "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), s ))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat special case of double frameshifts. They might cause a peptide/wobble residue
                # to be eliminated and thus the translated sequences will differ.
                # simply delete the last residue between x and y and move to next codon.
                # NOTE(review): last_x is still None if three gap rows occur before
                # any pair was copied - confirm this cannot happen.
                if ngap == 3:
                    map_p2c.removeRowRegion( last_x, last_x + 1 )

                    last_x += 1
                    map_p2c.addPair( last_x, last_y )
                    if options.loglevel >= 6:
                        options.stdlog.write( "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (last_x, seq_wobble.asChar(last_x), last_y, seq_cds.asChar(last_y), xr, seq_cds.asResidue(last_y), s ))
                        options.stdlog.flush()                    
                    ngap = 0
                    
            ## exit condition if alignment is shorter than problematic residue
            ## need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    ## only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")
                    
            s = 0
            
        # re-score the pair at the current cursors; after a re-alignment
        # these are the last pair copied back from the window
        s = matrix.getValue( xr, seq_cds.asResidue(y) )

        if s < 0:
            raise ValueError("mis-matching residues.")
        
        map_p2c.addPair( x, y, float(s) )
        
        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert( map_p2c.getRowTo() <= seq_wobble.getLength() )
    assert( map_p2c.getColTo() <= seq_cds.getLength() )
Ejemplo n.º 29
0
    def Add( self, const_other,
             combine_contig = False,
             allow_overlap = False,
             contig_size = 0,
             combine_queries = False,
             as_intron = False ):
        """add another entry to this one.

        The merge is computed on expanded working copies of both entries
        (``getCopy()`` followed by ``Expand()``); ``const_other`` itself is
        not modified.  The merged result is written back to ``self``.

        This procedure allows to add

        - predictions on different contigs if combine_contig = True
        - overlapping predictions on the same query if allow_overlap = True
        - results from different queries if combine_queries = True

        - if as_intron is set to true, the new fragment is joined with an
          "I" entry in the peptide-to-genome map, otherwise with "P".

        contig_size is added to the genomic coordinates of ``const_other``
        when the two entries lie on different contigs.

        Raises ValueError if

        - the queries overlap and allow_overlap is False,
        - the queries differ and combine_queries is False,
        - subject token or strand differ and combine_contig is False,
        - the genomic ranges overlap and combine_contig is False.
        """

        ## create working copies of each prediction
        other = const_other.getCopy()
        this  = self.getCopy()

        other.Expand()
        this.Expand()

        if as_intron:
            code = "I"
        else:
            code = "P"

        ## stays 0 for different queries, so that the cross-contig
        ## branch below can always refer to it
        query_overlap = 0

        ## check for query overlaps
        if this.mQueryToken == other.mQueryToken:

            query_overlap = max( 0, min(this.mQueryTo, other.mQueryTo) -\
                                 max(this.mQueryFrom, other.mQueryFrom) + 1)

            if query_overlap > 0:

                if not allow_overlap:
                    raise ValueError( "refusing to add overlapping entries: overlap = %i, queries:\n%s\n%s\n, set allow_overlap = True " % (query_overlap, str(this), str(other)) )

                overlap = query_overlap
                ## if queries overlap, truncate this before adding the other
                this.mMapPeptide2Translation.removeRowRegion( this.mQueryTo - overlap + 1, this.mQueryTo )
                other.mMapPeptide2Translation.moveAlignment( 0, -overlap )
                this.mQueryTo -= overlap
                this.mTranslation = this.mTranslation[:-overlap]

                ## remove aligned residues from the back: drop whole
                ## entries while they fit into the overlap ...
                for x in range(len(this.mMapPeptide2Genome) - 1, 0, -1):
                    if this.mMapPeptide2Genome[x][1] <= overlap:
                        overlap -= this.mMapPeptide2Genome[x][1]
                        del this.mMapPeptide2Genome[x]
                    else:
                        break
                ## ... and shorten the last remaining entry by the rest
                ## (three nucleotides per residue)
                this.mMapPeptide2Genome[-1] = (this.mMapPeptide2Genome[-1][0],
                                               this.mMapPeptide2Genome[-1][1] - overlap,
                                               this.mMapPeptide2Genome[-1][2] - overlap * 3)

        elif not combine_queries:
            raise ValueError( "refusing to add different queries - set combine_queries = True." )

        if this.mSbjctToken != other.mSbjctToken or \
               this.mSbjctStrand != other.mSbjctStrand :
            if combine_contig:
                this.mSbjctToken += "-" + other.mSbjctToken
                this.mSbjctStrand += other.mSbjctStrand
            else:
                raise ValueError( "can not add different sbjct." )

        sbjct_overlap = max(0, min(this.mSbjctGenomeTo, other.mSbjctGenomeTo) -\
                            max(this.mSbjctGenomeFrom, other.mSbjctGenomeFrom))

        if sbjct_overlap > 0:
            if not combine_contig:
                raise ValueError( "refusing to add overlapping entries: overlap = %i, sbjct:\n%s\n%s\n" % (sbjct_overlap, str(this), str(other)) )

        if this.mSbjctToken == other.mSbjctToken:

            ## set precedence: the entry starting first on the genome comes first
            if this.mSbjctGenomeFrom < other.mSbjctGenomeFrom:
                first = this
                second = other
            else:
                first = other
                second = this

            ## get length of gap
            d_na = second.mSbjctGenomeFrom - first.mSbjctGenomeTo

            if this.mQueryToken != other.mQueryToken:
                d_aa = first.mQueryLength - first.mQueryTo
                # create a new virtual query by concatenating
                # the two queries
                this.mQueryToken += "-" + other.mQueryToken

                # sort out the alignment
                second.mMapPeptide2Translation.moveAlignment( first.mQueryLength, 0 )

                this.mQueryLength = first.mQueryLength + second.mQueryLength

            else:
                d_aa = second.mQueryFrom - first.mQueryTo - 1

            this.mSbjctGenomeFrom = min(this.mSbjctGenomeFrom, other.mSbjctGenomeFrom )
            this.mSbjctGenomeTo = max(this.mSbjctGenomeTo, other.mSbjctGenomeTo )

            this.mMapPeptide2Genome = first.mMapPeptide2Genome + [(code, d_aa, d_na)] + second.mMapPeptide2Genome
            this.mTranslation = first.mTranslation + second.mTranslation

            second.mMapPeptide2Translation.moveAlignment( 0, first.mSbjctTo - 1 )

        else:
            ## join on different contigs
            d_na = contig_size - this.mSbjctGenomeTo + other.mSbjctGenomeFrom + query_overlap * 3
            d_aa = other.mQueryFrom - this.mQueryTo - 1
            this.mMapPeptide2Genome += [(code, d_aa, d_na),] + other.mMapPeptide2Genome
            this.mTranslation += other.mTranslation
            other.mMapPeptide2Translation.moveAlignment( 0, this.mSbjctTo - 1 )

            this.mSbjctGenomeTo = contig_size + other.mSbjctGenomeTo

        ## now fill self from this. The merged state lives in ``this`` in
        ## both branches above; ``first`` only exists in the same-contig
        ## branch and may refer to ``other``.
        self.mQueryToken = this.mQueryToken
        self.mQueryLength = this.mQueryLength

        ## number of aligned (non-gap) positions before merging
        nthis  = this.mMapPeptide2Translation.getLength() - this.mMapPeptide2Translation.getNumGaps()
        nother = other.mMapPeptide2Translation.getLength() - other.mMapPeptide2Translation.getNumGaps()

        self.mMapPeptide2Genome = this.mMapPeptide2Genome
        self.mSbjctGenomeFrom = this.mSbjctGenomeFrom
        self.mSbjctGenomeTo= this.mSbjctGenomeTo

        ## there might be some reference counting issues, thus
        ## do it the explicit way.
        alignlib.addAlignment2Alignment( this.mMapPeptide2Translation, other.mMapPeptide2Translation)
        self.mMapPeptide2Translation = alignlib.makeAlignmentVector()
        alignlib.addAlignment2Alignment( self.mMapPeptide2Translation, this.mMapPeptide2Translation )

        self.mTranslation = this.mTranslation

        self.mQueryFrom = self.mMapPeptide2Translation.getRowFrom()
        self.mQueryTo = self.mMapPeptide2Translation.getRowTo()
        self.mSbjctFrom = self.mMapPeptide2Translation.getColFrom()
        self.mSbjctTo = self.mMapPeptide2Translation.getColTo()

        self.mQueryCoverage = 100.0 * (self.mQueryTo - self.mQueryFrom + 1) / float(self.mQueryLength)

        ## space separated fields of all entries, entries separated by space
        self.mAlignmentString = " ".join(
            [ " ".join( map(str, x) ) for x in self.mMapPeptide2Genome ] )

        f = alignlib.AlignmentFormatEmissions( self.mMapPeptide2Translation )
        self.mQueryAli, self.mSbjctAli = f.mRowAlignment, f.mColAlignment

        ## summary parameters
        self.mRank = max( this.mRank, other.mRank)
        self.score += other.score
        self.mNGaps += other.mNGaps
        self.mNFrameShifts += other.mNFrameShifts
        self.mNIntrons += other.mNIntrons + 1
        self.mNStopCodons += other.mNStopCodons

        nnew = self.mMapPeptide2Translation.getLength() - self.mMapPeptide2Translation.getNumGaps()

        ## weighted average over the aligned positions of both parts, capped at 100
        self.mPercentIdentity = min( 100.0, (self.mPercentIdentity * nthis + other.mPercentIdentity * nother) / nnew )
        self.mPercentSimilarity = min( 100.0, (self.mPercentSimilarity * nthis + other.mPercentSimilarity * nother) / nnew )

        self.mNAssembled += 1 + other.mNAssembled
Ejemplo n.º 30
0
def PrintCluster( cluster,
                  cluster_id,
                  lengths,
                  peptide_sequences = None,
                  regex_preferred = None):
    """write one tab-separated line per cluster member to stdout.

    The representative is the member with the largest entry in *lengths*
    (members without an entry count as length 0).  If *regex_preferred*
    is given and a member matching it has a length above 0, the longest
    matching member is used instead.

    Each line holds representative, member and member length.  If
    *peptide_sequences* is given, a fourth column with the alignment of
    representative to member in emissions format is appended.

    Returns *cluster_id* unchanged.
    """

    preferred = re.compile(regex_preferred) if regex_preferred else None

    # longest member overall and longest member matching the pattern
    longest, longest_size = None, 0
    favourite, favourite_size = None, 0
    for candidate in cluster:
        size = lengths.get(candidate, 0)

        if size > longest_size:
            longest, longest_size = candidate, size

        if preferred and preferred.search(candidate) and size > favourite_size:
            favourite, favourite_size = candidate, size

    rep = favourite if favourite_size > 0 else longest

    for mem in cluster:
        columns = [rep, mem, lengths.get(mem, 0)]

        if peptide_sequences:
            map_rep2mem = alignlib.makeAlignmentVector()

            if rep == mem and rep in lengths:
                # the representative maps onto itself along the diagonal
                alignlib.addDiagonal2Alignment( map_rep2mem, 1, lengths[rep], 0)
            elif mem in peptide_sequences and rep in peptide_sequences:
                aligner = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -1.0)
                row = alignlib.makeSequence( peptide_sequences[rep] )
                col = alignlib.makeSequence( peptide_sequences[mem] )
                aligner.align( map_rep2mem, row, col )

            columns.append( alignlib.AlignmentFormatEmissions( map_rep2mem ) )

        sys.stdout.write( "\t".join( map(str, columns) ) + "\n" )

    sys.stdout.flush()

    return cluster_id
Ejemplo n.º 31
0
            def __init__(self):
                self.mAlignator1 = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -2.0 )
                self.mAlignator = alignlib.makeAlignatorIterative( self.mAlignator1, options.iterative_min_score )

            def align(self, query, sbjct, map_query2sbjct):
                xrow = alignlib.makeSequence(query.asString())
                xcol = alignlib.makeSequence(sbjct.asString())
                self.mAlignator.align( xrow, xcol, map_query2sbjct)
                
        alignator = AlignatorSequence()
    elif options.alignment_mode == "compass":
        alignator = AlignatorCompass()
    else:
        raise "unknown alignment mode %s" % options.alignment_mode

    map_query2sbjct = alignlib.makeAlignmentVector()

    def __align( query_profile, sbjct_profile ):
        """align two profiles and output the result."""
        
        alignator.align( query_profile, sbjct_profile, map_query2sbjct )
        
        blocks = alignlib.AlignedBlocks( map_query2sbjct )
        
        if options.loglevel >= 3:
            options.stdlog.write( str(map_query2sbjct) )

        if map_query2sbjct.getLength() > 0:
            options.stdout.write("%s\t%s\t%i\t%s\n" % (
                    query, sbjct, map_query2sbjct.getScore(), str(blocks) ) )
            return 1
Ejemplo n.º 32
0
def IsParalogLink( link, cds1, cds2 ):
    """decide whether a link between two transcripts looks paralogous.

    Used to sort out ortholog relationships between transcripts of
    orthologous genes.  The alignment stored in *link* is used to map the
    exon starts of *cds1* (all exons but the first) onto the second
    sequence, where they are compared to the exon starts of *cds2*.  A
    boundary counts as found if a boundary of the other list lies within
    ``param_boundaries_max_slippage``.

    If at least one transcript has a single exon, only the smaller of the
    two alignment coverages is checked against ``param_min_coverage``.

    Returns a tuple ``(is_paralog, reason)``; ``reason`` is None when the
    link is accepted and a short text otherwise.

    Raises ValueError if the aligned region is longer than the sequence
    lengths recorded in *link*.
    """

    map_a2b = alignlib.makeAlignmentVector()
    alignlib.AlignmentFormatEmissions(
        link.mQueryFrom, link.mQueryAli,
        link.mSbjctFrom, link.mSbjctAli ).copy( map_a2b )

    aligned_a = map_a2b.getRowTo() - map_a2b.getRowFrom() + 1
    aligned_b = map_a2b.getColTo() - map_a2b.getColFrom() + 1

    if link.mQueryLength < aligned_a or link.mSbjctLength < aligned_b:
        print( "ERRONEOUS LINK: %s" % str(link) )
        # a proper exception: raising a bare string is itself an error
        raise ValueError( "length discrepancy" )

    coverage_a = 100.0 * aligned_a / link.mQueryLength
    coverage_b = 100.0 * aligned_b / link.mSbjctLength

    ## check exon boundaries, look at starts, skip first exon
    def MyMap( a, x):
        """map row x to the first truthy column at or after x, 0 if none."""
        if x < a.getRowFrom(): return 0
        while x <= a.getRowTo():
            c = a.mapRowToCol( x )
            # NOTE(review): this treats 0 as "unmapped"; other code in this
            # file tests mapRowToCol() < 0 - confirm the convention here.
            if c: return c
            x += 1
        return 0

    ## exon starts are converted from nucleotide to residue positions
    mapped_boundaries    = UniquifyList( [ MyMap(map_a2b, x.mPeptideFrom // 3 + 1) for x in cds1[1:] ] )
    reference_boundaries = UniquifyList( [ x.mPeptideFrom // 3 + 1 for x in cds2[1:] ] )

    nmissed = 0
    nfound = 0
    nmin = min(len(mapped_boundaries), len(reference_boundaries))
    one_single_exon = len(cds1) == 1 or len(cds2) == 1
    if len(mapped_boundaries) < len(reference_boundaries):
        mless = mapped_boundaries
        mmore = reference_boundaries
    else:
        mmore = mapped_boundaries
        mless = reference_boundaries

    ## check if exon boundaries are ok: every boundary of the shorter
    ## list needs a partner in the longer list
    for x in mless:
        is_ok = 0
        for c in mmore:
            if abs(x-c) < param_boundaries_max_slippage:
                is_ok = 1
                break
        if is_ok:
            nfound += 1
        else:
            nmissed += 1

    ## set is_ok for dependent on exon boundaries
    ## in single exon cases, require a check of coverage
    is_ok = False
    check_coverage = False
    if one_single_exon:
        is_ok = True
        check_coverage = True
    else:
        if nmin == 1:
            is_ok = nmissed == 0
        elif nmin == 2:
            is_ok = nmissed <= 1
        elif nmin > 2:
            is_ok = nfound >= 2

    cc = min(coverage_a, coverage_b)

    if param_loglevel >= 3:
        print( " ".join( map( str, (
            "# nquery=", len(cds1), "nsbjct=", len(cds2), "nmin=", nmin, "nmissed=", nmissed, "nfound=", nfound,
            "is_ok=", is_ok, "check_cov=", check_coverage,
            "min_cov=", cc, coverage_a, coverage_b,
            "mapped=", mapped_boundaries, "reference=",reference_boundaries ) ) ) )

    if not is_ok:
        return True, "different exon boundaries"

    if check_coverage and cc < param_min_coverage:
        return True, "low coverage"

    return False, None
Ejemplo n.º 33
0
import timeit
import alignlib

# Benchmark configuration.
# NUM_SAMPLES is not referenced in the visible part of this script;
# presumably the number of timeit repetitions - TODO confirm.
NUM_SAMPLES=1000
# Number of entries in the pre-built vectors below and in the vectors
# built by the benchmark functions.
ALISIZE=2000


# Pre-built alignlib alignment shared by the benchmarks.
# NOTE(review): addDiagonal( 0, ALISIZE, 0) presumably adds a diagonal
# covering rows 0..ALISIZE with column offset 0 - confirm against alignlib.
alignlib_vector = alignlib.makeAlignmentVector()
alignlib_vector.addDiagonal( 0, ALISIZE, 0)

# Plain Python counterpart: a list holding 0 .. ALISIZE - 1.
python_vector = []
for x in xrange(ALISIZE): 
    python_vector.append(x) 

def pythonBuildVector():
    """Build a vector alignment in plain Python.

    Benchmark workload: appends the integers 0 .. ALISIZE-1 one at a time to
    a fresh list. The list is discarded and nothing is returned; the
    element-by-element append loop is the work being measured, so it is
    kept as an explicit loop on purpose.
    """
    vector = []
    for x in xrange(ALISIZE): 
        vector.append(x) 
     
def alignlibBuildVector():
    """Build a vector alignment with alignlib.

    Benchmark workload: creates a new alignlib alignment vector and fills it
    with one addDiagonal(0, ALISIZE, 0) call. The alignment is discarded and
    nothing is returned.
    """
    diagonal = alignlib.makeAlignmentVector()
    diagonal.addDiagonal(0, ALISIZE, 0)

def pythonMapVector():
    """Python baseline for the mapRowToCol speed test.

    Benchmark workload: reads python_vector at every index 0 .. ALISIZE-1.
    The value is assigned and then ignored on purpose; only the cost of the
    lookup matters. Nothing is returned.
    """
    for x in xrange(ALISIZE): 
        a = python_vector[x]
     
def alignlibMapVector():
Ejemplo n.º 34
0
        raise ValueError( "mapping error for sequence: %s" % (msg) )

    ## if there are more than five frameshifts - do exhaustive alignment
    max_gaps = 5
    num_peptide_gaps = len( re.sub("[^-]", "", p ) )
    ngaps = map_p2c.getNumGaps() - (num_peptide_gaps *  3) - abs(len(w)-len(c))
    
    if options.loglevel >= 6:
        options.stdlog.write("# alignment between wobble and cds: ngaps=%i, npeptide_gaps=%i\n" % (ngaps, num_peptide_gaps) )
        PrintPrettyAlignment( seq_wobble, seq_cds, p, map_p2c, options )

    if ngaps > max_gaps:
        if options.loglevel >= 2:
            options.stdlog.write("# too many gaps (%i>%i), realigning exhaustively.\n" % (ngaps, max_gaps ) )
            options.stdlog.flush()
        full_map_p2c = alignlib.makeAlignmentVector()
        
        AlignExhaustive( seq_wobble, seq_cds, seq_peptide, full_map_p2c, options )
        if options.loglevel >= 6:
            options.stdlog.write("# full alignment between wobble and cds:\n" )
            options.stdlog.flush()
            PrintPrettyAlignment( seq_wobble, seq_cds, p, full_map_p2c, options )

        map_p2c = full_map_p2c
        
    ## remove incomplete codons
    x = 0
    while x < len(p) * 3:
        if (map_p2c.mapRowToCol( x ) < 0 or \
            map_p2c.mapRowToCol( x+1 ) < 0 or \
            map_p2c.mapRowToCol( x+2 ) < 0 ):
Ejemplo n.º 35
0
def alignlibBuildVector():
    """Build a vector alignment with alignlib.

    Benchmark workload: creates a new alignlib alignment vector and fills it
    with one addDiagonal(0, ALISIZE, 0) call. The alignment is discarded and
    nothing is returned.
    """
    diagonal = alignlib.makeAlignmentVector()
    diagonal.addDiagonal(0, ALISIZE, 0)
Ejemplo n.º 36
0
    def Align( self, method, anchor = 0, loglevel = 1 ):
        """Align self.mSequence1 against self.mSequence2 and store the result.

        method: name of a built-in aligner ("dialign", "blastz",
            "dialignlgs", "dba", "nw" or "sw"), or a callable that is invoked
            as method(s1, s2, map_a2b) and fills map_a2b in place.
            "clustal" is recognised but currently unsupported.
        anchor: number of "A" characters padded onto both ends of both
            sequences before aligning. The padded regions are removed from
            the alignment again and the alignment is shifted back by
            -anchor in both row and column.
        loglevel: not used by this method.

        Raises NotImplementedError for method "clustal" and AlignmentError
        if the resulting alignment has length 0.

        On success the method sets self.strand, self.mMethod,
        self.mAlignment, the aligned sequence strings, the alignment
        coordinates, self.mNumGaps, self.mLength and self.mAligned, and then
        calls self.SetPercentIdentity() and self.SetBlockSizes().

        Original author's note: get rid of this and use a method class
        instead in the future.
        """

        map_a2b = alignlib.makeAlignmentVector()
        padding = "A" * anchor
        s1 = padding + self.mSequence1 + padding
        s2 = padding + self.mSequence2 + padding

        self.strand = "+"

        if method == "dialign":
            dialign = WrapperDialign.Dialign( self.mOptionsDialign )
            dialign.Align( s1, s2, map_a2b )
        elif method == "blastz":
            blastz = WrapperBlastZ.BlastZ( self.mOptionsBlastZ )
            blastz.Align( s1, s2, map_a2b )
            if blastz.isReverseComplement():
                # the hit is on the other strand: record it and replace the
                # stored second sequence with Genomics.complement() of it.
                self.strand = "-"
                self.mSequence2 = Genomics.complement( self.mSequence2 )

        elif method == "dialignlgs":
            dialignlgs = WrapperDialign.Dialign( self.mOptionsDialignLGS )
            dialignlgs.Align( s1, s2, map_a2b )
        elif method == "dba":
            dba = WrapperDBA.DBA()
            dba.Align( s1, s2, map_a2b )
        elif method == "clustal":
            # the clustal wrapper is out of date; fail explicitly instead of
            # calling it.
            raise NotImplementedError( "clustal wrapper needs to be updated")
        elif method == "nw":
            seq1 = alignlib.makeSequence( s1 )
            seq2 = alignlib.makeSequence( s2 )
            alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_GLOBAL,
                                                      gop=-12.0,
                                                      gep=-2.0 )
            alignator.align( map_a2b, seq1, seq2 )
        elif method == "sw":
            seq1 = alignlib.makeSequence( s1 )
            seq2 = alignlib.makeSequence( s2 )
            # alignator_sw and min_score_sw are not defined in this method;
            # presumably module-level settings - verify where they are set.
            alignlib.performIterativeAlignment( map_a2b, seq1, seq2, alignator_sw, min_score_sw )
        else:
            ## use callback function
            method(s1, s2, map_a2b)

        if map_a2b.getLength() == 0:
            raise AlignmentError("empty alignment")

        if anchor:
            # drop everything aligned to the padding on either side of both
            # sequences, then shift the alignment back by the padding length.
            map_a2b.removeRowRegion( anchor + len(self.mSequence1) + 1, map_a2b.getRowTo() )
            map_a2b.removeRowRegion( 1, anchor)
            map_a2b.removeColRegion( anchor + len(self.mSequence2) + 1, map_a2b.getColTo() )
            map_a2b.removeColRegion( 1, anchor)
            map_a2b.moveAlignment( -anchor, -anchor )

        f = alignlib.AlignmentFormatExplicit( map_a2b,
                                              alignlib.makeSequence( self.mSequence1),
                                              alignlib.makeSequence( self.mSequence2) )

        self.mMethod = method
        self.mAlignment = map_a2b
        self.mAlignedSequence1, self.mAlignedSequence2 = f.mRowAlignment, f.mColAlignment
        f = alignlib.AlignmentFormatEmissions( map_a2b )
        self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
        self.mAlignmentFrom1 = map_a2b.getRowFrom()
        self.mAlignmentTo1 = map_a2b.getRowTo()
        self.mAlignmentFrom2 = map_a2b.getColFrom()
        self.mAlignmentTo2 = map_a2b.getColTo()
        self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
        self.mAligned = self.mLength - self.mNumGaps

        self.SetPercentIdentity()
        self.SetBlockSizes()
Ejemplo n.º 37
0
def main():
    """Command-line entry point for inspecting ADDA neighbours and profiles.

    Parses the options, reads the configuration file, opens the indexed
    neighbours and the indexed fasta file and configures the default
    alignlib toolkit. The work done then depends on ``--method``:

    view
        write every neighbour of each nid given on the command line.
    pileup
        build the multiple alignment for the first argument and write it.
    profile
        build a profile from that multiple alignment, restrict it to a
        segment if one was given, and write it.
    align
        build profiles for the first two arguments, align them and write a
        summary line followed by the explicit alignment.

    For pileup and profile the argument is either a plain nid or a token
    containing "_" that AddaIO.toTuple() splits into (nid, start, end).
    For align both arguments are always passed to AddaIO.toTuple().
    """

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("view", "align", "pileup", "profile"),
                      help="method to perform [default=%default].")

    parser.add_option("--mode",
                      dest="mode",
                      type="choice",
                      choices=("global", "local"),
                      help="alignment mode [default=%default].")

    parser.add_option("--gop",
                      dest="gop",
                      type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep",
                      dest="gep",
                      type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    alignlib.getDefaultToolkit().setEncoder(
        alignlib.getEncoder(alignlib.Protein20))
    alignlib.getDefaultToolkit().setRegularizor(
        alignlib.makeRegularizorDirichletPrecomputed())
    alignlib.getDefaultToolkit().setLogOddor(
        alignlib.makeLogOddorDirichlet(0.3))
    alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _parseToken(token):
        """Return (nid, start, end); start and end are None for a bare nid."""
        if "_" in token:
            return AddaIO.toTuple(token)
        return int(token), None, None

    if options.method == "view":
        # look up each nid given on the command line in turn
        for arg in args:
            nid = int(arg)

            for n in index.getNeighbours(nid):
                # written to options.stdout like the output of all other
                # methods
                options.stdout.write("%s\n" % str(n))

    elif options.method == "pileup":

        # a segment given with the nid is parsed but not applied to a pileup
        nid, start, end = _parseToken(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":

        nid, start, end = _parseToken(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":

        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop,
                                                 options.gep)

        def _buildProfile(nid, start, end):
            """Return (sequence, prepared profile) for nid on start..end."""
            neighbours = index.getNeighbours(nid)
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()

        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write( "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" %\
                                  (nid1, nid2,
                                   result.getScore(),
                                   result.getLength(),
                                   result.getNumGaps(),
                                   result.getRowFrom(), result.getRowTo(),
                                   result.getColFrom(), result.getColTo()))

        # the profiles were aligned, but the explicit output is rendered
        # with the plain sequences
        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()