Example #1
0
def PrintAlignedSequences( sequence1, sequence2, chain = None, format="modeller" ):
    """Align two sequences and print the resulting pairwise alignment.

    The sequences are aligned with a full dynamic programming alignator
    created with both penalties set to zero. The textual alignment is split
    into lines; with ``format == "modeller"`` the first line is printed as
    the ``structure`` record and the second as the ``sequence`` record.
    With any other format the list of alignment lines is printed as is.

    ``chain`` is accepted for interface compatibility but is not used.
    """
    row = alignlib.makeSequence( sequence1 )
    col = alignlib.makeSequence( sequence2 )
    aligner = alignlib.makeAlignatorFullDP( -0.0, -0.0 )
    row2col = alignlib.makeAlignataVector()
    aligner.Align( row, col, row2col )

    alignment_text = alignlib.writePairAlignment( row, col, row2col )
    lines = alignment_text.split( "\n" )

    if format != "modeller":
        print( lines )
        return

    # (header line, description template, label, index of the alignment line)
    records = (
        ( ">P1;structure", "structureX: %s : %s : %s : %s : %s : : : : ", "structure", 0 ),
        ( ">P1;sequence", "sequence:%s : %s : %s : %s : %s : : : : ", "sequence", 1 ),
    )

    for header, template, label, index in records:
        # each alignment line holds three tab-separated fields
        first_res, residues, last_res = lines[index].split( "\t" )

        print( header )
        print( template % ( label, first_res, "", last_res, "" ) )
        print( "%s*" % residues )
Example #2
0
def GetAlignmentBetweenCorrespondingAtoms( coordinates1, coordinates2, cutoff ):
    """Return an alignment between atoms of two coordinate sets that lie close together.

    Every position in ``coordinates1`` is compared with every position in
    ``coordinates2``. Pairs whose Euclidean distance is at most ``cutoff``
    are stored as dots, using one-based indices and a score of 1. The dots
    are then handed to a dots-based alignator, which aligns two placeholder
    sequences of matching lengths.

    Returns the alignment built by the alignator, or ``None`` when the dot
    collection has a length of three or less.
    """
    close_pairs = alignlib.makeAlignataMatrixRow()

    # all-versus-all distance comparison
    for row, (xa, ya, za) in enumerate( coordinates1 ):
        for col, (xb, yb, zb) in enumerate( coordinates2 ):
            dx, dy, dz = xa - xb, ya - yb, za - zb
            distance = math.sqrt( dx * dx + dy * dy + dz * dz )
            if distance <= cutoff:
                close_pairs.addPairExplicit( row + 1, col + 1, 1 )

    # placeholder sequences: only their lengths carry information
    placeholder1 = alignlib.makeSequence( "A" * len(coordinates1) )
    placeholder2 = alignlib.makeSequence( "A" * len(coordinates2) )

    if close_pairs.getLength() <= 3:
        return None

    dottor = alignlib.makeAlignatorDummy( close_pairs )
    aligner = alignlib.makeAlignatorDotsSquared( 0, 0, dottor )
    atoms_a2b = alignlib.makeAlignataVector()

    aligner.Align( placeholder1, placeholder2, atoms_a2b )

    return atoms_a2b
Example #3
0
def CheckAlignments( peptide_sequences, query_token, other_tokens ):
    """Check whether the query aligns to any of the other sequences.

    Returns True as soon as a local alignment between the query and one of
    ``other_tokens`` scores above ``param_min_alignment_score``. Also
    returns True when the query has no entry in ``peptide_sequences``.
    Tokens in ``other_tokens`` without a sequence are ignored. Returns
    False if no alignment exceeds the threshold.
    """
    if param_loglevel >= 3:
        print( "# checking query %s and sbjcts %s" % (query_token, str(other_tokens)) )
        sys.stdout.flush()

    # without a query sequence nothing can be checked
    if query_token not in peptide_sequences:
        return True

    alignment = alignlib.makeAlignmentVector()
    aligner = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL,
                                            -10.0, -1.0 )
    query_seq = alignlib.makeSequence( peptide_sequences[query_token] )

    for sbjct_token in other_tokens:
        if sbjct_token in peptide_sequences:
            sbjct_seq = alignlib.makeSequence( peptide_sequences[sbjct_token] )
            aligner.align( alignment, query_seq, sbjct_seq )

            if param_loglevel >= 5:
                print( "# %s - %s = %f" % (query_token, sbjct_token, alignment.getScore()) )

            if alignment.getScore() > param_min_alignment_score:
                return True

    return False
Example #4
0
def PrintAlignedSequences(sequence1, sequence2, chain=None, format="modeller"):
    """Align two sequences and print the resulting pairwise alignment.

    The sequences are aligned with a full dynamic programming alignator
    created with both penalties set to zero. The textual alignment is split
    into lines; with ``format == "modeller"`` the first line is printed as
    the ``structure`` record and the second as the ``sequence`` record.
    With any other format the list of alignment lines is printed as is.

    ``chain`` is accepted for interface compatibility but is not used.
    """
    row = alignlib.makeSequence(sequence1)
    col = alignlib.makeSequence(sequence2)
    aligner = alignlib.makeAlignatorFullDP(-0.0, -0.0)
    row2col = alignlib.makeAlignataVector()
    aligner.Align(row, col, row2col)

    alignment_text = alignlib.writePairAlignment(row, col, row2col)
    lines = alignment_text.split("\n")

    if format != "modeller":
        print(lines)
        return

    # (header line, description template, label, index of the alignment line)
    records = (
        (">P1;structure",
         "structureX: %s : %s : %s : %s : %s : : : : ", "structure", 0),
        (">P1;sequence",
         "sequence:%s : %s : %s : %s : %s : : : : ", "sequence", 1),
    )

    for header, template, label, index in records:
        # each alignment line holds three tab-separated fields
        first_res, residues, last_res = lines[index].split("\t")

        print(header)
        print(template % (label, first_res, "", last_res, ""))
        print("%s*" % residues)
Example #5
0
def getMapPeptide2Cds( peptide_sequence, cds_sequence, options ):
    """get map between peptide sequence and cds sequence.

    The peptide is stripped of spaces and converted to a wobble sequence
    via Genomics.Protein2Wobble; the cds is stripped of spaces, dots and
    dashes. Both are then aligned with AlignCodonBased, which fills
    ``map_p2c``.

    ``options`` must provide ``loglevel`` and ``stdlog``; at a loglevel of 6
    or more the cleaned sequences are written to the log.

    Raises ValueError ("mapping error for sequence: ...") if
    AlignCodonBased raises ValueError.

    The returned alignment is in nucleotides.

    NOTE(review): no return statement is visible in this block, so as shown
    the function returns None - confirm whether the definition was cut off.
    """
    
    ## remove spaces from protein sequence
    p = re.sub(" ", "", peptide_sequence )

    ## remove gaps (dots, dashes) and spaces from cds
    c = re.sub("[ .-]", "", cds_sequence )

    ## wobble sequence derived from the upper-cased peptide
    w = Genomics.Protein2Wobble( p.upper() )

    if options.loglevel >= 6:
        options.stdlog.write( "# peptide original (%5i): %s\n" % (len(p), p) )
        options.stdlog.write( "# cds original     (%5i): %s\n" % (len(c), c) )
        options.stdlog.write( "# wobble sequence  (%5i): %s\n" % (len(w), w) )
        options.stdlog.flush()

    seq_wobble = alignlib.makeSequence( w )
    seq_cds = alignlib.makeSequence( string.upper(c) )
    seq_peptide = alignlib.makeSequence( p )

    ## alignment object that is filled by AlignCodonBased
    map_p2c = alignlib.makeAlignmentVector()

    try:
        AlignCodonBased( seq_wobble, seq_cds, seq_peptide, map_p2c, options = options )
    except ValueError, msg:
        ## re-raise with a prefix identifying the failure as a mapping error
        raise ValueError( "mapping error for sequence: %s" % (msg) )
Example #6
0
def AlignPair( pair, anchor = 0 ):
    """Align the two intron sequences stored on *pair* and update *pair* in place.

    The aligner is selected by the module-level ``param_method``
    ("dialigned", "dialignedlgs" or "dbaligned"). If ``anchor`` is
    non-zero, both sequences are padded on each side with ``anchor``
    copies of "A" before aligning; the padded regions are removed from the
    alignment afterwards and the alignment is shifted back by ``anchor``.

    On success the aligned sequences, their boundaries, the method name
    and the gap/length counts are written to *pair*.

    Returns True on success and False if the alignment is empty.

    Raises NotImplementedError for ``param_method == "clusaligned"``.
    """

    map_intron_a2b = alignlib.makeAlignmentVector()

    if param_loglevel >= 1:
        print( "# aligning %s-%i with %s-%i: lengths %i and %i" % (pair.mToken1, pair.mIntronId1,
                                                                   pair.mToken2, pair.mIntronId2,
                                                                   len(pair.mAlignedSequence1),
                                                                   len(pair.mAlignedSequence2)) )
        sys.stdout.flush()

    padding = "A" * anchor
    s1 = padding + pair.mAlignedSequence1 + padding
    s2 = padding + pair.mAlignedSequence2 + padding

    if param_method == "dialigned":
        dialign.Align( s1, s2, map_intron_a2b )
    elif param_method == "dialignedlgs":
        dialignlgs.Align( s1, s2, map_intron_a2b )
    elif param_method == "dbaligned":
        dba.Align( s1, s2, map_intron_a2b )
    elif param_method == "clusaligned":
        # the call to the clustal wrapper that used to follow this raise
        # was unreachable and has been removed
        raise NotImplementedError("clustalw wrapper not up-to-date")

    if anchor:
        # strip the padded regions and move the alignment back to
        # coordinates of the unpadded sequences
        map_intron_a2b.removeRowRegion( anchor + len(pair.mAlignedSequence1) + 1, map_intron_a2b.getRowTo() )
        map_intron_a2b.removeRowRegion( 1, anchor )
        map_intron_a2b.removeColRegion( anchor + len(pair.mAlignedSequence2) + 1, map_intron_a2b.getColTo() )
        map_intron_a2b.removeColRegion( 1, anchor )
        map_intron_a2b.moveAlignment( -anchor, -anchor )

    if map_intron_a2b.getLength() == 0:
        if param_loglevel >= 1:
            print( "# Error: empty intron alignment" )
        return False

    seq1 = alignlib.makeSequence( pair.mAlignedSequence1 )
    seq2 = alignlib.makeSequence( pair.mAlignedSequence2 )

    data = alignlib.AlignmentFormatExplicit( map_intron_a2b, seq1, seq2 )

    pair.mFrom1, pair.mAlignedSequence1, pair.mTo1 = data.mRowFrom, data.mRowAlignment, data.mRowTo
    pair.mFrom2, pair.mAlignedSequence2, pair.mTo2 = data.mColFrom, data.mColAlignment, data.mColTo
    pair.mMethod = param_method

    pair.mNumGaps, pair.mLength = map_intron_a2b.getNumGaps(), map_intron_a2b.getLength()
    pair.mAligned = pair.mLength - pair.mNumGaps

    if param_loglevel >= 2:
        print( "# alignment success %s %s" % (pair.mAlignedSequence1, pair.mAlignedSequence2) )

    return True
Example #7
0
    def applyMethod(self, neighbours):
        """Realign the query against each match and compare with the reference.

        For every match in ``neighbours.mMatches`` (skipping the query
        itself) the matched segments of query and sbjct are realigned with
        a global dynamic programming alignator. The realignment is compared
        with the reference alignment returned by ``n.getAlignment()``.

        One tab-separated line per match is written to ``self.mOutfile``:
        query token, sbjct token, identity value, number of aligned
        positions in the reference and in the realignment.
        ``self.mNIdentical`` is incremented when the identity value equals
        the reference count, ``self.mNDifferent`` otherwise.

        Raises ValueError if a match has no reference alignment.
        """
        # build multiple alignment
        # (not used any further within this method)
        mali = alignlib.makeMultipleAlignment()

        query_nid = neighbours.mQueryToken

        sequence = self.mFasta.getSequence(query_nid)

        mali.add(alignlib.makeAlignatum(sequence))

        qseq = alignlib.makeSequence(sequence)
        alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_GLOBAL,
                                                 -10.0, -1.0, True, True, True,
                                                 True)

        for n in neighbours.mMatches:

            if n.mSbjctToken == query_nid:
                continue
            sequence = self.mFasta.getSequence(n.mSbjctToken)

            blast_query2sbjct = n.getAlignment()

            if blast_query2sbjct is None:
                raise ValueError(
                    "AddaRealignment.py needs a reference alignment.")

            sseq = alignlib.makeSequence(sequence)
            # restrict both sequences to the segments covered by the match
            qseq.useSegment(n.mQueryFrom, n.mQueryTo)
            sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)
            realign_query2sbjct = alignlib.makeAlignmentVector()
            alignator.align(realign_query2sbjct, qseq, sseq)

            nidentical = alignlib.getAlignmentIdentity(realign_query2sbjct,
                                                       blast_query2sbjct,
                                                       alignlib.RR)
            nblast = blast_query2sbjct.getNumAligned()
            nrealigned = realign_query2sbjct.getNumAligned()

            self.mOutfile.write("%s\t%s\t%i\t%i\t%i\n" %
                                (n.mQueryToken, n.mSbjctToken,
                                 nidentical, nblast, nrealigned))

            if nidentical == nblast:
                self.mNIdentical += 1
            else:
                self.mNDifferent += 1
Example #8
0
 def _buildProfile(nid, start, end):
     """Return the sequence of *nid* together with its prepared profile.

     The profile is built from the multiple alignment of *nid* with its
     neighbours and restricted to the segment *start* - *end*.
     """
     hits = index.getNeighbours(nid)
     profile = alignlib.makeProfile(align.buildMali(nid, hits))
     E.info("nid: %i, neighours=%i" % (nid, len(hits)))
     profile.useSegment(start, end)
     profile.prepare()
     return alignlib.makeSequence(fasta.getSequence(nid)), profile
Example #9
0
 def _buildProfile(nid, start, end):
     """Return the sequence of *nid* together with its prepared profile.

     The profile is built from the multiple alignment of *nid* with its
     neighbours and restricted to the segment *start* - *end*.
     """
     hits = index.getNeighbours(nid)
     profile = alignlib.makeProfile(align.buildMali(nid, hits))
     E.info("nid: %i, neighours=%i" % (nid, len(hits)))
     profile.useSegment(start, end)
     profile.prepare()
     return alignlib.makeSequence(fasta.getSequence(nid)), profile
Example #10
0
    def applyMethod(self, neighbours ):
        """Realign the query against each match and compare with the reference.

        For every match in ``neighbours.mMatches`` (skipping the query
        itself) the matched segments of query and sbjct are realigned with
        a global dynamic programming alignator. The realignment is compared
        with the reference alignment returned by ``n.getAlignment()``.

        One tab-separated line per match is written to ``self.mOutfile``:
        query token, sbjct token, identity value, number of aligned
        positions in the reference and in the realignment.
        ``self.mNIdentical`` is incremented when the identity value equals
        the reference count, ``self.mNDifferent`` otherwise.

        Raises ValueError if a match has no reference alignment.
        """
        # build multiple alignment
        # (not used any further within this method)
        mali = alignlib.makeMultipleAlignment()

        query_nid = neighbours.mQueryToken

        sequence = self.mFasta.getSequence( query_nid )

        mali.add( alignlib.makeAlignatum( sequence ) )

        qseq = alignlib.makeSequence( sequence )
        alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_GLOBAL,
                                                  -10.0, -1.0, True, True, True, True)

        for n in neighbours.mMatches:

            if n.mSbjctToken == query_nid:
                continue
            sequence = self.mFasta.getSequence( n.mSbjctToken )

            blast_query2sbjct = n.getAlignment()

            if blast_query2sbjct is None:
                raise ValueError( "AddaRealignment.py needs a reference alignment.")

            sseq = alignlib.makeSequence( sequence )
            # restrict both sequences to the segments covered by the match
            qseq.useSegment( n.mQueryFrom, n.mQueryTo )
            sseq.useSegment( n.mSbjctFrom, n.mSbjctTo )
            realign_query2sbjct = alignlib.makeAlignmentVector()
            alignator.align( realign_query2sbjct, qseq, sseq )

            nidentical = alignlib.getAlignmentIdentity( realign_query2sbjct, blast_query2sbjct, alignlib.RR )
            nblast = blast_query2sbjct.getNumAligned()
            nrealigned = realign_query2sbjct.getNumAligned()

            self.mOutfile.write( "%s\t%s\t%i\t%i\t%i\n" % \
                                     (n.mQueryToken, n.mSbjctToken, nidentical, nblast, nrealigned ) )

            if nidentical == nblast:
                self.mNIdentical += 1
            else:
                self.mNDifferent += 1
Example #11
0
    def CreateAlignandumObjects( self, sources ):

        tbl_nrdb = Table_nrdb( self.dbhandle )

        alignanda = []
        
        for id, nid, nid_from, nid_to in sources:
            
            if self.mLogLevel >= 2:
                print id,
                sys.stdout.flush()
                
            sequence = tbl_nrdb.Get_Sequence_From_NID( nid )
            alignandum = alignlib.makeSequence( sequence[nid_from-1:nid_to] )
            alignanda.append( (id, alignandum) )

        if self.mLogLevel >= 2:
            print
            
        return alignanda
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/prediction2pairs.py 2031 2008-07-15 09:19:05Z andreas $", usage = globals()["__doc__"])

    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genomic data (indexed)." )

    parser.add_option( "-c", "--cds", dest="filename_cds", type="string",
                       help="filename with cds seguences." )

    parser.add_option( "-f", "--format", dest="format", type="choice",
                       choices=("paired_fasta", ),
                       help="output format, valid options are: paired_fasta: concatenated pairwise alignments in FASTA format" )

    parser.set_defaults( 
        genome_file = "genome",
        filename_cds = "cds.fasta",
        format = "paired_fasta",
        filename_suffix = ".fasta",
        filename_prefix = "",
        )

    (options, args) = E.Start( parser, add_psql_options = True )    

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(1)

    fasta = IndexedFasta.IndexedFasta( options.genome_file )

    ## reading CDS sequences
    if options.filename_cds:
        cds_sequences = Genomics.ReadPeptideSequences( open(options.filename_cds, "r") )
    else:
        cds_sequences = {}
    
    if options.loglevel >= 1:
        options.stdlog.write( "# read %i CDS sequences\n" % len(cds_sequences) )

    last_filename_genome = None

    p = PredictionParser.PredictionParserEntry()    
    
    ninput, noutput, nsanity, n3, nlength = 0, 0, 0, 0, 0

    for line in options.stdin:
        
        if line[0] == "#": continue
        if line[0] == '"': continue
        
        p.Read(line)

        ninput += 1

        genomic_fragment = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand,
                                              p.mSbjctGenomeFrom, p.mSbjctGenomeTo )

        if len(genomic_fragment) == 0:
            raise "ERROR: empty fragment %s:%s for line" % (p.mSbjctGenomeFrom, p.mSbjctGenomeTo), line
        
        try:
            cds_fragment = cds_sequences[p.mQueryToken]
        except KeyError:
            options.stdlog.write( "# ERROR: cds not found: query %s.\n" % p.mQueryToken )
            continue

        map_query2sbjct, genomic_fragment = Genomics.Alignment2CDNA( p.mMapPeptide2Genome,
                                                                     query_from = p.mQueryFrom,
                                                                     sbjct_from = 0,
                                                                     genome = genomic_fragment )

        ## check for errors:
        if map_query2sbjct.getRowTo() != p.mQueryTo * 3:
            options.stdlog.write( "# ERROR: boundary shift in query at line %s\n# %i %i\n" % (line, map_query2sbjct.getRowTo(), p.mQueryTo * 3 ) )

        if map_query2sbjct.getColTo() > len(genomic_fragment):
            options.stdlog.write(  "# ERROR: length mismatch in line %s\n# genomic fragment (%i) shorter than last aligned residue (%i)\n" %\
            (line, len(genomic_fragment), map_query2sbjct.getColTo()) )
            options.stdlog.write(  "# cds     %s\n# genomic %s\n" % (str( cds_fragment ), genomic_fragment ))
            nlength += 1
            continue
        
        if map_query2sbjct.getRowTo() > len(cds_fragment):
            options.stdlog.write(  "# ERROR: length mismatch in line %s\n# cds fragment (%i) shorter than last aligned residue (%i)\n" %\
            (line, len(cds_fragment), map_query2sbjct.getRowTo()) )
            options.stdlog.write(  "# cds     %s\n# genomic %s\n" % (str( cds_fragment ), genomic_fragment ))
            nlength += 1
            continue

        cds_seq = alignlib.makeSequence( cds_fragment )
        genomic_seq = alignlib.makeSequence( genomic_fragment )
        
        f = alignlib.AlignmentFormatExplicit( map_query2sbjct, cds_seq, genomic_seq )
        row_ali = f.mRowAlignment
        col_ali = f.mColAlignment
        
        row_ali, col_ali = Genomics.RemoveFrameShiftsFromAlignment(row_ali, col_ali)
        
        row_ali = Genomics.MaskStopCodons( row_ali )
        col_ali = Genomics.MaskStopCodons( col_ali )        

        if len(row_ali) != len(col_ali):
            options.stdlog.write(  "# ERROR: wrong alignment lengths.\n" )
            sys.exit(1)
            
        if len(row_ali) % 3 or len(col_ali) % 3:
            options.stdlog.write( "# ERROR: sequences are not a multiple of 3 in line: %s\n" % line )
            options.stdlog.write( "# %6i %s\n# %6i %s\n" % (len(row_ali), str(row_ali), len(col_ali), str(col_ali) ) )
            n3 += 1

        input = re.sub( "[-X]", "", p.mTranslation )
        ref = re.sub( "[-X]", "", Genomics.TranslateDNA2Protein( col_ali ) )
        if input != ref:
            if options.loglevel >= 1:
                options.stdlog.write("# sanity check failed for %s - %s\n# %6i %s\n# %6i %s\n" % (p.mPredictionId, p.mQueryToken, 
                                                                                                  len(input), input, 
                                                                                                  len(ref), ref ) )
            nsanity += 1
            continue
        
        options.stdout.write(  ">%s\n%s\n" % (p.mPredictionId, row_ali) )
        options.stdout.write(  ">%s_vs_%s_%s_%i_%i\n%s\n" % \
              (p.mQueryToken, p.mSbjctToken, p.mSbjctStrand, p.mSbjctGenomeFrom, p.mSbjctGenomeTo, col_ali) ) 
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nsanity=%i, nlength=%i, n3=%i\n" % (ninput, noutput, nsanity, nlength, n3) )
                                  
    E.Stop()
Example #13
0
    def Align( self, method, anchor = 0, loglevel = 1 ):
        """align a pair of sequences.
        get rid of this and use a method class instead in the future

        ``method`` is either one of the names "dialign", "blastz",
        "dialignlgs", "dba", "nw" or "sw", or a callable that is invoked as
        ``method(s1, s2, map_a2b)`` and has to fill the alignment.

        If ``anchor`` is non-zero, both sequences are padded on each side
        with ``anchor`` copies of "A" before aligning; the padded regions
        are removed again afterwards. ``loglevel`` is accepted but not used
        in this method.

        For "blastz", ``self.strand`` is set to "-" and ``self.mSequence2``
        is replaced by ``Genomics.complement( self.mSequence2 )`` when the
        wrapper reports a reverse-complement hit.

        The alignment and derived values (aligned sequences, boundaries,
        number of gaps, length) are stored as attributes on ``self``.

        Raises AlignmentError if the alignment is empty and
        NotImplementedError for the method "clustal".
        """
        
        map_a2b = alignlib.makeAlignmentVector()
        padding = "A" * anchor
        s1 = padding + self.mSequence1 + padding
        s2 = padding + self.mSequence2 + padding

        self.strand = "+"

        if method == "dialign":
            dialign = WrapperDialign.Dialign( self.mOptionsDialign )
            dialign.Align( s1, s2, map_a2b )
        elif method == "blastz":
            blastz = WrapperBlastZ.BlastZ( self.mOptionsBlastZ )
            blastz.Align( s1, s2, map_a2b )
            if blastz.isReverseComplement():
                self.strand = "-"
                self.mSequence2 = Genomics.complement( self.mSequence2 )

        elif method == "dialignlgs":
            dialignlgs = WrapperDialign.Dialign( self.mOptionsDialignLGS )
            dialignlgs.Align( s1, s2, map_a2b ) 
        elif method == "dba":
            dba = WrapperDBA.DBA()
            dba.Align( s1, s2, map_a2b )
        elif method == "clustal":
            ## the calls to the clustal wrapper that used to follow this
            ## raise were unreachable and have been removed
            raise NotImplementedError( "clustal wrapper needs to be updated")
        elif method == "nw":
            seq1 = alignlib.makeSequence( s1 )
            seq2 = alignlib.makeSequence( s2 )
            alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_GLOBAL,
                                                      gop=-12.0,
                                                      gep=-2.0 )
            alignator.align( map_a2b, seq1, seq2 )
        elif method == "sw":                        
            seq1 = alignlib.makeSequence( s1 )
            seq2 = alignlib.makeSequence( s2 )
            ## alignator_sw and min_score_sw are not defined in this method
            alignlib.performIterativeAlignment( map_a2b, seq1, seq2, alignator_sw, min_score_sw )
        else:
            ## use callback function
            method(s1, s2, map_a2b)

        if map_a2b.getLength() == 0:
            raise AlignmentError("empty alignment")

        if anchor:
            ## strip the padded regions and move the alignment back to
            ## coordinates of the unpadded sequences
            map_a2b.removeRowRegion( anchor + len(self.mSequence1) + 1, map_a2b.getRowTo() )
            map_a2b.removeRowRegion( 1, anchor)        
            map_a2b.removeColRegion( anchor + len(self.mSequence2) + 1, map_a2b.getColTo() )        
            map_a2b.removeColRegion( 1, anchor)
            map_a2b.moveAlignment( -anchor, -anchor )

        f = alignlib.AlignmentFormatExplicit( map_a2b, 
                                              alignlib.makeSequence( self.mSequence1),
                                              alignlib.makeSequence( self.mSequence2) )

        self.mMethod = method
        self.mAlignment = map_a2b
        self.mAlignedSequence1, self.mAlignedSequence2 = f.mRowAlignment, f.mColAlignment
        f = alignlib.AlignmentFormatEmissions( map_a2b )
        self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
        self.mAlignmentFrom1 = map_a2b.getRowFrom()
        self.mAlignmentTo1 = map_a2b.getRowTo()        
        self.mAlignmentFrom2 = map_a2b.getColFrom()
        self.mAlignmentTo2 = map_a2b.getColTo()        
        self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
        self.mAligned = self.mLength - self.mNumGaps

        self.SetPercentIdentity()
        self.SetBlockSizes()
Example #14
0
                t = target_exons[tt]
                if r.mGenomeTo < t.mGenomeFrom:
                    rr += 1
                    continue
                elif t.mGenomeTo < r.mGenomeFrom:
                    tt += 1
                    continue
                overlap += ( min(r.mGenomeTo, t.mGenomeTo) - max(r.mGenomeFrom, t.mGenomeFrom))
                rr += 1
                tt += 1
                
            if overlap == 0:
                continue
            
            map_reference2target.clear()
            row = alignlib.makeSequence(reference.mTranslation)
            col = alignlib.makeSequence(target.mTranslation)
            alignator.align( map_reference2target, row, col )

            f = alignlib.AlignmentFormatEmissions( map_reference2target )
            row_ali, col_ali = f.mRowAlignment, f.mColAlignment
            pidentity = 100.0 * alignlib.calculatePercentIdentity( map_reference2target, row, col )
            psimilarity = 100.0 * alignlib.calculatePercentSimilarity( map_reference2target )        

            union = max( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \
                    min( reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom )
            inter = min( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \
                    max( reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom )
            
            assignment_id += 1
            
Example #15
0
def AlignCodonBased( seq_wobble, seq_cds, seq_peptide, map_p2c, options,
                     diag_width = 2, max_advance = 2 ):
    """advance in codons in seq_wobble and match to nucleotides in seq_cds.

    Due to alignlib this is all in one-based coordinates.
    Takes care of frameshifts.

    NOTE(review): x and y below start at 0 - confirm the coordinate
    convention stated above.

    ``map_p2c`` is cleared and then filled with the aligned (wobble, cds)
    position pairs. ``seq_peptide`` is only used for log output.
    ``options`` has to provide ``loglevel`` and ``stdlog``. ``diag_width``
    and ``max_advance`` are not used in this function.

    Positions are walked in parallel as long as the substitution matrix
    returns a positive score. On a non-positive score a window around the
    current position is re-aligned with AlignExhaustive and the affected
    region of ``map_p2c`` is replaced by the result.

    Raises ValueError if the same re-alignment window is visited twice in a
    row ("infinite loop detected"), if a re-aligned pair has a negative
    score, if the re-alignment does not reach the problematic residue
    (unless at most four wobble positions are left), or if the residues at
    the current position still mismatch after re-alignment.
    """
    
    map_p2c.clear()

    # gop and gep are not used further in this function
    gop, gep = -1.0, -1.0
    matrix = alignlib.makeSubstitutionMatrixBackTranslation( 1, -10, 1, alignlib.getDefaultEncoder() )

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()
    
    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    # x: current position in the wobble sequence, y: current position in the cds
    y = 0
    x = 0

    # start coordinates of the previous re-alignment window, used to
    # detect that no progress is being made
    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue( x )
        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X": 
            x += 1
            continue

        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N": 
            x += 1
            continue

        # skip over gaps in wobble 
        if seq_wobble.asChar(x) == "-": 
            x += 1
            continue

        # score of the wobble residue against the current cds residue
        s = matrix.getValue( xr, seq_cds.asResidue(y) )

        if options.loglevel >= 6:
            if (x % 3 == 0):
                c = seq_cds.asChar(y) + seq_cds.asChar(y+1) + seq_cds.asChar(y+2)
                options.stdlog.write( "# c=%s, x=%i, y=%i, aa=%s target=%s\n" % (c, x, y,
                                                                                 Genomics.MapCodon2AA( c ),
                                                                                 pep_seq[int(x/3)]) )
                                      
            options.stdlog.write( "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" % \
                                      (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), str(s) ))
            
        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib.makeAlignmentVector()

            ## backtrack to previous three codons and align
            ## three codons for double frameshifts that span two codons and
            ## produce two X's and six WWWWWW.

            ## number of nucleotides to extend (should be multiple of 3)
            ## less than 12 caused failure for some peptides.
            d = 15
            
            # extend by amount dx
            dx = (x % 3) + d
            
            x_start = max(0, x - dx )
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0, map_p2c.mapRowToCol( x_start, alignlib.RIGHT ))

            # the same window as in the previous re-alignment means that
            # no progress was made
            if (x_start, y_start) == last_start:
                raise ValueError( "infinite loop detected" )

            last_start = (x_start, y_start)

            # window end, clipped to the sequence lengths
            x_end = min(x_start + 2 * d, len(wobble_seq) )
            y_end = min(y_start + 2 * d, len(cds_seq) )

            wobble_fragment = alignlib.makeSequence(wobble_seq[x_start:x_end])
            cds_fragment = alignlib.makeSequence(cds_seq[y_start:y_end])
            
            AlignExhaustive( wobble_fragment, cds_fragment, "", tmp_map_p2c, options )

            if options.loglevel >= 10:
                 options.stdlog.write("# fragmented alignment from %i-%i, %i-%i:\n%s\n" % (x_start, x_end,
                                                                                           y_start, y_end,
                                                                                           str(alignlib.AlignmentFormatExplicit( tmp_map_p2c,
                                                                                                                                 wobble_fragment, 
                                                                                                                                 cds_fragment ))))
                 
                 options.stdlog.flush()

            ## clear alignment
            map_p2c.removeRowRegion( x_start, x_end )
            # number of consecutive unaligned rows in the fragment alignment
            ngap = 0
            last_x, last_y = None, None
            # copy the fragment alignment into map_p2c, translating the
            # fragment coordinates by the window start
            for xxx in range( tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo() ):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue( seq_wobble.asResidue(x), seq_cds.asResidue(y) )
                    if s < 0:
                        raise ValueError("mismatched residue wobble: %i (%s), cds: %i (%s)" % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))
                    
                    map_p2c.addPair( x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write( "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), s ))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat special case of double frameshifts. They might cause a peptide/wobble residue
                # to be eliminated and thus the translated sequences will differ.
                # simply delete the last residue between x and y and move to next codon.
                if ngap == 3:
                    map_p2c.removeRowRegion( last_x, last_x + 1 )

                    last_x += 1
                    map_p2c.addPair( last_x, last_y )
                    if options.loglevel >= 6:
                        options.stdlog.write( "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (last_x, seq_wobble.asChar(last_x), last_y, seq_cds.asChar(last_y), xr, seq_cds.asResidue(last_y), s ))
                        options.stdlog.flush()                    
                    ngap = 0
                    
            ## exit condition if alignment is shorter than problematic residue
            ## need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    ## only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")
                    
            # this value is overwritten by the assignment that follows the if block
            s = 0
            
        # re-score at the (possibly updated) positions; x, y and xr may have
        # been changed by the re-alignment above
        s = matrix.getValue( xr, seq_cds.asResidue(y) )

        if s < 0:
            raise ValueError("mis-matching residues.")
        
        map_p2c.addPair( x, y, float(s) )
        
        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert( map_p2c.getRowTo() <= seq_wobble.getLength() )
    assert( map_p2c.getColTo() <= seq_cds.getLength() )
Example #16
0
def FilterConflicts( old_predictions, new_predictions, removed_predictions,
                     min_overlap, peptide_sequences):
    """remove conflicts.

    Remove overlapping entries between different queries.

    Predictions are sorted by genomic location and each prediction is
    compared to the most recently kept one. Two predictions conflict if
    they overlap on the genome by at least ``param_max_percent_overlap``
    percent of their union, belong to different (or unknown) query genes
    and - if *peptide_sequences* is given - their query peptides align
    with a score of at least ``param_min_score_overlap``.

    For a conflict, the prediction with both the higher score and the
    higher query coverage is kept. Relative differences smaller than
    ``param_conflicts_min_difference`` count as ties. If score and
    coverage disagree, both predictions are kept.

    old_predictions
        predictions to filter; sorted in place.
    new_predictions
        container to which kept predictions are appended.
    removed_predictions
        container to which discarded predictions are appended.
    min_overlap
        not used by this function; all thresholds are read from the
        module-level ``param_*`` settings.
    peptide_sequences
        mapping of query token to peptide sequence. If false, every
        sufficiently overlapping pair is treated as a conflict without
        aligning the queries.

    Returns the number of predictions appended to *new_predictions*.
    """
    ##################################################################################################
    ## sort predictions by genomic region
    if isinstance( old_predictions, PredictionFile.PredictionFile):
        old_predictions.sort( ('mSbjctToken', 'mSbjctStrand', 'mSbjctGenomeFrom', 'mSbjctGenomeTo' ) )
    else:
        # a key function yields the same (stable) order as the former
        # cmp-based sort and is also accepted by python 3.
        old_predictions.sort( key = lambda p: (p.mSbjctToken, p.mSbjctStrand,
                                               p.mSbjctGenomeFrom, p.mSbjctGenomeTo) )

    ##################################################################################################
    ## filter predictions and resolve conflicts based on genomic overlap
    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, param_gop, param_gep )
    result = alignlib.makeAlignmentVector()
    # cache of alignment scores, keyed by the ordered pair of query tokens
    alignments = {}
    noverlaps = 0
    nredundants = 0

    nnew = 0
    last_prediction = None
    last_query_gene = None

    for this_prediction in old_predictions:
        # the gene is the third of four whitespace separated fields in the
        # query token; for any other layout the gene is unknown.
        fields = re.split( r"\s+", this_prediction.mQueryToken )
        if len(fields) == 4:
            this_query_gene = fields[2]
        else:
            this_query_gene = None

        if last_prediction is None:
            last_prediction = this_prediction
            last_query_gene = this_query_gene
            continue

        overlap = min(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
                  max(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)
        union   = max(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
                  min(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)

        is_overlap = False

        # resolve overlap between different genes
        if overlap > 0 and \
               (last_query_gene != this_query_gene or last_query_gene is None):

            noverlaps += 1
            relative_overlap = 100 * overlap / union

            # Start conflict resolution, if overlap is above threshold.
            if relative_overlap >= param_max_percent_overlap:

                if peptide_sequences:
                    # check if queries are homologous; align each pair only once
                    key = "%s-%s" % tuple( sorted( (last_prediction.mQueryToken,
                                                    this_prediction.mQueryToken) ) )

                    if key not in alignments:
                        result.clear()
                        alignator.align( result,
                                         alignlib.makeSequence( peptide_sequences[this_prediction.mQueryToken]),
                                         alignlib.makeSequence( peptide_sequences[last_prediction.mQueryToken]) )
                        score = result.getScore()
                        alignments[key] = score
                        if score >= param_min_score_overlap:
                            nredundants += 1

                    is_overlap = alignments[key] >= param_min_score_overlap
                else:
                    is_overlap = True

        if is_overlap:
            # take best prediction. If difference is very small, set
            # difference to 0 (difference does not matter). In this case,
            # the first prediction is taken.
            # A reference value of 0 cannot be used for a relative
            # difference; the absolute difference is then kept as it is.
            d1 = last_prediction.mQueryCoverage - this_prediction.mQueryCoverage
            reference_coverage = float(last_prediction.mQueryCoverage)
            if reference_coverage != 0 and \
                   float(abs(d1)) / reference_coverage < param_conflicts_min_difference:
                d1 = 0

            d2 = last_prediction.score - this_prediction.score
            reference_score = float(this_prediction.score)
            if reference_score != 0 and \
                   float(abs(d2)) / reference_score < param_conflicts_min_difference:
                d2 = 0

            if d1 >= 0 and d2 >= 0:
                # previous prediction is at least as good: drop the current one
                if param_loglevel >= 2:
                    print( "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % \
                           (last_prediction.mPredictionId,
                            last_prediction.mQueryToken,
                            last_prediction.mSbjctGenomeFrom,
                            overlap, relative_overlap,
                            str(this_prediction)) )
                if param_benchmarks:
                    if CheckBenchmark( this_prediction, last_prediction ):
                        print( "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % \
                               ( overlap, relative_overlap, str(last_prediction)) )

                removed_predictions.append( this_prediction )
                continue

            elif d1 <= 0 and d2 <= 0:
                # current prediction is better: it replaces the previous one
                if param_loglevel >= 2:
                    print( "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % \
                           (this_prediction.mPredictionId,
                            this_prediction.mQueryToken,
                            this_prediction.mSbjctGenomeFrom,
                            overlap, relative_overlap,
                            str(last_prediction)) )
                if param_benchmarks:
                    if CheckBenchmark( last_prediction, this_prediction ):
                        print( "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % \
                               ( overlap, relative_overlap, str(this_prediction)) )

                removed_predictions.append( last_prediction )
                last_prediction = this_prediction
                last_query_gene = this_query_gene
                continue

            else:
                # score and coverage disagree: fall through and keep both
                if param_loglevel >= 2:
                    print( "# CONFLICT: non-correlated score/coverage. Keeping both %i(%s-%i) (%5.2f/%i/%i) and %i(%s-%i) (%5.2f/%i/%i)" % \
                           (this_prediction.mPredictionId,
                            this_prediction.mQueryToken, this_prediction.mSbjctGenomeFrom,
                            this_prediction.score, this_prediction.mQueryCoverage,
                            this_prediction.mPercentIdentity,
                            last_prediction.mPredictionId,
                            last_prediction.mQueryToken, last_prediction.mSbjctGenomeFrom,
                            last_prediction.score, last_prediction.mQueryCoverage,
                            last_prediction.mPercentIdentity) )

        new_predictions.append(last_prediction)
        nnew += 1
        last_query_gene = this_query_gene
        last_prediction = this_prediction

    # the last kept prediction is still pending; with empty input there is none.
    if last_prediction is not None:
        new_predictions.append(last_prediction)
        nnew += 1

    if param_loglevel >= 1:
        print( "# calculated %i alignments for %i potential conflicts (%i above threshold)" % \
               (len(alignments), noverlaps, nredundants) )

    return nnew
Example #17
0
def getAlignmentFull( m, q, t, options ):
    """return the aligned strings of query and target.

    The alignment *m* between query sequence *q* (row) and target
    sequence *t* (col) is rendered with alignlib.AlignmentFormatExplicit.
    The row and the column alignment strings are returned as a tuple.
    *options* is not used.
    """
    row_sequence = alignlib.makeSequence(q)
    col_sequence = alignlib.makeSequence(t)
    formatted = alignlib.AlignmentFormatExplicit( m, row_sequence, col_sequence )
    return formatted.mRowAlignment, formatted.mColAlignment
Example #18
0
    def buildMali(self, query_nid, neighbours ):
        """assemble a multiple alignment of a query and its neighbours.

        The query sequence is the first entry. At most
        ``self.mMaxNumNeighbours`` neighbours are considered. A neighbour
        is ignored if it is the query itself, and it is counted as skipped
        if its e-value exceeds ``self.mMaxEvalue``, if its alignment is
        empty or extends beyond either sequence, or if adding it raises a
        RuntimeError. Neighbours without a stored alignment are re-aligned
        locally on the segments given by their coordinates.

        NOTE(review): nothing is returned here; the assembled alignment
        and the skip counter stay local - confirm whether a return
        statement is missing.
        """
        mali = alignlib.makeMultipleAlignment()

        query_sequence = self.mFasta.getSequence( query_nid )
        query_length = len(query_sequence)
        mali.add( alignlib.makeAlignatum( query_sequence ) )

        qseq = alignlib.makeSequence( query_sequence )
        alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10, -2 )

        nskipped = 0

        for n in neighbours[:self.mMaxNumNeighbours]:

            # the query itself is silently ignored
            if n.mSbjctToken == query_nid:
                continue

            if n.mEvalue > self.mMaxEvalue:
                nskipped += 1
                continue

            sequence = self.mFasta.getSequence( n.mSbjctToken )

            E.debug( "adding %s" % str(n) )

            map_query2sbjct = n.getAlignment()

            if map_query2sbjct == None:
                # no alignment stored: align the matching segments
                sseq = alignlib.makeSequence( sequence )
                qseq.useSegment( n.mQueryFrom, n.mQueryTo )
                sseq.useSegment( n.mSbjctFrom, n.mSbjctTo )
                map_query2sbjct = alignlib.makeAlignmentVector()
                alignator.align( map_query2sbjct, qseq, sseq )

            # collect the first reason, if any, for rejecting the alignment
            problem = None
            if map_query2sbjct.getLength() == 0:
                problem = "empty alignment: %s" % str( n )
            elif map_query2sbjct.getRowTo() > query_length:
                problem = "alignment out of bounds for query: %i>%i, line=%s" %\
                    (map_query2sbjct.getRowTo(), query_length, str(n))
            elif map_query2sbjct.getColTo() > len(sequence):
                problem = "alignment out of bounds for sbjct: %i>%i, line=%s" %\
                    (map_query2sbjct.getColTo(), len(sequence), str(n))

            if problem is not None:
                self.warn( problem )
                nskipped += 1
                continue

            try:
                mali.add( alignlib.makeAlignatum( sequence ),
                          map_query2sbjct,
                          mali_is_in_row = True,
                          insert_gaps_mali = False,
                          insert_gaps_alignatum = True,
                          use_end_mali = True,
                          use_end_alignatum = False )
            except RuntimeError as msg:
                self.warn( "problem when building alignment for %s: msg=%s" % (str(n), msg))
                nskipped += 1
                continue
Example #19
0
def buildMapPdb2Sequence( sequence, filename_pdb, options, pdb_chain = ""):
    """map residue numbers of a pdb file onto positions of a sequence.

    The first peptide chain whose id equals *pdb_chain* is translated
    into a one-letter sequence and aligned to *sequence*.

    Return value:

    * ``(None, None)`` if *filename_pdb* does not exist.
    * nothing (``None``) if no chain matches *pdb_chain*.
    * otherwise a tuple of seven items:

      map_structure2seq: alignment between the structure sequence (row)
          and *sequence* (col). This is the mapping that works if the
          structure is "renumbered".
      map_pdb2seq, map_seq2pdb: dictionaries between residue numbers as
          given in the pdb file (as strings) and the aligned position
          in *sequence*.
      number of residues in the chain minus one.
      residue number (as string) of the first residue of the chain.
      residue number (as string) of the last residue of the chain.
      the sequence of the structure.
    """

    if not os.path.exists( filename_pdb ):
        return None, None

    pdb_structure = Scientific.IO.PDB.Structure( filename_pdb )

    map_pdb2seq, map_seq2pdb = {}, {}

    for chain in pdb_structure.peptide_chains:

        if chain.chain_id != pdb_chain:
            continue

        map_structure2seq = alignlib.makeAlignataVector()
        alignator = alignlib.makeFullDP( -10.0, -2.0 )

        ## one-letter sequence of the chain in the pdb file
        structure_sequence = "".join( [ AMINOACIDS[code] for code in chain.sequence() ] )

        ## align sequence of pdb file (row) to reference sequence (col)
        row = alignlib.makeSequence( structure_sequence )
        col = alignlib.makeSequence( sequence )
        alignator.Align(row, col, map_structure2seq)

        verbose = options.loglevel >= 3
        if verbose:
            log = options.stdlog
            log.write( "structure: %s\n" % structure_sequence )
            log.write( "sequence : %s\n" % sequence )
            log.write( "alignment of structure to sequence:\n" )
            log.write( alignlib.writePairAlignment( row, col, map_structure2seq ) + "\n" )

        ## walk along the chain; positions in the alignment start at 1
        residue_number = 0
        for residue_number, residue in enumerate( chain.residues, 1 ):

            mapped_residue = map_structure2seq.mapRowToCol(residue_number)

            if mapped_residue:
                pdb_number = str(residue.number)
                map_pdb2seq[pdb_number] = mapped_residue
                map_seq2pdb[mapped_residue] = pdb_number
            elif options.loglevel >= 3:
                ## residue is not aligned to the sequence
                options.stdlog.write( "# skipped residue %s=%s %i\n" % (str(residue.number), residue.name, residue_number))

        first_number = str(chain.residues[0].number)
        last_number = str(chain.residues[-1].number)

        return map_structure2seq, map_pdb2seq, map_seq2pdb, residue_number-1, \
               first_number, last_number, structure_sequence
Example #20
0
        tmali.apply( translate )
        
        tmap_mali = Mali.Mali()
        tmap_mali.readFromFile( open(options.filename_map_mali, "r") )

        if tmap_mali.getAlphabet() == "na":
            tmap_mali.apply( translate )
        
        map_old2new = alignlib.makeAlignmentVector()

        mali1 = alignlib.makeProfileFromMali( convertMali2Mali( tmali ) )

        if tmap_mali.getLength() == 1:
            
            s = tmap_mali.values()[0].mString
            mali2 = alignlib.makeSequence( s )
            ## see if you can find an identical subsequence and then align to this
            for x in tmali.values():
                if s in re.sub( "[- .]+", "", x.mString):
                    mali1 = alignlib.makeSequence( x.mString )
                    break
        else:
            mali2 = alignlib.makeProfileFromMali( convertMali2Mali( tmap_mali ) )        

        alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -2.0 )
        alignator.align( map_old2new, mali1, mali2 )

        consensus = tmap_mali.getConsensus()
        
        if options.loglevel >= 4:
            options.stdlog.write( "# alphabet: %s\n" % tmap_mali.getAlphabet() )
Example #21
0
        filename_sequences = None,
        format = "fasta",
        )

    (options, args) = E.Start( parser )

    if not options.filename_sequences:
        raise "please supply filename with sequences."

    sequences = Genomics.ReadPeptideSequences( open(options.filename_sequences, "r") )

    if options.loglevel >= 1:
        print "# read %i sequences" % len(sequences)
        
    for k in sequences.keys():
        sequences[k] = alignlib.makeSequence( sequences[k] )

    if options.loglevel >= 2:
        print "# converted %i sequences" % len(sequences)
    
    ninput, noutput, nskipped, nfailed = 0, 0, 0, 0
    link = BlastAlignments.Link()

    ali = alignlib.makeAlignataVector()
    
    for line in sys.stdin:
        
        if line[0] == "#": continue

        link.Read( line )
        ninput += 1
Example #22
0
 def align(self, query, sbjct, map_query2sbjct):
     """align *query* to *sbjct* with ``self.mAlignator``.

     Both arguments are converted to alignlib sequences from their
     string representation (``asString()``). They are handed to the
     alignator together with *map_query2sbjct*. Nothing is returned.
     """
     row_sequence = alignlib.makeSequence(query.asString())
     col_sequence = alignlib.makeSequence(sbjct.asString())
     self.mAlignator.align(row_sequence, col_sequence, map_query2sbjct)
Example #23
0
            print "version="
            sys.exit(0)
        elif o in ( "-h", "--help" ):
            print globals()["__doc__"]
            sys.exit(0)

    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, param_gop, param_gep )
    map_query2token = alignlib.makeAlignmentVector()
    
    for line in sys.stdin:
        if line[0] == "#": continue

        query_token, sbjct_token, query_sequence, sbjct_sequence = string.split(line[:-1], "\t")

        map_query2token.clear()
        row = alignlib.makeSequence(query_sequence)
        col = alignlib.makeSequence(sbjct_sequence)
        alignator.align( map_query2token, row, col )

        pidentity = 100.0 * alignlib.calculatePercentIdentity( map_query2token, row, col )
        psimilarity = 100.0 * alignlib.calculatePercentSimilarity( map_query2token )        
        print string.join( map(str, (
            query_token, sbjct_token,
            map_query2token.getScore(),
            alignlib.AlignmentFormatEmissions( map_query2token ),
            pidentity,
            psimilarity,
            map_query2token.getNumGaps()) ), "\t" )
            
            
            
def ProcessRegion( predictions, region_id, region,
                   peptide_sequences = None,
                   filter_queries = None ):
    """process a set of matches to a region.

    resolve region according to homology.

    Predictions are sorted in place by decreasing score. Each prediction
    that has not been assigned to a cluster yet seeds a new cluster
    (incrementing *region_id*). Later predictions join the cluster if they
    overlap the seed on the genome and if the query peptides align with
    score, coverage and identity above the ``options.overlap_*``
    thresholds. The verdict for each pair of queries is cached in the
    module-level ``global_alignments``. Every cluster is passed to
    PrintEdges.

    predictions
        list of predictions within the region.
    region_id
        identifier of the last cluster that has been output.
    region
        the region; passed on to PrintEdges and used in log messages.
    peptide_sequences
        mapping of query token to peptide sequence. If false, no
        clusters are built and nothing is output.
    filter_queries
        collection of query tokens that are left out of the output;
        they are counted as skipped.

    Returns a tuple (region_id, noutput, nskipped), where noutput is the
    sum of the values returned by PrintEdges.
    """

    if filter_queries is None:
        filter_queries = {}

    if options.loglevel >= 3:
        options.stdlog.write( "###################################################################\n" )
        options.stdlog.write( "# resolving %i predictions in region %s\n" % ( len(predictions), str(region)) )
        options.stdlog.flush()

    # sorting ascending and reversing keeps the order of the former
    # cmp-based sort, including the order among equal scores.
    predictions.sort( key = lambda p: p.score )
    predictions.reverse()

    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, options.gop, options.gep )
    result = alignlib.makeAlignmentVector()

    # map_sequence2cluster[i] == i as long as prediction i is not part of
    # a cluster of a higher scoring prediction. A list is required as
    # entries are reassigned.
    map_sequence2cluster = list(range(len(predictions)))

    noutput, nskipped = 0, 0

    if peptide_sequences:
        for x in range(len(predictions)):
            seed = predictions[x]

            if options.loglevel >= 5:
                options.stdlog.write( "# filtering from %i with prediction %i: %s\n" % (x, seed.mPredictionId, seed.mQueryToken) )
                options.stdlog.flush()

            if map_sequence2cluster[x] != x: continue

            region_id += 1
            edges = []

            if seed.mQueryToken not in filter_queries:
                edges.append( seed )
            else:
                nskipped += 1

            for y in range(x+1,len(predictions)):

                if map_sequence2cluster[y] != y: continue

                other = predictions[y]

                # check if predictions are overlapping on the genomic sequence
                if min(seed.mSbjctGenomeTo,   other.mSbjctGenomeTo) - \
                   max(seed.mSbjctGenomeFrom, other.mSbjctGenomeFrom) < 0:
                    if options.loglevel >= 4:
                        options.stdlog.write( "# alignment of predictions %i and %i: no overlap on genomic sequence, thus skipped\n" %\
                                                  (seed.mPredictionId,
                                                   other.mPredictionId ) )
                        options.stdlog.flush()
                    continue

                # cache key is independent of the order of the two queries
                if seed.mQueryToken < other.mQueryToken:
                    key = "%s-%s" % (seed.mQueryToken, other.mQueryToken)
                else:
                    key = "%s-%s" % (other.mQueryToken, seed.mQueryToken)

                if key not in global_alignments:

                    seq1 = peptide_sequences[seed.mQueryToken]
                    seq2 = peptide_sequences[other.mQueryToken]
                    result.clear()
                    s1 = alignlib.makeSequence( seq1 )
                    s2 = alignlib.makeSequence( seq2 )
                    alignator.align( result, s1, s2 )

                    # percent of each peptide covered by the alignment
                    c1 = 100 * (result.getRowTo() - result.getRowFrom()) / len(seq1)
                    c2 = 100 * (result.getColTo() - result.getColFrom()) / len(seq2)
                    min_cov = min(c1,c2)
                    max_cov = max(c1,c2)

                    identity = alignlib.calculatePercentIdentity( result, s1, s2 ) * 100

                    # check if predictions overlap and they are homologous
                    global_alignments[key] = \
                        result.getScore() >= options.overlap_min_score and \
                        max_cov >= options.overlap_max_coverage and \
                        min_cov >= options.overlap_min_coverage and \
                        identity >= options.overlap_min_identity

                    if options.loglevel >= 4:
                        options.stdlog.write( "# alignment=%s score=%i pid=%5.2f c1=%i c2=%i min_cov=%i max_cov=%i homolog=%s\n" %\
                                                  (key,
                                                   result.getScore(),
                                                   identity,
                                                   c1,c2, min_cov, max_cov,
                                                   global_alignments[key]) )
                        options.stdlog.flush()

                if global_alignments[key]:
                    map_sequence2cluster[y] = x
                    if other.mQueryToken not in filter_queries:
                        edges.append( other )
                    else:
                        nskipped += 1

            noutput += PrintEdges( region_id, region, edges )

    return region_id, noutput, nskipped
Example #25
0
def _alignToProfile(infile, outfile, min_score=0):
    '''align sequences in *infile* against mali

    The reference multiple alignment is read from the fixed path
    :file:`../data/mouse.fasta`. Each sequence in *infile* and its
    complement (``Genomics.complement``) are aligned against a profile
    built from the reference; the higher scoring of the two alignments
    is used.

    Alignments that are empty or that score less than *min_score* are
    skipped.

    Output multiple alignment in fasta format to *outfile* and a
    tab-separated table with one row per accepted sequence to
    :file:`outfile.info`.
    '''

    mali = Mali.Mali()
    # NOTE(review): assumes Mali.readFromFile consumes the handle before
    # returning, so that it can be closed right away - confirm.
    with open("../data/mouse.fasta") as mali_file:
        mali.readFromFile(mali_file)
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    profile = alignlib.makeProfile(profile_mali)

    alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_WRAP,
                                             -5.0, -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # the reference alignment is the first entry of the alignment built
    build_mali = alignlib.makeMultAlignment()
    identity_map = alignlib.makeAlignmentVector()
    identity_map.addDiagonal(0, n, 0)
    build_mali.add(src_mali, identity_map)

    # both output files are closed even if an alignment step fails
    with open(outfile, "w") as outf, \
            open(outfile + ".info", "w") as outf_log:

        outf_log.write(
            "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n"
        )

        sequences, aa = alignlib.StringVector(), alignlib.AlignandumVector()
        ids = []

        for pid in mali.getIdentifiers():
            sequences.append(re.sub("-", "", mali[pid]))
            ids.append(pid)

        c = E.Counter()

        with open(infile) as fasta_file:
            for s in FastaIterator.FastaIterator(fasta_file):

                E.debug("adding %s" % s.title)
                c.input += 1
                rsequence = Genomics.complement(s.sequence)
                seq = alignlib.makeSequence(s.sequence)
                rseq = alignlib.makeSequence(rsequence)

                alignator.align(map_seq2profile, seq, profile)
                alignator.align(map_rseq2profile, rseq, profile)

                # continue with the better scoring orientation
                if map_seq2profile.getScore() > map_rseq2profile.getScore():
                    m, sequence = map_seq2profile, s.sequence
                else:
                    m, sequence = map_rseq2profile, rsequence

                if m.getLength() == 0 or m.getScore() < min_score:
                    c.skipped += 1
                    continue

                r = getParts(m)

                # each part is added as a separate entry
                covered = 0
                for mm in r:
                    build_mali.add(mm)
                    sequences.append(sequence)
                    ids.append(s.title)
                    covered += mm.getLength() - mm.getNumGaps()

                mali_covered = m.getColTo() - m.getColFrom()

                row = (s.title,
                       len(s.sequence),
                       m.getRowFrom(),
                       m.getRowTo(),
                       len(r),
                       covered,
                       "%5.2f" % (100.0 * covered / len(s.sequence)),
                       m.getScore(),
                       m.getColFrom(),
                       m.getColTo(),
                       mali_covered,
                       "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns()))
                outf_log.write("\t".join(map(str, row)) + "\n")

                c.output += 1

        # build_mali.expand( aa )
        result = str(
            alignlib.MultAlignmentFormatPlain(build_mali, sequences,
                                              alignlib.UnalignedStacked))

        # one line of the formatted alignment per identifier
        for pid, data in zip(ids, result.split("\n")):
            start, sequence, end = data.split("\t")
            outf.write(">%s/%i-%i\n%s\n" %
                       (pid, int(start) + 1, int(end), sequence))

    E.info("%s\n" % str(c))
def _alignToProfile( infile, outfile, 
                     min_score = 0 ):
    '''align sequences in *infile* against mali

    The reference multiple alignment is read from the fixed path
    :file:`../data/mouse.fasta`. Each sequence in *infile* and its
    complement (``Genomics.complement``) are aligned against a profile
    built from the reference; the higher scoring of the two alignments
    is used.

    Alignments that are empty or that score less than *min_score* are
    skipped.

    Output multiple alignment in fasta format to *outfile* and a
    tab-separated table with one row per accepted sequence to
    :file:`outfile.info`.
    '''

    mali = Mali.Mali()
    # NOTE(review): assumes Mali.readFromFile consumes the handle before
    # returning, so that it can be closed right away - confirm.
    with open( "../data/mouse.fasta" ) as mali_file:
        mali.readFromFile( mali_file )
    src_mali = Mali.convertMali2Alignlib( mali )

    E.debug( "read mali: %i sequences x %i columns" % (mali.getNumSequences(), mali.getNumColumns() ))

    # add pseudocounts
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0,2):
            profile_mali.addSequence( "%s%i" % (x,y), 0, n, x * n )

    profile_mali = Mali.convertMali2Alignlib( profile_mali )
    alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.DNA4 ) )
    alignlib.setDefaultLogOddor( alignlib.makeLogOddorUniform() )

    profile = alignlib.makeProfile( profile_mali )

    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_WRAP,
                                              -5.0,
                                              -0.5 )

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # the reference alignment is the first entry of the alignment built
    build_mali = alignlib.makeMultAlignment()
    identity_map = alignlib.makeAlignmentVector()
    identity_map.addDiagonal( 0, n, 0 )
    build_mali.add( src_mali, identity_map )

    # both output files are closed even if an alignment step fails
    with open( outfile, "w" ) as outf, \
            open( outfile + ".info", "w" ) as outf_log:

        outf_log.write( "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n" )

        sequences, aa = alignlib.StringVector(), alignlib.AlignandumVector()
        ids = []

        for pid in mali.getIdentifiers():
            sequences.append( re.sub( "-", "", mali[pid] ) )
            ids.append( pid )

        c = E.Counter()

        with open( infile ) as fasta_file:
            for s in FastaIterator.FastaIterator( fasta_file ):

                E.debug( "adding %s" % s.title )
                c.input += 1
                rsequence = Genomics.complement( s.sequence )
                seq = alignlib.makeSequence( s.sequence )
                rseq = alignlib.makeSequence( rsequence )

                alignator.align( map_seq2profile, seq, profile )
                alignator.align( map_rseq2profile, rseq, profile )

                # continue with the better scoring orientation
                if map_seq2profile.getScore() > map_rseq2profile.getScore():
                    m, sequence = map_seq2profile, s.sequence
                else:
                    m, sequence = map_rseq2profile, rsequence

                if m.getLength() == 0 or m.getScore() < min_score:
                    c.skipped += 1
                    continue

                r = getParts( m )

                # each part is added as a separate entry
                covered = 0
                for mm in r:
                    build_mali.add( mm )
                    sequences.append( sequence )
                    ids.append( s.title )
                    covered += mm.getLength() - mm.getNumGaps()

                mali_covered = m.getColTo() - m.getColFrom()

                row = ( s.title,
                        len(s.sequence),
                        m.getRowFrom(),
                        m.getRowTo(),
                        len(r),
                        covered,
                        "%5.2f" % (100.0 * covered / len(s.sequence) ),
                        m.getScore(),
                        m.getColFrom(),
                        m.getColTo(),
                        mali_covered,
                        "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns()) )
                outf_log.write( "\t".join( map( str, row ) ) + "\n" )

                c.output += 1

        #build_mali.expand( aa )
        result = str(alignlib.MultAlignmentFormatPlain( build_mali, 
                                                        sequences, 
                                                        alignlib.UnalignedStacked ))

        # one line of the formatted alignment per identifier
        for pid, data in zip( ids, result.split("\n") ):
            start, sequence, end = data.split("\t")
            outf.write( ">%s/%i-%i\n%s\n" % (pid, int(start)+1, int(end), sequence) )

    E.info( "%s\n" % str(c) )
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $",
                                    usage = globals()["__doc__"] )

    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genome."  )

    parser.add_option( "-b", "--boundaries", dest="filename_boundaries", type="string",
                       help="filename with exon boundaries."  )

    parser.add_option( "-e", "--exons", dest="filename_exons", type="string",
                       help="filename with exons (output)."  )

    parser.add_option( "-p", "--peptides", dest="filename_peptides", type="string",
                       help="filename with peptide sequences."  )

    parser.add_option( "-w", "--write-notfound", dest="write_notfound", action="store_true",
                       help="print exons for predictions not found in reference."  )

    parser.add_option( "-q", "--quality-pide", dest="quality_threshold_pide", type="int",
                       help="quality threshold (pide) for exons."  )

    parser.set_defaults( 
        genome_file = "genome",
        filename_boundaries = None,
        filename_exons = None,
        filename_peptides = None,
        quality_threshold_pide = 0,
        write_notfound = False,
        ## allowed number of nucleotides for exon boundaries to
        ## be considered equivalent.
        slipping_exon_boundary = 9,
        ## stop codons to search for        
        stop_codons = ("TAG", "TAA", "TGA"), )


    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    reference_exon_boundaries = {}
    if options.filename_boundaries:
        reference_exon_boundaries = Exons.ReadExonBoundaries( open( options.filename_boundaries, "r"),
                                                              do_invert = 1,
                                                              remove_utr = 1)
        E.info( "read exon boundaries for %i queries" % len(reference_exon_boundaries) )
                
    if options.filename_exons:
        outfile_exons = open( options.filename_exons, "w")
        outfile_exons.write( "%s\n" % "\t".join( (
                    "prediction_id",
                    "exon_id",
                    "exon_from",
                    "exon_to",
                    "exon_frame",
                    "reference_id",
                    "reference_from",
                    "reference_to",
                    "reference_phase",
                    "pidentity",
                    "psimilarity",
                    "nframeshifts",
                    "ngaps",
                    "nstopcodons",
                    "is_ok",
                    "genome_exon_from",
                    "genome_exon_to") ) )

    else:
        outfile_exons = None

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r") )
        E.info("read peptide sequences for %i queries" % len(peptide_sequences) )
    else:
        peptide_sequences = {}

    entry = PredictionParser.PredictionParserEntry()
    last_filename_genome = None
    
    nfound, nmissed_exons, nmissed_length = 0, 0, 0
    nempty_alignments = 0

    fasta = IndexedFasta.IndexedFasta( options.genome_file )

    options.stdout.write( "%s\n" % "\t".join( (
                "prediction_id", 
                "number",
                "dubious_exons",
                "boundaries_sum",
                "boundaries_max",
                "identical_exons",
                "inserted_exons",
                "deleted_exons",
                "inserted_introns",
                "deleted_introns",
                "truncated_Nterminus",
                "truncated_Cterminus",
                "deleted_Nexons",
                "deleted_Cexons",
                "inserted_Nexons",
                "inserted_Cexons" ) ) )

    for line in sys.stdin:

        if line[0] == "#": continue
        
        try:
            entry.Read(line)
        except ValueError, msg:
            print "# parsing failed with msg %s in line %s" % (msg, line[:-1])
            sys.exit(1)

        exons = Genomics.Alignment2ExonBoundaries( entry.mMapPeptide2Genome,
                                                   query_from = entry.mQueryFrom,
                                                   sbjct_from = entry.mSbjctGenomeFrom,
                                                   add_stop_codon = 0 )

        if exons[-1][4] != entry.mSbjctGenomeTo:
            print "# WARNING: discrepancy in exon calculation!!!"
            for e in exons:
                print "#", str(e)
            print "#", str(entry)

        if options.loglevel >= 5:
            for e in exons:
                print "#", str(e)
        
        genomic_fragment = fasta.getSequence( entry.mSbjctToken, entry.mSbjctStrand,
                                              entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo )
        
        skip = False
        if peptide_sequences.has_key( entry.mQueryToken ):
            
            query_sequence = alignlib.makeSequence(peptide_sequences[entry.mQueryToken])
            sbjct_sequence = alignlib.makeSequence(entry.mTranslation)
            
            percent_similarity, percent_identity = 0, 0
            if query_sequence.getLength() < entry.mMapPeptide2Translation.getRowTo():
                print "# WARNING: query sequence %s is too short: %i %i" % ( entry.mQueryToken,
                                                                             query_sequence.getLength(),
                                                                             entry.mMapPeptide2Translation.getRowTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True
                
            elif sbjct_sequence.getLength() < entry.mMapPeptide2Translation.getColTo():
                print "# WARNING: sbjct sequence %s is too short: %i %i" % ( entry.mSbjctToken,
                                                                       sbjct_sequence.getLength(),
                                                                       entry.mMapPeptide2Translation.getColTo())
                sys.stdout.flush()                
                nmissed_length += 1
                skip = True
            else:
                alignlib.rescoreAlignment( entry.mMapPeptide2Translation, 
                                           query_sequence, 
                                           sbjct_sequence,
                                           alignlib.makeScorer( query_sequence, sbjct_sequence ) )
                percent_identity = alignlib.calculatePercentIdentity( entry.mMapPeptide2Translation,
                                                                      query_sequence,
                                                                      sbjct_sequence ) * 100
                percent_similarity = alignlib.calculatePercentSimilarity( entry.mMapPeptide2Translation ) * 100
                
            E.debug( "prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f" % (
                    str(entry.mPredictionId), 
                    entry.mPercentSimilarity,
                    entry.mPercentIdentity,
                    percent_similarity,
                    percent_identity ) )
                
        else:
            query_sequence = None
            sbjct_sequence = None

        # default values
        exons_num_exons = "na"
        exons_boundaries_sum = "na"
        exons_boundaries_max = "na"
        dubious_exons = "na"

        ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0,0,0,0,0
        truncated_Nterminal_exon, truncated_Cterminal_exon = 0,0
        ndeleted_Nexons, ndeleted_Cexons = 0, 0
        ninserted_Nexons, ninserted_Cexons = 0, 0
        
        exons_offset = exons[0][3]

        if not reference_exon_boundaries.has_key( entry.mQueryToken ):
            print "# WARNING: sequence %s has no exon boundaries" % ( entry.mQueryToken )
            sys.stdout.flush()
            nmissed_exons += 1
            skip = True
        
        if not skip:

            nfound += 1
            
            ref_exons = reference_exon_boundaries[entry.mQueryToken]

            ref_exons_offset = ref_exons[0].mGenomeFrom
            
            exons_num_exons = len(ref_exons) - len(exons)
            exons_boundaries_sum = 0
            exons_phase = 0
            exons_boundaries_max = 0
            dubious_exons = 0
            
            inserted_exons = 0
            temp_inserted_exons = 0
            
            if options.loglevel >= 3:
                for e in exons:
                    options.stdlog.write( "# %s\n" % str(e) )
                for e in ref_exons:
                    options.stdlog.write( "# %s\n" % str(e) )

            min_pide = entry.mPercentIdentity * options.quality_threshold_pide / 100

            in_sync = 0
            e,r = 0,0

            while e < len(exons) and r < len(ref_exons):

                this_e, this_r = e+1, r+1
                percent_identity = 0
                percent_similarity = 0
                is_good_exon = 0

                if options.loglevel >= 4:
                    options.stdlog.write( "# current exons: %i and %i\n" % (e, r) )
                    sys.stdout.flush()
                    
                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[e][0:6]
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (ref_exons[r].mPeptideFrom,
                                                                               ref_exons[r].mPeptideTo,
                                                                               ref_exons[r].frame,
                                                                               ref_exons[r].mGenomeFrom,
                                                                               ref_exons[r].mGenomeTo)

                ref_genome_from -= ref_exons_offset
                ref_genome_to   -= ref_exons_offset

                ## get percent identity for exon
                exon_percent_identity = 0
                exon_percent_similarity = 0
                
                if query_sequence and sbjct_sequence:
                    
                    tmp_ali = alignlib.makeAlignmentVector()

                    xquery_from = exon_from / 3
                    xquery_to = exon_to / 3

                    alignlib.copyAlignment( tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to )

                    if tmp_ali.getLength() == 0:
                        options.stdlog.write( "# WARNING: empty alignment %s\n" % str((ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to)))
                        nempty_alignments += 1
                    else:
                        if options.loglevel >= 5:
                            options.stdlog.write( "# %s\n" % str( alignlib.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence ) ) )

                        exon_percent_identity = alignlib.calculatePercentIdentity( tmp_ali,
                                                                                   query_sequence,
                                                                                   sbjct_sequence ) * 100
                        exon_percent_similarity = alignlib.calculatePercentSimilarity( tmp_ali ) * 100

                if exon_percent_identity >= min_pide:
                    is_good_exon = 1
                else:
                    is_good_exon = 0
                    
                if e < len(exons) -1 :
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to, next_exon_ali) = exons[e+1][0:6]
                else:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to, next_exon_ali) = 0, 0, 0, 0, 0, []
                    
                if r < len(ref_exons) - 1:
                    next_ref_from, next_ref_to, next_ref_phase = (ref_exons[r+1].mPeptideFrom,
                                                                  ref_exons[r+1].mPeptideTo,
                                                                  ref_exons[r+1].frame)
                else:
                    next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0
                    
                if options.loglevel >= 2:
                    options.stdlog.write( "# %s\n" % "\t".join( map(str, (entry.mQueryToken,
                                                                          exon_from, exon_to, exon_phase,
                                                                          exon_genome_from, exon_genome_to,
                                                                          ref_from, ref_to, ref_phase ))))
                    sys.stdout.flush()                    

                # beware of small exons.
                # if less than options.slipping_exon_boundary: boundary is 0
                # check if end is more than options.splipping_exon_boundary apart as well.
                if exon_to - exon_from <= options.slipping_exon_boundary or \
                        ref_to - ref_from <= options.slipping_exon_boundary:
                    boundary = 0
                else:
                    boundary = options.slipping_exon_boundary
                    
                if ref_to <= exon_from + boundary and \
                   ref_to <= exon_to - options.slipping_exon_boundary:
                    ## no overlap 
                    is_good_exon = 0
                    if e == 0:
                        ndeleted_Nexons += 1
                    else:
                        ndeleted_exons += 1
                    r += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0
                    overlap = 0
                elif exon_to <= ref_from + boundary and \
                         exon_to <= ref_to - options.slipping_exon_boundary:
                    ## no overlap
                    is_good_exon = 0
                    if r == 0:
                        ninserted_Nexons += 1
                    else:
                        ninserted_exons += 1
                    e += 1
                    ref_from, ref_to, ref_phase = 0, 0, 0
                    overlap = 0
                else:
                    ## overlap
                    overlap = 1
                    dfrom = int(math.fabs(exon_from - ref_from))
                    dto = int(math.fabs(exon_to - ref_to))

                    ## get percent identity for overlapping fragment 
                    if query_sequence and sbjct_sequence:
                        ## this the problem
                        tmp_ali = alignlib.makeAlignmentVector()
                        
                        xquery_from = max( ref_from / 3, exon_from / 3)
                        xquery_to = min(ref_to / 3, exon_to / 3)

                        alignlib.copyAlignment( tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to )

                        if tmp_ali.getLength() == 0:
                            options.stdlog.write( "# warning: empty alignment %s\n" % str((ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to )))
                            percent_identity = 0
                            percent_similarity = 0
                        else:
                            if options.loglevel >= 5:
                                print str( alignlib.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence ) )

                            percent_identity = alignlib.calculatePercentIdentity( tmp_ali,
                                                                                  query_sequence,
                                                                                  sbjct_sequence ) * 100
                            percent_similarity = alignlib.calculatePercentSimilarity( tmp_ali ) * 100
                            
                    if percent_identity >= min_pide:
                        is_good_exon = 1
                    else:
                        is_good_exon = 0
                        dubious_exons += 1

                    ## adjust regions for terminal exons
                    if e == 0 and r == 0 and dfrom <= (entry.mQueryFrom - 1) * 3 and dfrom > 0:
                        if is_good_exon:                        
                            truncated_Nterminal_exon = dfrom
                        dfrom = 0
                            
                    ## truncated terminal exons
                    if e == len(exons)-1 and r == len(ref_exons)-1 and dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        if is_good_exon:                        
                            truncated_Cterminal_exon = dto
                        dto = 0

                    ## do not count deviations for terminal query exons
                    if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0:
                        dfrom = 0
                            
                    if e == len(exons)-1 and dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        dto = 0

                    ## permit difference of one codon (assumed to be stop)
                    if e == len(exons)-1 and r == len(ref_exons)-1 and dto == 3:
                        dto = 0

                    ## deal with different boundary conditions:
                    if dfrom == 0 and dto == 0:
                        if is_good_exon: nidentical_exons += 1
                        e += 1
                        r += 1
                    ## next exon within this ref_exon
                    elif exon_to < ref_to and next_exon_to and next_exon_to <= ref_to + options.slipping_exon_boundary:
                        if is_good_exon: ninserted_introns += 1
                        e += 1
                        in_sync = 1
                        dto = 0
                    ## next ref_exon within this exon
                    elif ref_to < exon_to and next_ref_to and next_ref_to <= exon_to + options.slipping_exon_boundary:
                        if is_good_exon: ndeleted_introns += 1
                        r += 1
                        in_sync = 1
                        dto = 0
                    else:
                        e += 1
                        r += 1
                        if in_sync:
                            dfrom = 0

                    if is_good_exon:
                        exons_boundaries_sum += dfrom + dto
                        exons_boundaries_max = max( dfrom, exons_boundaries_max )
                        exons_boundaries_max = max( dto, exons_boundaries_max )
                    
                        
                    ###########################################################
                    ## count inserted/deleted introns and misplaced boundaries
                    ##
                    ## if exon and next_exon in ref_exon: inserted intron
                    ## if ref_exon and next_ref_exon in exon: deleted intron
                    
                if outfile_exons:

                    if genomic_fragment and exon_genome_to:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom,
                                                                                                                       exon_ali,
                                                                                                                       genomic_fragment,
                                                                                                                       border_stop_codon = 0
                                                                                                                       )
                    else:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0

                    if exon_to == 0: this_e = 0
                    if ref_to == 0: this_r = 0
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                this_e, exon_from, exon_to, exon_phase,
                                                                this_r, ref_from, ref_to, ref_phase,
                                                                percent_identity, percent_similarity,
                                                                nframeshifts, ngaps, nstopcodons,
                                                                is_good_exon,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")
                    
            while e < len(exons):
                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[e][0:5]
                e += 1
                ninserted_Cexons += 1

                if outfile_exons:
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId, 
                                                                e, exon_from, exon_to, exon_phase,
                                                                0, 0, 0, 0,
                                                                0, 0,
                                                                0, 0, 0,
                                                                1,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")
                    
            while r < len(ref_exons):
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (ref_exons[r].mPeptideFrom,
                                                                               ref_exons[r].mPeptideTo,
                                                                               ref_exons[r].frame,
                                                                               ref_exons[r].mGenomeFrom,
                                                                               ref_exons[r].mGenomeTo)
                ndeleted_Cexons += 1
                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset
                r += 1
                if outfile_exons:
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                0, 0, 0, 0,
                                                                r, ref_from, ref_to, ref_phase, 
                                                                0, 0,
                                                                0, 0, 0,
                                                                0,
                                                                0, 0,
                                                                )), "\t") + "\n")
        else:
            if options.write_notfound:
                this_e = 0
                ## use prediction's identity/similarity for exons.
                ## This will still then flag stop-codons in later analysis
                percent_identity = entry.mPercentIdentity
                percent_similarity = entry.mPercentSimilarity
            
                for exon in exons:
                    this_e += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[0:6]
                    if genomic_fragment:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom,
                                                                                                                       exon_ali,
                                                                                                                       genomic_fragment )
                    
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                this_e, exon_from, exon_to, exon_phase,
                                                                0, 0, 0, 0,
                                                                percent_identity, percent_similarity,
                                                                nframeshifts, ngaps, nstopcodons,
                                                                1,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")
            
        options.stdout.write( "\t".join(map(str,
                              (entry.mPredictionId,
                               exons_num_exons,
                               dubious_exons,
                               exons_boundaries_sum,
                               exons_boundaries_max,
                               nidentical_exons,
                               ninserted_exons, ndeleted_exons,
                               ninserted_introns, ndeleted_introns,
                               truncated_Nterminal_exon, truncated_Cterminal_exon,
                               ndeleted_Nexons, ndeleted_Cexons,
                               ninserted_Nexons, ninserted_Cexons))) + "\n" )
Example #28
0
    def buildMali(self, query_nid, neighbours):
        """build a multiple alignment from a set of neighbours.
        """
        # build multiple alignment
        mali = alignlib.makeMultipleAlignment()

        query_sequence = self.mFasta.getSequence(query_nid)

        mali.add(alignlib.makeAlignatum(query_sequence))

        qseq = alignlib.makeSequence(query_sequence)
        alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_LOCAL, -10,
                                                 -2)

        nskipped = 0

        for n in neighbours[:self.mMaxNumNeighbours]:

            if n.mSbjctToken == query_nid: continue
            if n.mEvalue > self.mMaxEvalue:
                nskipped += 1
                continue
            sequence = self.mFasta.getSequence(n.mSbjctToken)

            E.debug("adding %s" % str(n))

            map_query2sbjct = n.getAlignment()

            if map_query2sbjct == None:
                sseq = alignlib.makeSequence(sequence)
                qseq.useSegment(n.mQueryFrom, n.mQueryTo)
                sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)
                map_query2sbjct = alignlib.makeAlignmentVector()
                alignator.align(map_query2sbjct, qseq, sseq)

            if map_query2sbjct.getLength() == 0:
                self.warn("empty alignment: %s" % str(n))
                nskipped += 1
                continue

            if map_query2sbjct.getRowTo() > len(query_sequence):
                self.warn( "alignment out of bounds for query: %i>%i, line=%s" %\
                               (map_query2sbjct.getRowTo(), len(query_sequence), str(n)))
                nskipped += 1
                continue

            elif map_query2sbjct.getColTo() > len(sequence):
                self.warn( "alignment out of bounds for sbjct: %i>%i, line=%s" %\
                               (map_query2sbjct.getColTo(), len(sequence), str(n)))
                nskipped += 1
                continue

            try:
                mali.add(alignlib.makeAlignatum(sequence),
                         map_query2sbjct,
                         mali_is_in_row=True,
                         insert_gaps_mali=False,
                         insert_gaps_alignatum=True,
                         use_end_mali=True,
                         use_end_alignatum=False)
            except RuntimeError, msg:
                self.warn("problem when building alignment for %s: msg=%s" %
                          (str(n), msg))
                nskipped += 1
                continue
Example #29
0
    (options, args) = E.Start( parser, add_pipe_options = True )

    if options.filename_sequences:
        infile = open(options.filename_sequences, "r")
    else:
        infile = sys.stdin

    parser = FastaIterator.FastaIterator( infile )

    sequences = []
    while 1:
        cur_record = iterator.next()
        
        if cur_record is None: break
        sequences.append( (cur_record.title, alignlib.makeSequence(re.sub( " ", "", cur_record.sequence)) ) )
    
    if options.filename_sequences:
        infile.close()

    alignator = alignlib.makeAlignatorFullDP( options.gop, options.gep )
    map_a2b = alignlib.makeAlignataVector()
    nsequences = len(sequences)
    
    for x in range(0,nsequences-1):
        for y in range(x+1, nsequences):
            alignator.Align( sequences[x][1], sequences[y][1], map_a2b)

            row_ali, col_ali = alignlib.writeAlignataCompressed( map_a2b )
            
            options.stdout.write( "%s\t%s\t%i\t%i\t%i\t%s\t%i\t%i\t%s\t%i\t%i\t%i\t%i\n" % (\
Example #30
0
def PrintCluster( cluster,
                  cluster_id,
                  lengths,
                  peptide_sequences = None,
                  regex_preferred = None):
    """print a cluster.

    Take longest sequence as representative. If preferred is given, only take
    genes matching preferred identifier.

    One tab-separated line per member is written to stdout:
    ``representative, member, length`` and, if *peptide_sequences* is
    given, a fourth column with the alignment of representative to member.

    :param cluster: iterable of member identifiers; it is iterated twice.
    :param cluster_id: returned unchanged.
    :param lengths: mapping of identifier to length; members without an
        entry count as length 0.
    :param peptide_sequences: optional mapping of identifier to peptide
        sequence. If it is empty or None no alignments are output.
    :param regex_preferred: optional regular expression. The longest member
        matching it becomes the representative, provided its length is
        larger than 0; otherwise the longest member overall is used.
    :return: *cluster_id*
    """

    if regex_preferred:
        rx = re.compile(regex_preferred)
    else:
        rx = None

    ## longest member overall (rep_a) and longest preferred member (rep_p)
    max_al, rep_a = 0, None
    max_pl, rep_p = 0, None
    for c in cluster:
        if c in lengths:
            l = lengths[c]
        else:
            l = 0

        if l > max_al:
            max_al, rep_a = l, c

        if rx and rx.search(c) and l > max_pl:
            max_pl, rep_p = l, c

    if max_pl > 0:
        rep = rep_p
    else:
        rep = rep_a

    ## the alignator is only built if a pair actually has to be aligned
    ## and is then shared by all members of the cluster
    alignator = None

    for mem in cluster:
        if mem in lengths:
            l = lengths[mem]
        else:
            l = 0

        fields = [rep, mem, l]

        if peptide_sequences:
            map_rep2mem = alignlib.makeAlignmentVector()            

            if rep == mem and rep in lengths:
                ## representative against itself: plain diagonal
                alignlib.addDiagonal2Alignment( map_rep2mem, 1, lengths[rep], 0)
            elif mem in peptide_sequences and \
                     rep in peptide_sequences:
                if alignator is None:
                    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -1.0)
                alignator.align( map_rep2mem,
                                 alignlib.makeSequence( peptide_sequences[rep] ),
                                 alignlib.makeSequence( peptide_sequences[mem] ) )

            ## if neither case applies the alignment stays empty
            fields.append( alignlib.AlignmentFormatEmissions( map_rep2mem ) )

        sys.stdout.write( "\t".join( map(str, fields) ) + "\n" )

    sys.stdout.flush()

    return cluster_id
            print "#", cds_fragment
            print "# genomic"
            print "#",genomic_fragment
            continue
        
        if map_query2sbjct.getRowTo() > len(cds_fragment):
            print "# ERROR: length mismatch: cds fragment (%i) shorter than last aligned residue (%i)" %\
            (len(cds_fragment), map_query2sbjct.getRowTo())
            print "#", line
            print "# cds"
            print "#", cds_fragment
            print "# genomic"
            print "#",genomic_fragment
            continue

        cds_seq = alignlib.makeSequence( cds_fragment )
        genomic_seq = alignlib.makeSequence( genomic_fragment )
        
        data = map( lambda x: string.split(x, "\t"),
                    string.split( alignlib.writePairAlignment( cds_seq,
                                                               genomic_seq,
                                                               map_query2sbjct ), "\n" ))


        row_ali, col_ali = Genomics.RemoveFrameShiftsFromAlignment(data[0][1], data[1][1])

        row_ali = Genomics.MaskStopCodons( row_ali )
        col_ali = Genomics.MaskStopCodons( col_ali )        

        if len(row_ali) != len(col_ali):
            print "# ERROR: wrong alignment lengths."
def EliminateRedundantEntries( rep, 
                               data,
                               eliminated_predictions,
                               options, 
                               peptides,
                               extended_peptides,
                               filter_quality = None,
                               this_quality = None ):
    """Remove members of ``data`` that are redundant with the representative ``rep``.

    Every entry of ``data`` that is not yet a key of ``eliminated_predictions``
    is compared against ``rep``:

    * ``"i"`` - its extended peptide equals the one of ``rep``.
    * ``"p"`` - its extended peptide is contained (``in``) in the one of ``rep``.
    * ``"h"`` - the peptides align with a percent identity of at least
      ``options.min_identity`` and none of the "keep" rules (``covpid``,
      ``gaps``, ``memcov``) applies.
    * ``"l"`` - percent identity is at least ``options.min_identity_non_genes``,
      ``this_quality`` is in ``options.quality_genes`` while the member's
      quality is not, and the member is not better than ``rep`` by more than
      the safety margins.

    The alignment step is only run if the member's quality differs from
    ``this_quality`` or is listed in ``options.quality_exclude_same``.

    Arguments:
       rep: representative; ``transcript_id``, ``mQueryCoverage`` and ``mPid``
          are read.
       data: iterable of members; ``transcript_id``, ``mQueryCoverage``,
          ``mPid`` and ``mQuality`` are read from each.
       eliminated_predictions: mapping, updated in place with
          ``member id -> rep id`` for every eliminated member.
       options: supplies ``gop``, ``gep``, ``loglevel``, ``stdlog`` and the
          various thresholds used below.
       peptides, extended_peptides: mappings from transcript id to sequence.
       filter_quality: not used by this function.
       this_quality: quality class that members are compared against.

    Returns a list of ``(member id, code)`` tuples for the members eliminated
    by this call.
    """

    removed = []

    rep_id = rep.transcript_id
    rep_coverage = rep.mQueryCoverage
    rep_pid = rep.mPid

    def _discard( member_id, code ):
        # register the member as redundant with the representative
        eliminated_predictions[member_id] = rep_id
        removed.append( (member_id, code) )

    def _member_is_better( coverage, pid ):
        # true if the representative falls short of the member by more
        # than the configured safety margins
        return rep_coverage < coverage - options.safety_coverage or \
               rep_pid < pid - options.safety_pide

    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, options.gop, options.gep )
    # alignment container, re-used for every member
    result = alignlib.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    for entry in data:

        mem_id = entry.transcript_id
        mem_coverage = entry.mQueryCoverage
        mem_pid = entry.mPid
        mem_quality = entry.mQuality

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write( "# processing: id=%s class=%s\n" % (mem_id, mem_quality))

        # already assigned to some representative
        if mem_id in eliminated_predictions:
            continue

        # identical extended sequence
        if mem_extended_seq == rep_extended_seq:
            _discard( mem_id, "i" )
            continue

        # extended sequence is part of the representative's
        if mem_extended_seq in rep_extended_seq:
            _discard( mem_id, "p" )
            continue

        needs_alignment = ( mem_quality != this_quality or
                            mem_quality in options.quality_exclude_same )
        if not needs_alignment:
            continue

        seq1 = alignlib.makeSequence( str(rep_seq) )
        seq2 = alignlib.makeSequence( str(mem_seq) )

        alignator.align( result, seq1, seq2 )

        if options.loglevel >= 5:
            options.stdlog.write( "# ali\n%s\n" % alignlib.AlignmentFormatExplicit( result, seq1, seq2 ) )

        pidentity = 100 * alignlib.calculatePercentIdentity( result, seq1, seq2 )
        num_gaps = result.getNumGaps()

        if options.loglevel >= 4:
            options.stdlog.write( "# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n" %\
                                      ( mem_id, mem_quality, pidentity, rep_coverage, mem_coverage ) )

        if pidentity >= options.min_identity:

            # reason stays None unless one of the rules asks to keep the member
            reason = None
            if _member_is_better( mem_coverage, mem_pid ):
                reason = "covpid"
            elif num_gaps >= options.max_gaps and \
                     mem_coverage > rep_coverage - options.safety_coverage:
                reason = "gaps"
            elif mem_coverage >= rep_coverage - options.safety_coverage and \
                     100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage:
                reason = "memcov"

            if reason is None:
                _discard( mem_id, "h" )
            else:
                options.stdlog.write( "# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %\
                                          (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid) )

        elif pidentity >= options.min_identity_non_genes and \
                 this_quality in options.quality_genes and \
                 mem_quality not in options.quality_genes:

            if _member_is_better( mem_coverage, mem_pid ):
                options.stdlog.write( "# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %\
                                          (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid) )
            else:
                _discard( mem_id, "l" )

    return removed
Example #33
0
            if param_is_compressed:
                if unaligned_pair and \
                       unaligned_pair.mToken1 == pair.mToken1 and \
                       unaligned_pair.mToken2 == pair.mToken2 and \
                       unaligned_pair.mIntronId1 == pair.mIntronId1:

                    map_a2b = alignlib.makeAlignmentVector()
                    f = AlignmentFormatEmissions( 
                        pair.mFrom1, 
                        pair.mAlignedSequence1,
                        pair.mFrom2, 
                        pair.mAlignedSequence2).copy( map_a2b )
                    map_a2b.moveAlignment( -unaligned_pair.mFrom1 + 1, -unaligned_pair.mFrom2 + 1 )            

                    data = alignlib.AlignmentFormatExplicit( map_a2b,
                                                             alignlib.makeSequence( unaligned_pair.mAlignedSequence1),
                                                             alignlib.makeSequence( unaligned_pair.mAlignedSequence2) )

                    from1, ali1, to1 = data.mRowFrom, data.mRowAlignment, data.mRowTo
                    from2, ali2, to2 = data.mColFrom, data.mColAlignment, data.mColTo

                    pair.mAlignedSequence1 = ali1
                    pair.mAlignedSequence2 = ali2

                else:
                    raise "sequence not found for pair %s" % str(pair)

                    
            if param_do_gblocks:
                if param_loglevel >= 4:
                    print "# length before: %i %i" % (len(pair.mAlignedSequence1), pair.mAligned)
Example #34
0
            print "# WARNING: discrepancy in exon calculation!!!"
            for e in exons:
                print "#", str(e)
            print "#", str(entry)

        if options.loglevel >= 5:
            for e in exons:
                print "#", str(e)
        
        genomic_fragment = fasta.getSequence( entry.mSbjctToken, entry.mSbjctStrand,
                                              entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo )
        
        skip = False
        if peptide_sequences.has_key( entry.mQueryToken ):
            
            query_sequence = alignlib.makeSequence(peptide_sequences[entry.mQueryToken])
            sbjct_sequence = alignlib.makeSequence(entry.mTranslation)
            
            percent_similarity, percent_identity = 0, 0
            if query_sequence.getLength() < entry.mMapPeptide2Translation.getRowTo():
                print "# WARNING: query sequence %s is too short: %i %i" % ( entry.mQueryToken,
                                                                             query_sequence.getLength(),
                                                                             entry.mMapPeptide2Translation.getRowTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True
                
            elif sbjct_sequence.getLength() < entry.mMapPeptide2Translation.getColTo():
                print "# WARNING: sbjct sequence %s is too short: %i %i" % ( entry.mSbjctToken,
                                                                       sbjct_sequence.getLength(),
                                                                       entry.mMapPeptide2Translation.getColTo())
Example #35
0
        old_length = mali.getLength()
        
        new_mali = convertMali2Mali( mali )

        if options.alignment_method == "sw":
            alignator = alignlib.makeAlignatorFullDP( options.gop, options.gep )
        else:
            alignator = alignlib.makeAlignatorFullDPGlobal( options.gop, options.gep )            
        
        while 1:
            cur_record = iterator.next()
            if cur_record is None: break

            map_mali2seq = alignlib.makeAlignataVector()

            sequence = alignlib.makeSequence( cur_record.sequence )
            profile = alignlib.makeProfileFromMali( new_mali )

            if options.loglevel >= 4:
                options.stdlog.write(profile.Write())

            alignator.Align( profile, sequence, map_mali2seq )

            if options.loglevel >= 3:
                options.stdlog.write( map_mali2seq.Write() )

            ## add sequence to mali
            a = alignlib.makeAlignatumFromString( cur_record.sequence )
            a.thisown = 0
                
            new_mali.addAlignatum( a, map_mali2seq, 1, 1, 1, 1, 1 )