Ejemplo n.º 1
0
def alignIndels( all_alleles, colcounts, extend_by = 0 ):
    '''align all indel-regions.'''

    aa = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, 0, 0 )     
    alignator = alignlib.makeMultipleAlignatorSimple( aa)

    ids = all_alleles.keys()

    for x,c in enumerate(colcounts):
        if c <= 1: continue
        sequences = alignlib.StringVector()
        for sid in ids:
            for allele in all_alleles[sid]:
                sequences.append( allele[x] )

        mali = alignlib.makeMultAlignment()
        alignator.align( mali, sequences )
        realigned = []
        for line in str(alignlib.MultAlignmentFormatPlain( mali, sequences )).split("\n")[:-1]:
            data = line[:-1].split("\t")
            realigned.append( data[1] )
        assert len(realigned) == len(sequences)

        l = max( [len(r) for r in realigned] )
        i = 0
        for sid in ids:
            for allele in all_alleles[sid]:
                if realigned[i]: allele[x] = realigned[i] 
                else: allele[x] = "-" * l 
                i += 1
                
        colcounts[x] = l
Ejemplo n.º 2
0
def alignIndels(all_alleles, colcounts, extend_by=0):
    '''align all indel-regions.'''

    aa = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_LOCAL, 0, 0)
    alignator = alignlib.makeMultipleAlignatorSimple(aa)

    ids = all_alleles.keys()

    for x, c in enumerate(colcounts):
        if c <= 1: continue
        sequences = alignlib.StringVector()
        for sid in ids:
            for allele in all_alleles[sid]:
                sequences.append(allele[x])

        mali = alignlib.makeMultAlignment()
        alignator.align(mali, sequences)
        realigned = []
        for line in str(alignlib.MultAlignmentFormatPlain(
                mali, sequences)).split("\n")[:-1]:
            data = line[:-1].split("\t")
            realigned.append(data[1])
        assert len(realigned) == len(sequences)

        l = max([len(r) for r in realigned])
        i = 0
        for sid in ids:
            for allele in all_alleles[sid]:
                if realigned[i]: allele[x] = realigned[i]
                else: allele[x] = "-" * l
                i += 1

        colcounts[x] = l
Ejemplo n.º 3
0
def CheckAlignments( peptide_sequences, query_token, other_tokens ):
    """check wether query aligns to all others.
    """

    if param_loglevel >= 3:
        print "# checking query %s and sbjcts %s" % (query_token, str(other_tokens))
        sys.stdout.flush()
        
    if query_token not in peptide_sequences:
        return True

    result = alignlib.makeAlignmentVector()
    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL,
                                              -10.0, -1.0 )
    row_seq = alignlib.makeSequence(peptide_sequences[query_token])

    for x in other_tokens:
        if x not in peptide_sequences:
            continue
        col_seq = alignlib.makeSequence( peptide_sequences[x] )
        alignator.align( result, row_seq, col_seq )
        if param_loglevel >= 5:
            print "# %s - %s = %f" % (query_token, x, result.getScore())
        if result.getScore() > param_min_alignment_score:
            return True
        
    return False
Ejemplo n.º 4
0
    def applyMethod(self, neighbours):
        """apply the method."""
        # build multiple alignment
        mali = alignlib.makeMultipleAlignment()

        query_nid = neighbours.mQueryToken

        sequence = self.mFasta.getSequence(query_nid)

        mali.add(alignlib.makeAlignatum(sequence))

        qseq = alignlib.makeSequence(sequence)
        alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_GLOBAL,
                                                 -10.0, -1.0, True, True, True,
                                                 True)

        for n in neighbours.mMatches:

            if n.mSbjctToken == query_nid: continue
            sequence = self.mFasta.getSequence(n.mSbjctToken)

            blast_query2sbjct = n.getAlignment()

            if blast_query2sbjct == None:
                raise ValueError(
                    "AddaRealignment.py needs a reference alignment.")

            realign_query2sbjct = alignlib.makeAlignmentVector()

            sseq = alignlib.makeSequence(sequence)
            qseq.useSegment(n.mQueryFrom, n.mQueryTo)
            sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)
            realign_query2sbjct = alignlib.makeAlignmentVector()
            alignator.align(realign_query2sbjct, qseq, sseq)

            nidentical = alignlib.getAlignmentIdentity(realign_query2sbjct,
                                                       blast_query2sbjct,
                                                       alignlib.RR)
            nblast = blast_query2sbjct.getNumAligned()
            nrealigned = realign_query2sbjct.getNumAligned()

            self.mOutfile.write( "%s\t%s\t%i\t%i\t%i\n" % \
                                     (n.mQueryToken, n.mSbjctToken, nidentical, nblast, nrealigned ) )

            if nidentical == nblast:
                self.mNIdentical += 1
            else:
                self.mNDifferent += 1
Ejemplo n.º 5
0
    def applyMethod(self, neighbours ):
        """apply the method."""
        # build multiple alignment
        mali = alignlib.makeMultipleAlignment()
        
        query_nid = neighbours.mQueryToken
        
        sequence = self.mFasta.getSequence( query_nid )

        mali.add( alignlib.makeAlignatum( sequence ) )

        qseq = alignlib.makeSequence( sequence )
        alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_GLOBAL, 
                                                  -10.0, -1.0, True, True, True, True)

        for n in neighbours.mMatches:

            if n.mSbjctToken == query_nid: continue
            sequence = self.mFasta.getSequence( n.mSbjctToken )

            blast_query2sbjct = n.getAlignment()

            if blast_query2sbjct == None:
                raise ValueError( "AddaRealignment.py needs a reference alignment.")
            
            realign_query2sbjct = alignlib.makeAlignmentVector()
            
            sseq = alignlib.makeSequence( sequence )
            qseq.useSegment( n.mQueryFrom, n.mQueryTo )
            sseq.useSegment( n.mSbjctFrom, n.mSbjctTo )
            realign_query2sbjct = alignlib.makeAlignmentVector()
            alignator.align( realign_query2sbjct, qseq, sseq )

            nidentical = alignlib.getAlignmentIdentity( realign_query2sbjct, blast_query2sbjct, alignlib.RR )
            nblast = blast_query2sbjct.getNumAligned()
            nrealigned = realign_query2sbjct.getNumAligned()

            self.mOutfile.write( "%s\t%s\t%i\t%i\t%i\n" % \
                                     (n.mQueryToken, n.mSbjctToken, nidentical, nblast, nrealigned ) )
            
            if nidentical == nblast:
                self.mNIdentical += 1
            else:
                self.mNDifferent += 1
Ejemplo n.º 6
0
def AlignExhaustive( seq_wobble, seq_cds, seq_peptide, map_p2c, options, diag_width = 2 ):
    """Align two sequences.

    Align in chunks to keep memory low. Both sequences are roughly the same,
    thus align only in diagonal.
    """

    gop, gep = -1.0, -1.0
    matrix = alignlib.makeSubstitutionMatrixBackTranslation( 1, -10, 1, alignlib.getDefaultEncoder() )
    alignlib.setDefaultSubstitutionMatrix( matrix )

    if seq_wobble.getLength() < 10000:
        if options.loglevel >= 6:
            options.stdlog.write( "# using full dynamic programing matrix.\n" )
            options.stdlog.flush()
        # do not penalize gaps at the end, because sometimes the last codon might be missing
        alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_GLOBAL, gop, gep, 1, 1 )
    else:
        diag_width = abs(seq_wobble.getLength() - seq_cds.getLength()) + 1
        if options.loglevel >= 6:
            options.stdlog.write( "# using dot alignment with diagonal %i\n" % diag_width )
            options.stdlog.flush()

        dots = alignlib.makeAlignmentMatrixRow()
            
        for x in range(0, seq_wobble.getLength()):
            xr = seq_wobble.asResidue( x )
            for y in range( max(0, x - diag_width), min( seq_cds.getLength(), x + diag_width)):
                s = matrix.getValue( xr, seq_cds.asResidue(y) )
                if s >= 0:
                    dots.addPair( x, y, float(s) )

        if options.loglevel >= 6:
            options.stdlog.write( "# finished adding %i dots" % dots.getLength() )
            options.stdlog.flush()

        alignator_dummy = alignlib.makeAlignatorPrebuilt( dots )
        
        alignator = alignlib.makeAlignatorDots( alignator_dummy, gop, gep )

    alignator.align( map_p2c, seq_wobble, seq_cds )
Ejemplo n.º 7
0
def PrintCluster( cluster,
                  cluster_id,
                  lengths,
                  peptide_sequences = None,
                  regex_preferred = None):
    """print a cluster.

    Take longest sequence as representative. If preferred is given, only take
    genes matching preferred identifier.
    """

    if regex_preferred:
        rx = re.compile(regex_preferred)
    else:
        rx = None
        
    max_al = 0
    max_pl = 0
    rep_a = None
    rep_p = None
    for c in cluster:
        l = 0
        if c in lengths: l = lengths[c]

        if l > max_al:
            max_al = l
            rep_a  = c

        if rx and rx.search(c) and l > max_pl:
            max_pl = l
            rep_p = c

    if max_pl > 0:
        max_l = max_pl
        rep = rep_p
    else:
        max_l = max_al
        rep = rep_a
        
    for mem in cluster:
        l = 0
        if mem in lengths: l = lengths[mem]
        if peptide_sequences:
            map_rep2mem = alignlib.makeAlignmentVector()            
            
            if rep == mem and rep in lengths:
                alignlib.addDiagonal2Alignment( map_rep2mem, 1, lengths[rep], 0)
            elif mem in peptide_sequences and \
                     rep in peptide_sequences:
                alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -1.0)
                alignator.align( map_rep2mem,
                                 alignlib.makeSequence( peptide_sequences[rep] ),
                                 alignlib.makeSequence( peptide_sequences[mem] ) )
                    
            f = alignlib.AlignmentFormatEmissions( map_rep2mem )
            print string.join( map(str, (rep, mem, l, f)), "\t" ) 

        else:
            print string.join( map(str, (rep, mem, l)), "\t" )
            
    sys.stdout.flush()
    
    return cluster_id
Ejemplo n.º 8
0
def _alignToProfile( infile, outfile, 
                     min_score = 0 ):
    '''align sequences in *infile* against mali

    Only alignments with a score higher than *min_score* are accepted.

    Output multiple alignment in fasta format to *outfile* and a table
    in :file:`outfile.log`.
    '''

    mali = Mali.Mali()
    mali.readFromFile( open("../data/mouse.fasta") )
    src_mali = Mali.convertMali2Alignlib( mali )
    
    E.debug( "read mali: %i sequences x %i columns" % (mali.getNumSequences(), mali.getNumColumns() ))

    # add pseudocounts
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns() 
    for x in "ACGT": 
        for y in range(0,2):
            profile_mali.addSequence( "%s%i" % (x,y), 0, n, x * n )


    profile_mali = Mali.convertMali2Alignlib( profile_mali )
    alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.DNA4 ) )
    alignlib.setDefaultLogOddor( alignlib.makeLogOddorUniform() )

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile( profile_mali )
    
    alignment_mode = alignlib.ALIGNMENT_WRAP

    alignator = alignlib.makeAlignatorDPFull( alignment_mode,
                                              -5.0,
                                              -0.5 )
    
    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # print profile

    build_mali = alignlib.makeMultAlignment()
    m = alignlib.makeAlignmentVector()
    m.addDiagonal( 0, n, 0 )
    build_mali.add( src_mali, m )

    outf = open( outfile, "w" )
    outf_log = open( outfile + ".info", "w" )
    outf_log.write( "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n" )

    sequences, aa = alignlib.StringVector(), alignlib.AlignandumVector()
    ids = []

    for pid in mali.getIdentifiers():
        sequences.append( re.sub( "-", "", mali[pid] ) )
        ids.append( pid )

    # print str(alignlib.MultAlignmentFormatPlain( build_mali, sequences ))

    c = E.Counter()

    for s in FastaIterator.FastaIterator( open(infile)):

        E.debug("adding %s" % s.title )
        c.input += 1
        rsequence = Genomics.complement(s.sequence)
        seq = alignlib.makeSequence( s.sequence )
        rseq = alignlib.makeSequence( rsequence )

        alignator.align( map_seq2profile, seq, profile )
        alignator.align( map_rseq2profile, rseq, profile )

        if map_seq2profile.getScore() > map_rseq2profile.getScore():
            m, seq, sequence = map_seq2profile, seq, s.sequence
        else:
            m, seq, sequence = map_rseq2profile, rseq, rsequence

        if m.getLength() == 0:
            c.skipped += 1
            continue

        if m.getScore() < min_score: 
            c.skipped += 1
            continue

        r = getParts( m )

        covered = 0
        for mm in r:
            build_mali.add( mm )
            sequences.append( sequence )
            ids.append( s.title )
            covered += mm.getLength() - mm.getNumGaps()

        mali_covered = m.getColTo() - m.getColFrom()

        outf_log.write( "\t".join( map(str, (
                        s.title,
                        len(s.sequence),
                        m.getRowFrom(),
                        m.getRowTo(),
                        len(r),
                        covered,
                        "%5.2f" % (100.0 * covered / len(s.sequence) ),
                        m.getScore(),
                        m.getColFrom(),
                        m.getColTo(),
                        mali_covered,
                        "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns())
                        ) ) ) + "\n" )

        c.output += 1

    #build_mali.expand( aa )
    result = str(alignlib.MultAlignmentFormatPlain( build_mali, 
                                                    sequences, 
                                                    alignlib.UnalignedStacked ))

    for pid, data in zip(ids, result.split("\n") ):
        start, sequence, end = data.split("\t")
        outf.write(">%s/%i-%i\n%s\n" % (pid, int(start)+1, int(end), sequence) )


    outf.close()
    outf_log.close()

    E.info( "%s\n" % str(c) )
Ejemplo n.º 9
0
    def startUp( self ):

        if self.isComplete(): return

        ###############################################
        # create objects for algorithm 
        alignlib.getDefaultToolkit().setEncoder( alignlib.getEncoder( alignlib.Protein20 ) )
        self.mLogOddor    = alignlib.makeLogOddorDirichlet( self.mScaleFactor )
        self.mRegularizor = alignlib.makeRegularizorDirichletPrecomputed()
        self.mWeightor    = alignlib.makeWeightor()

        alignlib.getDefaultToolkit().setRegularizor( self.mRegularizor )
        alignlib.getDefaultToolkit().setLogOddor( self.mLogOddor )
        alignlib.getDefaultToolkit().setWeightor( self.mWeightor )


        if self.mUsePrebuiltProfiles:
            self.mProfileLibrary = ProfileLibrary.ProfileLibrary( self.mFilenameProfiles, "r" )
            self.mProfileLibrary.setWeightor( self.mWeightor )
            self.mProfileLibrary.setLogOddor( self.mLogOddor )
            self.mProfileLibrary.setRegularizor( self.mRegularizor )

        else:
            self.mProfileLibrary = None
            self.mIndexedNeighbours = cadda.IndexedNeighbours( self.mFilenameGraph, self.mFilenameIndex )

        self.mChecker = self.checkLinkZScore
        self.mHeader = ("qdomain",
                        "sdomain",
                        "weight",
                        "passed",
                        "qstart",
                        "qend",
                        "qali",
                        "sstart",
                        "send",
                        "sali",
                        "score",
                        "naligned",
                        "ngaps",
                        "zscore" )

        self.mAlignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, 
                                                        self.mGop,
                                                        self.mGep )

        # the cache to store alignandum objects
        self.mCache = {}        
        
        alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.Protein20 ) )

        ## initialize counters
        self.mNPassed, self.mNFailed, self.mNNotFound = 0, 0, 0

        self.mOutfile = self.openOutputStream( self.mFilenameAlignments )

        if self.mContinueAt == None:
            self.mOutfile.write( "\t".join( self.mHeader ) + "\n" ) 
            self.mOutfile.flush()

        self.mStartTime = time.time()
Ejemplo n.º 10
0
def main():

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option(
        "--method",
        dest="method",
        type="choice",
        choices=("view", "align", "pileup", "profile"),
        help="method to perform [default=%default].",
    )

    parser.add_option(
        "--mode", dest="mode", type="choice", choices=("global", "local"), help="alignment mode [default=%default]."
    )

    parser.add_option("--gop", dest="gop", type="float", help="gap opening penalty [default=%default].")

    parser.add_option("--gep", dest="gep", type="float", help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph, options.filename_index)

    alignlib.getDefaultToolkit().setEncoder(alignlib.getEncoder(alignlib.Protein20))
    alignlib.getDefaultToolkit().setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    alignlib.getDefaultToolkit().setLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    if options.method == "view":
        for nid in args:
            nid = int(args[0])

            neighbours = index.getNeighbours(nid)

            for n in neighbours:
                print str(n)

    elif options.method == "pileup":

        if "_" in args[0]:
            nid, start, end = AddaIO.toTuple(args[0])
        else:
            nid = int(args[0])
            start, end = None, None

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":

        if "_" in args[0]:
            nid, start, end = AddaIO.toTuple(args[0])
        else:
            nid = int(args[0])
            start, end = None, None

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start != None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":

        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        align = AddaProfiles.AddaProfiles(config, fasta=fasta)

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

        def _buildProfile(nid, start, end):
            neighbours = index.getNeighbours(nid)
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()

        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n"
            % (
                nid1,
                nid2,
                result.getScore(),
                result.getLength(),
                result.getNumGaps(),
                result.getRowFrom(),
                result.getRowTo(),
                result.getColFrom(),
                result.getColTo(),
            )
        )

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
Ejemplo n.º 11
0
    def buildMali(self, query_nid, neighbours):
        """build a multiple alignment from a set of neighbours.
        """
        # build multiple alignment
        mali = alignlib.makeMultipleAlignment()

        query_sequence = self.mFasta.getSequence(query_nid)

        mali.add(alignlib.makeAlignatum(query_sequence))

        qseq = alignlib.makeSequence(query_sequence)
        alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_LOCAL, -10,
                                                 -2)

        nskipped = 0

        for n in neighbours[:self.mMaxNumNeighbours]:

            if n.mSbjctToken == query_nid: continue
            if n.mEvalue > self.mMaxEvalue:
                nskipped += 1
                continue
            sequence = self.mFasta.getSequence(n.mSbjctToken)

            E.debug("adding %s" % str(n))

            map_query2sbjct = n.getAlignment()

            if map_query2sbjct == None:
                sseq = alignlib.makeSequence(sequence)
                qseq.useSegment(n.mQueryFrom, n.mQueryTo)
                sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)
                map_query2sbjct = alignlib.makeAlignmentVector()
                alignator.align(map_query2sbjct, qseq, sseq)

            if map_query2sbjct.getLength() == 0:
                self.warn("empty alignment: %s" % str(n))
                nskipped += 1
                continue

            if map_query2sbjct.getRowTo() > len(query_sequence):
                self.warn( "alignment out of bounds for query: %i>%i, line=%s" %\
                               (map_query2sbjct.getRowTo(), len(query_sequence), str(n)))
                nskipped += 1
                continue

            elif map_query2sbjct.getColTo() > len(sequence):
                self.warn( "alignment out of bounds for sbjct: %i>%i, line=%s" %\
                               (map_query2sbjct.getColTo(), len(sequence), str(n)))
                nskipped += 1
                continue

            try:
                mali.add(alignlib.makeAlignatum(sequence),
                         map_query2sbjct,
                         mali_is_in_row=True,
                         insert_gaps_mali=False,
                         insert_gaps_alignatum=True,
                         use_end_mali=True,
                         use_end_alignatum=False)
            except RuntimeError, msg:
                self.warn("problem when building alignment for %s: msg=%s" %
                          (str(n), msg))
                nskipped += 1
                continue
Ejemplo n.º 12
0
    def Align( self, method, anchor = 0, loglevel = 1 ):
        """align a pair of sequences.
        get rid of this and use a method class instead in the future
        """
        
        map_a2b = alignlib.makeAlignmentVector()
        s1 = "A" * anchor + self.mSequence1 + "A" * anchor
        s2 = "A" * anchor + self.mSequence2 + "A" * anchor    

        self.strand = "+"

        if method == "dialign":
            dialign = WrapperDialign.Dialign( self.mOptionsDialign )
            dialign.Align( s1, s2, map_a2b )
        elif method == "blastz":
            blastz = WrapperBlastZ.BlastZ( self.mOptionsBlastZ )
            blastz.Align( s1, s2, map_a2b )
            if blastz.isReverseComplement():
                self.strand = "-"
                self.mSequence2 = Genomics.complement( self.mSequence2 )

        elif method == "dialignlgs":
            dialignlgs = WrapperDialign.Dialign( self.mOptionsDialignLGS )
            dialignlgs.Align( s1, s2, map_a2b ) 
        elif method == "dba":
            dba = WrapperDBA.DBA()
            dba.Align( s1, s2, map_a2b )
        elif method == "clustal":
            raise NotImplementedError( "clustal wrapper needs to be updated")
            clustal = WrapperClustal.Clustal()
            clustal.Align( s1, s2, map_a2b )
        elif method == "nw":
            seq1 = alignlib.makeSequence( s1 )
            seq2 = alignlib.makeSequence( s2 )
            alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_GLOBAL,
                                                      gop=-12.0,
                                                      gep=-2.0 )
            alignator.align( map_a2b, seq1, seq2 )
        elif method == "sw":                        
            seq1 = alignlib.makeSequence( s1 )
            seq2 = alignlib.makeSequence( s2 )
            alignlib.performIterativeAlignment( map_a2b, seq1, seq2, alignator_sw, min_score_sw )
        else:
            ## use callback function
            method(s1, s2, map_a2b)

        if map_a2b.getLength() == 0:
            raise AlignmentError("empty alignment")

        if anchor:
            map_a2b.removeRowRegion( anchor + len(self.mSequence1) + 1, map_a2b.getRowTo() )
            map_a2b.removeRowRegion( 1, anchor)        
            map_a2b.removeColRegion( anchor + len(self.mSequence2) + 1, map_a2b.getColTo() )        
            map_a2b.removeColRegion( 1, anchor)
            map_a2b.moveAlignment( -anchor, -anchor )

        f = alignlib.AlignmentFormatExplicit( map_a2b, 
                                              alignlib.makeSequence( self.mSequence1),
                                              alignlib.makeSequence( self.mSequence2) )

        self.mMethod = method
        self.mAlignment = map_a2b
        self.mAlignedSequence1, self.mAlignedSequence2 = f.mRowAlignment, f.mColAlignment
        f = alignlib.AlignmentFormatEmissions( map_a2b )
        self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
        self.mAlignmentFrom1 = map_a2b.getRowFrom()
        self.mAlignmentTo1 = map_a2b.getRowTo()        
        self.mAlignmentFrom2 = map_a2b.getColFrom()
        self.mAlignmentTo2 = map_a2b.getColTo()        
        self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
        self.mAligned = self.mLength - self.mNumGaps

        self.SetPercentIdentity()
        self.SetBlockSizes()
Ejemplo n.º 13
0
	nintrons, 
	nsplits, 
	nstopcodons, 
	pidentity, 
	psimilarity, 
	sequence, 
	sbjct_genome_from, 
	sbjct_genome_to, 
	map_query2genome
    FROM %s AS p 
    WHERE p.sbjct_token = '%s' AND
    p.sbjct_strand = '%s' AND 
    OVERLAP( %i, %i, p.sbjct_genome_from, sbjct_genome_to) > 0 
    """

    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, param_gop, param_gep )
    map_reference2target = alignlib.makeAlignmentVector()    
    assignment_id = 0
    
    for line in cr.fetchall():

        reference = PredictionParser.PredictionParserEntry()
        reference.FillFromTable( line )

        ct = dbhandle.cursor()
        ct.execute( statement % (param_tablename_predictions_target,
                    reference.mSbjctToken, reference.mSbjctStrand,
                    reference.mSbjctGenomeFrom, reference.mSbjctGenomeTo ))

        reference_exons = Exons.Alignment2Exons( reference.mMapPeptide2Genome,
                                                 0, 
def EliminateRedundantEntries( rep, 
                               data,
                               eliminated_predictions,
                               options, 
                               peptides,
                               extended_peptides,
                               filter_quality = None,
                               this_quality = None ):
    """eliminate redundant entries in a set."""
    
    eliminated = []

    rep_id = rep.transcript_id
    rep_coverage, rep_pid = rep.mQueryCoverage, rep.mPid

    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, options.gop, options.gep )
    result = alignlib.makeAlignmentVector()
    
    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    for entry in data:

        mem_id, mem_coverage, mem_pid, mem_quality = ( entry.transcript_id,
                                                       entry.mQueryCoverage,
                                                       entry.mPid,
                                                       entry.mQuality )

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write( "# processing: id=%s class=%s\n" % (mem_id, mem_quality))
            
        if mem_id in eliminated_predictions: continue

        if mem_extended_seq == rep_extended_seq:
            eliminated_predictions[mem_id] = rep_id
            eliminated.append( (mem_id, "i") )

        elif mem_extended_seq in rep_extended_seq:
            eliminated_predictions[mem_id] = rep_id
            eliminated.append( (mem_id, "p") )

        else:
            if mem_quality != this_quality or \
                   mem_quality in options.quality_exclude_same:
          
                seq1 = alignlib.makeSequence( str(rep_seq) )
                seq2 = alignlib.makeSequence( str(mem_seq) )            

                alignator.align( result, seq1, seq2 )

                if options.loglevel >= 5:
                    options.stdlog.write( "# ali\n%s\n" % alignlib.AlignmentFormatExplicit( result, seq1, seq2 ) )
                
                pidentity = 100 * alignlib.calculatePercentIdentity( result, seq1, seq2 )
                
                num_gaps = result.getNumGaps()

                if options.loglevel >= 4:
                    options.stdlog.write( "# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n" %\
                                              ( mem_id, mem_quality, pidentity, rep_coverage, mem_coverage ) )
                    
                if pidentity >= options.min_identity:

                    keep = False
                    if rep_coverage < mem_coverage - options.safety_coverage or \
                       rep_pid < mem_pid - options.safety_pide:
                        keep = True
                        reason = "covpid"
                    elif num_gaps >= options.max_gaps and \
                         mem_coverage > rep_coverage - options.safety_coverage:
                        keep = True
                        reason = "gaps"
                    elif mem_coverage >= rep_coverage - options.safety_coverage and \
                             100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage:
                        keep = True
                        reason = "memcov"

                    if keep:
                        options.stdlog.write( "# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %\
                              (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid) )
                    else:
                        eliminated_predictions[mem_id] = rep_id                
                        eliminated.append( (mem_id, "h") )
                        
                elif pidentity >= options.min_identity_non_genes and \
                         this_quality in options.quality_genes and \
                         mem_quality not in options.quality_genes:
                    if rep_coverage < mem_coverage - options.safety_coverage or \
                       rep_pid < mem_pid - options.safety_pide:
                        options.stdlog.write( "# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %\
                              (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid) )
                    else:
                        eliminated_predictions[mem_id] = rep_id                
                        eliminated.append( (mem_id, "l") )

    return eliminated
Ejemplo n.º 15
0
    def buildMali(self, query_nid, neighbours ):
        """build a multiple alignment from a set of neighbours.
        """
        # build multiple alignment
        mali = alignlib.makeMultipleAlignment()
        
        query_sequence = self.mFasta.getSequence( query_nid )

        mali.add( alignlib.makeAlignatum( query_sequence ) )

        qseq = alignlib.makeSequence( query_sequence )
        alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, 
                                                  -10, -2)

        nskipped = 0

        for n in neighbours[:self.mMaxNumNeighbours]:

            if n.mSbjctToken == query_nid: continue
            if n.mEvalue > self.mMaxEvalue: 
                nskipped += 1
                continue
            sequence = self.mFasta.getSequence( n.mSbjctToken )

            E.debug( "adding %s" % str(n) )

            map_query2sbjct = n.getAlignment()

            if map_query2sbjct == None:
                sseq = alignlib.makeSequence( sequence )
                qseq.useSegment( n.mQueryFrom, n.mQueryTo )
                sseq.useSegment( n.mSbjctFrom, n.mSbjctTo )
                map_query2sbjct = alignlib.makeAlignmentVector()
                alignator.align( map_query2sbjct, qseq, sseq )

            if map_query2sbjct.getLength() == 0:
                self.warn( "empty alignment: %s" % str( n ) )
                nskipped += 1
                continue

            if map_query2sbjct.getRowTo() > len(query_sequence):
                self.warn( "alignment out of bounds for query: %i>%i, line=%s" %\
                               (map_query2sbjct.getRowTo(), len(query_sequence), str(n)))
                nskipped += 1
                continue

            elif map_query2sbjct.getColTo() > len(sequence):
                self.warn( "alignment out of bounds for sbjct: %i>%i, line=%s" %\
                               (map_query2sbjct.getColTo(), len(sequence), str(n)))
                nskipped += 1
                continue

            try:
                mali.add( alignlib.makeAlignatum( sequence ),
                          map_query2sbjct,
                          mali_is_in_row = True, 
                          insert_gaps_mali = False,
                          insert_gaps_alignatum = True,
                          use_end_mali = True,
                          use_end_alignatum = False )
            except RuntimeError, msg:
                self.warn( "problem when building alignment for %s: msg=%s" % (str(n), msg))
                nskipped += 1
                continue
Ejemplo n.º 16
0
def ProcessRegion( predictions, region_id, region,
                   peptide_sequences = None,
                   filter_queries = {} ):
    """process a set of matches to a region.

    resolve region according to homology.
    """

    if options.loglevel >= 3:
        options.stdlog.write( "###################################################################\n" )
        options.stdlog.write( "# resolving %i predictions in region %s\n" % ( len(predictions), str(region)) )
        sys.stdout.flush()

    predictions.sort( lambda x,y: cmp(x.score, y.score))
    predictions.reverse()
        
    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, options.gop, options.gep )
    result = alignlib.makeAlignmentVector()

    cluster = []

    map_sequence2cluster = range(0,len(predictions))
    edges = []
    
    noutput, nskipped = 0, 0
    
    if peptide_sequences:
        for x in range(len(predictions)):
            if options.loglevel >= 5:
                options.stdlog.write( "# filtering from %i with prediction %i: %s\n" % (x, predictions[x].mPredictionId, predictions[x].mQueryToken) )
                sys.stdout.flush()
                
            if map_sequence2cluster[x] != x: continue
            
            region_id += 1
            edges = []
            
            if predictions[x].mQueryToken not in filter_queries:
                edges.append( predictions[x] )
            else:
                nskipped += 1
                
            for y in range(x+1,len(predictions)):
                
                if map_sequence2cluster[y] != y: continue

                if predictions[x].mQueryToken < predictions[y].mQueryToken:
                    key = "%s-%s" % (predictions[x].mQueryToken, predictions[y].mQueryToken)
                else:
                    key = "%s-%s" % (predictions[y].mQueryToken, predictions[x].mQueryToken)

                # check if predictions are overlapping on the genomic sequence
                if min(predictions[x].mSbjctGenomeTo,   predictions[y].mSbjctGenomeTo) - \
                   max(predictions[x].mSbjctGenomeFrom, predictions[y].mSbjctGenomeFrom) < 0:
                    if options.loglevel >= 4:
                        options.stdlog.write( "# alignment of predictions %i and %i: no overlap on genomic sequence, thus skipped\n" %\
                                                  (predictions[x].mPredictionId,
                                                   predictions[y].mPredictionId ) )
                        sys.stdout.flush()
                    continue

                if not global_alignments.has_key( key ):

                    seq1 = peptide_sequences[predictions[x].mQueryToken]
                    seq2 = peptide_sequences[predictions[y].mQueryToken]
                    result.clear()
                    s1 = alignlib.makeSequence( seq1 )
                    s2 = alignlib.makeSequence( seq2 )
                    alignator.align( result, s1, s2 )

                    c1 = 100 * (result.getRowTo() - result.getRowFrom()) / len(seq1)
                    c2 = 100 * (result.getColTo() - result.getColFrom()) / len(seq2)
                    min_cov = min(c1,c2)
                    max_cov = max(c1,c2)

                    identity = alignlib.calculatePercentIdentity( result, s1, s2 ) * 100
                    
                    # check if predictions overlap and they are homologous
                    if result.getScore() >= options.overlap_min_score and \
                       max_cov >= options.overlap_max_coverage and \
                       min_cov >= options.overlap_min_coverage and \
                       identity >= options.overlap_min_identity :
                        global_alignments[key] = True
                    else:
                        global_alignments[key] = False

                    if options.loglevel >= 4:
                        options.stdlog.write( "# alignment=%s score=%i pid=%5.2f c1=%i c2=%i min_cov=%i max_cov=%i homolog=%s\n" %\
                                                  (key,
                                                   result.getScore(),
                                                   identity,
                                                   c1,c2, min_cov, max_cov,
                                                   global_alignments[key]) )
                        sys.stdout.flush()
                        
                if global_alignments[key]:
                    map_sequence2cluster[y] = x
                    if predictions[y].mQueryToken not in filter_queries:                    
                        edges.append( predictions[y] )
                    else:
                        nskipped += 1
                        
            noutput += PrintEdges( region_id, region, edges )
    
    return region_id, noutput, nskipped
Ejemplo n.º 17
0
def _alignToProfile(infile, outfile, min_score=0):
    '''align sequences in *infile* against mali

    Only alignments with a score higher than *min_score* are accepted.

    Output multiple alignment in fasta format to *outfile* and a table
    in :file:`outfile.log`.
    '''

    mali = Mali.Mali()
    mali.readFromFile(open("../data/mouse.fasta"))
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP

    alignator = alignlib.makeAlignatorDPFull(alignment_mode, -5.0, -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # print profile

    build_mali = alignlib.makeMultAlignment()
    m = alignlib.makeAlignmentVector()
    m.addDiagonal(0, n, 0)
    build_mali.add(src_mali, m)

    outf = open(outfile, "w")
    outf_log = open(outfile + ".info", "w")
    outf_log.write(
        "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n"
    )

    sequences, aa = alignlib.StringVector(), alignlib.AlignandumVector()
    ids = []

    for pid in mali.getIdentifiers():
        sequences.append(re.sub("-", "", mali[pid]))
        ids.append(pid)

    # print str(alignlib.MultAlignmentFormatPlain( build_mali, sequences ))

    c = E.Counter()

    for s in FastaIterator.FastaIterator(open(infile)):

        E.debug("adding %s" % s.title)
        c.input += 1
        rsequence = Genomics.complement(s.sequence)
        seq = alignlib.makeSequence(s.sequence)
        rseq = alignlib.makeSequence(rsequence)

        alignator.align(map_seq2profile, seq, profile)
        alignator.align(map_rseq2profile, rseq, profile)

        if map_seq2profile.getScore() > map_rseq2profile.getScore():
            m, seq, sequence = map_seq2profile, seq, s.sequence
        else:
            m, seq, sequence = map_rseq2profile, rseq, rsequence

        if m.getLength() == 0:
            c.skipped += 1
            continue

        if m.getScore() < min_score:
            c.skipped += 1
            continue

        r = getParts(m)

        covered = 0
        for mm in r:
            build_mali.add(mm)
            sequences.append(sequence)
            ids.append(s.title)
            covered += mm.getLength() - mm.getNumGaps()

        mali_covered = m.getColTo() - m.getColFrom()

        outf_log.write("\t".join(
            map(str, (s.title, len(s.sequence), m.getRowFrom(), m.getRowTo(),
                      len(r), covered, "%5.2f" %
                      (100.0 * covered / len(s.sequence)), m.getScore(),
                      m.getColFrom(), m.getColTo(), mali_covered, "%5.2f" %
                      ((100.0 * mali_covered) / mali.getNumColumns())))) +
                       "\n")

        c.output += 1

    #build_mali.expand( aa )
    result = str(
        alignlib.MultAlignmentFormatPlain(build_mali, sequences,
                                          alignlib.UnalignedStacked))

    for pid, data in zip(ids, result.split("\n")):
        start, sequence, end = data.split("\t")
        outf.write(">%s/%i-%i\n%s\n" %
                   (pid, int(start) + 1, int(end), sequence))

    outf.close()
    outf_log.close()

    E.info("%s\n" % str(c))
Ejemplo n.º 18
0
        if options.query_start and options.sbjct_start:
            options.sbjct_start = max( options.query_start, options.sbjct_start )
    else:
        if not options.sbjct:
            print USAGE
            raise "please supply both a query and a sbjct."
    
    if options.alignment_mode == "compass":
        plib_query = ProfileLibraryCompass( options.query, "r" )
        plib_sbjct = ProfileLibraryCompass( options.sbjct, "r" )
    else:
        plib_query = ProfileLibrary( options.query, "r" )
        plib_sbjct = ProfileLibrary( options.sbjct, "r" )

    if options.alignment_mode == "iterative-profile":
        alignator1 = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -2.0 )
        alignator = alignlib.makeAlignatorIterative( alignator1, options.iterative_min_score )

    elif options.alignment_mode == "iterative-sequence":
        class AlignatorSequence:
            def __init__(self):
                self.mAlignator1 = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -2.0 )
                self.mAlignator = alignlib.makeAlignatorIterative( self.mAlignator1, options.iterative_min_score )

            def align(self, query, sbjct, map_query2sbjct):
                xrow = alignlib.makeSequence(query.asString())
                xcol = alignlib.makeSequence(sbjct.asString())
                self.mAlignator.align( xrow, xcol, map_query2sbjct)
                
        alignator = AlignatorSequence()
    elif options.alignment_mode == "compass":
Ejemplo n.º 19
0
def FilterConflicts( old_predictions, new_predictions, removed_predictions,
                     min_overlap, peptide_sequences):
    """remove conflicts.

    Remove overlapping entries between different queries.

    Only remove those sequences, which are alignable.

    If they are alignable, take the sequence with the highest score and highest coverage.
    (Take both, if score and coverage are not correlated.)
    """
    ##################################################################################################
    ## sort predictions by genomic region
    if isinstance( old_predictions, PredictionFile.PredictionFile):
        old_predictions.sort( ('mSbjctToken', 'mSbjctStrand', 'mSbjctGenomeFrom', 'mSbjctGenomeTo' ) )        
    else:
        old_predictions.sort( lambda x, y: cmp( (x.mSbjctToken, x.mSbjctStrand, x.mSbjctGenomeFrom, x.mSbjctGenomeTo),
                                                (y.mSbjctToken, y.mSbjctStrand, y.mSbjctGenomeFrom, y.mSbjctGenomeTo) ))

    ##################################################################################################
    ## filter predictions and resolve conflicts based on genomic overlap
    ## deleted segments are put in a temporary storage space. 
    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, param_gop, param_gep )
    result = alignlib.makeAlignmentVector()
    alignments = {}
    noverlaps = 0
    nredundants = 0

    nnew = 0
    last_prediction = None

    for this_prediction in old_predictions:
        try:
            this_query_peptide, this_query_status, this_query_gene, this_query_transcript = \
                                re.split("\s+", this_prediction.mQueryToken)
        except ValueError:
            this_query_gene = None

        if not last_prediction:
            last_prediction = this_prediction
            last_query_gene = this_query_gene
            continue
        
        overlap = min(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
                  max(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)
        union   = max(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
                  min(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)

        # resolve overlap between different genes
        if overlap > 0 and \
               (last_query_gene != this_query_gene or last_query_gene == None):

            noverlaps += 1
            relative_overlap = 100 * overlap / union

            # Start conflict resolution, if overlap is above threshold.
            # Keep higher scoring segment.
            #
            # Check if queries are homologous.
            if relative_overlap >= param_max_percent_overlap:
                
                if peptide_sequences:
                    if last_prediction.mQueryToken < this_prediction.mQueryToken:
                        key = "%s-%s" % (last_prediction.mQueryToken, this_prediction.mQueryToken)
                    else:
                        key = "%s-%s" % (this_prediction.mQueryToken, last_prediction.mQueryToken)

                    if not alignments.has_key( key ):
                        result.clear()
                        alignator.align( result,
                                         alignlib.makeSequence( peptide_sequences[this_prediction.mQueryToken]),
                                         alignlib.makeSequence( peptide_sequences[last_prediction.mQueryToken]) )
                        alignments[key] = result.getScore()
                        if result.getScore() >= param_min_score_overlap:
                            nredundants += 1
                            
                    if alignments[key] >= param_min_score_overlap:
                        is_overlap = 1
                    else:
                        is_overlap = 0
                else:
                    is_overlap = 1
            else:
                is_overlap = 0
        else:
            is_overlap = 0
            
        if is_overlap:
            # take best prediction. If difference is very small, set
            # difference to 0 (difference does not matter). In this case,
            # the first prediction is taken.
            d1 = last_prediction.mQueryCoverage - this_prediction.mQueryCoverage
            if float(abs(d1)) / float(last_prediction.mQueryCoverage) < param_conflicts_min_difference: d1 = 0
            d2 = last_prediction.score - this_prediction.score
            if float(abs(d2)) / float(this_prediction.score) < param_conflicts_min_difference: d2 = 0
            if d1 >= 0 and d2 >= 0:
                if param_loglevel >= 2:
                    print "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (last_prediction.mPredictionId,
                                                                                          last_prediction.mQueryToken,
                                                                                          last_prediction.mSbjctGenomeFrom,
                                                                                          overlap, relative_overlap,
                                                                                          str(this_prediction))
                if param_benchmarks:
                    if CheckBenchmark( this_prediction, last_prediction ):
                        print "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % ( overlap, relative_overlap,
                                                                                                  str(last_prediction))

                removed_predictions.append( this_prediction )
                continue
            elif d1 <= 0 and d2 <= 0:
                if param_loglevel >= 2:                    
                    print "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (this_prediction.mPredictionId,
                                                                                          this_prediction.mQueryToken,
                                                                                          this_prediction.mSbjctGenomeFrom,
                                                                                          overlap, relative_overlap,
                                                                                          str(last_prediction))
                if param_benchmarks:
                    if CheckBenchmark( last_prediction, this_prediction ):
                        print "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % ( overlap, relative_overlap,
                                                                                str(this_prediction))
                removed_predictions.append( last_prediction )                        
                last_prediction = this_prediction
                last_query_gene = this_query_gene
                continue
            else:
                if param_loglevel >= 2:
                    print "# CONFLICT: non-correlated score/coverage. Keeping both %i(%s-%i) (%5.2f/%i/%i) and %i(%s-%i) (%5.2f/%i/%i)" % \
                          (this_prediction.mPredictionId,
                           this_prediction.mQueryToken, this_prediction.mSbjctGenomeFrom,
                           this_prediction.score, this_prediction.mQueryCoverage,
                           this_prediction.mPercentIdentity,
                           last_prediction.mPredictionId,
                           last_prediction.mQueryToken, last_prediction.mSbjctGenomeFrom,                           
                           last_prediction.score, last_prediction.mQueryCoverage,
                           last_prediction.mPercentIdentity)

        new_predictions.append(last_prediction)
        nnew += 1
        last_query_gene = this_query_gene
        last_prediction = this_prediction

    new_predictions.append(last_prediction)
    nnew += 1

    if param_loglevel >= 1:
        print "# calculated %i alignments for %i potential conflicts (%i above threshold)" % \
              (len(alignments), noverlaps, nredundants)
    
    return nnew
Ejemplo n.º 20
0
 def __init__(self):
     self.mAlignator1 = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -2.0 )
     self.mAlignator = alignlib.makeAlignatorIterative( self.mAlignator1, options.iterative_min_score )
Ejemplo n.º 21
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                                    usage = globals()["__doc__"] )

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default]."  )

    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default]."  )

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices = ("global", "local" ),
                      help="alignment mode, global=nw, local=sw [default=%default]."  )

    parser.set_defaults(
        gop = -12.0,
        gep = -2.0,
        format= "fasta",
        mode = "local",
        )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )

    if len(args) != 2: raise ValueError("please supply two multiple alignments in FASTA format.")

    mali1 = Mali.Mali()
    mali2 = Mali.Mali()

    E.info( "read 2 multiple alignments" )

    mali1.readFromFile( IOTools.openFile( args[0], "r" ), format=options.format )
    mali2.readFromFile( IOTools.openFile( args[1], "r" ), format=options.format )

    cmali1 = Mali.convertMali2Alignlib( mali1 )
    cmali2 = Mali.convertMali2Alignlib( mali2 )

    if options.mode == "local":
        mode = alignlib.ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib.ALIGNMENT_GLOBAL
        
    alignator = alignlib.makeAlignatorDPFull( mode,
                                              options.gop, options.gep )

    alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.Protein20) )
    alignlib.setDefaultLogOddor( alignlib.makeLogOddorDirichlet( 0.3 ) )
    alignlib.setDefaultRegularizor( alignlib.makeRegularizorDirichletPrecomputed() )

    cprofile1 = alignlib.makeProfile( cmali1 )
    cprofile2 = alignlib.makeProfile( cmali2 )

    result = alignlib.makeAlignmentVector()

    alignator.align( result, cprofile1, cprofile2 )

    E.debug( "result=\n%s" % alignlib.AlignmentFormatEmissions( result) )

    cmali1.add( cmali2, result )

    outmali = Mali.convertAlignlib2Mali( cmali1,
                                         identifiers = mali1.getIdentifiers() + mali2.getIdentifiers() )
    
    outmali.writeToFile( options.stdout, format=options.format)

    ## write footer and output benchmark information.
    E.Stop()
Ejemplo n.º 22
0
def main():

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("view", "align", "pileup", "profile"),
                      help="method to perform [default=%default].")

    parser.add_option("--mode",
                      dest="mode",
                      type="choice",
                      choices=("global", "local"),
                      help="alignment mode [default=%default].")

    parser.add_option("--gop",
                      dest="gop",
                      type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep",
                      dest="gep",
                      type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    alignlib.getDefaultToolkit().setEncoder(
        alignlib.getEncoder(alignlib.Protein20))
    alignlib.getDefaultToolkit().setRegularizor(
        alignlib.makeRegularizorDirichletPrecomputed())
    alignlib.getDefaultToolkit().setLogOddor(
        alignlib.makeLogOddorDirichlet(0.3))
    alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    if options.method == "view":
        for nid in args:
            nid = int(args[0])

            neighbours = index.getNeighbours(nid)

            for n in neighbours:
                print str(n)

    elif options.method == "pileup":

        if "_" in args[0]:
            nid, start, end = AddaIO.toTuple(args[0])
        else:
            nid = int(args[0])
            start, end = None, None

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":

        if "_" in args[0]:
            nid, start, end = AddaIO.toTuple(args[0])
        else:
            nid = int(args[0])
            start, end = None, None

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start != None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":

        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        align = AddaProfiles.AddaProfiles(config, fasta=fasta)

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop,
                                                 options.gep)

        def _buildProfile(nid, start, end):
            neighbours = index.getNeighbours(nid)
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()

        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write( "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" %\
                                  (nid1, nid2,
                                   result.getScore(),
                                   result.getLength(),
                                   result.getNumGaps(),
                                   result.getRowFrom(), result.getRowTo(),
                                   result.getColFrom(), result.getColTo()))

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()