Esempio n. 1
0
    def startUp( self ):

        if self.isComplete(): return

        if self.mAppend:
            self.mProfileLibrary = ProfileLibrary.ProfileLibrary( self.mFilenameProfile + self.getSlice(),
                                                                  "a" )
            self.mContinueAt = self.mProfileLibrary.getLastInsertedKey()
            self.info("processing will continue after %s" % (str( self.mContinueAt ) ) )
        else:
            self.mProfileLibrary = ProfileLibrary.ProfileLibrary( self.mFilenameProfile + self.getSlice(),
                                                                  "w",
                                                                  force=self.mForce )


        # set default values
        self.mProfileLibrary.setLogOddor( alignlib.makeLogOddorDirichlet( self.mScaleFactor ) )
        self.mProfileLibrary.setRegularizor( alignlib.makeRegularizorDirichletPrecomputed() )
        self.mProfileLibrary.setWeightor( alignlib.makeWeightor() )
        alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.Protein20 ) )
Esempio n. 2
0
    def startUp(self):

        if self.isComplete(): return

        if self.mAppend:
            self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
                self.mFilenameProfile + self.getSlice(), "a")
            self.mContinueAt = self.mProfileLibrary.getLastInsertedKey()
            self.info("processing will continue after %s" %
                      (str(self.mContinueAt)))
        else:
            self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
                self.mFilenameProfile + self.getSlice(),
                "w",
                force=self.mForce)

        # set default values
        self.mProfileLibrary.setLogOddor(
            alignlib.makeLogOddorDirichlet(self.mScaleFactor))
        self.mProfileLibrary.setRegularizor(
            alignlib.makeRegularizorDirichletPrecomputed())
        self.mProfileLibrary.setWeightor(alignlib.makeWeightor())
        alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.Protein20))
Esempio n. 3
0
    def startUp( self ):

        if self.isComplete(): return

        ###############################################
        # create objects for algorithm 
        alignlib.getDefaultToolkit().setEncoder( alignlib.getEncoder( alignlib.Protein20 ) )
        self.mLogOddor    = alignlib.makeLogOddorDirichlet( self.mScaleFactor )
        self.mRegularizor = alignlib.makeRegularizorDirichletPrecomputed()
        self.mWeightor    = alignlib.makeWeightor()

        alignlib.getDefaultToolkit().setRegularizor( self.mRegularizor )
        alignlib.getDefaultToolkit().setLogOddor( self.mLogOddor )
        alignlib.getDefaultToolkit().setWeightor( self.mWeightor )


        if self.mUsePrebuiltProfiles:
            self.mProfileLibrary = ProfileLibrary.ProfileLibrary( self.mFilenameProfiles, "r" )
            self.mProfileLibrary.setWeightor( self.mWeightor )
            self.mProfileLibrary.setLogOddor( self.mLogOddor )
            self.mProfileLibrary.setRegularizor( self.mRegularizor )

        else:
            self.mProfileLibrary = None
            self.mIndexedNeighbours = cadda.IndexedNeighbours( self.mFilenameGraph, self.mFilenameIndex )

        self.mChecker = self.checkLinkZScore
        self.mHeader = ("qdomain",
                        "sdomain",
                        "weight",
                        "passed",
                        "qstart",
                        "qend",
                        "qali",
                        "sstart",
                        "send",
                        "sali",
                        "score",
                        "naligned",
                        "ngaps",
                        "zscore" )

        self.mAlignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, 
                                                        self.mGop,
                                                        self.mGep )

        # the cache to store alignandum objects
        self.mCache = {}        
        
        alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.Protein20 ) )

        ## initialize counters
        self.mNPassed, self.mNFailed, self.mNNotFound = 0, 0, 0

        self.mOutfile = self.openOutputStream( self.mFilenameAlignments )

        if self.mContinueAt == None:
            self.mOutfile.write( "\t".join( self.mHeader ) + "\n" ) 
            self.mOutfile.flush()

        self.mStartTime = time.time()
Esempio n. 4
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                                    usage = globals()["__doc__"] )

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default]."  )

    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default]."  )

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices = ("global", "local" ),
                      help="alignment mode, global=nw, local=sw [default=%default]."  )

    parser.set_defaults(
        gop = -12.0,
        gep = -2.0,
        format= "fasta",
        mode = "local",
        )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )

    if len(args) != 2: raise ValueError("please supply two multiple alignments in FASTA format.")

    mali1 = Mali.Mali()
    mali2 = Mali.Mali()

    E.info( "read 2 multiple alignments" )

    mali1.readFromFile( IOTools.openFile( args[0], "r" ), format=options.format )
    mali2.readFromFile( IOTools.openFile( args[1], "r" ), format=options.format )

    cmali1 = Mali.convertMali2Alignlib( mali1 )
    cmali2 = Mali.convertMali2Alignlib( mali2 )

    if options.mode == "local":
        mode = alignlib.ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib.ALIGNMENT_GLOBAL
        
    alignator = alignlib.makeAlignatorDPFull( mode,
                                              options.gop, options.gep )

    alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.Protein20) )
    alignlib.setDefaultLogOddor( alignlib.makeLogOddorDirichlet( 0.3 ) )
    alignlib.setDefaultRegularizor( alignlib.makeRegularizorDirichletPrecomputed() )

    cprofile1 = alignlib.makeProfile( cmali1 )
    cprofile2 = alignlib.makeProfile( cmali2 )

    result = alignlib.makeAlignmentVector()

    alignator.align( result, cprofile1, cprofile2 )

    E.debug( "result=\n%s" % alignlib.AlignmentFormatEmissions( result) )

    cmali1.add( cmali2, result )

    outmali = Mali.convertAlignlib2Mali( cmali1,
                                         identifiers = mali1.getIdentifiers() + mali2.getIdentifiers() )
    
    outmali.writeToFile( options.stdout, format=options.format)

    ## write footer and output benchmark information.
    E.Stop()
Esempio n. 5
0
def _alignToProfile(infile, outfile, min_score=0):
    '''align sequences in *infile* against mali

    Only alignments with a score higher than *min_score* are accepted.

    Output multiple alignment in fasta format to *outfile* and a table
    in :file:`outfile.log`.
    '''

    mali = Mali.Mali()
    mali.readFromFile(open("../data/mouse.fasta"))
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP

    alignator = alignlib.makeAlignatorDPFull(alignment_mode, -5.0, -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # print profile

    build_mali = alignlib.makeMultAlignment()
    m = alignlib.makeAlignmentVector()
    m.addDiagonal(0, n, 0)
    build_mali.add(src_mali, m)

    outf = open(outfile, "w")
    outf_log = open(outfile + ".info", "w")
    outf_log.write(
        "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n"
    )

    sequences, aa = alignlib.StringVector(), alignlib.AlignandumVector()
    ids = []

    for pid in mali.getIdentifiers():
        sequences.append(re.sub("-", "", mali[pid]))
        ids.append(pid)

    # print str(alignlib.MultAlignmentFormatPlain( build_mali, sequences ))

    c = E.Counter()

    for s in FastaIterator.FastaIterator(open(infile)):

        E.debug("adding %s" % s.title)
        c.input += 1
        rsequence = Genomics.complement(s.sequence)
        seq = alignlib.makeSequence(s.sequence)
        rseq = alignlib.makeSequence(rsequence)

        alignator.align(map_seq2profile, seq, profile)
        alignator.align(map_rseq2profile, rseq, profile)

        if map_seq2profile.getScore() > map_rseq2profile.getScore():
            m, seq, sequence = map_seq2profile, seq, s.sequence
        else:
            m, seq, sequence = map_rseq2profile, rseq, rsequence

        if m.getLength() == 0:
            c.skipped += 1
            continue

        if m.getScore() < min_score:
            c.skipped += 1
            continue

        r = getParts(m)

        covered = 0
        for mm in r:
            build_mali.add(mm)
            sequences.append(sequence)
            ids.append(s.title)
            covered += mm.getLength() - mm.getNumGaps()

        mali_covered = m.getColTo() - m.getColFrom()

        outf_log.write("\t".join(
            map(str, (s.title, len(s.sequence), m.getRowFrom(), m.getRowTo(),
                      len(r), covered, "%5.2f" %
                      (100.0 * covered / len(s.sequence)), m.getScore(),
                      m.getColFrom(), m.getColTo(), mali_covered, "%5.2f" %
                      ((100.0 * mali_covered) / mali.getNumColumns())))) +
                       "\n")

        c.output += 1

    #build_mali.expand( aa )
    result = str(
        alignlib.MultAlignmentFormatPlain(build_mali, sequences,
                                          alignlib.UnalignedStacked))

    for pid, data in zip(ids, result.split("\n")):
        start, sequence, end = data.split("\t")
        outf.write(">%s/%i-%i\n%s\n" %
                   (pid, int(start) + 1, int(end), sequence))

    outf.close()
    outf_log.close()

    E.info("%s\n" % str(c))
def _alignToProfile( infile, outfile, 
                     min_score = 0 ):
    '''align sequences in *infile* against mali

    Only alignments with a score higher than *min_score* are accepted.

    Output multiple alignment in fasta format to *outfile* and a table
    in :file:`outfile.log`.
    '''

    mali = Mali.Mali()
    mali.readFromFile( open("../data/mouse.fasta") )
    src_mali = Mali.convertMali2Alignlib( mali )
    
    E.debug( "read mali: %i sequences x %i columns" % (mali.getNumSequences(), mali.getNumColumns() ))

    # add pseudocounts
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns() 
    for x in "ACGT": 
        for y in range(0,2):
            profile_mali.addSequence( "%s%i" % (x,y), 0, n, x * n )


    profile_mali = Mali.convertMali2Alignlib( profile_mali )
    alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.DNA4 ) )
    alignlib.setDefaultLogOddor( alignlib.makeLogOddorUniform() )

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile( profile_mali )
    
    alignment_mode = alignlib.ALIGNMENT_WRAP

    alignator = alignlib.makeAlignatorDPFull( alignment_mode,
                                              -5.0,
                                              -0.5 )
    
    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # print profile

    build_mali = alignlib.makeMultAlignment()
    m = alignlib.makeAlignmentVector()
    m.addDiagonal( 0, n, 0 )
    build_mali.add( src_mali, m )

    outf = open( outfile, "w" )
    outf_log = open( outfile + ".info", "w" )
    outf_log.write( "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n" )

    sequences, aa = alignlib.StringVector(), alignlib.AlignandumVector()
    ids = []

    for pid in mali.getIdentifiers():
        sequences.append( re.sub( "-", "", mali[pid] ) )
        ids.append( pid )

    # print str(alignlib.MultAlignmentFormatPlain( build_mali, sequences ))

    c = E.Counter()

    for s in FastaIterator.FastaIterator( open(infile)):

        E.debug("adding %s" % s.title )
        c.input += 1
        rsequence = Genomics.complement(s.sequence)
        seq = alignlib.makeSequence( s.sequence )
        rseq = alignlib.makeSequence( rsequence )

        alignator.align( map_seq2profile, seq, profile )
        alignator.align( map_rseq2profile, rseq, profile )

        if map_seq2profile.getScore() > map_rseq2profile.getScore():
            m, seq, sequence = map_seq2profile, seq, s.sequence
        else:
            m, seq, sequence = map_rseq2profile, rseq, rsequence

        if m.getLength() == 0:
            c.skipped += 1
            continue

        if m.getScore() < min_score: 
            c.skipped += 1
            continue

        r = getParts( m )

        covered = 0
        for mm in r:
            build_mali.add( mm )
            sequences.append( sequence )
            ids.append( s.title )
            covered += mm.getLength() - mm.getNumGaps()

        mali_covered = m.getColTo() - m.getColFrom()

        outf_log.write( "\t".join( map(str, (
                        s.title,
                        len(s.sequence),
                        m.getRowFrom(),
                        m.getRowTo(),
                        len(r),
                        covered,
                        "%5.2f" % (100.0 * covered / len(s.sequence) ),
                        m.getScore(),
                        m.getColFrom(),
                        m.getColTo(),
                        mali_covered,
                        "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns())
                        ) ) ) + "\n" )

        c.output += 1

    #build_mali.expand( aa )
    result = str(alignlib.MultAlignmentFormatPlain( build_mali, 
                                                    sequences, 
                                                    alignlib.UnalignedStacked ))

    for pid, data in zip(ids, result.split("\n") ):
        start, sequence, end = data.split("\t")
        outf.write(">%s/%i-%i\n%s\n" % (pid, int(start)+1, int(end), sequence) )


    outf.close()
    outf_log.close()

    E.info( "%s\n" % str(c) )