Example #1
0
    def applyMethod( self, line ):
        """Re-test one link record and write it to the output file.

        ``line`` is a tab-separated record of a tested link.  Header
        lines are skipped.  Records already flagged as passed are copied
        through unchanged; all others are re-checked with
        ``self.mChecker`` and written with a '+'/'-' code plus alignment
        statistics.
        """


        # ignore header
        if line.startswith("passed"): return

        self.mInput += 1

        # drop the trailing newline and parse the fields into a TestedLink
        link = AddaIO.TestedLink._make( line[:-1].split("\t") )
        
        # emit a progress line every mReportStep records; "time per step"
        # is total elapsed time * mReportStep / records processed, i.e.
        # the average time per reporting interval
        if self.mInput % self.mReportStep == 0:
            t = time.time() 
            self.info( "iteration=%i, passed=%i, failed=%i, skipped=%i, notfound=%i, total time=%i, time per step=%f" %\
                           (self.mInput, self.mNPassed, self.mNFailed, self.mNSkipped, self.mNNotFound,
                            t - self.mStartTime,
                            float(self.mReportStep * ( t - self.mStartTime )) / self.mInput, 
                            ) )

        # records already flagged as passed are copied verbatim; the
        # check itself is skipped, so the record is counted as both
        # passed and skipped
        # NOTE(review): confirm the double increment is intended
        if link.passed == "+":
            self.mOutfile.write( line )
            self.mNPassed += 1
            self.mNSkipped += 1
            self.mOutput += 1
            return

        # unpack the query and sbjct domain strings into (nid, from, to)
        query_nid, query_from, query_to = AddaIO.toTuple( link.qdomain )
        sbjct_nid, sbjct_from, sbjct_to = AddaIO.toTuple( link.sdomain )

        self.debug( "checking link between %i (%i-%i) and %i (%i-%i)" %\
                    (query_nid, query_from, query_to,
                     sbjct_nid, sbjct_from, sbjct_to) )

        # re-run the check; returns the verdict, the alignment object
        # and any extra output columns
        passed, alignment, extra_info = self.mChecker( query_nid, query_from, query_to,
                                                       sbjct_nid, sbjct_from, sbjct_to)
        
        if passed: 
            code = "+"
            self.mNPassed += 1
        else:
            code = "-"
            self.mNFailed += 1

        # write the re-tested link together with alignment statistics
        # and the checker's extra columns
        self.mOutfile.write( "\t".join( ( link.qdomain,
                                          link.sdomain,
                                          link.weight,
                                          code,
                                          str(alignlib.AlignmentFormatEmissions( alignment )),
                                          str(alignment.getScore()), 
                                          str(alignment.getNumAligned()), 
                                          str(alignment.getNumGaps())) + extra_info ) + "\n" )                    
        self.mOutfile.flush()

        self.mOutput += 1
Example #2
0
    def startUp(self):
        """Open the output streams and load the id <-> nid maps."""
        # nothing to do if results are already present
        if self.isComplete():
            return

        self.mOutfile = self.openOutputStream(self.mFilenameDomains)
        self.mOutfileFamilies = self.openOutputStream(self.mFilenameFamilies)

        nids_stream = open(self.mFilenamesNids, "r")
        self.mMapId2Nid = AddaIO.readMapId2Nid(nids_stream)
        # invert the id -> nid map to obtain the nid -> id lookup
        self.mMapNid2Id = dict((nid, pid)
                               for pid, nid in self.mMapId2Nid.iteritems())
    def startUp(self):
        """Prepare the output stream and load the identifier maps."""
        if self.isComplete():
            return
        self.mOutfile = self.openOutputStream(self.mFilenameOutput)

        stream = open(self.mFilenamesNids, "r")
        self.mMapId2Nid = AddaIO.readMapId2Nid(stream)
        # build the reverse lookup: nid -> id
        self.mMapNid2Id = dict((value, key)
                               for key, value in self.mMapId2Nid.iteritems())
Example #4
0
    def merge(self):
        """merge runs from parallel computations.

        Collects all per-chunk profile libraries matching
        ``self.mFilenameProfile``, copies their profiles into a single
        library, calls ``applyMethod`` with an empty neighbours record
        for any nid that has no profile, and finally deletes the
        per-chunk files.

        returns True if merging was a success, i.e. no missing,
        duplicate or unknown nids were encountered.
        """
        if self.isComplete(): return

        infiles = glob.glob("%s*" % self.mFilenameProfile)
        # remove suffixes - each library is a pair of files sharing a stem
        infiles = list(
            set([x[:-4] for x in infiles if x != self.mFilenameProfile]))
        infiles.sort()

        found = set()
        ninput, noutput, nfound, nunknown, nduplicate = 0, 0, 0, 0, 0
        # nids of all known sequences
        tokens = set(self.mFasta.keys())

        self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
            self.mFilenameProfile, "w")

        for filename in infiles:
            infile = ProfileLibrary.ProfileLibrary(filename, "r")

            for nid, profile in infile.iteritems_sorted():
                ninput += 1

                if nid in found:
                    # bugfix: the counter is named nduplicate; the old
                    # name (nduplicates) raised a NameError here
                    # NOTE(review): duplicates are still added below -
                    # confirm that is intended
                    nduplicate += 1
                    self.warn("duplicate nid: %i in file %s" % (nid, filename))
                if nid not in tokens:
                    nunknown += 1
                    self.warn("unknown nid: %i in file %s" % (nid, filename))
                found.add(nid)
                nfound += 1
                self.mProfileLibrary.add(nid, profile)
                noutput += 1

        missing = tokens.difference(found)
        if len(missing) > 0:
            self.warn("the following nids were missing: %s" % str(missing))

        self.info("adding %i missing nids" % len(missing))

        # add a profile for each missing nid via an empty neighbours record
        for nid in missing:
            self.applyMethod(AddaIO.NeighboursRecord(nid, []))

        self.info( "merging: parts=%i, ninput=%i, noutput=%i, nfound=%i, nmissing=%i, nduplicate=%i, nunknown=%i" %\
                       (len(infiles), ninput, noutput, nfound, len(missing), nduplicate, nunknown ) )

        self.info("deleting %i parts" % len(infiles))
        for infile in infiles:
            # each part consists of a data file and an index file
            fn, fi = ProfileLibrary.getFileNames(infile)
            os.remove(fn)
            os.remove(fi)

        return len(missing) == 0 and nduplicate == 0 and nunknown == 0
Example #5
0
    def applyMethod(self):
        """Index the input graph on disk."""
        self.info( "indexing of %s started" % self.mFilenameInputGraph )

        # file holding the id -> nid mapping
        nids_filename = self.mConfig.get( "files", "output_nids", "adda.nids" )
        self.info( "loading map_id2nid from %s" % nids_filename )

        nids_file = open( nids_filename )
        map_id2nid = AddaIO.readMapId2Nid(
            nids_file,
            storage = self.mConfig.get( "files", "storage_nids", "memory" ) )
        nids_file.close()

        graph_stream = AddaIO.openStream( self.mFilenameInputGraph )

        # wrap the python iterator for consumption by the C indexer
        neighbours = cadda.PairsDBNeighboursIterator(
            self.mGraphIterator( graph_stream, map_id2nid, self.mLogger ),
            self.mLogger )
        cadda.indexGraph( neighbours,
                          len(map_id2nid),
                          self.mFilenameOutputGraph,
                          self.mFilenameOutputIndex,
                          self.mLogger )

        del map_id2nid
Example #6
0
    def applyMethod(self):
        """Index the input graph.

        Loads the id -> nid map, then hands the graph stream to the
        C-level indexer, which writes the graph and index files.
        """
        self.info("indexing of %s started" % self.mFilenameInputGraph)

        self.info("loading map_id2nid from %s" %
                  self.mConfig.get("files", "output_nids", "adda.nids"))
        infile = open(self.mConfig.get("files", "output_nids", "adda.nids"))
        storage = self.mConfig.get("files", "storage_nids", "memory")
        map_id2nid = AddaIO.readMapId2Nid(infile, storage=storage)
        infile.close()

        infile = AddaIO.openStream(self.mFilenameInputGraph)

        # the python-side iterator is wrapped for the C indexer
        iterator = self.mGraphIterator(infile, map_id2nid, self.mLogger)
        cadda.indexGraph(
            cadda.PairsDBNeighboursIterator(iterator, self.mLogger),
            len(map_id2nid),
            self.mFilenameOutputGraph,
            self.mFilenameOutputIndex,
            self.mLogger)

        del map_id2nid
Example #7
0
    def merge(self):
        """Merge several runs.

        All chunk files are simply concatenated into one graph file and
        the index is rebuilt over the result.
        """

        target = self.mFilenameOutputGraph

        if self.mNumChunks == 1:
            raise ValueError("merge called with only one chunk")

        if os.path.exists(target):
            raise ValueError("file %s already exists - no merging" % target)

        self.info("merging file %s from %i chunks" % (target, self.mNumChunks))

        # make sure every chunk has finished and its file is present
        parts = []
        for chunk in range(self.mNumChunks):
            part = target + self.getSlice(chunk)
            if not os.path.exists(part):
                self.info("file %s is not present - merging aborted" % part)
                return False
            parts.append(part)

        self.info("all files present")

        self.execute("cat %s > %s" % (" ".join(parts), target))

        self.info("rebuilding index")

        self.info("loading map_id2nid from %s" % self.mConfig.get("output", "nids", "adda.nids"))
        infile = open(self.mConfig.get("output", "nids", "adda.nids"))
        map_id2nid = AddaIO.readMapId2Nid(
            infile,
            storage=self.mConfig.get("adda", "storage_nids", "memory"))
        infile.close()

        self.info("starting the indexing")

        cadda.reindexGraph(
            len(map_id2nid),
            self.mFilenameOutputGraph,
            self.mFilenameOutputIndex,
            self.mLogger)

        return True
Example #8
0
    def merge(self):
        """Merge several runs.

        Simply concatenates all chunk files and rebuilds the index.
        """

        fname = self.mFilenameOutputGraph

        if self.mNumChunks == 1:
            raise ValueError("merge called with only one chunk")

        if os.path.exists(fname):
            raise ValueError("file %s already exists - no merging" % fname)

        self.info("merging file %s from %i chunks" % (fname, self.mNumChunks))

        # abort unless every chunk file is present
        chunks = []
        for index in range(self.mNumChunks):
            chunk_file = fname + self.getSlice(index)
            if not os.path.exists(chunk_file):
                self.info("file %s is not present - merging aborted" %
                          chunk_file)
                return False
            chunks.append(chunk_file)

        self.info("all files present")

        self.execute("cat %s > %s" % (" ".join(chunks), fname))

        self.info("rebuilding index")

        self.info("loading map_id2nid from %s" %
                  self.mConfig.get("files", "output_nids", "adda.nids"))
        infile = open(self.mConfig.get("files", "output_nids", "adda.nids"))
        map_id2nid = AddaIO.readMapId2Nid(
            infile,
            storage=self.mConfig.get("files", "storage_nids", "memory"))
        infile.close()

        cadda.reindexGraph(len(map_id2nid), self.mFilenameOutputGraph,
                           self.mFilenameOutputIndex, self.mLogger)

        return True
Example #9
0
    def finish(self):
        """Finish processing.

        Adds profile entries for sequences that only ever appeared in
        the sbjct field, then closes the profile library.
        """
        if not self.isSubset():
            count = 0
            # walk all known sequences in order; add an empty-neighbours
            # profile for any nid that has none yet
            for nid in sorted(self.mFasta.getContigSizes().keys()):
                if nid in self.mProfileLibrary:
                    continue
                self.applyMethod(AddaIO.NeighboursRecord(nid, []))
                count += 1

            self.mOutput += count
            self.info("added %i profiles for sequences without neighbours" %
                      count)

        self.mProfileLibrary.close()

        AddaModuleRecord.finish(self)
Example #10
0
    def applyMethod(self):
        """Copy sequences into an indexed fasta file, assigning nids.

        Sequences longer than ``mMaxSequenceLength`` are dropped, and
        duplicates (same hash id) are written only once.
        """
        self.mInput = 0
        self.mOutput = 0
        self.mRemoved = 0
        self.mDuplicates = 0

        # re-use the existing fasta file as input
        iterator = FastaIterator(AddaIO.openStream(self.mFilenameInputFasta))
        fasta = IndexedFasta.IndexedFasta(self.mFilenameOutputFasta, "w")

        outfile = self.openOutputStream(self.mFilenameNids)
        outfile.write("nid\tpid\thid\tlength\tsequence\n")

        next_nid = 1
        seen_hids = set()

        for record in iterator:
            self.mInput += 1

            # drop overly long sequences
            if len(record.sequence) > self.mMaxSequenceLength:
                self.mRemoved += 1
                continue

            # drop sequences whose hash id was seen before
            hid = self.getHID(record.sequence)
            if hid in seen_hids:
                self.mDuplicates += 1
                continue
            seen_hids.add(hid)

            outfile.write("%s\t%s\t%s\t%i\t%s\n" %
                          (next_nid, record.pid, hid,
                           len(record.sequence), record.sequence))
            fasta.addSequence(next_nid, record.sequence)
            next_nid += 1
            self.mOutput += 1

        fasta.close()
        outfile.close()
Example #11
0
    def applyMethod(self):
        """Write a non-redundant, length-filtered fasta file.

        Assigns consecutive nids while skipping sequences that are too
        long or whose hash id has already been seen.
        """
        self.mInput = 0
        self.mOutput = 0
        self.mRemoved = 0
        self.mDuplicates = 0

        # read from the existing fasta file
        reader = FastaIterator(AddaIO.openStream(self.mFilenameInputFasta))
        indexed = IndexedFasta.IndexedFasta(self.mFilenameOutputFasta, "w")

        table = self.openOutputStream(self.mFilenameNids)
        table.write("nid\tpid\thid\tlength\tsequence\n")

        current_nid = 1
        known = set()

        for entry in reader:
            self.mInput += 1

            # too long - drop
            if len(entry.sequence) > self.mMaxSequenceLength:
                self.mRemoved += 1
                continue

            # identical sequence seen before - drop
            hid = self.getHID(entry.sequence)
            if hid in known:
                self.mDuplicates += 1
                continue
            known.add(hid)

            table.write("%s\t%s\t%s\t%i\t%s\n" %
                        (current_nid, entry.pid, hid,
                         len(entry.sequence), entry.sequence))
            indexed.addSequence(current_nid, entry.sequence)
            current_nid += 1
            self.mOutput += 1

        indexed.close()
        table.close()
Example #12
0
    def applyMethod(self ):
        """Run the iterative domain partition optimisation.

        Reads the fitted exponential parameters from the fit file,
        configures the C-level optimiser, then iterates: each iteration
        computes an improvement, saves the current domain partition and
        updates the progress plots.  The loop stops once the absolute or
        relative improvement falls below its threshold, or after
        mMaxIterations iterations.
        """

        if self.isComplete(): return
        
        self.info( "setting parameters" )
                
        config = AddaIO.ConfigParser()

        # exponential parameters estimated by the fitting step
        config.read( self.mFilenameFit )                                
        self.mExponentialF = float( config.get( "optimise", "exponential_f" ) )   
        self.mExponentialE = float( config.get( "optimise", "exponential_e" ) )           

        # push all parameters down to the C optimiser
        cadda.setFilenameGraph( self.mFilenameGraph )
        cadda.setFilenameIndex( self.mFilenameIndex )
        cadda.setFilenameTransfers( self.mFilenameTransfers )
        cadda.setFilenameDomains( self.mFilenameDomains )        
        cadda.setLogLevel( self.mLogLevel )
        cadda.setReportStep( 1000 )
        cadda.setMaxIterations( self.mMaxIterations )
        cadda.setResolution( self.mResolution )
        cadda.setExponentialF( self.mExponentialF )
        cadda.setExponentialE( self.mExponentialE )
        
        self.info( "optimisation started" )
        
        cadda.dump_parameters()
        
        # a return value of 0 signals failure in the C layer
        retval = cadda.optimise_initialise()
        
        if retval == 0:
            self.warn( "initialisation failed" )
        else:
            self.info( "initialisation success" )        

        improvements = []
        # NOTE(review): the domain-count series is seeded with
        # mNSequences - presumably one initial domain per sequence;
        # confirm against the C optimiser
        domains = [ self.mNSequences ]
        
        for iteration in range( self.mMaxIterations ):
            
            self.info( "iteration %i: started" % iteration)

            t = time.time()

            improvement = cadda.optimise_iteration()
            # relative to the best improvement so far; defined as 1 for
            # the first iteration
            if improvements:
                rel_improvement = improvement / max(improvements)
            else:
                rel_improvement = 1
                  
            ndomains = cadda.optimise_get_num_partitions()
            
            self.info( "iteration %i: finished in %i seconds: improvement=%f, relative improvement=%f, ndomains=%i" %\
                       (iteration, 
                        time.time() - t,
                        improvement, 
                        rel_improvement, 
                        ndomains) )            

            # checkpoint the current partition after every iteration
            if cadda.optimise_save_partitions( self.mFilenameDomains ):
                self.info( "domains saved to %s" % self.mFilenameDomains)
            else:
                self.warn( "saving domains to %s failed" % self.mFilenameDomains)
                
            improvements.append( improvement )
            domains.append( ndomains )
                       
            # refresh all three progress plots
            self.plotProgress( improvements, 
                               self.mOutputFilenameProgressImprovement,
                               "progress: improvement" )
            self.plotProgress( domains, 
                               self.mOutputFilenameProgressDomains,
                                "progress: domains" )
            self.plotProgress( map( lambda x: float( x ) / self.mNSequences, domains), 
                               self.mOutputFilenameProgressDomainsPerSequence,
                               "progress: domains per sequence" )
            
            # convergence criteria: absolute, then relative improvement
            if improvement < self.mMinAbsImprovement:
                self.info( "optimisation stopped because absolute improvement less than %f" %\
                           (self.mMinAbsImprovement) )            
                break

            if rel_improvement < self.mMinRelImprovement:
                self.info( "optimisation stopped because relative improvement less than %f" %\
                           (self.mMinRelImprovement) )            
                break
        else:
            # for/else: reached only when the loop ran to completion
            # without a break
            self.info( "optimisation stopped because maximum iteration %i reached" %\
                       (self.mMaxIterations) )            
            
        retval = cadda.optimise_destroy()
        
        if retval == 0:
            self.warn( "destruction failed" )
        else:
            self.info( "destruction success" )        
Example #13
0
        usage=globals()["__doc__"],
    )

    parser.add_option("--config",
                      dest="filename_config",
                      type="string",
                      help="configuration file [default=%default].")

    parser.set_defaults(filename_config="adda.ini", )

    (options, args) = E.Start(parser)

    if len(args) == 0:
        raise ValueError("please supply one or more nids to test.")

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    filename_graph = config.get("files", "output_graph", "adda.graph")
    filename_index = config.get("files", "output_index", "adda.graph.index")
    filename_fasta = config.get("files", "output_fasta", "adda")

    fasta = IndexedFasta.IndexedFasta(filename_fasta)

    index = cadda.IndexedNeighbours(filename_graph, filename_index)

    config.set("files", "output_segments", "test.segments")

    module = AddaSegment(
        config=config,
        fasta=fasta,
Example #14
0
                      dest="method",
                      type="choice",
                      choices=("finish", "merge"),
                      help="method to test "
                      "[default=%default].")

    parser.set_defaults(
        filename_data=None,
        filename_overhang_hist=None,
        method="finish",
        filename_config="adda.ini",
    )

    (options, args) = E.Start(parser)

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    filename_graph = config.get("output", "output_graph", "adda.graph")
    filename_index = config.get("output", "output_index", "adda.graph.index")
    filename_fasta = config.get("output", "output_fasta", "adda")

    config.set("output", "output_fit", "test.fit")
    config.set("output", "output_fit_data", "test.fit.data")
    config.set("output", "output_fit_details", "test.fit.details")
    config.set("output", "output_fit_overhang", "test.fit.overhang")
    config.set("output", "output_fit_transfer", "test.fit.transfer")

    fasta = None

    module = AddaFit(