コード例 #1
0
ファイル: AddaModule.py プロジェクト: AndreasHeger/adda
    def merge(self, filenames = None):
        """Merge the per-chunk output files of a parallel run.

        filenames -- iterable of output file names to merge; when None,
            self.mFilenames is used.

        Files that are already complete are skipped. For every other
        file all chunk files are checked first (when more than one chunk
        is configured) and merging is aborted at the first incomplete one.

        Returns True if every file was complete or has been merged,
        False if an incomplete chunk file was found.
        """

        if filenames is None:
            filenames = self.mFilenames

        for f in filenames:

            # An already merged file must not end the whole merge: the
            # remaining files still have to be checked and merged.
            if SegmentedFile.isComplete( f ):
                continue

            self.info( "merging file %s from %i chunks" % (f, self.mNumChunks) )

            # check if all parts have finished and are present
            if self.mNumChunks > 1:
                for chunk in range( self.mNumChunks ):
                    fn = SegmentedFile.mangle( f, self.getSlice( chunk ) )
                    if not SegmentedFile.isComplete( fn ):
                        self.info( "file %s is incomplete - merging aborted" % fn )
                        return False

            self.info( "all files complete" )

            SegmentedFile.merge( f )

        return True
コード例 #2
0
 def create(self):
     """Create a temporary file and write it as two slices.

     Slice "00-10" receives the numbers 0-9 and slice "10-20" the
     numbers 10-19, one number per line. The name of the temporary
     file is stored in self.mFilename.
     """
     import os

     fd, self.mFilename = tempfile.mkstemp()
     # mkstemp() hands back an open OS-level descriptor; only the file
     # name is needed here, so close it instead of leaking it.
     os.close( fd )

     outfile = SegmentedFile.openfile( self.mFilename, "w", slice="00-10" )
     for x in range(10): outfile.write( "%i\n" % x )
     outfile.close()

     outfile = SegmentedFile.openfile( self.mFilename, "w", slice="10-20" )
     for x in range(10,20): outfile.write( "%i\n" % x )
     outfile.close()
コード例 #3
0
 def checkContents(self):
     """Create and merge the slices, then verify the merged file.

     Expects line 0 to be "#comment1", line 1 to be "header1",
     line 12 to be "#comment2" and the remaining lines to hold the
     numbers 0-19 in order.
     """
     self.create()
     self.assertEqual( SegmentedFile.merge( self.mFilename ), True )
     self.checkToken( self.mFilename )

     infile = SegmentedFile.openfile( self.mFilename, "r" )
     data = [ x for x in infile ]
     # close before asserting so the handle is released even on failure
     infile.close()

     self.assertEqual( data[1], "header1\n" )
     self.assertEqual( data[0], "#comment1\n" )
     self.assertEqual( data[12], "#comment2\n" )
     # list(range(...)) compares equal to a list on Python 2 and 3 alike
     self.assertEqual( [int(x) for x in data[2:12] + data[13:]], list(range( 20 )) )
コード例 #4
0
 def checkContents(self):
     """Create and merge the slices, then verify the merged file.

     Expects line 0 to be "#comment1", line 1 to be "header1",
     line 12 to be "#comment2" and the remaining lines to hold the
     numbers 0-19 in order.
     """
     self.create()
     self.assertEqual(SegmentedFile.merge(self.mFilename), True)
     self.checkToken(self.mFilename)

     infile = SegmentedFile.openfile(self.mFilename, "r")
     data = [x for x in infile]
     # close before asserting so the handle is released even on failure
     infile.close()

     self.assertEqual(data[1], "header1\n")
     self.assertEqual(data[0], "#comment1\n")
     self.assertEqual(data[12], "#comment2\n")
     # list(range(...)) compares equal to a list on Python 2 and 3 alike
     self.assertEqual([int(x) for x in data[2:12] + data[13:]],
                      list(range(20)))
コード例 #5
0
 def create(self):
     """Create a temporary file and write it as two slices.

     Slice "00-10" receives the numbers 0-9 and slice "10-20" the
     numbers 10-19, one number per line. The name of the temporary
     file is stored in self.mFilename.
     """
     import os

     fd, self.mFilename = tempfile.mkstemp()
     # mkstemp() hands back an open OS-level descriptor; only the file
     # name is needed here, so close it instead of leaking it.
     os.close(fd)

     outfile = SegmentedFile.openfile(self.mFilename, "w", slice="00-10")
     for x in range(10):
         outfile.write("%i\n" % x)
     outfile.close()

     outfile = SegmentedFile.openfile(self.mFilename, "w", slice="10-20")
     for x in range(10, 20):
         outfile.write("%i\n" % x)
     outfile.close()
コード例 #6
0
ファイル: AddaFit.py プロジェクト: AndreasHeger/adda
    def isComplete( self ):
        '''Report whether the output of this step is complete.

        True is returned when AddaModuleRecord.isComplete() says so or
        when the data file of the current slice is complete. Otherwise,
        if the unsliced data file is complete, the result of
        self.merge() is returned; in every other case the result is
        False.
        '''

        if AddaModuleRecord.isComplete( self ):
            return True

        # Data of this slice is complete: fit, transfer and overhang are
        # re-computed elsewhere, so the step itself counts as complete.
        sliced_data = SegmentedFile.mangle( self.mFilenameData, self.getSlice() )
        if SegmentedFile.isComplete( sliced_data ):
            return True

        if not SegmentedFile.isComplete( self.mFilenameData ):
            return False

        return self.merge()
コード例 #7
0
ファイル: AddaModule.py プロジェクト: AndreasHeger/adda
    def openOutputStream(self, filename, register = False ):
        """Open *filename* for output through SegmentedFile.openfile.

        The file is opened in append mode ("a") when self.mAppend is
        set and in write mode ("w") otherwise. The current slice
        (self.getSlice()), self.mForce and self.readPreviousData (as
        append_callback) are handed on to SegmentedFile.openfile.

        filename -- name of the output file.
        register -- accepted but not used by this method.

        Returns whatever SegmentedFile.openfile returns.
        """

        mode = "a" if self.mAppend else "w"

        message = "%s%s opening with mode %s" % (filename, self.getSlice(), mode )
        self.debug( message )

        options = dict( slice = self.getSlice(),
                        force = self.mForce,
                        append_callback = self.readPreviousData )
        return SegmentedFile.openfile( filename, mode, **options )
コード例 #8
0
    def outputSummaryAlignments(self):
        """Summarise the alignments file and write the counts to self.mOutfile.

        Lines starting with "#" or "passed" are skipped. Every other
        line must hold 14 tab-separated columns; a line counts as
        accepted when its first column is "+". The nid of a domain is
        the part of its name before the first underscore.

        Returns a dict with the number of distinct nids ('nids') and
        distinct domain names ('domains') seen as query or sbjct.
        """

        total = accepted = 0
        seen_nids = set()
        seen_domains = set()

        infile = SegmentedFile.openfile(self.mFilenameAlignments, "r")
        for line in infile:
            if line[0] == "#" or line.startswith("passed"):
                continue

            total += 1
            fields = line[:-1].split("\t")
            # unpacking enforces the expected number of columns
            (code, query, sbjct, estimate,
             qstart, qend, qali, sstart, send, sali,
             score, naligned, ngaps, zscore) = fields

            for domain in (query, sbjct):
                seen_nids.add(domain.split("_")[0])
                seen_domains.add(domain)

            if code == "+":
                accepted += 1
        infile.close()

        out = self.mOutfile
        out.write(">%s\n" % self.mFilenameAlignments)
        out.write("ntotal\t%i\n" % total)
        out.write("naccepted\t%i\n" % accepted)
        out.write("nrejected\t%i\n" % (total - accepted))

        return {'nids': len(seen_nids), 'domains': len(seen_domains)}
コード例 #9
0
ファイル: AddaFit.py プロジェクト: ProteinsWebTeam/Pfam
    def merge(self, filenames=None):
        """Merge the results of parallel runs and finish the fit.

        Returns True immediately when the fit file is already complete.
        Otherwise chunk files ("<name>.0*") of the transfer, overhang
        and fit outputs are deleted, the details file (only if chunk
        files of it exist) and the data file are merged through
        AddaModuleRecord.merge, the merged data is read back and
        self.finish() is called.

        filenames -- not used by this implementation.

        Returns False as soon as one of the merges fails, else True.
        """

        if SegmentedFile.isComplete(self.mFilenameFit):
            return True

        # remove unwanted results
        stale_prefixes = (self.mFilenameTransfer,
                          self.mFilenameOverhang,
                          self.mFilenameFit)
        for prefix in stale_prefixes:
            for stale in glob.glob("%s.0*" % prefix):
                os.remove(stale)

        # merge the details file only if chunk files of it are present
        if glob.glob("%s.0*" % self.mFilenameDetails):
            details_merged = AddaModuleRecord.merge(
                self, (self.mFilenameDetails, ))
            if not details_merged:
                return False

        data_merged = AddaModuleRecord.merge(self, (self.mFilenameData, ))
        if not data_merged:
            return False

        self.mNumChunks = 1
        self.readPreviousData(self.mFilenameData)
        self.finish()

        return True
コード例 #10
0
    def outputSummaryResult(self):
        """Summarise the result file and write the counts to self.mOutfile.

        Lines starting with "#" or "nid" are skipped; every other line
        must hold the four tab-separated columns nid, start, end and
        family. The nid count is also written as a percentage of
        self.mNNids.

        Returns a dict with the number of distinct nids ('nids'), the
        number of domain lines ('domains') and the number of distinct
        families ('families').
        """

        domain_count = 0
        seen_nids = set()
        seen_families = set()

        infile = SegmentedFile.openfile(self.mFilenameResult, "r")
        for line in infile:
            if line[0] == "#" or line.startswith("nid"):
                continue

            domain_count += 1
            nid, start, end, family = line[:-1].split("\t")
            seen_nids.add(nid)
            seen_families.add(family)
        infile.close()

        nid_count = len(seen_nids)
        family_count = len(seen_families)

        out = self.mOutfile
        out.write(">%s\n" % self.mFilenameResult)
        out.write("ndomains\t%i\n" % domain_count)
        out.write("nfamilies\t%i\n" % family_count)
        out.write("nnids\t%i\t%5.2f\n" %
                  (nid_count, 100.0 * nid_count / self.mNNids))

        return {
            'nids': nid_count,
            'domains': domain_count,
            'families': family_count
        }
コード例 #11
0
ファイル: AddaSummary.py プロジェクト: AndreasHeger/adda
    def outputSummaryGraph( self ):
        """Return the summary of the graph file - currently disabled.

        The graph summary is switched off: self.mFilenameGraph is not
        read, nothing is written to self.mOutfile and an empty
        dictionary is returned.
        """

        return {}
コード例 #12
0
ファイル: AddaFit.py プロジェクト: ProteinsWebTeam/Pfam
    def isComplete(self):
        '''Report whether the output of this step is complete.

        True is returned when AddaModuleRecord.isComplete() says so or
        when the data file of the current slice is complete. Otherwise,
        if the unsliced data file is complete, the result of
        self.merge() is returned; in every other case the result is
        False.
        '''

        if AddaModuleRecord.isComplete(self):
            return True

        # Data of this slice is complete: fit, transfer and overhang are
        # re-computed elsewhere, so the step itself counts as complete.
        sliced_data = SegmentedFile.mangle(self.mFilenameData,
                                           self.getSlice())
        if SegmentedFile.isComplete(sliced_data):
            return True

        if not SegmentedFile.isComplete(self.mFilenameData):
            return False

        return self.merge()
コード例 #13
0
 def checkContents(self):
     """Verify that the merged file holds the numbers 0-19 in order.

     The file is checked with self.checkToken() and then read with
     has_header set to self.mHasHeader.
     """
     self.checkToken(self.mFilename)
     infile = SegmentedFile.openfile(self.mFilename,
                                     "r",
                                     has_header=self.mHasHeader)
     data = [int(x) for x in infile]
     # close before asserting so the handle is released even on failure
     infile.close()
     # list(range(...)) compares equal to a list on Python 2 and 3 alike
     self.assertEqual(data, list(range(20)))
コード例 #14
0
ファイル: AddaSummary.py プロジェクト: AndreasHeger/adda
    def outputSummaryMst( self ):
        """Summarise the mst file and write the counts to self.mOutfile.

        Lines starting with "#" or "nid" are skipped. The first two
        tab-separated columns of every other line are taken as query
        and sbjct domain; the nid of a domain is the part of its name
        before the first underscore. The nid count is also written as a
        percentage of self.mNNids.

        Returns a dict with the number of distinct nids ('nids') and
        distinct domain names ('domains').
        """

        link_count = 0
        seen_nids = set()
        seen_domains = set()

        infile = SegmentedFile.openfile( self.mFilenameMst, "r" )
        for line in infile:
            if line[0] == "#" or line.startswith( "nid" ):
                continue

            link_count += 1
            query, sbjct = line[:-1].split("\t")[:2]
            for domain in (query, sbjct):
                seen_nids.add( domain.split("_")[0] )
                seen_domains.add( domain )
        infile.close()

        nid_count = len(seen_nids)
        domain_count = len(seen_domains)

        out = self.mOutfile
        out.write( ">%s\n" % self.mFilenameMst )
        out.write( "nlinks\t%i\n" % link_count )
        out.write( "ndomains\t%i\n" % domain_count )
        out.write( "nnids\t%i\t%5.2f\n" % (nid_count, 100.0 * nid_count / self.mNNids ) )

        return { 'nids' : nid_count, 'domains' : domain_count }
コード例 #15
0
    def outputSummaryGraph(self):
        """Return the summary of the graph file - currently disabled.

        The graph summary is switched off: self.mFilenameGraph is not
        read, nothing is written to self.mOutfile and an empty
        dictionary is returned.
        """

        return {}
コード例 #16
0
    def outputSummaryMst(self):
        """Summarise the mst file and write the counts to self.mOutfile.

        Lines starting with "#" or "nid" are skipped. The first two
        tab-separated columns of every other line are taken as query
        and sbjct domain; the nid of a domain is the part of its name
        before the first underscore. The nid count is also written as a
        percentage of self.mNNids.

        Returns a dict with the number of distinct nids ('nids') and
        distinct domain names ('domains').
        """

        link_count = 0
        seen_nids = set()
        seen_domains = set()

        infile = SegmentedFile.openfile(self.mFilenameMst, "r")
        for line in infile:
            if line[0] == "#" or line.startswith("nid"):
                continue

            link_count += 1
            query, sbjct = line[:-1].split("\t")[:2]
            for domain in (query, sbjct):
                seen_nids.add(domain.split("_")[0])
                seen_domains.add(domain)
        infile.close()

        nid_count = len(seen_nids)
        domain_count = len(seen_domains)

        out = self.mOutfile
        out.write(">%s\n" % self.mFilenameMst)
        out.write("nlinks\t%i\n" % link_count)
        out.write("ndomains\t%i\n" % domain_count)
        out.write("nnids\t%i\t%5.2f\n" %
                  (nid_count, 100.0 * nid_count / self.mNNids))

        return {'nids': nid_count, 'domains': domain_count}
コード例 #17
0
ファイル: AddaSegment.py プロジェクト: AndreasHeger/adda
    def validate(self):
        """Check the nids in the segments file against self.mFasta.

        The text before the first tab of every line of
        self.mFilenameSegments is taken as nid. Each run of consecutive
        lines with the same nid is counted once; a nid that starts a
        second run is reported as duplicate, a nid that is not among
        self.mFasta.getTokens() as unknown, and tokens that never
        appear as missing.

        Returns True if no nid is missing, duplicated or unknown.
        Raises ValueError for a line that contains no tab.
        """

        infile = SegmentedFile.fileopen( self.mFilenameSegments )

        # NOTE(review): nids are compared in the form read from the file
        # (strings) - confirm that getTokens() yields the same type.
        tokens = set( self.mFasta.getTokens() )

        last_nid = None
        found = set()
        ninput, noutput = 0, 0
        nfound, nunknown, nduplicate = 0, 0, 0
        for line in infile:
            ninput += 1
            nid = line[:line.index("\t")]
            if nid != last_nid:
                if nid in found:
                    nduplicate += 1
                    self.warn( "duplicate nid: %s in file %s" % (nid, self.mFilenameSegments) )
                if nid not in tokens:
                    nunknown += 1
                    self.warn( "unknown nid: %s in file %s" % (nid, self.mFilenameSegments) )
                found.add(nid)
                nfound += 1
                last_nid = nid
            noutput += 1

        missing = tokens.difference( found )
        if len(missing) > 0:
            self.warn( "the following nids were missing: %s" % str(missing) )

        self.info( "merging: ninput=%i, noutput=%i, nfound=%i, nmissing=%i, nduplicate=%i, nunknown=%i" %\
                       (ninput, noutput, nfound, len(missing), nduplicate, nunknown ) )

        return len(missing) == 0 and nduplicate == 0 and nunknown == 0
コード例 #18
0
    def validate(self):
        """Check the nids in the graph file against self.mFasta.

        Note: duplicated code with AddaSegments - can be merged.

        The text before the first tab of every line of
        self.mFilenameGraph is taken as nid. Each run of consecutive
        lines with the same nid is counted once; a nid that starts a
        second run is reported as duplicate, a nid that is not among
        self.mFasta.getTokens() as unknown, and tokens that never
        appear as missing.

        Returns True if no nid is missing, duplicated or unknown.
        Raises ValueError for a line that contains no tab.
        """
        # NOTE(review): the result is not used here; the call is kept in
        # case getPartialResults() has side effects - confirm and drop.
        self.getPartialResults()

        # NOTE(review): nids are compared in the form read from the file
        # (strings) - confirm that getTokens() yields the same type.
        tokens = set( self.mFasta.getTokens() )

        last_nid = None
        found = set()
        ninput, noutput = 0, 0
        nfound, nunknown, nduplicate = 0, 0, 0
        infile = SegmentedFile.fileopen( self.mFilenameGraph )
        for line in infile:
            ninput += 1
            nid = line[:line.index("\t")]
            if nid != last_nid:
                if nid in found:
                    nduplicate += 1
                    self.warn( "duplicate nid: %s in file %s" % (nid, self.mFilenameGraph) )
                if nid not in tokens:
                    nunknown += 1
                    self.warn( "unknown nid: %s in file %s" % (nid, self.mFilenameGraph) )
                found.add(nid)
                nfound += 1
                last_nid = nid
            noutput += 1

        missing = tokens.difference( found )
        if len(missing) > 0:
            self.warn( "the following nids were missing: %s" % str(missing) )

        self.info( "merging: ninput=%i, noutput=%i, nfound=%i, nmissing=%i, nduplicate=%i, nunknown=%i" %\
                       (ninput, noutput, nfound, len(missing), nduplicate, nunknown ) )

        return len(missing) == 0 and nduplicate == 0 and nunknown == 0
コード例 #19
0
ファイル: AddaSummary.py プロジェクト: AndreasHeger/adda
    def outputSummaryAlignments( self ):
        """Summarise the alignments file and write the counts to self.mOutfile.

        Lines starting with "#" or "passed" are skipped. Every other
        line must hold 14 tab-separated columns; a line counts as
        accepted when its first column is "+". The nid of a domain is
        the part of its name before the first underscore.

        Returns a dict with the number of distinct nids ('nids') and
        distinct domain names ('domains') seen as query or sbjct.
        """

        total = accepted = 0
        seen_nids = set()
        seen_domains = set()

        infile = SegmentedFile.openfile( self.mFilenameAlignments, "r" )
        for line in infile:
            if line[0] == "#" or line.startswith( "passed" ):
                continue

            total += 1
            fields = line[:-1].split("\t")
            # unpacking enforces the expected number of columns
            (code, query, sbjct, estimate,
             qstart, qend, qali, sstart, send, sali,
             score, naligned, ngaps, zscore) = fields

            for domain in (query, sbjct):
                seen_nids.add( domain.split("_")[0] )
                seen_domains.add( domain )

            if code == "+":
                accepted += 1
        infile.close()

        out = self.mOutfile
        out.write( ">%s\n" % self.mFilenameAlignments )
        out.write( "ntotal\t%i\n" % total )
        out.write( "naccepted\t%i\n" % accepted )
        out.write( "nrejected\t%i\n" % (total - accepted) )

        return { 'nids' : len(seen_nids), 'domains' : len(seen_domains) }
コード例 #20
0
ファイル: AddaSegment.py プロジェクト: ProteinsWebTeam/Pfam
    def validate(self):
        """Check the nids in the segments file against self.mFasta.

        The text before the first tab of every line of
        self.mFilenameSegments is taken as nid. Each run of consecutive
        lines with the same nid is counted once; a nid that starts a
        second run is reported as duplicate, a nid that is not among
        self.mFasta.getTokens() as unknown, and tokens that never
        appear as missing.

        Returns True if no nid is missing, duplicated or unknown.
        Raises ValueError for a line that contains no tab.
        """

        infile = SegmentedFile.fileopen(self.mFilenameSegments)

        # NOTE(review): nids are compared in the form read from the file
        # (strings) - confirm that getTokens() yields the same type.
        tokens = set(self.mFasta.getTokens())

        last_nid = None
        found = set()
        ninput, noutput = 0, 0
        nfound, nunknown, nduplicate = 0, 0, 0
        for line in infile:
            ninput += 1
            nid = line[:line.index("\t")]
            if nid != last_nid:
                if nid in found:
                    nduplicate += 1
                    self.warn("duplicate nid: %s in file %s" %
                              (nid, self.mFilenameSegments))
                if nid not in tokens:
                    nunknown += 1
                    self.warn("unknown nid: %s in file %s" %
                              (nid, self.mFilenameSegments))
                found.add(nid)
                nfound += 1
                last_nid = nid
            noutput += 1

        missing = tokens.difference(found)
        if len(missing) > 0:
            self.warn("the following nids were missing: %s" % str(missing))

        self.info( "merging: ninput=%i, noutput=%i, nfound=%i, nmissing=%i, nduplicate=%i, nunknown=%i" %\
                       (ninput, noutput, nfound, len(missing), nduplicate, nunknown ) )

        return len(missing) == 0 and nduplicate == 0 and nunknown == 0
コード例 #21
0
    def getComponents(self):
        '''Return the components computed from the links in the input file.

        The first two tab-separated columns of every line of
        self.mFilenameInput that does not start with "#" are added as a
        pair to a Components.SComponents instance.

        Returns the result of that instance's getComponents().
        '''

        componentor = Components.SComponents()

        infile = SegmentedFile.openfile(self.mFilenameInput, "r")

        ninput = 0
        for line in infile:
            if line[0] == "#": continue

            qdomain, sdomain = line[:-1].split("\t")[:2]
            componentor.add(qdomain, sdomain)
            ninput += 1

        # release the input file once all links have been read
        infile.close()

        self.info("computing components with %i links" % ninput)

        return componentor.getComponents()
コード例 #22
0
ファイル: AddaComponents.py プロジェクト: Rfam/rfam-website
    def getComponents( self ):
        '''Return the components computed from the links in the input file.

        The first two tab-separated columns of every line of
        self.mFilenameInput that does not start with "#" are added as a
        pair to a Components.SComponents instance.

        Returns the result of that instance's getComponents().
        '''

        componentor = Components.SComponents()

        infile = SegmentedFile.openfile( self.mFilenameInput, "r" )

        ninput = 0
        for line in infile:
            if line[0] == "#": continue

            qdomain, sdomain = line[:-1].split("\t")[:2]
            componentor.add( qdomain, sdomain )
            ninput += 1

        # release the input file once all links have been read
        infile.close()

        self.info( "computing components with %i links" % ninput)

        return componentor.getComponents()
コード例 #23
0
ファイル: AddaFit.py プロジェクト: AndreasHeger/adda
    def merge(self, filenames = None ):
        """Merge the results of parallel runs and finish the fit.

        Returns True immediately when the fit file is already complete.
        Otherwise chunk files ("<name>.0*") of the transfer, overhang
        and fit outputs are deleted, the details file (only if chunk
        files of it exist) and the data file are merged through
        AddaModuleRecord.merge, the merged data is read back and
        self.finish() is called.

        filenames -- not used by this implementation.

        Returns False as soon as one of the merges fails, else True.
        """

        if SegmentedFile.isComplete( self.mFilenameFit ):
            return True

        # remove unwanted results
        stale_prefixes = ( self.mFilenameTransfer,
                           self.mFilenameOverhang,
                           self.mFilenameFit )
        for prefix in stale_prefixes:
            for stale in glob.glob( "%s.0*" % prefix ):
                os.remove( stale )

        # merge the details file only if chunk files of it are present
        if glob.glob( "%s.0*" % self.mFilenameDetails ):
            details_merged = AddaModuleRecord.merge( self, (self.mFilenameDetails, ) )
            if not details_merged:
                return False

        data_merged = AddaModuleRecord.merge( self, (self.mFilenameData, ) )
        if not data_merged:
            return False

        self.mNumChunks = 1
        self.readPreviousData( self.mFilenameData )
        self.finish()

        return True
コード例 #24
0
ファイル: AddaSummary.py プロジェクト: AndreasHeger/adda
    def outputSummaryNids( self ):
        """Collect the nids of the nids file and write their count.

        Lines starting with "#" or "nid" are skipped; every other line
        must hold the five tab-separated columns nid, pid, hid, length
        and sequence. The set of nids is stored in self.mNids and its
        size in self.mNNids.

        The count is written to self.mOutfile together with its
        percentage of self.mNNids (0.00 when no nid was read).

        Returns a dict with the number of distinct nids ('nids').
        """

        infile = SegmentedFile.openfile( self.mFilenameNids, "r" )

        nids = set()
        for line in infile:
            if line[0] == "#": continue
            if line.startswith( "nid"): continue

            # unpacking enforces the expected number of columns
            nid, pid, hid, length, sequence = line[:-1].split("\t")
            nids.add(nid)

        infile.close()

        self.mNids = nids
        self.mNNids = len(self.mNids)

        # Percentage as in the other summaries (100.0 * n / total); the
        # guard avoids a ZeroDivisionError for a file without nids.
        if self.mNNids:
            percent = 100.0 * len(nids) / self.mNNids
        else:
            percent = 0.0

        self.mOutfile.write( ">%s\n" % self.mFilenameNids )
        self.mOutfile.write( "nnids\t%i\t%5.2f\n" % (len(nids), percent ) )

        return { 'nids' : len(nids) }
コード例 #25
0
ファイル: AddaSummary.py プロジェクト: AndreasHeger/adda
    def outputSummarySegments( self ):
        """Summarise the segments file and write the counts to self.mOutfile.

        Lines starting with "#" or "nid" are skipped; every other line
        must hold the six tab-separated columns nid, node, parent,
        level, start and end. The nid count is also written as a
        percentage of self.mNNids.

        Returns a dict with the number of distinct nids ('nids') and
        the number of data lines ('domains').
        """

        domain_count = 0
        seen_nids = set()

        infile = SegmentedFile.openfile( self.mFilenameSegments, "r" )
        for line in infile:
            if line[0] == "#" or line.startswith( "nid" ):
                continue

            domain_count += 1
            # unpacking enforces the expected number of columns
            nid, node, parent, level, start, end = line[:-1].split("\t")
            seen_nids.add( nid )
        infile.close()

        nid_count = len(seen_nids)

        out = self.mOutfile
        out.write( ">%s\n" % self.mFilenameSegments )
        out.write( "ndomains\t%i\n" % domain_count )
        out.write( "nnids\t%i\t%5.2f\n" % (nid_count, 100.0 * nid_count / self.mNNids ) )

        return { 'nids' : nid_count, 'domains' : domain_count }
コード例 #26
0
    def outputSummarySegments(self):
        """Summarise the segments file and write the counts to self.mOutfile.

        Lines starting with "#" or "nid" are skipped; every other line
        must hold the six tab-separated columns nid, node, parent,
        level, start and end. The nid count is also written as a
        percentage of self.mNNids.

        Returns a dict with the number of distinct nids ('nids') and
        the number of data lines ('domains').
        """

        domain_count = 0
        seen_nids = set()

        infile = SegmentedFile.openfile(self.mFilenameSegments, "r")
        for line in infile:
            if line[0] == "#" or line.startswith("nid"):
                continue

            domain_count += 1
            # unpacking enforces the expected number of columns
            nid, node, parent, level, start, end = line[:-1].split("\t")
            seen_nids.add(nid)
        infile.close()

        nid_count = len(seen_nids)

        out = self.mOutfile
        out.write(">%s\n" % self.mFilenameSegments)
        out.write("ndomains\t%i\n" % domain_count)
        out.write("nnids\t%i\t%5.2f\n" %
                  (nid_count, 100.0 * nid_count / self.mNNids))

        return {'nids': nid_count, 'domains': domain_count}
コード例 #27
0
    def outputSummaryNids(self):
        """Collect the nids of the nids file and write their count.

        Lines starting with "#" or "nid" are skipped; every other line
        must hold the five tab-separated columns nid, pid, hid, length
        and sequence. The set of nids is stored in self.mNids and its
        size in self.mNNids.

        The count is written to self.mOutfile together with its
        percentage of self.mNNids (0.00 when no nid was read).

        Returns a dict with the number of distinct nids ('nids').
        """

        infile = SegmentedFile.openfile(self.mFilenameNids, "r")

        nids = set()
        for line in infile:
            if line[0] == "#": continue
            if line.startswith("nid"): continue

            # unpacking enforces the expected number of columns
            nid, pid, hid, length, sequence = line[:-1].split("\t")
            nids.add(nid)

        infile.close()

        self.mNids = nids
        self.mNNids = len(self.mNids)

        # Percentage as in the other summaries (100.0 * n / total); the
        # guard avoids a ZeroDivisionError for a file without nids.
        if self.mNNids:
            percent = 100.0 * len(nids) / self.mNNids
        else:
            percent = 0.0

        self.mOutfile.write(">%s\n" % self.mFilenameNids)
        self.mOutfile.write("nnids\t%i\t%5.2f\n" % (len(nids), percent))

        return {'nids': len(nids)}
コード例 #28
0
ファイル: AddaSummary.py プロジェクト: AndreasHeger/adda
    def outputSummaryClusters( self ):
        """Summarise the clusters file and write the counts to self.mOutfile.

        Lines starting with "#" or "nid" are skipped; every other line
        must hold the four tab-separated columns nid, start, end and
        family. The nid count is also written as a percentage of
        self.mNNids.

        Returns a dict with the number of distinct nids ('nids'), the
        number of domain lines ('domains') and the number of distinct
        families ('families').
        """

        domain_count = 0
        seen_nids = set()
        seen_families = set()

        infile = SegmentedFile.openfile( self.mFilenameClusters, "r" )
        for line in infile:
            if line[0] == "#" or line.startswith( "nid" ):
                continue

            domain_count += 1
            nid, start, end, family = line[:-1].split("\t")
            seen_nids.add( nid )
            seen_families.add( family )
        infile.close()

        nid_count = len(seen_nids)
        family_count = len(seen_families)

        out = self.mOutfile
        out.write( ">%s\n" % self.mFilenameClusters )
        out.write( "ndomains\t%i\n" % domain_count )
        out.write( "nfamilies\t%i\n" % family_count )
        out.write( "nnids\t%i\t%5.2f\n" % (nid_count, 100.0 * nid_count / self.mNNids ) )

        return { 'nids' : nid_count, 'domains' : domain_count, 'families': family_count }
コード例 #29
0
ファイル: AddaModule.py プロジェクト: AndreasHeger/adda
 def isComplete( self ):
     """Return True if every output file of the current slice is complete.

     Each name in self.mFilenames is mangled with self.getSlice() and
     checked with SegmentedFile.isComplete; checking stops at the first
     incomplete file.
     """
     return all( SegmentedFile.isComplete( SegmentedFile.mangle( name, self.getSlice() ) )
                 for name in self.mFilenames )
コード例 #30
0
 def testMerge(self):
     """Create the test file, merge it and check the merged contents.

     The merge is run with has_header set to self.mHasHeader.
     """
     self.create()
     SegmentedFile.merge(self.mFilename, has_header=self.mHasHeader)
     self.checkContents()
コード例 #31
0
 def checkContents(self):
     """Verify that the file holds the numbers 0-9 in order."""
     infile = SegmentedFile.openfile(self.mFilename, "r")
     data = [int(x) for x in infile]
     # close before asserting so the handle is released even on failure
     infile.close()
     # list(range(...)) compares equal to a list on Python 2 and 3 alike
     self.assertEqual(data, list(range(10)))
コード例 #32
0
 def checkContents( self ):
     """Verify that the file holds the numbers 0-9 in order."""
     infile = SegmentedFile.openfile( self.mFilename, "r" )
     data = [int(x) for x in infile ]
     # close before asserting so the handle is released even on failure
     infile.close()
     # list(range(...)) compares equal to a list on Python 2 and 3 alike
     self.assertEqual( data, list(range( 10 )) )
コード例 #33
0
 def checkContents(self):
     """Verify the merged file: a "header" line, then the numbers 0-19.

     The file is first checked with self.checkToken().
     """
     self.checkToken(self.mFilename)
     infile = SegmentedFile.openfile(self.mFilename, "r")
     data = [x for x in infile]
     # close before asserting so the handle is released even on failure
     infile.close()
     self.assertEqual(data[0], "header\n")
     # list(range(...)) compares equal to a list on Python 2 and 3 alike
     self.assertEqual([int(x) for x in data[1:]], list(range(20)))
コード例 #34
0
 def create(self):
     """Write the numbers 0-9, one per line, to self.mFilename."""
     lines = ["%i\n" % value for value in range(10)]

     outfile = SegmentedFile.openfile(self.mFilename, "w")
     # one write() call per line
     for text in lines:
         outfile.write(text)
     outfile.close()
コード例 #35
0
ファイル: AddaProfiles.py プロジェクト: Rfam/rfam-website
    def isComplete( self ):
        '''Return whether the profile output of the current slice is complete.

        The profile file name is extended by self.getSlice() and passed
        to ProfileLibrary.getFileNames(); the second of the two names
        it returns is checked with SegmentedFile.isComplete.
        '''

        sliced_name = self.mFilenameProfile + self.getSlice()
        # exactly two names are expected; only the second one is checked
        _unused, checked_name = ProfileLibrary.getFileNames( sliced_name )
        return SegmentedFile.isComplete( checked_name )
コード例 #36
0
ファイル: AddaFamilies.py プロジェクト: AndreasHeger/adda
    def applyMethod(self ):
        """Write the final domain table and the per-family statistics.

        Clustered domains (columns nid, start, end, family) are read
        from self.mFilenameClusters. Then, for every sequence reported
        by self.mFasta.getContigSizes():

        * a sequence without clustered domain is written as a single
          domain from 0 to its length in a new family (full singleton);
        * stretches longer than self.mMinDomainSize before, between or
          after the clustered domains are written as domains in new
          families (partial singletons);
        * the clustered domains are written unchanged.

        New family names are built as self.mPatternFamily % family_id,
        numbering on from the number of clustered families. Domain rows
        are written to self.mOutfile, one summary row per family to
        self.mOutfileFamilies. If the module-level PLOT flag is true,
        length and family-size distributions are saved as PNG files
        whose names start with self.mFilenameDomains.
        """

        infile = SegmentedFile.openfile( self.mFilenameClusters, "r" )

        family2domains = collections.defaultdict( list )
        nid2domains = collections.defaultdict( list )

        ndomains = 0
        for line in infile:
            if line[0] == "#": continue
            if line.startswith("nid"): continue
            nid, start, end, family = line[:-1].split("\t")
            nid = int(nid)
            nid2domains[nid].append( (int(start),int(end),family) )
            family2domains[family].append( (nid,int(end)-int(start) ) )
            ndomains += 1

        infile.close()

        self.info( "collected: nsequences=%i, ndomains=%i, nfamilies=%i" %\
                       (len(nid2domains), ndomains, len(family2domains) ) )

        # singleton families are numbered after the clustered ones
        family_id = len(family2domains)

        self.mOutfile.write( "nid\tstart\tend\tfamily\n" )

        # output domains per nid
        seqs = self.mFasta.getContigSizes()
        nids = sorted(seqs.keys())

        nfull_singletons = 0
        npartial_singletons = 0
        ndomains = 0

        # compute stats at the same time
        seq_lengths = seqs.values()
        max_length = max(seq_lengths)
        # Histograms indexed by length. The builtin float is used as
        # dtype because the numpy.float alias no longer exists in
        # current numpy releases.
        hist_domains_mst = numpy.zeros( max_length + 1, float )
        hist_domains_full_singletons = numpy.zeros( max_length + 1, float )
        hist_domains_partial_singletons = numpy.zeros( max_length + 1, float )
        hist_sequences = numpy.zeros( max_length + 1, float )
        for x in seq_lengths: hist_sequences[x] += 1

        for nid in nids:
            length = self.mFasta.getLength( nid )
            seq_id = self.mMapNid2Id[ nid ]

            if nid not in nid2domains:
                # no clustered domain: the whole sequence is one family
                family_id += 1
                self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \
                                         ( seq_id, 0, length, self.mPatternFamily % family_id ) )
                family2domains[ self.mPatternFamily % family_id ].append( (nid, length) )
                nfull_singletons += 1
                hist_domains_full_singletons[length] += 1
                continue

            domains = nid2domains[nid]
            domains.sort()

            last = 0
            for start, end, family in domains:
                hist_domains_mst[end-start] += 1

                # uncovered stretch in front of this domain
                if start - last > self.mMinDomainSize:
                    family_id += 1
                    self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \
                                             ( seq_id, last, start, self.mPatternFamily % family_id ) )

                    npartial_singletons += 1
                    family2domains[ self.mPatternFamily % family_id ].append( (nid, start-last) )
                    ndomains += 1
                    hist_domains_partial_singletons[start-last] += 1

                self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \
                                         ( seq_id, start, end, family ) )

                last = end
                ndomains += 1

            # uncovered stretch after the last domain; its size is
            # length - last (start still refers to the last domain)
            if length - last > self.mMinDomainSize:
                family_id += 1
                tail = length - last
                self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \
                                         ( seq_id, last, length, self.mPatternFamily % family_id ) )
                npartial_singletons += 1
                family2domains[ self.mPatternFamily % family_id ].append( (nid, tail) )
                hist_domains_partial_singletons[tail] += 1
                ndomains += 1

        self.info( "output: nsequences=%i, ndomains=%i,nfamilies=%i, nfull_singletons=%i, npartial_singletons=%i" % (len(nids), ndomains, len(family2domains), nfull_singletons, npartial_singletons))

        self.mOutfileFamilies.write( "family\tnunits\tnsequences\tnresidues\tlength\tlength_median\tlength_stddev\n" )

        family_size_sequences, family_size_domains = [], []

        for family in sorted(family2domains.keys()):
            nids = set()
            lengths = []

            for nid, length in family2domains[family]:
                lengths.append( length )
                nids.add(nid)

            ndomains = len(lengths)

            self.mOutfileFamilies.write( "\t".join( (family,
                                                     str(ndomains),
                                                     str(len(nids)),
                                                     str(sum(lengths)),
                                                     "%5.2f" % numpy.mean(lengths),
                                                     "%5.2f" % numpy.median(lengths),
                                                     "%5.2f" % numpy.std(lengths) ) ) + "\n" )

            family_size_sequences.append( len(nids) )
            family_size_domains.append( ndomains )


        if PLOT:
            ## output length distributions
            lines, legends = [], []
            for title, vals in (
                ("sequences", hist_sequences),
                ("domains", hist_domains_mst),
                ("full singletons", hist_domains_full_singletons),
                ("partial singletons", hist_domains_partial_singletons), ):

                # sum the counts in bins of 10 residues
                vv = numpy.zeros( max_length )
                for x in range( 0, max_length, 10 ):
                    vv[x] = sum( vals[x:x+10] )
                x = numpy.flatnonzero( vv > 0 )
                s = sum(vals)
                if s > 0: vv /= s

                lines.append( pylab.plot( x, vv[x] ) )
                legends.append( title )

            pylab.xlabel( "sequence or domain length / residues" )
            pylab.ylabel( "relative frequency" )
            pylab.legend( lines, legends )
            pylab.savefig( os.path.expanduser( self.mFilenameDomains + "_domainsizes_all.png" ) )

            pylab.xlim( 0, 2000 )
            pylab.savefig( os.path.expanduser( self.mFilenameDomains + "_domainsizes_small.png" ) )

            pylab.xlim( max_length - max_length // 4, max_length + 1 )
            pylab.savefig( os.path.expanduser( self.mFilenameDomains + "_domainsize_large.png" ) )

            pylab.clf()

            ## output domain family sizes
            # numpy.histogram returns (counts, bin edges); the obsolete
            # "new" keyword is not accepted by current numpy releases.
            lines = []
            (yvals, xvals) = numpy.histogram( family_size_sequences, bins=50 )
            lines.append( pylab.loglog( xvals[:-1], yvals ) )
            (yvals, xvals) = numpy.histogram( family_size_domains, bins=50 )
            lines.append( pylab.loglog( xvals[:-1], yvals ) )

            pylab.legend( lines, ( "sequeces", "domains") )
            pylab.xlabel( "sequences/domains per family" )
            pylab.ylabel( "relative frequency" )
            pylab.savefig( os.path.expanduser( self.mFilenameDomains + "_familysizes.png" ) )
コード例 #37
0
 def checkContents( self ):
     """Verify the merged file: a "header" line, then the numbers 0-19.

     The file is first checked with self.checkToken() and then read
     with has_header set to self.mHasHeader.
     """
     self.checkToken( self.mFilename )
     infile = SegmentedFile.openfile( self.mFilename, "r", has_header = self.mHasHeader )
     data = [ x for x in infile ]
     # close before asserting so the handle is released even on failure
     infile.close()
     self.assertEqual( data[0], "header\n" )
     # list(range(...)) compares equal to a list on Python 2 and 3 alike
     self.assertEqual( [int(x) for x in data[1:]], list(range( 20 )) )
コード例 #38
0
 def testMerge(self):
     """Create the test file, merge it and check the merged contents.

     The merge is run with has_header set to self.mHasHeader.
     """
     self.create()
     SegmentedFile.merge( self.mFilename, has_header=self.mHasHeader )
     self.checkContents()
コード例 #39
0
ファイル: AddaProfiles.py プロジェクト: ProteinsWebTeam/Pfam
    def isComplete(self):
        """Return the completeness of this slice's profile output.

        ``ProfileLibrary.getFileNames`` is called with the profile file
        name extended by the current slice; the second of the two names it
        returns is passed to ``SegmentedFile.isComplete``.
        """
        sliced_name = self.mFilenameProfile + self.getSlice()
        _first_name, second_name = ProfileLibrary.getFileNames(sliced_name)
        return SegmentedFile.isComplete(second_name)
コード例 #40
0
 def merge(self):
     """Merge the segmented file named by ``self.mFilenameGraph``.

     The result of ``SegmentedFile.merge`` is discarded; this method
     returns ``None``.
     """
     graph_filename = self.mFilenameGraph
     SegmentedFile.merge( graph_filename )
コード例 #41
0
    def applyMethod(self):
        """Write the final domain table, family statistics and optional plots.

        Input is read from ``self.mFilenameClusters`` as tab-separated
        records ``nid, start, end, family``; lines starting with ``#`` or
        ``nid`` are skipped.

        For every sequence reported by ``self.mFasta.getContigSizes()``:

        * a sequence without any domain is written to ``self.mOutfile`` as
          a single record spanning ``0..length`` in a newly numbered family
          (counted as a *full singleton*);
        * otherwise its domains are written in sorted order, and every
          uncovered stretch longer than ``self.mMinDomainSize`` -- before a
          domain or after the last one -- is written as a record in a newly
          numbered family (counted as a *partial singleton*).

        New family names are built from ``self.mPatternFamily`` and a
        counter that continues after the number of families read from the
        input.  A per-family summary is written to
        ``self.mOutfileFamilies``.  If the module-level ``PLOT`` flag is
        true, length and family-size distributions are saved as PNG files
        whose names start with ``self.mFilenameDomains``.
        """

        infile = SegmentedFile.openfile(self.mFilenameClusters, "r")

        family2domains = collections.defaultdict(list)
        nid2domains = collections.defaultdict(list)

        ndomains = 0
        for line in infile:
            # skip comments and the column header
            if line.startswith(("#", "nid")):
                continue
            nid, start, end, family = line[:-1].split("\t")
            nid, start, end = int(nid), int(start), int(end)
            nid2domains[nid].append((start, end, family))
            family2domains[family].append((nid, end - start))
            ndomains += 1

        self.info("collected: nsequences=%i, ndomains=%i, nfamilies=%i" %
                  (len(nid2domains), ndomains, len(family2domains)))

        # numbering of newly created families continues after the input ones
        family_id = len(family2domains)

        self.mOutfile.write("nid\tstart\tend\tfamily\n")

        # output domains per nid
        seqs = self.mFasta.getContigSizes()
        nids = sorted(seqs.keys())

        nfull_singletons = 0
        npartial_singletons = 0
        ndomains = 0

        # compute stats at the same time
        seq_lengths = seqs.values()
        max_length = max(seq_lengths)

        # Full histograms of length distributions, indexed by length.
        # The builtin ``float`` is used as dtype: ``numpy.float`` was only an
        # alias for it and no longer exists in current NumPy releases.
        hist_domains_mst = numpy.zeros(max_length + 1, float)
        hist_domains_full_singletons = numpy.zeros(max_length + 1, float)
        hist_domains_partial_singletons = numpy.zeros(max_length + 1, float)
        hist_sequences = numpy.zeros(max_length + 1, float)
        for seq_length in seq_lengths:
            hist_sequences[seq_length] += 1

        for nid in nids:
            length = self.mFasta.getLength(nid)
            seq_id = self.mMapNid2Id[nid]

            if nid not in nid2domains:
                # sequence without domains: one family spanning all of it
                family_id += 1
                new_family = self.mPatternFamily % family_id
                self.mOutfile.write("%s\t%s\t%s\t%s\n" %
                                    (seq_id, 0, length, new_family))
                family2domains[new_family].append((nid, length))
                nfull_singletons += 1
                hist_domains_full_singletons[length] += 1
                continue

            domains = nid2domains[nid]
            domains.sort()

            last = 0
            for start, end, family in domains:
                hist_domains_mst[end - start] += 1

                gap = start - last
                if gap > self.mMinDomainSize:
                    # uncovered stretch in front of this domain
                    family_id += 1
                    new_family = self.mPatternFamily % family_id
                    self.mOutfile.write("%s\t%s\t%s\t%s\n" %
                                        (seq_id, last, start, new_family))
                    npartial_singletons += 1
                    family2domains[new_family].append((nid, gap))
                    ndomains += 1
                    hist_domains_partial_singletons[gap] += 1

                self.mOutfile.write("%s\t%s\t%s\t%s\n" %
                                    (seq_id, start, end, family))

                last = end
                ndomains += 1

            # Uncovered stretch after the last domain.  Its size is
            # ``length - last``, matching the record written below;
            # ``start - last`` would be the negated length of the last domain.
            tail = length - last
            if tail > self.mMinDomainSize:
                family_id += 1
                new_family = self.mPatternFamily % family_id
                self.mOutfile.write("%s\t%s\t%s\t%s\n" %
                                    (seq_id, last, length, new_family))
                npartial_singletons += 1
                family2domains[new_family].append((nid, tail))
                hist_domains_partial_singletons[tail] += 1
                ndomains += 1

        # argument order follows the order of the fields in the message
        self.info(
            "output: nsequences=%i, ndomains=%i,nfamilies=%i, nfull_singletons=%i, npartial_singletons=%i"
            % (len(nids), ndomains, len(family2domains), nfull_singletons,
               npartial_singletons))

        self.mOutfileFamilies.write(
            "family\tnunits\tnsequences\tnresidues\tlength\tlength_median\tlength_stddev\n"
        )

        family_size_sequences, family_size_domains = [], []

        # summary per family
        for family in sorted(family2domains.keys()):
            units = family2domains[family]
            lengths = [unit_length for _unit_nid, unit_length in units]
            members = set(unit_nid for unit_nid, _unit_length in units)
            nunits = len(lengths)

            self.mOutfileFamilies.write("\t".join(
                (family, str(nunits), str(len(members)), str(sum(lengths)),
                 "%5.2f" % numpy.mean(lengths),
                 "%5.2f" % numpy.median(lengths),
                 "%5.2f" % numpy.std(lengths))) + "\n")

            family_size_sequences.append(len(members))
            family_size_domains.append(nunits)

        if PLOT:
            ## output length distributions
            lines, legends = [], []
            # each title is paired with the histogram of the same name
            for title, vals in (
                ("sequences", hist_sequences),
                ("domains", hist_domains_mst),
                ("partial singletons", hist_domains_partial_singletons),
                ("full singletons", hist_domains_full_singletons),
            ):

                # sum counts in windows of 10, stored at the window start
                vv = numpy.zeros(max_length)
                for offset in range(0, max_length, 10):
                    vv[offset] = sum(vals[offset:offset + 10])
                nonzero = numpy.flatnonzero(vv > 0)
                total = sum(vals)
                if total > 0:
                    vv /= total

                lines.append(pylab.plot(nonzero, vv[nonzero]))
                legends.append(title)

            pylab.xlabel("sequence or domain length / residues")
            pylab.ylabel("relative frequency")
            pylab.legend(lines, legends)
            pylab.savefig(
                os.path.expanduser(self.mFilenameDomains +
                                   "_domainsizes_all.png"))

            pylab.xlim(0, 2000)
            pylab.savefig(
                os.path.expanduser(self.mFilenameDomains +
                                   "_domainsizes_small.png"))

            pylab.xlim(max_length - max_length // 4, max_length + 1)
            pylab.savefig(
                os.path.expanduser(self.mFilenameDomains +
                                   "_domainsize_large.png"))

            pylab.clf()

            ## output domain family sizes
            # numpy.histogram returns bin edges (one more than counts), hence
            # the [:-1].  The former ``new=True`` keyword selected exactly
            # this behaviour and is not accepted by current NumPy releases.
            lines = []
            (yvals, xvals) = numpy.histogram(family_size_sequences, bins=50)
            lines.append(pylab.loglog(xvals[:-1], yvals))
            (yvals, xvals) = numpy.histogram(family_size_domains, bins=50)
            lines.append(pylab.loglog(xvals[:-1], yvals))

            pylab.legend(lines, ("sequeces", "domains"))
            pylab.xlabel("sequences/domains per family")
            pylab.ylabel("relative frequency")
            pylab.savefig(
                os.path.expanduser(self.mFilenameDomains + "_familysizes.png"))
コード例 #42
0
 def create(self):
     """Write the integers 0..9, one per line, to ``self.mFilename``.

     The file is opened for writing through ``SegmentedFile.openfile``
     and closed again afterwards.
     """
     handle = SegmentedFile.openfile( self.mFilename, "w" )
     for number in range(10):
         text = "%i\n" % number
         handle.write( text )
     handle.close()