def merge(self, filenames=None):
    """merge runs from parallel computations.

    return false if segmented file is not complete.
    """
    if filenames is None:
        filenames = self.mFilenames
    for f in filenames:
        # nothing to do if the merged file is already complete
        if SegmentedFile.isComplete(f):
            return True
        self.info("merging file %s from %i chunks" % (f, self.mNumChunks))
        # check if all parts have finished and are present
        if self.mNumChunks > 1:
            for chunk in range(self.mNumChunks):
                fn = SegmentedFile.mangle(f, self.getSlice(chunk))
                if not SegmentedFile.isComplete(fn):
                    self.info("file %s is incomplete - merging aborted" % fn)
                    return False
        self.info("all files complete")
        SegmentedFile.merge(f)
    return True

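# Reading aid (an assumption, not part of the codebase): SegmentedFile.mangle()
# presumably appends the slice token to the base filename, so that the chunk of
# "adda.fit" covering nids 00-10 lives in "adda.fit.00-10" - the same naming
# that the tests below exercise via openfile(..., slice="00-10") and that the
# glob pattern "%s.0*" in the fit-module merge() picks up. A minimal stand-in
# with that shape:
def _mangle_sketch(filename, slice):
    # hypothetical helper, for illustration only
    return "%s.%s" % (filename, slice)
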
def checkContents(self):
    self.create()
    self.assertEqual(SegmentedFile.merge(self.mFilename), True)
    self.checkToken(self.mFilename)
    infile = SegmentedFile.openfile(self.mFilename, "r")
    data = [x for x in infile]
    self.assertEqual(data[0], "#comment1\n")
    self.assertEqual(data[1], "header1\n")
    self.assertEqual(data[12], "#comment2\n")
    self.assertEqual([int(x) for x in data[2:12] + data[13:]], range(20))

def create(self):
    fd, self.mFilename = tempfile.mkstemp()
    outfile = SegmentedFile.openfile(self.mFilename, "w", slice="00-10")
    for x in range(10):
        outfile.write("%i\n" % x)
    outfile.close()
    outfile = SegmentedFile.openfile(self.mFilename, "w", slice="10-20")
    for x in range(10, 20):
        outfile.write("%i\n" % x)
    outfile.close()

def openOutputStream(self, filename, register=False):
    """opens an output stream.

    If the output filename exists, an error is raised unless

    1. mForce is set: the existing file will be overwritten
    2. mAppend is set: data will be appended. The registerExistingOutput
       method is called to give the module the chance to advance the input
       stream to the appropriate point for continuation.

    If mSlice is set, the name will be mangled to reflect the slice.

    If register is true, registerExistingOutput will be called.
    """
    if self.mAppend:
        mode = "a"
    else:
        mode = "w"
    self.debug("%s%s opening with mode %s" % (filename, self.getSlice(), mode))
    return SegmentedFile.openfile(filename,
                                  mode,
                                  slice=self.getSlice(),
                                  force=self.mForce,
                                  append_callback=self.readPreviousData)

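# Usage sketch for openOutputStream() (the calling module and filename are
# hypothetical): with mAppend set, the stream is opened in mode "a" and
# SegmentedFile.openfile() receives readPreviousData as append_callback, so
# the module can fast-forward its input to where the previous run stopped;
# with mForce set, an existing output file is overwritten instead.
#
#   outfile = self.openOutputStream(self.mFilenameFit)
#   outfile.write("...")
#   outfile.close()
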
def outputSummaryAlignments(self):
    """analyse the alignments."""
    infile = SegmentedFile.openfile(self.mFilenameAlignments, "r")
    ninput, naccepted = 0, 0
    nids, domains = set(), set()
    for line in infile:
        if line[0] == "#":
            continue
        if line.startswith("passed"):
            continue
        ninput += 1
        (code, query, sbjct, estimate,
         qstart, qend, qali,
         sstart, send, sali,
         score, naligned, ngaps, zscore) = line[:-1].split("\t")
        nids.add(query.split("_")[0])
        nids.add(sbjct.split("_")[0])
        domains.add(query)
        domains.add(sbjct)
        if code == "+":
            naccepted += 1
    infile.close()
    self.mOutfile.write(">%s\n" % self.mFilenameAlignments)
    self.mOutfile.write("ntotal\t%i\n" % ninput)
    self.mOutfile.write("naccepted\t%i\n" % naccepted)
    self.mOutfile.write("nrejected\t%i\n" % (ninput - naccepted))
    return {'nids': len(nids), 'domains': len(domains)}

def merge(self, filenames=None):
    """merge runs from parallel computations."""
    if SegmentedFile.isComplete(self.mFilenameFit):
        return True
    # remove unwanted results
    for x in (self.mFilenameTransfer,
              self.mFilenameOverhang,
              self.mFilenameFit):
        for fn in glob.glob("%s.0*" % x):
            os.remove(fn)
    # merge the details file if all is complete
    if glob.glob("%s.0*" % self.mFilenameDetails):
        if not AddaModuleRecord.merge(self, (self.mFilenameDetails,)):
            return False
    if not AddaModuleRecord.merge(self, (self.mFilenameData,)):
        return False
    self.mNumChunks = 1
    self.readPreviousData(self.mFilenameData)
    self.finish()
    return True

def outputSummaryResult(self):
    """summarise the domain assignment results."""
    infile = SegmentedFile.openfile(self.mFilenameResult, "r")
    ndomains = 0
    nids, families = set(), set()
    for line in infile:
        if line[0] == "#":
            continue
        if line.startswith("nid"):
            continue
        ndomains += 1
        nid, start, end, family = line[:-1].split("\t")
        nids.add(nid)
        families.add(family)
    infile.close()
    self.mOutfile.write(">%s\n" % self.mFilenameResult)
    self.mOutfile.write("ndomains\t%i\n" % ndomains)
    self.mOutfile.write("nfamilies\t%i\n" % len(families))
    self.mOutfile.write("nnids\t%i\t%5.2f\n" %
                        (len(nids), 100.0 * len(nids) / self.mNNids))
    return {'nids': len(nids),
            'domains': ndomains,
            'families': len(families)}

def isComplete(self):
    '''check if files are complete'''
    if AddaModuleRecord.isComplete(self):
        return True
    # if all the data files are complete, re-compute fit, transfer and
    # overhang only and then return as complete
    if SegmentedFile.isComplete(
            SegmentedFile.mangle(self.mFilenameData, self.getSlice())):
        return True
    if SegmentedFile.isComplete(self.mFilenameData):
        return self.merge()
    return False

def checkContents(self):
    self.checkToken(self.mFilename)
    infile = SegmentedFile.openfile(self.mFilename, "r",
                                    has_header=self.mHasHeader)
    data = [int(x) for x in infile]
    self.assertEqual(data, range(20))

def outputSummaryGraph(self):
    """summarise the pairwise alignment graph."""
    # disabled - the code below is currently skipped
    return {}
    infile = SegmentedFile.openfile(self.mFilenameGraph, "r")
    nlinks = 0
    queries, sbjcts = set(), set()
    for line in infile:
        if line[0] == "#":
            continue
        if line.startswith("query_nid"):
            continue
        nlinks += 1
        query, sbjct = line[:-1].split("\t")[:2]
        queries.add(query)
        sbjcts.add(sbjct)
    infile.close()
    self.mOutfile.write(">%s\n" % self.mFilenameGraph)
    self.mOutfile.write("nlinks\t%i\n" % nlinks)
    self.mOutfile.write("nqueries\t%i\t%5.2f\n" %
                        (len(queries), 100.0 * len(queries) / self.mNNids))
    self.mOutfile.write("nsbjcts\t%i\t%5.2f\n" %
                        (len(sbjcts), 100.0 * len(sbjcts) / self.mNNids))
    nids = queries.union(sbjcts)
    self.mOutfile.write("nnids\t%i\t%5.2f\n" %
                        (len(nids), 100.0 * len(nids) / self.mNNids))
    return {'nids': len(nids), 'links': nlinks}

def outputSummaryMst(self):
    """summarise the minimum spanning tree."""
    infile = SegmentedFile.openfile(self.mFilenameMst, "r")
    nlinks = 0
    nids, domains = set(), set()
    for line in infile:
        if line[0] == "#":
            continue
        if line.startswith("nid"):
            continue
        nlinks += 1
        query, sbjct = line[:-1].split("\t")[:2]
        nids.add(query.split("_")[0])
        nids.add(sbjct.split("_")[0])
        domains.add(query)
        domains.add(sbjct)
    infile.close()
    self.mOutfile.write(">%s\n" % self.mFilenameMst)
    self.mOutfile.write("nlinks\t%i\n" % nlinks)
    self.mOutfile.write("ndomains\t%i\n" % len(domains))
    self.mOutfile.write("nnids\t%i\t%5.2f\n" %
                        (len(nids), 100.0 * len(nids) / self.mNNids))
    return {'nids': len(nids), 'domains': len(domains)}

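# The outputSummary* methods above and below all write the same record shape
# into self.mOutfile: a ">filename" header line followed by tab-separated
# counters, with percentages computed relative to mNNids. A sketch of one
# such block (filename and values made up for illustration):
#
#   >adda.mst
#   nlinks      12345
#   ndomains    678
#   nnids       90      4.50
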
def validate(self): """merge runs from parallel computations. Note: duplicated code with AddaSegments - can be merged. returns true if merging was succecss. """ infiles = self.getPartialResults() last_nid = None found = set() nfound, nunknown, nduplicate = 0, 0, 0 infile = SegmentedFile.fileopen( self.mFilenameGraph ) for line in infile: ninput += 1 nid = line[:line.index("\t")] if nid != last_nid: if nid in found: nduplicates += 1 self.warn("duplicate nid: %i in file %s" % (nid, filename)) if nid not in tokens: nunknown += 1 self.warn("unknown nid: %i in file %s" % (nid, filename)) found.add(nid) nfound += 1 last_nid = nid noutput += 1 missing = set(self.mFasta.getTokens()).difference( found ) if len(missing) > 0: self.warn( "the following nids were missing: %s" % str(missing) ) self.info( "merging: ninput=%i, noutput=%i, nfound=%i, nmissing=%i, nduplicate=%i, nunknown=%i" %\ (ninput, noutput, nfound, len(missing), nduplicate, nunknown ) ) return len(missing) == 0 and nduplicate == 0 and nunknown == 0
def validate(self):
    last_nid = None
    found = set()
    tokens = set(self.mFasta.getTokens())
    ninput, noutput = 0, 0
    nfound, nunknown, nduplicate = 0, 0, 0
    infile = SegmentedFile.fileopen(self.mFilenameSegments)
    for line in infile:
        ninput += 1
        nid = line[:line.index("\t")]
        if nid != last_nid:
            if nid in found:
                nduplicate += 1
                self.warn("duplicate nid: %s in file %s" %
                          (nid, self.mFilenameSegments))
            if nid not in tokens:
                nunknown += 1
                self.warn("unknown nid: %s in file %s" %
                          (nid, self.mFilenameSegments))
            found.add(nid)
            nfound += 1
        last_nid = nid
        noutput += 1
    missing = tokens.difference(found)
    if len(missing) > 0:
        self.warn("the following nids were missing: %s" % str(missing))
    self.info("validation: ninput=%i, noutput=%i, nfound=%i, nmissing=%i, "
              "nduplicate=%i, nunknown=%i" %
              (ninput, noutput, nfound, len(missing), nduplicate, nunknown))
    return len(missing) == 0 and nduplicate == 0 and nunknown == 0

def getComponents(self):
    '''return components.'''
    componentor = Components.SComponents()
    infile = SegmentedFile.openfile(self.mFilenameInput, "r")
    ninput = 0
    for line in infile:
        if line[0] == "#":
            continue
        qdomain, sdomain = line[:-1].split("\t")[:2]
        componentor.add(qdomain, sdomain)
        ninput += 1
    self.info("computing components with %i links" % ninput)
    return componentor.getComponents()

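# Components.SComponents is presumably a connected-components (union-find)
# structure over the domain links fed to add(). A minimal pure-Python
# equivalent, for illustration only (the name and return shape are
# assumptions, not the library's API):
def _connected_components(edges):
    parent = {}

    def find(x):
        parent.setdefault(x, x)
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path halving
            x = parent[x]
        return x

    for a, b in edges:
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[ra] = rb
    components = {}
    for node in list(parent):
        components.setdefault(find(node), []).append(node)
    return components.values()

# e.g. _connected_components([("a", "b"), ("b", "c"), ("x", "y")])
# -> [["a", "b", "c"], ["x", "y"]] (grouping, not order, is guaranteed)
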
def outputSummarySegments(self):
    """summarise the segments."""
    infile = SegmentedFile.openfile(self.mFilenameSegments, "r")
    ndomains = 0
    nids = set()
    for line in infile:
        if line[0] == "#":
            continue
        if line.startswith("nid"):
            continue
        ndomains += 1
        nid, node, parent, level, start, end = line[:-1].split("\t")
        nids.add(nid)
    infile.close()
    self.mOutfile.write(">%s\n" % self.mFilenameSegments)
    self.mOutfile.write("ndomains\t%i\n" % ndomains)
    self.mOutfile.write("nnids\t%i\t%5.2f\n" %
                        (len(nids), 100.0 * len(nids) / self.mNNids))
    return {'nids': len(nids), 'domains': ndomains}

def outputSummaryNids(self):
    infile = SegmentedFile.openfile(self.mFilenameNids, "r")
    nids = set()
    for line in infile:
        if line[0] == "#":
            continue
        if line.startswith("nid"):
            continue
        nid, pid, hid, length, sequence = line[:-1].split("\t")
        nids.add(nid)
    infile.close()
    self.mNids = nids
    self.mNNids = len(self.mNids)
    self.mOutfile.write(">%s\n" % self.mFilenameNids)
    # by definition all nids are present, i.e., 100%
    self.mOutfile.write("nnids\t%i\t%5.2f\n" %
                        (len(nids), 100.0 * len(nids) / self.mNNids))
    return {'nids': len(nids)}

def outputSummaryClusters(self):
    """summarise the clusters."""
    infile = SegmentedFile.openfile(self.mFilenameClusters, "r")
    ndomains = 0
    nids, families = set(), set()
    for line in infile:
        if line[0] == "#":
            continue
        if line.startswith("nid"):
            continue
        ndomains += 1
        nid, start, end, family = line[:-1].split("\t")
        nids.add(nid)
        families.add(family)
    infile.close()
    self.mOutfile.write(">%s\n" % self.mFilenameClusters)
    self.mOutfile.write("ndomains\t%i\n" % ndomains)
    self.mOutfile.write("nfamilies\t%i\n" % len(families))
    self.mOutfile.write("nnids\t%i\t%5.2f\n" %
                        (len(nids), 100.0 * len(nids) / self.mNNids))
    return {'nids': len(nids),
            'domains': ndomains,
            'families': len(families)}

def isComplete( self ): """return if this step is complete.""" for f in self.mFilenames: if not SegmentedFile.isComplete( SegmentedFile.mangle( f, self.getSlice()) ): return False return True
def testMerge(self):
    self.create()
    SegmentedFile.merge(self.mFilename, has_header=self.mHasHeader)
    self.checkContents()

def checkContents(self):
    infile = SegmentedFile.openfile(self.mFilename, "r")
    data = [int(x) for x in infile]
    self.assertEqual(data, range(10))

def checkContents(self):
    self.checkToken(self.mFilename)
    infile = SegmentedFile.openfile(self.mFilename, "r")
    data = [x for x in infile]
    self.assertEqual(data[0], "header\n")
    self.assertEqual([int(x) for x in data[1:]], range(20))

def create(self):
    outfile = SegmentedFile.openfile(self.mFilename, "w")
    for x in range(10):
        outfile.write("%i\n" % x)
    outfile.close()

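# How these unittest fixtures presumably fit together (a sketch - the class
# name and the mHasHeader default are assumptions): each test case overrides
# create() to write one or more slices, testMerge() concatenates them via
# SegmentedFile.merge(), and checkContents() re-reads the merged file and
# asserts on both the data and the completion token.
#
# class TestSegmentedFileWithHeader(unittest.TestCase):
#     mHasHeader = True
#
#     def setUp(self):
#         fd, self.mFilename = tempfile.mkstemp()
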
def isComplete(self):
    fn, fi = ProfileLibrary.getFileNames(
        self.mFilenameProfile + self.getSlice())
    return SegmentedFile.isComplete(fi)

def merge(self):
    SegmentedFile.merge(self.mFilenameGraph)

def applyMethod(self): """apply the method. """ infile = SegmentedFile.openfile(self.mFilenameClusters, "r") family2domains = collections.defaultdict(list) nid2domains = collections.defaultdict(list) ndomains = 0 for line in infile: if line[0] == "#": continue if line.startswith("nid"): continue nid, start, end, family = line[:-1].split("\t") nid = int(nid) nid2domains[nid].append((int(start), int(end), family)) family2domains[family].append((nid, int(end) - int(start))) ndomains += 1 self.info( "collected: nsequences=%i, ndomains=%i, nfamilies=%i" %\ (len(nid2domains), ndomains, len(family2domains) ) ) family_id = len(family2domains) self.mOutfile.write("nid\tstart\tend\tfamily\n") # output domains per nid seqs = self.mFasta.getContigSizes() nids = sorted(seqs.keys()) nfull_singletons = 0 npartial_singletons = 0 ndomains = 0 # compute stats at the same time seq_lengths = seqs.values() max_length = max(seq_lengths) # compute summary per family # and compute full histograms of length distributions hist_domains_mst = numpy.zeros(max_length + 1, numpy.float) hist_domains_full_singletons = numpy.zeros(max_length + 1, numpy.float) hist_domains_partial_singletons = numpy.zeros(max_length + 1, numpy.float) hist_sequences = numpy.zeros(max_length + 1, numpy.float) for x in seq_lengths: hist_sequences[x] += 1 for nid in nids: length = self.mFasta.getLength(nid) id = self.mMapNid2Id[nid] if nid not in nid2domains: family_id += 1 self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \ ( id, 0, length, self.mPatternFamily % family_id ) ) family2domains[self.mPatternFamily % family_id].append( (nid, length)) nfull_singletons += 1 hist_domains_full_singletons[length] += 1 continue domains = nid2domains[nid] domains.sort() last = 0 for start, end, family in domains: hist_domains_mst[end - start] += 1 if start - last > self.mMinDomainSize: family_id += 1 self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \ ( id, last, start, self.mPatternFamily % family_id ) ) npartial_singletons += 1 family2domains[self.mPatternFamily % family_id].append( (nid, start - last)) ndomains += 1 hist_domains_partial_singletons[start - last] += 1 self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \ ( id, start, end, family ) ) last = end ndomains += 1 if length - last > self.mMinDomainSize: family_id += 1 self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \ ( id, last, length, self.mPatternFamily % family_id ) ) npartial_singletons += 1 family2domains[self.mPatternFamily % family_id].append( (nid, start - last)) hist_domains_partial_singletons[start - last] += 1 ndomains += 1 self.info( "output: nsequences=%i, ndomains=%i,nfamilies=%i, nfull_singletons=%i, npartial_singletons=%i" % (len(nids), ndomains, len(family2domains), npartial_singletons, nfull_singletons)) self.mOutfileFamilies.write( "family\tnunits\tnsequences\tnresidues\tlength\tlength_median\tlength_stddev\n" ) family_size_sequences, family_size_domains = [], [] for family in sorted(family2domains.keys()): nids = set() lengths = [] for nid, length in family2domains[family]: lengths.append(length) nids.add(nid) ndomains = len(lengths) self.mOutfileFamilies.write("\t".join( (family, str(ndomains), str(len(nids)), str(sum(lengths)), "%5.2f" % numpy.mean(lengths), "%5.2f" % numpy.median(lengths), "%5.2f" % numpy.std(lengths))) + "\n") family_size_sequences.append(len(nids)) family_size_domains.append(ndomains) if PLOT: ## output length distributions lines, legends = [], [] for title, vals in ( ("sequences", hist_sequences), ("domains", hist_domains_mst), ("partial singletons", hist_domains_full_singletons), 
("full singletons", hist_domains_partial_singletons), ): vv = numpy.zeros(max_length) for x in range(0, max_length, 10): vv[x] = sum(vals[x:x + 10]) x = numpy.flatnonzero(vv > 0) s = sum(vals) if s > 0: vv /= s lines.append(pylab.plot(x, vv[x])) legends.append(title) pylab.xlabel("sequence or domain length / residues") pylab.ylabel("relative frequency") pylab.legend(lines, legends) pylab.savefig( os.path.expanduser(self.mFilenameDomains + "_domainsizes_all.png")) pylab.xlim(0, 2000) pylab.savefig( os.path.expanduser(self.mFilenameDomains + "_domainsizes_small.png")) pylab.xlim(max_length - max_length // 4, max_length + 1) pylab.savefig( os.path.expanduser(self.mFilenameDomains + "_domainsize_large.png")) pylab.clf() ## output domain family sizes lines = [] (yvals, xvals) = numpy.histogram(family_size_sequences, bins=50, new=True) lines.append(pylab.loglog(xvals[:-1], yvals)) (yvals, xvals) = numpy.histogram(family_size_domains, bins=50, new=True) lines.append(pylab.loglog(xvals[:-1], yvals)) pylab.legend(lines, ("sequeces", "domains")) pylab.xlabel("sequences/domains per family") pylab.ylabel("relative frequency") pylab.savefig( os.path.expanduser(self.mFilenameDomains + "_familysizes.png"))