def Split(self, max_gap_length):
    """Split each alignment at gaps longer than max_gap_length.

    This is necessary, as structural domains can be discontinuous.

    Every row of the table ``self.name`` is read, its compressed alignment
    is rebuilt and passed to ``alignlib.splitAlignata``.  Each resulting
    fragment is written as a separate row (via ``self.WriteLine``) into a
    temporary load file.  Afterwards the table is dropped, re-created and
    loaded from that file.

    max_gap_length -- gap length threshold handed to
                      ``alignlib.splitAlignata``.
    """
    statement = (
        " SELECT nid, start, end, rep_ali, domain_id, "
        "domain_from, domain_to, domain_ali, family %s FROM %s"
    ) % (self.GetAdditionalInfo(), self.name)

    # NOTE(review): os.tempnam() is race-prone and deprecated. It is kept
    # here because the loader may depend on the location/permissions of the
    # file it produces -- confirm before switching to tempfile.mkstemp().
    load_filename = os.tempnam(Pairsdb.PATH_LOAD, "scmp")

    outfile = open(load_filename, "w")
    try:
        domains = self.Execute(statement).fetchall()

        for domain in domains:
            # the first nine columns are fixed, everything after them is
            # table specific additional information that is passed through.
            (nid, start, end, rep_ali,
             domain_id, domain_from, domain_to, domain_ali,
             family) = domain[:9]

            map_rep2domains = alignlib.makeAlignataVector()
            alignlib.fillAlignataCompressed(map_rep2domains,
                                            start, rep_ali,
                                            domain_from, domain_ali)

            val = alignlib.splitAlignata(map_rep2domains, max_gap_length)
            fragments = [alignlib.AlignataPtr(x) for x in val]

            # now write each fragment to the output
            for map_rep2domain in fragments:
                # so that the object gets deleted, once it goes out of scope
                map_rep2domain.thisown = 1

                # WriteLine derives the coordinates and the compressed
                # alignment strings from the fragment itself.
                self.WriteLine(outfile,
                               nid,
                               map_rep2domain,
                               domain_id,
                               family,
                               domain[9:])
    finally:
        # make sure the load file is closed even if splitting fails
        outfile.close()

    self.Drop()
    self.Create()
    self.Load(load_filename)
def GetLinks(self, query_nid, query_from, query_to, query_ali, sbjct_nid, sbjct_from, sbjct_to, sbjct_ali): """returns all possible links between link split into domains. """ if self.mLogLevel >= 2: print "# processing", query_nid, sbjct_nid, query_from, query_to, sbjct_from, sbjct_to sys.stdout.flush() map_query2sbjct = alignlib.makeAlignataVector() alignlib.fillAlignataCompressed(map_query2sbjct, query_from, query_ali, sbjct_from, sbjct_ali) # iterate over query for query_domain_from, query_domain_to, query_family in self.mDomains[ query_nid]: # check if overlap overlap = min(query_to, query_domain_to) - max( query_from, query_domain_from) + 1 if overlap <= self.mMinOverlapResidues: continue # check for overlap with domains in sbjct for sbjct_domain_from, sbjct_domain_to, sbjct_family in self.mDomains[ sbjct_nid]: overlap = min(sbjct_to, sbjct_domain_to) - max( sbjct_from, sbjct_domain_from) + 1 if overlap < self.mMinOverlapResidues: continue map_new_query2sbjct = alignlib.makeAlignataVector() alignlib.copyAlignata(map_new_query2sbjct, map_query2sbjct, query_domain_from, query_domain_to, sbjct_domain_from, sbjct_domain_to) if map_new_query2sbjct.getLength() > 0: row_ali, col_ali = alignlib.writeAlignataCompressed( map_new_query2sbjct) print string.join( ("%s_%s_%s" % (query_nid, query_domain_from, query_domain_to), "%s_%s_%s" % (sbjct_nid, sbjct_domain_from, sbjct_domain_to), "0", str(map_new_query2sbjct.getRowFrom()), str(map_new_query2sbjct.getRowTo()), row_ali, str(map_new_query2sbjct.getColFrom()), str(map_new_query2sbjct.getColTo()), col_ali), "\t")
def CheckResult(self, result, info1=None, info2=None):
    """Convert an alignment result into a sequence of strings.

    This implementation accepts every result.  ``info1`` and ``info2``
    are not used here.

    Returns a list of ten strings for a non-empty alignment (score,
    length, number of gaps, percent similarity, row from/to, row
    alignment, col from/to, col alignment) and a tuple of twelve "0"
    strings otherwise.
    """
    # NOTE(review): the empty case yields 12 fields, the non-empty case 10;
    # kept as found -- confirm which width callers expect.
    if not (result.getLength() > 0):
        return ("0",) * 12

    row_ali, col_ali = alignlib.writeAlignataCompressed(result)
    values = (
        result.getScore(),
        result.getLength(),
        result.getNumGaps(),
        alignlib.calculatePercentSimilarity(result),
        result.getRowFrom(),
        result.getRowTo(),
        row_ali,
        result.getColFrom(),
        result.getColTo(),
        col_ali,
    )
    return [str(value) for value in values]
def CheckResult(self, result, info1=None, info2=None):
    """Turn an alignment result into printable string fields.

    Every result is accepted by this implementation; ``info1`` and
    ``info2`` are ignored.

    For an alignment of positive length a list of ten strings is
    returned: score, length, number of gaps, percent similarity, first
    and last row residue, compressed row alignment, first and last col
    residue, compressed col alignment.  For an empty alignment a tuple
    of twelve "0" strings is returned.
    """
    if result.getLength() > 0:
        row_ali, col_ali = alignlib.writeAlignataCompressed(result)
        fields = []
        for value in (result.getScore(),
                      result.getLength(),
                      result.getNumGaps(),
                      alignlib.calculatePercentSimilarity(result),
                      result.getRowFrom(),
                      result.getRowTo(),
                      row_ali,
                      result.getColFrom(),
                      result.getColTo(),
                      col_ali):
            fields.append(str(value))
        return fields

    # NOTE(review): twelve placeholders here versus ten fields above;
    # preserved as found -- confirm the expected column count.
    return ("0",) * 12
def GetLinks( self, query_nid, query_from, query_to, query_ali, sbjct_nid, sbjct_from, sbjct_to, sbjct_ali): """returns all possible links between link split into domains. """ if self.mLogLevel >= 2: print "# processing", query_nid, sbjct_nid, query_from, query_to, sbjct_from, sbjct_to sys.stdout.flush() map_query2sbjct = alignlib.makeAlignataVector() alignlib.fillAlignataCompressed( map_query2sbjct, query_from, query_ali, sbjct_from, sbjct_ali ) # iterate over query for query_domain_from, query_domain_to, query_family in self.mDomains[query_nid]: # check if overlap overlap = min(query_to, query_domain_to)-max(query_from, query_domain_from) + 1 if overlap <= self.mMinOverlapResidues: continue # check for overlap with domains in sbjct for sbjct_domain_from, sbjct_domain_to, sbjct_family in self.mDomains[sbjct_nid]: overlap = min(sbjct_to, sbjct_domain_to)-max(sbjct_from, sbjct_domain_from) + 1 if overlap < self.mMinOverlapResidues: continue map_new_query2sbjct = alignlib.makeAlignataVector() alignlib.copyAlignata( map_new_query2sbjct, map_query2sbjct, query_domain_from, query_domain_to, sbjct_domain_from, sbjct_domain_to) if map_new_query2sbjct.getLength() > 0: row_ali, col_ali = alignlib.writeAlignataCompressed( map_new_query2sbjct ) print string.join( ("%s_%s_%s" % (query_nid, query_domain_from, query_domain_to), "%s_%s_%s" % (sbjct_nid, sbjct_domain_from, sbjct_domain_to), "0", str(map_new_query2sbjct.getRowFrom()), str(map_new_query2sbjct.getRowTo()), row_ali, str(map_new_query2sbjct.getColFrom()), str(map_new_query2sbjct.getColTo()), col_ali), "\t")
def Check(self): while 1: line = sys.stdin.readline() if not line: break try: (query_token, sbjct_token) = string.split(line[:-1], "\t")[:2] query_nid, query_from, query_to = map( string.atoi, string.split(query_token, "_")) sbjct_nid, sbjct_from, sbjct_to = map( string.atoi, string.split(sbjct_token, "_")) except ValueError: continue if self.mLogLevel >= 4: print "# --> checking link between %i (%i-%i) and %i (%i-%i)" % ( query_nid, query_from, query_to, sbjct_nid, sbjct_from, sbjct_to) sys.stdout.flush() passed, alignment = self.CheckLink(query_nid, query_from, query_to, sbjct_nid, sbjct_from, sbjct_to) if passed: print "+\t", else: print "-\t", if alignment.getLength() > 0: ali_row, ali_col = alignlib.writeAlignataCompressed(alignment) print line[:-1] + "\t" + string.join( map(str, (alignment.getRowFrom(), alignment.getRowTo(), ali_row, alignment.getColFrom(), alignment.getColTo(), ali_col, alignment.getScore(), alignment.getLength(), alignment.getNumGaps())), "\t") else: print line[:-1] + "\t" + string.join( map(str, (0, 0, "", 0, 0, "", 0, 0, 0)), "\t") sys.stdout.flush()
def WriteLine(self, outfile, nid, map_rep2domain, domain_id, family, additional_info):
    """Write one tab-separated row for loading into the table.

    The row coordinates (start/end) and the column coordinates
    (domain_from/domain_to) as well as both compressed alignment strings
    are taken from ``map_rep2domain``.  The values in ``additional_info``
    are appended, converted to strings, after the nine fixed columns.
    """
    row_from = map_rep2domain.getRowFrom()
    row_to = map_rep2domain.getRowTo()
    col_from = map_rep2domain.getColFrom()
    col_to = map_rep2domain.getColTo()

    row_ali, col_ali = alignlib.writeAlignataCompressed(map_rep2domain)

    fixed_columns = (nid, row_from, row_to, row_ali,
                     domain_id, col_from, col_to, col_ali,
                     family)

    columns = [str(value) for value in fixed_columns]
    columns += [str(value) for value in additional_info]

    outfile.write("\t".join(columns) + "\n")
def WriteLine(self, outfile, nid, map_rep2domain, domain_id, family, additional_info):
    """Append a single load-file line describing ``map_rep2domain``.

    Fixed columns, in order: nid, row from, row to, compressed row
    alignment, domain_id, col from, col to, compressed col alignment,
    family.  They are followed by the stringified entries of
    ``additional_info``; all columns are separated by tabs and the line
    is terminated by a newline.
    """
    start = map_rep2domain.getRowFrom()
    end = map_rep2domain.getRowTo()
    domain_from = map_rep2domain.getColFrom()
    domain_to = map_rep2domain.getColTo()

    rep_ali, domain_ali = alignlib.writeAlignataCompressed(map_rep2domain)

    head = map(str, (nid, start, end, rep_ali,
                     domain_id, domain_from, domain_to, domain_ali,
                     family))
    tail = map(str, additional_info)

    line = "\t".join(head + tail)
    outfile.write(line + "\n")
if cur_record is None: break sequences.append( (cur_record.title, alignlib.makeSequence(re.sub( " ", "", cur_record.sequence)) ) ) if options.filename_sequences: infile.close() alignator = alignlib.makeAlignatorFullDP( options.gop, options.gep ) map_a2b = alignlib.makeAlignataVector() nsequences = len(sequences) for x in range(0,nsequences-1): for y in range(x+1, nsequences): alignator.Align( sequences[x][1], sequences[y][1], map_a2b) row_ali, col_ali = alignlib.writeAlignataCompressed( map_a2b ) options.stdout.write( "%s\t%s\t%i\t%i\t%i\t%s\t%i\t%i\t%s\t%i\t%i\t%i\t%i\n" % (\ sequences[x][0], sequences[y][0], map_a2b.getScore(), map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali, map_a2b.getColFrom(), map_a2b.getColTo(), col_ali, map_a2b.getScore(), 100 * alignlib.calculatePercentIdentity( map_a2b, sequences[x][1], sequences[y][1]), sequences[x][1].getLength(), sequences[y][1].getLength() ))
def FillAlignments( self ): """the main all-vs-all alignments engine. """ sources = self.mTableSources.GetSources() nsources = len(sources) if self.mLogLevel >= 1: print "--> calculating %i alignments for %i sequences" % ((nsources * (nsources -1)) / 2, nsources) # calculate the alignanda objects if self.mLogLevel >= 1: print "--> retrieving alignanda objects" sys.stdout.flush() alignanda = self.CreateAlignandumObjects( sources ) map_query2sbjct = alignlib.makeAlignataVector() outfile = open (self.mTempFilename, "w" ) # do all vs all alignments for query in range(0, nsources - 1): query_id, query_alignandum = alignanda[query] start_time = time.time() if self.mLogLevel >= 2: print "processing id %i at %s" % (query_id, time.asctime(time.localtime(start_time))) sys.stdout.flush() for sbjct in range( query + 1, nsources): sbjct_id, sbjct_alignandum = alignanda[sbjct] self.mAlignator.Align( query_alignandum, sbjct_alignandum, map_query2sbjct ) (query_ali, sbjct_ali) = alignlib.writeAlignataCompressed( map_query2sbjct ) outfile.write( string.join( map( str, ( query_id, map_query2sbjct.getRowFrom(), map_query2sbjct.getRowTo(), query_ali, sbjct_id, map_query2sbjct.getColFrom(), map_query2sbjct.getColTo(), sbjct_ali, map_query2sbjct.getScore(), map_query2sbjct.getNumGaps(), map_query2sbjct.getLength(), 0)), "@") + "\n" ) stop_time = time.time() if self.mLogLevel >= 2: print "--> alignments: %5i, time: %7.2fs" %\ ( nsources - query - 1, stop_time - start_time) outfile.close() # load data self.mTableAlignments.Drop() self.mTableAlignments.Create() self.mTableAlignments.Load( self.mTempFilename)