def GetLinks(self, query_nid, query_from, query_to, query_ali, sbjct_nid, sbjct_from, sbjct_to, sbjct_ali): """returns all possible links between link split into domains. """ if self.mLogLevel >= 2: print "# processing", query_nid, sbjct_nid, query_from, query_to, sbjct_from, sbjct_to sys.stdout.flush() map_query2sbjct = alignlib.makeAlignataVector() alignlib.fillAlignataCompressed(map_query2sbjct, query_from, query_ali, sbjct_from, sbjct_ali) # iterate over query for query_domain_from, query_domain_to, query_family in self.mDomains[ query_nid]: # check if overlap overlap = min(query_to, query_domain_to) - max( query_from, query_domain_from) + 1 if overlap <= self.mMinOverlapResidues: continue # check for overlap with domains in sbjct for sbjct_domain_from, sbjct_domain_to, sbjct_family in self.mDomains[ sbjct_nid]: overlap = min(sbjct_to, sbjct_domain_to) - max( sbjct_from, sbjct_domain_from) + 1 if overlap < self.mMinOverlapResidues: continue map_new_query2sbjct = alignlib.makeAlignataVector() alignlib.copyAlignata(map_new_query2sbjct, map_query2sbjct, query_domain_from, query_domain_to, sbjct_domain_from, sbjct_domain_to) if map_new_query2sbjct.getLength() > 0: row_ali, col_ali = alignlib.writeAlignataCompressed( map_new_query2sbjct) print string.join( ("%s_%s_%s" % (query_nid, query_domain_from, query_domain_to), "%s_%s_%s" % (sbjct_nid, sbjct_domain_from, sbjct_domain_to), "0", str(map_new_query2sbjct.getRowFrom()), str(map_new_query2sbjct.getRowTo()), row_ali, str(map_new_query2sbjct.getColFrom()), str(map_new_query2sbjct.getColTo()), col_ali), "\t")
def CheckLink(self, query_nid, query_from, query_to, sbjct_nid, sbjct_from, sbjct_to): """check, whether link is faithfull. """ query_profile = self.GetProfile(query_nid) query_profile.useSegment(query_from, query_to) sbjct_profile = self.GetProfile(sbjct_nid) sbjct_profile.useSegment(sbjct_from, sbjct_to) alignator = alignlib.makeFullDP(self.mGop, self.mGep) result = alignlib.makeAlignataVector() alignator.Align(query_profile, sbjct_profile, result) if self.mLogLevel >= 3: print "# --> %i vs %i: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i" %\ (query_nid, sbjct_nid, result.getScore(), result.getLength(), result.getNumGaps(), result.getRowFrom(), result.getRowTo(), result.getColFrom(), result.getColTo()) sys.stdout.flush() query_profile.useFullLength() sbjct_profile.useFullLength() if result.getScore() > self.mMinAlignmentScore: return 1, result else: return 0, result
def PrintAlignedSequences(sequence1, sequence2, chain=None, format="modeller"): ## align sequences by identity seq_row = alignlib.makeSequence(sequence1) seq_col = alignlib.makeSequence(sequence2) alignator = alignlib.makeAlignatorFullDP(-0.0, -0.0) map_row2col = alignlib.makeAlignataVector() alignator.Align(seq_row, seq_col, map_row2col) lines = string.split( alignlib.writePairAlignment(seq_row, seq_col, map_row2col), "\n") if format == "modeller": first_res, sequence, last_res = string.split(lines[0], "\t") print ">P1;structure" print "structureX: %s : %s : %s : %s : %s : : : : " % ( "structure", first_res, "", last_res, "") print "%s*" % sequence first_res, sequence, last_res = string.split(lines[1], "\t") print ">P1;sequence" print "sequence:%s : %s : %s : %s : %s : : : : " % ( "sequence", first_res, "", last_res, "") print "%s*" % sequence else: print lines
def PrintAlignedSequences( sequence1, sequence2, chain = None, format="modeller" ): ## align sequences by identity seq_row = alignlib.makeSequence( sequence1 ) seq_col = alignlib.makeSequence( sequence2 ) alignator = alignlib.makeAlignatorFullDP( -0.0, -0.0 ) map_row2col = alignlib.makeAlignataVector() alignator.Align( seq_row, seq_col, map_row2col ) lines = string.split(alignlib.writePairAlignment( seq_row, seq_col, map_row2col ), "\n") if format == "modeller": first_res, sequence, last_res = string.split( lines[0], "\t" ) print ">P1;structure" print "structureX: %s : %s : %s : %s : %s : : : : " % ("structure", first_res, "" , last_res, "" ) print "%s*" % sequence first_res, sequence, last_res = string.split( lines[1], "\t" ) print ">P1;sequence" print "sequence:%s : %s : %s : %s : %s : : : : " % ("sequence" , first_res, "", last_res, "") print "%s*" % sequence else: print lines
def GetAlignmentBetweenCorrespondingAtoms( coordinates1, coordinates2, cutoff ):
    """Align two coordinate lists by spatial proximity.

    Every (x, y, z) entry of coordinates1 is compared with every entry of
    coordinates2; pairs whose Euclidean distance is at most cutoff are
    registered (1-based positions, value 1) as candidate matches. The
    candidates are then passed through the aligner returned by
    alignlib.makeAlignatorDotsSquared(0, 0, ...) over two placeholder
    sequences of "A" with the same lengths as the inputs.

    Returns the resulting alignment object, or None when three or fewer
    candidate pairs were found.
    """
    dots = alignlib.makeAlignataMatrixRow()

    for i, first in enumerate(coordinates1):
        x1, y1, z1 = first
        for j, second in enumerate(coordinates2):
            x2, y2, z2 = second
            dx = x1 - x2
            dy = y1 - y2
            dz = z1 - z2
            if math.sqrt(dx * dx + dy * dy + dz * dz) <= cutoff:
                dots.addPairExplicit(i + 1, j + 1, 1)

    # placeholder sequences: only their lengths are taken from the input
    seq1 = alignlib.makeSequence("A" * len(coordinates1))
    seq2 = alignlib.makeSequence("A" * len(coordinates2))

    if dots.getLength() <= 3:
        return None

    dottor = alignlib.makeAlignatorDummy(dots)
    aligner = alignlib.makeAlignatorDotsSquared(0, 0, dottor)
    map_a2b = alignlib.makeAlignataVector()
    aligner.Align(seq1, seq2, map_a2b)
    return map_a2b
def Split( self, max_gap_length ):
    """Split every stored alignment at long gaps and reload the table.

    All rows of the table self.name are read. Each row's alignment is
    rebuilt from its compressed form and cut into fragments by
    alignlib.splitAlignata(alignment, max_gap_length). This is necessary,
    as structural domains can be discontinuous. Every fragment is written
    with self.WriteLine to a temporary file; afterwards the table is
    dropped, re-created and loaded from that file.

    max_gap_length: gap length threshold handed to alignlib.splitAlignata.

    The first nine selected columns are interpreted here; any further
    columns (from self.GetAdditionalInfo()) are passed through to
    self.WriteLine unchanged.
    """
    statement = """ SELECT nid, start, end, rep_ali, domain_id, domain_from, domain_to, domain_ali, family %s FROM %s""" % (self.GetAdditionalInfo(), self.name )

    # NOTE: os.tempnam only returns a name and is documented as race-prone;
    # kept because the file is written under Pairsdb.PATH_LOAD.
    temp_filename = os.tempnam(Pairsdb.PATH_LOAD, "scmp")

    # run the query before opening the file so a failing query leaves no
    # open handle behind
    domains = self.Execute(statement).fetchall()

    outfile = open(temp_filename, "w")
    try:
        for domain in domains:
            (nid, start, end, rep_ali,
             domain_id, domain_from, domain_to, domain_ali,
             family) = domain[:9]

            map_rep2domains = alignlib.makeAlignataVector()
            alignlib.fillAlignataCompressed(map_rep2domains,
                                            start, rep_ali,
                                            domain_from, domain_ali)

            val = alignlib.splitAlignata(map_rep2domains, max_gap_length)
            fragments = [alignlib.AlignataPtr(x) for x in val]

            ## now write each fragment to the output
            for map_rep2domain in fragments:
                ## so that the object gets deleted, once it goes out of scope
                map_rep2domain.thisown = 1
                self.WriteLine(outfile, nid, map_rep2domain,
                               domain_id, family, domain[9:])
    finally:
        # close the file even when mapping or writing fails
        outfile.close()

    self.Drop()
    self.Create()
    self.Load(temp_filename)
def GetLinks( self, query_nid, query_from, query_to, query_ali, sbjct_nid, sbjct_from, sbjct_to, sbjct_ali): """returns all possible links between link split into domains. """ if self.mLogLevel >= 2: print "# processing", query_nid, sbjct_nid, query_from, query_to, sbjct_from, sbjct_to sys.stdout.flush() map_query2sbjct = alignlib.makeAlignataVector() alignlib.fillAlignataCompressed( map_query2sbjct, query_from, query_ali, sbjct_from, sbjct_ali ) # iterate over query for query_domain_from, query_domain_to, query_family in self.mDomains[query_nid]: # check if overlap overlap = min(query_to, query_domain_to)-max(query_from, query_domain_from) + 1 if overlap <= self.mMinOverlapResidues: continue # check for overlap with domains in sbjct for sbjct_domain_from, sbjct_domain_to, sbjct_family in self.mDomains[sbjct_nid]: overlap = min(sbjct_to, sbjct_domain_to)-max(sbjct_from, sbjct_domain_from) + 1 if overlap < self.mMinOverlapResidues: continue map_new_query2sbjct = alignlib.makeAlignataVector() alignlib.copyAlignata( map_new_query2sbjct, map_query2sbjct, query_domain_from, query_domain_to, sbjct_domain_from, sbjct_domain_to) if map_new_query2sbjct.getLength() > 0: row_ali, col_ali = alignlib.writeAlignataCompressed( map_new_query2sbjct ) print string.join( ("%s_%s_%s" % (query_nid, query_domain_from, query_domain_to), "%s_%s_%s" % (sbjct_nid, sbjct_domain_from, sbjct_domain_to), "0", str(map_new_query2sbjct.getRowFrom()), str(map_new_query2sbjct.getRowTo()), row_ali, str(map_new_query2sbjct.getColFrom()), str(map_new_query2sbjct.getColTo()), col_ali), "\t")
def CalculateMatches(self): """calculate all-vs-all alignments. """ if not self.mAlignanda: self.GetAlignanda() if self.mLogLevel >= 1: print "# --> calculating alignments for %i entries" % len( self.mAlignanda) print "# --> starting at:", Tools.GetTimeStamp() nalignanda = len(self.mAlignanda) for a1 in range(self.startAt, nalignanda - 1): if self.mLogLevel >= 1: print "# %5i/%5i at %s" % (a1, nalignanda, Tools.GetTimeStamp()) sys.stdout.flush() for a2 in range(a1 + 1, nalignanda): if self.mLogLevel >= 3: print "# aligning to %i" % (a2), self.mInformation[a2] sys.stdout.flush() result = alignlib.makeAlignataVector() self.mAlignator.Align(self.mAlignanda[a1], self.mAlignanda[a2], result) info = self.mAlignator.CheckResult(result, self.mInformation[a1], self.mInformation[a2]) if info: r = tuple(self.mInformation[a1]) + tuple( self.mInformation[a2]) + tuple(info) print string.join(r, "\t") sys.stdout.flush() self.mAlignanda[a1].Release() self.mAlignanda[a1] = None if self.mLogLevel >= 1: print "# --> finished at:", Tools.GetTimeStamp()
def getMapFromMali( seq1, seq2, gap_char = "-" ):
    """Build a residue-to-residue map from two rows of a multiple alignment.

    seq1 and seq2 are walked column by column (seq2 is indexed with the
    column numbers of seq1). A pair of 1-based residue positions is added
    to the map for every column in which neither row has gap_char and
    both characters are contained in string.uppercase. Residue counters
    advance on every non-gap character, including ones that do not
    qualify for a pair.

    Returns the alignment object holding the collected pairs.
    """
    map_a2b = alignlib.makeAlignataVector()

    row_pos = 0
    col_pos = 0

    # build map between genomic sequences:
    for column in range(len(seq1)):
        res1 = seq1[column]
        res2 = seq2[column]
        has_res1 = res1 != gap_char
        has_res2 = res2 != gap_char

        if has_res1 and has_res2 and \
                res1 in string.uppercase and res2 in string.uppercase:
            map_a2b.addPairExplicit(row_pos + 1, col_pos + 1, 0)

        if has_res1:
            row_pos += 1
        if has_res2:
            col_pos += 1

    return map_a2b
def CalculateMatches( self ): """calculate all-vs-all alignments. """ if not self.mAlignanda: self.GetAlignanda() if self.mLogLevel >= 1: print "# --> calculating alignments for %i entries" % len(self.mAlignanda) print "# --> starting at:", Tools.GetTimeStamp() nalignanda = len(self.mAlignanda) for a1 in range(self.startAt, nalignanda-1): if self.mLogLevel >= 1: print "# %5i/%5i at %s" % (a1, nalignanda, Tools.GetTimeStamp()) sys.stdout.flush() for a2 in range(a1+1,nalignanda): if self.mLogLevel >= 3: print "# aligning to %i" % (a2), self.mInformation[a2] sys.stdout.flush() result = alignlib.makeAlignataVector() self.mAlignator.Align( self.mAlignanda[a1], self.mAlignanda[a2], result ) info = self.mAlignator.CheckResult( result, self.mInformation[a1], self.mInformation[a2] ) if info: r = tuple(self.mInformation[a1]) + tuple(self.mInformation[a2]) + tuple(info) print string.join(r, "\t" ) sys.stdout.flush() self.mAlignanda[a1].Release() self.mAlignanda[a1] = None if self.mLogLevel >= 1: print "# --> finished at:", Tools.GetTimeStamp()
def BuildBLASTMatrix( dbhandle, query_nid, resolution=1.0, table_name=None, combine_repeats=None, max_evalue=None, min_evalue=None, residue_level=None, parser=None, add_self=None, ): """build matrix based on BLAST alignments to query_nid. matrix of size N*M N: number of neighbours M: length of query (scaled with resolution) alignments are truncated. the query is included in the matrix. if combine_repeats is set, multiple alignments between the query and a sbjct will be entered into the same row. if residue_level is set, entries are added on the residue level. The resolution parameter is ignored. """ if residue_level: query_length = Table_nrdb(dbhandle).GetLength(query_nid) else: query_length = int(math.floor(float(Table_nrdb(dbhandle).GetLength(query_nid)) / float(resolution))) tbl_pairsdb_90x90 = TablePairsdbNeighbours(dbhandle) if table_name: tbl_pairsdb_90x90.SetName(table_name) neighbours = tbl_pairsdb_90x90.GetNeighbours( query_nid, sort_order=3, skip_query=add_self, min_evalue=min_evalue, max_evalue=max_evalue ) nindex = {} nneighbours = 0 if combine_repeats: for neighbour in neighbours: ( query_from, query_to, query_ali, sbjct_nid, sbjct_from, sbjct_to, sbjct_ali, score, pide, evalue, ) = neighbour if not nindex.has_key(sbjct_nid): nindex[sbjct_nid] = nneighbours nneighbours += 1 else: nneighbours = len(neighbours) if add_self: nneighbours += 1 matrix = numpy.zeros((nneighbours, query_length), numpy.int) if add_self: matrix[0, 0:query_length] = 1 row = 1 else: row = 0 for neighbour in neighbours: (query_from, query_to, query_ali, sbjct_nid, sbjct_from, sbjct_to, sbjct_ali, score, pide, evalue) = neighbour if combine_repeats: use_row = nindex[sbjct_nid] else: use_row = row row += 1 if residue_level: map_sbjct2query = alignlib.makeAlignataVector() alignlib.fillAlignataCompressed(map_sbjct2query, sbjct_from, sbjct_ali, query_from, query_ali) if parser: parser(map_sbjct2query) for x in range(sbjct_from, sbjct_to + 1): y = map_sbjct2query.mapRowToCol(x) if y: 
try: matrix[use_row, y - 1] = 1 except IndexError: print "IndexError in ", query_nid, sbjct_nid, x, y - 1, query_length else: yfrom = int(math.floor(query_from / resolution)) yto = int(math.floor(query_to / resolution)) matrix[use_row, yfrom:yto] = 1 return matrix
def MapAndAddDomains( self, domains): """Map a domain using an alignment and write to outputfile for loading into table. map info from member to rep domains contains the following information: nid, # nid of new rep info_mem_from, info_mem_to, info_mem_ali, # information to be mapped on mem info_from, info_to, info_ali, # information to be mapped on other quantity start, end, rep_ali, # map between mem and new rep, rep-part mem_from, mem_to, mem_ali, # map between mem and new rep, mem-part ...info-fields """ temp_filename = os.tempnam( Pairsdb.PATH_LOAD, "clup" ) failed = 0 outfile = open(temp_filename, "w") for domain in domains: ( nid, info_mem_from, info_mem_to, info_mem_ali, info_from, info_to, info_ali, start, end, rep_ali, mem_from, mem_to, mem_ali, domain_id, family) = domain[:15] # set does not work (for example 1b77 for ddd, obscure error, probably # due to destructors? map_info_mem2info = alignlib.makeAlignataVector() alignlib.fillAlignataCompressed( map_info_mem2info, info_mem_from, info_mem_ali.tostring(), info_from, info_ali.tostring() ) map_rep2mem = alignlib.makeAlignataVector() alignlib.fillAlignataCompressed( map_rep2mem, start, rep_ali.tostring(), mem_from, mem_ali.tostring() ) map_rep2info = alignlib.makeAlignataVector() alignlib.combineAlignata( map_rep2info, map_rep2mem, map_info_mem2info, alignlib.CR) if map_rep2info.getLength() == 0: if self.mLogLevel >= 2: print "----> mapping failed for", domain sys.stdout.flush() failed += 1 else: self.WriteLine( outfile, nid, map_rep2info, domain_id, family, domain[15:]) outfile.close() self.Load( temp_filename ) if self.mLogLevel >= 1: print "--> mapping failed for %i pairings." % failed sys.stdout.flush() return failed
def BuildBLASTMatrix( dbhandle, query_nid, resolution = 1.0, table_name = None, combine_repeats = None, max_evalue = None, min_evalue = None, residue_level = None, parser = None, add_self = None): """build matrix based on BLAST alignments to query_nid. matrix of size N*M N: number of neighbours M: length of query (scaled with resolution) alignments are truncated. the query is included in the matrix. if combine_repeats is set, multiple alignments between the query and a sbjct will be entered into the same row. if residue_level is set, entries are added on the residue level. The resolution parameter is ignored. """ if residue_level: query_length = Table_nrdb(dbhandle).GetLength( query_nid ) else: query_length = int( math.floor( float(Table_nrdb(dbhandle).GetLength( query_nid )) / float(resolution))) tbl_pairsdb_90x90 = TablePairsdbNeighbours( dbhandle ) if table_name: tbl_pairsdb_90x90.SetName( table_name ) neighbours = tbl_pairsdb_90x90.GetNeighbours( query_nid, sort_order = 3, skip_query = add_self, min_evalue = min_evalue, max_evalue = max_evalue) nindex = {} nneighbours = 0 if combine_repeats: for neighbour in neighbours: (query_from, query_to, query_ali, sbjct_nid, sbjct_from, sbjct_to, sbjct_ali, score, pide, evalue) = neighbour if not nindex.has_key(sbjct_nid): nindex[sbjct_nid] = nneighbours nneighbours += 1 else: nneighbours = len(neighbours) if add_self: nneighbours += 1 matrix = numpy.zeros( (nneighbours, query_length), numpy.int) if add_self: matrix[0, 0:query_length] = 1 row = 1 else: row = 0 for neighbour in neighbours: (query_from, query_to, query_ali, sbjct_nid, sbjct_from, sbjct_to, sbjct_ali, score, pide, evalue) = neighbour if combine_repeats: use_row = nindex[sbjct_nid] else: use_row = row row += 1 if residue_level: map_sbjct2query = alignlib.makeAlignataVector() alignlib.fillAlignataCompressed( map_sbjct2query, sbjct_from, sbjct_ali, query_from, query_ali ) if parser: parser( map_sbjct2query ) for x in range(sbjct_from, sbjct_to + 1): y = 
map_sbjct2query.mapRowToCol(x) if y: try: matrix[use_row, y-1] = 1 except IndexError: print "IndexError in ", query_nid, sbjct_nid, x, y-1, query_length else: yfrom = int(math.floor(query_from/resolution)) yto = int(math.floor(query_to/resolution)) matrix[use_row, yfrom:yto] = 1 return matrix
# Keep one sequence per token: an entry is only overwritten when the new
# sequence is strictly longer than the stored one.
for token, sequence in lines:
    if param_sequences.has_key(token):
        if len(param_sequences[token]) >= len(sequence): continue
    param_sequences[token] = sequence

tbl_nrdb = Table_nrdb( dbhandle )
query_sequence = None

# Two multiple-alignment flavours; param_compression is only used by the
# "dots" variant.
if param_unaligned:
    mali = alignlib.makeMultipleAlignmentDots(param_compression)
else:
    mali = alignlib.makeMultipleAlignment()

map_query2sbjct = alignlib.makeAlignataVector()

# Read all of stdin: drop lines starting with "#", strip the last
# character of each line (presumably the newline) and keep the first nine
# tab-separated fields. Note that this rebinds the name "lines".
lines = map( lambda x: string.split( x[:-1], "\t")[:9], filter( lambda x: x[0] != "#", sys.stdin.readlines()))

# Optional re-ordering: sort by the rank stored in param_sort_order for
# the second field of each line; unknown identifiers get the rank
# len(param_sort_order) and therefore sort after all known ones.
if param_sort_order:
    data = []
    for line in lines:
        sbjct_nid = line[1]
        if param_sort_order.has_key( sbjct_nid ):
            o = param_sort_order[sbjct_nid]
        else:
            o = len(param_sort_order)
        data.append( (o, line) )
    data.sort()
    lines = map( lambda x: x[1], data)
def CheckLink( self, query_nid, query_from, query_to, sbjct_nid, sbjct_from, sbjct_to): """check, whether link is faithfull. """ query_profile = self.GetProfile( query_nid ) query_profile.useSegment( query_from, query_to) sbjct_profile = self.GetProfile( sbjct_nid ) sbjct_profile.useSegment( sbjct_from, sbjct_to) alignator = alignlib.makeFullDP( self.mGop, self.mGep ) result = alignlib.makeAlignataVector() alignator.Align( query_profile, sbjct_profile, result) if result.getLength() == 0: query_profile.useFullLength() sbjct_profile.useFullLength() return 0, result if self.mLogLevel >= 3: print "# --> %i vs %i: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i" %\ (query_nid, sbjct_nid, result.getScore(), result.getLength(), result.getNumGaps(), result.getRowFrom(), result.getRowTo(), result.getColFrom(), result.getColTo()) sys.stdout.flush() if result.getScore() < self.mMinAlignmentScore: query_profile.useFullLength() sbjct_profile.useFullLength() return 0, result if result.getScore() > self.mSafetyThreshold * self.mMinAlignmentScore: query_profile.useFullLength() sbjct_profile.useFullLength() return 1,result z_params = alignlib.makeNormalDistributionParameters() alignlib.calculateZScoreParameters( z_params, query_profile, sbjct_profile, alignator, self.mNumIterationsZScore) mean = z_params.getMean() stddev = z_params.getStandardDeviation() if stddev == 0: stddev = 1 zscore = (result.getScore() - mean) / stddev if self.mLogLevel >= 3: print "# --> mean=%f, stdev=%f, zscore=%f" % (mean, stddev, zscore) sys.stdout.flush() query_profile.useFullLength() sbjct_profile.useFullLength() if zscore > self.mMinZScore: return 1,result else: return 0,result
# Load the peptide sequences and replace each one by the object returned
# from alignlib.makeSequence (the dictionary is modified in place).
sequences = Genomics.ReadPeptideSequences( open(options.filename_sequences, "r") )

if options.loglevel >= 1:
    print "# read %i sequences" % len(sequences)

for k in sequences.keys():
    sequences[k] = alignlib.makeSequence( sequences[k] )

if options.loglevel >= 2:
    print "# converted %i sequences" % len(sequences)

# counters; only ninput and nskipped are updated in this part of the code
ninput, noutput, nskipped, nfailed = 0, 0, 0, 0

# link and ali are created once and reused for every input line
link = BlastAlignments.Link()

ali = alignlib.makeAlignataVector()

for line in sys.stdin:

    # skip comment lines
    if line[0] == "#": continue

    link.Read( line )
    ninput += 1

    # both partners of the link need a known sequence
    if link.mQueryToken not in sequences or link.mSbjctToken not in sequences:
        nskipped += 1
        continue

    # reset the shared alignment object before refilling it
    ali.Clear()
    alignlib.fillAlignataCompressed( ali, link.mQueryFrom, link.mQueryAli, link.mSbjctFrom, link.mSbjctAli )
# drop the first entry of options.parameters
del options.parameters[0]

old_length = mali.getLength()

new_mali = convertMali2Mali( mali )

# "sw" selects makeAlignatorFullDP, anything else makeAlignatorFullDPGlobal;
# both get the same gap penalties.
if options.alignment_method == "sw":
    alignator = alignlib.makeAlignatorFullDP( options.gop, options.gep )
else:
    alignator = alignlib.makeAlignatorFullDPGlobal( options.gop, options.gep )

# Process records until the iterator yields None.
while 1:
    cur_record = iterator.next()
    if cur_record is None: break

    map_mali2seq = alignlib.makeAlignataVector()

    sequence = alignlib.makeSequence( cur_record.sequence )
    # the profile is rebuilt from new_mali for every record
    profile = alignlib.makeProfileFromMali( new_mali )

    if options.loglevel >= 4:
        options.stdlog.write(profile.Write())

    alignator.Align( profile, sequence, map_mali2seq )

    if options.loglevel >= 3:
        options.stdlog.write( map_mali2seq.Write() )

    ## add sequence to mali
    a = alignlib.makeAlignatumFromString( cur_record.sequence )
    # NOTE(review): presumably releases Python-side ownership of the
    # wrapped object so that it is not freed here -- confirm against the
    # code that consumes "a".
    a.thisown = 0
infile = sys.stdin parser = FastaIterator.FastaIterator( infile ) sequences = [] while 1: cur_record = iterator.next() if cur_record is None: break sequences.append( (cur_record.title, alignlib.makeSequence(re.sub( " ", "", cur_record.sequence)) ) ) if options.filename_sequences: infile.close() alignator = alignlib.makeAlignatorFullDP( options.gop, options.gep ) map_a2b = alignlib.makeAlignataVector() nsequences = len(sequences) for x in range(0,nsequences-1): for y in range(x+1, nsequences): alignator.Align( sequences[x][1], sequences[y][1], map_a2b) row_ali, col_ali = alignlib.writeAlignataCompressed( map_a2b ) options.stdout.write( "%s\t%s\t%i\t%i\t%i\t%s\t%i\t%i\t%s\t%i\t%i\t%i\t%i\n" % (\ sequences[x][0], sequences[y][0], map_a2b.getScore(), map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali, map_a2b.getColFrom(),
def FillAlignments( self ): """the main all-vs-all alignments engine. """ sources = self.mTableSources.GetSources() nsources = len(sources) if self.mLogLevel >= 1: print "--> calculating %i alignments for %i sequences" % ((nsources * (nsources -1)) / 2, nsources) # calculate the alignanda objects if self.mLogLevel >= 1: print "--> retrieving alignanda objects" sys.stdout.flush() alignanda = self.CreateAlignandumObjects( sources ) map_query2sbjct = alignlib.makeAlignataVector() outfile = open (self.mTempFilename, "w" ) # do all vs all alignments for query in range(0, nsources - 1): query_id, query_alignandum = alignanda[query] start_time = time.time() if self.mLogLevel >= 2: print "processing id %i at %s" % (query_id, time.asctime(time.localtime(start_time))) sys.stdout.flush() for sbjct in range( query + 1, nsources): sbjct_id, sbjct_alignandum = alignanda[sbjct] self.mAlignator.Align( query_alignandum, sbjct_alignandum, map_query2sbjct ) (query_ali, sbjct_ali) = alignlib.writeAlignataCompressed( map_query2sbjct ) outfile.write( string.join( map( str, ( query_id, map_query2sbjct.getRowFrom(), map_query2sbjct.getRowTo(), query_ali, sbjct_id, map_query2sbjct.getColFrom(), map_query2sbjct.getColTo(), sbjct_ali, map_query2sbjct.getScore(), map_query2sbjct.getNumGaps(), map_query2sbjct.getLength(), 0)), "@") + "\n" ) stop_time = time.time() if self.mLogLevel >= 2: print "--> alignments: %5i, time: %7.2fs" %\ ( nsources - query - 1, stop_time - start_time) outfile.close() # load data self.mTableAlignments.Drop() self.mTableAlignments.Create() self.mTableAlignments.Load( self.mTempFilename)
def CheckLink(self, query_nid, query_from, query_to, sbjct_nid, sbjct_from, sbjct_to): """check, whether link is faithfull. """ query_profile = self.GetProfile(query_nid) query_profile.useSegment(query_from, query_to) sbjct_profile = self.GetProfile(sbjct_nid) sbjct_profile.useSegment(sbjct_from, sbjct_to) alignator = alignlib.makeFullDP(self.mGop, self.mGep) result = alignlib.makeAlignataVector() alignator.Align(query_profile, sbjct_profile, result) if result.getLength() == 0: query_profile.useFullLength() sbjct_profile.useFullLength() return 0, result if self.mLogLevel >= 3: print "# --> %i vs %i: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i" %\ (query_nid, sbjct_nid, result.getScore(), result.getLength(), result.getNumGaps(), result.getRowFrom(), result.getRowTo(), result.getColFrom(), result.getColTo()) sys.stdout.flush() if result.getScore() < self.mMinAlignmentScore: query_profile.useFullLength() sbjct_profile.useFullLength() return 0, result if result.getScore() > self.mSafetyThreshold * self.mMinAlignmentScore: query_profile.useFullLength() sbjct_profile.useFullLength() return 1, result z_params = alignlib.makeNormalDistributionParameters() alignlib.calculateZScoreParameters(z_params, query_profile, sbjct_profile, alignator, self.mNumIterationsZScore) mean = z_params.getMean() stddev = z_params.getStandardDeviation() if stddev == 0: stddev = 1 zscore = (result.getScore() - mean) / stddev if self.mLogLevel >= 3: print "# --> mean=%f, stdev=%f, zscore=%f" % (mean, stddev, zscore) sys.stdout.flush() query_profile.useFullLength() sbjct_profile.useFullLength() if zscore > self.mMinZScore: return 1, result else: return 0, result
def buildMapPdb2Sequence( sequence, filename_pdb, options, pdb_chain = ""):
    """build a map for residue numbers in pdb file to residue numbers on a sequence.

    The peptide chain whose chain_id equals pdb_chain is read from
    filename_pdb, its residues are translated through AMINOACIDS into a
    sequence string, and that string is aligned to sequence.

    returns the following values:

    map_structure2seq: mapping of residue numbers between structure and
        sequence. These are mappings that will work if you "renumber"
        the structure.
    map_pdb2seq, map_seq2pdb: mapping according to residue numbers in pdb
        file (pdb residue numbers are stored as strings).
    residue_number - 1: the number of residues visited in the chain,
        minus one.
    the residue numbers (as strings) of the first and last residue of the
        chain.
    the sequence string built from the chain.

    If filename_pdb does not exist, (None, None) is returned.

    NOTE(review): the missing-file path returns a 2-tuple while the
    success path returns a 7-tuple -- confirm that callers handle both.
    NOTE(review): as laid out here, nothing is returned explicitly when no
    chain matches pdb_chain -- confirm the intended behaviour.
    """

    if not os.path.exists( filename_pdb ):
        return None, None

    structure = Scientific.IO.PDB.Structure( filename_pdb )

    map_pdb2seq = {}
    map_seq2pdb = {}

    for chain in structure.peptide_chains:

        if chain.chain_id == pdb_chain:

            ## align pdb sequence to sequence
            map_structure2seq = alignlib.makeAlignataVector()
            alignator = alignlib.makeFullDP( -10.0, -2.0 )

            ## build sequence of pdb file
            # from here on the name "structure" holds the sequence string,
            # no longer the parsed PDB structure
            structure = ""
            for residue in chain.sequence():
                structure += AMINOACIDS[residue]

            ## align reference sequence to sequence of pdb file
            row = alignlib.makeSequence( structure )
            col = alignlib.makeSequence( sequence )
            alignator.Align(row, col, map_structure2seq)

            if options.loglevel >= 3:
                options.stdlog.write( "structure: %s\n" % structure )
                options.stdlog.write( "sequence : %s\n" % sequence )
                options.stdlog.write( "alignment of structure to sequence:\n" )
                options.stdlog.write( alignlib.writePairAlignment( row, col, map_structure2seq ) + "\n" )

            # print alignlib.writeAlignataTable(map_structure2seq)

            # residue_number is the 1-based position within the chain and
            # is used as the row index into the alignment
            residue_number = 0

            for residue in chain.residues:

                residue_number += 1

                mapped_residue = map_structure2seq.mapRowToCol(residue_number)

                # a falsy value means the residue is not aligned to the sequence
                if not mapped_residue:
                    if options.loglevel >= 3:
                        options.stdlog.write( "# skipped residue %s=%s %i\n" % (str(residue.number), residue.name, residue_number))
                    continue

                r = str(residue.number)
                map_pdb2seq[r] = mapped_residue
                map_seq2pdb[mapped_residue] = r

            return map_structure2seq, map_pdb2seq, map_seq2pdb, residue_number-1, str(chain.residues[0].number), str(chain.residues[-1].number), structure