def PrintAlignedSequences(sequence1, sequence2, chain=None, format="modeller"):
    """Align two sequences and print the pairwise alignment to stdout.

    Both inputs are wrapped with ``alignlib.makeSequence`` and aligned with
    the aligner returned by ``alignlib.makeAlignatorFullDP(-0.0, -0.0)``.

    Arguments:
        sequence1 -- row sequence; printed as the ">P1;structure" entry.
        sequence2 -- column sequence; printed as the ">P1;sequence" entry.
        chain     -- accepted for interface compatibility; currently unused.
        format    -- "modeller" prints the two entries described above; any
                     other value prints the list of raw alignment lines.

    Returns None.
    """
    # align sequences by identity
    seq_row = alignlib.makeSequence(sequence1)
    seq_col = alignlib.makeSequence(sequence2)
    alignator = alignlib.makeAlignatorFullDP(-0.0, -0.0)
    map_row2col = alignlib.makeAlignataVector()
    alignator.Align(seq_row, seq_col, map_row2col)

    # str.split replaces the deprecated string.split() module function.
    lines = alignlib.writePairAlignment(seq_row, seq_col, map_row2col).split("\n")

    if format == "modeller":
        # Each alignment line is expected to hold exactly three tab-separated
        # fields: first residue, aligned sequence, last residue.
        first_res, sequence, last_res = lines[0].split("\t")
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print(">P1;structure")
        print("structureX: %s : %s : %s : %s : %s : : : : " % (
            "structure", first_res, "", last_res, ""))
        print("%s*" % sequence)

        first_res, sequence, last_res = lines[1].split("\t")
        print(">P1;sequence")
        print("sequence:%s : %s : %s : %s : %s : : : : " % (
            "sequence", first_res, "", last_res, ""))
        print("%s*" % sequence)
    else:
        print(lines)
def PrintAlignedSequences(sequence1, sequence2, chain=None, format="modeller"):
    """Print the pairwise alignment of two sequences to stdout.

    The sequences are converted with ``alignlib.makeSequence`` and aligned
    using ``alignlib.makeAlignatorFullDP(-0.0, -0.0)``; the textual alignment
    comes from ``alignlib.writePairAlignment``.

    Arguments:
        sequence1 -- row sequence, reported under ">P1;structure".
        sequence2 -- column sequence, reported under ">P1;sequence".
        chain     -- not used by this function; kept for compatibility.
        format    -- with "modeller" two entries are printed (header line,
                     description line, sequence terminated by "*"); with
                     any other value the raw list of lines is printed.

    Returns None.
    """
    # align sequences by identity
    seq_row = alignlib.makeSequence(sequence1)
    seq_col = alignlib.makeSequence(sequence2)
    alignator = alignlib.makeAlignatorFullDP(-0.0, -0.0)
    map_row2col = alignlib.makeAlignataVector()
    alignator.Align(seq_row, seq_col, map_row2col)

    # Use the str method instead of the deprecated string.split() function.
    lines = alignlib.writePairAlignment(seq_row, seq_col, map_row2col).split("\n")

    if format != "modeller":
        print(lines)
        return

    # (index of alignment line, header, description template, entry name)
    entries = (
        (0, ">P1;structure",
         "structureX: %s : %s : %s : %s : %s : : : : ", "structure"),
        (1, ">P1;sequence",
         "sequence:%s : %s : %s : %s : %s : : : : ", "sequence"),
    )
    for index, header, template, name in entries:
        # The line is indexed and split inside the loop so that a malformed
        # second line only fails after the first entry has been printed.
        first_res, sequence, last_res = lines[index].split("\t")
        # Single-argument print(...) is valid on both Python 2 and 3.
        print(header)
        print(template % (name, first_res, "", last_res, ""))
        print("%s*" % sequence)
if map_query2sbjct.getRowTo() > len(cds_fragment): print "# ERROR: length mismatch: cds fragment (%i) shorter than last aligned residue (%i)" %\ (len(cds_fragment), map_query2sbjct.getRowTo()) print "#", line print "# cds" print "#", cds_fragment print "# genomic" print "#",genomic_fragment continue cds_seq = alignlib.makeSequence( cds_fragment ) genomic_seq = alignlib.makeSequence( genomic_fragment ) data = map( lambda x: string.split(x, "\t"), string.split( alignlib.writePairAlignment( cds_seq, genomic_seq, map_query2sbjct ), "\n" )) row_ali, col_ali = Genomics.RemoveFrameShiftsFromAlignment(data[0][1], data[1][1]) row_ali = Genomics.MaskStopCodons( row_ali ) col_ali = Genomics.MaskStopCodons( col_ali ) if len(row_ali) != len(col_ali): print "# ERROR: wrong alignment lengths." sys.exit(1) if len(row_ali) % 3 or len(col_ali) % 3: print line print row_ali
def buildMapPdb2Sequence(sequence, filename_pdb, options, pdb_chain=""):
    """Relate the residue numbering of one PDB chain to positions in a sequence.

    The first peptide chain of ``filename_pdb`` whose ``chain_id`` equals
    ``pdb_chain`` is turned into a string through the AMINOACIDS table and
    aligned (as row) against ``sequence`` (as column).

    Arguments:
        sequence     -- sequence the structure is aligned to.
        filename_pdb -- path of the PDB file.
        options      -- object providing ``loglevel`` and ``stdlog``; the
                        alignment and every skipped residue are written to
                        ``stdlog`` when ``loglevel`` is 3 or higher.
        pdb_chain    -- chain identifier to look for.

    Returns:
        ``(None, None)`` if ``filename_pdb`` does not exist.

        For a matching chain, a 7-tuple:
            map_structure2seq -- alignment between structure and sequence
                                 positions; these mappings work if the
                                 structure is "renumbered".
            map_pdb2seq       -- dict from ``str(residue.number)`` as found
                                 in the PDB file to the aligned sequence
                                 position.
            map_seq2pdb       -- the reverse dict.
            number of residues in the chain minus one
            ``str()`` of the number of the first residue in the chain
            ``str()`` of the number of the last residue in the chain
            the chain as a string built from AMINOACIDS

        If no chain matches, the loop ends without a return statement and
        the result is None.
    """
    if not os.path.exists(filename_pdb):
        return None, None

    pdb_structure = Scientific.IO.PDB.Structure(filename_pdb)

    map_pdb2seq = {}
    map_seq2pdb = {}

    for chain in pdb_structure.peptide_chains:
        if chain.chain_id != pdb_chain:
            continue

        # align pdb sequence to sequence
        map_structure2seq = alignlib.makeAlignataVector()
        alignator = alignlib.makeFullDP(-10.0, -2.0)

        # build sequence of pdb file
        # presumably AMINOACIDS maps residue names to one-letter codes
        chain_string = "".join([AMINOACIDS[name] for name in chain.sequence()])

        # align reference sequence to sequence of pdb file
        row = alignlib.makeSequence(chain_string)
        col = alignlib.makeSequence(sequence)
        alignator.Align(row, col, map_structure2seq)

        verbose = options.loglevel >= 3
        if verbose:
            log = options.stdlog.write
            log("structure: %s\n" % chain_string)
            log("sequence : %s\n" % sequence)
            log("alignment of structure to sequence:\n")
            log(alignlib.writePairAlignment(row, col, map_structure2seq) + "\n")

        # residue_number is the 1-based position of the residue in the chain
        residue_number = 0
        for residue in chain.residues:
            residue_number += 1
            mapped_residue = map_structure2seq.mapRowToCol(residue_number)

            if mapped_residue:
                pdb_number = str(residue.number)
                map_pdb2seq[pdb_number] = mapped_residue
                map_seq2pdb[mapped_residue] = pdb_number
            elif verbose:
                # a falsy result means the residue is not aligned
                options.stdlog.write(
                    "# skipped residue %s=%s %i\n"
                    % (str(residue.number), residue.name, residue_number))

        return (map_structure2seq,
                map_pdb2seq,
                map_seq2pdb,
                residue_number - 1,
                str(chain.residues[0].number),
                str(chain.residues[-1].number),
                chain_string)
# Read alignment links from stdin, rebuild each pairwise alignment and print
# it; finish with a summary line of the counters.
for line in sys.stdin:
    # skip comment lines
    if line.startswith("#"):
        continue

    link.Read(line)
    ninput += 1

    # both sequences are needed to write out the alignment
    if link.mQueryToken not in sequences or link.mSbjctToken not in sequences:
        nskipped += 1
        continue

    ali.Clear()
    alignlib.fillAlignataCompressed(
        ali,
        link.mQueryFrom, link.mQueryAli,
        link.mSbjctFrom, link.mSbjctAli)

    result = alignlib.writePairAlignment(
        sequences[link.mQueryToken],
        sequences[link.mSbjctToken],
        ali).split("\n")

    # Anything other than three fields after splitting is counted as a
    # failure. Such a link is skipped so that it is neither indexed below
    # (which could raise IndexError) nor counted as output.
    if len(result) != 3:
        nfailed += 1
        continue

    if options.format == "fasta":
        # The aligned string is the second tab-separated field of each line.
        # Single-argument print(...) works on both Python 2 and 3.
        print(">%s %i-%i\n%s\n>%s %i-%i\n%s\n" % (
            link.mQueryToken, link.mQueryFrom, link.mQueryTo,
            result[0].split("\t")[1],
            link.mSbjctToken, link.mSbjctFrom, link.mSbjctTo,
            result[1].split("\t")[1]))

    noutput += 1

print("# ninput=%i, noutput=%i, nskipped=%i, nfailed=%i" % (
    ninput, noutput, nskipped, nfailed))

E.Stop()