def get_object_sequence(selection='sele'):
    '''
    DESCRIPTION
        Print the FASTA output of the object that the selection belongs to.

    :param selection: the selected object in pymol
    :return: None -- the sequence is printed, not returned
    '''
    object_name = hf.get_selection_details(selection)['pdb_object_name']
    # print() call form: the Python 2 print statement is a syntax error on
    # Python 3, while this spelling works on both.
    print(cmd.get_fastastr(object_name))
def testDoubleRNAFASTA(self):
    """Build B-form RNA with dbl_helix=1 and check the FASTA output."""
    sequence = "AUUUUUUUCG"
    cmd.fnab(input=sequence, mode="RNA", form="B", dbl_helix=1)
    fasta_lines = cmd.get_fastastr().splitlines()
    # Two lines in total are expected; the second one is the sense strand.
    self.assertEqual(len(fasta_lines), 2)
    self.assertEqual(sequence, fasta_lines[1])
def get_sequence(obj):
    """Return the protein sequence of ``obj`` with all chains concatenated."""

    def chain_residues(chain):
        fasta = cmd.get_fastastr(f'{obj} and chain {chain} and polymer.protein')
        # The first whitespace-separated token is the FASTA header; drop it.
        return ''.join(fasta.split()[1:])

    return ''.join(chain_residues(chain) for chain in cmd.get_chains(obj))
def testDoubleDNAFASTA(self):
    """fnab yields both DNA strands with dbl_helix=1 and with the default."""
    dna = "ATCCCCG"
    revcomp = "CGGGGAT"

    def check_strands():
        # Lines 1 and 3 of the FASTA text hold the two strand sequences.
        lines = cmd.get_fastastr().splitlines()
        sense_strand, antisense_strand = lines[1], lines[3]
        self.assertEqual(dna, sense_strand)
        self.assertEqual(revcomp, antisense_strand)

    # Explicit double helix.
    cmd.fnab(input=dna, dbl_helix=1)
    check_strands()

    # Start over and rely on the default dbl_helix value.
    cmd.delete('all')
    cmd.fnab(input=dna)
    check_strands()
def test_CanGetNewRNASequence(self):
    """Mutate /1rna/A/A/14 to guanine and check the resulting sequence."""
    cmd.load(self.datafile("1rna.cif"))
    cmd.wizard("nucmutagenesis")
    cmd.select("/1rna/A/A/14")

    wizard = cmd.get_wizard()
    wizard.mode = "Guanine"
    wizard.do_select("sele")
    wizard.apply()

    fasta_lines = cmd.get_fastastr("/1rna/A/A").splitlines()
    self.assertEqual(fasta_lines[1], "UUAUAUAUAUAUAG")
def test_CanGetNewDNASequence(self):
    """Mutate /1bna/A/A/1 to adenine and check the resulting sequence."""
    cmd.load(self.datafile("1bna.cif"))
    cmd.wizard("nucmutagenesis")
    cmd.select("/1bna/A/A/1")

    wizard = cmd.get_wizard()
    wizard.mode = "Adenine"
    wizard.do_select("sele")
    wizard.apply()

    fasta_lines = cmd.get_fastastr("/1bna/A/A").splitlines()
    self.assertEqual(fasta_lines[1], "AGCGAATTCGCG")
def test_CanMutateNonCanonicalNucleo(self):
    """Mutate /1k5e/A/A/6 to adenine and check the resulting sequence."""
    cmd.load(self.datafile("1k5e.cif"))
    cmd.wizard("nucmutagenesis")
    cmd.select("/1k5e/A/A/6")

    wizard = cmd.get_wizard()
    wizard.mode = "Adenine"
    wizard.do_select("sele")
    wizard.apply()

    fasta_lines = cmd.get_fastastr("/1k5e/A/A").splitlines()
    self.assertEqual(fasta_lines[1], "CGGACAAGAAG")
def Mutagenesis(kinase1, model, template, peptide_instance):
    """Build one mutated peptide/model complex per peptide listed in a file.

    For every peptide read from ``input_data_folder + peptide_instance`` the
    model and the template are fetched, a template peptide is built and
    superposed on the template, the template is removed except for ATP, and
    the template peptide is mutated position by position into the requested
    peptide. Each result is saved as a PDB file and handed to ``ATPchange``.

    :param kinase1: used only as the prefix of the output file name
    :param model: identifier fetched as the model structure (e.g. chk1, chk2)
    :param template: identifier fetched as the mutagenesis template (e.g. 2phk)
    :param peptide_instance: name of the peptide list file (one peptide per
        line) inside ``input_data_folder``
    :return: None
    """
    with open(input_data_folder + peptide_instance, 'r') as f:
        instance = [line.strip() for line in f]
    # print() call form: the Python 2 print statement is a syntax error on
    # Python 3; the printed text is unchanged.
    print("your peptide: ", instance)

    for pep in instance:
        cmd.delete('all')
        cmd.fetch(model)  # model candidate, e.g. chk1, chk2...
        cmd.remove("hetatm")  # remove the nonstandard residues
        cmd.fetch(template)  # mutagenesis template, e.g. 2phk

        # Get the peptide from the template and generate another one for
        # mutagenesis.
        peptide_template = cmd.get_fastastr("/" + template + '//B')
        # The peptide of 2phk is 7 amino acids long; the input peptides have
        # 8, so one extra character is appended.
        peptide_template = peptide_template + 'G'
        # NOTE(review): the offset 6 indexes into the raw FASTA text, header
        # included -- presumably it skips a '>' + 4-character id + newline
        # header; confirm for other templates / PyMOL versions.
        for aa in peptide_template[6:].lower():
            cmd._alt(aa)  # create the template peptide for mutagenesis
        firstaa = AAcode_1_to_3(peptide_template[6])  # 3-letter translation
        low_firstaa = firstaa[0].lower()
        # presumably the built peptide object is named after its first
        # residue -- TODO confirm
        cmd.alter(low_firstaa, 'chain = "B"')

        cmd.show_as("cartoon")
        cmd.align(model, template)  # superposition of model and template
        cmd.align(low_firstaa, template)  # template peptide onto template

        # Remove the template except for ATP; only the model and the
        # template peptide remain afterwards.
        remove_part = "(" + template + " and not resn ATP" + ")"
        cmd.select("remove_part", remove_part)
        cmd.remove("remove_part")
        cmd.remove("resn hoh")  # remove water

        cmd.wizard("mutagenesis")
        for peptide_position, _ in enumerate(pep):
            # PyMOL residue numbering starts at 1, and the 2phk peptide
            # starts at its 2nd residue, hence the +2.
            mutagenesis_template = ('/' + low_firstaa + '///'
                                    + str(peptide_position + 2))
            # select the peptide position to mutate
            cmd.get_wizard().do_select(mutagenesis_template)
            replace_aminoacid = AAcode_1_to_3(pep)[peptide_position]
            # choose the residue to mutate to
            cmd.get_wizard().set_mode(replace_aminoacid)
            cmd.get_wizard().apply()

        # build the canonical name
        filename = (kinase1 + '_' + model + 'model_' + template + 'muta_'
                    + pep + '.pdb')
        cmd.save(filename)
        ATPchange(filename)  # change ATP naming to the format of ATP.params
        # Dismiss the wizard opened for this peptide.
        cmd.wizard(None)
    return
def testGetFastastr(self):
    """FASTA text of a fab-built peptide is a header plus the sequence."""
    seq, name = 'ACD', 'm1'
    cmd.fab(seq, name)
    tokens = cmd.get_fastastr().split()
    # The header is accepted with or without a trailing underscore.
    accepted_headers = (
        '>' + name,
        '>' + name + '_',
    )
    self.assertTrue(tokens[0] in accepted_headers)
    self.assertEqual(tokens[1:], [seq])
def find_mutations(obj1, obj2, sel_name='mutations'):
    """Select the residues at which the sequences of two objects differ.

    The sequences are compared position by position over their common
    length. Differing positions are translated to residue numbers of
    ``obj1`` and selected in both objects under ``sel_name``.

    :param obj1: first object; its residue numbering defines the selection
    :param obj2: second object to compare against
    :param sel_name: name of the selection that is created
    :return: None; messages are printed when an object is empty or when no
        mutations are found
    """
    from pymol import stored

    for obj in (obj1, obj2):
        if cmd.count_atoms(obj) == 0:
            print('%s is empty' % obj)
            return

    # Everything after the first FASTA line is taken as the sequence.
    seq1 = ''.join(cmd.get_fastastr(obj1).split('\n')[1:])
    seq2 = ''.join(cmd.get_fastastr(obj2).split('\n')[1:])

    # zip() stops at the shorter sequence, so a shorter obj2 no longer
    # raises IndexError; surplus residues of the longer one are ignored.
    muts = [i for i, (aa1, aa2) in enumerate(zip(seq1, seq2)) if aa1 != aa2]
    if not muts:
        print('no mutations')
        return

    percent = 100 * float(len(muts)) / float(len(seq1))
    print('found %i mutations with %s, which is %.2f'
          % (len(muts), obj2, percent))

    # this is a correction for PDBs that are not renumbered, also if there's
    # a MEM residue in the middle of the numbering...
    stored.resis = []
    stored.resns = []
    cmd.iterate(obj1, 'stored.resis.append(resi)')
    cmd.iterate(obj1, 'stored.resns.append(resn)')
    resnums = sorted({int(resi)
                      for resi, resn in zip(stored.resis, stored.resns)
                      if resn != 'MEM'})

    # Position i of the sequence corresponds to the i-th distinct residue
    # number of obj1.
    resi_list = '+'.join(str(resnums[i]) for i in muts)
    muts_sel = '((%s and resi %s) or (%s and resi %s))' % (
        obj1, resi_list, obj2, resi_list)
    cmd.select(sel_name, muts_sel)
def testMixedCase(self):
    """fnab input in mixed case is compared against upper-case FASTA."""
    mixed_case_dna = "AtG"
    expected_antisense = "CAT"
    cmd.fnab(input=mixed_case_dna)
    lines = cmd.get_fastastr().splitlines()
    # Lines 1 and 3 of the FASTA text hold the two strand sequences.
    sense, antisense = lines[1], lines[3]
    self.assertEqual(mixed_case_dna.upper(), sense)
    self.assertEqual(expected_antisense, antisense)
def get_chain_sequence(selection='sele'):
    '''
    DESCRIPTION:
        Print the FASTA sequence of the chain the selection belongs to.
        The chain ID is appended to the header line and the sequence lines
        are joined into a single line.

    :param selection: the selected object in pymol
    :return: None -- the sequence is printed, not returned
    '''
    chain = hf.get_selection_details(selection)['chain']
    fasta_lines = cmd.get_fastastr('chain ' + chain).rstrip().split('\n')
    header = fasta_lines[0] + '_' + chain
    # print() call form: the Python 2 print statement is a syntax error on
    # Python 3, while this spelling works on both.
    print(header + '\n' + ''.join(fasta_lines[1:]))
def testIdentifiers(self):
    """fab applies segi, chain and starting resv to the built peptide."""
    seq, segi, chain, resv = 'ACD', 'foo', 'F', 10
    cmd.fab(seq, 'm1', 'peptide', resv, chain, segi)

    # (atom selection, offset from the first residue number, residue name)
    expectations = (
        ('first m1', 0, 'ALA'),
        ('last m1', 2, 'ASP'),
    )
    for atom_selection, offset, resn in expectations:
        cmd.iterate(atom_selection, 'stored.v = (segi, chain, resv, resn)')
        self.assertEqual(stored.v, (segi, chain, resv + offset, resn))

    sequence_line = cmd.get_fastastr().splitlines()[1]
    self.assertEqual(sequence_line, seq)
def get_sequence(obj):
    """Return the residues of ``obj`` as a numpy array of 3-letter codes.

    The one-letter FASTA sequences of all chains are concatenated in the
    order given by ``cmd.get_chains`` and translated code by code.
    """
    one_letter = "ACDEFGHIKLMNPQRSTVWY"
    three_letter = ("ALA CYS ASP GLU PHE GLY HIS ILE LYS LEU "
                    "MET ASN PRO GLN ARG SER THR VAL TRP TYR").split()
    one_to_three = dict(zip(one_letter, three_letter))

    residues = []
    for chain in cmd.get_chains(obj):
        fasta = cmd.get_fastastr(f'{obj} and chain {chain}')
        # Skip the header token, then collect the one-letter codes.
        residues.extend(''.join(fasta.split()[1:]))

    return np.asarray([one_to_three[code] for code in residues])
def get_sequence(query_object, verbose=True):
    """
    USAGE

        get_sequence OBJECT

    Returns a single string with the aminoacid sequence of the object.
    Similar to print(cmd.get_fastastr(OBJECT)) but does not separate
    and add chain ID names.

    :param query_object: object name or selection passed to get_fastastr
    :param verbose: when true (default), also print the sequence
    :return: the concatenated sequence without header lines or newlines
    """
    fasta_lines = cmd.get_fastastr(query_object).splitlines()
    # Drop header lines by their '>' marker instead of rebuilding the
    # expected header text: this also works when query_object is a
    # selection expression or when one chain ID is a prefix of another.
    fasta_seq = "".join(
        line for line in fasta_lines if not line.startswith(">"))

    # Print (by default) and return the sequence
    if verbose:
        print(fasta_seq)
    return fasta_seq
def get_seq(obj1):
    """Print the FASTA text of ``obj1`` without its first line, joined."""
    # str.split always returns at least one element, so the unpack is safe.
    _first_line, *remaining_lines = cmd.get_fastastr(obj1).split('\n')
    print(''.join(remaining_lines))
def get_seq(file_name, chain):
    """Load ``file_name`` into an emptied session and return the FASTA text
    of the given chain."""
    pymol.finish_launching()
    cmd.delete('all')
    cmd.load(file_name)

    selection_name = 'seq_chain'
    cmd.select(selection_name, 'chain ' + chain)
    fasta = cmd.get_fastastr(selection_name)
    return fasta
def test_CanGetSourceSequence(self):
    """The unmodified /1rna/A/A sequence is read back from the FASTA text."""
    cmd.load(self.datafile("1rna.cif"))
    fasta_lines = cmd.get_fastastr('/1rna/A/A').splitlines()
    self.assertEqual(fasta_lines[1], "UUAUAUAUAUAUAA")
def testSingleDNAFASTA(self):
    """B-form DNA built with dbl_helix=-1 reports the input sequence."""
    sequence = "ATGC"
    cmd.fnab(input=sequence, mode="DNA", form="B", dbl_helix=-1)
    fasta_lines = cmd.get_fastastr().splitlines()
    self.assertEqual(sequence, fasta_lines[1])
def testSingleRNAFASTA(self):
    """RNA built by fnab reports the input sequence in its FASTA text."""
    sequence = "AUGC"
    cmd.fnab(input=sequence, mode="RNA")
    fasta_lines = cmd.get_fastastr().splitlines()
    self.assertEqual(sequence, fasta_lines[1])
def get_sequence(chain):
    """Return the protein sequence of one chain of the ``inpdb`` object."""
    tokens = cmd.get_fastastr(
        f'inpdb and chain {chain} and polymer.protein').split()
    # tokens[0] is the FASTA header; the rest are the sequence lines.
    return ''.join(tokens[1:])
def print_seq():
    """Print the FASTA text of everything in the session (selection 'all')."""
    # print() call form: the Python 2 print statement is a syntax error on
    # Python 3, while this spelling works on both.
    print(cmd.get_fastastr('all'))
def color_sensitivity(file, column=None, show_hetatm=True, show_chains=True, on_chain=None, reload=True, normalize=True, min_val=-1, max_val=1, on_pdb=None):
    """Color one chain of a structure by a per-residue column of a label file.

    The label table is aligned to the chain's FASTA sequence, the aligned
    values are written into the B-factor field of the ``sele`` selection and
    the chain is colored with a red-white-blue spectrum. State is kept on
    ``stored`` between calls.

    :param file: label file name; must split on '_' into exactly four
        parts, of which the 2nd is the PDB id and the 3rd the chain
    :param column: column of the table to show; when falsy, the column at
        position ``stored.col_pos`` (cycling) is used
    :param show_hetatm: show HETATM atoms as yellow sticks
    :param show_chains: show the other chains as orange cartoon
    :param on_chain: overrides the chain parsed from ``file``
    :param reload: force re-reading the table and re-loading the structure
    :param normalize: divide the values by their largest absolute value
    :param min_val: lower bound passed to ``cmd.spectrum``
    :param max_val: upper bound passed to ``cmd.spectrum``
    :param on_pdb: overrides the PDB id parsed from ``file``
    """
    _, pdbid, chain, _ = file.split('_')
    if on_chain:
        chain = on_chain
    if on_pdb:
        pdbid = on_pdb
    # NOTE(review): stored.pdbid is read before anything here assigns it --
    # presumably it is initialised elsewhere; confirm.
    if not stored.pdbid == pdbid or reload:
        stored.pdbid = pdbid
        stored.chain = chain
        stored.df = pd.read_csv(os.path.join(label_path, file), sep='\t')
        stored.col_pos = 0
        get_pdb_file(pdbid)
        cmd.load(os.path.join(pdb_path, '{}.pdb'.format(pdbid)))
        print('Columns:\n{}'.format(', '.join(stored.df.columns)))
    # actual coloring
    cmd.alter("sele", "b=0.0")
    cmd.select("all")
    cmd.hide("everything")
    cmd.select("sele", "chain {}".format(chain))
    cmd.show("cartoon", "sele")
    cmd.color("grey", "sele")
    if show_hetatm:
        cmd.select("het", "hetatm")
        cmd.show("sticks", "het")
        cmd.color("yellow", "het")
    if show_chains:
        cmd.select("other", "not chain {}".format(chain))
        cmd.show("cartoon", "other")
        cmd.color("orange", "other")
    if not column:
        # No explicit column: cycle through the table columns call by call.
        column = stored.df.columns[stored.col_pos % len(stored.df.columns)]
    print('Showing {}'.format(column))
    # Sequence from the table (the first 'AA' entry is skipped) and from the
    # structure (the FASTA header token is dropped).
    seq_df = ''.join(stored.df['AA'][1:])
    seq_pdb = cmd.get_fastastr('chain {}'.format(chain))
    seq_pdb = ''.join(seq_pdb.split()[1:])
    print('Seq sensitivity:\n{}\n\nSeq pdb:\n{}\n\n'.format(seq_df, seq_pdb))
    # Only the first alignment returned is used.
    alignment_obj = pairwise2.align.globalms(seq_df, seq_pdb, 1, 0, -.5, -.1)[0]  #one_alignment_only=True,
    print(alignment_obj)
    aligned_df, aligned_pdb = alignment_obj[:2]
    stored.aligned_values = []
    # current_pos counts table residues consumed so far; rows are looked up
    # at current_pos + 1, matching the skipped first 'AA' entry above.
    current_pos = 0
    for aa_df, aa_pdb in zip(aligned_df, aligned_pdb):
        if aa_df == '-':
            # Structure residue without a table entry: no value.
            stored.aligned_values.append(float('nan'))
        elif aa_pdb == '-':
            # Table residue missing from the structure: skip its value.
            current_pos += 1
            continue
        else:
            stored.aligned_values.append(stored.df.loc[current_pos + 1, column])
            current_pos += 1
    if normalize:
        stored.aligned_values = list(
            np.asarray(stored.aligned_values / np.nanmax(np.abs(stored.aligned_values))))
    else:
        pass
    # NaN entries are replaced by the sentinel -1000, which is selected and
    # colored grey at the end.
    for idx in range(len(stored.aligned_values)):
        if np.isnan(stored.aligned_values[idx]):
            stored.aligned_values[idx] = -1000
    stored.aligned_values = iter(stored.aligned_values)
    stored.last_resi = -1

    def helper(resi):
        # Advance to the next aligned value only when resi differs from the
        # previous call, so consecutive calls with the same resi get the
        # same value; once the values run out, NaN is returned.
        try:
            if resi != stored.last_resi:
                stored.b = next(stored.aligned_values)
        except StopIteration:
            stored.b = np.nan
        stored.last_resi = resi
        return stored.b
    stored.helper = helper
    cmd.alter("sele", "b = stored.helper(resi)")
    cmd.spectrum("b", "red_white_blue", "sele", str(min_val), str(max_val))
    cmd.select("nan", "b=-1000")
    cmd.color("grey", "nan")
    # Next call without an explicit column shows the following column.
    stored.col_pos += 1
def testGetFastastr(self):
    """FASTA text of a fab-built peptide is '>name' plus the sequence."""
    seq, name = 'ACD', 'm1'
    cmd.fab(seq, name)
    expected_tokens = ['>' + name, seq]
    fasta_tokens = cmd.get_fastastr().split()
    self.assertEqual(fasta_tokens, expected_tokens)
def annotate_v(selection, scheme):
    """Annotate the sequence of ``selection`` and create one PyMOL selection
    per entry of the annotation result.

    Each key of the result becomes a selection name and its value is used
    as the ``pepseq`` pattern.
    """
    fasta_lines = cmd.get_fastastr(selection).split("\n")
    # Everything after the first (header) line is the sequence.
    aaseq = "".join(fasta_lines[1:])
    result = annotate(aaseq, scheme).retrieve()
    for region in result.keys():
        cmd.select(region, "pepseq %s" % result[region])
import sys

from pymol import cmd

# Cap every chain of a structure with ACE (N-terminus) and NH2 (C-terminus).
# Usage: <script> PROTEIN OUTFILE

protein = sys.argv[1]  # PDB file
outfile = sys.argv[2]  # Outfile

cmd.load(protein, 'Protein')

for chain in cmd.get_chains('Protein'):
    # Count the residues of THIS chain over all of its sequence lines: long
    # sequences are wrapped over several FASTA lines, and chains may differ
    # in length, so neither the first line alone nor chain A is a safe
    # source for the C-terminal residue number.
    fasta_lines = cmd.get_fastastr('/Protein//' + chain).splitlines()
    N_res = sum(len(line) for line in fasta_lines
                if not line.startswith('>'))

    # NOTE(review): assumes residues are numbered 1..N_res without gaps --
    # confirm for inputs that are not renumbered.
    cmd.select('AA', '/Protein//' + chain + '/1/N')
    cmd.edit('AA')
    cmd.editor.attach_amino_acid("pk1", 'ace')
    cmd.unpick()

    cmd.select('AA', '/Protein//' + chain + '/' + str(N_res) + '/C')
    cmd.edit('AA')
    cmd.editor.attach_amino_acid("pk1", 'nhh')
    cmd.unpick()

# Rename NHH to NH2 (GROMACS format)
cmd.select("NH2s", "resn NHH")
cmd.alter("NH2s", "resn='NH2'")
cmd.delete("NH2s")

cmd.save(outfile, 'Protein')
def compare_10gs_11gs():
    """Compare structures 10gs and 11gs and render a figure.

    Fetches and loads both entries, aligns them, prints their protein
    sequences, saves every ligand that occurs in only one of the two
    structures as an SDF file under ``PATH_data`` and finally ray-traces a
    figure to ``PATH_data + 'fig.png'``.
    """
    # it is possible to download proteins from RCSB (Protein Data Bank) to
    # fetch_path, which is current working directory by default
    cmd.set('fetch_path', cmd.exp_path(PATH_data))
    cmd.fetch('10gs')
    cmd.fetch('11gs')
    cmd.load('{0}10gs.cif'.format(PATH_data), '10gs')
    cmd.load('{0}11gs.cif'.format(PATH_data), '11gs')

    # proteins
    # pymol has several options for proteins alignment, align is better for
    # proteins with high homology
    align_command = cmd.align  # or super, or cealign
    align_res = align_command('10gs', '11gs')
    if align_command == cmd.align:
        print('aligned with rmsd {0}'.format(align_res[0]))

    # cmd.create creates a separate object (copies everything to it)
    # het stands for heteroatoms (non-protein)
    cmd.create('10gs_protein', '10gs and not het')
    cmd.create('11gs_protein', '11gs and not het')

    # print protein sequences for fun
    fasta_10gs = cmd.get_fastastr(selection='10gs_protein')
    fasta_11gs = cmd.get_fastastr(selection='11gs_protein')
    print('10gs sequence:{0}'.format(fasta_10gs))
    print('11gs sequence:{0}'.format(fasta_11gs))
    cmd.delete('*_protein')  # delete objects

    # ligands
    cmd.select('10gs_ligands', '10gs and not resname HOH and het')
    cmd.select('11gs_ligands', '11gs and not resname HOH and het')
    space_10gs = {'lig_names': []}  # or simply lig_names = []
    space_11gs = {'lig_names': []}
    # strange pymol interface to iterate over atoms
    cmd.iterate('10gs_ligands', 'lig_names.append(resn)', space=space_10gs)
    cmd.iterate('11gs_ligands', 'lig_names.append(resn)', space=space_11gs)
    ligs_10gs = set(space_10gs['lig_names'])
    ligs_11gs = set(space_11gs['lig_names'])
    # by the way, in both 10gs and 11gs we have MSE, which is actually a
    # modified residue and not a ligand
    print('ligands found in 10gs: {0}'.format(ligs_10gs))
    print('ligands found in 11gs: {0}'.format(ligs_11gs))

    # Save each ligand that is present in only one of the two structures,
    # naming the file after the structure it comes from.
    for ligand_unique in ligs_10gs.symmetric_difference(ligs_11gs):
        cmd.save(
            '{0}{1}_{2}_ligand.sdf'.format(
                PATH_data,
                '10gs' if ligand_unique in ligs_10gs else '11gs',
                ligand_unique),
            'resname {0}'.format(ligand_unique))

    # deleting selections, objects remain unchanged. The selections are
    # deleted by their actual names: the former pattern 'ligs*' matched
    # neither of them.
    cmd.delete('10gs_ligands')
    cmd.delete('11gs_ligands')

    print('drawing a figure (this will take some time)')
    cmd.set('transparency_mode', 1)
    cmd.bg_color('white')
    cmd.show('sticks', 'het and not resname HOH')
    cmd.color('white', 'not het')
    cmd.show('surface', 'not het')
    cmd.hide('lines')
    cmd.set('transparency', '0.7')
    cmd.show('spheres', 'resname HOH')
    cmd.color('palecyan', 'resname HOH')
    cmd.hide('nonbonded', 'resname HOH')
    cmd.set('sphere_transparency', '0.7')
    cmd.ray()
    cmd.png('{0}fig.png'.format(PATH_data))
# Fragment: prepares a mutagenesis run. The names aa_mapping, to_aa,
# location, pdb_path, aa_sub and verbose are defined outside this excerpt.
mutant = aa_mapping[to_aa]
# Selection string built from chain 'A' and the given location.
selection = 'A/{}/'.format(location)
print('PDB path:', pdb_path)
print('Raw AA substitution:', aa_sub)
print('Selection:', selection)
print('New amino acid:', mutant)
print('Starting mutagenesis')
# Note: the wizard is opened before the structure is loaded.
cmd.wizard('mutagenesis')
print('Loading PDB')
cmd.load(pdb_path)
seq_str = cmd.get_fastastr('all')
if verbose:
    print(seq_str)
lines = seq_str.splitlines()
# Everything after the first FASTA line, joined into one string.
sequence = ''.join(lines[1:])
residue_indexes = []


def append_residue_index(resi, resn, name):
    # Only resi is recorded; resn and name are accepted but unused.
    residue_indexes.append(resi)


# Exposes the callback under its own name in a dict -- presumably passed as
# the namespace of a later cmd.iterate call; not visible here.
namespace = {'append_residue_index': append_residue_index}
def fasta(selection="all"):
    """Print the FASTA text of ``selection`` (default: everything)."""
    fasta_text = cmd.get_fastastr(selection)
    print(fasta_text)