Ejemplo n.º 1
0
	def match_renumber(self, reference_pdb):
		"""Match the chains and renumber the structures according to a reference PDB.

		For every structure in ``self.structure_dic`` each target chain is
		aligned with ClustalO against each reference chain. For every
		reference chain one target chain is selected, renamed to the reference
		chain ID and renumbered so that aligned residues carry the reference
		residue numbers.

		:param reference_pdb: reference PDB, or the string ``'lowest'`` to use
			the first element returned by ``self.fetch_lowest()``.
		:return: ``False`` if the configured ClustalO executable is not found
			(nothing is done), ``True`` once all structures were processed.
		"""

		clustalo_exe = ini.get('third_party', 'clustalo_exe')

		if not shutil.which(clustalo_exe):
			print('\n+ ClustalO not correctly configured in haddock3.ini')
			print('+ WARNING: matching not possible!')
			return False

		print('\n+ Running automated chain matching and renumbering')
		print('+ WARNING: Use with caution, some residues could be deleted')

		if reference_pdb == 'lowest':
			# only the structure is needed here, the score is discarded
			reference_pdb, _ = self.fetch_lowest()

		if ' ' not in PDB.identify_chains(reference_pdb):
			reference_pdb = PDB.fix_id(reference_pdb, priority='chain')
		if ' ' not in PDB.identify_chainseg(reference_pdb):
			reference_pdb = PDB.fix_id(reference_pdb, priority='seg')

		reference_seq_dic = PDB.load_seq(reference_pdb)
		reference_chains = sorted(PDB.identify_chains(reference_pdb))

		fasta_f = 'seq.fasta'
		# argument list built directly so an executable path containing
		#  whitespace is not split into several arguments
		cmd = [clustalo_exe, '-i', fasta_f, '--outfmt=clu', '--resno', '--wrap=9000', '--force']

		for pdb in sorted(self.structure_dic):
			# match the chains with sequence alignment
			# NOTE(review): sequences are loaded before fix_id() while the chains are
			#  identified after it; presumably the chain IDs agree - confirm
			target_seq_dic = PDB.load_seq(pdb)
			pdb = PDB.fix_id(pdb, priority='seg')

			# Get what chains are present in the target
			target_chains = sorted(PDB.identify_chains(pdb))

			# Do a combinatorial alignment to check which chains match better
			identity_dic = {}
			for ref_chain, target_chain in itertools.product(reference_chains, target_chains):
				# the keys are used as residue numbers and the values are joined into the sequence
				ref_resnums = list(reference_seq_dic[ref_chain])
				target_resnums = list(target_seq_dic[target_chain])
				ref_seq = ''.join(reference_seq_dic[ref_chain].values())
				target_seq = ''.join(target_seq_dic[target_chain].values())

				with open(fasta_f, 'w') as fasta_fh:
					fasta_fh.write(f'>ref\n{ref_seq}\n>target\n{target_seq}\n')
				try:
					p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
					out = p.communicate()
				finally:
					# remove the temporary input even if the alignment could not be executed
					os.remove(fasta_f)

				aln_out = out[0].decode('utf-8')
				aln_data = aln_out.split()
				# positions of the two aligned sequences in the whitespace-split output
				ref_aln = aln_data[6]
				target_aln = aln_data[9]

				# walk the alignment columns, tracking the position in each ungapped sequence
				counter_a = 0
				counter_b = 0
				numbering_dic = {}
				for ref_char, target_char in zip(ref_aln, target_aln):
					ref_is_residue = ref_char != '-'
					target_is_residue = target_char != '-'
					if ref_is_residue and target_is_residue:
						# residue lists are indexed only for aligned pairs, so gap
						#  columns after the end of either sequence cannot fail
						if counter_a < len(ref_resnums) and counter_b < len(target_resnums):
							numbering_dic[target_resnums[counter_b]] = ref_resnums[counter_a]
					if ref_is_residue:
						counter_a += 1
					if target_is_residue:
						counter_b += 1

				# every '*' in the ClustalO output is counted as an identical position
				identity = aln_out.count('*') / float(len(ref_seq))
				coverage = len(numbering_dic) / len(ref_aln)
				identity_dic.setdefault(ref_chain, []).append((target_chain, identity, coverage, numbering_dic))

			# do the renumbering
			for i, ref_c in enumerate(reference_chains):
				candidates = identity_dic[ref_c]
				# best first: highest coverage, ties broken by highest identity
				sorted_target_list = sorted(candidates, key=lambda x: (-x[2], -x[1]))
				# create a catalog with possible numbering references
				numbering_dic_catalog = {v[0]: v[3] for v in candidates}

				if len({v[1] for v in candidates}) == 1:
					# all targets have the same identity (homo-oligomer), match sequentially
					# NOTE(review): raises IndexError when the target has fewer chains
					#  than the reference - confirm whether that can happen
					selected_chain = target_chains[i]
				else:
					# get the highest identity/coverage
					selected_chain = sorted_target_list[0][0]

				# select the correct numbering dictionary
				selected_numbering_dic = numbering_dic_catalog[selected_chain]

				# replace the target chain (old) with the same observed in the reference (new)
				# NOTE(review): applied once per reference chain on the same structure;
				#  presumably swapped chains (A->B then B->A) could collide - verify
				chain_matched_pdb = PDB.replace_chain(pdb, selected_chain, ref_c)

				# renumber!
				#  Note, if residue is present in target and not in reference it will be DELETED, use with caution
				PDB.renumber(chain_matched_pdb, selected_numbering_dic, ref_c, overwrite=True)

		return True
Ejemplo n.º 2
0
class TestPDB(unittest.TestCase):
	"""Tests for the methods of ``PDB``, using the fixture files found under ``data_path``.

	Files written during a test are registered for removal with
	``addCleanup`` so that they are deleted even when an assertion fails.
	"""

	def setUp(self):
		self.PDB = PDB()

	@staticmethod
	def _remove_if_present(path):
		""" Delete path, a file that was never created is not an error """
		try:
			os.remove(path)
		except FileNotFoundError:
			# the code under test failed before producing the file, nothing to clean
			pass

	def _remove_on_cleanup(self, *paths):
		""" Schedule the removal of paths for the end of the running test """
		for path in paths:
			self.addCleanup(self._remove_if_present, path)

	def test_treat_ensemble(self):
		""" treat_ensemble replaces the ensemble by a list with one file per model """
		ensemble_f = f'{data_path}/temp_ens.pdb'
		model_1_f = f'{data_path}/temp_1.pdb'
		model_2_f = f'{data_path}/temp_2.pdb'
		self._remove_on_cleanup(ensemble_f, model_1_f, model_2_f)

		copyfile(f'{data_path}/mini_ens.pdb', ensemble_f)
		input_pdb_dic = {'mol1': ensemble_f}

		treated_dic = self.PDB.treat_ensemble(input_pdb_dic)
		expected_treated_dic = {'mol1': [model_1_f, model_2_f]}

		self.assertEqual(treated_dic, expected_treated_dic)
		self.assertTrue(filecmp.cmp(model_1_f, f'{data_path}/mini_ens1.pdb'))
		self.assertTrue(filecmp.cmp(model_2_f, f'{data_path}/mini_ens2.pdb'))

	def test_load_structure(self):
		""" load_structure returns the lines of the file grouped by chain """
		pdb_f = f'{data_path}/miniA.pdb'
		pdb_dic = self.PDB.load_structure(pdb_f)
		expected_pdb_dic = {
			'A': [
				'ATOM      2  CA  MET A   1      16.967  12.784   4.338  1.00 10.80      A    C  \n',
				'ATOM      9  CA  ARG A   2      13.856  11.469   6.066  1.00  8.31      A    C  \n',
				'ATOM     16  CA  CYS A   3      13.660  10.707   9.787  1.00  5.39      A    C  \n',
			]
		}
		self.assertEqual(pdb_dic, expected_pdb_dic)

	def test_identify_chains(self):
		""" identify_chains lists the chains of the file """
		pdb_f = f'{data_path}/mini.pdb'
		chain_l = self.PDB.identify_chains(pdb_f)
		expected_chain_l = ['A', 'B', 'C']
		self.assertEqual(chain_l, expected_chain_l)

	def test_identify_segids(self):
		""" identify_segids lists the segids of the file """
		pdb_f = f'{data_path}/miniA.pdb'
		segid_l = self.PDB.identify_segids(pdb_f)
		expected_segid_l = ['A']
		self.assertEqual(segid_l, expected_segid_l)

	def test_split_models(self):
		""" split_models writes one file per model and returns their names """
		ensemble_f = f'{data_path}/mini_ens.pdb'
		model_list = self.PDB.split_models(ensemble_f)
		# remove whatever was reported as written, also if the checks below fail
		self._remove_on_cleanup(*model_list)
		expected_list = [f'{data_path}/mini_1.pdb', f'{data_path}/mini_2.pdb']

		self.assertEqual(model_list, expected_list, 'Name of list elements differ')
		self.assertTrue(filecmp.cmp(f'{data_path}/mini_1.pdb', f'{data_path}/mini_1.gold'))
		self.assertTrue(filecmp.cmp(f'{data_path}/mini_2.pdb', f'{data_path}/mini_2.gold'))

	def test_fix_id(self):
		""" fix_id gives the same result for a file without segid and one without chain """
		nosegid_pdb_f = f'{data_path}/mini.pdb'
		nochain_pdb_f = f'{data_path}/mini_nochain.pdb'
		# files left behind by the overwrite=False calls below
		self._remove_on_cleanup(f'{data_path}/mini.pdb_', f'{data_path}/mini_nochain.pdb_')

		segid_pdb = self.PDB.fix_id(nosegid_pdb_f, priority='chain', overwrite=False)
		chain_pdb = self.PDB.fix_id(nochain_pdb_f, priority='seg', overwrite=False)

		self.assertTrue(filecmp.cmp(segid_pdb, f'{data_path}/mini_segid.pdb'))
		self.assertTrue(filecmp.cmp(chain_pdb, f'{data_path}/mini_segid.pdb'))

	def test_add_chainseg(self):
		""" add_chainseg modifies the file in place to match miniA.pdb """
		temp_f = f'{data_path}/temp.pdb'
		self._remove_on_cleanup(temp_f)
		copyfile(f'{data_path}/mini.pdb', temp_f)

		check = self.PDB.add_chainseg(temp_f, 'A')

		self.assertTrue(check)
		self.assertTrue(filecmp.cmp(temp_f, f'{data_path}/miniA.pdb'))

	def test_identify_chainseg(self):
		""" identify_chainseg returns ['A'] for miniA.pdb """
		pdbf = f'{data_path}/miniA.pdb'

		chainseg = self.PDB.identify_chainseg(pdbf)

		self.assertEqual(chainseg, ['A'])

	def test_fix_chainseg(self):
		""" fix_chainseg returns only the molecule entries and rewrites mol1 to match miniX.pdb """
		mol1_f = f'{data_path}/mol1.pdb'
		mol2_f = f'{data_path}/mol2.pdb'
		self._remove_on_cleanup(mol1_f, mol2_f)
		copyfile(f'{data_path}/mini_1.gold', mol1_f)
		copyfile(f'{data_path}/mini_2.gold', mol2_f)

		input_pdb_dic = {'mol1': mol1_f, 'segid1': 'X', 'mol2': mol2_f}

		return_pdb_dic = self.PDB.fix_chainseg(input_pdb_dic)
		expected_return_dic = {'mol1': mol1_f, 'mol2': mol2_f}

		self.assertEqual(return_pdb_dic, expected_return_dic)
		self.assertTrue(filecmp.cmp(mol1_f, f'{data_path}/miniX.pdb'))

	def test_sanitize(self):
		""" sanitize cleans the file in place and returns the list of models """
		temp_f = f'{data_path}/temp.pdb'
		self._remove_on_cleanup(temp_f)
		copyfile(f'{data_path}/mini.dirty.pdb', temp_f)

		input_pdb_dic = {'mol1': [temp_f]}

		model_list = self.PDB.sanitize(input_pdb_dic)

		expected_model_list = [temp_f]

		self.assertEqual(model_list, expected_model_list)
		self.assertTrue(filecmp.cmp(temp_f, f'{data_path}/mini.clean.pdb'))

	def test_count_atoms(self):
		""" count_atoms returns 3 for mini.pdb """
		pdb_f = f'{data_path}/mini.pdb'
		atom_count = self.PDB.count_atoms(pdb_f)
		self.assertEqual(atom_count, 3)

	def test_organize_chains(self):
		# TODO: not implemented, passes without checking anything
		pass

	def test_replace_chain(self):
		""" replace_chain with overwrite=False writes a new file with chain A renamed to X """
		pdb_f = f'{data_path}/mini.pdb'
		# file left behind by the overwrite=False call below
		self._remove_on_cleanup(f'{data_path}/mini.pdb_')
		newchain_pdb = self.PDB.replace_chain(pdb_f, 'A', 'X', overwrite=False)
		self.assertTrue(filecmp.cmp(newchain_pdb, f'{data_path}/mini_A-X.pdb'))

	def test_renumber(self):
		# TODO: not implemented, passes without checking anything
		pass

	def test_load_seq(self):
		# TODO: not implemented, passes without checking anything
		pass

	def tearDown(self):
		pass