/
pdb_matching.py
executable file
·84 lines (71 loc) · 3.1 KB
/
pdb_matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
# -*- coding: UTF8 -*-
# Author: Guillaume Bouvier -- guillaume.bouvier@pasteur.fr
# https://research.pasteur.fr/en/member/guillaume-bouvier/
# 2020-10-12 09:38:56 (UTC+0200)
"""
Compute matching between structures
"""
import pymol.cmd as cmd
import numpy
import scipy.spatial.distance as dist
def load_pdb(pdbfile, obj, sel):
cmd.load(pdbfile, object=obj)
cmd.remove(f'(not ({obj} and {sel} and name CA)) and {obj}')
coords = cmd.get_coords(selection=f'{obj}')
return coords
def matching(A, B, dist_threshold=3.):
"""
Matching between pair of coord set.
Percentage of protein residues in the deposited model or the unique part of the deposited model that are within 3 A of a residue (residues represented by their CA) in the automatically-generated model.
"""
cdist = dist.cdist(A, B)
dmin = cdist.min(axis=0)
n = B.shape[0]
n_match = (dmin <= dist_threshold).sum()
return n_match / n
def get_sequence(obj):
seq = ''
for chain in cmd.get_chains(obj):
seq_ = cmd.get_fastastr(f'{obj} and chain {chain} and polymer.protein')
seq_ = seq_.split()[1:]
seq_ = ''.join(seq_)
seq += seq_
return seq
def seq_match(A, B, seq_A, seq_B, dist_threshold=3.):
"""
Sequence matching between pair of coord set.
Percentage of matching residues that have the same residue name.
"""
cdist = dist.cdist(A, B)
dmin = cdist.min(axis=0)
assignment = cdist.argmin(axis=0)
is_match = (dmin <= dist_threshold)
n_match = 0
for i, match in enumerate(is_match):
if match:
if seq_A[assignment[i]] == seq_B[i]:
n_match += 1
return n_match / is_match.sum()
if __name__ == '__main__':
import argparse
# argparse.ArgumentParser(prog=None, usage=None, description=None, epilog=None, parents=[], formatter_class=argparse.HelpFormatter, prefix_chars='-', fromfile_prefix_chars=None, argument_default=None, conflict_handler='error', add_help=True, allow_abbrev=True, exit_on_error=True)
parser = argparse.ArgumentParser(description='Compute structural matching')
# parser.add_argument(name or flags...[, action][, nargs][, const][, default][, type][, choices][, required][, help][, metavar][, dest])
parser.add_argument('--pdb1', help='first pdb file', type=str, required=True)
parser.add_argument('--pdb2', help='second (reference) pdb file', type=str, required=True)
parser.add_argument('--sel1', help='selection for pdb1 (default: all CA)', type=str, default='all')
parser.add_argument('--sel2', help='selection for pdb2 (default: all CA)', type=str, default='all')
args = parser.parse_args()
A = load_pdb(args.pdb1, 'A_', args.sel1)
A_chains = cmd.get_chains('A_')
B = load_pdb(args.pdb2, 'B_', args.sel2)
B_chains = cmd.get_chains('B_')
print(f'PDB1: {A.shape[0]} CA')
print(f'PDB2: {B.shape[0]} CA')
matching = matching(A, B)
print(f'% Matching:\t{matching*100:.1f}')
seq_A = get_sequence('A_')
seq_B = get_sequence('B_')
seqmatching = seq_match(A, B, seq_A, seq_B)
print(f'% Seq match:\t{seqmatching*100:.1f}')