def read(self, f_handle, f_id="casp2"):
    """Read a distance prediction file

    Only lines that split into exactly 13 whitespace-separated fields and
    whose first two fields consist of digits are parsed; every other line
    is skipped. The fields of a parsed line are read as the two residue
    numbers, the raw score and the remaining per-bin scores.

    Parameters
    ----------
    f_handle
       Open file handle [read permissions]
    f_id : str, optional
       Unique contact file identifier

    Returns
    -------
    :obj:`~conkit.core.distancefile.DistanceFile`

    """
    hierarchy = DistanceFile(f_id)
    hierarchy.original_file_format = "CASPRR_MODE_2"
    _map = Distogram("distogram_1")
    hierarchy.add(_map)

    # Iterate the handle directly so the file is streamed line by line
    # instead of being loaded in full by readlines().
    for line in f_handle:
        # split() without arguments already discards surrounding whitespace
        # and returns an empty list for blank lines.
        fields = line.split()
        if len(fields) != 13 or not fields[0].isdigit() or not fields[1].isdigit():
            continue

        res1_seq = int(fields[0])
        res2_seq = int(fields[1])
        raw_score = float(fields[2])
        distance_scores = tuple(float(p) for p in fields[3:])
        _distance = Distance(res1_seq, res2_seq, distance_scores, DISTANCE_BINS, raw_score=raw_score)
        _map.add(_distance)

    return hierarchy
def test_original_file_format(self):
    # A distogram added to a distance file is expected to show up in the
    # file's child list and to report the file's original format.
    parent = DistanceFile("test")
    parent.original_file_format = "pdb"

    child = Distogram("test")
    parent.add(child)

    is_registered = child in parent.child_list
    self.assertTrue(is_registered)
    self.assertEqual("pdb", child.original_file_format)
def read(self, f_handle, f_id="rosettanpz"):
    """Read a distance prediction file

    The ``dist`` array of the loaded file is reordered along its last axis
    and one distance entry is created for every residue pair ``(i, j)``
    with ``i <= j``, using 1-based residue numbers.

    Parameters
    ----------
    f_handle
       Open file handle [read permissions]
    f_id : str, optional
       Unique contact file identifier

    Returns
    -------
    :obj:`~conkit.core.distancefile.DistanceFile`

    """
    distance_file = DistanceFile(f_id)
    distance_file.original_file_format = "ROSETTA_NPZ"
    distogram = Distogram("distogram_1")
    distance_file.add(distogram)

    # NOTE(review): allow_pickle=True lets numpy unpickle objects stored in
    # the file - confirm that only trusted prediction files reach this parser.
    loaded = np.load(f_handle, allow_pickle=True)

    # Bin #0 corresponds with d>20A and bins #1 ~ #36 correspond with
    # 2A<d<20A in increments of 0.5A, so bin #0 is moved to the end.
    bin_order = list(range(1, 37)) + [0]
    probabilities = loaded['dist'][:, :, bin_order]

    n_residues = probabilities.shape[0]
    for first in range(n_residues):
        for second in range(first, n_residues):
            scores = tuple(probabilities[first, second, :].tolist())
            distogram.add(Distance(first + 1, second + 1, scores, DISTANCE_BINS))

    return distance_file
def test_write_1(self):
    # Writes a distogram with 64 distance bins through CaspMode2Parser and
    # compares the written lines with the expected text, which lists a raw
    # score followed by ten values for every residue pair.
    # NOTE(review): in this view the expected literal contains no line
    # breaks, yet the final assertion splits it on '\n' - confirm the
    # literal against the original fixture.
    expected_output = """PFRMAT RR RMODE 2 1 6 0.199696 0.043889 0.085795 0.070011 0.071518 0.054028 0.213284 0.069087 0.097959 0.090083 0.204345 1 7 0.233644 0.049411 0.075135 0.109098 0.150810 0.096584 0.092398 0.096662 0.093350 0.123176 0.113375 1 8 0.246451 0.106886 0.039024 0.100540 0.082028 0.108344 0.078788 0.105980 0.130109 0.113708 0.134592 1 9 0.267139 0.072002 0.083053 0.112084 0.124356 0.128044 0.097491 0.132106 0.047198 0.110915 0.092751 1 10 0.351914 0.081445 0.069721 0.200748 0.099755 0.090368 0.117449 0.127677 0.050879 0.101965 0.059993 2 7 0.228459 0.085973 0.091366 0.051120 0.085890 0.070657 0.119253 0.082744 0.180051 0.097734 0.135213 2 8 0.256177 0.081094 0.077748 0.097335 0.060811 0.138077 0.130496 0.106911 0.101101 0.121346 0.085081 2 9 0.216631 0.046454 0.053018 0.117160 0.196036 0.144154 0.125199 0.090720 0.052621 0.098583 0.076055 2 10 0.284653 0.087567 0.125308 0.071778 0.071988 0.095966 0.099270 0.174715 0.109563 0.062611 0.101233 3 8 0.345583 0.117500 0.110134 0.117950 0.085312 0.098812 0.072826 0.079326 0.196758 0.059058 0.062325 3 9 0.203586 0.036574 0.050725 0.116287 0.174339 0.070881 0.116388 0.083683 0.060738 0.160257 0.130128 3 10 0.293849 0.059364 0.135117 0.099368 0.113124 0.135930 0.066876 0.075962 0.114771 0.127034 0.072454 4 9 0.234649 0.077170 0.048841 0.108638 0.107559 0.119732 0.116349 0.077063 0.111788 0.119497 0.113362 4 10 0.322930 0.090789 0.133412 0.098729 0.099123 0.084633 0.107534 0.137072 0.096560 0.042234 0.109913 5 10 0.279782 0.054314 0.114427 0.111042 0.069073 0.083048 0.105829 0.073806 0.119769 0.088666 0.180028"""
    distancefile = DistanceFile("test")
    distancefile.original_file_format = 'ALPHAFOLD2'
    distogram = Distogram("1")
    distancefile.add(distogram)
    # Residue pairs, matched up position by position below.
    list_res1 = [1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5]
    list_res2 = [6, 7, 8, 9, 10, 7, 8, 9, 10, 8, 9, 10, 9, 10, 10]
    # 63 bin edges; together with the two open-ended bins added below they
    # give the 64 bins that the sampled scores are assigned to.
    bin_edges = (2.3125, 2.625, 2.9375, 3.25, 3.5625, 3.875, 4.1875, 4.5, 4.8125, 5.125, 5.4375, 5.75, 6.0625,
                 6.375, 6.6875, 6.9999995, 7.3125, 7.625, 7.9375, 8.25, 8.5625, 8.875, 9.1875, 9.5, 9.812499,
                 10.124999, 10.4375, 10.75, 11.0625, 11.375, 11.687499, 12., 12.3125, 12.625, 12.9375, 13.25,
                 13.5625, 13.874999, 14.187501, 14.499999, 14.812499, 15.124999, 15.437499, 15.75, 16.0625,
                 16.375, 16.687502, 16.999998, 17.312498, 17.624998, 17.937498, 18.25, 18.5625, 18.875,
                 19.1875, 19.5, 19.8125, 20.125, 20.437498, 20.75, 21.062498, 21.374998, 21.6875)
    # First bin runs from 0 to the first edge, the last one from the final
    # edge to infinity; consecutive edges form the bins in between.
    distance_bins = [(0, bin_edges[0])]
    distance_bins += [(bin_edges[idx], bin_edges[idx + 1]) for idx in range(len(bin_edges) - 1)]
    distance_bins.append((bin_edges[-1], np.inf))
    distance_bins = tuple(distance_bins)
    # Fixed seed: the expected text depends on the exact sequence of draws,
    # so the order of the sampling calls below must not change.
    np.random.seed(41)
    for res_1, res_2 in zip(list_res1, list_res2):
        distance_scores = np.random.dirichlet(np.ones(64)).tolist()
        distance = Distance(res_1, res_2, distance_scores, distance_bins)
        distogram.add(distance)
    # Round-trip through a temporary file: write with the parser, read the
    # text back and compare line by line.
    f_name = self.tempfile()
    with open(f_name, "w") as f_out:
        CaspMode2Parser().write(f_out, distogram)
    with open(f_name, "r") as f_in:
        output = f_in.read().splitlines()
    self.assertListEqual(expected_output.split('\n'), output)
def test_write_1(self):
    # Writes a distogram with 34 distance bins through MapPredParser and
    # compares the written lines with the expected text (remark header
    # followed by one record per residue pair).
    # NOTE(review): in this view the expected literal's line breaks appear
    # to be collapsed to spaces, yet the final assertion splits it on "\n" -
    # confirm the literal against the original fixture.
    expected_output = """#REMARK MapPred 1.1 #REMARK idx_i, idx_j, distance distribution of 34 bins #REMARK 34 bins consist of 32 normal bins (4-20A with a step of 0.5A) and two boundary bins ( [0,4) and [20, inf) ), as follows: [0,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9,9.5,10,10.5,11,11.5,12,12.5,13,13.5,14,14.5,15,15.5,16,16.5,17,17.5,18,18.5,19,19.5,20,inf] 5 10 0.013746 0.002245 0.053742 0.002115 0.005889 0.044058 0.010081 0.052535 0.118677 0.025818 0.019215 0.015831 0.009808 0.018148 0.031220 0.003428 0.058081 0.017978 0.065069 0.024163 0.044585 0.062025 0.026062 0.023824 0.012573 0.027729 0.022212 0.041685 0.005015 0.064340 0.004133 0.006420 0.018552 0.048998 1 35 0.187103 0.008180 0.021642 0.051089 0.038619 0.006100 0.010553 0.031697 0.010831 0.015310 0.006949 0.008237 0.043400 0.051436 0.003820 0.008148 0.018467 0.057307 0.022873 0.029184 0.008235 0.008025 0.004214 0.027027 0.070948 0.028355 0.049284 0.060124 0.041885 0.043900 0.000681 0.006836 0.007679 0.011862 43 85 0.024968 0.014838 0.021987 0.031265 0.019144 0.033038 0.018177 0.008716 0.017331 0.046459 0.051147 0.043912 0.004041 0.007990 0.027690 0.073997 0.001269 0.008161 0.067709 0.055700 0.028615 0.091884 0.021842 0.025949 0.025295 0.006136 0.031655 0.028990 0.082802 0.005069 0.002322 0.015611 0.039637 0.016654 85 43 0.015871 0.013765 0.006593 0.014670 0.029273 0.042705 0.058513 0.014858 0.050493 0.014216 0.010146 0.037020 0.018679 0.003142 0.031215 0.011736 0.008920 0.007325 0.144325 0.003512 0.018591 0.005043 0.001607 0.043659 0.068744 0.052532 0.050643 0.039295 0.003413 0.035119 0.102032 0.004150 0.005737 0.032456 50 50 0.000490 0.027392 0.001090 0.009625 0.011421 0.002011 0.015100 0.018622 0.008785 0.114531 0.044962 0.019562 0.022973 0.008111 0.042691 0.061367 0.001060 0.032753 0.073944 0.006790 0.002509 0.073759 0.025060 0.031361 0.039123 0.043318 0.032752 0.004280 0.044655 0.000556 0.000111 0.095043 0.028036 0.056157 18 50 0.002704 0.015000 0.024442 0.105520 0.014259 0.027628 
0.002832 0.035063 0.038354 0.055931 0.039683 0.035546 0.004621 0.019932 0.012316 0.087781 0.006637 0.043857 0.008459 0.053482 0.016937 0.083507 0.031733 0.000793 0.004304 0.066937 0.009968 0.006859 0.038950 0.064003 0.003185 0.008042 0.007331 0.023401"""
    distancefile = DistanceFile("test")
    distancefile.original_file_format = 'MAPPRED'
    distogram = Distogram("1")
    distancefile.add(distogram)
    # Residue pairs, matched up position by position below.
    list_res1 = [5, 1, 43, 85, 50, 18]
    list_res2 = [10, 35, 85, 43, 50, 50]
    # 34 bins: [0, 4), 32 bins of width 0.5 between 4 and 20, and [20, inf).
    distance_bins = ((0, 4), (4, 4.5), (4.5, 5), (5, 5.5), (5.5, 6), (6, 6.5), (6.5, 7), (7, 7.5), (7.5, 8),
                     (8, 8.5), (8.5, 9), (9, 9.5), (9.5, 10), (10, 10.5), (10.5, 11), (11, 11.5), (11.5, 12),
                     (12, 12.5), (12.5, 13), (13, 13.5), (13.5, 14), (14, 14.5), (14.5, 15), (15, 15.5),
                     (15.5, 16), (16, 16.5), (16.5, 17), (17, 17.5), (17.5, 18), (18, 18.5), (18.5, 19),
                     (19, 19.5), (19.5, 20), (20, np.inf))
    # Fixed seed: the expected text depends on the exact sequence of draws,
    # so the order of the sampling calls below must not change.
    np.random.seed(41)
    for res_1, res_2 in zip(list_res1, list_res2):
        distance_scores = np.random.dirichlet(np.ones(34)).tolist()
        distance = Distance(res_1, res_2, distance_scores, distance_bins)
        distogram.add(distance)
    # Round-trip through a temporary file: write with the parser, read the
    # text back and compare line by line.
    f_name = self.tempfile()
    with open(f_name, "w") as f_out:
        MapPredParser().write(f_out, distogram)
    with open(f_name, "r") as f_in:
        output = f_in.read().splitlines()
    self.assertListEqual(expected_output.split("\n"), output)
def read(self, f_handle, f_id="alphafold2"):
    """Read a distance prediction file

    The ``distogram`` entry of the loaded file supplies ``logits``, which
    are turned into probabilities with a softmax over the last axis, and
    ``bin_edges``, from which the distance bins are derived. One distance
    entry is created for every residue pair ``(i, j)`` with ``i <= j``,
    using 1-based residue numbers.

    Parameters
    ----------
    f_handle
       Open file handle [read permissions]
    f_id : str, optional
       Unique contact file identifier

    Returns
    -------
    :obj:`~conkit.core.distancefile.DistanceFile`

    """
    distance_file = DistanceFile(f_id)
    distance_file.original_file_format = "alphafold2"
    distogram = Distogram("distogram_1")
    distance_file.add(distogram)

    # NOTE(review): allow_pickle=True lets numpy unpickle objects stored in
    # the file - confirm that only trusted prediction files reach this parser.
    loaded = np.load(f_handle, allow_pickle=True)
    distogram_data = loaded['distogram']
    probabilities = softmax(distogram_data['logits'], axis=-1)
    edges = distogram_data['bin_edges']

    # First bin runs from 0 to the first edge, the last one from the final
    # edge to infinity; consecutive edges form the bins in between.
    bins = [(0, edges[0])]
    bins.extend(zip(edges[:-1], edges[1:]))
    bins.append((edges[-1], np.inf))
    bins = tuple(bins)

    n_residues = probabilities.shape[0]
    for first in range(n_residues):
        for second in range(first, n_residues):
            scores = tuple(probabilities[first, second, :].tolist())
            distogram.add(Distance(first + 1, second + 1, scores, bins))

    return distance_file
def DistanceFile(*args, **kwargs):
    """:obj:`DistanceFile <conkit.core.distancefile.DistanceFile>` instance"""
    # The class is imported at call time rather than at module import time;
    # presumably this avoids an import cycle or start-up cost - confirm.
    from conkit.core.distancefile import DistanceFile as _DistanceFile

    instance = _DistanceFile(*args, **kwargs)
    return instance
def _read(self, structure, f_id, distance_cutoff, atom_type):
    """Build a distance file from a parsed structure

    A distance file is assembled for every model in the structure, holding
    one distogram per ordered pair of chains that yields at least one
    distance. Only the hierarchy of the first model is returned; a
    ``FutureWarning`` is issued when the structure holds more than one.

    Parameters
    ----------
    structure
       A :obj:`~Bio.PDB.Structure.Structure` instance
    f_id : str
       Unique contact file identifier
    distance_cutoff : int
       Distance cutoff for which to determine contacts
    atom_type : str
       Atom type between which distances are calculated

    Returns
    -------
    :obj:`~conkit.core.distancefile.DistanceFile`

    """
    bound = (0.0, float(distance_cutoff))
    per_model = []

    for model in structure:
        model_file = DistanceFile(f_id + "_" + str(model.id))
        model_file.original_file_format = "PDB"

        model_chains = list(model)
        for chain in model_chains:
            self._remove_hetatm(chain)
            self._remove_atom(chain, atom_type)

        for chain1, chain2 in itertools.product(model_chains, repeat=2):
            # Intra-chain maps are named after the chain, inter-chain maps
            # after the concatenation of both chain identifiers.
            same_chain = chain1.id == chain2.id
            pair_map = Distogram(chain1.id if same_chain else chain1.id + chain2.id)

            for atom1, atom2, distance in self._chain_contacts(chain1, chain2):
                within_cutoff = distance < distance_cutoff
                score = round(1.0 - (distance / 100), 6) if within_cutoff else 0

                # The measured distance is stored as a single zero-width bin
                # carrying a score of 1.
                entry = Distance(atom1.resseq, atom2.resseq, (1, ), ((distance, distance), ), score, bound)
                entry.res1_altseq = atom1.resseq_alt
                entry.res2_altseq = atom2.resseq_alt
                entry.res1 = atom1.resname
                entry.res2 = atom2.resname
                entry.res1_chain = atom1.reschain
                entry.res2_chain = atom2.reschain
                # A cutoff of 0 marks every pair as a true positive.
                if distance_cutoff == 0 or within_cutoff:
                    entry.true_positive = True
                pair_map.add(entry)

            # Chain pairs without any distance are left out of the file.
            if pair_map.empty:
                continue

            # The sequence layout is chosen by the length of the map id,
            # not by whether the two chains are the same.
            if len(pair_map.id) == 1:
                pair_map.sequence = self._build_sequence(chain1)
                assert len(pair_map.sequence.seq) == len(chain1)
            else:
                pair_map.sequence = self._build_sequence(chain1) + self._build_sequence(chain2)
                assert len(pair_map.sequence.seq) == len(chain1) + len(chain2)
            model_file.add(pair_map)

        model_file.method = "Distogram extracted from PDB " + str(model.id)
        model_file.remark = [
            "The model id is the chain identifier, i.e XY equates to chain X and chain Y.",
            "Residue numbers in column 1 are chain X, and numbers in column 2 are chain Y.",
        ]
        per_model.append(model_file)

    if len(per_model) > 1:
        warnings.warn(
            "Super-level to contact file not yet implemented. " "Parser returns hierarchy for top model only!",
            FutureWarning,
        )
    return per_model[0]