def atomium_parse(self, file):
    """Load a structure through atomium and cache its C-alpha data on self.

    ``file`` is first opened as a local path with ``atomium.open``; when that
    raises ``FileNotFoundError`` the same string is passed to
    ``atomium.fetch`` instead.

    Side effects on the instance:
      * ``self.atoms`` gets one dict appended per accepted atom
        (keys: res_id, res, atom_id, coords, chain),
      * ``self.coord_dict`` maps ``chain.internal_id`` to an array of the
        accepted atom locations,
      * ``self.er_dict`` maps the same ids to ``eigenrank(coords)``,
      * ``self.coordinates``, ``self.l`` and ``self.er`` describe the chain
        whose id sorts first.

    An atom is accepted when it is named 'CA' and its het code is not 'X'.
    """
    source = str(file)
    try:
        struc = atomium.open(source)
    except FileNotFoundError:
        struc = atomium.fetch(source)

    self.coord_dict = {}
    for chain in struc.model.chains():
        ca_locations = []
        for residue in chain:
            for atom in residue.atoms():
                # Guard clause: ignore anything that is not a usable C-alpha.
                if atom.name != 'CA' or atom.het.code == 'X':
                    continue
                ca_locations.append(atom.location)
                record = {
                    'res_id': residue.id,
                    'res': toggle_code(residue.code, '3to1'),
                    'atom_id': atom.id,
                    'coords': atom.location,
                    'chain': chain.id,
                }
                self.atoms.append(record)
        self.coord_dict[chain.internal_id] = np.asarray(ca_locations)

    self.er_dict = {}
    for chain_id in self.coord_dict:
        self.er_dict[chain_id] = eigenrank(self.coord_dict[chain_id])

    # picking first chain
    ordered_ids = sorted(self.coord_dict.keys())
    first_chain = ordered_ids[0]
    self.coordinates = self.coord_dict[first_chain]
    self.l = self.coordinates.shape[0]
    self.er = self.er_dict[first_chain]
def atomium_parse(self, file):
    """Load a structure through atomium and cache its C-alpha data on self.

    ``file`` is first opened as a local path with ``atomium.open``; when that
    raises ``FileNotFoundError`` the same string is passed to
    ``atomium.fetch`` instead.

    Side effects on the instance:
      * ``self.atoms`` gets one dict appended per accepted atom
        (keys: res_id, res, atom_id, coords, chain); ``res_id`` is the
        integer after the last '.' of the residue id,
      * ``self.coord_dict`` maps ``chain.internal_id`` to an array of the
        accepted atom locations,
      * ``self.er_dict`` / ``self.lr_dict`` hold the two values returned by
        ``eigenrank`` for the selected chain only,
      * ``self.coordinates`` and ``self.l`` describe the selected chain;
        ``self.er`` and ``self.lr`` are set only when it has at least 10
        accepted atoms, otherwise a message is printed.

    The selected chain is the one whose id sorts first. An ``IndexError`` is
    raised when the structure yields no chains at all.
    """
    try:
        struc = atomium.open(str(file))
    except FileNotFoundError:
        struc = atomium.fetch(str(file))

    self.coord_dict = {}
    for chain in struc.model.chains():
        coords = []
        for res in chain:
            for atom in res.atoms():
                if atom.name == 'CA' and atom.het.code != 'X':
                    coords.append(atom.location)
                    self.atoms.append({
                        'res_id': int(res.id.split('.')[-1]),
                        'res': toggle_code(res.code, '3to1'),
                        'atom_id': atom.id,
                        'coords': atom.location,
                        'chain': chain.id,
                    })
        self.coord_dict[chain.internal_id] = np.asarray(coords)

    self.er_dict = {}
    self.lr_dict = {}

    # picking first chain
    first_chain = sorted(self.coord_dict.keys())[0]
    self.coordinates = self.coord_dict[first_chain]
    self.l = self.coordinates.shape[0]

    # Compute EigenRank for exactly the chain selected above. Taking the
    # first item of the dict instead would use insertion order, which need
    # not match the sorted order and would then fail the lookups below.
    self.er_dict[first_chain], self.lr_dict[first_chain] = eigenrank(
        self.coordinates)

    if self.l < 10:
        print('{} is too short for a sensible EigenRank ({})'.format(self.id, self.l))
    else:
        self.er = self.er_dict[first_chain]
        self.lr = self.lr_dict[first_chain]
def test_templates(self):
    """Build a site template from the ZN molecule of 1TON and check it.

    The template is expected to hold six atoms in total, three named CA and
    three named CB. It is finally written to ``test.pdb``.
    """
    zinc_molecule = atomium.fetch("1TON").model.molecule(name="ZN")
    template = biometal.create_site_template(zinc_molecule.site())

    every_atom = template.atoms()
    self.assertEqual(len(every_atom), 6)
    for atom_name in ("CA", "CB"):
        matching = template.atoms(name=atom_name)
        self.assertEqual(len(matching), 3)

    template.save("test.pdb")
def download_pdb_chain(self, pdb_chain):
    """
    Fetch a structure via the atomium library and save one chain of it.

    The output file is ``<self.path>/<pdb_chain>.pdb``.

    :params: pdb_chain - ``<structure id>_<chain id>``, without the .pdb
        extension; both parts are upper-cased before use
    """
    target = os.path.join(self.path, pdb_chain) + '.pdb'
    pieces = pdb_chain.split('_')
    struc_id, chain = pieces  # exactly two parts are expected
    fetched = atomium.fetch(struc_id.upper())
    selected_chain = fetched.model.chain(chain.upper())
    selected_chain.save(target)
def pdb(request, id):
    """Return (part of) an atomium data dictionary as indented JSON.

    The non-empty segments of ``request.path`` drive the view: the first
    segment is passed to ``atomium.fetch`` (with ``data_dict=True`` and
    ``file_dict`` set when "file" is among the GET parameters); each
    following segment is used to descend one level into the result, as a key
    for dicts and as an integer index otherwise.

    Segments that cannot be resolved are skipped, so the response is the
    deepest value that could be reached.

    Raises:
        Http404: when the path has no segments or the fetch fails.
    """
    keys = [k for k in request.path.split("/") if k]
    try:
        d = atomium.fetch(
            keys.pop(0), file_dict="file" in request.GET, data_dict=True)
    except Exception:
        # Covers an empty path (IndexError from pop) as well as any fetch
        # failure, without swallowing KeyboardInterrupt/SystemExit.
        raise Http404

    for key in keys:
        try:
            d = d[key] if isinstance(d, dict) else d[int(key)]
        except (KeyError, IndexError, ValueError, TypeError):
            # Best effort: an unknown key, a bad/out-of-range index or a
            # non-subscriptable value leaves the current value untouched.
            continue
    return JsonResponse(d, safe=False, json_dumps_params={"indent": 4})
def FetchProtein(pdb_id, bu, selection, model, use_authid=True):
    """Fetch a structure with atomium and render it through getPDBString.

    Args:
        pdb_id: identifier passed to ``atomium.fetch``.
        bu: "AU" is mapped to -1; ``None`` or "" to 0; any other value is
            converted with ``int`` and decremented by one (presumably a
            1-based biological-unit number turned into a 0-based index;
            confirm against getPDBString).
        selection: comma-separated string that is split into a list;
            ``None`` or "" gives an empty list.
        model: ``None`` or "" is mapped to 0, anything else goes through
            ``int``.
        use_authid: forwarded unchanged to ``getPDBString``.

    Returns:
        Whatever ``getPDBString`` returns for the normalised arguments.
    """
    model = 0 if model is None or model == "" else int(model)

    p = atomium.fetch(pdb_id)

    if bu == "AU":
        bu = -1
    elif bu is None or bu == "":
        # None is treated like the empty string, matching how ``model`` and
        # ``selection`` accept both.
        bu = 0
    else:
        bu = int(bu) - 1

    sel_chains = []
    if selection is not None and selection != "":
        sel_chains = selection.split(",")

    return getPDBString(p, sel_chains, bu, model, use_authid=use_authid)
def structure(file):
    '''
    Open ``file`` with atomium and return it with coordinates and EigenRank.

    The file is opened locally with ``atomium.open``; on
    ``FileNotFoundError`` the same string is passed to ``atomium.fetch``.
    The returned object gets two extra attributes before it is handed to
    ``erpscpy.er.add_eigenrank``:

    * ``id``: the structure's ``code``, or the base name of ``file`` up to
      the first '.' when the code is None,
    * ``coordinates``: dict mapping ``chain.internal_id`` to an array of
      the locations of atoms named 'CA' whose het code is not 'X'.

    (Only one chain is expected in SCOPe, COPS and CATH files.)
    '''
    # Named ``struct`` so the local does not shadow this function's name.
    try:
        struct = atomium.open(str(file))
    except FileNotFoundError:
        struct = atomium.fetch(str(file))

    if struct.code is None:
        struct.id = os.path.basename(file).split('.')[0]
    else:
        struct.id = struct.code

    coord_dict = {}
    for chain in struct.model.chains():
        coords = [
            atom.location
            for res in chain
            for atom in res.atoms()
            if atom.name == 'CA' and atom.het.code != 'X'
        ]
        coord_dict[chain.internal_id] = np.asarray(coords)
    struct.coordinates = coord_dict
    return erpscpy.er.add_eigenrank(struct)
def main(reset=False, log=True, json=True):
    """Build the zinc-site database records from all zinc-containing PDBs.

    For every code returned by ``get_zinc_pdb_codes`` (minus those already
    stored as ``Pdb`` rows unless ``reset`` is true) the structure is
    fetched with atomium, its best metal-containing assembly is chosen, and
    Pdb / Metal / Chain / ZincSite / Residue records are created inside one
    database transaction per code.

    Args:
        reset: when true, codes already present in the database are
            processed again instead of being skipped.
        log: when true, progress is written to the logger from ``get_log``.
        json: when true, the database is finally dumped to
            ``data/zinc.json`` with Django's ``dumpdata`` command.
    """
    from contextlib import redirect_stdout

    # Setup log
    logger = get_log() if log else None

    def note(message):
        # Log only when logging was requested (logger is None otherwise).
        if log:
            logger.info(message)

    # Get all PDBs which contain zinc
    note("Getting PDB codes")
    codes = get_zinc_pdb_codes()
    print(f"There are {len(codes)} PDBs with zinc")
    if not reset:
        checked = [p.id for p in Pdb.objects.all()]
        print(f"{len(checked)} have already been checked")
        already_checked = set(checked)  # O(1) membership tests below
        codes = [code for code in codes if code not in already_checked]

    # Go through each PDB
    mmcif_count = 0
    for code in tqdm(codes):
        with transaction.atomic():
            # Get PDB
            note("Getting PDB {} object from server".format(code))
            try:
                pdb = atomium.fetch(code)
            except ValueError:
                mmcif_count += 1
                note("Couldn't get {}".format(code))
                continue

            # Which assembly should be used?
            note("Getting best assembly")
            model = pdb.generate_best_assembly()
            metals = model.atoms(is_metal=True)
            # Discard the current best assembly and retry until one
            # contains metal atoms.
            # NOTE(review): nothing here guards against running out of
            # assemblies; confirm generate_best_assembly handles that.
            while not metals:
                pdb.assemblies.remove(pdb.best_assembly)
                model = pdb.generate_best_assembly()
                metals = model.atoms(is_metal=True)

            # Save the PDB
            note("Saving PDB to database")
            pdb_record = Pdb.create_from_atomium(pdb)

            # Is the PDB usable?
            note("Checking PDB usable")
            if model_is_skeleton(model):
                zincs = model.atoms(element="ZN")
                note("It isn't - saving metals")
                for zinc in zincs:
                    Metal.create_from_atomium(
                        zinc, pdb_record,
                        omission="No residue side chain information in PDB.")
                continue

            # Are any PDB zincs not in assembly
            note("Looking for unused zinc")
            au_zincs = pdb.model.atoms(element="ZN")
            assembly_zinc_ids = {
                atom.id for atom in model.atoms(element="ZN")
            }
            for zinc in au_zincs:
                if zinc.id not in assembly_zinc_ids:
                    Metal.create_from_atomium(
                        zinc, pdb_record,
                        omission=
                        "Zinc was in asymmetric unit but not biological assembly."
                    )

            # Get zinc clusters
            note("Clustering metals into sites")
            zinc_clusters = cluster_zincs_with_residues(metals)

            # Create chains
            note("Creating chains")
            atomium_chains = {}
            for cluster in zinc_clusters:
                for o in cluster["residues"].union(cluster["metals"]):
                    atomium_chains[o.chain.id] = o.chain
            chains = {
                chain_id: Chain.create_from_atomium(chain, pdb_record)
                for chain_id, chain in atomium_chains.items()
            }

            # Create binding sites
            for index, cluster in enumerate(zinc_clusters, start=1):
                # Does the cluster even have any residues?
                if len(cluster["residues"]) == 0:
                    note("Not creating site - no residues")
                    # Record the metals of *this* cluster; reusing the
                    # ``zinc`` variable left over from an earlier loop would
                    # save the wrong atom (or fail if that loop never ran).
                    for metal in cluster["metals"]:
                        Metal.create_from_atomium(
                            metal, pdb_record,
                            omission="Zinc has no binding residues.")
                    continue

                # Does the cluster have enough liganding atoms?
                atoms = []
                for residue in cluster["residues"]:
                    atoms += [a for a in residue.atoms() if a.liganding]
                if len(atoms) < 3:
                    note("Not creating site - too few liganding atoms")
                    for metal in cluster["metals"]:
                        Metal.create_from_atomium(
                            metal, pdb_record,
                            omission="Zinc has too few liganding atoms.")
                    continue

                # Create site record itself
                note("Creating site")
                site = ZincSite.objects.create(
                    id=f"{pdb_record.id}-{index}",
                    pdb=pdb_record,
                    code=create_site_code(cluster["residues"]),
                    copies=cluster["count"])

                # Create metals
                note("Creating metals")
                for metal in cluster["metals"]:
                    Metal.create_from_atomium(metal, pdb_record, site=site)

                # Create residue records
                note("Creating residues")
                for r in cluster["residues"]:
                    chain = chains[r.chain.id]
                    Residue.create_from_atomium(r, chain, site)

    note("{} mmcif files ignored".format(mmcif_count))
    print("{} mmcif files ignored".format(mmcif_count))

    # JSON?
    if json:
        print("Saving JSON")
        # redirect_stdout restores sys.stdout even if dumpdata raises.
        with open("data/zinc.json", "w") as f, redirect_stdout(f):
            call_command("dumpdata", "--exclude=contenttypes", verbosity=0)
#! /usr/bin/env python3

import atomium

# Structures to scan for zinc atoms.
pdbs = ["1TON", "2CAB", "8TLN", "5CPA", "7ADH"]

# Maps "<pdb code><molecule id>" to the zinc-centred site extracted for it.
sites = {}

for pdb in pdbs:
    print(f"Processing {pdb}...")
    model = atomium.fetch(pdb).model()
    zincs = model.atoms(element="ZN")
    plural = "" if len(zincs) == 1 else "s"
    print(f" Found {len(zincs)} zinc atom" + plural)
    for zinc in zincs:
        id_ = pdb + zinc.molecule().molecule_id()
        site = zinc.molecule().site()
        site.add_atom(zinc)
        # Shift the site so that the zinc atom sits at the origin.
        shift = (-zinc.x(), -zinc.y(), -zinc.z())
        site.translate(*shift)
        sites[id_] = site
        print(f" Extracting {id_}...")

print(f"Saving {len(sites)} zinc sites as PDBs...")
for site, zinc_site in sites.items():
    zinc_site.save(f"{site}.pdb")
def get_cath_domain_distogram(cath_id, cath_seq, return_seq_pair=False):
    """
    Given a CATH ID and seq, return the distogram corresponding to the CATH
    sequence.

    The first four characters of ``cath_id`` are used as the PDB code and
    the fifth as the chain id. Each position of ``cath_seq`` is mapped to a
    residue of that chain, either by exact substring match or, failing
    that, through ``Structure.multi_alignment``. Distances are measured
    between CB atoms (CA for GLY).

    Return:
        if return_seq_pair == False:
            return a np.array with shape [ seq_len, seq_len ]. Missing
            values are denoted with -1; the diagonal is never filled in and
            therefore also stays -1.
        else:
            return np.array, (cath_seq, pdb_seq), where pdb_seq has '?' at
            positions without a mapped residue.

    Example:
        get_cath_domain_distogram("2j43A01", "ASHHLRXHFKTLPAGESLGSLGLWVWGDVDQPSKDWPNGAITXTKAKKDDYGYYLDVPLAAKHRQQVSYLINNKAGENLSKDQHISLLTPKXNEVWIDENY")
    """
    import atomium
    import Structure

    ## Split the CATH id into PDB code and chain id
    code = cath_id[:4]
    chain_id = cath_id[4]

    ## Read the PDB entry
    pdb = atomium.fetch(code)
    chain = pdb.model.chain(chain_id)
    residues = chain.residues()  # fetched once, indexed repeatedly below

    ## Relate the sequence present in the PDB file to the CATH sequence
    present_sequence = "".join(
        threeToOne.get(res.name, "X") for res in residues)
    start = present_sequence.find(cath_seq)
    if start != -1:
        # Exact match: pad the CATH sequence with gaps on both sides.
        end = start + len(cath_seq)
        full_seq = present_sequence
        domain_seq = ("-" * start + cath_seq +
                      "-" * (len(present_sequence) - end))
    else:
        full_seq, domain_seq = Structure.multi_alignment(
            [present_sequence, cath_seq])

    # domain_index[k] is the chain residue index aligned to cath_seq[k],
    # or -1 when that position faces a gap in the PDB sequence.
    domain_index = [-1] * len(cath_seq)
    fi, di = 0, 0
    for f, d in zip(full_seq, domain_seq):
        if f == '-':
            di += 1
        elif d == '-':
            fi += 1
        else:
            domain_index[di] = fi
            di += 1
            fi += 1
    res_list = [residues[idx] if idx != -1 else None for idx in domain_index]

    ## Compute the distogram
    # Look up each residue's representative atom once instead of once per
    # pair: CA for glycine, CB otherwise; None when residue or atom is absent.
    rep_atoms = [
        None if res is None
        else res.atom(name="CA" if res.name == 'GLY' else "CB")
        for res in res_list
    ]
    size = len(res_list)
    distogram = np.full((size, size), -1.0)
    for i, atom1 in enumerate(rep_atoms):
        if atom1 is None:
            continue
        for j in range(i + 1, size):
            atom2 = rep_atoms[j]
            if atom2 is None:
                continue
            distogram[i, j] = distogram[j, i] = atom1.distance_to(atom2)

    if return_seq_pair:
        pdb_seq = "".join(
            threeToOne.get(res.name, 'X') if res is not None else '?'
            for res in res_list)
        return distogram, (cath_seq, pdb_seq)
    else:
        return distogram
# Get all codes
# NOTE(review): this looks like the legacy RCSB REST endpoint - confirm it
# is still being served before relying on this script.
response = requests.get("https://www.rcsb.org/pdb/rest/getCurrent")
codes = [child.attrib["structureId"] for child in ET.fromstring(response.text)]
print(f"There are {len(codes)} codes")

# Go through them
print(f"Processing a random {SUBSET} of them...")
shuffle(codes)
sub_codes = codes[:SUBSET]
results = {}
for code in tqdm(sub_codes):
    # For each file format store either str(model) or the error message.
    results[code] = {}
    for ext in ("cif", "mmtf", "pdb"):
        try:
            pdb = atomium.fetch(f"{code}.{ext}")
            results[code][ext] = str(pdb.model)
        except Exception as e:
            results[code][ext] = str(e)
    # Successful parses are recognised by their "<...>" representation.
    # startswith() is used because an exception may carry an empty message,
    # for which indexing [0] would raise IndexError.
    models = [
        results[code][ext] for ext in ("cif", "mmtf", "pdb")
        if results[code][ext].startswith("<")
    ]
    # The formats "match" when all successful parses gave the same string.
    results[code]["match"] = len(set(models)) == 1

no_pdb = [
    c for c in sub_codes if "could not find" in results[c]["pdb"].lower()
]
print(f"{len(no_pdb)} codes had no .pdb representation:")
print((" ".join(no_pdb) + "\n") if no_pdb else "")
import sys
sys.path.insert(0, ".")  # must precede the atomium import below
import atomium
from random import shuffle
from big import get_all_codes

# Parse every code as .pdb and .cif (in random order) and check that both
# formats yield the same number of chains, residues, ligands, atoms and
# assemblies. Codes without a .pdb file are only parsed as .cif.
print("Getting PDB codes...")
codes = get_all_codes()
print("There are {} codes.".format(len(codes)))

print("Parsing...")
shuffle(codes)
for code in codes:
    print("\tParsing {}.pdb...".format(code))
    pdb = None
    try:
        pdb = atomium.fetch(code + ".pdb")
    except ValueError:
        print(" Doesn't exist.")

    print("\tParsing {}.cif...".format(code))
    cif = atomium.fetch(code + ".cif")

    if pdb:
        for accessor in ("chains", "residues", "ligands", "atoms"):
            from_pdb = getattr(pdb.model, accessor)()
            from_cif = getattr(cif.model, accessor)()
            assert len(from_pdb) == len(from_cif)
        assert len(pdb.assemblies) == len(cif.assemblies)
    print()
import sys
sys.path.insert(0, ".")  # must precede the atomium import below
import atomium
from random import shuffle
from big import get_all_codes

# Fetch every code in random order and report how many models each has.
print("Getting PDB codes...")
codes = get_all_codes()
print("There are {} codes.".format(len(codes)))

print("Parsing...")
shuffle(codes)
for code in codes:
    print("\tParsing {}...".format(code))
    pdb = atomium.fetch(code)
    model_count = len(pdb.models())
    suffix = "s" if model_count != 1 else ""
    print("\tSuccess ({} model{}).\n".format(model_count, suffix))