def read_mmCIF_file(structure_id, filename, hetatm=False, water=False):
    """
    Parse an mmCIF file and build a Structure instance from it.

    Argument:
        *structure_id*
            identifier handed to the Biopython parser for the structure.
        *filename*
            name of the mmCIF file.
        *hetatm*
            Boolean forwarded to the TEMPy converter (hetero atoms).
            Default and recommended is False.
        *water*
            Boolean forwarded to the TEMPy converter (water atoms).
            Default and recommended is False.

    Return:
        Structure Instance
    """
    from .Bio.PDB import MMCIFParser as MMCIFParserBiopy

    # The Biopython parser is created with its default settings.
    biopy_structure = MMCIFParserBiopy().get_structure(structure_id, filename)
    return mmCIFParser._biommCIF_strcuture_to_TEMpy(
        filename, biopy_structure, hetatm, water)
def fetch_mmCIF(structure_id, filename, hetatm=False, water=False):
    """
    Download an mmCIF file and build a Structure instance from it.

    The file is fetched from www.rcsb.org and written to *filename* before
    being parsed.

    Argument:
        *structure_id*
            structure_id code of the mmCIF file; also used to build the URL.
        *filename*
            name of the local file the download is written to.
        *hetatm*
            Boolean forwarded to the TEMPy converter (hetero atoms).
        *water*
            Boolean forwarded to the TEMPy converter (water atoms).
            Default and recommended is False.

    Return:
        Structure Instance
    """
    from Bio.PDB import MMCIFParser as MMCIFParserBiopy
    # urlretrieve lives in urllib.request on Python 3 and in urllib on
    # Python 2; support both so the function works on either interpreter.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    url = 'http://www.rcsb.org/pdb/files/%s.cif' % structure_id
    urlretrieve(url, filename)
    structure = MMCIFParserBiopy().get_structure(structure_id, filename)
    return mmCIFParser._biommCIF_strcuture_to_TEMpy(
        filename, structure, hetatm, water)
def test_dssp_with_mmcif_file_and_nonstandard_residues(self):
    """DSSP built from the 1AS5 mmCIF file (non-standard residues) has 24 entries."""
    cif_path = "PDB/1AS5.cif"
    structure = MMCIFParser().get_structure("1AS5", cif_path)
    first_model = structure[0]
    dssp = DSSP(first_model, cif_path)
    self.assertEqual(len(dssp), 24)
def structure_scanning(pdb, ligname, graph, model, edge_map, embed_dim):
    """
    Predict a fingerprint for every candidate residue of a structure.

    For each residue of the first model whose name is A, U, C, G or
    `ligname`:
        - build the pocket graph around that residue,
        - convert it to a DGL graph,
        - run `model` on it and threshold the fingerprint output at 0.5.

    :returns: `residue_preds` dictionary keyed by (chain id, residue number)
              with the thresholded fingerprint prediction as value.
    """
    from data_processor.build_dataset import get_pocket_graph

    first_model = MMCIFParser(QUIET=True).get_structure("", pdb)[0]
    accepted_names = ('A', 'U', 'C', 'G', ligname)
    residue_preds = {}

    # Materialise the residues before handing them to the progress bar.
    for residue in tqdm(list(first_model.get_residues())):
        if residue.resname not in accepted_names:
            continue
        chain_id = residue.get_parent().id
        position = residue.id[1]
        res_info = ":".join(["_", chain_id, residue.resname, str(position)])
        pocket_graph = get_pocket_graph(pdb, res_info, graph)
        _, dgl_graph = nx_to_dgl(pocket_graph, edge_map, embed_dim)
        _, fp_pred = model(dgl_graph)
        residue_preds[(chain_id, position)] = fp_pred.detach().numpy() > 0.5
    return residue_preds
def test_dssp_with_mmcif_file(self):
    """DSSP built from the 2BEG mmCIF file has 130 entries."""
    cif_path = "PDB/2BEG.cif"
    structure = MMCIFParser().get_structure("2BEG", cif_path)
    first_model = structure[0]
    dssp = DSSP(first_model, cif_path)
    self.assertEqual(len(dssp), 130)
def load_test_structure(self, pdb_code):
    """
    Parse `<pdb_code>.cif` from the test structure directory.

    Returns a pair: the parsed structure and the object produced by
    `BiopythonToMmcifResidueIds.create` from the parser's mmCIF dictionary.
    """
    parser = MMCIFParser(QUIET=True)
    cif_path = self.TEST_STRUCTURE_DIR / f'{pdb_code}.cif'
    structure = parser.get_structure(pdb_code, cif_path)
    # Reuse the dictionary the parser already built instead of re-reading the file.
    residue_ids = BiopythonToMmcifResidueIds.create(parser._mmcif_dict)
    return structure, residue_ids
def cifReader(self, name, file):
    """
    Parse an mmCIF file with MMCIFParser and return the resulting structure.

    :param name: identifier passed to the parser as the structure id.
    :param file: mmCIF file handed to the parser.
    :return: the parsed structure, or None if anything went wrong while
        parsing (a message is printed in that case).
    """
    try:
        return MMCIFParser().get_structure(name, file)
    except Exception:
        # Best-effort reader: report the failure and return None.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        print("Something went wrong: File not found")
        return None
def __get_structure__(self, file_path):
    """
    Parse the structure file at `file_path`.

    The file's base name without extension is used as the structure id.
    MMCIFParser is used when the extension contains 'cif', PDBParser otherwise.
    """
    stem, extension = os.path.splitext(os.path.basename(file_path))
    parser_cls = MMCIFParser if 'cif' in extension else PDBParser
    return parser_cls().get_structure(stem, file_path)
def load_pdb(path):
    """Parse the mmCIF file at `path` and return it as 'structure 1'."""
    # For legacy PDB files, PDBParser(PERMISSIVE=1) would be used instead
    # of the mmCIF parser below.
    return MMCIFParser().get_structure('structure 1', path)
def setUpClass(self):
    """Create the shared writer, parsers, example structure and file paths."""
    self.io = MMCIFIO()
    self.mmcif_parser = MMCIFParser()
    self.pdb_parser = PDBParser()

    # Silence PDB construction warnings only while the example file is parsed.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", PDBConstructionWarning)
        example = self.pdb_parser.get_structure("example", "PDB/1A8O.pdb")
        self.structure = example

    self.mmcif_file = "PDB/1A8O.cif"
    self.mmcif_multimodel_pdb_file = "PDB/1SSU_mod.pdb"
    self.mmcif_multimodel_mmcif_file = "PDB/1SSU_mod.cif"
def creator(parser=parser):
    """
    Parse `dst` with the given parser, falling back to mmCIF on ValueError.

    The fallback is only attempted when PARSE_CIF is truthy; otherwise the
    ValueError is re-raised. `self.freemem()` runs in every case.
    """
    try:
        ret = parser.get_structure(pid, file=dst)
    except ValueError as e:
        if not PARSE_CIF:
            raise e
        # assume it's a .cif
        ret = MMCIFParser(QUIET=True).get_structure(pid, dst)
    finally:
        self.freemem()
    return ret
def test_cealigner_nucleic(self):
    """Aligning 1LCD onto itself gives an RMS of 0 (3 decimal places)."""
    cif_path = "PDB/1LCD.cif"
    parser = MMCIFParser(QUIET=1)
    reference = parser.get_structure("1lcd_ref", cif_path)
    mobile = parser.get_structure("1lcd_mob", cif_path)

    aligner = CEAligner()
    aligner.set_reference(reference)
    aligner.align(mobile)

    self.assertAlmostEqual(aligner.rms, 0.0, places=3)
def print_input_file(structure_file, ss2_file=None):
    """
    Print sequence, secondary structure and coordinates of a structure file.

    Output, in order:
        1. the one-letter sequence of the non-hetero residues of model 0,
        2. the third column of the non-comment lines of `ss2_file` when it is
           given, otherwise a string of "C" of the same length,
        3. one line per residue with the N, CA, C and centroid coordinates
           rounded to 3 decimals.

    Files whose extension is "cif" or "mmcif" are read with MMCIFParser,
    everything else with PDBParser.
    """
    extension = os.path.basename(structure_file).rsplit(".", 1)[-1].lower()
    if extension in ("cif", "mmcif"):
        from Bio.PDB import MMCIFParser
        parser = MMCIFParser()
    else:
        from Bio.PDB import PDBParser
        parser = PDBParser()
    struc = parser.get_structure("", structure_file)

    backbone_names = ("N", "CA", "C", "O")
    seq_letters = []
    coords = []
    for chain in struc[0]:
        for res in chain:
            # Skip hetero and water residues
            if res.id[0] != " ":
                continue
            seq_letters.append(three_to_one_aas[res.get_resname()])
            if res.get_resname() == "GLY":
                # Glycine: place a fake centroid at unit distance from the
                # CA atom along the direction (CA - C) + (CA - N).
                ca = res["CA"].get_coord()
                d = ca - res["C"].get_coord() + ca - res["N"].get_coord()
                coord_cent = ca + d / np.linalg.norm(d)
            else:
                # Centroid of the non-backbone, non-hydrogen atoms
                side_chain = [
                    atom.get_coord()
                    for atom in res
                    if atom.get_name() not in backbone_names
                    and atom.element != "H"
                ]
                coord_cent = np.array(side_chain).mean(0)
            coords.append([
                res["N"].get_coord(),
                res["CA"].get_coord(),
                res["C"].get_coord(),
                coord_cent,
            ])
    seq = "".join(seq_letters)
    print(seq)

    if ss2_file:
        # Take the third whitespace-separated field of every non-empty,
        # non-comment line of the ss2 file.
        with open(ss2_file) as f:
            ss_pred = "".join(
                line.split()[2]
                for line in f
                if len(line.rstrip()) > 0 and not line.startswith("#")
            )
        assert len(seq) == len(ss_pred), f"Sequence length is {len(seq)} but SS prediction length is {len(ss_pred)}"
        print(ss_pred)
    else:
        print("C" * len(seq))

    def coord_str(coord):
        return " ".join([str(round(c, 3)) for c in coord])

    for quad in coords:
        print(" ".join(coord_str(point) for point in quad))
def get_convert_cifs(url, cif_path, pdb_path):
    """
    Download an mmCIF file from `url` and save it again in PDB format.

    The download is written to `cif_path`; the converted structure is
    written to `pdb_path`. URL/HTTP errors and TypeError during conversion
    are reported with a printed message instead of being raised.
    """
    try:
        url_req.urlretrieve(url, cif_path)
    except (url_err.URLError, url_err.HTTPError):
        print("!!!HTTP or URL error, couldn't get " + url + '.')
        return

    try:
        structure = MMCIFParser().get_structure('', cif_path)
        writer = PDBIO()
        writer.set_structure(structure)
        writer.save(pdb_path)
        print('^^^SUCCESSFULLY CONVERTED CIF TO PDB')
    except TypeError:
        print('Problem making pdb file')
def scanning_analyze():
    """
    Visualize results of scanning on PDB.

    For every annotated pocket file (named like
    `1fmn_#0.1:A:FMN:36.nx_annot.p`) the structure is scanned with
    `structure_scanning`, and for each predicted residue two values are
    collected: the distance between the residue centre and the ligand centre,
    and `mse` between the ligand's true fingerprint and the prediction.
    The two lists are then plotted.

    Structures whose graph has more than 100 nodes are skipped, as are
    structures for which scanning raises an exception (the error is printed).
    """
    from data_processor.build_dataset import find_residue, lig_center

    model, edge_map, embed_dim = load_model('small_no_rec_2',
                                            '../data/annotated/pockets_nx')
    for f in os.listdir("../data/annotated/pockets_nx"):
        pdbid = f.split("_")[0]
        _, chain, ligname, pos = f.replace(".nx_annot.p", "").split(":")
        pos = int(pos)
        print(chain, ligname, pos)

        # NOTE: pickle.load executes arbitrary code from the file; these
        # data files must come from a trusted source.
        # Files are opened with a context manager so handles are closed.
        with open(f'../data/RNA_Graphs/{pdbid}.pickle', 'rb') as graph_file:
            graph = pickle.load(graph_file)
        if len(graph.nodes()) > 100:
            continue

        cif_path = f'../data/all_rna_prot_lig_2019/{pdbid}.cif'
        try:
            fp_preds = structure_scanning(cif_path, ligname, graph, model,
                                          edge_map, embed_dim)
        except Exception as e:
            print(e)
            continue

        parser = MMCIFParser(QUIET=True)
        structure = parser.get_structure(
            "", f"../data/all_rna_prot_lig_2019/{pdbid}.cif")[0]
        lig_res = find_residue(structure[chain], pos)
        lig_c = lig_center(lig_res.get_atoms())

        with open("../data/all_ligs_maccs.p", 'rb') as fp_file:
            fp_dict = pickle.load(fp_file)
        true_fp = fp_dict[ligname]

        # NOTE(review): the result of get_decoys() was never used; the call
        # is kept in case it has side effects -- confirm and remove.
        get_decoys()

        dists = []
        jaccards = []
        for (res_chain, res_pos), fp in fp_preds.items():
            r = find_residue(structure[res_chain], res_pos)
            r_center = lig_center(r.get_atoms())
            dists.append(euclidean(r_center, lig_c))
            jaccards.append(mse(true_fp, fp))

        plt.title(f)
        # NOTE(review): confirm that `plt` provides `distplot`.
        plt.distplot(dists, jaccards)
        plt.xlabel("dist to binding site")
        plt.ylabel("dist to fp")
        plt.show()
def setUpClass(cls):
    """
    Detect a DSSP executable and prepare the parsers used by the tests.

    `dssp --version` is tried first; if that exits with an error, `dssp -h`
    is tried; if `dssp` cannot be run at all, `mkdssp --version` is tried.
    The whole test class is skipped when none of these succeeds.
    `cls.dssp_version` stays "0.0.0" unless a version string was parsed.
    """
    cls.dssp_version = "0.0.0"
    dssp_found = False
    silent = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}

    def detect_version(executable):
        # Run "<executable> --version" and extract the first run of
        # digits and dots from its output.
        output = subprocess.check_output([executable, "--version"],
                                         universal_newlines=True)
        return re.search(r"\s*([\d.]+)", output).group(1)

    try:
        try:
            # Newer versions of DSSP
            cls.dssp_version = detect_version("dssp")
            dssp_found = True
        except subprocess.CalledProcessError:
            # Older versions of DSSP
            subprocess.check_call(["dssp", "-h"], **silent)
            dssp_found = True
    except OSError:
        try:
            cls.dssp_version = detect_version("mkdssp")
            dssp_found = True
        except OSError:
            pass

    if not dssp_found:
        raise unittest.SkipTest(
            "Install dssp if you want to use it from Biopython.")

    cls.pdbparser = PDBParser()
    cls.cifparser = MMCIFParser()
def parse_structure(path):
    """
    Parse a structure file with Biopython's PDB or mmCIF parser.

    The parser is chosen from the file extension: 'pdb' and 'ent' use
    PDBParser, 'cif' uses MMCIFParser; any other extension raises IOError.

    Returns a tuple: the result of `validate_structure` on the parsed
    structure, the number of distinct chain ids, and the number of residues.
    Parsing errors are logged and re-raised wrapped in a plain Exception.
    """
    # setup logging
    logger = logging.getLogger('Prodigy')
    logger.info('[+] Reading structure file: {0}'.format(path))

    # Split "name.ext" on the last dot; a name without a dot yields an
    # empty structure name and the whole file name as "extension".
    fname = os.path.basename(path)
    sname, _, s_ext = fname.rpartition('.')

    pdb_like = {'pdb', 'ent'}
    if s_ext not in pdb_like | {'cif'}:
        raise IOError(
            '[!] Structure format \'{0}\' is not supported. Use \'.pdb\' or \'.cif\'.'
            .format(s_ext))

    if s_ext in pdb_like:
        sparser = PDBParser(QUIET=1)
    else:
        sparser = MMCIFParser()

    try:
        s = sparser.get_structure(sname, path)
    except Exception as err:
        logger.error('[!] Structure \'{0}\' could not be parsed'.format(sname))
        raise Exception(err)

    # Validation runs first; the counts are taken from the structure
    # afterwards, in the same order as before.
    validated = validate_structure(s)
    n_chains = len({c.id for c in s.get_chains()})
    n_residues = len(list(s.get_residues()))
    return (validated, n_chains, n_residues)
def __init__(self, structure, name='structure', path='.'):
    """
    Wrap a structure given either as a file name or as an Entity.

    :param structure: file name ending in .pdb/.ent (PDB format) or .cif
        (mmCIF format), or an Entity instance that is used as-is.
    :param name: stored as `self.name`; also the id given to the parser.
    :param path: directory joined with `structure` when it is a file name.
    :raises ValueError: for an unknown file extension or argument type.
    """
    if isinstance(structure, str):
        file_type = str(structure).rpartition('.')[2].lower()
        if file_type in ('pdb', 'ent'):
            # load a PDB file
            parser = PDBParser(PERMISSIVE=1, QUIET=True)
        elif file_type == 'cif':
            # load MMCIF file
            parser = MMCIFParser(QUIET=True)
        else:
            raise ValueError(
                "Unknown filetype for structure file name: {}".format(
                    structure))
        self.structure = parser.get_structure(
            name, os.path.join(path, structure))
    elif isinstance(structure, Entity):
        # use structure as-is
        self.structure = structure
    else:
        raise ValueError(
            "Unknown type for input argument 'structure': {}".format(
                str(structure)))

    # properties
    self.name = name

    # cachable properties
    self.cache = {}
    self._atom_KDTree = None
    self._atom_list = None
    self._surface_residues = None
def _biommCIF_strcuture_to_TEMpy(filename, structure, hetatm=False, water=False):
    """
    PRIVATE FUNCTION to convert a Biopython structure to a Structure Instance.

    filename = name of mmCIF file
    structure = Biopython structure to convert. When None, `filename` is
                parsed here instead (structure id = file name without its
                last four characters).
    hetatm = Boolean representing whether to add hetatm to the structure.
             Default and recommended is False.
    water = Boolean representing whether to add water to the structure.
            Default and recommended is False.

    Previously the `structure` argument was ignored and the file was always
    parsed a second time; the supplied structure is now used directly.
    """
    if structure is None:
        # Imported if and when the fallback parse is needed.
        from Bio.PDB import MMCIFParser as MMCIFParserBiopy
        cif_code = filename.split("/")[-1]
        structure_id = "%s" % cif_code[:-4]
        structure = MMCIFParserBiopy().get_structure(structure_id, filename)

    atomList = []
    hetatomList = []
    wateratomList = []
    for res in structure.get_residues():
        # First character of the residue's hetero field selects the bucket:
        # "H" -> hetero atoms, "W" -> water atoms, anything else -> regular.
        hetfield = res.get_id()[0]
        if hetfield[0] == "H":
            bucket = hetatomList
        elif hetfield[0] == "W":
            bucket = wateratomList
        else:
            bucket = atomList
        # Each atom is wrapped exactly once (the old code built every
        # wrapper twice and discarded the first).
        bucket.extend(BioPyAtom(atom) for atom in res)

    # NOTE(review): `append` is defined elsewhere in the module; presumably
    # numpy.append -- confirm.
    if hetatm:
        atomList = append(atomList, hetatomList)
    if water:
        atomList = append(atomList, wateratomList)
    return BioPy_Structure(atomList, filename=filename, header='', footer='')
def Extract_coordinates_from_PDB(self, PDB_file, type):
    '''
    Return the alpha carbon coordinates of the standard amino acids in a
    structure file.

    The file is first read with PDBParser; if that raises, it is read with
    MMCIFParser instead.

    `type` selects the grouping of the result:
        'models' -- one list of CA coordinates per model,
        'chains' -- one list of CA coordinates per chain (over all models),
        'all'    -- a single flat list over all chains.
    Any other value returns None.
    '''
    from Bio.PDB.PDBParser import PDBParser
    from Bio.PDB import MMCIFParser

    Name = ntpath.basename(PDB_file).split('.')[0]
    try:
        # Use the parser imported above rather than a module-level `PDB`.
        structure = PDBParser().get_structure(Name, PDB_file)
    except Exception:
        # Not readable as PDB: fall back to mmCIF. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit through.
        structure = MMCIFParser().get_structure(Name, PDB_file)

    def ca_coordinates(residues):
        # CA coordinates of the standard amino acids among `residues`.
        return [
            residue['CA'].get_coord()
            for residue in residues
            if is_aa(residue.get_resname(), standard=True)
        ]

    if type == 'models':
        return [
            ca_coordinates(residue for chain in model for residue in chain)
            for model in structure
        ]
    if type == 'chains':
        return [
            ca_coordinates(chain)
            for model in structure
            for chain in model
        ]
    if type == 'all':
        return ca_coordinates(
            residue for chain in structure.get_chains() for residue in chain)
    return None
def call_mmcif(f):
    '''
    Call function for mmcif files.

    Parses the mmCIF file `f` (optionally gzip-compressed) and passes the
    structure on to a new MMTFEncoder.

    Returns a tuple (name, mmtf_encoder), where name is the upper-cased part
    of the file name before its first dot. Returns None when ".cif" does not
    occur in `f`.
    '''
    if ".cif" not in f:
        return None

    name = f.split('/')[-1].split('.')[0].upper()
    # Open gz files in text mode; the handle is closed once parsing is done.
    handle = gzip.open(f, 'rt') if ".gz" in f else None
    try:
        source = f if handle is None else handle
        structure = MMCIFParser().get_structure(name, source)
    finally:
        if handle is not None:
            handle.close()

    mmtf_encoder = MMTFEncoder()
    pass_data_on(input_data=structure,
                 input_function=biopythonInputFunction,
                 output_data=mmtf_encoder)
    return (name, mmtf_encoder)
def Write_PDB(self, initialPDB, Rotation, Translation, N):
    '''
    Transform by rotating and translating the atom coordinates from the
    original PDB file and rewrite it.

    The file is first read with PDBParser; if that raises, it is read with
    MMCIFParser instead. The transformed structure is saved in PDB format
    as "<N>_<basename of initialPDB>" in the current working directory.
    '''
    from Bio.PDB.PDBParser import PDBParser
    from Bio.PDB import MMCIFParser, PDBIO

    file_name = ntpath.basename(initialPDB)
    Name = file_name.split('.')[0]
    try:
        # Use the parser imported above rather than a module-level `PDB`.
        structure = PDBParser().get_structure(Name, initialPDB)
    except Exception:
        # Not readable as PDB: fall back to mmCIF. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit through.
        structure = MMCIFParser().get_structure(Name, initialPDB)

    for atom in structure.get_atoms():
        atom.transform(Rotation, Translation)

    io = PDBIO()
    io.set_structure(structure)
    io.save("{}_{}".format(N, file_name))
def StructureParser(pdbfile):
    """
    Return a quiet Biopython parser matching the extension of `pdbfile`.

    '.pdb' gives a PDBParser and '.cif' gives an MMCIFParser. For any other
    name an error message is printed and the process exits with status 1.
    """
    # sys.exit is used instead of the interactive `exit` helper, which is
    # only installed by the `site` module.
    import sys

    if pdbfile.endswith('.pdb'):
        return PDBParser(QUIET=True)
    if pdbfile.endswith('.cif'):
        return MMCIFParser(QUIET=True)
    # The function-call form of print works on both Python 2 and Python 3.
    print('ERROR: a protein structure file shall end with either .pdb or .cif')
    sys.exit(1)
def test_cealigner_no_transform(self):
    """Aligning 7CFN on 6WQA with transform=False leaves 7CFN's coordinates untouched."""
    parser = MMCIFParser(QUIET=1)
    reference = parser.get_structure("6wqa", "PDB/6WQA.cif")
    mobile = parser.get_structure("7cfn", "PDB/7CFN.cif")

    def snapshot():
        # Copy every atom coordinate of the mobile structure into plain lists.
        return [list(atom.coord) for atom in mobile.get_atoms()]

    coords_before = snapshot()

    aligner = CEAligner()
    aligner.set_reference(reference)
    aligner.align(mobile, transform=False)

    coords_after = snapshot()

    self.assertAlmostEqual(aligner.rms, 3.83, places=2)
    self.assertEqual(coords_before, coords_after)
def set_parser(protein_file):
    '''
    Return a new parser for the protein file's format: MMCIFParser when
    `is_cif(protein_file)` is true, PDBParser otherwise.
    '''
    parser_cls = MMCIFParser if is_cif(protein_file) else PDBParser
    return parser_cls()
def is_holo(ordinal, s: Dict):
    """
    Run the IsHolo analyzer on one chain of a structure.

    `s` provides the keys 'pdb_code', 'path' and 'chain_id'. The first model
    of the mmCIF file at s['path'] is parsed with warnings silenced, and the
    analyzer is called with that model and the selected chain.
    `ordinal` is only used in the log message.
    """
    logger.info(f'processing {ordinal}-th structure {s["pdb_code"]}')

    is_holo_analyzer = IsHolo()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        structure = MMCIFParser().get_structure(s['pdb_code'], s['path'])
        model = structure[0]

    chain = model[s['chain_id']]
    return is_holo_analyzer(model, chain)
def setUpClass(cls):
    """Skip the test class unless `msms` can be run, then create the parsers."""
    # Check if MSMS is installed: running it raises OSError when it is not.
    try:
        subprocess.check_output(["msms", "-h"],
                                universal_newlines=True,
                                stderr=subprocess.STDOUT)
    except OSError:
        raise unittest.SkipTest(
            "Install MSMS if you want to use it from Biopython.")

    cls.pdbparser = PDBParser()
    cls.cifparser = MMCIFParser()
def get_parser(file_):
    """
    Get a parser appropriate to the file format.

    The format is decided by the file extension: ".pdb" gives a PDBParser
    and ".cif" gives an MMCIFParser. Only the parser that is actually
    needed is instantiated.

    Raises ValueError for any other extension.
    """
    try:
        # Try to get the file extension to determine the file format.
        _, ext = os.path.splitext(file_)
    except ValueError:
        raise ValueError("Cannot obtain extension of file {}".format(file_))

    # Use a parser appropriate for the file format.
    if ext == ".pdb":
        return PDBParser()
    if ext == ".cif":
        return MMCIFParser()
    raise ValueError("Unknown molecular file format: {}".format(ext))
def process_structrure(pdb_file_chains, save_dir):
    """
    Extract CA traces of selected chains from an mmCIF file into .npz files.

    `pdb_file_chains` is a pair (path of the mmCIF file, iterable of chain
    ids). For every requested chain present in the first model, the encoded
    sequence and the CA coordinates are saved (compressed) to
    `<save_dir>/<PROT>-<chain id>.npz`, where PROT is the upper-cased part
    of the file name before its first dot.

    Nothing is written if the file cannot be parsed. A requested chain that
    is missing from the model is skipped; previously it aborted the whole
    loop, so which other chains were saved depended on set iteration order.
    """
    pdb_file, chain_ids = pdb_file_chains
    prot = os.path.split(pdb_file)[-1].split(".")[0].upper()
    parser = MMCIFParser()
    try:
        model = parser.get_structure(None, pdb_file)[0]
    except PDB.PDBExceptions.PDBConstructionException:
        return

    for c_id in set(chain_ids):
        try:
            chain = model[c_id]
        except KeyError:
            continue

        seq = []
        coords = []
        for residue in chain.get_unpacked_list():
            if "CA" not in residue:
                continue
            xyz = residue["CA"].get_coord()
            # Ignore residue if too close to the previous one.
            if coords and np.allclose(coords[-1], xyz):
                continue
            # Residue names missing from _aa3to1_dict map to "-", and
            # letters missing from aa_codes map to 0.
            aa_c = aa_codes.get(
                _aa3to1_dict.get(residue.get_resname(), "-"), 0)
            seq.append(aa_c)
            coords.append(xyz)

        if seq:
            npz_filename = os.path.join(save_dir, f"{prot}-{chain.id}.npz")
            np.savez_compressed(npz_filename, seq=seq, coords=coords)
            print(f"{npz_filename} saved!")
def add_struc_path(self, struc_path):
    """
    Attach a single-chain structure file and derive its sequence.

    Sets `self.struc_path`, `self.seq_ix_mapping` (1-based running index ->
    residue number from the file) and `self.struc_seq` (a SeqRecord).

    Raises IOError when the extension is neither .pdb nor .cif, or when the
    structure does not contain exactly one chain.
    """
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.PDB import PDBParser, MMCIFParser
    from Bio.SeqUtils import seq1

    self.struc_path = struc_path
    extension = ntpath.splitext(self.struc_path)[1]
    if extension == ".pdb":
        parser = PDBParser()
    elif extension == ".cif":
        parser = MMCIFParser()
    else:
        raise IOError(
            "Unrecognized structure file type! Please use .pdb or .cif files!"
        )
    structure = parser.get_structure("none", self.struc_path)

    chains = list(structure.get_chains())
    if len(chains) != 1:
        raise IOError(
            "When using structure files, they need to have a single chain!"
        )

    residues = list(chains[0].get_residues())
    resnames = []
    seq_ix_mapping = {}
    for resi in residues:
        resi_id = resi.get_id()
        # Keep only residues whose third id field starts with a blank and
        # whose first id field is neither "H_..." nor "W...".
        if not re.match(r' ', resi_id[2]):
            continue
        if re.match(r'^H_', resi_id[0]):
            continue
        if re.match(r'W', resi_id[0]):
            continue
        resnames.append(resi.get_resname().replace(' ', ''))
        seq_ix_mapping[len(seq_ix_mapping) + 1] = int(resi.get_id()[1])
    sequence = "".join(resnames)

    # NOTE(review): this indexes the residue list by the residue number of
    # the first kept residue, not by its list position -- confirm intent.
    probe_name = residues[seq_ix_mapping[1]].get_resname().replace(' ', '')
    if len(seq1(probe_name)) != 0:
        sequence = seq1(sequence)

    self.seq_ix_mapping = seq_ix_mapping
    self.struc_seq = SeqRecord(Seq(sequence))