from os.path import basename, splitext

from Bio.PDB import PDBParser, FastMMCIFParser
# ProdigyLig, extract_electrostatics, and _parse_arguments are defined
# elsewhere in this module.


def main():
    """Run it."""
    args = _parse_arguments()
    fname, s_ext = splitext(basename(args.input_file))

    parser = None
    if s_ext in {'.pdb', '.ent'}:
        parser = PDBParser(QUIET=1)
    elif s_ext == '.cif':
        parser = FastMMCIFParser(QUIET=1)
    if parser is None:
        raise ValueError('Unsupported file extension: {0}'.format(s_ext))

    with open(args.input_file) as in_file:
        # try to set electrostatics from the input file if not provided by the user
        electrostatics = args.electrostatics \
            if args.electrostatics or s_ext == '.cif' \
            else extract_electrostatics(in_file)
        prodigy_lig = ProdigyLig(
            parser.get_structure(fname, in_file),
            chains=args.chains,
            electrostatics=electrostatics,
            cutoff=args.distance_cutoff
        )

    prodigy_lig.predict()
    prodigy_lig.print_prediction('', args.verbose)

    if args.output_file is not None:
        output_file_name = splitext(prodigy_lig.structure.id)[0]
        output_file_name += "-processed.pdb"
        prodigy_lig.print_structure(output_file_name)
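A standard entry-point guard (not shown in the original snippet) so the module can be run as a script:

if __name__ == "__main__":
    main()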
# InvalidParameterError and ResourceError are this project's own exception types.
@classmethod
def from_file(cls, filename, file_format="pdb"):
    """
    Initialize structure from PDB/mmCIF file

    Parameters
    ----------
    filename : str
        Path of file
    file_format : {"pdb", "cif"}, optional (default: "pdb")
        Format of structure (old PDB format or mmCIF)

    Returns
    -------
    ClassicPDB
        Initialized PDB structure
    """
    try:
        if file_format == "pdb":
            from Bio.PDB import PDBParser
            parser = PDBParser(QUIET=True)
        elif file_format == "cif":
            from Bio.PDB import FastMMCIFParser
            parser = FastMMCIFParser(QUIET=True)
        else:
            raise InvalidParameterError(
                "Invalid file_format, valid options are: pdb, cif"
            )

        structure = parser.get_structure("", filename)
        return cls(structure)
    except FileNotFoundError as e:
        raise ResourceError(
            "Could not find file {}".format(filename)
        ) from e
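A minimal usage sketch, assuming the enclosing class is ClassicPDB (as the docstring suggests); the file path is a placeholder:

structure = ClassicPDB.from_file("structures/1abc.cif", file_format="cif")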
import gzip

from Bio.PDB import FastMMCIFParser
# MMTFEncoder, pass_data_on, and biopythonInputFunction come from the
# mmtf-python package and the surrounding project, respectively.


def call_fast_mmcif(f):
    '''
    Call function for mmCIF files (using the fast parser)
    '''
    if ".cif" in f:
        name = f.split('/')[-1].split('.')[0].upper()
        # Open gzipped files in text mode
        if ".gz" in f:
            f = gzip.open(f, 'rt')
        parser = FastMMCIFParser()
        structure = parser.get_structure(name, f)
        mmtf_encoder = MMTFEncoder()
        pass_data_on(input_data=structure,
                     input_function=biopythonInputFunction,
                     output_data=mmtf_encoder)
        return (name, mmtf_encoder)
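A hypothetical call; the path is a placeholder, and the function returns the uppercased entry name alongside the populated encoder (it falls through and returns None for non-.cif paths):

name, encoder = call_fast_mmcif("data/1abc.cif.gz")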
import os

import numpy as np


def retrieve_struct(pdb_id, chain_id, load_local=False):
    '''Structure parser; uses mmCIF, which supports more file types.

    :param pdb_id: PDB ID of the protein
    :param chain_id: alphabetic chain identifier
    :return: CA coordinates and the structured (resolved) sequence
    '''
    from Bio.PDB import PDBList, FastMMCIFParser
    from Bio.PDB.Polypeptide import three_to_one

    # load_local is accepted but unused in this snippet
    pdbl = PDBList()
    pdbl.retrieve_pdb_file(pdb_id, file_format='mmCif', pdir='./')
    parser = FastMMCIFParser()
    structure = parser.get_structure(pdb_id, pdb_id.lower() + '.cif')
    os.remove(pdb_id.lower() + '.cif')

    chain = structure[0][chain_id]
    coords = []
    structured_sequence = ''
    for residue in chain:
        # keep only residues with an ordered CA atom
        if 'CA' in residue and residue['CA'].is_disordered() == 0:
            coords.append(residue['CA'].get_coord())
            structured_sequence += three_to_one(residue.resname)
        else:
            print((residue.is_disordered(), residue.id))
    return np.array(coords), str(structured_sequence)
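A usage sketch with a placeholder PDB ID and chain; retrieve_pdb_file needs network access and drops the mmCIF into the working directory, where it is parsed and then deleted:

coords, seq = retrieve_struct("1ABC", "A")
print(coords.shape, seq)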
import os

from Bio.PDB import PDBParser, FastMMCIFParser


def parse_structure(spath):
    """Parses a PDB/mmCIF structure file."""
    if not os.path.isfile(spath):
        raise IOError('File not found: {0}'.format(spath))

    if spath.endswith(('pdb', 'ent')):
        parser = PDBParser(QUIET=True)
    elif spath.endswith('cif'):
        parser = FastMMCIFParser()
    else:
        raise Exception('Format not supported ({0}). Must be .pdb/.ent or .cif'.format(spath))

    sname = os.path.basename(spath.split('.')[0])
    return parser.get_structure(sname, spath)
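Example call, assuming the (hypothetical) file exists locally; unsupported extensions raise immediately:

structure = parse_structure("models/complex.pdb")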
from Bio.PDB import PDBParser, FastMMCIFParser, PDBIO
# res_name_to_char, process_custom_residues, and emap are defined elsewhere
# in this package.


def parse(filename, quiet=False):
    '''Parses a PDB/mmCIF file and returns an emap object.

    Parameters
    ----------
    filename: str
        Full path to the file to be parsed
    quiet: bool, optional
        Suppresses output when set to True

    Returns
    -------
    my_emap: :class:`~pyemap.emap`
        emap object ready for analysis
    '''
    try:
        parser = PDBParser()
        structure = parser.get_structure("protein", filename)
    except Exception:
        # fall back to mmCIF parsing, then convert to PDB on disk
        parser = FastMMCIFParser()
        structure = parser.get_structure("protein", filename)
        io = PDBIO()
        fn = filename[:-4] + ".pdb"
        io.set_structure(structure)
        io.save(fn)
        parser = PDBParser()
        structure = parser.get_structure("protein", fn)

    chain_list = []
    num_models = 0
    for model in structure.get_models():
        num_models += 1
    if num_models < 1:
        raise RuntimeError("Unable to parse file.")
    for chain in structure[0].get_chains():
        chain_list.append(chain.id)

    non_standard_residue_list = []
    for res in structure[0].get_residues():
        if res.resname not in res_name_to_char:
            res.get_full_id()
            arom_res = res.copy()
            non_standard_residue_list.append(arom_res)
    custom_residue_list = process_custom_residues(non_standard_residue_list)

    if not quiet:
        print("Identified " + str(len(custom_residue_list)) + " non-protein ET active moieties.")
    my_emap = emap(filename, structure, custom_residue_list, chain_list)
    return my_emap
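A minimal usage sketch with a placeholder filename; note that the mmCIF fallback above writes a converted .pdb file next to the input:

my_emap = parse("protein.pdb", quiet=True)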
def parse_structure(self, filepath_):
    """Parse a PDB/mmCIF structure."""
    # assumes module-level: import pathlib; from Bio.PDB import PDBParser, FastMMCIFParser
    try:
        filepath = pathlib.Path(filepath_)
        filepath.resolve(strict=True)
    except FileNotFoundError:
        raise FileNotFoundError(f"File not found: {filepath}")

    if filepath.suffix in {".pdb", ".ent"}:
        parser = PDBParser()
    elif filepath.suffix in {".cif", ".mmcif"}:
        parser = FastMMCIFParser()
    else:
        raise ValueError(
            f"Unsupported input structure format: {filepath.suffix}")

    return parser.get_structure(filepath.name, str(filepath))
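Since this is an instance method, a call goes through whatever object owns it; `loader` below is hypothetical:

structure = loader.parse_structure("inputs/complex.mmcif")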
import os
import random

from Bio.PDB import PDBParser, MMCIFParser, FastMMCIFParser
from Bio.PDB.mmtf import MMTFParser


def loop_parsing(file_type, proteins, rep=10):
    """Parse each entry in `proteins` with the parser selected by `file_type`."""
    cwd = os.getcwd()
    if file_type == 'mmtf':
        parser = MMTFParser()
    elif file_type == 'fast_cif':
        parser = FastMMCIFParser()
    elif file_type == 'cif':
        parser = MMCIFParser()
    else:
        parser = PDBParser()

    for p in proteins:
        # fast_cif reads the same .cif files as the regular mmCIF parser
        if file_type == "fast_cif":
            file_type = "cif"
        directory = "%s/%s/%s.%s" % (cwd, file_type, p, file_type)
        try:
            if file_type == 'mmtf':
                protein = parser.get_structure(directory)
            else:
                protein = parser.get_structure(random.randint(0, 100), directory)
        except Exception:
            print("Having trouble parsing %s" % (p))
            break
    return
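An example invocation with placeholder PDB IDs; the files must already exist under <cwd>/<file_type>/:

loop_parsing("fast_cif", ["1abc", "2xyz"])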
import os
import time

from Bio.PDB import FastMMCIFParser
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
# Base, PDB, buffer_SQLal_dc_submission, and single_buffer_SQLal_dc_submission
# are defined elsewhere in this project.


def pdb_process(task, core, output):
    process_start_time = time.time()  # for simple profiling
    # chunksize: how many files to analyse before committing data to the database
    chunksize = task.chunksize + task.chunksize_offset * core

    # assigning SQLAlchemy connection
    engine = create_engine(task.dc_db)
    Base.metadata.bind = engine
    db_session = sessionmaker(bind=engine)
    session = db_session()

    str_parser = FastMMCIFParser(QUIET=1)
    print("Connected core %i, chunksize %i, offset %i" % (core, task.chunksize, task.chunksize_offset * core))

    # detail of console output
    verbose = task.verbose
    quiet = task.quiet

    # buffer to hold data for submission to the database, and whether it has been
    # submitted successfully; handles a potentially locked database while writing
    # from parallel processes
    buffer = []
    flush = True
    flush_offset = task.flush_offset
    flush_offset_count = 0

    file_queue = task.file_queue[core]

    # loop to analyse individual structure files from the file queue
    for pdb_count, (root, file) in enumerate(file_queue):
        # generate a PDB object to hold all relevant information for a single parsed structure file
        pdb = PDB(os.path.join(root, file), core)
        task_summary = ""
        task_summary = task_summary + ("Processing file %s on core %i" % (pdb.path, core))

        # checks before structure analysis, including successful fast structure parsing
        if pdb.filesize > task.filesize_limit:
            # throw out large files. TODO: separate filesize due to large assembly from structure factors
            task_summary = task_summary + "\n\tSize abort"
            pdb.pass_filesize = False
            pdb.abort = True
            print(task_summary)
        else:
            if task.verbose:
                task_summary = task_summary + "\n\tSize pass"
            pdb.pass_filesize = True
            try:
                pdb.structure = str_parser.get_structure(pdb.id, pdb.path)
                if task.verbose:
                    task_summary = task_summary + "\n\tMMCIFParser pass"
                pdb.pass_structure = True
            except Exception:
                task_summary = task_summary + "\n\tMMCIFParser abort"
                pdb.pass_structure = False
                pdb.abort = True
                print(task_summary)

        # PDB object functions analysing distance data in a structure file and
        # assigning relevant metadata for the database
        pdb.analyse(task)
        pdb.generate_dictionary()
        pdb.assign_dictionary_data()
        pdb.sqla_convert_distances()

        if len(pdb.filtered_distances) > 0:
            # hits from filtered_distances are preferred for the summary table (here: intermolecular hits)
            representative_distance = pdb.representative_distance_filtered()
        else:
            # no intermolecular hits found, or no hits found at all
            representative_distance = pdb.representative_distance_unfiltered()
        if representative_distance:
            pdb.top_hit_sqla(representative_distance)  # populate class with information for summary table

        pdb.sqla_summary()  # generate entry for summary table

        # results for all distances per structure, and a single distance for the summary table
        pdb_entries_wrap = pdb.alchemy_distances, pdb.alchemy_sum
        buffer.append(pdb_entries_wrap)  # append result to buffer holding data for submission to database

        if len(buffer) == chunksize:
            print("Core %i - %i tasks in %i seconds" % (core, pdb_count + 1, (time.time() - process_start_time)))
            flush = buffer_SQLal_dc_submission(session, buffer, core)  # returns whether submission to database succeeded
            chunksize = task.chunksize  # remove offset after the first time. This might not be needed
            if flush:
                flush_offset_count = 0
                buffer = []
                print("Core %i - commit successful" % core)

        if not flush:
            # Handles incomplete submission of buffer due to a busy database.
            # This fix likely already made chunk_offset obsolete.
            if flush_offset_count == flush_offset:
                # Tries to submit again every flush_offset until it succeeds
                print("Core %i - not flush at %i tasks with %i tasks in buffer" % (core, pdb_count + 1, len(buffer)))
                flush_offset_count = 0
                flush = buffer_SQLal_dc_submission(session, buffer, core)
                if flush:
                    buffer = []
            flush_offset_count += 1

        # some text for console
        if not verbose:
            task_summary = task_summary + "\nFinished task %i on core %i\n" % (pdb_count, core)
        if verbose:
            task_summary = (task_summary + ("\n\t\t%i\tdistances\t" % len(pdb.distances) + str(pdb.distances)))
            task_summary = task_summary + "\n\t\t\t%i of %i distances intermolecular" % (
                pdb.inter_count, len(pdb.distances))
            task_summary = task_summary + "\n\t\t\t%i of %i distances below cutoff" % (
                pdb.dist_count, len(pdb.distances))
            task_summary = task_summary + "\n\t\t\t%i distances below cutoff and intermolecular" % (pdb.hit_count)
            task_summary = task_summary + "\nFinished task %i on core %i\n" % (pdb_count, core)
        if not quiet:
            print(task_summary)

    # Final commit for an incomplete buffer at the end of the queue (remainder from chunk size)
    for i in range(0, 10):
        flush = buffer_SQLal_dc_submission(session, buffer, core)  # final commit attempted 10 times
        if not flush:
            time.sleep(60)
            print("Failed to commit final chunk of size %i on core %i. Waiting 60s" % (len(buffer), core))
        else:
            print("Committed final chunk of size %i on core %i." % (len(buffer), core))
            break

    if not flush:
        print("Failed to flush after 10 min. Attempting single submission")
        for i in range(0, 10):
            buffer = single_buffer_SQLal_dc_submission(session, buffer, core)  # final commit attempted 10 times
            flush = (len(buffer) == 0)
            if not flush:
                time.sleep(60)
                print("Failed to single commit, final chunk of size %i on core %i. Waiting 60s" % (len(buffer), core))
            else:
                print("Committed all chunks on core %i." % (core))
                break

    if not flush:
        # writes error to logfile upon failure to commit the final entries after 2*10 attempts
        with open(task.dc_error_log, "a+") as outfile:
            for entry in buffer:
                outfile.write("{}\t{}".format(entry[1].pdb_id, core))

    output.put("--- Core %s finished %s tasks in %s seconds ---" % (
        core, len(file_queue), (time.time() - process_start_time)))
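A hedged sketch of how pdb_process might be dispatched across cores, assuming task.file_queue holds one file queue per core and that a configured task object already exists (the original driver code is not shown):

from multiprocessing import Process, Queue

output = Queue()
workers = [Process(target=pdb_process, args=(task, core, output))
           for core in range(len(task.file_queue))]
for w in workers:
    w.start()
for w in workers:
    w.join()
while not output.empty():
    print(output.get())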
import sys
import os
import json
import re
from functools import reduce
from pathlib import Path

from Bio.PDB import FastMMCIFParser
from Bio.PDB.ResidueDepth import residue_depth
from Bio.PDB.Atom import Atom
from Bio.PDB.Chain import Chain
from Bio.PDB.Model import Model
from Bio.PDB.Residue import Residue
from Bio.PDB.Structure import Structure
from ciftools.Structure import fetchStructure
from dotenv import load_dotenv

parser = FastMMCIFParser(QUIET=True)
# get_structure(...)[0] selects the first model, so the result is a Model
struct: Model = parser.get_structure('3j7z', '3J7Z.cif')[0]

with open("3J7Z_TUNNEL_REPORT.json") as infile:
    tunnel = json.load(infile)

strands = tunnel['adjacent_strands']

charge_from_P = 0
charge_from_GLU_ASP = 0
charge_from_ARG_LYS = 0
p_global = []

for strand in strands:
    chain: Chain = struct[strand]