def load_from_one_cath_pml_file(pml_file, scratch_path, superfamilies, dssp_path):
    '''Load data from a .pml file of superposed homologous
    superfamilies from the CATH database.

    A new, initially empty list is appended to ``superfamilies``. Every
    ``cmd.read_pdbstr`` record in the file is turned into a structure;
    structures for which ``topology.find_structure_chain_breaks`` reports
    nothing are written to ``<scratch_path>/<pdb_id>.pdb``. Each saved
    structure for which ``find_secondary_structures`` succeeds is then
    appended to that new list as a dict with keys ``'structure'`` and
    ``'path'``.

    Args:
        pml_file: path of the .pml file to read.
        scratch_path: directory that receives the per-structure .pdb files.
        superfamilies: list of lists, extended in place.
        dssp_path: forwarded unchanged to ``find_secondary_structures``.

    Returns:
        None.
    '''
    superfamilies.append([])
    candidate_proteins = []

    with open(pml_file, 'r') as f:
        while True:
            line = f.readline()
            if not line:
                break

            # Read one structure
            if not line.strip().startswith('cmd.read_pdbstr'):
                continue

            # Drop the 19-character 'cmd.read_pdbstr("""' prefix and the
            # trailing line-continuation backslash.
            # NOTE(review): unlike the following lines, the altLoc column of
            # this first line is left untouched - confirm that is intended.
            pdb_lines = [line.strip()[19:].strip('\\')]
            pdb_id = ''
            terminated = False

            while True:
                line = f.readline()

                if not line:
                    # End of file inside a record: without this check the
                    # loop would spin forever, because readline() keeps
                    # returning '' and '' never matches the terminator.
                    break

                if line.strip().startswith('"""'):
                    pdb_id = line.strip()[5:12]
                    terminated = True
                    break

                pdb_line = line.strip().strip('\\')
                if len(pdb_line) > 17:
                    # Remove all altLoc flags (character index 16)
                    pdb_line = pdb_line[0:16] + ' ' + pdb_line[17:]
                pdb_lines.append(pdb_line)

            if not terminated:
                # The record is truncated and has no id; discard it.
                break

            # Make a pdb file of the structure for DSSP analysis
            structure = structure_from_pdb_string('\n'.join(pdb_lines), pdb_id)

            # Store structures without chain breaks
            if len(topology.find_structure_chain_breaks(structure)) == 0:
                structure_path = os.path.join(scratch_path, pdb_id + '.pdb')

                io = PDB.PDBIO()
                io.set_structure(structure)
                io.save(structure_path)

                candidate_proteins.append({
                    'structure': structure,
                    'path': structure_path
                })

    for p in candidate_proteins:
        try:
            find_secondary_structures(p, dssp_path)
        except Exception:
            # Best effort: skip proteins whose secondary-structure
            # assignment fails, but let KeyboardInterrupt/SystemExit through.
            continue
        else:
            # Add a protein to a superfamily if there's no exception
            superfamilies[-1].append(p)
def preparePdb(pdb_fname, out_pdb_fname):
    ''' Prepare the PDB file with only the first model kept.

    The input (optionally gzip-compressed, detected by a ``.gz`` suffix) is
    copied to ``query.pdb`` inside a ``tempDir()`` context, parsed with
    ``Bio.PDB.PDBParser``, reduced to its first model, written back with
    ``Bio.PDB.PDBIO`` and moved to ``out_pdb_fname``. Anything the parser
    prints is captured and forwarded to ``logging.info``. Jumps of more than
    5 in consecutive residue numbers of a chain are reported through
    ``logging.warning``.

    Args:
        pdb_fname: path of the input PDB file (plain or ``.gz``).
        out_pdb_fname: path the cleaned PDB file is moved to.

    Returns:
        The absolute path of the written file.

    Raises:
        IOError: if ``pdb_fname`` does not exist.
    '''
    # 'Absolutize' the path names - rest is done in the temporary dir
    pdb_fname = os.path.abspath(pdb_fname)
    if not os.path.exists(pdb_fname):
        raise IOError('%s does not exist' % pdb_fname)
    out_pdb_fname = os.path.abspath(out_pdb_fname)

    # Inside the temporary dir
    # (presumably tempDir() changes the working directory, since only
    # relative names are used below - verify against its definition)
    with tempDir():
        # Temporary name for the curated copy of the input
        new_pdb_fname = 'query.pdb'

        # If the original PDB is packed with gzip - unpack it into a new
        # file. Text mode ('rt') is required so that gzip yields str, which
        # is what the text-mode output file accepts.
        opener = gzip.open if pdb_fname.endswith('.gz') else open
        with opener(pdb_fname, 'rt') as rfh, open(new_pdb_fname, 'w') as wfh:
            wfh.write(rfh.read())

        # Parse structure
        # Redirect standard output/error to a StringIO, so that PDBParser
        # stops messing the output; always put the streams back, even when
        # parsing raises.
        parser = Bio.PDB.PDBParser()
        err_fh = io.StringIO()
        saved_streams = sys.stdout, sys.stderr
        sys.stdout = sys.stderr = err_fh
        try:
            struct = parser.get_structure('query', new_pdb_fname)
        finally:
            sys.stdout, sys.stderr = saved_streams

        # Output formatted info about PDBParser's work to a log
        s = err_fh.getvalue()
        if s.strip():
            logging.info(
                "Structure parsing generated following error message(s): \n%s\n%s\n%s",
                '-' * 120, s, '-' * 120)

        # By default use only first model
        model = struct[0]
        del struct.child_list[1:]

        # Check for discontinuities greater than 5 residues - warn about
        # this _specifically_
        for chain in model:
            chid = chain.id
            last_rid = None
            for residue in chain:
                # residue.id[1] is the residue sequence number
                rid = residue.id[1]
                if last_rid is not None and rid > last_rid + 5:
                    logging.warning(
                        "Residues %s:%s-%s:%s. Results might be inaccurate, as the break in a protein chain numbers more than 5 residues.",
                        last_rid, chid, rid, chid)
                last_rid = rid

        # Save the first-model-only structure. The writer must not be named
        # `io`: that would make `io` a local name for the whole function and
        # break the io.StringIO() call above.
        pdb_io = Bio.PDB.PDBIO()
        pdb_io.set_structure(struct)
        pdb_io.save(new_pdb_fname)

        shutil.move(new_pdb_fname, out_pdb_fname)

    return out_pdb_fname
def save_structure(struct, name):
    """Write ``struct`` to ``<name>.pdb`` and prepend the text from ``header()``.

    The structure is first saved with ``PDBIO``; the resulting file is then
    read back and rewritten as ``header()`` followed by the saved content.
    """
    out_path = '{}.pdb'.format(name)

    writer = PDBIO()
    writer.set_structure(struct)
    writer.save(out_path)

    with open(out_path, 'r') as handle:
        atom_records = handle.read()

    # Build the full text before reopening the file for writing, so the
    # file is only truncated once the new content is ready.
    combined = header() + atom_records
    with open(out_path, 'w') as handle:
        handle.write(combined)
def get_sequence(pdb, chain):
    """Write a positional segment of one chain to ``<sys.argv[5]>_segment.pdb``.

    The structure is parsed from ``<pdb>.pdb`` and chain ``chain`` of its
    first model is selected. Residues are counted from 1 in iteration order
    (this is a position in the chain, not the PDB residue number); every
    residue whose position lies outside ``[int(sys.argv[3]),
    int(sys.argv[4])]`` is detached and the remaining residues are saved.

    Args:
        pdb: structure id, also the input file name without ``.pdb``.
        chain: id of the chain to extract from the first model.

    Returns:
        None.
    """
    # PERMISSIVE=0 makes the parser strict: problems found while building
    # the structure raise instead of being tolerated.
    pdb_parser = PDBParser(PERMISSIVE=0)
    pdb_structure = pdb_parser.get_structure(pdb, pdb + ".pdb")
    pdb_chain = pdb_structure[0][chain]

    # Parse the window bounds once rather than on every residue.
    first_pos = int(sys.argv[3])
    last_pos = int(sys.argv[4])

    # Collect the ids first and detach afterwards, so the chain is not
    # modified while it is being iterated.
    outside_ids = [
        residue.get_id()
        for position, residue in enumerate(pdb_chain, start=1)
        if position < first_pos or position > last_pos
    ]
    for residue_id in outside_ids:
        pdb_chain.detach_child(residue_id)

    writer = PDBIO()
    writer.set_structure(pdb_chain)
    output = sys.argv[5] + "_segment.pdb"
    writer.save(output)
def extract_ligands(path):
    """Save each hetero residue found in the .pdb files under ``path + 'pdbs/'``.

    Files that do not end in ``.pdb`` or whose name starts with ``lig_`` are
    ignored. For every other file, each residue of the first model for which
    ``is_het`` is true is written to ``lig_<code>_<n>.pdb`` (relative to the
    current working directory), with ``n`` counting from 1 per input file.
    """
    pdb_dir = path + 'pdbs/'

    for entry in os.listdir(pdb_dir):
        if not entry.endswith('.pdb') or entry.startswith("lig_"):
            continue

        ligand_no = 1
        pdb_code = entry[:-4]
        structure = PDBParser().get_structure(pdb_code, pdb_dir + entry)

        writer = PDBIO()
        writer.set_structure(structure)

        # Only the first model is scanned.
        first_model = structure[0]
        for chain in first_model:
            for residue in chain:
                if is_het(residue):
                    print(f"saving {chain} {residue}")
                    target = f"lig_{pdb_code}_{ligand_no}.pdb"
                    writer.save(target, ResidueSelect(chain, residue))
                    ligand_no += 1
def download_pdb(self, info):
    """Fetch a PDB entry and write one chain of it to ``self.work_dir``.

    Unless ``pdb<id>.mod.ent`` already exists in ``self.work_dir``, the
    entry is retrieved with ``PDB.PDBList``. The file is passed through
    ``ReplacePDBModifiedAA``, parsed, and chain ``chain_id`` of the first
    model is saved with the ``NotDisordered`` selector. Every line of the
    ``.mod.ent`` file containing ``"REMARK "`` is stored in
    ``<mod file>.remark`` and appended to the end of the chain file.

    Args:
        info: ``(pdb_id, chain_id)`` pair.

    Returns:
        ``'<work_dir>/<pdb_id>_<chain_id>.pdb'`` on success, or ``None``
        when the download fails or the file cannot be parsed (an error
        message is printed in that case).
    """
    pdb_id, chain_id = info

    ## Check if atom has alternative position, if so, keep 'A' position and remove the flag
    ## (original author's note: somehow this class doesn't seem to function well)
    class NotDisordered(Select):
        def accept_atom(self, atom):
            if not atom.is_disordered() or atom.get_altloc() == 'A':
                atom.set_altloc(' ')
                return True
            else:
                return False

    ## BioPython downloads PDB but it gives a lowercase name in pdb{}.ent format
    biopdb_name = '{0}/pdb{1}.ent'.format(self.work_dir, pdb_id.lower())
    biopdb_modf = '{0}/pdb{1}.mod.ent'.format(self.work_dir, pdb_id.lower())
    chain_pdb = '{0}/{1}_{2}.pdb'.format(self.work_dir, pdb_id, chain_id)

    if not os.path.isfile(biopdb_modf):
        try:
            PDB.PDBList(verbose=False).retrieve_pdb_file(
                pdb_id, pdir=self.work_dir, obsolete=False, file_format='pdb')
        except FileNotFoundError:
            print(
                ' \033[31m> ERROR: BioPython cannot download PDB: \033[0m' +
                pdb_id)
            return None

    ## Replace modified AA to avoid mis-recognition in biopython readin
    ## Replace disordered atoms and keep only the "A" variant
    # NOTE(review): the collapsed source does not show whether this call was
    # nested under the isfile() check above; it is run unconditionally here
    # - confirm.
    ReplacePDBModifiedAA(biopdb_name, biopdb_modf)

    # Collect the REMARK lines in Python instead of shelling out to grep:
    # the shell command was built from unquoted paths and broke on work
    # directories containing spaces or shell metacharacters.
    with open(biopdb_modf, 'r') as fi:
        remark_lines = [l for l in fi if 'REMARK ' in l]
    with open('{0}.remark'.format(biopdb_modf), 'w') as fo:
        fo.writelines(remark_lines)

    ## Read the PDB file and extract the chain from structure[0]
    try:
        model = PDB.PDBParser(PERMISSIVE=1, QUIET=1).get_structure(
            pdb_id, biopdb_modf)[0]
    except KeyError:
        print(' \033[31m> ERROR: BioPython cannot read in PDB: \033[0m' +
              biopdb_modf)
        return None
    except ValueError:
        print(' \033[31m> ERROR: PDB file is empty: \033[0m' + biopdb_modf)
        return None

    ### Bug alert: as of 20.02.18, Biopython dev hasn't come up with good
    ### strategy to fix the 'atom.disordered_get_list()' issue with alternative
    ### position of residue side chains. To go around this, will physically
    ### remove "B" variant and keep only "A" variant in the saved chain
    io = PDB.PDBIO()
    io.set_structure(model[chain_id])
    io.save(chain_pdb, select=NotDisordered())

    # Attach REMARK to end of PDB as safekeeping (appended in place; the
    # previous cat/mv pair went through a temp file in the current directory)
    with open(chain_pdb, 'a') as fo:
        fo.writelines(remark_lines)

    return chain_pdb
def prepareWithHydrogens(pdb_fname, out_pdb_fname="wth_hydro.pdb"):
    ''' Prepare the PDB file with hydrogen data (clean up and create a new one).

    The input (optionally gzip-compressed, detected by a ``.gz`` suffix) is
    parsed with ``Bio.PDB.PDBParser`` and reduced to its first model.
    ``remakeHydrogens`` is then applied to the structure and the result is
    written to ``out_pdb_fname`` (gzip-compressed when that name ends in
    ``.gz``). Anything the parser prints is captured and forwarded to
    ``logging.info``; jumps of more than 5 in consecutive residue numbers
    of a chain are reported through ``logging.warning``.

    Args:
        pdb_fname: path of the input PDB file (plain or ``.gz``).
        out_pdb_fname: path of the output file.

    Returns:
        The absolute path of the written file.

    Raises:
        IOError: if ``pdb_fname`` does not exist or is not a regular file.
    '''
    # 'Absolutize' the path names
    pdb_fname = os.path.abspath(pdb_fname)
    if not os.path.exists(pdb_fname) or not os.path.isfile(pdb_fname):
        raise IOError('%s does not exist or is not a file.' % pdb_fname)
    out_pdb_fname = os.path.abspath(out_pdb_fname)

    # Text mode ('rt') so that a gzip-compressed input is handed to the
    # parser as str lines, exactly like a plain file.
    opener = gzip.open if pdb_fname.endswith('.gz') else open
    err_fh = io.StringIO()
    with opener(pdb_fname, 'rt') as rfh:
        # Parse structure
        parser = Bio.PDB.PDBParser()
        # Redirect standard output/error to a StringIO,
        # so that PDBParser stops messing the output
        saved_streams = sys.stdout, sys.stderr
        sys.stdout = sys.stderr = err_fh
        try:
            struct = parser.get_structure('query', rfh)
        finally:
            # Restore whatever streams were active before the call
            sys.stdout, sys.stderr = saved_streams

    # Output formatted info about PDBParser's work to a logger
    s = err_fh.getvalue()
    if s.strip():
        logging.info(
            "Structure parsing generated following error message(s): \n%s\n%s\n%s",
            '-' * 120, s, '-' * 120)

    # By default use only first model
    # ... delete the rest
    model = struct[0]
    del struct.child_list[1:]

    # Check for discontinuities greater than 5 residues - warn about this
    # _specifically_ (into the logger, again)
    for chain in model:
        chid = chain.id
        last_rid = None
        for residue in chain:
            # residue.id[1] is the residue sequence number
            rid = residue.id[1]
            if last_rid is not None and rid > last_rid + 5:
                logging.warning(
                    "Residues %s:%s-%s:%s. Results might be inaccurate, as the break in a protein chain numbers more than 5 residues.",
                    last_rid, chid, rid, chid)
            last_rid = rid

    # Prepare the remade hydrogens
    remakeHydrogens(struct)

    # Save structure. The writer must not be named `io`: that would make
    # `io` a local name for the whole function and break io.StringIO() above.
    pdb_io = Bio.PDB.PDBIO()
    pdb_io.set_structure(struct)
    if out_pdb_fname.endswith('.gz'):
        # PDBIO writes str, so the gzip handle has to be in text mode
        with gzip.open(out_pdb_fname, 'wt') as wfh:
            pdb_io.save(wfh)
    else:
        pdb_io.save(out_pdb_fname)

    return out_pdb_fname
def prepareWithHydrogensPrep23(pdb_fname, out_pdb_fname="wth_hydro.pdb"):
    ''' Prepare the PDB file with hydrogen data (clean up and create a new one).

    Inside a ``tempDir()`` context the input (optionally gzip-compressed,
    detected by a ``.gz`` suffix) is copied to ``query.pdb``, parsed,
    reduced to its first model and rewritten through ``NoHydroSelect``.
    The executable returned by ``_preparePrepExec()`` is then run on that
    file, a fixed suffix is appended to every ``ATOM`` line of its output,
    and the result is moved to ``out_pdb_fname``.

    Args:
        pdb_fname: path of the input PDB file (plain or ``.gz``).
        out_pdb_fname: path of the output file.

    Returns:
        The absolute path of the written file.

    Raises:
        IOError: if ``pdb_fname`` does not exist or is not a regular file.
        RuntimeError: if the preparation executable exits with a non-zero
            status.
    '''
    # 'Absolutize' the path names - rest is done in the temporary dir
    pdb_fname = os.path.abspath(pdb_fname)
    if not os.path.exists(pdb_fname) or not os.path.isfile(pdb_fname):
        raise IOError('%s does not exist or is not a file.' % pdb_fname)
    out_pdb_fname = os.path.abspath(out_pdb_fname)

    # Inside the temporary dir
    # (presumably tempDir() changes the working directory, since only
    # relative names are used below - verify against its definition)
    with tempDir():
        # Temporary names for curated input and output files
        new_pdb_fname = 'query.pdb'
        out_tmp_fname = 'out.pdb'

        # Prepare the sources
        prep_exec = _preparePrepExec()

        # Copy the original file into our temporary directory.
        # If the original PDB is packed with gzip - unpack it into a new
        # file. Text mode ('rt') is required so that gzip yields str, which
        # is what the text-mode output file accepts.
        opener = gzip.open if pdb_fname.endswith('.gz') else open
        with opener(pdb_fname, 'rt') as rfh, open(new_pdb_fname, 'w') as wfh:
            wfh.write(rfh.read())

        # Parse structure
        # Redirect standard output/error to a StringIO, so that PDBParser
        # stops messing the output; always put the streams back, even when
        # parsing raises.
        parser = Bio.PDB.PDBParser()
        err_fh = io.StringIO()
        saved_streams = sys.stdout, sys.stderr
        sys.stdout = sys.stderr = err_fh
        try:
            with open(new_pdb_fname, 'r') as rfh:
                struct = parser.get_structure('query', rfh)
        finally:
            sys.stdout, sys.stderr = saved_streams

        # Output formatted info about PDBParser's work to a log
        s = err_fh.getvalue()
        if s.strip():
            logging.info(
                "Structure parsing generated following error message(s): \n%s\n%s\n%s",
                '-' * 120, s, '-' * 120)

        # By default use only first model
        model = struct[0]
        del struct.child_list[1:]

        # Check for discontinuities greater than 5 residues - warn about
        # this _specifically_
        for chain in model:
            chid = chain.id
            last_rid = None

            # Curate disordered residues keeping only the last
            chain.child_list = list(chain)
            chain.child_dict = {residue.id: residue for residue in chain}

            for residue in chain:
                # Curate disordered atoms keeping only the last
                residue.child_list = list(residue)
                residue.child_dict = {a.id: a for a in residue}

                # residue.id[1] is the residue sequence number
                rid = residue.id[1]
                if last_rid is not None and rid > last_rid + 5:
                    logging.warning(
                        "Residues %s:%s-%s:%s. Results might be inaccurate, as the break in a protein chain numbers more than 5 residues.",
                        last_rid, chid, rid, chid)
                last_rid = rid

        # Save structure without hydrogens. The writer must not be named
        # `io`: that would make `io` a local name for the whole function and
        # break the io.StringIO() call above.
        pdb_io = Bio.PDB.PDBIO()
        pdb_io.set_structure(struct)
        pdb_io.save(new_pdb_fname, NoHydroSelect())

        # Run the preparation executable on the newly created PDB file
        if (subprocess.call("%s %s %s 1>tmp.out 2>tmp.err" %
                            (prep_exec, new_pdb_fname, out_tmp_fname),
                            shell=True) != 0):
            raise RuntimeError(
                'Could not prepare corrected structure file for %s' %
                pdb_fname)

        # Fix the occupancies (creating the last and final temporary PDB file)
        final_fn = "final.pdb"
        with open(out_tmp_fname, 'r') as rfh, open(final_fn, 'w') as wfh:
            for line in rfh:
                if line.startswith('ATOM'):
                    # Replace the trailing newline with the fixed suffix.
                    # NOTE(review): confirm the spacing of this suffix
                    # against the PDB column layout.
                    wfh.write(line[:-1] + " 0.00 0.00 C" + "\n")
                else:
                    wfh.write(line)

        # Move the output file to the desired location
        shutil.move(final_fn, out_pdb_fname)

    return out_pdb_fname