def fetch_pdb(pdbid):
    """Download the PDB-format file for the given PDB ID from the RCSB server.

    The ID is lower-cased before it is inserted into the download URL. The
    obsolete/current state check is currently disabled (see the todo below),
    so the file is requested for exactly the ID that was passed in.

    Logs an error and exits with status 1 if the server answers with an HTTP
    error or with a page containing the word 'sorry' instead of a structure.

    :param pdbid: PDB identifier, any letter case.
    :return: two-element list ``[pdbfile, pdbid]`` with the decoded file
        content and the lower-cased ID.
    """
    pdbid = pdbid.lower()
    logger.info(f'checking status of PDB-ID {pdbid}')
    # @todo re-implement state check with new RCSB API, see
    # https://www.rcsb.org/news?year=2020&article=5eb18ccfd62245129947212a&feature=true
    # state, current_entry = check_pdb_status(pdbid)  # Get state and current PDB ID
    #
    # if state == 'OBSOLETE':
    #     logger.info(f'entry is obsolete, getting {current_entry} instead')
    # elif state == 'CURRENT':
    #     logger.info('entry is up-to-date')
    # elif state == 'UNKNOWN':
    #     logger.error('invalid PDB-ID (entry does not exist on PDB server)')
    #     sys.exit(1)
    logger.info('downloading file from PDB')
    # get URL for current entry
    # @todo needs update to react properly on response codes of RCSB servers
    pdburl = f'http://www.rcsb.org/pdb/files/{pdbid}.pdb'
    try:
        # The context manager closes the HTTP response even when reading or
        # decoding fails; previously the response object was never closed.
        with urlopen(pdburl) as response:
            pdbfile = response.read().decode()
    except HTTPError:
        logger.error('no file in PDB format available from wwPDB for the given PDB ID')
        sys.exit(1)
    # If no PDB file is available, a text is now shown with "We're sorry, but ..."
    # Could previously be distinguished by an HTTP error
    if 'sorry' in pdbfile:
        logger.error('no file in PDB format available from wwPDB for the given PDB ID.')
        sys.exit(1)
    return [pdbfile, pdbid]
def run_analysis(inputstructs, inputpdbids):
    """Main function. Calls functions for processing, report generation and visualization.

    Either analyses the given structure inputs or, when ``inputstructs`` is
    None, downloads and analyses the structures for the given PDB IDs.

    :param inputstructs: list of PDB file paths, where the entry ``'-'`` means
        "read the structure text from STDIN", or None to use ``inputpdbids``.
    :param inputpdbids: list of PDB IDs to download; only read when
        ``inputstructs`` is None.

    Side effects: may overwrite ``config.OUTPATH`` when more than one input
    was given, and exits with status 1 when an input file is empty.
    """
    pdbid, pdbpath = None, None
    # @todo For multiprocessing, implement better stacktracing for errors
    # Print title and version
    logger.info(f'Protein-Ligand Interaction Profiler (PLIP) {__version__}')
    logger.info(f'brought to you by: {config.__maintainer__}')
    logger.info('please cite: https://www.doi.org/10.1093/nar/gkv315')
    output_prefix = config.OUTPUTFILENAME

    if inputstructs is not None:  # Process PDB file(s)
        # Counted before duplicates are dropped, so a repeated path still
        # switches to per-structure output folders.
        num_structures = len(inputstructs)
        inputstructs = remove_duplicates(inputstructs)
        for inputstruct in inputstructs:
            # Decided per input: a sticky flag would make every file path that
            # follows a '-' entry be handed over as if it were structure text.
            read_from_stdin = inputstruct == '-'
            if read_from_stdin:
                inputstruct = sys.stdin.read()
                if config.RAWSTRING:
                    # Turn literal escape sequences (e.g. a typed "\n") into real characters
                    inputstruct = bytes(inputstruct, 'utf8').decode('unicode_escape')
            else:
                if os.path.getsize(inputstruct) == 0:
                    logger.error('empty PDB file')
                    sys.exit(1)
                if num_structures > 1:
                    # Text between the last two dots, without leading directories.
                    # Raises IndexError for a path that contains no dot at all.
                    basename = inputstruct.split('.')[-2].split('/')[-1]
                    config.OUTPATH = '/'.join([config.BASEPATH, basename])
                    output_prefix = 'report'
            process_pdb(inputstruct, config.OUTPATH, as_string=read_from_stdin, outputprefix=output_prefix)
    else:  # Try to fetch the current PDB structure(s) directly from the RCSB server
        num_pdbids = len(inputpdbids)
        inputpdbids = remove_duplicates(inputpdbids)
        for inputpdbid in inputpdbids:
            pdbpath, pdbid = download_structure(inputpdbid)
            if num_pdbids > 1:
                # Nested layout: <BASEPATH>/<2nd+3rd ID characters>/<ID>, all upper case
                config.OUTPATH = '/'.join([config.BASEPATH, pdbid[1:3].upper(), pdbid.upper()])
                output_prefix = 'report'
            process_pdb(pdbpath, config.OUTPATH, outputprefix=output_prefix)

    if (pdbid is not None or inputstructs is not None) and config.BASEPATH is not None:
        if config.BASEPATH in ['.', './']:
            logger.info('finished analysis, find the result files in the working directory')
        else:
            logger.info(f'finished analysis, find the result files in {config.BASEPATH}')
def remove_duplicates(slist):
    """Return the entries of the input list without duplicates, in first-seen order.

    Logs how many duplicate entries were dropped, if any.

    :param slist: list of hashable entries (e.g. file paths or PDB IDs).
    :return: new list with each distinct entry once, ordered by first occurrence.
    """
    # dict keys keep insertion order, whereas a set returns the entries in an
    # arbitrary order that can differ between interpreter runs.
    unique = list(dict.fromkeys(slist))
    difference = len(slist) - len(unique)
    if difference == 1:
        logger.info('removed one duplicate entry from input list')
    if difference > 1:
        logger.info(f'Removed {difference} duplicate entries from input list')
    return unique
def process_pdb(pdbfile, outpath, as_string=False, outputprefix='report'):
    """Analysis of a single PDB file. Can generate textual reports XML, PyMOL session files and images as output.

    :param pdbfile: path of the PDB file, or the PDB content itself when
        ``as_string`` is true.
    :param outpath: output folder; created if it does not exist yet.
    :param as_string: whether ``pdbfile`` holds the structure text (STDIN input)
        rather than a path.
    :param outputprefix: prefix handed to the report generator.
    """
    if not as_string:
        pdb_file_name = pdbfile.split('/')[-1]
        startmessage = f'starting analysis of {pdb_file_name}'
    else:
        startmessage = 'starting analysis from STDIN'
    logger.info(startmessage)
    mol = PDBComplex()
    mol.output_path = outpath
    mol.load_pdb(pdbfile, as_string=as_string)
    # @todo Offers possibility for filter function from command line (by ligand chain, position, hetid)
    for ligand in mol.ligands:
        mol.characterize_complex(ligand)

    create_folder_if_not_exists(outpath)

    # Generate the report files
    streport = StructureReport(mol, outputprefix=outputprefix)

    # Cap the worker count for THIS structure only. Writing the cap back into
    # config.MAXTHREADS made it permanent: a single structure with zero or one
    # interaction set switched off parallel rendering for all later structures.
    num_threads = min(config.MAXTHREADS, len(mol.interaction_sets))

    ######################################
    # PyMOL Visualization (parallelized) #
    ######################################

    if config.PYMOL or config.PICS:
        # Imported here so the visualization module is only loaded on demand
        from plip.visualization.visualize import visualize_in_pymol
        # Only sites that actually have interacting residues are visualized
        complexes = [
            VisualizerData(mol, site)
            for site in sorted(mol.interaction_sets)
            if not len(mol.interaction_sets[site].interacting_res) == 0
        ]
        if num_threads > 1:
            logger.info(f'generating visualizations in parallel on {num_threads} cores')
            parfn = parallel_fn(visualize_in_pymol)
            parfn(complexes, processes=num_threads)
        else:
            for plcomplex in complexes:
                visualize_in_pymol(plcomplex)

    if config.XML:  # Generate report in xml format
        streport.write_xml(as_string=config.STDOUT)

    if config.TXT:  # Generate report in txt (rst) format
        streport.write_txt(as_string=config.STDOUT)
def hbonds(acceptors, donor_pairs, protisdon, typ):
    """Detect hydrogen bonds between a set of acceptors and a set of donor pairs.

    Every acceptor / donor-pair combination is tested. A combination is kept
    when the acceptor-donor distance lies strictly between config.MIN_DIST and
    config.HBOND_DIST_MAX and the angle between the vectors built from
    (hydrogen, donor) and (hydrogen, acceptor) is above
    config.HBOND_DON_ANGLE_MIN. Two extra filters apply when config.INTRA is
    set (see the comments in the loop).

    :param acceptors: acceptor records (read: ``a``, ``a_orig_idx``, ``a_orig_atom``).
    :param donor_pairs: donor records (read: ``d``, ``h``, ``d_orig_idx``, ``d_orig_atom``).
    :param protisdon: True if the donors belong to the protein, False if the
        acceptors do.
    :param typ: label stored in the ``type`` field of every reported bond.
    :return: result of ``filter_contacts`` applied to the collected 'hbond' tuples.
    """
    hbond = namedtuple('hbond', 'a a_orig_idx d d_orig_idx h distance_ah distance_ad angle type protisdon resnr '
                                'restype reschain resnr_l restype_l reschain_l sidechain atype dtype')

    # DEBUG
    if protisdon:
        logger.info(f'Ligand has {len(acceptors)} acceptors')
        logger.info(f'Prot has {len(donor_pairs)} donor pairs of type {typ}')
    else:
        logger.info(f'Prot has {len(acceptors)} acceptors')
        logger.info(f'Lig has {len(donor_pairs)} donor pairs of type {typ}')

    found = []
    for acc, don in itertools.product(acceptors, donor_pairs):
        # if not typ == 'strong':
        #     continue  # keep only the strong hydrogen bonds (default plip version)
        # Regular (strong) hydrogen bonds
        dist_ah = euclidean3d(acc.a.coords, don.h.coords)  # acceptor to H dist
        dist_ad = euclidean3d(acc.a.coords, don.d.coords)  # acceptor to donor dist
        if not config.MIN_DIST < dist_ad < config.HBOND_DIST_MAX:
            continue
        h_to_donor = vector(don.h.coords, don.d.coords)
        h_to_acceptor = vector(don.h.coords, acc.a.coords)
        v = vecangle(h_to_donor, h_to_acceptor)
        if not v > config.HBOND_DON_ANGLE_MIN:
            continue

        # Assign protein-side and ligand-side roles in one place
        if protisdon:
            protatom, ligatom = don.d.OBAtom, acc.a.OBAtom
            prot_side, lig_side = don.d, acc.a_orig_atom
        else:
            protatom, ligatom = acc.a.OBAtom, don.d.OBAtom
            prot_side, lig_side = acc.a, don.d_orig_atom

        is_sidechain_hbond = protatom.GetResidue().GetAtomProperty(protatom, 8)  # Check if sidechain atom
        resnr = whichresnumber(prot_side)
        resnr_l = whichresnumber(lig_side)
        restype = whichrestype(prot_side)
        restype_l = whichrestype(lig_side)
        reschain = whichchain(prot_side)
        reschain_l = whichchain(lig_side)

        if config.INTRA is not None:
            # Prevents H-Bonds within amino acids in intermolecular interactions
            if whichresnumber(don.d) == whichresnumber(acc.a):
                continue
            # Originally described as "prevents backbone-backbone H-Bonds".
            # NOTE(review): this tests the same residue property (8) that is stored
            # above as the sidechain flag - confirm which filter is intended.
            if protatom.GetResidue().GetAtomProperty(protatom, 8) \
                    and ligatom.GetResidue().GetAtomProperty(ligatom, 8):
                continue

        found.append(hbond(a=acc.a, a_orig_idx=acc.a_orig_idx, d=don.d, d_orig_idx=don.d_orig_idx, h=don.h,
                           distance_ah=dist_ah, distance_ad=dist_ad, angle=v, type=typ, protisdon=protisdon,
                           resnr=resnr, restype=restype, reschain=reschain, resnr_l=resnr_l,
                           restype_l=restype_l, reschain_l=reschain_l, sidechain=is_sidechain_hbond,
                           atype=acc.a.type, dtype=don.d.type))
    return filter_contacts(found)
def download_structure(inputpdbid):
    """Download the PDB structure for a PDB ID and store it below config.BASEPATH.

    The ID must be four characters long and be accepted by ``extract_pdbid``;
    otherwise an error is logged and the process exits with status 1. The same
    happens when a ValueError is raised anywhere during the download or while
    the file is written.

    :param inputpdbid: PDB identifier as given by the user.
    :return: tuple ``(pdbpath, pdbid)`` - path of the written file and the ID
        as returned by ``fetch_pdb``.
    """
    try:
        wrong_format = len(inputpdbid) != 4 or extract_pdbid(inputpdbid.lower()) == 'UnknownProtein'
        if wrong_format:
            logger.error(f'invalid PDB-ID (wrong format): {inputpdbid}')
            sys.exit(1)
        pdbfile, pdbid = fetch_pdb(inputpdbid.lower())
        target_dir = config.BASEPATH.rstrip('/')
        pdbpath = tilde_expansion(f'{target_dir}/{pdbid}.pdb')
        create_folder_if_not_exists(config.BASEPATH)
        with open(pdbpath, 'w') as handle:
            handle.write(pdbfile)
        logger.info(f'file downloaded as {pdbpath}')
    except ValueError:  # Invalid PDB ID, cannot fetch from RCSB server
        logger.error(f'PDB-ID does not exist: {inputpdbid}')
        sys.exit(1)
    else:
        return pdbpath, pdbid
def metal_complexation(metals, metal_binding_lig, metal_binding_bs):
    """Find all metal complexes between metals and appropriate groups in both protein and ligand, as well as water.

    Steps, per metal that has at least one binding target closer than
    config.METAL_DIST_MAX:

    1. collect the metal->target vectors and, for each target atom, the angles
       to all other targets;
    2. score every known coordination geometry against these observed angles;
    3. choose a geometry (or 'NA') and report one 'metal_complex' tuple per
       target that is not excluded by the chosen geometry. Metals whose targets
       are all located in 'water' are not reported.

    :param metals: metal records (read: ``m``, ``m_orig_idx``, ``orig_m``).
    :param metal_binding_lig: binding-group records on the ligand side.
    :param metal_binding_bs: binding-group records on the binding-site side.
    :return: result of ``filter_contacts`` applied to the collected tuples.
    """
    data = namedtuple(
        'metal_complex',
        'metal metal_orig_idx metal_type target target_orig_idx target_type '
        'coordination_num distance resnr restype '
        'reschain restype_l reschain_l resnr_l location rms, geometry num_partners complexnum')
    # Record fit information for each geometry tested
    gdata = namedtuple('gdata', 'geometry rms coordination excluded diff_targets')  # Geometry Data

    # Listing of coordination numbers and their geometries
    configs = {
        2: ['linear'],
        3: ['trigonal.planar', 'trigonal.pyramidal'],
        4: ['tetrahedral', 'square.planar'],
        5: ['trigonal.bipyramidal', 'square.pyramidal'],
        6: ['octahedral'],
    }
    # Angle signatures for each geometry (as seen from each target atom)
    ideal_angles = {
        'linear': [[180.0]] * 2,
        'trigonal.planar': [[120.0, 120.0]] * 3,
        'trigonal.pyramidal': [[109.5, 109.5]] * 3,
        'tetrahedral': [[109.5, 109.5, 109.5, 109.5]] * 4,
        'square.planar': [[90.0, 90.0, 90.0, 90.0]] * 4,
        'trigonal.bipyramidal': [[120.0, 120.0, 90.0, 90.0]] * 3 + [[90.0, 90.0, 90.0, 180.0]] * 2,
        'square.pyramidal': [[90.0, 90.0, 90.0, 180.0]] * 4 + [[90.0, 90.0, 90.0, 90.0]],
        'octahedral': [[90.0, 90.0, 90.0, 90.0, 180.0]] * 6,
    }

    # @todo Refactor
    pairings = []
    pairings_dict = {}  # metal atom -> list of (target, distance)
    metal_to_id = {}
    metal_to_orig_atom = {}
    for metal, target in itertools.product(metals, metal_binding_lig + metal_binding_bs):
        distance = euclidean3d(metal.m.coords, target.atom.coords)
        if not distance < config.METAL_DIST_MAX:
            continue
        if metal.m in pairings_dict:
            pairings_dict[metal.m].append((target, distance))
        else:
            pairings_dict[metal.m] = [(target, distance)]
            metal_to_id[metal.m] = metal.m_orig_idx
            metal_to_orig_atom[metal.m] = metal.orig_m

    for cnum, metal in enumerate(pairings_dict):  # cnum + 1 being the complex number
        rms = 0.0
        excluded = []
        contact_pairs = pairings_dict[metal]
        num_targets = len(contact_pairs)

        # Vectors from the metal to each target, grouped by target atom index
        vectors_dict = defaultdict(list)
        for target, distance in contact_pairs:
            vectors_dict[target.atom.idx].append(vector(metal.coords, target.atom.coords))

        # Observed angles from the perspective of each target atom
        angles_dict = {}
        for target_idx, own_vectors in vectors_dict.items():
            other_vectors = [vec
                             for other_idx, vecs in vectors_dict.items() if other_idx != target_idx
                             for vec in vecs]
            angles_dict[target_idx] = [vecangle(own, other)
                                       for own, other in itertools.product(own_vectors, other_vectors)]

        if num_targets == 1:
            # Can't specify geometry with only one target
            final_geom = 'NA'
            final_coo = 1
            excluded = []
            rms = 0.0
        else:
            all_total = []
            for coo in sorted(configs, reverse=True):  # Start with highest coordination number
                coo_diff = num_targets - coo  # How many more observed targets are there?
                for geometry in configs[coo]:
                    signature = ideal_angles[geometry]  # Set of ideal angles for geometry, from each perspective
                    geometry_scores = []  # All scores for one geometry (from all subsignatures)
                    used_up_targets = []  # Use each target just once for a subsignature

                    # Find best match for each subsignature
                    for subsignature in signature:  # Ideal angles from one perspective
                        best_target = None  # There's one best-matching target for each subsignature
                        best_target_score = 999
                        for candidate in angles_dict:
                            if candidate in used_up_targets:
                                continue
                            observed_angles = angles_dict[candidate]  # Observed angles from perspective of one target
                            single_target_scores = []
                            used_up_observed_angles = []
                            for ideal_angle in subsignature:
                                # For each angle in the signature, find the best-matching observed angle
                                best_match = None
                                best_match_diff = 999
                                for j, observed_angle in enumerate(observed_angles):
                                    if j in used_up_observed_angles:
                                        continue
                                    diff = abs(ideal_angle - observed_angle)
                                    if diff < best_match_diff:
                                        best_match_diff = diff
                                        best_match = j
                                if best_match is not None:
                                    used_up_observed_angles.append(best_match)
                                    single_target_scores.append(best_match_diff)
                            # Root of the summed squared deviations for this target / subsignature
                            target_total = sum([x ** 2 for x in single_target_scores]) ** 0.5
                            if target_total < best_target_score:
                                best_target_score = target_total
                                best_target = candidate
                        used_up_targets.append(best_target)
                        geometry_scores.append(best_target_score)

                    # Total score is mean of the per-subsignature scores
                    geometry_total = np.mean(geometry_scores)
                    # Record the targets not used for excluding them when deciding for a final geometry
                    not_used = [idx for idx in angles_dict if idx not in used_up_targets]
                    all_total.append(gdata(geometry=geometry, rms=geometry_total, coordination=coo,
                                           excluded=not_used, diff_targets=coo_diff))

            # Make a decision here. Starting with the geometry with lowest difference in ideal and observed partners ...
            # Check if the difference between the RMS to the next best solution is not larger than 0.5
            all_total = sorted(all_total, key=lambda x: abs(x.diff_targets))
            last_pair = len(all_total) - 2
            for i, (total, next_total) in enumerate(zip(all_total, all_total[1:])):
                if next_total.rms - total.rms > 0.5:
                    chosen = total
                elif next_total.rms < 3.5:
                    chosen = next_total
                elif i == last_pair:
                    # No candidate was acceptable
                    final_geom, final_coo, rms, excluded = "NA", "NA", float('nan'), []
                    break
                else:
                    continue
                final_geom, final_coo = chosen.geometry, chosen.coordination
                rms, excluded = chosen.rms, chosen.excluded
                break

        # Record all contact pairing, excluding those with targets superfluous for chosen geometry
        only_water = {pair[0].location for pair in contact_pairs} == {'water'}
        if only_water:  # No complex if just with water as targets
            continue
        logger.info(
            f'metal ion {metal.type} complexed with {final_geom} geometry (coo. number {final_coo}/ {num_targets} observed)')
        for target, distance in contact_pairs:
            if target.atom.idx in excluded:
                continue
            metal_orig_atom = metal_to_orig_atom[metal]
            restype_l = whichrestype(metal_orig_atom)
            reschain_l = whichchain(metal_orig_atom)
            resnr_l = whichresnumber(metal_orig_atom)
            pairings.append(data(metal=metal, metal_orig_idx=metal_to_id[metal], metal_type=metal.type,
                                 target=target, target_orig_idx=target.atom_orig_idx, target_type=target.type,
                                 coordination_num=final_coo, distance=distance, resnr=target.resnr,
                                 restype=target.restype, reschain=target.reschain, location=target.location,
                                 rms=rms, geometry=final_geom, num_partners=num_targets, complexnum=cnum + 1,
                                 resnr_l=resnr_l, restype_l=restype_l, reschain_l=reschain_l))
    return filter_contacts(pairings)