Esempio n. 1
0
def fetch_pdb(pdbid):
    """Download the PDB-format file for the given PDB ID from the RCSB server.

    The ID is lower-cased before it is inserted into the download URL. The
    status check for obsolete entries is currently disabled (see the @todo
    below), so the entry is fetched exactly as requested.

    Exits with status 1 if the server answers with an HTTP error, or if the
    returned text contains the word 'sorry' (see the comment at that check).

    :param pdbid: PDB ID of the entry to download
    :return: list ``[pdbfile, pdbid]`` holding the decoded file content and the lower-cased ID
    """
    pdbid = pdbid.lower()
    logger.info(f'checking status of PDB-ID {pdbid}')
    # @todo re-implement state check with new RCSB API, see https://www.rcsb.org/news?year=2020&article=5eb18ccfd62245129947212a&feature=true
    # state, current_entry = check_pdb_status(pdbid)  # Get state and current PDB ID
    #
    # if state == 'OBSOLETE':
    #     logger.info(f'entry is obsolete, getting {current_entry} instead')
    # elif state == 'CURRENT':
    #     logger.info('entry is up-to-date')
    # elif state == 'UNKNOWN':
    #     logger.error('invalid PDB-ID (entry does not exist on PDB server)')
    #     sys.exit(1)
    logger.info('downloading file from PDB')
    # get URL for current entry
    # @todo needs update to react properly on response codes of RCSB servers
    pdburl = f'http://www.rcsb.org/pdb/files/{pdbid}.pdb'
    try:
        # The context manager closes the HTTP response (and its socket) even
        # if reading or decoding fails.
        with urlopen(pdburl) as response:
            pdbfile = response.read().decode()
    except HTTPError:
        logger.error('no file in PDB format available from wwPDB for the given PDB ID')
        sys.exit(1)
    # If no PDB file is available, a text is now shown with "We're sorry, but ..."
    # Could previously be distinguished by an HTTP error
    if 'sorry' in pdbfile:
        logger.error('no file in PDB format available from wwPDB for the given PDB ID.')
        sys.exit(1)
    return [pdbfile, pdbid]
Esempio n. 2
0
def run_analysis(inputstructs, inputpdbids):
    """Main function. Calls functions for processing, report generation and visualization.

    Either structure inputs or PDB IDs are processed, never both: when
    ``inputstructs`` is not None it takes precedence and ``inputpdbids`` is
    ignored.

    :param inputstructs: list of paths to structure files, where the entry '-'
        means "read the structure from STDIN"; None to use ``inputpdbids``
    :param inputpdbids: list of PDB IDs to download and analyse
    """
    pdbid, pdbpath = None, None
    # @todo For multiprocessing, implement better stacktracing for errors
    # Print title and version
    logger.info(f'Protein-Ligand Interaction Profiler (PLIP) {__version__}')
    logger.info(f'brought to you by: {config.__maintainer__}')
    logger.info('please cite: https://www.doi.org/10.1093/nar/gkv315')
    output_prefix = config.OUTPUTFILENAME

    if inputstructs is not None:  # Process PDB file(s)
        # Counted before duplicates are removed, as in the original logic.
        num_structures = len(inputstructs)
        inputstructs = remove_duplicates(inputstructs)
        for inputstruct in inputstructs:
            # Decide per input whether it comes from STDIN. A flag set once
            # outside the loop would stay True after the first '-' entry and
            # make every following file path be parsed as raw PDB content.
            read_from_stdin = inputstruct == '-'
            if read_from_stdin:
                inputstruct = sys.stdin.read()
                if config.RAWSTRING:
                    # Interpret escape sequences (e.g. a literal "\n") in the piped text
                    inputstruct = bytes(inputstruct,
                                        'utf8').decode('unicode_escape')
            else:
                if os.path.getsize(inputstruct) == 0:
                    logger.error('empty PDB file')
                    sys.exit(1)
                if num_structures > 1:
                    # One output folder per structure, named after the file
                    basename = inputstruct.split('.')[-2].split('/')[-1]
                    config.OUTPATH = '/'.join([config.BASEPATH, basename])
                    output_prefix = 'report'
            process_pdb(inputstruct,
                        config.OUTPATH,
                        as_string=read_from_stdin,
                        outputprefix=output_prefix)
    else:  # Try to fetch the current PDB structure(s) directly from the RCSB server
        num_pdbids = len(inputpdbids)
        inputpdbids = remove_duplicates(inputpdbids)
        for inputpdbid in inputpdbids:
            pdbpath, pdbid = download_structure(inputpdbid)
            if num_pdbids > 1:
                # Output folder is BASEPATH/<2nd+3rd ID character>/<ID>, upper-cased
                config.OUTPATH = '/'.join(
                    [config.BASEPATH, pdbid[1:3].upper(),
                     pdbid.upper()])
                output_prefix = 'report'
            process_pdb(pdbpath, config.OUTPATH, outputprefix=output_prefix)

    if (pdbid is not None
            or inputstructs is not None) and config.BASEPATH is not None:
        if config.BASEPATH in ['.', './']:
            logger.info(
                'finished analysis, find the result files in the working directory'
            )
        else:
            logger.info(
                f'finished analysis, find the result files in {config.BASEPATH}'
            )
Esempio n. 3
0
def remove_duplicates(slist):
    """Return the unique entries of an input list, keeping first-occurrence order.

    The number of removed duplicates, if any, is logged.

    :param slist: list of hashable entries (e.g. file paths or PDB IDs)
    :return: new list with duplicates removed, in the order of first appearance
    """
    # dict.fromkeys keeps insertion order, so inputs are processed in the
    # order the caller gave them; a set would return an arbitrary order that
    # can change between runs for strings.
    unique = list(dict.fromkeys(slist))
    difference = len(slist) - len(unique)
    if difference == 1:
        logger.info('removed one duplicate entry from input list')
    elif difference > 1:
        logger.info(f'Removed {difference} duplicate entries from input list')
    return unique
Esempio n. 4
0
def process_pdb(pdbfile, outpath, as_string=False, outputprefix='report'):
    """Analyse a single PDB structure and write the configured outputs.

    Depending on the config flags this produces PyMOL visualizations
    (``config.PYMOL`` / ``config.PICS``), an XML report (``config.XML``) and a
    text report (``config.TXT``).

    :param pdbfile: path to the PDB file, or the PDB content itself if ``as_string`` is True
    :param outpath: output folder; assigned to the complex and created if missing
    :param as_string: whether ``pdbfile`` holds the file content rather than a path
    :param outputprefix: prefix handed to the structure report
    """
    if as_string:
        logger.info('starting analysis from STDIN')
    else:
        logger.info(f"starting analysis of {pdbfile.split('/')[-1]}")

    structure = PDBComplex()
    structure.output_path = outpath
    structure.load_pdb(pdbfile, as_string=as_string)
    # @todo Offers possibility for filter function from command line (by ligand chain, position, hetid)
    for ligand in structure.ligands:
        structure.characterize_complex(ligand)

    create_folder_if_not_exists(outpath)

    # Set up the report writer before any output is generated
    report = StructureReport(structure, outputprefix=outputprefix)

    # Never use more workers than there are interaction sets
    config.MAXTHREADS = min(config.MAXTHREADS, len(structure.interaction_sets))

    # PyMOL visualization, run in parallel when more than one thread is allowed
    if config.PYMOL or config.PICS:
        from plip.visualization.visualize import visualize_in_pymol
        complexes = []
        for site in sorted(structure.interaction_sets):
            # Skip sites without any interacting residue
            if len(structure.interaction_sets[site].interacting_res) != 0:
                complexes.append(VisualizerData(structure, site))
        if config.MAXTHREADS > 1:
            logger.info(
                f'generating visualizations in parallel on {config.MAXTHREADS} cores'
            )
            run_parallel = parallel_fn(visualize_in_pymol)
            run_parallel(complexes, processes=config.MAXTHREADS)
        else:
            for plcomplex in complexes:
                visualize_in_pymol(plcomplex)

    # Report in xml format
    if config.XML:
        report.write_xml(as_string=config.STDOUT)

    # Report in txt (rst) format
    if config.TXT:
        report.write_txt(as_string=config.STDOUT)
Esempio n. 5
0
def hbonds(acceptors, donor_pairs, protisdon, typ):
    """Detect hydrogen bonds between sets of acceptors and donor pairs.

    A pairing is kept when the acceptor-donor distance lies strictly between
    config.MIN_DIST and config.HBOND_DIST_MAX and the angle at the hydrogen
    (between the H->donor and H->acceptor vectors) exceeds
    config.HBOND_DON_ANGLE_MIN. When config.INTRA is set, two further filters
    are applied (see the comments at those checks).

    :param acceptors: acceptor objects; ``a``, ``a_orig_idx`` and ``a_orig_atom`` are read
    :param donor_pairs: donor objects; ``d``, ``h``, ``d_orig_idx`` and ``d_orig_atom`` are read
    :param protisdon: True if the donors belong to the protein side
    :param typ: label stored in the ``type`` field of each contact
    :return: result of ``filter_contacts`` applied to the detected 'hbond' tuples
    """
    data = namedtuple('hbond', 'a a_orig_idx d d_orig_idx h distance_ah distance_ad angle type protisdon resnr '
                               'restype reschain resnr_l restype_l reschain_l sidechain atype dtype')

    # DEBUG
    if protisdon:
        logger.info(f'Ligand has {len(acceptors)} acceptors')
        logger.info(f'Prot has {len(donor_pairs)} donor pairs of type {typ}')
    else:
        logger.info(f'Prot has {len(acceptors)} acceptors')
        logger.info(f'Lig has {len(donor_pairs)} donor pairs of type {typ}')

    found = []
    for acc, don in itertools.product(acceptors, donor_pairs):
        dist_ah = euclidean3d(acc.a.coords, don.h.coords)  # acceptor to H dist
        dist_ad = euclidean3d(acc.a.coords, don.d.coords)  # acceptor to donor dist
        if not config.MIN_DIST < dist_ad < config.HBOND_DIST_MAX:
            continue
        angle = vecangle(vector(don.h.coords, don.d.coords), vector(don.h.coords, acc.a.coords))
        if not angle > config.HBOND_DON_ANGLE_MIN:
            continue

        # Assign the protein-side and ligand-side partners according to who donates
        if protisdon:
            prot_side, lig_side, lig_orig = don.d, acc.a, acc.a_orig_atom
        else:
            prot_side, lig_side, lig_orig = acc.a, don.d, don.d_orig_atom
        protatom, ligatom = prot_side.OBAtom, lig_side.OBAtom

        is_sidechain_hbond = protatom.GetResidue().GetAtomProperty(protatom, 8)  # Check if sidechain atom
        resnr, resnr_l = whichresnumber(prot_side), whichresnumber(lig_orig)
        restype, restype_l = whichrestype(prot_side), whichrestype(lig_orig)
        reschain, reschain_l = whichchain(prot_side), whichchain(lig_orig)

        if config.INTRA is not None:
            # Prevents H-Bonds within amino acids in intermolecular interactions
            if whichresnumber(don.d) == whichresnumber(acc.a):
                continue
            # Prevents backbone-backbone H-Bonds
            # NOTE(review): this tests the same atom property (8) that is used
            # as the sidechain flag above - confirm the intended semantics.
            if (protatom.GetResidue().GetAtomProperty(protatom, 8)
                    and ligatom.GetResidue().GetAtomProperty(ligatom, 8)):
                continue

        found.append(data(a=acc.a, a_orig_idx=acc.a_orig_idx, d=don.d, d_orig_idx=don.d_orig_idx, h=don.h,
                          distance_ah=dist_ah, distance_ad=dist_ad, angle=angle, type=typ, protisdon=protisdon,
                          resnr=resnr, restype=restype, reschain=reschain, resnr_l=resnr_l,
                          restype_l=restype_l, reschain_l=reschain_l, sidechain=is_sidechain_hbond,
                          atype=acc.a.type, dtype=don.d.type))
    return filter_contacts(found)
Esempio n. 6
0
def download_structure(inputpdbid):
    """Download the PDB structure for a PDB ID into config.BASEPATH.

    The ID must have exactly four characters and must not be resolved to
    'UnknownProtein' by ``extract_pdbid``; otherwise an error is logged and
    the process exits with status 1. The same happens if a ValueError is
    raised while validating, fetching or writing the file.

    :param inputpdbid: PDB ID as given by the user (any case)
    :return: tuple ``(pdbpath, pdbid)`` with the path of the written file and
        the ID returned by ``fetch_pdb``
    """
    try:
        has_valid_format = (len(inputpdbid) == 4
                            and extract_pdbid(inputpdbid.lower()) != 'UnknownProtein')
        if not has_valid_format:
            logger.error(f'invalid PDB-ID (wrong format): {inputpdbid}')
            sys.exit(1)

        pdbfile, pdbid = fetch_pdb(inputpdbid.lower())
        target_dir = config.BASEPATH.rstrip('/')
        pdbpath = tilde_expansion(f'{target_dir}/{pdbid}.pdb')
        create_folder_if_not_exists(config.BASEPATH)
        with open(pdbpath, 'w') as outfile:
            outfile.write(pdbfile)
        logger.info(f'file downloaded as {pdbpath}')
    except ValueError:  # Invalid PDB ID, cannot fetch from RCSB server
        logger.error(f'PDB-ID does not exist: {inputpdbid}')
        sys.exit(1)
    else:
        return pdbpath, pdbid
Esempio n. 7
0
def metal_complexation(metals, metal_binding_lig, metal_binding_bs):
    """Find all metal complexes between metals and appropriate groups in both protein and ligand, as well as water.

    For every metal, all targets from ``metal_binding_lig + metal_binding_bs``
    closer than ``config.METAL_DIST_MAX`` are collected. The angles between
    the metal-target vectors are then compared against the ideal angle
    signatures of a fixed set of coordination geometries, and one geometry
    (or 'NA') is selected per metal. Targets left over for the selected
    geometry are excluded from the output. Metals whose targets are all
    located in 'water' produce no contacts.

    :param metals: metal objects; the attributes ``m``, ``m_orig_idx`` and ``orig_m`` are read
    :param metal_binding_lig: candidate targets; the attributes ``atom``, ``atom_orig_idx``,
        ``type``, ``resnr``, ``restype``, ``reschain`` and ``location`` are read
    :param metal_binding_bs: further candidate targets with the same attributes
    :return: result of ``filter_contacts`` applied to the list of 'metal_complex' tuples
    """
    data = namedtuple(
        'metal_complex',
        'metal metal_orig_idx metal_type target target_orig_idx target_type '
        'coordination_num distance resnr restype '
        'reschain  restype_l reschain_l resnr_l location rms, geometry num_partners complexnum'
    )
    pairings_dict = {}  # metal.m -> list of (target, distance) within the distance cutoff
    pairings = []
    # #@todo Refactor
    metal_to_id = {}  # metal.m -> metal.m_orig_idx
    metal_to_orig_atom = {}  # metal.m -> metal.orig_m
    # Step 1: collect, per metal, every target that lies within the distance cutoff
    for metal, target in itertools.product(
            metals, metal_binding_lig + metal_binding_bs):
        distance = euclidean3d(metal.m.coords, target.atom.coords)
        if not distance < config.METAL_DIST_MAX:
            continue
        if metal.m not in pairings_dict:
            pairings_dict[metal.m] = [
                (target, distance),
            ]
            metal_to_id[metal.m] = metal.m_orig_idx
            metal_to_orig_atom[metal.m] = metal.orig_m
        else:
            pairings_dict[metal.m].append((target, distance))
    # Step 2: per metal (the dict keys are the metal.m atoms), pick a geometry and emit contacts
    for cnum, metal in enumerate(pairings_dict):
        rms = 0.0
        excluded = []
        # cnum +1 being the complex number
        contact_pairs = pairings_dict[metal]
        num_targets = len(contact_pairs)
        # target atom idx -> list of vectors computed from the metal and target coordinates
        vectors_dict = defaultdict(list)
        for contact_pair in contact_pairs:
            target, distance = contact_pair
            vectors_dict[target.atom.idx].append(
                vector(metal.coords, target.atom.coords))

        # Listing of coordination numbers and their geometries
        configs = {
            2: [
                'linear',
            ],
            3: ['trigonal.planar', 'trigonal.pyramidal'],
            4: ['tetrahedral', 'square.planar'],
            5: ['trigonal.bipyramidal', 'square.pyramidal'],
            6: [
                'octahedral',
            ]
        }

        # Angle signatures for each geometry (as seen from each target atom)
        # Each geometry maps to one sub-list ("subsignature") of ideal angles per coordinating position
        ideal_angles = {
            'linear': [[180.0]] * 2,
            'trigonal.planar': [[120.0, 120.0]] * 3,
            'trigonal.pyramidal': [[109.5, 109.5]] * 3,
            'tetrahedral': [[109.5, 109.5, 109.5, 109.5]] * 4,
            'square.planar': [[90.0, 90.0, 90.0, 90.0]] * 4,
            'trigonal.bipyramidal':
            [[120.0, 120.0, 90.0, 90.0]] * 3 + [[90.0, 90.0, 90.0, 180.0]] * 2,
            'square.pyramidal':
            [[90.0, 90.0, 90.0, 180.0]] * 4 + [[90.0, 90.0, 90.0, 90.0]],
            'octahedral': [[90.0, 90.0, 90.0, 90.0, 180.0]] * 6
        }
        angles_dict = {}  # target atom idx -> observed angles to the vectors of all other targets

        for target in vectors_dict:
            cur_vector = vectors_dict[target]
            other_vectors = []
            # Gather the vectors belonging to every other target atom idx
            for t in vectors_dict:
                if not t == target:
                    [other_vectors.append(x) for x in vectors_dict[t]]
            angles = [
                vecangle(pair[0], pair[1])
                for pair in itertools.product(cur_vector, other_vectors)
            ]
            angles_dict[target] = angles

        all_total = []  # Record fit information for each geometry tested
        gdata = namedtuple(
            'gdata',
            'geometry rms coordination excluded diff_targets')  # Geometry Data
        # Can't specify geometry with only one target
        if num_targets == 1:
            final_geom = 'NA'
            final_coo = 1
            excluded = []
            rms = 0.0
        else:
            for coo in sorted(
                    configs,
                    reverse=True):  # Start with highest coordination number
                geometries = configs[coo]
                for geometry in geometries:
                    signature = ideal_angles[
                        geometry]  # Set of ideal angles for geometry, from each perspective
                    geometry_total = 0
                    geometry_scores = [
                    ]  # All scores for one geometry (from all subsignatures)
                    used_up_targets = [
                    ]  # Use each target just once for a subsignature
                    not_used = []
                    coo_diff = num_targets - coo  # How many more observed targets are there?

                    # Find best match for each subsignature
                    for subsignature in signature:  # Ideal angles from one perspective
                        best_target = None  # There's one best-matching target for each subsignature
                        # Starting score; it is recorded unchanged (with best_target None)
                        # when no unused target is left for this subsignature
                        best_target_score = 999

                        for k, target in enumerate(angles_dict):
                            if target not in used_up_targets:
                                observed_angles = angles_dict[
                                    target]  # Observed angles from perspective of one target
                                single_target_scores = []
                                used_up_observed_angles = []
                                for i, ideal_angle in enumerate(subsignature):
                                    # For each angle in the signature, find the best-matching observed angle
                                    # (each observed angle is matched at most once)
                                    best_match = None
                                    best_match_diff = 999
                                    for j, observed_angle in enumerate(
                                            observed_angles):
                                        if j not in used_up_observed_angles:
                                            diff = abs(ideal_angle -
                                                       observed_angle)
                                            if diff < best_match_diff:
                                                best_match_diff = diff
                                                best_match = j
                                    if best_match is not None:
                                        used_up_observed_angles.append(
                                            best_match)
                                        single_target_scores.append(
                                            best_match_diff)
                                # Score for this target: square root of the summed squared angle deviations
                                target_total = sum([
                                    x**2 for x in single_target_scores
                                ])**0.5  # Tot. score targ/sig
                                if target_total < best_target_score:
                                    best_target_score = target_total
                                    best_target = target

                        used_up_targets.append(best_target)
                        geometry_scores.append(best_target_score)
                        # Total score is the mean of the per-subsignature scores
                        # (recomputed after every subsignature; the last value is the one recorded)
                        geometry_total = np.mean(geometry_scores)
                    # Record the targets not used for excluding them when deciding for a final geometry
                    [
                        not_used.append(target) for target in angles_dict
                        if target not in used_up_targets
                    ]
                    all_total.append(
                        gdata(geometry=geometry,
                              rms=geometry_total,
                              coordination=coo,
                              excluded=not_used,
                              diff_targets=coo_diff))

        # Make a decision here. Starting with the geometry with lowest difference in ideal and observed partners ...
        # For each candidate and its successor in that order:
        #   - successor's score worse by more than 0.5 -> take the current candidate
        #   - otherwise, successor's score below 3.5   -> take the successor
        #   - otherwise, at the second-to-last entry   -> give up and report 'NA'
        if not num_targets == 1:  # Can't decide for any geometry in that case
            all_total = sorted(all_total, key=lambda x: abs(x.diff_targets))
            for i, total in enumerate(all_total):
                next_total = all_total[i + 1]
                this_rms, next_rms = total.rms, next_total.rms
                diff_to_next = next_rms - this_rms
                if diff_to_next > 0.5:
                    final_geom, final_coo, rms, excluded = total.geometry, total.coordination, total.rms, total.excluded
                    break
                elif next_total.rms < 3.5:
                    final_geom, final_coo, = next_total.geometry, next_total.coordination
                    rms, excluded = next_total.rms, next_total.excluded
                    break
                elif i == len(all_total) - 2:
                    final_geom, final_coo, rms, excluded = "NA", "NA", float(
                        'nan'), []
                    break

        # Record all contact pairing, excluding those with targets superfluous for chosen geometry
        only_water = set([x[0].location for x in contact_pairs]) == {'water'}
        if not only_water:  # No complex if just with water as targets
            logger.info(
                f'metal ion {metal.type} complexed with {final_geom} geometry (coo. number {final_coo}/ {num_targets} observed)'
            )
            for contact_pair in contact_pairs:
                target, distance = contact_pair
                # 'excluded' holds target atom idx values left unused by the chosen geometry
                if target.atom.idx not in excluded:
                    metal_orig_atom = metal_to_orig_atom[metal]
                    # The *_l fields are looked up from the metal's original atom
                    restype_l, reschain_l, resnr_l = whichrestype(
                        metal_orig_atom), whichchain(
                            metal_orig_atom), whichresnumber(metal_orig_atom)
                    contact = data(metal=metal,
                                   metal_orig_idx=metal_to_id[metal],
                                   metal_type=metal.type,
                                   target=target,
                                   target_orig_idx=target.atom_orig_idx,
                                   target_type=target.type,
                                   coordination_num=final_coo,
                                   distance=distance,
                                   resnr=target.resnr,
                                   restype=target.restype,
                                   reschain=target.reschain,
                                   location=target.location,
                                   rms=rms,
                                   geometry=final_geom,
                                   num_partners=num_targets,
                                   complexnum=cnum + 1,
                                   resnr_l=resnr_l,
                                   restype_l=restype_l,
                                   reschain_l=reschain_l)
                    pairings.append(contact)
    return filter_contacts(pairings)