Exemple #1
0
def parse_index(path, index):
    '''
    Parse an index file into a dictionary keyed by the four-character `pdb` field.

    Lines starting with '#' are skipped. Every other line is matched against a
    fixed regular expression; a line that does not match is reported through
    the module logger as a warning and otherwise ignored.

    Parameters
    ----------
    path:
        Not used by this function; kept so that existing callers keep working.
    index: str
        Path of the index file to read.

    Returns
    -------
    data: dict
        Maps each `pdb` code to a dict of strings with the keys 'resolution',
        'year', 'pKx', 'type', 'relation', 'value' and 'unit'. If a code occurs
        on more than one line, the last line wins.
    '''
    regexp = r"""^
                (?P<pdb>\w{4})\s+
                (?P<resolution>\d[.]\d{2}|NMR)\s+
                (?P<year>\d{4})\s+
                (?P<pKx>\d{1,2}[.]\d{2})\s+
                (?P<type>\w{2,4})
                (?P<relation>[<>=~]{1,2})
                (?P<value>\d+[.]\d+|\d+)
                (?P<unit>\w{2}).+"""

    pattern = re.compile(regexp, re.VERBOSE)

    data = {}

    # THE CONTEXT MANAGER CLOSES THE FILE EVEN IF PARSING RAISES
    with open(index) as handle:
        for line in handle:

            # SKIP COMMENT LINES
            if line.startswith('#'):
                continue

            match = pattern.match(line)

            # PRINT A WARNING IF REGULAR EXPRESSION FAILED ON A LINE
            if not match:
                logger.warning("Could not parse line: {0}".format(line))
                continue

            # THE PDB CODE BECOMES THE KEY, THE REMAINING GROUPS THE VALUE
            rowdata = match.groupdict()
            pdb = rowdata.pop('pdb')
            data[pdb] = rowdata

    return data
def parse_index(path, index):
    '''
    Read an index file and return its rows as a dict keyed by the `pdb` field.

    A line is skipped when it starts with '#'. A line that the regular
    expression below cannot match is logged as a warning and left out of the
    result.

    Parameters
    ----------
    path:
        Unused here; retained for backward compatibility with callers.
    index: str
        Path of the index file to open.

    Returns
    -------
    data: dict
        One entry per parsed `pdb` code. Each value is a dict of strings with
        the keys 'resolution', 'year', 'pKx', 'type', 'relation', 'value' and
        'unit'. A later line with the same code replaces an earlier one.
    '''
    regexp = r"""^
                (?P<pdb>\w{4})\s+
                (?P<resolution>\d[.]\d{2}|NMR)\s+
                (?P<year>\d{4})\s+
                (?P<pKx>\d{1,2}[.]\d{2})\s+
                (?P<type>\w{2,4})
                (?P<relation>[<>=~]{1,2})
                (?P<value>\d+[.]\d+|\d+)
                (?P<unit>\w{2}).+"""

    pattern = re.compile(regexp, re.VERBOSE)

    data = {}

    # OPEN WITH A CONTEXT MANAGER SO THE FILE HANDLE IS ALWAYS RELEASED
    with open(index) as index_file:
        for line in index_file:

            # COMMENT LINES CARRY NO DATA
            if line.startswith('#'):
                continue

            match = pattern.match(line)

            # PRINT A WARNING IF REGULAR EXPRESSION FAILED ON A LINE
            if not match:
                logger.warning("Could not parse line: {0}".format(line))
                continue

            # USE THE PDB CODE AS KEY AND KEEP THE OTHER NAMED GROUPS AS VALUE
            rowdata = match.groupdict()
            pdb = rowdata.pop('pdb')
            data[pdb] = rowdata

    return data
Exemple #3
0
def parse_index(path, index):
    '''
    Parse an index file into a dictionary keyed by the `pdb` field.

    parse files:
        INDEX_general_PL_data.2015
        INDEX_refined_data.2015
        INDEX_core_data.2015

    Lines starting with '#' are skipped. Lines that do not match the regular
    expression are logged as a warning and ignored.

    Parameters
    ----------
    path:
        Not used by this function; kept so that existing callers keep working.
    index: str
        Path of the index file to read.

    Returns
    -------
    data: dict
        All values are strings, for example:
        {'1mhw':{'year':        '2002',
                'resolution':   '1.90',
                'type':         'Ki',
                'value':        '45',
                'relation':     '=',
                'pKx':          '7.35',
                'unit':         'nM'}
        }
        If a code occurs on more than one line, the last line wins.
    '''
    regexp = r"""^
                (?P<pdb>\w{4})\s+
                (?P<resolution>\d[.]\d{2}|NMR)\s+
                (?P<year>\d{4})\s+
                (?P<pKx>\d{1,2}[.]\d{2})\s+
                (?P<type>\w{2,4})
                (?P<relation>[<>=~]{1,2})
                (?P<value>\d+[.]\d+|\d+)
                (?P<unit>\w{2}).+"""

    pattern = re.compile(regexp, re.VERBOSE)

    data = {}

    # THE WITH-BLOCK GUARANTEES THE FILE IS CLOSED WHEN PARSING ENDS OR FAILS
    with open(index) as handle:
        for line in handle:

            # SKIP COMMENT LINES
            if line.startswith('#'):
                continue

            match = pattern.match(line)

            # PRINT A WARNING IF REGULAR EXPRESSION FAILED ON A LINE
            if not match:
                logger.warning("Could not parse line: {0}".format(line))
                continue

            # THE PDB CODE BECOMES THE KEY, THE REMAINING GROUPS THE VALUE
            rowdata = match.groupdict()
            pdb = rowdata.pop('pdb')
            data[pdb] = rowdata

    return data
Exemple #4
0
def sift_descriptor(protein, ligand, binsize=0.0):
    """
    Calculates a descriptor of the protein-ligand complex as the sum of the structural
    interaction fingerprints (SIFTs) of all interacting atoms.

    Parameters
    ----------
    protein:
        Protein structure; passed unchanged to get_contacts().
    ligand:
        Ligand structure; passed to ob.get_atom_types() and get_contacts().
    binsize: float
        Size of the distance bins in Angstrom that will be used to bin the contacts.
        A falsy value (the default) disables binning.

    Returns
    -------
    descriptor: numpy.ndarray
        1D integer array. Without binning it holds one counter per interaction
        type; with binning it is allocated with (number of bin edges + 1)
        consecutive counters per interaction type.
    labels: list of str
        The interaction type names, or "<type>-B<i>" names when binning is used.
    """
    # SUPPRESS OPENBABEL WARNINGS
    pybel.ob.obErrorLog.StopLogging()

    # ELEMENT TABLE TO DETERMINE VDW AND COVALENT BONDS
    et = OBElementTable()

    # CREDO DESCRIPTOR LABELS
    interaction_types = ['covalent','vdw_clash','vdw','proximal','hbond','weak_hbond',
                         'xbond','ionic','metal_complex','aromatic','hydrophobic',
                         'carbonyl']

    numcols = len(interaction_types)

    # GENERATE THE DISTANCE BINS
    if binsize:

        # get the distance bins for the given cutoff and bin size
        bins = get_distance_bins(config['cutoff'], binsize)

        # NUMBER OF COLUMNS RESERVED FOR EACH INTERACTION TYPE
        offset = bins.size + 1

        # DEBUG DISTANCE BINS
        logger.debug("The distance bins in Angstrom are {0}.".format(bins))

        # NUMBER OF TOTAL COLUMNS IN DESCRIPTOR
        numcols *= (bins.size + 1)

        # CREATE A LABEL FOR EACH INTERACTION TYPE AND DISTANCE BIN
        labels = ["{0}-B{1}".format(interaction_type, i)
                  for interaction_type in interaction_types
                  for i in range(len(bins) + 1)]

    # LABEL WITHOUT BINS: ONE COLUMN PER INTERACTION TYPE
    else:
        offset = 1
        labels = interaction_types

    # DESCRIPTOR THAT WILL CONTAIN THE SUM OF ALL ELEMENT-ELEMENT INTERACTIONS
    descriptor = numpy.zeros(numcols, dtype=int)

    # TYPED INTERACTION TESTS, IN THE ORDER OF interaction_types[4:]
    typed_checks = (interactions.is_hbond,
                    interactions.is_weak_hbond,
                    interactions.is_xbond,
                    interactions.is_ionic,
                    interactions.is_metal_complex,
                    interactions.is_aromatic,
                    interactions.is_hydrophobic,
                    interactions.is_carbonyl)

    # GET THE ATOM TYPES FOR THE LIGAND
    # CALCULATED ON THE FLY
    lig_atom_types = ob.get_atom_types(ligand, config)

    contacts = get_contacts(protein, ligand, config['cutoff'])

    # ITERATE THROUGH CONTACT PAIRS AND DETERMINE SIFT
    for hetatm, hetatm_contacts in contacts:

        # GET THE ATOM TYPES FOR THE HETATM
        hetatm_types = lig_atom_types[hetatm.GetIdx()]

        # GET ATOM RADII FOR THE LIGAND ATOM
        hetatm_cov = et.GetCovalentRad(hetatm.GetAtomicNum())
        hetatm_vdw = et.GetVdwRad(hetatm.GetAtomicNum())

        # ITERATE THROUGH ALL THE CONTACTS THE HETATM HAS
        for atom, distance in hetatm_contacts:

            residue = atom.GetResidue()
            res_name = residue.GetName()[:3]

            # IGNORE WATER RESIDUES
            if res_name == 'HOH': continue

            # GET ATOM TYPES FOR THE PROTEIN ATOM
            try:
                atom_types = res_atom_types[res_name][residue.GetAtomID(atom).strip()]
            except KeyError:
                logger.warning("Cannot find atom types for {} {}."
                               .format(res_name, residue.GetAtomID(atom).strip()))
                continue

            sum_cov = hetatm_cov + et.GetCovalentRad(atom.GetAtomicNum())
            sum_vdw = hetatm_vdw + et.GetVdwRad(atom.GetAtomicNum())

            # BIN INTERACTIONS
            if binsize:

                # GET THE BIN THIS CONTACT BELONGS IN
                # DIGITIZE TAKES AN ARRAY-LIKE AS INPUT
                # NOTE(review): the + 1 shifts every index up by one, so a distance
                # beyond the last bin edge would land outside this type's columns
                # - presumably get_distance_bins() prevents that; confirm.
                bin_id = numpy.digitize([distance,], bins)[0] + 1

            else:
                bin_id = 0

            # INITIALIZE STRUCTURAL INTERACTION FINGERPRINT
            # MUST BE INTEGER: NUMPY REFUSES TO ADD A FLOAT ARRAY IN PLACE TO THE
            # INTEGER DESCRIPTOR
            sift = numpy.zeros(descriptor.size, dtype=int)

            # COVALENT BOND - SHOULD NOT OCCUR IN PDBBIND
            if distance <= sum_cov: sift[0 * offset + bin_id] = 1

            # VAN DER WAALS CLASH
            elif distance <= sum_vdw: sift[1 * offset + bin_id] = 1

            # VAN DER WAALS CONTACT
            elif distance <= sum_vdw + 0.5: sift[2 * offset + bin_id] = 1

            # PROXIMAL
            else: sift[3 * offset + bin_id] = 1

            # TYPED INTERACTIONS OCCUPY THE SLOTS AFTER THE FOUR DISTANCE CLASSES
            for slot, check in enumerate(typed_checks, 4):
                if check(hetatm, hetatm_types, atom, atom_types, distance):
                    sift[slot * offset + bin_id] = 1

            descriptor += sift

    # NOTE(review): the return value is discarded - presumably
    # sum_descriptor_bins() modifies the descriptor in place; confirm.
    if binsize: sum_descriptor_bins(descriptor, bins)

    return descriptor, labels
Exemple #5
0
def sift_descriptor(protein, ligand, binsize=0.0):
    """
    Calculates a descriptor of the protein-ligand complex as the sum of the structural
    interaction fingerprints (SIFTs) of all interacting atoms.

    Parameters
    ----------
    protein:
        Protein structure; passed unchanged to get_contacts().
    ligand:
        Ligand structure; passed to ob.get_atom_types() and get_contacts().
    binsize: float
        Size of the distance bins in Angstrom that will be used to bin the contacts.
        A falsy value (the default) disables binning.

    Returns
    -------
    descriptor: numpy.ndarray
        1D integer array. Without binning it holds one counter per interaction
        type; with binning it is allocated with (number of bin edges + 1)
        consecutive counters per interaction type.
    labels: list of str
        The interaction type names, or "<type>-B<i>" names when binning is used.
    """
    # SUPPRESS OPENBABEL WARNINGS
    pybel.ob.obErrorLog.StopLogging()

    # ELEMENT TABLE TO DETERMINE VDW AND COVALENT BONDS
    et = OBElementTable()

    # CREDO DESCRIPTOR LABELS
    interaction_types = [
        'covalent', 'vdw_clash', 'vdw', 'proximal', 'hbond', 'weak_hbond',
        'xbond', 'ionic', 'metal_complex', 'aromatic', 'hydrophobic',
        'carbonyl'
    ]

    numcols = len(interaction_types)

    # GENERATE THE DISTANCE BINS
    if binsize:

        # get the distance bins for the given cutoff and bin size
        bins = get_distance_bins(config['cutoff'], binsize)

        # NUMBER OF COLUMNS RESERVED FOR EACH INTERACTION TYPE
        offset = bins.size + 1

        # DEBUG DISTANCE BINS
        logger.debug("The distance bins in Angstrom are {0}.".format(bins))

        # NUMBER OF TOTAL COLUMNS IN DESCRIPTOR
        numcols *= (bins.size + 1)

        # CREATE A LABEL FOR EACH INTERACTION TYPE AND DISTANCE BIN
        labels = [
            "{0}-B{1}".format(interaction_type, i)
            for interaction_type in interaction_types
            for i in range(len(bins) + 1)
        ]

    # LABEL WITHOUT BINS: ONE COLUMN PER INTERACTION TYPE
    else:
        offset = 1
        labels = interaction_types

    # DESCRIPTOR THAT WILL CONTAIN THE SUM OF ALL ELEMENT-ELEMENT INTERACTIONS
    descriptor = numpy.zeros(numcols, dtype=int)

    # TYPED INTERACTION TESTS, IN THE ORDER OF interaction_types[4:]
    typed_checks = (
        interactions.is_hbond,
        interactions.is_weak_hbond,
        interactions.is_xbond,
        interactions.is_ionic,
        interactions.is_metal_complex,
        interactions.is_aromatic,
        interactions.is_hydrophobic,
        interactions.is_carbonyl,
    )

    # GET THE ATOM TYPES FOR THE LIGAND
    # CALCULATED ON THE FLY
    lig_atom_types = ob.get_atom_types(ligand, config)

    contacts = get_contacts(protein, ligand, config['cutoff'])

    # ITERATE THROUGH CONTACT PAIRS AND DETERMINE SIFT
    for hetatm, hetatm_contacts in contacts:

        # GET THE ATOM TYPES FOR THE HETATM
        hetatm_types = lig_atom_types[hetatm.GetIdx()]

        # GET ATOM RADII FOR THE LIGAND ATOM
        hetatm_cov = et.GetCovalentRad(hetatm.GetAtomicNum())
        hetatm_vdw = et.GetVdwRad(hetatm.GetAtomicNum())

        # ITERATE THROUGH ALL THE CONTACTS THE HETATM HAS
        for atom, distance in hetatm_contacts:

            residue = atom.GetResidue()
            res_name = residue.GetName()[:3]

            # IGNORE WATER RESIDUES
            if res_name == 'HOH':
                continue

            # GET ATOM TYPES FOR THE PROTEIN ATOM
            try:
                atom_types = res_atom_types[res_name][residue.GetAtomID(
                    atom).strip()]
            except KeyError:
                logger.warning("Cannot find atom types for {} {}.".format(
                    res_name,
                    residue.GetAtomID(atom).strip()))
                continue

            sum_cov = hetatm_cov + et.GetCovalentRad(atom.GetAtomicNum())
            sum_vdw = hetatm_vdw + et.GetVdwRad(atom.GetAtomicNum())

            # BIN INTERACTIONS
            if binsize:

                # GET THE BIN THIS CONTACT BELONGS IN
                # DIGITIZE TAKES AN ARRAY-LIKE AS INPUT
                # NOTE(review): the + 1 shifts every index up by one, so a distance
                # beyond the last bin edge would land outside this type's columns
                # - presumably get_distance_bins() prevents that; confirm.
                bin_id = numpy.digitize([
                    distance,
                ], bins)[0] + 1

            else:
                bin_id = 0

            # INITIALIZE STRUCTURAL INTERACTION FINGERPRINT
            # MUST BE INTEGER: NUMPY REFUSES TO ADD A FLOAT ARRAY IN PLACE TO THE
            # INTEGER DESCRIPTOR
            sift = numpy.zeros(descriptor.size, dtype=int)

            # COVALENT BOND - SHOULD NOT OCCUR IN PDBBIND
            if distance <= sum_cov:
                sift[0 * offset + bin_id] = 1

            # VAN DER WAALS CLASH
            elif distance <= sum_vdw:
                sift[1 * offset + bin_id] = 1

            # VAN DER WAALS CONTACT
            elif distance <= sum_vdw + 0.5:
                sift[2 * offset + bin_id] = 1

            # PROXIMAL
            else:
                sift[3 * offset + bin_id] = 1

            # TYPED INTERACTIONS OCCUPY THE SLOTS AFTER THE FOUR DISTANCE CLASSES
            for slot, check in enumerate(typed_checks, 4):
                if check(hetatm, hetatm_types, atom, atom_types, distance):
                    sift[slot * offset + bin_id] = 1

            descriptor += sift

    # NOTE(review): the return value is discarded - presumably
    # sum_descriptor_bins() modifies the descriptor in place; confirm.
    if binsize:
        sum_descriptor_bins(descriptor, bins)

    return descriptor, labels