def parse_index(path, index):
    """
    Parse a PDBbind index file into a dictionary keyed by PDB code.

    Lines starting with '#' are skipped. Every other line is matched against
    a fixed regular expression; lines that do not match are reported through
    the module logger as a warning and otherwise ignored.

    Parameters
    ----------
    path: str
        Not used by this function; kept so that existing callers keep
        working.
    index: str
        Path of the index file to read (for example
        INDEX_general_PL_data.2015, INDEX_refined_data.2015 or
        INDEX_core_data.2015).

    Returns
    -------
    data: dict
        Maps each PDB code to a dict holding the remaining captured fields
        ('resolution', 'year', 'pKx', 'type', 'relation', 'value', 'unit'),
        all as strings, for example:

        {'1mhw': {'year': '2002', 'resolution': '1.90', 'type': 'Ki',
                  'value': '45', 'relation': '=', 'pKx': '7.35',
                  'unit': 'nM'}}

        If a PDB code occurs more than once, the last occurrence wins.
    """
    # Verbose regular expression: whitespace inside the pattern is ignored,
    # the trailing '.+' requires at least one more character after the unit.
    regexp = r"""^
                 (?P<pdb>\w{4})\s+
                 (?P<resolution>\d[.]\d{2}|NMR)\s+
                 (?P<year>\d{4})\s+
                 (?P<pKx>\d{1,2}[.]\d{2})\s+
                 (?P<type>\w{2,4})
                 (?P<relation>[<>=~]{1,2})
                 (?P<value>\d+[.]\d+|\d+)
                 (?P<unit>\w{2}).+"""

    pattern = re.compile(regexp, re.VERBOSE)

    data = {}

    # The context manager guarantees the file handle is closed, even if
    # parsing raises half-way through the file.
    with open(index) as handle:
        for line in handle:
            # COMMENT / HEADER LINES
            if line.startswith('#'):
                continue

            match = pattern.match(line)

            # PRINT A WARNING IF REGULAR EXPRESSION FAILED ON A LINE
            if not match:
                logger.warning("Could not parse line: {0}".format(line))
                continue

            rowdata = match.groupdict()
            pdb = rowdata.pop('pdb')
            data[pdb] = rowdata

    return data
def parse_index(path, index):
    """
    Parse a PDBbind index file into a dictionary keyed by PDB code.

    parse files:
        INDEX_general_PL_data.2015
        INDEX_refined_data.2015
        INDEX_core_data.2015

    Lines starting with '#' are skipped. Every other line is matched against
    a fixed regular expression; lines that do not match are reported through
    the module logger as a warning and otherwise ignored.

    Parameters
    ----------
    path: str
        Not used by this function; kept so that existing callers keep
        working.
    index: str
        Path of the index file to read.

    Returns
    -------
    data: dict
        Maps each PDB code to a dict holding the remaining captured fields,
        all as strings, for example:

        {'1mhw': {'year': '2002', 'resolution': '1.90', 'type': 'Ki',
                  'value': '45', 'relation': '=', 'pKx': '7.35',
                  'unit': 'nM'}}

        If a PDB code occurs more than once, the last occurrence wins.
    """
    # Verbose regular expression: whitespace inside the pattern is ignored,
    # the trailing '.+' requires at least one more character after the unit.
    regexp = r"""^
                 (?P<pdb>\w{4})\s+
                 (?P<resolution>\d[.]\d{2}|NMR)\s+
                 (?P<year>\d{4})\s+
                 (?P<pKx>\d{1,2}[.]\d{2})\s+
                 (?P<type>\w{2,4})
                 (?P<relation>[<>=~]{1,2})
                 (?P<value>\d+[.]\d+|\d+)
                 (?P<unit>\w{2}).+"""

    pattern = re.compile(regexp, re.VERBOSE)

    data = {}

    # The context manager guarantees the file handle is closed, even if
    # parsing raises half-way through the file.
    with open(index) as handle:
        for line in handle:
            # COMMENT / HEADER LINES
            if line.startswith('#'):
                continue

            match = pattern.match(line)

            # PRINT A WARNING IF REGULAR EXPRESSION FAILED ON A LINE
            if not match:
                logger.warning("Could not parse line: {0}".format(line))
                continue

            rowdata = match.groupdict()
            pdb = rowdata.pop('pdb')
            data[pdb] = rowdata

    return data
def sift_descriptor(protein, ligand, binsize=0.0):
    """
    Calculates a descriptor of the protein-ligand complex as the sum of the
    structural interaction fingerprints (SIFTs) of all interacting atoms.

    Parameters
    ----------
    protein:
        Protein structure, handed unchanged to get_contacts().
        NOTE(review): presumably an already loaded Open Babel molecule rather
        than a file path - confirm against the callers.
    ligand:
        Ligand structure, handed unchanged to ob.get_atom_types() and
        get_contacts(). Same caveat as for protein.
    binsize: float
        Size of the distance bins in Angstrom that will be used to bin the
        contacts. A false value (the default 0.0) disables binning. The bin
        edges are whatever get_distance_bins(config['cutoff'], binsize)
        returns.

    Returns
    -------
    descriptor: numpy.ndarray
        1D integer array. Without binning it has one entry per interaction
        type; with binning it has (number of bin edges + 1) consecutive
        entries per interaction type.
    labels: list of str
        One label per descriptor column: the interaction type names without
        binning, otherwise "<interaction type>-B<bin index>".
    """
    # SUPPRESS OPENBABEL WARNINGS
    pybel.ob.obErrorLog.StopLogging()

    # ELEMENT TABLE TO DETERMINE VDW AND COVALENT BONDS
    et = OBElementTable()

    # CREDO DESCRIPTOR LABELS
    interaction_types = ['covalent', 'vdw_clash', 'vdw', 'proximal', 'hbond',
                         'weak_hbond', 'xbond', 'ionic', 'metal_complex',
                         'aromatic', 'hydrophobic', 'carbonyl']

    numcols = len(interaction_types)

    # GENERATE THE DISTANCE BINS
    if binsize:
        # get the distance bins for the given cutoff and bin size
        bins = get_distance_bins(config['cutoff'], binsize)

        # NUMBER OF COLUMNS RESERVED FOR EACH INTERACTION TYPE
        offset = bins.size + 1

        # DEBUG DISTANCE BINS
        logger.debug("The distance bins in Angstrom are {0}.".format(bins))

        # NUMBER OF TOTAL COLUMNS IN DESCRIPTOR
        numcols *= offset

        # CREATE A COLUMN FOR EACH INTERACTION TYPE AND DISTANCE BIN
        labels = ["{0}-B{1}".format(interaction_type, i)
                  for interaction_type in interaction_types
                  for i in range(offset)]

    # LABEL WITHOUT BINS
    else:
        bins = None
        offset = 1
        labels = interaction_types

    # DESCRIPTOR THAT WILL CONTAIN THE SUM OF ALL ATOM-ATOM INTERACTIONS
    descriptor = numpy.zeros(numcols, dtype=int)

    # GET THE ATOM TYPES FOR THE LIGAND
    # CALCULATED ON THE FLY
    lig_atom_types = ob.get_atom_types(ligand, config)

    contacts = get_contacts(protein, ligand, config['cutoff'])

    # TYPED INTERACTION TESTS, IN THE ORDER OF THEIR DESCRIPTOR COLUMNS.
    # THE FIRST FOUR COLUMNS ARE THE DISTANCE-ONLY CLASSES HANDLED BELOW.
    typed_checks = (interactions.is_hbond,
                    interactions.is_weak_hbond,
                    interactions.is_xbond,
                    interactions.is_ionic,
                    interactions.is_metal_complex,
                    interactions.is_aromatic,
                    interactions.is_hydrophobic,
                    interactions.is_carbonyl)
    first_typed_slot = 4

    # ITERATE THROUGH CONTACT PAIRS AND DETERMINE SIFT
    for hetatm, hetatm_contacts in contacts:

        # GET THE ATOM TYPES FOR THE HETATM
        hetatm_types = lig_atom_types[hetatm.GetIdx()]

        # GET ATOM RADII FOR THE LIGAND ATOM
        hetatm_cov = et.GetCovalentRad(hetatm.GetAtomicNum())
        hetatm_vdw = et.GetVdwRad(hetatm.GetAtomicNum())

        # ITERATE THROUGH ALL THE CONTACTS THE HETATM HAS
        for atom, distance in hetatm_contacts:
            residue = atom.GetResidue()
            res_name = residue.GetName()[:3]

            # IGNORE WATER RESIDUES
            if res_name == 'HOH':
                continue

            atom_name = residue.GetAtomID(atom).strip()

            # GET ATOM TYPES FOR THE PROTEIN ATOM
            try:
                atom_types = res_atom_types[res_name][atom_name]
            except KeyError:
                logger.warning("Cannot find atom types for {} {}."
                               .format(res_name, atom_name))
                continue

            sum_cov = hetatm_cov + et.GetCovalentRad(atom.GetAtomicNum())
            sum_vdw = hetatm_vdw + et.GetVdwRad(atom.GetAtomicNum())

            # BIN INTERACTIONS
            if binsize:
                # GET THE BIN THIS CONTACT BELONGS IN
                # DIGITIZE TAKES AN ARRAY-LIKE AS INPUT
                # NOTE(review): numpy.digitize already returns 0..len(bins);
                # with the "+ 1" the first column of every interaction type
                # is never written and a distance >= bins[-1] would index the
                # first column of the next type (or past the end for
                # 'carbonyl'). Kept as is - confirm against
                # get_distance_bins() / sum_descriptor_bins().
                bin_id = numpy.digitize([distance], bins)[0] + 1
            else:
                bin_id = 0

            # INITIALIZE STRUCTURAL INTERACTION FINGERPRINT
            # Integer dtype is required: adding a float array in place to the
            # integer descriptor is rejected by NumPy's casting rules.
            sift = numpy.zeros(descriptor.size, dtype=int)

            # EXACTLY ONE DISTANCE-ONLY CLASS APPLIES TO EVERY CONTACT
            # COVALENT BOND - SHOULD NOT OCCUR IN PDBBIND
            if distance <= sum_cov:
                distance_slot = 0
            # VAN DER WAALS CLASH
            elif distance <= sum_vdw:
                distance_slot = 1
            # VAN DER WAALS CONTACT
            elif distance <= sum_vdw + 0.5:
                distance_slot = 2
            # PROXIMAL
            else:
                distance_slot = 3

            sift[distance_slot * offset + bin_id] = 1

            # THE TYPED INTERACTIONS ARE NOT MUTUALLY EXCLUSIVE
            for slot, check in enumerate(typed_checks, first_typed_slot):
                if check(hetatm, hetatm_types, atom, atom_types, distance):
                    sift[slot * offset + bin_id] = 1

            descriptor += sift

    if binsize:
        # NOTE(review): the return value is not captured, so this presumably
        # updates the descriptor in place - confirm in sum_descriptor_bins().
        sum_descriptor_bins(descriptor, bins)

    return descriptor, labels
def sift_descriptor(protein, ligand, binsize=0.0):
    """
    Calculates a descriptor of the protein-ligand complex as the sum of the
    structural interaction fingerprints (SIFTs) of all interacting atoms.

    Parameters
    ----------
    protein:
        Protein structure, handed unchanged to get_contacts().
        NOTE(review): presumably an already loaded Open Babel molecule rather
        than a file path - confirm against the callers.
    ligand:
        Ligand structure, handed unchanged to ob.get_atom_types() and
        get_contacts(). Same caveat as for protein.
    binsize: float
        Size of the distance bins in Angstrom that will be used to bin the
        contacts. A false value (the default 0.0) disables binning. The bin
        edges are whatever get_distance_bins(config['cutoff'], binsize)
        returns.

    Returns
    -------
    descriptor: numpy.ndarray
        1D integer array. Without binning it has one entry per interaction
        type; with binning it has (number of bin edges + 1) consecutive
        entries per interaction type.
    labels: list of str
        One label per descriptor column: the interaction type names without
        binning, otherwise "<interaction type>-B<bin index>".
    """
    # SUPPRESS OPENBABEL WARNINGS
    pybel.ob.obErrorLog.StopLogging()

    # ELEMENT TABLE TO DETERMINE VDW AND COVALENT BONDS
    et = OBElementTable()

    # CREDO DESCRIPTOR LABELS
    interaction_types = [
        'covalent', 'vdw_clash', 'vdw', 'proximal', 'hbond', 'weak_hbond',
        'xbond', 'ionic', 'metal_complex', 'aromatic', 'hydrophobic',
        'carbonyl'
    ]

    numcols = len(interaction_types)

    # GENERATE THE DISTANCE BINS
    if binsize:
        # get the distance bins for the given cutoff and bin size
        bins = get_distance_bins(config['cutoff'], binsize)

        # NUMBER OF COLUMNS RESERVED FOR EACH INTERACTION TYPE
        offset = bins.size + 1

        # DEBUG DISTANCE BINS
        logger.debug("The distance bins in Angstrom are {0}.".format(bins))

        # NUMBER OF TOTAL COLUMNS IN DESCRIPTOR
        numcols *= offset

        # CREATE A COLUMN FOR EACH INTERACTION TYPE AND DISTANCE BIN
        labels = [
            "{0}-B{1}".format(interaction_type, i)
            for interaction_type in interaction_types
            for i in range(offset)
        ]

    # LABEL WITHOUT BINS
    else:
        bins = None
        offset = 1
        labels = interaction_types

    # DESCRIPTOR THAT WILL CONTAIN THE SUM OF ALL ATOM-ATOM INTERACTIONS
    descriptor = numpy.zeros(numcols, dtype=int)

    # GET THE ATOM TYPES FOR THE LIGAND
    # CALCULATED ON THE FLY
    lig_atom_types = ob.get_atom_types(ligand, config)

    contacts = get_contacts(protein, ligand, config['cutoff'])

    # TYPED INTERACTION TESTS, IN THE ORDER OF THEIR DESCRIPTOR COLUMNS.
    # THE FIRST FOUR COLUMNS ARE THE DISTANCE-ONLY CLASSES HANDLED BELOW.
    typed_checks = (
        interactions.is_hbond,
        interactions.is_weak_hbond,
        interactions.is_xbond,
        interactions.is_ionic,
        interactions.is_metal_complex,
        interactions.is_aromatic,
        interactions.is_hydrophobic,
        interactions.is_carbonyl,
    )
    first_typed_slot = 4

    # ITERATE THROUGH CONTACT PAIRS AND DETERMINE SIFT
    for hetatm, hetatm_contacts in contacts:

        # GET THE ATOM TYPES FOR THE HETATM
        hetatm_types = lig_atom_types[hetatm.GetIdx()]

        # GET ATOM RADII FOR THE LIGAND ATOM
        hetatm_cov = et.GetCovalentRad(hetatm.GetAtomicNum())
        hetatm_vdw = et.GetVdwRad(hetatm.GetAtomicNum())

        # ITERATE THROUGH ALL THE CONTACTS THE HETATM HAS
        for atom, distance in hetatm_contacts:
            residue = atom.GetResidue()
            res_name = residue.GetName()[:3]

            # IGNORE WATER RESIDUES
            if res_name == 'HOH':
                continue

            atom_name = residue.GetAtomID(atom).strip()

            # GET ATOM TYPES FOR THE PROTEIN ATOM
            try:
                atom_types = res_atom_types[res_name][atom_name]
            except KeyError:
                logger.warning("Cannot find atom types for {} {}.".format(
                    res_name, atom_name))
                continue

            sum_cov = hetatm_cov + et.GetCovalentRad(atom.GetAtomicNum())
            sum_vdw = hetatm_vdw + et.GetVdwRad(atom.GetAtomicNum())

            # BIN INTERACTIONS
            if binsize:
                # GET THE BIN THIS CONTACT BELONGS IN
                # DIGITIZE TAKES AN ARRAY-LIKE AS INPUT
                # NOTE(review): numpy.digitize already returns 0..len(bins);
                # with the "+ 1" the first column of every interaction type
                # is never written and a distance >= bins[-1] would index the
                # first column of the next type (or past the end for
                # 'carbonyl'). Kept as is - confirm against
                # get_distance_bins() / sum_descriptor_bins().
                bin_id = numpy.digitize([distance], bins)[0] + 1
            else:
                bin_id = 0

            # INITIALIZE STRUCTURAL INTERACTION FINGERPRINT
            # Integer dtype is required: adding a float array in place to the
            # integer descriptor is rejected by NumPy's casting rules.
            sift = numpy.zeros(descriptor.size, dtype=int)

            # EXACTLY ONE DISTANCE-ONLY CLASS APPLIES TO EVERY CONTACT
            # COVALENT BOND - SHOULD NOT OCCUR IN PDBBIND
            if distance <= sum_cov:
                distance_slot = 0
            # VAN DER WAALS CLASH
            elif distance <= sum_vdw:
                distance_slot = 1
            # VAN DER WAALS CONTACT
            elif distance <= sum_vdw + 0.5:
                distance_slot = 2
            # PROXIMAL
            else:
                distance_slot = 3

            sift[distance_slot * offset + bin_id] = 1

            # THE TYPED INTERACTIONS ARE NOT MUTUALLY EXCLUSIVE
            for slot, check in enumerate(typed_checks, first_typed_slot):
                if check(hetatm, hetatm_types, atom, atom_types, distance):
                    sift[slot * offset + bin_id] = 1

            descriptor += sift

    if binsize:
        # NOTE(review): the return value is not captured, so this presumably
        # updates the descriptor in place - confirm in sum_descriptor_bins().
        sum_descriptor_bins(descriptor, bins)

    return descriptor, labels