def test_hash_ecfp(self):
    """Check that rgf.hash_ecfp yields a non-negative integer below 2**power.

    Ten random 10-character strings are hashed for each of the tested
    powers (2, 16 and 64).
    """
    trials_per_power = 10
    for bits in (2, 16, 64):
        upper_bound = 2**bits
        for _trial in range(trials_per_power):
            text = random_string(10)
            digest = rgf.hash_ecfp(text, bits)
            self.assertIsInstance(digest, integer_types)
            self.assertLess(digest, upper_bound)
            self.assertGreaterEqual(digest, 0)
def getOriginalIdentifiers(mol,
                           prop=[
                               'AtomicNumber', 'AtomicMass',
                               'TotalConnections', 'HCount',
                               'HeavyNeighborCount', 'FormalCharge',
                               'DeltaMass', 'IsTerminalAtom', 'SolidAngle',
                               'SolidAngleValue', 'SolidAngleSign'
                           ],
                           sa_dict=None,
                           includeAtoms=None,
                           radius=2,
                           hash_type='str',
                           idf_power=64):
    """Compute the original (radius-0) identifiers for atoms in a molecule
    based on atomic properties. Note it only includes HEAVY atoms.

    Parameters:
        mol - rdkit.Chem.rdchem.Mol molecule
        prop - atomic property list (only membership is tested, never mutated)
            'AtomicNumber': the atomic number of atom
            'AtomicMass': the mass of atom
            'TotalConnections': the degree of the atom (atom.GetDegree())
            'HCount': the value of atom.GetNumExplicitHs()
            'HeavyNeighborCount': the number of heavy (non-hydrogen) neighbor atoms
            'FormalCharge': the formal charge of atom
            'DeltaMass': the difference between atomic mass and atomic weight
                         (as given by the periodic table)
            'IsTerminalAtom': 1 if the atom has degree 1, else 0
            'SolidAngle': the solid angle of the atom (missing/None counts as 0)
            'SolidAngleValue': the absolute solid angle of the atom
            'SolidAngleSign': the sign of solid angle of the atom ('+', '-', '0')
            The three solid-angle entries are mutually exclusive: at most one
            value is appended, with precedence 'SolidAngle' >
            'SolidAngleValue' > any other name containing 'SolidAngle'.
        sa_dict - a mapping from atom indices to their solid angles; None is
                  treated as "no solid angle known for any atom"
        includeAtoms - atom indices for getting identifiers (None: all atoms)
        radius - ECFP radius, only calculates the identifiers of atoms in the
                 neighborhoods (of radius) of included atoms (includeAtoms)
        hash_type - type for hash the properties, can be 'str' or 'vec'
        idf_power - power for the 'str' hash type (default 64-bit integers)

    Returns a dictionary mapping (heavy-atom index, 0) to an integer
    representing the atomic properties. If hash_type is neither 'str' nor
    'vec', a message is printed and the dictionary built so far is returned.
    """
    tbl = GetPeriodicTable()
    idf_dict = {}
    as_vec = hash_type == 'vec'
    # Loop-invariant: does any requested property concern solid angles?
    uses_solid_angle = any('SolidAngle' in p for p in prop)

    def _fmt(value):
        # 'vec' hashing keeps raw numbers; otherwise use a fixed 2-decimal
        # string so the joined text fed to hash_ecfp is stable.
        return value if as_vec else '%.2f' % value

    if includeAtoms is None:
        indices = range(mol.GetNumAtoms())
    else:
        indices = includeAtoms

    for i in indices:
        index = int(i)
        env = list(
            Chem.FindAtomEnvironmentOfRadiusN(mol, radius, index, useHs=True))
        env_bonds = [mol.GetBondWithIdx(bid) for bid in env]
        env_aids = set([bond.GetBeginAtomIdx() for bond in env_bonds] +
                       [bond.GetEndAtomIdx() for bond in env_bonds])
        if not env_aids:
            # An empty environment (e.g. radius 0, or nothing found at this
            # radius) must still yield an identifier for the centre atom,
            # matching the fallback in getECFPstringsRadiusN_avg_ecfp.
            env_aids = set([index])

        for aid in env_aids:
            if (aid, 0) in idf_dict:
                continue
            atom = mol.GetAtomWithIdx(aid)
            if atom.GetAtomicNum() <= 1:
                # hydrogens never receive an identifier
                continue

            properties = []
            if 'AtomicNumber' in prop:
                properties.append(atom.GetAtomicNum())
            if 'AtomicMass' in prop:
                properties.append(_fmt(atom.GetMass()))
            if 'TotalConnections' in prop:
                properties.append(atom.GetDegree())
            if 'HCount' in prop:
                # NOTE(review): only explicit Hs are counted here; confirm
                # whether GetTotalNumHs() was intended.
                properties.append(atom.GetNumExplicitHs())
            if 'HeavyNeighborCount' in prop:
                properties.append(
                    len([
                        bond for bond in atom.GetBonds()
                        if bond.GetOtherAtom(atom).GetAtomicNum() > 1
                    ]))
            if 'FormalCharge' in prop:
                properties.append(_fmt(atom.GetFormalCharge()))
            if 'DeltaMass' in prop:
                delta_mass = atom.GetMass() - tbl.GetAtomicWeight(
                    atom.GetAtomicNum())
                properties.append(_fmt(delta_mass))
            if 'IsTerminalAtom' in prop:
                properties.append(1 if atom.GetDegree() == 1 else 0)
            if uses_solid_angle:
                sa = None if sa_dict is None else sa_dict[aid]
                solang = 0 if (sa is None) else sa
                if 'SolidAngle' in prop:
                    properties.append(_fmt(solang))
                elif 'SolidAngleValue' in prop:
                    properties.append(_fmt(abs(solang)))
                else:
                    solang_sign = '0' if (sa in [None, 0]) else (
                        '+' if sa > 0 else '-')
                    properties.append(solang_sign)

            if hash_type == 'str':
                idf = hash_ecfp(ecfp=','.join([str(p) for p in properties]),
                                power=idf_power)
            elif hash_type == 'vec':
                idf = hash(tuple(properties))
            else:
                print('Wrong hash type!')
                return idf_dict
            idf_dict[(aid, 0)] = idf
    return idf_dict
def getIdentifiersRadiusN_all(molinfo,
                              prop=[
                                  'AtomicNumber', 'AtomicMass',
                                  'TotalConnections', 'HCount',
                                  'HeavyNeighborCount', 'FormalCharge',
                                  'DeltaMass', 'IsTerminalAtom', 'SolidAngle',
                                  'SolidAngleValue', 'SolidAngleSign'
                              ],
                              sa_dict=None,
                              includeAtoms=None,
                              radius=2,
                              hash_type='str',
                              idf_power=64):
    """Calculate the Identifiers of all molecular fragments (each originated
    from an atom, of radius N, can be redundant) in a molecule.

    Parameters:
        molinfo - a tuple describing a molecule (coordinates,
                  rdkit.Chem.rdchem.Mol molecule, weights), weights = None for
                  non-weighted alpha shapes; only molinfo[1] is used here
        prop, sa_dict, radius, includeAtoms, hash_type and idf_power - same as
                  in getOriginalIdentifiers

    Returns a dictionary mapping (atom index, layer) to a tuple
    (identifier, env), where env is the sorted list of bond indices of the
    atom environment of that radius (an empty list for layer 0).
    If hash_type is neither 'str' nor 'vec', a message is printed and an empty
    list is returned.
    """
    mol = molinfo[1]
    nAtoms = mol.GetNumAtoms()
    deadAtoms = [0] * nAtoms

    # get original identifiers (of radius 0) of included atoms and their
    # neighbors (in neighborhood of radius)
    idf_dict = getOriginalIdentifiers(mol=mol,
                                      prop=prop,
                                      sa_dict=sa_dict,
                                      includeAtoms=includeAtoms,
                                      radius=radius,
                                      hash_type=hash_type,
                                      idf_power=idf_power)
    idfs_all = {k: (v, []) for (k, v) in idf_dict.items()}

    # get atom orders
    if includeAtoms is not None:
        # put the query atoms in front positions (access first); copy to a
        # list so tuples/arrays of indices are accepted as well
        included = list(includeAtoms)
        included_lookup = set(included)
        ids_fil = set([key[0] for key in idf_dict])
        atomOrder = included + [
            i for i in ids_fil if i not in included_lookup
        ]
    else:
        atomOrder = range(nAtoms)

    # iteratively calculate the identifiers of larger radius
    # (range(0) is empty, so radius == 0 returns the layer-0 identifiers)
    for layer in range(radius):
        for ind in atomOrder:
            index = int(ind)
            if deadAtoms[index]:
                continue
            atom = mol.GetAtomWithIdx(index)
            # Hydrogens, isolated atoms and atoms lacking an identifier at
            # this layer cannot be extended to the next layer.
            if (atom.GetAtomicNum() == 1 or atom.GetDegree() == 0
                    or (index, layer) not in idfs_all):
                deadAtoms[index] = 1
                continue

            env = sorted(
                Chem.FindAtomEnvironmentOfRadiusN(mol,
                                                  layer + 1,
                                                  index,
                                                  useHs=True))

            nbrs = []
            for bond in atom.GetBonds():
                oth_index = bond.GetOtherAtomIdx(index)
                if (oth_index, layer) in idfs_all:
                    bt = bond.GetBondTypeAsDouble()
                    nbrs.append((bt, idfs_all[(oth_index, layer)][0]))
            nbrs.sort()

            # use [layer, idf, (nbr1_bondtype, nbr1_idf), ...,
            # (nbrN_bondtype, nbrN_idf)] to represent an atomic neighborhood
            # of a specific radius (layer)
            nbrhd = [layer, idfs_all[(index, layer)][0]] + nbrs

            if hash_type == 'str':
                idf = hash_ecfp(ecfp=','.join([str(itm) for itm in nbrhd]),
                                power=idf_power)
            elif hash_type == 'vec':
                idf = hash(tuple(nbrhd))
            else:
                print('Wrong hash type!!!')
                return []
            idfs_all[(index, layer + 1)] = (idf, env)
    return idfs_all
def getECFPstringsRadiusN_avg_ecfp(molinfo,
                                   heavy_atoms=0,
                                   base_prop=['AtomicMass'],
                                   sa_dict={},
                                   indices=None,
                                   degree=2,
                                   parameters={
                                       'weighted': 0,
                                       'alpha': -1,
                                       'alpha_step': 0.1
                                   },
                                   hash_type='str',
                                   idf_power=64):
    """Obtain molecular fragment for all atoms emanating outward to given
    degree, using the ECFP procedure. For each fragment, collect the atomic
    properties of its atoms (via get_atom_proplist) and hash them to an
    integer.

    Parameters:
        molinfo - a tuple describing a molecule (coordinates,
                  rdkit.Chem.rdchem.Mol molecule, weights), weights = None for
                  non-weighted alpha shapes
        heavy_atoms - if true, hydrogen atoms are skipped and environments are
                  searched with useHs=False; if false (0), all atoms are used
        base_prop - base atomic property list, forwarded to get_atom_proplist
            'AtomicMass': the atomic mass of atom
            'TotalConnections': the degree of the atom in the molecule including Hs
            'HeavyNeighborCount': the number of heavy (non-hydrogen) neighbor atoms
            'HCount': the number of attached hydrogens
            'FormalCharge': the formal charge of atom
            'DeltaMass': the difference between atomic mass and atomic weight
            'SolidAngle': the solid angle of the atom on the molecule surface
            'SolidAngleValue': the absolute value of solid angle of the atom
            'SolidAngleSign': the sign of solid angle of the atom
        sa_dict - a mapping from atom indices to their solid angles; when it is
                  empty or None and a solid-angle property is requested, the
                  solid angles are computed with concave_hull_3D
        indices - indices for queried atoms (None: all atoms)
        degree - ecfp radius
        parameters - parameters for calculating the solid angles of surface
                  atoms (for concave_hull_3D class); 'alpha' and 'alpha_step'
                  are read
        hash_type - type for hashing the fragment, either 'str' (using
                  hash_ecfp function) or 'vec' (using the default hash function)
        idf_power - power for the 'str' hash type (default 64-bit integers)

    Returns a dictionary mapping (atom index, 'r<dg>') to the hashed
    identifier of the fragment of radius dg centred on that atom. Fragments
    whose bond environment duplicates one already recorded are omitted.
    If hash_type is neither 'str' nor 'vec', a message is printed and the
    dictionary built so far is returned.
    """
    ecfp_dict = {}
    mol = molinfo[1]
    nAtoms = mol.GetNumAtoms()
    deadAtoms = [0] * nAtoms
    # Bond environments already recorded, stored as tuples for O(1) lookup.
    seen_envs = set()

    sa_list = sa_dict
    needs_solid_angle = any('SolidAngle' in p for p in base_prop)
    if needs_solid_angle and (sa_dict is None or len(sa_dict) == 0):
        ch = concave_hull_3D(points=molinfo[0],
                             weights=molinfo[2],
                             alpha=parameters['alpha'],
                             alpha_step=parameters['alpha_step'])
        ch.construct_conchull()
        sa_list = ch.compute_solid_angles()

    # Materialise the indices: they are traversed once per degree.
    aids_all = range(nAtoms) if indices is None else list(indices)

    for dg in range(degree + 1):
        neighborhoodThisRound = []
        for ix in aids_all:
            i = int(ix)
            if deadAtoms[i]:
                continue
            atom = mol.GetAtomWithIdx(i)
            is_skipped_hydrogen = heavy_atoms and atom.GetAtomicNum() == 1
            is_isolated = atom.GetDegree() == 0
            if is_skipped_hydrogen or is_isolated:
                deadAtoms[i] = 1
                continue

            env = sorted(
                Chem.FindAtomEnvironmentOfRadiusN(mol,
                                                  dg,
                                                  i,
                                                  useHs=not heavy_atoms))
            env_bonds = [mol.GetBondWithIdx(bid) for bid in env]
            tmp_aids = set([bond.GetBeginAtomIdx() for bond in env_bonds] +
                           [bond.GetEndAtomIdx() for bond in env_bonds])
            # an empty environment stands for the centre atom alone
            env_aids = tmp_aids if tmp_aids else set([i])

            # NOTE: the SMILES of the environment is not part of the hashed
            # properties; only the atomic property list is used.
            tmpprop = get_atom_proplist(mol=mol,
                                        sa_dict=sa_list,
                                        aids=env_aids,
                                        base_prop=base_prop,
                                        hash_type=hash_type)

            if hash_type == 'str':
                idf = hash_ecfp(ecfp=','.join(tmpprop), power=idf_power)
            elif hash_type == 'vec':
                idf = hash(tuple(tmpprop))
            else:
                print('Wrong hash type!!!')
                return ecfp_dict

            if dg == 0:
                ecfp_dict[(i, 'r0')] = idf
            else:
                neighborhoodThisRound.append((env, idf, i))
                # environment already recorded in a previous round: the atom
                # cannot grow any further
                if tuple(env) in seen_envs:
                    deadAtoms[i] = 1

        # Deterministic order decides which atom keeps a shared environment;
        # the list is empty for dg == 0.
        neighborhoodThisRound.sort()
        for cand_env, cand_idf, cand_aid in neighborhoodThisRound:
            env_key = tuple(cand_env)
            if env_key not in seen_envs:
                seen_envs.add(env_key)
                ecfp_dict[(cand_aid, 'r' + str(dg))] = cand_idf
            else:
                # has the same environment as that of atoms in this round
                # (or an earlier one)
                deadAtoms[cand_aid] = 1
    return ecfp_dict