Esempio n. 1
0
 def test_hash_ecfp(self):
     """Hashes of random strings are integers in the range [0, 2**power)."""
     for bits in (2, 16, 64):
         # Exclusive upper bound of a hash folded to `bits` bits.
         limit = 2**bits
         for _attempt in range(10):
             digest = rgf.hash_ecfp(random_string(10), bits)
             self.assertIsInstance(digest, integer_types)
             self.assertLess(digest, limit)
             self.assertGreaterEqual(digest, 0)
 def test_hash_ecfp(self):
   """Check the type and value range of rgf.hash_ecfp for several powers."""
   n_trials = 10
   for power in (2, 16, 64):
     ceiling = 2**power  # hashes must stay strictly below this value
     for _ in range(n_trials):
       text = random_string(10)
       hashed = rgf.hash_ecfp(text, power)
       self.assertIsInstance(hashed, integer_types)
       self.assertLess(hashed, ceiling)
       self.assertGreaterEqual(hashed, 0)
Esempio n. 3
0
def _original_identifier_properties(atom, aid, prop, sa_dict, tbl, hash_type):
    """Collect the requested properties of one atom, in a fixed order.

    Parameters:
        atom - the atom object to describe
        aid - index of the atom (used to look up its solid angle)
        prop - collection of requested property names
        sa_dict - mapping from atom index to solid angle, or None
        tbl - periodic table object providing GetAtomicWeight
        hash_type - 'vec' keeps floating-point properties numeric; any other
                    value formats them with two decimals
    Returns the list of property values for this atom.
    """
    keep_numeric = hash_type == 'vec'

    def fmt(value):
        # Numeric for the tuple hash, two-decimal text for the string hash.
        return value if keep_numeric else '%.2f' % value

    properties = []
    if 'AtomicNumber' in prop:
        properties.append(atom.GetAtomicNum())
    if 'AtomicMass' in prop:
        properties.append(fmt(atom.GetMass()))
    if 'TotalConnections' in prop:
        properties.append(atom.GetDegree())
    if 'HCount' in prop:
        # NOTE(review): the docstring describes HCount as implicit + explicit
        # hydrogens, but GetNumExplicitHs() is what gets hashed. Left as is so
        # existing identifiers stay stable -- confirm the intended count.
        properties.append(atom.GetNumExplicitHs())
    if 'HeavyNeighborCount' in prop:
        properties.append(
            sum(1 for bond in atom.GetBonds()
                if bond.GetOtherAtom(atom).GetAtomicNum() > 1))
    if 'FormalCharge' in prop:
        properties.append(fmt(atom.GetFormalCharge()))
    if 'DeltaMass' in prop:
        delta_mass = atom.GetMass() - tbl.GetAtomicWeight(atom.GetAtomicNum())
        properties.append(fmt(delta_mass))
    if 'IsTerminalAtom' in prop:
        properties.append(1 if atom.GetDegree() == 1 else 0)
    if any('SolidAngle' in p for p in prop):
        # Without a solid-angle table every atom is treated like an atom whose
        # solid angle is unknown (None).
        sa = None if sa_dict is None else sa_dict[aid]
        solang = 0 if (sa is None) else sa
        # Only one solid-angle descriptor is used, in this order of precedence:
        # signed angle, then absolute value, then sign.
        if 'SolidAngle' in prop:
            properties.append(fmt(solang))
        elif 'SolidAngleValue' in prop:
            properties.append(fmt(abs(solang)))
        else:
            solang_sign = '0' if (sa in [None, 0]) else (
                '+' if sa > 0 else '-')
            properties.append(solang_sign)
    return properties


def getOriginalIdentifiers(mol,
                           prop=[
                               'AtomicNumber', 'AtomicMass',
                               'TotalConnections', 'HCount',
                               'HeavyNeighborCount', 'FormalCharge',
                               'DeltaMass', 'IsTerminalAtom', 'SolidAngle',
                               'SolidAngleValue', 'SolidAngleSign'
                           ],
                           sa_dict=None,
                           includeAtoms=None,
                           radius=2,
                           hash_type='str',
                           idf_power=64):
    """Compute the original (radius 0) identifiers of atoms from their atomic properties.
       Note it only includes HEAVY atoms (atomic number > 1).
    Parameters:
        mol - rdkit.Chem.rdchem.Mol molecule
        prop - atomic property list (never modified by this function)
               'AtomicNumber': the atomic number of atom
               'AtomicMass': the mass of atom
               'TotalConnections': the degree of the atom (atom.GetDegree())
               'HeavyNeighborCount': the number of heavy (non-hydrogen) neighbor atoms
               'HCount': the number of attached hydrogens (atom.GetNumExplicitHs())
               'FormalCharge': the formal charge of atom
               'DeltaMass': the difference between atomic mass and atomic weight (weighted average of atomic masses)
               'IsTerminalAtom': 1 if the atom has degree 1, otherwise 0
               'SolidAngle': the solid angle of the atom on the molecule surface (> 0: convex, < 0: concave)
               'SolidAngleValue': the absolute solid angle (used only if 'SolidAngle' is not requested)
               'SolidAngleSign': the sign of the solid angle as '+', '-' or '0'
                                 (used only if neither of the two above is requested)
        sa_dict - a dictionary mapping atom indices to their solid angles (None entries count as 0);
                  if sa_dict itself is None every solid angle is treated as unknown
        includeAtoms - atom indices for getting identifiers (all atoms if None)
        radius - ECFP radius, only calculates the identifiers of atoms in the neighborhoods (of radius) of included atoms (includeAtoms)
        hash_type - type for hash the properties, can be 'str' or 'vec'
        idf_power - power for the 'str' hash type (default 64-bit integers)
    Returns a dictionary mapping (heavy-atom index, 0) to an integer representing the atomic properties.
    If hash_type is neither 'str' nor 'vec', a message is printed and the dictionary built so far is returned.
    """
    tbl = GetPeriodicTable()
    idf_dict = {}
    indices = range(mol.GetNumAtoms()) if includeAtoms is None else includeAtoms
    for i in indices:
        index = int(i)
        env = list(
            Chem.FindAtomEnvironmentOfRadiusN(mol, radius, index, useHs=True))
        env_bonds = [mol.GetBondWithIdx(bid) for bid in env]
        env_aids = set([bond.GetBeginAtomIdx() for bond in env_bonds] +
                       [bond.GetEndAtomIdx() for bond in env_bonds])
        # The environment search yields no bonds for e.g. an isolated atom;
        # the queried atom itself must still receive an identifier.
        env_aids.add(index)
        for aid in env_aids:
            if (aid, 0) in idf_dict:
                continue
            atom = mol.GetAtomWithIdx(aid)
            if atom.GetAtomicNum() <= 1:
                # hydrogens never get an identifier
                continue
            properties = _original_identifier_properties(
                atom, aid, prop, sa_dict, tbl, hash_type)

            if hash_type == 'str':
                idf = hash_ecfp(ecfp=','.join([str(p) for p in properties]),
                                power=idf_power)
            elif hash_type == 'vec':
                idf = hash(tuple(properties))
            else:
                print('Wrong hash type!')
                return idf_dict

            idf_dict[(aid, 0)] = idf

    return idf_dict
Esempio n. 4
0
def getIdentifiersRadiusN_all(molinfo,
                              prop=[
                                  'AtomicNumber', 'AtomicMass',
                                  'TotalConnections', 'HCount',
                                  'HeavyNeighborCount', 'FormalCharge',
                                  'DeltaMass', 'IsTerminalAtom', 'SolidAngle',
                                  'SolidAngleValue', 'SolidAngleSign'
                              ],
                              sa_dict=None,
                              includeAtoms=None,
                              radius=2,
                              hash_type='str',
                              idf_power=64):
    """Calculate the identifiers of all molecular fragments (each originated from an atom, of radius N, can be redundant) in a molecule.
    Parameters:
        molinfo - a tuple describing a molecule (coordinates, rdkit.Chem.rdchem.Mol molecule, weights), weights = None for non-weighted alpha shapes;
                  only the molecule (molinfo[1]) is read here
        prop, sa_dict, radius, includeAtoms, hash_type and idf_power - same as in getOriginalIdentifiers
    Returns a dictionary mapping (atom index, layer) to a tuple (identifier, env), where env is the
    sorted list of bond indices of the fragment (an empty list for layer 0).
    If hash_type is neither 'str' nor 'vec' when a fragment is hashed, a message is printed and an empty list is returned.
    """
    mol = molinfo[1]
    nAtoms = mol.GetNumAtoms()
    deadAtoms = [0] * nAtoms

    # get original identifiers (of radius 0) of included atoms and their neighbors (in neighborhood of radius)
    idf_dict = getOriginalIdentifiers(mol=mol,
                                      prop=prop,
                                      sa_dict=sa_dict,
                                      includeAtoms=includeAtoms,
                                      radius=radius,
                                      hash_type=hash_type,
                                      idf_power=idf_power)
    idfs_all = {k: (v, []) for (k, v) in idf_dict.items()}

    if radius == 0:
        return idfs_all

    # get atom orders
    if includeAtoms is not None:
        # put the query atoms in front positions (access first); normalising to
        # a list of ints keeps this working for tuples and numpy arrays
        query_atoms = [int(i) for i in includeAtoms]
        query_set = set(query_atoms)
        ids_fil = set([key[0] for key in idf_dict])
        atomOrder = query_atoms + [i for i in ids_fil if i not in query_set]
    else:
        atomOrder = range(nAtoms)

    # iteratively calculate the identifiers of larger radius
    for layer in range(radius):
        for index in atomOrder:
            if deadAtoms[index]:
                continue
            atom = mol.GetAtomWithIdx(index)
            # hydrogens and isolated atoms never grow a fragment; decide this
            # before paying for the environment search
            if atom.GetAtomicNum() == 1 or atom.GetDegree() == 0:
                deadAtoms[index] = 1
                continue
            env = sorted(
                Chem.FindAtomEnvironmentOfRadiusN(mol,
                                                  layer + 1,
                                                  index,
                                                  useHs=True))
            nbrs = []
            for bond in atom.GetBonds():
                oth_index = bond.GetOtherAtomIdx(index)
                # neighbors without an identifier at this layer are ignored
                if (oth_index, layer) in idfs_all:
                    bt = bond.GetBondTypeAsDouble()
                    nbrs.append((bt, idfs_all[(oth_index, layer)][0]))
            # sorting makes the identifier independent of bond ordering
            nbrs.sort()
            # use [layer, idf, (nbr1_bondtype, nbr1_idf), ..., (nbrN_bondtype, nbrN_idf)] to represent an atomic neighborhood of a specific radius (layer)
            nbrhd = [layer, idfs_all[(index, layer)][0]] + nbrs
            if hash_type == 'str':
                idf = hash_ecfp(ecfp=','.join([str(itm) for itm in nbrhd]),
                                power=idf_power)
            elif hash_type == 'vec':
                idf = hash(tuple(nbrhd))
            else:
                print('Wrong hash type!!!')
                return []

            idfs_all[(index, layer + 1)] = (idf, env)

    return idfs_all
def getECFPstringsRadiusN_avg_ecfp(molinfo,
                                   heavy_atoms=0,
                                   base_prop=['AtomicMass'],
                                   sa_dict={},
                                   indices=None,
                                   degree=2,
                                   parameters={
                                       'weighted': 0,
                                       'alpha': -1,
                                       'alpha_step': 0.1
                                   },
                                   hash_type='str',
                                   idf_power=64):
    """Hash the fragment grown around each atom for every radius from 0 up to degree.

    For each live atom and each radius, the bond environment of that radius is
    found, the property list of the atoms in it is obtained from
    get_atom_proplist, and that list is hashed to one identifier. An atom stops
    being processed once its environment duplicates one already recorded.

    Parameters:
        molinfo - a tuple describing a molecule (coordinates, rdkit.Chem.rdchem.Mol molecule, weights), weights = None for non-weighted alpha shapes
        heavy_atoms - if truthy, hydrogens are skipped and environments are searched without Hs; otherwise all atoms are used
        base_prop - base atomic properties forwarded to get_atom_proplist
               'AtomicMass': the atomic mass of atom
               'TotalConnections': the degree of the atom in the molecule including Hs
               'HeavyNeighborCount': the number of heavy (non-hydrogen) neighbor atoms
               'HCount': the number of attached hydrogens (both implicit and explicit)
               'FormalCharge': the formal charge of atom
               'DeltaMass': the difference between atomic mass and atomic weight (weighted average of atomic masses)
               'SolidAngle': the solid angle of the atom on the molecule surface (> 0: convex, < 0: concave)
               'SolidAngleValue': the absolute value of solid angle of the atom
               'SolidAngleSign': the sign of solid angle of the atom (-1, 0, 1)
        sa_dict - a dictionary mapping atom indices to their solid angles; if it is empty and a
                  'SolidAngle*' property is requested, the solid angles are computed with concave_hull_3D
        indices - indices for queried atoms (all atoms if None)
        degree - ecfp radius
        parameters - parameters for the concave_hull_3D class; 'alpha' and 'alpha_step' are read here
        hash_type - type for hashing the fragment, either 'str' (using hash_ecfp function) or 'vec' (using the default hash function)
        idf_power - power for the 'str' hash type (default 64-bit integers)
    Returns a dictionary mapping (atom index, 'r<radius>') to the hashed identifier of that fragment.
    If hash_type is neither 'str' nor 'vec', a message is printed and the dictionary built so far is returned.
    """
    mol = molinfo[1]
    n_atoms = mol.GetNumAtoms()
    identifiers = {}
    seen_envs = []  # environments already recorded in earlier rounds
    dead = [0] * n_atoms

    # Solid angles are only computed when requested and not supplied.
    solid_angles = sa_dict
    wants_solid_angle = any('SolidAngle' in name for name in base_prop)
    if wants_solid_angle and len(sa_dict) == 0:
        hull = concave_hull_3D(points=molinfo[0],
                               weights=molinfo[2],
                               alpha=parameters['alpha'],
                               alpha_step=parameters['alpha_step'])
        hull.construct_conchull()
        solid_angles = hull.compute_solid_angles()

    atom_ids = indices if indices is not None else range(n_atoms)
    for dg in range(degree + 1):
        radius_key = 'r' + str(dg)
        round_candidates = []
        for raw_idx in atom_ids:
            i = int(raw_idx)
            if dead[i] != 0:
                continue
            atom = mol.GetAtomWithIdx(i)
            excluded_hydrogen = heavy_atoms and atom.GetAtomicNum() == 1
            isolated = atom.GetDegree() == 0
            if excluded_hydrogen or isolated:
                dead[i] = 1
                continue

            env = sorted(
                Chem.FindAtomEnvironmentOfRadiusN(mol,
                                                  dg,
                                                  i,
                                                  useHs=not heavy_atoms))
            env_bonds = [mol.GetBondWithIdx(bid) for bid in env]
            fragment_atoms = set(
                [bond.GetBeginAtomIdx() for bond in env_bonds] +
                [bond.GetEndAtomIdx() for bond in env_bonds])
            if not fragment_atoms:
                # no bonds in the environment: the fragment is the atom itself
                fragment_atoms = set([i])

            # The fragment SMILES is not part of the hashed data (that step is
            # disabled in this function).
            fragment_props = get_atom_proplist(mol=mol,
                                               sa_dict=solid_angles,
                                               aids=fragment_atoms,
                                               base_prop=base_prop,
                                               hash_type=hash_type)
            if hash_type == 'str':
                idf = hash_ecfp(ecfp=','.join(fragment_props),
                                power=idf_power)
            elif hash_type == 'vec':
                idf = hash(tuple(fragment_props))
            else:
                print('Wrong hash type!!!')
                return identifiers

            if dg == 0:
                identifiers[(i, radius_key)] = idf
                continue

            round_candidates.append((env, idf, i))
            # same environment as one from a previous round: stop this atom
            if env in seen_envs:
                dead[i] = 1

        if dg == 0:
            continue

        # Within a round the first candidate (in sorted order) for each new
        # environment is recorded; later duplicates are marked dead.
        for env, idf, atom_idx in sorted(round_candidates):
            if env in seen_envs:
                dead[atom_idx] = 1
            else:
                seen_envs.append(env)
                identifiers[(atom_idx, radius_key)] = idf
    return identifiers