Exemple #1
0
    def test9ToNumpy(self):
        """Check that ConvertToNumpyArray copies every element of a vector.

        An explicit bit vector and the four sparse int vector types are each
        given the same five set positions, converted into a 32-element numpy
        array, and compared against that array element by element.
        """
        import numpy
        positions = (0, 1, 17, 23, 31)

        for typ in (DataStructs.ExplicitBitVect, ):
            bv = typ(32)
            for pos in positions:
                bv.SetBit(pos)
            arr = numpy.zeros((32, ), 'i')
            DataStructs.ConvertToNumpyArray(bv, arr)
            for idx in range(bv.GetNumBits()):
                self.assertEqual(bv[idx], arr[idx])

        sparse_types = (
            DataStructs.IntSparseIntVect,
            DataStructs.LongSparseIntVect,
            DataStructs.UIntSparseIntVect,
            DataStructs.ULongSparseIntVect,
        )
        for typ in sparse_types:
            iv = typ(32)
            for pos in positions:
                iv[pos] = 1
            arr = numpy.zeros((32, ), 'i')
            DataStructs.ConvertToNumpyArray(iv, arr)
            for idx in range(iv.GetLength()):
                self.assertEqual(iv[idx], arr[idx])
Exemple #2
0
def rf_validate(enzymes, success, candidate):
    """Train a random forest on Morgan fingerprints and score one candidate.

    Args:
        enzymes: iterable of SMILES strings used as the training molecules.
        success: class labels for ``enzymes``, in the same order.
        candidate: SMILES string of the molecule to score.

    Returns:
        The second column of ``predict_proba`` for the candidate, i.e. the
        predicted probability of the class stored at ``rf.classes_[1]``.
    """
    # generate fingerprints: Morgan fingerprint with radius 2
    mols = [Chem.MolFromSmiles(smi) for smi in enzymes]
    fps = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mols]

    # convert the RDKit explicit vectors into numpy arrays; the one-element
    # array is only a placeholder that the conversion call is expected to
    # resize to the fingerprint length
    np_fps = []
    for fp in fps:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)

    # random forest classifier with 100 trees and a fixed seed
    rf = RandomForestClassifier(n_estimators=100, random_state=1123)
    rf.fit(np_fps, success)

    # fingerprint of the molecule to score
    m6 = Chem.MolFromSmiles(candidate)
    fp = numpy.zeros((1,))
    DataStructs.ConvertToNumpyArray(
        AllChem.GetMorganFingerprintAsBitVect(m6, 2), fp)

    # predict_proba expects a 2-D (n_samples, n_features) input, so the single
    # fingerprint is presented as a one-row matrix.
    return rf.predict_proba(fp.reshape(1, -1))[0][1]
Exemple #3
0
def create_rxn_Morgan2FP(rsmi,
                         psmi,
                         rxnfpsize=2048,
                         pfpsize=2048,
                         useFeatures=False,
                         calculate_rfp=True):
    """Build Morgan (r=2) boolean fingerprints for a reactant and a product.

    Modified from Schneider's code (2014).

    Args:
        rsmi: reactant SMILES string.
        psmi: product SMILES string.
        rxnfpsize: number of bits in the reactant fingerprint.
        pfpsize: number of bits in the product fingerprint.
        useFeatures: accepted but not used; both fingerprints are computed
            with ``useFeatures=False``.
            NOTE(review): confirm whether this should be forwarded.
        calculate_rfp: when exactly ``True`` the reactant fingerprint is
            computed, otherwise it is reported as ``None``.

    Returns:
        ``[pfp, rfp]`` (product first) on success, or ``None`` when either
        molecule cannot be parsed or fingerprinted.
    """
    if calculate_rfp is True:
        rsmi = rsmi.encode('utf-8')
        try:
            mol = Chem.MolFromSmiles(rsmi)
        except Exception:
            return

        try:
            fp_bit = AllChem.GetMorganFingerprintAsBitVect(mol=mol,
                                                           radius=2,
                                                           nBits=rxnfpsize,
                                                           useFeatures=False,
                                                           useChirality=True)
            # The builtin ``bool`` is the dtype ``np.bool`` used to alias; the
            # alias is gone from current NumPy and made this branch always fail.
            fp = np.empty(rxnfpsize, dtype=bool)
            DataStructs.ConvertToNumpyArray(fp_bit, fp)

        except Exception as e:
            print("Cannot build reactant fp due to {}".format(e))

            return

        rfp = fp
    else:
        rfp = None

    psmi = psmi.encode('utf-8')
    try:
        mol = Chem.MolFromSmiles(psmi)
    except Exception:
        print(psmi)
        return

    try:
        fp_bit = AllChem.GetMorganFingerprintAsBitVect(mol=mol,
                                                       radius=2,
                                                       nBits=pfpsize,
                                                       useFeatures=False,
                                                       useChirality=True)
        fp = np.empty(pfpsize, dtype=bool)
        DataStructs.ConvertToNumpyArray(fp_bit, fp)

    except Exception as e:
        print("Cannot build product fp due to {}".format(e))
        return

    pfp = fp

    return [pfp, rfp]
def mol_train_test(dataset,
                   labels,
                   test_size=0.1,
                   random_state=2019,
                   nbits=1024):
    """Drop unparsable SMILES, split train/test and fingerprint both parts.

    Args:
        dataset: table with a ``'SMILES'`` column.
        labels: labels aligned with ``dataset``.
        test_size: forwarded to ``train_test_split``.
        random_state: forwarded to ``train_test_split``.
        nbits: length of the Morgan (radius 2) bit fingerprints.

    Returns:
        ``(dataset, labels, all_mols, y_train, y_test, train_fps_array,
        test_np_fps_array)`` where ``dataset`` and ``labels`` have the
        unparsable rows removed and their index reset, and the two last items
        are lists of integer numpy arrays, one per molecule.
    """

    def _to_arrays(mols):
        # One integer numpy vector per molecule; the one-element array is a
        # placeholder that the conversion call is expected to resize.
        arrays = []
        for mol in mols:
            fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
                mol, radius=2, nBits=nbits)
            arr = np.zeros((1, ), dtype=int)
            DataStructs.ConvertToNumpyArray(fp, arr)
            arrays.append(arr)
        return arrays

    # Parse every SMILES once; MolFromSmiles yields None for the bad ones.
    parsed = [
        Chem.MolFromSmiles(SMILES_string)
        for SMILES_string in dataset['SMILES']
    ]
    drop_index = [i for i, mol in enumerate(parsed) if mol is None]

    # Remove the bad rows from labels and dataset.
    # NOTE(review): positions are used as index labels here, which assumes
    # both objects carry a default 0..n-1 index - confirm against callers.
    if drop_index:
        labels = labels.drop(drop_index).reset_index(drop=True)
        dataset = dataset.drop(drop_index).reset_index(drop=True)

    # Reuse the molecules already parsed instead of parsing the SMILES again.
    all_mols = [mol for mol in parsed if mol is not None]

    train_mols, test_mols, y_train, y_test = train_test_split(
        all_mols, labels, test_size=test_size, random_state=random_state)

    train_fps_array = _to_arrays(train_mols)
    test_np_fps_array = _to_arrays(test_mols)

    return dataset, labels, all_mols, y_train, y_test, train_fps_array, test_np_fps_array
Exemple #5
0
def SMILES_2_ECFP(smiles, radius=3, bit_len=4096, index=None):
    """Turn SMILES strings into Morgan bit fingerprints of their scaffolds.

    Each molecule is first reduced to its Murcko scaffold and the fingerprint
    is computed on that scaffold.

    Parameters
    ----------
    smiles: list of SMILES strings to transform
    radius: Morgan fingerprint radius
    bit_len: number of bits per fingerprint
    index: optional row labels for the result; ``smiles`` is used when None

    Returns
    -------
    A pandas DataFrame with one row per SMILES and ``bit_len`` columns. Rows
    whose SMILES could not be processed are all zeros, and the offending
    SMILES is printed.
    """
    fps = np.zeros((len(smiles), bit_len))
    for i, smile in enumerate(smiles):
        mol = Chem.MolFromSmiles(smile)
        arr = np.zeros((1, ))
        try:
            mol = MurckoScaffold.GetScaffoldForMol(mol)
            fp = AllChem.GetMorganFingerprintAsBitVect(mol,
                                                       radius,
                                                       nBits=bit_len)
            DataStructs.ConvertToNumpyArray(fp, arr)
            fps[i, :] = arr
        except Exception:
            # Best effort: report the SMILES and keep a zero row. Catching
            # Exception (not everything) lets Ctrl-C still stop the loop.
            print(smile)
            fps[i, :] = 0
    return pd.DataFrame(fps, index=(smiles if index is None else index))
Exemple #6
0
def calculate_ecfp4(mol, nBits=1024):
    """Return the radius-2 Morgan bit fingerprint of ``mol`` as a numpy array.

    ``mol`` is converted with ``to_rdkit_Mol`` and sanitized before the
    fingerprint of ``nBits`` bits is computed.
    """
    rdkit_molecule = to_rdkit_Mol(mol)
    Chem.rdmolops.SanitizeMol(rdkit_molecule)
    bit_vect = Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(
        rdkit_molecule, 2, nBits=nBits)
    # Empty placeholder handed to the conversion call, which fills it.
    out = np.zeros((0, ))
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out
Exemple #7
0
 def __call__(self, smiles, radius=3, bit_len=4096, scaffold=0):
     """Compute Morgan bit fingerprints for a list of SMILES strings.

     Args:
         smiles: list of SMILES strings.
         radius: Morgan fingerprint radius.
         bit_len: number of bits per fingerprint.
         scaffold: 1 fingerprints the Murcko scaffold, 2 the generic
             scaffold (``MakeScaffoldGeneric``); any other value uses the
             molecule as parsed.

     Returns:
         A pandas DataFrame with one row per SMILES and ``bit_len`` columns.
         A row is all zeros when its SMILES failed; the exception is appended
         to ``self.builder.errors``.
     """
     fps = np.zeros((len(smiles), bit_len))
     for i, smile in enumerate(smiles):
         mol = Chem.MolFromSmiles(smile)
         arr = np.zeros((1, ))
         failure = f'Failed to calculate Morgan fingerprint (creating RDKit instance from smiles failed: {smile})'
         try:
             # Reject an unparsable SMILES before the scaffold helpers see
             # it, so the recorded error is always the descriptive one.
             if not mol:
                 raise Exception(failure)
             if scaffold == 1:
                 mol = MurckoScaffold.GetScaffoldForMol(mol)
             elif scaffold == 2:
                 mol = MurckoScaffold.MakeScaffoldGeneric(mol)
             if not mol:
                 raise Exception(failure)
             fp = AllChem.GetMorganFingerprintAsBitVect(mol,
                                                        radius,
                                                        nBits=bit_len)
             DataStructs.ConvertToNumpyArray(fp, arr)
             fps[i, :] = arr
         except Exception as exp:
             # TODO: use a more specific exception related to descriptor errors
             self.builder.errors.append(exp)
             fps[i, :] = [0] * bit_len
     return pd.DataFrame(fps)
Exemple #8
0
def generate_fingerprints(smile):
    """Return the 1024-bit, radius-3 Morgan fingerprint of a SMILES string.

    The result is an ``int8`` numpy array filled by ``ConvertToNumpyArray``.
    """
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(MolFromSmiles(smile),
                                                     radius=3,
                                                     nBits=1024)
    # Empty placeholder; the conversion call fills it in place.
    out = np.zeros((0, ), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out
def main():
    """Predict ligand / not_ligand for every row of a tab-separated file.

    Reads the file named by ``sys.argv[1]`` (columns ``SMILES`` and
    ``Chemical_Name``), scores radius-2 Morgan fingerprints with the Keras
    model ``pparg_ligand_model.h5`` and writes one line per compound to
    ``<input>_predictions.txt``.
    """
    # The context manager closes the output file even if a later step raises.
    with open(sys.argv[1] + "_predictions.txt", 'w') as outfile:
        outfile.write("Chemical Name\tPrediction\n")
        model = load_model("pparg_ligand_model.h5")
        dataframe = pandas.read_csv(sys.argv[1], sep="\t")

        fps = [
            AllChem.GetMorganFingerprintAsBitVect(
                Chem.MolFromSmiles(row['SMILES']), 2)
            for _, row in dataframe.iterrows()
        ]

        np_fps = []
        for fp in fps:
            arr = numpy.zeros((1, ))
            DataStructs.ConvertToNumpyArray(fp, arr)
            np_fps.append(arr)

        np_fps_array = numpy.array(np_fps)
        predictions = model.predict(np_fps_array, batch_size=5)
        for i, prediction in enumerate(predictions):
            # NOTE(review): scores below 0.5 are labelled "ligand" - confirm
            # this matches the label encoding the model was trained with.
            if prediction < 0.50:
                y_prediction = "ligand"
            else:
                y_prediction = "not_ligand"
            outfile.write(dataframe['Chemical_Name'][i] + "\t" +
                          y_prediction + "\n")
def morgan_fingp(fname):
    """Append the on-bit indices of each molecule in ``fname`` to a file.

    ``fname`` is read as a text file whose first line is skipped and whose
    remaining lines hold a SMILES and an identifier separated by whitespace.
    For each line the 1024-bit, radius-2 Morgan fingerprint is computed and
    ``<identifier>,<bit>,<bit>,...`` is appended to the output file. Lines
    that cannot be fingerprinted are printed and skipped.

    NOTE: the output directory is built from the names ``fp`` and ``fn``,
    which are not defined in this function and must exist at module level.
    """
    nbits = 1024
    radius = 2
    fsplit = fname.split('/')[-1]
    # Both files are managed by ``with`` so the output handle is always
    # closed (and flushed) when the function ends.
    with open(fp + '/' + fn + '/' + fsplit, 'a') as ref2, \
            open(fname, 'r') as ref:
        ref.readline()  # skip the header line
        for line in ref:
            smile, zin_id = line.rstrip().split()
            arg = np.zeros((1, ))
            try:
                DataStructs.ConvertToNumpyArray(
                    AllChem.GetMorganFingerprintAsBitVect(
                        Chem.MolFromSmiles(smile), radius, nBits=nbits), arg)
                on_bits = [str(elem) for elem in np.where(arg == 1)[0]]
                ref2.write(','.join([zin_id] + on_bits))
                ref2.write('\n')
            except Exception:
                # Best effort: report the line and continue with the next.
                print(line)
Exemple #11
0
 def fp_matrix(self, fp):
     """Convert an iterable of RDKit fingerprints into a list of numpy arrays."""

     def _as_array(vect):
         # One-element placeholder handed to the conversion call.
         out = np.zeros((1, ))
         DataStructs.ConvertToNumpyArray(vect, out)
         return out

     return [_as_array(f) for f in fp]
Exemple #12
0
def computeFP(x):
    """Wrap the 2048-bit, radius-2 Morgan fingerprint of ``x`` in an ``FP``."""
    bit_vect = Chem.GetMorganFingerprintAsBitVect(x, 2, nBits=2048)
    # int32 buffer with one slot per fingerprint bit
    buf = numpy.zeros(len(bit_vect), numpy.int32)
    DataStructs.ConvertToNumpyArray(bit_vect, buf)
    # hand the array to the dummy container
    return FP(buf)
def compute_morgan_fingerprints(smiles, fingerprint_length, fingerprint_radius):
    """Return the Morgan fingerprint of one SMILES string as a numpy vector.

    Adapted from: <https://github.com/google-research/google-research/blob/
    dfac4178ccf521e8d6eae45f7b0a33a6a5b691ee/mol_dqn/chemgraph/dqn/deep_q_networks.py#L750>

    Args:
      smiles: String or None. The SMILES string of the molecule.
      fingerprint_length (int): Bit-length of fingerprint
      fingerprint_radius (int): Radius used to compute fingerprint
    Returns:
      np.array of ``fingerprint_length`` entries. All zeros when ``smiles``
      is None or cannot be parsed into a molecule.
    """
    # A missing SMILES and an unparsable SMILES share the same zero result.
    molecule = Chem.MolFromSmiles(smiles) if smiles is not None else None
    if molecule is None:
        return np.zeros((fingerprint_length,))

    bit_vect = AllChem.GetMorganFingerprintAsBitVect(
        molecule, fingerprint_radius, fingerprint_length)

    # Original author's timing note: ConvertToNumpyArray takes ~ 0.19 ms,
    # while np.asarray takes ~ 4.69 ms.
    out = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out
Exemple #14
0
def Xyfromdf(df, return_y):
    """Build the design matrix (and optionally labels) from a structure table.

    Every entry of the "Structure" column is parsed as SMILES and encoded as
    a radius-2 Morgan bit fingerprint.

    Args:
        df: a Pandas DataFrame with columns "Compound ID", "Structure", "IC50"
        return_y: whether or not to return labels y.

    Returns:
        ``(X, y)`` when ``return_y`` is true, where X is a DataFrame of
        fingerprints and y is the natural log of the "IC50" column;
        otherwise just X.
    """
    vectors = []
    for smile in df["Structure"]:
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(
            Chem.MolFromSmiles(smile), 2)
        # placeholder array filled by the conversion call
        vec = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(bit_vect, vec)
        vectors.append(vec)
    X = pd.DataFrame(np.array(vectors))

    if not return_y:
        return X

    y = np.log(df["IC50"])
    assert y.isna().sum() == 0
    return X, y
Exemple #15
0
    def morgan(self, radius, size=None):
        """
        Compute circular fingerprints for ``self.smiles_converted`` and return
        them as a DataFrame with one row per molecule.

        : radius (int): radius = 2 ~ ECFP4, radius = 3 ~ ECFP6
        : size (int, optional): number of bits to generate; 2048 is used
        when None is given

        """
        n_bits = 2048 if size is None else size

        rows = []
        for m in self.smiles_converted:
            bit_vect = AllChem.GetMorganFingerprintAsBitVect(m, radius, n_bits)
            # placeholder array filled by the conversion call
            vec = np.zeros((1, ))
            DataStructs.ConvertToNumpyArray(bit_vect, vec)
            rows.append(vec)

        return pd.DataFrame(rows)
Exemple #16
0
    def get_morgan_fp(self,
                      mol: Mol,
                      radius: int = 2,
                      nBits: int = 1024,
                      invariants: List[AtomPairsParameters] = [],
                      fromAtoms: List[AtomPairsParameters] = [],
                      useChirality: bool = False,
                      useBondTypes: bool = True,
                      useFeatures: bool = False,
                      bitInfo: AtomPairsParameters = None,
                      includeRedundantEnvironments: bool = False) -> array:
        """
        Count, for every bit position, the environments that set that bit.

        Parameters:
            Same parameters as: https://www.rdkit.org/docs/source/rdkit.Chem.rdMolDescriptors.html#rdkit.Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect

            bitInfo: optional dict; when given it is updated with the bit
                information of this molecule.
            includeRedundantEnvironments: accepted but not forwarded to RDKit.
                NOTE(review): confirm whether it should be.

        Returns:
            fingerprints_vector (array): numpy array of length ``nBits`` whose
            entry ``i`` is the number of bitInfo records for bit ``i``
            (0 when the bit is not set).
        """
        # Collect into a fresh dict on every call. The previous ``{}`` default
        # was one dict shared by all calls, so entries left by an earlier
        # molecule could be counted for a later one.
        found = {}
        AllChem.GetMorganFingerprintAsBitVect(mol=mol,
                                              radius=radius,
                                              nBits=nBits,
                                              invariants=invariants,
                                              fromAtoms=fromAtoms,
                                              useChirality=useChirality,
                                              useBondTypes=useBondTypes,
                                              useFeatures=useFeatures,
                                              bitInfo=found)
        if bitInfo is not None:
            # Callers that pass their own dict still get it populated.
            bitInfo.update(found)
        return np.array([len(found.get(bit, ())) for bit in range(nBits)])
Exemple #17
0
def convert_bitvec_to_array(bitvec: list) -> np.ndarray:
    """Return the contents of a bit vector fingerprint as a numpy array."""
    # One-element placeholder handed to the conversion call.
    out = np.zeros((1, ))
    DataStructs.ConvertToNumpyArray(bitvec, out)
    return out
Exemple #18
0
 def fingerprints_from_mol(cls, mol):  # use ECFP4
     """Return the 2048-bit, radius-2 Morgan fingerprint of ``mol`` as one row.

     The converted vector is reshaped to a 2-D array with a single row.
     """
     bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
     # placeholder array filled by the conversion call
     row = np.zeros((1, ))
     DataStructs.ConvertToNumpyArray(bit_vect, row)
     return row.reshape(1, -1)
def featurizer(folder, filename):
    """Fingerprint every compound of a CSV file and save the result as CSV.

    The input ``folder + filename`` is read with columns taken by position:
    0 = alias, 1 = SMILES, 2 = IC50. Each compound is encoded as a 4096-bit,
    radius-3 Morgan fingerprint. Rows that fail are reported on stdout and
    left out. The table (fingerprint bits indexed by alias, with an ``ic50``
    column first) is written to ``folder + 'fingerprinters_4096.csv'``.
    """
    folderpath = folder
    data = pd.read_csv(folderpath + filename, sep=',')

    alias = []
    ic50 = []
    fps = []

    # Convert SMILES to RDKit objects. Cells are read by position with
    # ``.iloc``; ``.ix`` is no longer available in pandas.
    for i in range(data.shape[0]):
        try:
            arr = np.zeros((1, ))
            compound = Chem.MolFromSmiles(data.iloc[i, 1])
            fp = AllChem.GetMorganFingerprintAsBitVect(compound, 3, 4096)
            DataStructs.ConvertToNumpyArray(fp, arr)
            fps.append(arr)
            alias.append(data.iloc[i, 0])
            ic50.append(data.iloc[i, 2])
        except Exception:
            # Best effort: report the failing row and move on.
            print(i)
            print(data.iloc[i, 0])

    # create dataframe to store the fingerprints and write to csv file
    df = pd.DataFrame(fps, index=alias)
    df.insert(0, "ic50", ic50)
    df.to_csv(folderpath + 'fingerprinters_4096.csv')
Exemple #20
0
def getNumpy(inlist):
    """Convert element ``[1]`` of every entry of ``inlist`` to a numpy array.

    Arrays are created with dtype ``tree.DTYPE`` and returned as a list in
    input order.
    """

    def _as_array(vect):
        buf = numpy.zeros((3, ), tree.DTYPE)
        DataStructs.ConvertToNumpyArray(vect, buf)
        return buf

    return [_as_array(entry[1]) for entry in inlist]
def get_circular_fp(smile, radius=6, fp_len=128):
    """Return the Morgan bit fingerprint of a SMILES string as a numpy array.

    ``radius`` and ``fp_len`` are passed positionally to
    ``GetMorganFingerprintAsBitVect`` after the molecule.
    """
    bit_vect = Chem.AllChem.GetMorganFingerprintAsBitVect(
        Chem.MolFromSmiles(smile), radius, fp_len)
    # placeholder array filled by the conversion call
    out = np.zeros((1, ))
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out
Exemple #22
0
    def smile2fp(smile: str) -> Any:
        """
        Calculates one Morgan (circular, ECFP-like) fingerprint from a SMILE
        :param smile: Input SMILE
        :return: Boolean numpy array of bits if conversion is successful,
        None otherwise

        The number of bits comes from ``fp_size``, which is not defined in
        this function and must be available from the enclosing scope.
        """
        # The builtin ``bool`` is the dtype ``np.bool`` used to alias; the
        # alias no longer exists in current NumPy releases.
        npa = np.zeros((0,), dtype=bool)
        try:
            DataStructs.ConvertToNumpyArray(
                AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smile), 2, nBits=fp_size),
                npa)
            return npa
        except Exception:
            # Any failure to parse or fingerprint the SMILE is reported as None.
            return None
Exemple #23
0
def main(smiles, output, top=10, model='10uM'):
    """Predict the most probable targets of one molecule and save them as CSV.

    Loads the '1uM' model when ``model == '1uM'`` and the '10uM' model
    otherwise, scores the 2048-bit, radius-2 Morgan fingerprint of ``smiles``,
    keeps the ``top`` classes by probability, looks each one up with
    ``fetch_WS`` and writes the merged table to ``output``.
    """
    model_path = ('Data/models_23/1uM/mNB_1uM_all.pkl' if model == '1uM'
                  else 'Data/models_23/10uM/mNB_10uM_all.pkl')
    morgan_nb = joblib.load(model_path)
    classes = list(morgan_nb.targets)

    bit_vect = AllChem.GetMorganFingerprintAsBitVect(
        Chem.MolFromSmiles(smiles), 2, nBits=2048)
    features = np.zeros(len(bit_vect), np.int32)
    DataStructs.ConvertToNumpyArray(bit_vect, features)

    # single sample -> one-row matrix for predict_proba
    probas = list(morgan_nb.predict_proba(features.reshape(1, -1))[0])
    predictions = pd.DataFrame(list(zip(classes, probas)),
                               columns=['id', 'probas'])
    top_pred = predictions.sort_values(by='probas',
                                       ascending=False).head(top)

    plist = [fetch_WS(target_id) for target_id in top_pred['id']]
    target_info = pd.DataFrame(plist, columns=['id', 'name', 'organism'])

    pd.merge(top_pred, target_info).to_csv(output)
Exemple #24
0
def rdkit_numpy_convert(fp):
    """Convert an iterable of RDKit fingerprints into one numpy array.

    Each fingerprint becomes a numpy vector; the vectors are combined with
    ``np.asarray``.
    """

    def _as_array(vect):
        # One-element placeholder handed to the conversion call.
        out = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(vect, out)
        return out

    return np.asarray([_as_array(f) for f in fp])
Exemple #25
0
def morgan_fingerprint(
    df: pd.DataFrame,
    mols_col: str,
    radius: int = 3,
    nbits: int = 2048,
    kind: str = "counts",
):
    """
    Convert a column of RDKIT Mol objects into Morgan Fingerprints.

    The returned dataframe holds only the fingerprints, none of the original
    columns. This is deliberate: Morgan fingerprints are usually
    high-dimensional features.

    Method chaining usage:

    .. code-block:: python

        df = pd.DataFrame(...)
        morgans = df.morgan_fingerprint(mols_col='mols', radius=3, nbits=2048)

    The result carries the index of ``df``, so the fingerprints can be
    joined back onto the original dataframe:

    .. code-block:: python

        joined = df.join(morgans)

    :param df: A pandas DataFrame.
    :param mols_col: The name of the column that has the RDKIT mol objects
    :param radius: Radius of Morgan fingerprints. Defaults to 3.
    :param nbits: The length of the fingerprints. Defaults to 2048.
    :param kind: Whether to return counts or bits. Defaults to counts.
    :returns: A pandas DataFrame
    :raises ValueError: if ``kind`` is neither ``"counts"`` nor ``"bits"``.
    """
    acceptable_kinds = ["counts", "bits"]
    if kind not in acceptable_kinds:
        raise ValueError(f"`kind` must be one of {acceptable_kinds}")

    # "bits" uses the bit-vector fingerprint, "counts" the hashed one.
    if kind == "bits":
        make_fp = GetMorganFingerprintAsBitVect
    else:
        make_fp = GetHashedMorganFingerprint

    rows = []
    for mol in df[mols_col]:
        vec = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(make_fp(mol, radius, nbits), vec)
        rows.append(vec)

    fpdf = pd.DataFrame(np.vstack(rows))
    fpdf.index = df.index
    return fpdf
Exemple #26
0
def fp_to_pandas(fp, drug_names):
    """Return the fingerprints in ``fp`` as a DataFrame indexed by ``drug_names``."""
    rows = []
    for vect in fp:
        # placeholder array filled by the conversion call
        row = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(vect, row)
        rows.append(row)
    return pd.DataFrame(rows, index=drug_names)
def fingerprintsToNPArr(fps):
    """Convert an iterable of RDKit fingerprints into a list of numpy arrays."""

    def _as_array(vect):
        # One-element placeholder handed to the conversion call.
        out = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(vect, out)
        return out

    return [_as_array(fp) for fp in fps]
    def get_numpy_fingerprint_from_smiles(smiles):
        """Return the radius-3 Morgan fingerprint of ``smiles`` as a NumPy vector."""
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(
            Chem.MolFromSmiles(smiles), 3)
        # Uninitialised buffer with one slot per bit; the conversion fills it.
        out = np.empty(bit_vect.GetNumBits())
        DataStructs.ConvertToNumpyArray(bit_vect, out)
        return out
 def _maccs_keys(self, molecules: List, parameters: {}):
     """Return the MACCS keys of every molecule as int32 numpy arrays.

     ``parameters`` is accepted but not used here.
     """

     def _as_array(keys):
         out = np.zeros((1, ), dtype=np.int32)
         DataStructs.ConvertToNumpyArray(keys, out)
         return out

     maccs = [MACCSkeys.GenMACCSKeys(mol) for mol in molecules]
     return [_as_array(keys) for keys in maccs]
Exemple #30
0
 def bitvect_as_np_array(self):
     """Return ``self.bitvect`` converted to a numpy array.

     The array is filled by ``ConvertToNumpyArray`` from a one-element
     placeholder.
     NOTE(review): the earlier docstring described the result as 1 x 2048;
     the length presumably follows the bit vector - confirm.
     """
     import numpy as np
     out = np.zeros((1, ))
     DataStructs.ConvertToNumpyArray(self.bitvect, out)
     return out