Beispiel #1
0
def fp_maccs_std_mp(mol, mol_can, i):
    """Compute MACCS keys for a molecule and for its canonical form.

    :param mol: molecule passed to ``MACCSkeys.GenMACCSKeys``
    :param mol_can: canonical counterpart of ``mol``; may be the very same
        object, in which case the fingerprint is computed only once
    :param i: opaque tag returned unchanged as the first tuple element
        (presumably an index used by the caller to match results to inputs)
    :return: tuple ``(i, fingerprint of mol, fingerprint of mol_can)``
    """
    fingerprint = MACCSkeys.GenMACCSKeys(mol)
    # Identity check: when both arguments are one object, share the result.
    canonical = (fingerprint if mol is mol_can
                 else MACCSkeys.GenMACCSKeys(mol_can))
    return (i, fingerprint, canonical)
Beispiel #2
0
def fp_maccs_std(mols):
    """Attach MACCS fingerprints to every record of ``mols``, in place.

    :param mols: mapping whose values are records with the keys ``"mol"`` and
        ``"mol_can"``; each record receives the new keys ``"fp"`` and
        ``"fp_can"``
    :return: None (the records are modified in place)
    """
    for key in mols:
        record = mols[key]
        molecule = record["mol"]
        fingerprint = MACCSkeys.GenMACCSKeys(molecule)
        record["fp"] = fingerprint
        canonical = record["mol_can"]
        if molecule is canonical:
            # Same object stored under both keys: reuse the fingerprint.
            record["fp_can"] = fingerprint
        else:
            record["fp_can"] = MACCSkeys.GenMACCSKeys(canonical)
Beispiel #3
0
def label_switching_encoder(key_smiles, bits, df, nmol_df):
    '''
    Encode a sequence of ASCII codes as a list of "message" molecules.

    The key molecule seeds numpy's random generator; for every position of
    the message a position-specific permutation of the key-to-reference
    similarities defines a mapping from codes to cluster labels, and one
    molecule is drawn at random from the selected cluster.

    :param key_smiles: SMILES string of the key molecule
    :param bits: a list of ASCII codes (anything accepted by ``int``); codes
        of 128 or more are replaced by 42 (``*``)
    :param df: frame with a ``clusters`` column and a ``smiles`` column from
        which the message molecules are picked
    :param nmol_df: frame whose ``SMILES`` column supplies the 128 reference
        molecules
    :return: list with one SMILES per entry of ``bits``
    '''
    # molecular key
    key_mol = Chem.MolFromSmiles(key_smiles)
    key_fp = MACCSkeys.GenMACCSKeys(key_mol)
    # build root_seed and rotor_seed based on MW and number of atoms of key_mol
    root_seed = int(Chem.Descriptors.ExactMolWt(key_mol))
    rotor_seed = key_mol.GetNumAtoms()

    # Pick the 128 reference molecules (reproducible through root_seed)
    np.random.seed(root_seed)
    ref_smiles = np.random.choice(nmol_df.SMILES, size=128, replace=False)
    # similarity of every reference molecule to the key molecule
    dist = [
        DataStructs.FingerprintSimilarity(
            key_fp, MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smi)))
        for smi in ref_smiles
    ]

    # labels 0..127
    orig_label = list(range(128))
    message_mol_list = []

    for index, bit in enumerate(bits):
        # Position-dependent seed: every character gets its own permutation.
        np.random.seed(root_seed + index * rotor_seed)
        step_dist = np.random.choice(dist, size=len(dist), replace=False)

        # rank of every permuted similarity (stable sort keeps tie order)
        dict_rank = [0] * len(step_dist)
        ordered = sorted(range(len(step_dist)), key=lambda y: step_dist[y])
        for rank, position in enumerate(ordered):
            dict_rank[position] = rank
        swaper_dict = dict(zip(dict_rank, orig_label))

        # codes above 127 cannot be represented: use '*' (42) as replacement
        code = int(bit)
        if code >= 128:
            code = 42
        cluster_smiles = df[df.clusters == swaper_dict.get(code)]['smiles']
        # random.choice indexes with a position; a filtered Series would
        # interpret that position as an index *label*, so draw from a list.
        message_mol_list.append(random.choice(cluster_smiles.tolist()))
    return message_mol_list
Beispiel #4
0
def getXNN(trainSmilesList, train, predEx, smilesAttrName, nameAttr, X,
           simType):
    """Summarise the similarity of ``predEx`` to its X most similar training
    examples.

    :param trainSmilesList: training molecules, in the same order as
        ``train``. NOTE(review): the elements are handed straight to the
        RDKit fingerprint functions, so they are presumably Mol objects
        despite the name -- confirm against the caller.
    :param train: iterable of training examples; ``ex[nameAttr].value`` is
        used as the example's name
    :param predEx: example to compare; ``predEx[smilesAttrName].value`` must
        be its SMILES string
    :param smilesAttrName: key of the SMILES attribute in ``predEx``
    :param nameAttr: key of the name attribute in the training examples
    :param X: number of nearest neighbours to keep
    :param simType: ``"Topological"``, ``"Morgan"`` or ``"MACCS"``
    :return: tuple ``(medSim, stdSim, minSim, maxSim, entropy,
        entropyClosest)``; the first four are statistics of the X highest
        similarities, the last two are ``getRespVar`` evaluated on all X and
        on the closest ``X // 2`` neighbours; everything is rounded to 3
        decimals
    :raises ValueError: if ``simType`` is not one of the supported values
    """
    # Select the fingerprint builder and the similarity metric once instead
    # of re-dispatching on simType for every training example.
    if simType == "Topological":
        makeFp = FingerprintMols.FingerprintMol
        similarity = DataStructs.FingerprintSimilarity
    elif simType == "Morgan":
        def makeFp(mol):
            return AllChem.GetMorganFingerprint(mol, 2)
        similarity = DataStructs.DiceSimilarity
    elif simType == "MACCS":
        makeFp = MACCSkeys.GenMACCSKeys
        similarity = DataStructs.FingerprintSimilarity
    else:
        # Previously this only printed a message and then failed later with
        # a NameError on the undefined fingerprints.
        raise ValueError(
            "This type of sim is not implemented " + str(simType))

    fpsTrain = [makeFp(x) for x in trainSmilesList]
    fp = makeFp(Chem.MolFromSmiles(predEx[smilesAttrName].value))

    simDict = {}
    simList = []
    for idx, ex in enumerate(train):
        sim = similarity(fpsTrain[idx], fp)
        simDict[ex[nameAttr].value] = sim
        simList.append(sim)

    # keep the X highest similarities
    simList.sort(reverse=True)
    simList = simList[0:X]
    medSim = round(numpy.median(simList), 3)
    stdSim = round(numpy.std(simList), 3)
    minSim = round(min(simList), 3)
    maxSim = round(max(simList), 3)

    entropy = round(getRespVar(simList, simDict, train, nameAttr), 3)
    # Floor division keeps the slice bound an integer on Python 3 as well.
    entropyClosest = round(
        getRespVar(simList[0:X // 2], simDict, train, nameAttr), 3)

    return medSim, stdSim, minSim, maxSim, entropy, entropyClosest
    def compute_pca(self):
        """Run a 2-component PCA on the MACCS/Tanimoto similarity matrix of
        ``self.Database2``.

        Side effects: stores the result indexed by ``TIPO`` in
        ``self.pca_result`` and the explained variance of PC1/PC2 (in
        percent, rounded to 2 decimals) in ``self.a`` and ``self.b``.

        :return: DataFrame with the columns ``PC1``, ``PC2``, ``LIBRARY``,
            ``TIPO``, ``SMILES`` and ``NAME`` (default integer index)
        """
        Database = self.Database2
        mols = [Chem.MolFromSmiles(x) for x in list(Database.SMILES)]
        fps = [MACCSkeys.GenMACCSKeys(x) for x in mols]

        # Generate the lower similarity matrix triangle
        tanimoto_sim_mat_lower_triangle = GetTanimotoSimMat(fps)
        n_mol = len(fps)
        # Diagonal stays 1; fill the lower triangle, then mirror it upwards.
        similarity_matrix = np.ones([n_mol, n_mol])
        i_lower = np.tril_indices(n=n_mol, m=n_mol, k=-1)
        i_upper = np.triu_indices(n=n_mol, m=n_mol, k=1)
        similarity_matrix[i_lower] = tanimoto_sim_mat_lower_triangle
        similarity_matrix[i_upper] = similarity_matrix.T[i_upper]

        sklearn_pca = sklearn.decomposition.PCA(
            n_components=2, svd_solver="full", whiten=True)
        sklearn_pca.fit(similarity_matrix)
        pca_result = pd.DataFrame(
            sklearn_pca.transform(similarity_matrix), columns=['PC1', 'PC2'])
        # Assign by position (plain lists), as compute_tsne does: assigning
        # the Series directly would align on the index of Database and yield
        # NaN / shuffled rows whenever that index is not 0..n-1.
        pca_result["LIBRARY"] = list(Database.LIBRARY)
        pca_result["TIPO"] = list(Database.LIBRARY)
        pca_result["SMILES"] = list(Database.SMILES)
        pca_result["NAME"] = list(Database.NAME)
        self.pca_result = pca_result.set_index('TIPO')

        variance = list(sklearn_pca.explained_variance_ratio_)
        self.a = round(variance[0] * 100, 2)
        self.b = round(variance[1] * 100, 2)

        return pca_result
Beispiel #6
0
def preprocess_dataset(path, data_config, fingerprint, morgan_nbits=None):
    """Calculate representation for each smiles in the dataset.

    :param path: dataset path, forwarded to ``load_data_from_df``
    :param data_config: configuration mapping; the entry under
        ``csv_section`` is expanded into keyword arguments of
        ``load_data_from_df``
    :param fingerprint: ``'morgan'``, ``'maccs'`` or ``'krfp'``
    :param morgan_nbits: bit-vector length, required for ``'morgan'``
    :return: tuple ``(x, y, calculated_smiles)``: array of representations,
        array of labels and the list of SMILES that were processed
        successfully (compounds that raise are reported and skipped)
    :raises ValueError: if ``fingerprint`` names an unknown representation
    """
    # Fail early: an unknown type used to hit a bare ``pass``, trigger a
    # swallowed NameError on every row and silently return an empty dataset.
    if fingerprint not in ('morgan', 'maccs', 'krfp'):
        raise ValueError(
            'Unknown fingerprint type: {!r}'.format(fingerprint))
    if fingerprint == 'morgan':
        assert morgan_nbits is not None, 'Parameter `morgan_nbits` must be set when using Morgan fingerprint.'

    smiles, labels = load_data_from_df([path,], **data_config[csv_section])
    x = []
    y = []
    calculated_smiles = []

    # we go smiles by smiles because some compounds make rdkit throw errors
    for this_smiles, this_label in zip(smiles, labels):
        try:
            mol = Chem.MolFromSmiles(this_smiles)
            if fingerprint == 'morgan':
                fp = AllChem.GetMorganFingerprintAsBitVect(mol, 6, nBits=morgan_nbits)
                fp = [int(i) for i in fp.ToBitString()]
            elif fingerprint == 'maccs':
                fp = MACCSkeys.GenMACCSKeys(mol)
                fp = np.array(fp)[1:]  # index 0 is unset
            else:  # 'krfp' -- computed from the SMILES string itself
                fp = krfp(this_smiles)
            x.append(fp)
            y.append(this_label)
            calculated_smiles.append(this_smiles)
        except Exception as e:
            # deliberate best effort: report the compound and move on
            print('exp', e)
    return np.array(x), np.array(y), calculated_smiles
 def __init__(self):
     """Set up the fingerprint generator tables and empty result holders.

     ``binaryfp`` and ``countfp`` each hold one callable (taking a
     molecule) per entry of ``binaryfp_names`` / ``countfp_names``, in the
     same order. The count fingerprints record their bit information in
     whatever ``self.bit_info`` refers to at call time.
     """
     # containers that start out empty
     self.bit_info = {}
     self.bit_infos = {}
     self.vectors = []
     self.all_bit_info_keys = {}
     self.mols = []

     def morgan_bits(radius, n_bits, **extra):
         # hashed Morgan bit vector; ``extra`` carries optional keywords
         return lambda mol: AllChem.GetMorganFingerprintAsBitVect(
             mol, radius, nBits=n_bits, **extra
         )

     def morgan_counts(radius, use_features):
         # Morgan count fingerprint writing into the current self.bit_info
         return lambda mol: AllChem.GetMorganFingerprint(
             mol, radius=radius, bitInfo=self.bit_info,
             useFeatures=use_features
         )

     self.binaryfp_names = [
         "MACCSkeys",
         "Avalon",
         "Morgan2(1024bits)",
         "Morgan2F(1024bits)",
         "Morgan4(2048bits)",
         "Morgan4F(2048bits)",
         # "AtomPair",
         # "Topological",
         # "TopologicalTortion",
     ]
     self.binaryfp = [
         lambda mol: MACCSkeys.GenMACCSKeys(mol),
         lambda mol: pyAvalonTools.GetAvalonFP(mol),
         morgan_bits(2, 1024),
         morgan_bits(2, 1024, useFeatures=True),
         morgan_bits(4, 2048),
         morgan_bits(4, 2048, useFeatures=True),
         # lambda mol: Pairs.GetAtomPairFingerprintAsBitVect(mol), # crashes
         # lambda mol: FingerprintMols.FingerprintMol(mol), #Topological Fingerprint # produces NaN
         # lambda mol: Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol), # has no ToBitString
     ]

     self.countfp_names = [
         "ECFP2",
         "FCFP2",
         "ECFP4",
         "FCFP4",
         "ECFP6",
         "FCFP6",
     ]
     # radius 1, 2, 3; for each radius the plain variant precedes the
     # feature-based one, matching the order of countfp_names
     self.countfp = [
         morgan_counts(radius, use_features)
         for radius in (1, 2, 3)
         for use_features in (False, True)
     ]
Beispiel #8
0
def numpy_atompair(mols):
    """ Calculate MACCS key fingerprints and convert them with ``_rdk2numpy``

    Note: despite the function name, the fingerprints generated here are
    MACCS keys (``MACCSkeys.GenMACCSKeys``), not atom pair fingerprints.

    :param mols: {list} list of molecules (RDKit mols); falsy entries such as
        ``None`` are skipped
    :return: whatever ``_rdk2numpy`` returns for the fingerprint list
        (presumably a numpy array with one row per molecule)
    """
    fingerprints = []
    for mol in mols:
        if mol:
            fingerprints.append(MACCSkeys.GenMACCSKeys(mol))
    return _rdk2numpy(fingerprints)
Beispiel #9
0
def create_fingerprints(chemical_compounds):
    """
    Create a learning matrix `X` with (Morgan + MACCS) fingerprints
    from the `chemical_compounds` molecular structures.

    Parameters
    ----------
    chemical_compounds: array [n_chem, 1] or list [n_chem,]
        chemical_compounds[i] is a string (SMILES) describing the ith
        chemical compound. The input is flattened, so column vectors, flat
        arrays and plain lists are all accepted.

    Return
    ------
    X: array [n_chem, 679]
        For each chemical compound, 512 feature-based Morgan bits (radius 3)
        followed by the 167 MACCS key bits, which represent presence or
        absence of substructures.
    """
    morgan_bits = 512
    maccs_bits = 167

    # Flatten once so that lists and [n_chem, 1] arrays behave as documented
    # (the previous code required ``.shape`` and a flat array of strings).
    smiles = np.asarray(chemical_compounds).ravel()
    n_chem = smiles.shape[0]

    X = np.zeros((n_chem, morgan_bits))
    X2 = np.zeros((n_chem, maccs_bits))
    for i, smi in enumerate(smiles):
        m = Chem.MolFromSmiles(str(smi))
        X[i, :] = AllChem.GetMorganFingerprintAsBitVect(
            m, 3, nBits=morgan_bits, useFeatures=True)
        X2[i, :] = MACCSkeys.GenMACCSKeys(m)

    return np.concatenate((X, X2), axis=1)
Beispiel #10
0
def _maccsClustering(rdkit_mols):
    """
    Returns the tanimoto distance matrix based on maccs method

    Parameters
    ----------
    rdkit_mols: list
        The list of rdkit.Chem.rdchem.Mol objects

    Returns
    -------
    tanimotomatrix: np.array
        The numpy array built from one ``TanimotoDistances`` result per
        molecule (each computed against the full fingerprint list)
    """
    from rdkit.Chem import MACCSkeys  # computes the MACCS keys

    # tqdm only wraps the iteration to display progress
    fingerprints = [MACCSkeys.GenMACCSKeys(mol) for mol in tqdm(rdkit_mols)]

    executor = ParallelExecutor(n_jobs=-1)  # _config['ncpus'])
    run = executor(total=len(fingerprints), desc='MACCS Distance')
    rows = run(
        delayed(TanimotoDistances)(fingerprint, fingerprints)
        for fingerprint in fingerprints
    )

    return np.array(rows)
Beispiel #11
0
 def get_maccfps(self):
     """Return the MACCS keys of every 'Canonical Smiles' entry of ``self.df``.

     :return: list of fingerprints, one per row, in row order
     """
     frame = self.df
     # NOTE(review): the result of this dropna is discarded, so no row is
     # actually removed -- confirm whether filtering was intended.
     frame['Standard Value'].dropna(axis=0)
     molecules = [Chem.MolFromSmiles(smi) for smi in frame['Canonical Smiles']]
     return [MACCSkeys.GenMACCSKeys(mol) for mol in molecules]
Beispiel #12
0
def fingerprint(smiles_or_mol,
                fp_type='maccs',
                dtype=None,
                morgan__r=2,
                morgan__n=1024,
                *args,
                **kwargs):
    """
    Generates fingerprint for a SMILES string or molecule
    If ``get_mol`` cannot produce a molecule, returns None
    Returns numpy array of fingerprint bits

    Parameters:
        smiles_or_mol: input handed to ``get_mol`` (together with any extra
            positional/keyword arguments)
        fp_type: type of fingerprint, case-insensitive: [MACCS|morgan]
        dtype: if not None, specifies the dtype of returned array
        morgan__r: radius passed to ``Morgan``
        morgan__n: number of bits passed to ``Morgan``

    Raises:
        ValueError: for any other ``fp_type``
    """
    kind = fp_type.lower()
    molecule = get_mol(smiles_or_mol, *args, **kwargs)
    if molecule is None:
        return None

    if kind == 'morgan':
        bits = np.asarray(Morgan(molecule, morgan__r, nBits=morgan__n),
                          dtype='uint8')
    elif kind == 'maccs':
        on_bits = np.array(MACCSkeys.GenMACCSKeys(molecule).GetOnBits())
        bits = np.zeros(166, dtype='uint8')
        if len(on_bits) != 0:
            # shift by one: the 0-th key is always zero and is dropped
            bits[on_bits - 1] = 1
    else:
        raise ValueError("Unknown fingerprint type {}".format(kind))

    return bits if dtype is None else bits.astype(dtype)
def make_fingerprints(data, length=512, verbose=False):
    """Build one ``fingerprint`` wrapper per supported type and apply each
    of them to ``data``.

    :param data: dataset handed to every wrapper's ``apply_fp``
    :param length: bit length used by the hashed fingerprints that accept one
    :param verbose: when true, print the wrapper's name before applying it
    :return: list of the ``fingerprint`` wrappers, in definition order
    """
    specs = (
        (Chem.rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect,
         "Torsion "),
        (lambda x: GetMorganFingerprintAsBitVect(x, 2, nBits=length),
         "Morgan"),
        (FingerprintMol, "Estate (1995)"),
        (lambda x: GetAvalonFP(x, nBits=length),
         "Avalon bit based (2006)"),
        (lambda x: np.append(GetAvalonFP(x, nBits=length), Descriptors.MolWt(x)),
         "Avalon+mol. weight"),
        (lambda x: GetErGFingerprint(x), "ErG fingerprint (2006)"),
        (lambda x: RDKFingerprint(x, fpSize=length),
         "RDKit fingerprint"),
        (lambda x: MACCSkeys.GenMACCSKeys(x),
         "MACCS fingerprint"),
        (lambda x: get_fingerprint(x, fp_type='pubchem'), "PubChem"),
        # (lambda x: get_fingerprint(x, fp_type='FP4'), "FP4")
        (lambda x: Generate.Gen2DFingerprint(x, Gobbi_Pharm2D.factory, dMat=Chem.Get3DDistanceMatrix(x)),
         "3D pharmacophore"),
    )
    fp_list = [fingerprint(generator, label) for generator, label in specs]

    for fp in fp_list:
        if verbose:
            print("doing", fp.name)
        fp.apply_fp(data)

    return fp_list
Beispiel #14
0
def CalculateMACCSFingerprint(mol):
    """
    #################################################################
    Calculate MACCS keys (166 bits).

    Usage:

        result=CalculateMACCSFingerprint(mol)

        Input: mol is a molecule object.

        Output: result is a tuple of three items. The first is the number

        of fingerprint bits (166). The second is a dict that maps the

        position of every set bit to 1. The third is the bit vector

        itself, which can be used for calculating the similarity.
    #################################################################
    """
    bit_vector = MACCSkeys.GenMACCSKeys(mol)
    set_positions = {
        position: 1 for position in tuple(bit_vector.GetOnBits())
    }
    return 166, set_positions, bit_vector
Beispiel #15
0
    def computeFP(self, typeFP):
        """Compute the requested fingerprint(s) of the cleaned SMILES.

        Requires the instance attribute ``smiclean``; the parsed molecule is
        stored in ``self.mol`` and the fingerprints in ``self.FP`` (a dict
        keyed by fingerprint type).

        :param typeFP: ``"Mol"``, ``"MACCS"``, ``"pairs"``, ``"Torsion"``,
            ``"Morgan"`` or ``"All"``; any other value yields an empty dict
        :return: 0 on success, 1 (and a note appended to ``self.log``) when
            no cleaned SMILES is available
        """
        from rdkit.Chem.Fingerprints import FingerprintMols
        from rdkit.Chem import MACCSkeys
        from rdkit.Chem.AtomPairs import Pairs, Torsions
        from rdkit.Chem import AllChem

        if "smiclean" not in self.__dict__:
            self.log = self.log + "No smiles prepared\n"
            return 1

        self.mol = Chem.MolFromSmiles(self.smiclean)

        # (key in the result dict, generator); the order fixes the key order
        generators = (
            ("Mol", FingerprintMols.FingerprintMol),
            ("MACCS", MACCSkeys.GenMACCSKeys),
            ("pairs", Pairs.GetAtomPairFingerprint),
            ("Torsion", Torsions.GetTopologicalTorsionFingerprint),
            ("Morgan", lambda mol: AllChem.GetMorganFingerprint(mol, 2)),
        )
        dFP = {}
        for label, generate in generators:
            if typeFP == label or typeFP == "All":
                dFP[label] = generate(self.mol)

        self.FP = dFP
        return 0
def calc_maccs(molecules, name_col='CASRN'):
    """
    Takes in a list of rdkit molecules and returns their MACCS fingerprints

    :param name_col: Name of the molecule property used to index the resulting DataFrame.  Molecules lacking the
                     property are indexed with an empty string
    :param molecules: List of rdkit molecules with no None values

    :return: pandas DataFrame with one row per molecule and one integer column per MACCS bit
    """

    # Checks for appropriate input
    not_a_list = 'The molecules entered are not in the form of a list.'
    not_mols = 'The molecules entered are not rdkit Mol objects.'
    has_none = 'The list of molecules entered contains None values.'
    assert isinstance(molecules, list), not_a_list
    assert all(isinstance(mol, Chem.rdchem.Mol) for mol in molecules), not_mols
    assert None not in molecules, has_none
    assert isinstance(name_col, str), \
        'The input parameter name_col (%s) must be a string.' % name_col

    rows = [
        [int(bit) for bit in MACCSkeys.GenMACCSKeys(mol)]
        for mol in molecules
    ]
    labels = [
        mol.GetProp(name_col) if mol.HasProp(name_col) else ''
        for mol in molecules
    ]
    return pd.DataFrame(rows, index=labels)
Beispiel #17
0
    def compute_tsne(self):
        """Embed the molecules of ``self.Database2`` in two dimensions with
        t-SNE applied to their MACCS/Tanimoto distance matrix.

        The embedding, together with the ``LIBRARY``, ``SMILES`` and ``NAME``
        columns and indexed by ``TIPO`` (a copy of ``LIBRARY``), is stored in
        ``self.tsne_result``; nothing is returned.
        """
        Database = self.Database2
        mols = [Chem.MolFromSmiles(smi) for smi in list(Database["SMILES"])]
        fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
        lower_values = GetTanimotoSimMat(fps)

        # Symmetric similarity matrix: ones on the diagonal, the computed
        # values in the lower triangle, mirrored into the upper triangle.
        n_mol = len(fps)
        similarity_matrix = np.ones([n_mol, n_mol])
        lower = np.tril_indices(n=n_mol, m=n_mol, k=-1)
        upper = np.triu_indices(n=n_mol, m=n_mol, k=1)
        similarity_matrix[lower] = lower_values
        similarity_matrix[upper] = similarity_matrix.T[upper]
        distance_matrix = 1 - similarity_matrix

        embedder = TSNE(
            n_components=2,
            init='pca',
            random_state=1992,
            angle=0.3,
            perplexity=self.perplexity)
        coordinates = embedder.fit_transform(distance_matrix)

        tsne_result = pd.DataFrame(data=coordinates, columns=["PC1", "PC2"])
        # plain lists => positional assignment, independent of the index
        tsne_result["LIBRARY"] = list(Database.LIBRARY)
        tsne_result["TIPO"] = list(Database.LIBRARY)
        tsne_result["SMILES"] = list(Database.SMILES)
        tsne_result["NAME"] = list(Database.NAME)
        self.tsne_result = tsne_result.set_index('TIPO')
Beispiel #18
0
def get_maccs_fp(smiles):
    """Return the MACCS keys of ``smiles`` as a float array of length 167.

    Positions of set bits hold 1, all other positions hold 0.
    """
    keys = MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smiles))
    arr = np.zeros(167)
    arr[list(keys.GetOnBits())] = 1
    return arr
Beispiel #19
0
def smi_to_maccs(smi):
    """Convert a SMILES string to its MACCS keys as a numpy array.

    When the SMILES cannot be parsed (``MolFromSmiles`` yields ``None``) an
    all-zero array of length 167 is returned instead.
    """
    MACCS_SIZE = 167
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return np.zeros(MACCS_SIZE)
    return np.array(MACCSkeys.GenMACCSKeys(mol))
Beispiel #20
0
def MACCfpDataFrame(chempandas, namecol, smicol):
    """ Generate the MACCS fingerprints of the compounds.
    The compounds are stored in the DataFrame Structure defined by pandas.

    Keyword arguments:
    chempandas: the compounds stored in DataFrame, which contain the name and SMILES as columns.
                It must not have more than MAXLINES rows.
    namecol: the column number of the name of SMILES.
    smicol: the column number of SMILES in the DataFrame.

    Return: a DataFrame indexed by compound name that holds the fingerprint bits plus the
    columns 'SMILES' and 'CHEMBL' (a copy of the index).
    Rows whose SMILES cannot be parsed (MolFromSmiles gives None) are left out.
    Note: The SMILES output by Chem.MolToSmiles is canonical, and might be different with the original.
    """
    assert chempandas.shape[0] <= MAXLINES
    parsed = [Chem.MolFromSmiles(x) for x in chempandas.iloc[:, smicol]]

    # keep the parsable molecules and tag each one with its compound name
    molsmi = []
    for row, mol in enumerate(parsed):
        if mol is None:
            continue
        mol.SetProp("_Name", chempandas.iloc[row, namecol])
        molsmi.append(mol)

    # MACC Fingerprints.
    fps = [MACCSkeys.GenMACCSKeys(x) for x in molsmi]
    fpsmat = np.matrix(fps)
    names = [x.GetProp("_Name") for x in molsmi]
    df = DataFrame(fpsmat, index=names)  # how to name the col?
    df['SMILES'] = [Chem.MolToSmiles(x) for x in molsmi]
    df['CHEMBL'] = df.index
    return df
Beispiel #21
0
def get_ecfp(
    smi_path,
    data_path='./',
):
    """Compute MACCS keys for every line of a SMILES file and save them.

    Note: despite the name, the fingerprints are MACCS keys
    (``MACCSkeys.GenMACCSKeys``).

    :param smi_path: path (``str`` or ``Path``) of a text file holding one
        SMILES per line
    :param data_path: directory that receives ``<file stem>_fp.csv``
    :return: DataFrame with one column per fingerprint bit (as the
        characters ``'0'``/``'1'``) followed by ``cindex``
        (``<file stem>_<line number>``) and ``smiles``
    """
    smi_path = Path(smi_path)

    # Read-only access is enough ('r+' needlessly demanded write
    # permission), and the file is parsed once instead of twice.
    with open(str(smi_path), 'r') as f:
        lines = f.readlines()
    smiles_df = pd.DataFrame({
        'cindex': [smi_path.stem + '_' + str(idx) for idx in range(len(lines))],
        'smiles': [content.strip('\n') for content in lines],
    })

    mols = [Chem.MolFromSmiles(smi) for smi in smiles_df['smiles']]
    fingerprints = [MACCSkeys.GenMACCSKeys(molecule) for molecule in mols]
    fingerprints_df = pd.DataFrame(
        [list(fp.ToBitString()) for fp in fingerprints])

    data = pd.concat([fingerprints_df, smiles_df], axis=1)
    data.to_csv(str(Path(data_path) / (smi_path.stem + '_fp.csv')))

    return data
Beispiel #22
0
def label_switching_decoder(key_smiles, bit_list, nmol_df):
    '''
    Decode a list of predicted labels back into text.

    The key molecule seeds numpy's random generator exactly as in the
    encoder, so the position-specific label permutations can be rebuilt.

    :param key_smiles: SMILES string of the key molecule
    :param bit_list: model predictions (anything accepted by ``int``)
    :param nmol_df: frame whose ``SMILES`` column supplies the 128 reference
        ('neighbor') molecules
    :return: str; the decoded characters joined together (empty string for
        an empty ``bit_list``)
    '''
    bit_list = list(map(int, bit_list))  # convert strings to integers

    # labels 0..127
    orig_label = list(range(128))

    key_mol = Chem.MolFromSmiles(key_smiles)
    key_fp = MACCSkeys.GenMACCSKeys(key_mol)
    # rebuild root_seed and rotor_seed based on MW and number of atoms of key_mol
    root_seed = int(Chem.Descriptors.ExactMolWt(key_mol))
    rotor_seed = key_mol.GetNumAtoms()

    # Pick the 128 reference molecules (reproducible through root_seed)
    np.random.seed(root_seed)
    ref_smiles = np.random.choice(nmol_df.SMILES, size=128, replace=False)
    # similarity of every reference molecule to the key molecule
    dist = [
        DataStructs.FingerprintSimilarity(
            key_fp, MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smi)))
        for smi in ref_smiles
    ]

    decoded_message = []
    for index, bit in enumerate(bit_list):
        # same position-dependent seed as used while encoding
        np.random.seed(root_seed + index * rotor_seed)
        step_dist = np.random.choice(dist, size=len(dist), replace=False)

        # rank of every permuted similarity (stable sort keeps tie order)
        dict_rank = [0] * len(dist)
        ordered = sorted(range(len(step_dist)), key=lambda y: step_dist[y])
        for rank, position in enumerate(ordered):
            dict_rank[position] = rank
        swaper_dict = dict(zip(orig_label, dict_rank))
        decoded_message.append(swaper_dict.get(bit))

    # Join once after the loop: building the string on every iteration was
    # quadratic and left the result undefined for an empty bit_list.
    return ''.join(chr(i) for i in decoded_message)
Beispiel #23
0
def maacs_fingerprint_evaluation(references):
    """ 
    Generate Similarity via MACCSKeys

    For every molecule the Tanimoto similarities (rounded to 4 decimals) to
    all molecules that compare unequal to it are averaged; the mean of those
    per-molecule averages, rounded to 4 decimals, is returned. With fewer
    than two distinct molecules the result is NaN.

    :param references: iterable of molecules accepted by
        ``MACCSkeys.GenMACCSKeys``
    :return: mean pairwise MACCS/Tanimoto similarity
    """
    molecules = list(references)
    cache = {}

    def fingerprint_at(position):
        # Each fingerprint is computed lazily and at most once; the previous
        # version regenerated both fingerprints for every single pair.
        if position not in cache:
            cache[position] = MACCSkeys.GenMACCSKeys(molecules[position])
        return cache[position]

    scores = []
    for ref_pos, reference in enumerate(molecules):
        cur_scores = [
            round(
                DataStructs.TanimotoSimilarity(fingerprint_at(ref_pos),
                                               fingerprint_at(cand_pos)), 4)
            for cand_pos, candidate in enumerate(molecules)
            if reference != candidate
        ]
        scores.append(np.mean(cur_scores))
    return round(np.mean(scores), 4)
 def _maccs_keys(self, molecules: List, parameters: {}):
     """Return the MACCS keys of ``molecules`` as numpy arrays.

     :param molecules: molecules accepted by ``MACCSkeys.GenMACCSKeys``
     :param parameters: not used by this method
     :return: list with one ``int32`` array per molecule
     """
     def as_array(bit_vector):
         # ConvertToNumpyArray fills (and resizes) the array handed to it
         target = np.zeros((1, ), dtype=np.int32)
         DataStructs.ConvertToNumpyArray(bit_vector, target)
         return target

     bit_vectors = [MACCSkeys.GenMACCSKeys(mol) for mol in molecules]
     return [as_array(bit_vector) for bit_vector in bit_vectors]
Beispiel #25
0
def smi2fp(smi):
    """Return the MACCS keys of ``smi`` as a list of bits.

    The first element of the fingerprint (position 0) is dropped from the
    returned list.
    """
    keys = MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smi))
    return list(keys)[1:]
Beispiel #26
0
def generate_MACCS(smiles):
    """Compute the MACCS bit strings of a collection of SMILES.

    :param smiles: iterable of SMILES strings; it is iterated directly, so
        lists, generators and pandas Series with any index all work
        (the former ``smiles[i]`` lookup was label-based on a Series)
    :return: tuple ``(data, header)``: ``data`` holds one list of ``'0'`` /
        ``'1'`` characters per SMILES, ``header`` the 167 column names
        ``'bit0'`` ... ``'bit166'``
    """
    header = ['bit' + str(i) for i in range(167)]
    data = [
        list(MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smi)).ToBitString())
        for smi in smiles
    ]
    return data, header
 def transform(self, molecules):
     """Build a MACCS fingerprint feature table.

     Side effect: the column names are written, one per line, to
     ``MAC_descriptors.txt`` in the current working directory.

     :param molecules: object whose ``"molecules"`` entry offers
         ``tolist()`` and yields the molecules to fingerprint
     :return: float DataFrame with one row per molecule and one column
         ``rdkit_fingerprintMACS_<bit>`` per fingerprint bit
     """
     print("\tBuilding MACS Fingerprints")
     mols = molecules["molecules"].tolist()
     bit_strings = [MACCSkeys.GenMACCSKeys(mol).ToBitString() for mol in mols]
     # Build the frame in a single step: DataFrame.append inside a loop is
     # quadratic and no longer exists in pandas >= 2.0.
     rows = [
         {"rdkit_fingerprintMACS_{}".format(j): element
          for j, element in enumerate(bit_string)}
         for bit_string in bit_strings
     ]
     df = pd.DataFrame(rows)
     np.savetxt("MAC_descriptors.txt", list(df), fmt="%s")
     return df.astype(float)
Beispiel #28
0
def createFingerprint(smiles):
    """Return the MACCS keys of ``smiles``, or ``None`` if it cannot be
    converted.

    ``None`` is returned both when the SMILES cannot be parsed and when
    RDKit raises while processing it.
    """
    try:
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            return None
        return MACCSkeys.GenMACCSKeys(m)
    except Exception:
        # Best effort by design, but no longer a bare ``except`` that would
        # also swallow KeyboardInterrupt and SystemExit.
        return None
Beispiel #29
0
 def compute_MACCS(self, name):
     """Write the MACCS keys of ``self.mols`` to a CSV file.

     The table has a leading ``smiles`` column (taken from ``self.smiles``)
     followed by ``bit0`` ... ``bit166``.

     :param name: file name; its last four characters are replaced by
         ``_MACCS.csv`` to form the output path
     """
     columns = ['bit' + str(i) for i in range(167)]
     rows = [
         list(MACCSkeys.GenMACCSKeys(self.mols[k]).ToBitString())
         for k in range(len(self.mols))
     ]
     table = pd.DataFrame(rows, columns=columns)
     table.insert(loc=0, column='smiles', value=self.smiles)
     table.to_csv(name[:-4]+'_MACCS.csv', index=False)
Beispiel #30
0
def maacs_fingerprint_evaluation(references, candidates):
    """ 
    Generate Similarity via MACCSKeys

    For every key of ``references`` the MACCS keys of the reference and of
    the candidate stored under the same key are compared with five metrics
    (Tanimoto, Dice, Cosine, Sokal, McConnaughey), each rounded to 4
    decimals. Keys without a candidate count as 0 for every metric. The
    mean of each metric is printed.

    :param references: mapping from key to reference molecule
    :param candidates: mapping from key to candidate molecule
    :return: mean Tanimoto similarity, rounded to 4 decimals
    """
    print("Calculating Similarity via MACCS Keys")
    # DataStructs function names, in reporting order; they are looked up
    # only when a pair actually has to be compared.
    metric_names = (
        "TanimotoSimilarity",
        "DiceSimilarity",
        "CosineSimilarity",
        "SokalSimilarity",
        "McConnaugheySimilarity",
    )
    similarities = [[] for _ in metric_names]
    for img in references:
        if img in candidates:
            candidate_maccs = MACCSkeys.GenMACCSKeys(candidates[img])
            reference_maccs = MACCSkeys.GenMACCSKeys(references[img])
            row = [
                round(getattr(DataStructs, metric)(reference_maccs,
                                                   candidate_maccs), 4)
                for metric in metric_names
            ]
        else:
            row = [0] * len(metric_names)
        for bucket, value in zip(similarities, row):
            bucket.append(value)

    averages = [round(np.mean(bucket), 4) for bucket in similarities]
    print("Done Calculating Similarity via MACCS Keys")
    print("##########################################")
    print("Tanimoto Similarity:{}".format(averages[0]))
    print("Dice Similarity:{}".format(averages[1]))
    print("Cosine Similarity:{}".format(averages[2]))
    print("Sokal Similarity:{}".format(averages[3]))
    print("McConnaughey Similarity:{}".format(averages[4]))
    print("##########################################")
    return averages[0]