Example #1
def tanimoto_sml(queryism, targetism):
    """
    Returns the Tanimoto similarity between two molecules given in SMILES format.
    """
    # str() guards against Unicode input (a Python 2 legacy; RDKit expects plain strings)
    querymol = MolFromSmiles(str(queryism))
    targetmol = MolFromSmiles(str(targetism))

    if querymol and targetmol:
        queryfp = RDKFingerprint(querymol)
        targetfp = RDKFingerprint(targetmol)

        return TanimotoSimilarity(queryfp, targetfp)
    return None  # one or both SMILES strings failed to parse
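A minimal usage sketch for the function above, assuming the imports the snippet relies on (from rdkit.Chem import MolFromSmiles, RDKFingerprint and from rdkit.DataStructs import TanimotoSimilarity); the SMILES strings are illustrative only:

print(tanimoto_sml('CCO', 'CCN'))        # similarity of ethanol vs. ethylamine
print(tanimoto_sml('CCO', 'not-smiles')) # None: the second string fails to parse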
Example #2
def similarity_search(fps_db, smile):
    fps_test = RDKFingerprint(MolFromSmiles(smile))
    # similarity of the query against every fingerprint in the database
    ts = np.array([DataStructs.FingerprintSimilarity(fp_db, fps_test)
                   for fp_db in fps_db])
    return ts.mean()  # use ts.max() instead for best-match similarity
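A usage sketch with a small hypothetical fingerprint database (DataStructs.FingerprintSimilarity defaults to the Tanimoto metric):

db_smiles = ['CCO', 'c1ccccc1', 'CC(=O)O']  # illustrative database molecules
fps_db = [RDKFingerprint(MolFromSmiles(s)) for s in db_smiles]
print(similarity_search(fps_db, 'CCO'))  # mean similarity of the query to the database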
Example #3
    def _set_target_fps(self, pickaxe: Pickaxe):
        for smiles in pickaxe.target_smiles:
            mol = MolFromSmiles(smiles)
            if self.fingerprint_method == "Morgan":
                fp = AllChem.GetMorganFingerprintAsBitVect(mol, **self.fingerprint_args)
            else:
                fp = RDKFingerprint(mol)

            self.target_fps.append(fp)
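When fingerprint_method is "Morgan", the method above unpacks self.fingerprint_args into GetMorganFingerprintAsBitVect, so a hypothetical argument dict (assumed values, not from the snippet) would be:

fingerprint_args = {"radius": 2, "nBits": 2048}
# unpacked as AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)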
Example #4
 def fingerprint(self, m, fpsize=1024, bitsperhash=2, tgtDensity=0.3):
     """Compute a bit fingerprint of molecule m."""
     from rdkit.Chem import RDKFingerprint
     return RDKFingerprint(m,
                           minPath=1,
                           maxPath=7,
                           fpSize=fpsize,
                           nBitsPerHash=bitsperhash,
                           tgtDensity=tgtDensity,
                           minSize=fpsize)
Example #5
def calculate_fp(mol, method='maccs', n_bits=2048):
	"""Calculate a molecular fingerprint for a Chem molecule object, given the method and number of bits."""
	if method == 'maccs':
		return MACCSkeys.GenMACCSKeys(mol)
	if method == 'ecfp4':
		return GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits, useFeatures=False)
	if method == 'ecfp6':
		return GetMorganFingerprintAsBitVect(mol, 3, nBits=n_bits, useFeatures=False)
	if method == 'torsion':
		return GetHashedTopologicalTorsionFingerprintAsBitVect(mol, nBits=n_bits)
	if method == 'rdk5':
		return RDKFingerprint(mol, maxPath=5, fpSize=1024, nBitsPerHash=2)  # note: fixed 1024-bit size, n_bits is ignored here
	raise ValueError('Unknown fingerprint method: {}'.format(method))  # fail loudly instead of silently returning None
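A usage sketch, assuming the imports the snippet relies on (MACCSkeys, GetMorganFingerprintAsBitVect, GetHashedTopologicalTorsionFingerprintAsBitVect, RDKFingerprint and MolFromSmiles from RDKit):

mol = MolFromSmiles('c1ccccc1O')  # phenol, an illustrative input
fp = calculate_fp(mol, method='ecfp4', n_bits=1024)
print(fp.GetNumBits(), fp.GetNumOnBits())  # 1024 and the number of set bits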
Example #6
 def transform(self):
     super().transform()
     fts = []
     self.mol_names = []
     for mol in self.structures:
         fp = RDKFingerprint(mol)
         arr = np.zeros((0, ), dtype=np.int8)
         # ConvertToNumpyArray resizes arr in place to the fingerprint length
         DataStructs.ConvertToNumpyArray(fp, arr)
         fts.append(arr)
         self.mol_names.append(mol.GetProp("_Name"))
     # build the feature matrix once, after the loop (the original rebuilt it every iteration)
     self.features = np.array(fts)
     self.columns = [str(i) for i in list(range(self.features.shape[1]))]
     return self.features
Example #7
def rdk_molstring(molecule, fptype):
    """
    Build a molstring (bit array) for an RDK fingerprint.

    :param molecule: molecule object
    :param fptype: type, radius and size of fingerprint
    :type fptype: dict
    :return: molstring for the RDK fingerprint
    """
    arr = np.zeros((1, ), dtype=int)
    DataStructs.ConvertToNumpyArray(
        RDKFingerprint(molecule, fpSize=fptype['Size']), arr)

    return arr
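A usage sketch: rdk_molstring reads only the 'Size' key of fptype, and DataStructs.ConvertToNumpyArray resizes arr in place to the fingerprint length:

fptype = {'Size': 1024}
molstring = rdk_molstring(MolFromSmiles('CCO'), fptype)
print(molstring.shape)  # (1024,)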
Example #8
    def ensemble_test(self, submission, data_list, reversed_token_map,
                      transform):
        """
        ensemble test function
        :param submission: submission file
        :param data_list: list of test data path
        :param reversed_token_map: converts prediction to readable format
        :param transform: normalize function
        """
        # load .yaml file that contains information about each model
        with open('model/prediction_models.yaml') as f:
            p_configs = yaml.safe_load(f)

        predictors = []

        for conf in p_configs.values():
            predictors.append(
                Predict.remote(conf, self._device, self._gpu_non_block,
                               self._decode_length, self._model_load_path))

        loop = asyncio.get_event_loop()

        async def process_async_calculate_similarity(combination_of_smiles,
                                                     combination_index):
            return {
                idx: await self.async_fps(comb[0], comb[1])
                for comb, idx in zip(combination_of_smiles, combination_index)
            }

        def ray_prediction(imgs):
            return ray.get([p.decode.remote(imgs) for p in predictors])

        conf_len = len(p_configs)  # number of models in the ensemble
        fault_counter = 0
        sequence = None
        model_contribution = np.zeros(conf_len)
        for i, dat in enumerate(data_list):
            imgs = Image.open(self._test_file_path + dat)
            imgs = self.png_to_tensor(imgs)
            imgs = transform(imgs).pin_memory().cuda()

            # predict a SMILES sequence from each predictor
            preds_raw = ray_prediction(imgs)

            preds = []
            for p in preds_raw:
                # predicted sequence token value
                SMILES_predicted_sequence = list(
                    torch.argmax(p.detach().cpu(), -1).numpy())[0]
                # converts prediction to readable format from sequence token value
                decoded_sequences = decode_predicted_sequences(
                    SMILES_predicted_sequence, reversed_token_map)
                preds.append(decoded_sequences)
            del preds_raw

            # fault check: does each prediction satisfy the SMILES format?
            ms = {}
            for idx, p in enumerate(preds):
                m = MolFromSmiles(p)
                if m is not None:
                    ms.update({idx: m})

            if len(ms) == 0:  # no decoded sequence matches the SMILES format
                print('decode fail')
                fault_counter += 1
                sequence = preds[0]

            elif len(ms) == 1:  # exactly one decoded sequence matches the SMILES format
                sequence = preds[list(ms.keys())[0]]

            else:  # two or more decoded sequences match the SMILES format
                # ensemble the results
                ms_to_fingerprint = [RDKFingerprint(x) for x in ms.values()]
                combination_of_smiles = list(combinations(ms_to_fingerprint, 2))
                ms_to_index = list(ms)
                combination_index = list(combinations(ms_to_index, 2))

                # calculate similarity score
                smiles_dict = loop.run_until_complete(
                    process_async_calculate_similarity(combination_of_smiles,
                                                       combination_index))

                # sort the pairs by similarity score
                smiles_dict = sorted(smiles_dict.items(),
                                     key=(lambda x: x[1]),
                                     reverse=True)

                if smiles_dict[0][1] == 1.0:  # if the top similarity score is 1.0, assume those predictions are correct
                    sequence = preds[smiles_dict[0][0][0]]
                else:
                    score_board = np.zeros(conf_len)
                    # use a distinct loop variable; the original reused `i` and
                    # clobbered the outer loop index over data_list
                    for rank, (idx, value) in enumerate(smiles_dict):
                        score_board[list(idx)] += conf_len - rank

                    pick = int(np.argmax(score_board))  # choose the index with the highest score
                    sequence = preds[pick]  # pick the decoded sequence
                    model_contribution[pick] += 1  # log which model was used

            print('{} sequence: {}'.format(i, sequence))
            # print('decode_time:', time.time() - start_time)

            submission.loc[submission['file_name'] == dat, 'SMILES'] = sequence
            del preds

        loop.close()
        print('total fault:', fault_counter)
        print('model contribution:', model_contribution)
        return submission
Example #9
    def rdk_fp(self):
        """
        Reads the csv file, generates RDK fingerprints (2048 bits) for each compound,
        and returns the features as a numpy array.

        Parameters
        ----------
        input smiles : str
            Compounds are given in the form of SMILES strings

        Returns
        -------
        np.array
            Features are returned as a numpy array
        """
        df = pd.read_csv(self.csv_path)
        smiles_list = df['Smiles'].tolist()

        fingerprints = []
        not_found = []

        for i in tqdm(range(len(smiles_list))):
            try:
                mol = Chem.MolFromSmiles(smiles_list[i])
                fp = RDKFingerprint(mol, nBitsPerHash=1)
                # unpack the bit string into a uint8 array of 0s and 1s
                # (np.frombuffer replaces the deprecated np.fromstring)
                bits_array = np.frombuffer(fp.ToBitString().encode(), 'u1') - ord('0')
                fingerprints.append(bits_array)
            except Exception:
                fingerprints.append(np.nan)
                not_found.append(i)

        df.drop(not_found, axis=0, inplace=True)

        print('Number of FPs not found: {}'.format(len(not_found)))

        df.reset_index(drop=True, inplace=True)
        labelencoder = LabelEncoder()
        Y = labelencoder.fit_transform(df['Label'].values)
        Y = Y.reshape(Y.shape[0], 1)

        print('Output shape: {}'.format(Y.shape))

        fp_array = np.asarray(fingerprints, dtype=object)
        X = np.delete(fp_array, not_found, axis=0)
        X = np.vstack(X).astype(np.float32)

        print('Input shape: {}'.format(X.shape))

        final_array = np.concatenate((X, Y), axis=1)

        # Removing rows, from final_array, where duplicate FPs are present
        final_array_slice = final_array[:, 0:(final_array.shape[1] - 1)]
        _, unq_row_indices = np.unique(final_array_slice,
                                       return_index=True,
                                       axis=0)
        final_array_unique = final_array[unq_row_indices]

        print(
            'Number of Duplicate FPs: {}'.format(final_array.shape[0] -
                                                 final_array_unique.shape[0]))

        print('Final Numpy array shape: {}'.format(final_array_unique.shape))
        print('Type of final array: {}'.format(type(final_array_unique)))
        final_numpy_array = np.asarray((final_array_unique), dtype=np.float32)

        return final_numpy_array
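The bit-unpacking idiom from the loop above, shown on a single fingerprint ('CCO' is an illustrative input; RDKFingerprint defaults to 2048 bits):

fp = RDKFingerprint(Chem.MolFromSmiles('CCO'), nBitsPerHash=1)
bits = np.frombuffer(fp.ToBitString().encode(), 'u1') - ord('0')
print(bits.shape, int(bits.sum()))  # (2048,) and the number of set bits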
Example #10
 def fingerprint(fingerprint_method, keyword_dict, smi):
     mol = AllChem.MolFromSmiles(smi)
     if fingerprint_method == "Morgan":
         return AllChem.GetMorganFingerprintAsBitVect(mol, **keyword_dict)
     else:
         return RDKFingerprint(mol)
Example #11
def main_bo(vocab_path,
            model_path,
            save_dir,
            descriptor_path,
            sampling=60,
            iterations=2,
            epochs=2,
            hidden_size=450,
            latent_size=56,
            depthT=20,
            depthG=3,
            random_seed=1,
            pIC50_weight=0,
            QED_weight=0,
            logP_weight=1,
            SA_weight=1,
            cycle_weight=1,
            sim_weight=0):
    if os.path.isdir(save_dir) is False:
        os.makedirs(save_dir)

    with open(vocab_path) as f:
        vocab = [x.strip("\r\n ") for x in f]
    vocab = Vocab(vocab)

    model = JTNNVAE(vocab, hidden_size, latent_size, depthT, depthG)
    model.load_state_dict(torch.load(model_path))
    model = model.cuda()

    if sim_weight != 0:
        df_100 = pd.read_csv(
            '../data/covid/MPro_6wqf_A_ProteaseData_smiles_top100.csv')
        ms_db = [MolFromSmiles(x) for x in df_100['SMILES'].tolist()]
        fps_db = [RDKFingerprint(x) for x in ms_db]

    # We set the random seed
    np.random.seed(random_seed)

    # Path of the files
    latent_feature = os.path.join(descriptor_path, './latent_features.txt')
    target = os.path.join(descriptor_path, './targets.txt')
    logp_value = os.path.join(descriptor_path, './logP_values.txt')
    QED_value = os.path.join(descriptor_path, './QED_values.txt')
    pIC50_value = os.path.join(descriptor_path, './pIC50_values.txt')
    sa_score = os.path.join(descriptor_path, './SA_scores.txt')
    cycle_score = os.path.join(descriptor_path, './cycle_scores.txt')
    sim_score = os.path.join(descriptor_path, './sim_values.txt')

    # We load the data (the target y is negated)
    X = np.loadtxt(latent_feature)
    y = -np.loadtxt(target)
    y = y.reshape((-1, 1))

    n = X.shape[0]
    permutation = np.random.choice(n, n, replace=False)

    # np.int was removed in recent NumPy versions; the built-in int behaves identically here
    X_train = X[permutation, :][0:int(np.round(0.9 * n)), :]
    X_test = X[permutation, :][int(np.round(0.9 * n)):, :]

    y_train = y[permutation][0:int(np.round(0.9 * n))]
    y_test = y[permutation][int(np.round(0.9 * n)):]

    np.random.seed(random_seed)

    pIC50_values = np.loadtxt(pIC50_value)
    QED_values = np.loadtxt(QED_value)
    logP_values = np.loadtxt(logp_value)
    SA_scores = np.loadtxt(sa_score)
    cycle_scores = np.loadtxt(cycle_score)
    sim_values = np.loadtxt(sim_score)

    iteration = 0
    while iteration < iterations:
        # We fit the GP
        np.random.seed(iteration * random_seed)
        M = 500
        sgp = SparseGP(X_train, 0 * X_train, y_train, M)
        # TODO: test hyperparameters
        sgp.train_via_ADAM(X_train,
                           0 * X_train,
                           y_train,
                           X_test,
                           X_test * 0,
                           y_test,
                           minibatch_size=10 * M,
                           max_iterations=5,
                           learning_rate=0.001)

        pred, uncert = sgp.predict(X_test, 0 * X_test)
        error = np.sqrt(np.mean((pred - y_test)**2))
        testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
        print('Test RMSE: ', error)
        print('Test ll: ', testll)

        pred, uncert = sgp.predict(X_train, 0 * X_train)
        error = np.sqrt(np.mean((pred - y_train)**2))
        trainll = np.mean(
            sps.norm.logpdf(pred - y_train, scale=np.sqrt(uncert)))
        print('Train RMSE: ', error)
        print('Train ll: ', trainll)

        # We pick the next 60 inputs
        next_inputs = sgp.batched_greedy_ei(sampling, np.min(X_train, 0),
                                            np.max(X_train, 0))
        # joblib.dump(next_inputs, './next_inputs.pkl')
        # next_inputs = joblib.load('./next_inputs.pkl')
        valid_smiles = []
        new_features = []
        for i in tqdm(range(sampling)):
            all_vec = next_inputs[i].reshape((1, -1))
            tree_vec, mol_vec = np.hsplit(all_vec, 2)
            tree_vec = create_var(torch.from_numpy(tree_vec).float())
            mol_vec = create_var(torch.from_numpy(mol_vec).float())
            tree_vecs, _ = model.rsample(tree_vec, model.T_mean, model.T_var)
            mol_vecs, _ = model.rsample(mol_vec, model.G_mean, model.G_var)
            s = model.decode(tree_vecs, mol_vecs, prob_decode=False)
            if s is not None:
                valid_smiles.append(s)
                new_features.append(all_vec)

        print(len(valid_smiles), "molecules are found")
        # keep only the features whose latent vectors decoded to valid SMILES,
        # so X_train and y_train stay aligned in the concatenation below
        new_features = np.vstack(new_features)
        save_object(
            valid_smiles,
            os.path.join(save_dir, "valid_smiles{}.pkl".format(iteration)))

        scores = []
        current_pIC50_normalized = []
        if pIC50_weight != 0:
            current_pIC50 = calculate_pIC50(valid_smiles)
            for i in range(len(valid_smiles)):
                # collect the normalized values in a list so they can be indexed below
                current_pIC50_normalized.append(
                    (current_pIC50[i] -
                     np.mean(pIC50_values)) / np.std(pIC50_values))
        else:
            for i in range(len(valid_smiles)):
                current_pIC50_normalized.append(0)

        for i in range(len(valid_smiles)):
            if sim_weight != 0:
                current_sim_value = similarity_search(fps_db, valid_smiles[i])
                current_sim_value_normalized = (
                    current_sim_value -
                    np.mean(sim_values)) / np.std(sim_values)
            else:
                current_sim_value_normalized = 0

            current_QED_value = QED.qed(MolFromSmiles(valid_smiles[i]))
            current_log_P_value = Descriptors.MolLogP(
                MolFromSmiles(valid_smiles[i]))
            current_SA_score = -sascorer.calculateScore(
                MolFromSmiles(valid_smiles[i]))
            cycle_list = nx.cycle_basis(
                nx.Graph(
                    rdmolops.GetAdjacencyMatrix(MolFromSmiles(
                        valid_smiles[i]))))
            if len(cycle_list) == 0:
                cycle_length = 0
            else:
                cycle_length = max([len(j) for j in cycle_list])
            if cycle_length <= 6:
                cycle_length = 0
            else:
                cycle_length = cycle_length - 6

            current_cycle_score = -cycle_length

            current_SA_score_normalized = (
                current_SA_score - np.mean(SA_scores)) / np.std(SA_scores)

            current_QED_value_normalized = (
                current_QED_value - np.mean(QED_values)) / np.std(QED_values)

            current_log_P_value_normalized = (
                current_log_P_value -
                np.mean(logP_values)) / np.std(logP_values)

            current_cycle_score_normalized = (
                current_cycle_score -
                np.mean(cycle_scores)) / np.std(cycle_scores)

            score = (SA_weight * current_SA_score_normalized +
                     QED_weight * current_QED_value_normalized +
                     logP_weight * current_log_P_value_normalized +
                     cycle_weight * current_cycle_score_normalized +
                     pIC50_weight * current_pIC50_normalized[i] +
                     sim_weight * current_sim_value_normalized)

            scores.append(-score)  # target is always minused

        print(valid_smiles)
        print(scores)

        save_object(scores,
                    os.path.join(save_dir, "scores{}.pkl".format(iteration)))

        if len(new_features) > 0:
            X_train = np.concatenate([X_train, new_features], 0)
            y_train = np.concatenate([y_train, np.array(scores)[:, None]], 0)

        iteration += 1
Example #12
def rdkit_fingerprint(mol, **kwargs):
    return list(RDKFingerprint(mol, **kwargs).GetOnBits())
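A usage sketch: GetOnBits returns the indices of the set bits, so rdkit_fingerprint yields a sparse representation; keyword arguments such as fpSize are forwarded to RDKFingerprint:

on_bits = rdkit_fingerprint(MolFromSmiles('CCO'), fpSize=512)
print(on_bits[:5])  # indices of the first few set bits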
Example #13
def scorer(smiles, pIC50_weight, QED_weight, logP_weight, SA_weight, cycle_weight, sim_weight):
    smiles_rdkit = []
    for i in range(len(smiles)):
        smiles_rdkit.append(
            MolToSmiles(MolFromSmiles(smiles[i]), isomericSmiles=True))

    # calculate IC50 of training set using MPNN
    #IC50_scores=calculateScore(smiles_rdkit)

    # read in IC50 of training set from database
    IC50_scores = np.loadtxt('../data/covid/ic50-fulltrain.txt')
    IC50_scores = list(IC50_scores)
    IC50_scores_normalized = (np.array(IC50_scores) - np.mean(IC50_scores)) / np.std(IC50_scores)

    if sim_weight != 0:
        # df_100 = list of molecules to match similarity
        df_100 = pd.read_csv('../data/covid/MPro_6wqf_A_ProteaseData_smiles_top100.csv')
        ms_db = [MolFromSmiles(x) for x in df_100['SMILES'].tolist()]
        fps_db = [RDKFingerprint(x) for x in ms_db]

        sim_values = []
        for i in range(len(smiles)):
            sim_values.append(
                similarity_search(fps_db, smiles_rdkit[i]))
        sim_values_normalized = (
            np.array(sim_values) - np.mean(sim_values)) / np.std(sim_values)
    else:
        sim_values, sim_values_normalized = [], []
        for i in range(len(smiles)):
            sim_values.append(0)
            sim_values_normalized.append(0)
        sim_values_normalized = np.array(sim_values_normalized)
    
    logP_values = []
    for i in range(len(smiles)):
        logP_values.append(
            Descriptors.MolLogP(MolFromSmiles(smiles_rdkit[i])))

    qed_values = []
    for i in range(len(smiles)):
        qed_values.append(
            QED.qed(MolFromSmiles(smiles_rdkit[i])))

    SA_scores = []
    for i in range(len(smiles)):
        SA_scores.append(
            -sascorer.calculateScore(MolFromSmiles(smiles_rdkit[i])))

    cycle_scores = []
    for i in range(len(smiles)):
        cycle_list = nx.cycle_basis(
            nx.Graph(
                rdmolops.GetAdjacencyMatrix(MolFromSmiles(smiles_rdkit[i]))))
        if len(cycle_list) == 0:
            cycle_length = 0
        else:
            cycle_length = max([len(j) for j in cycle_list])
        if cycle_length <= 6:
            cycle_length = 0
        else:
            cycle_length = cycle_length - 6
        cycle_scores.append(-cycle_length)

    SA_scores_normalized = (
        np.array(SA_scores) - np.mean(SA_scores)) / np.std(SA_scores)
    qed_values_normalized = (
        np.array(qed_values) - np.mean(qed_values)) / np.std(qed_values)
    cycle_scores_normalized = (
        np.array(cycle_scores) - np.mean(cycle_scores)) / np.std(cycle_scores)
    logP_values_normalized = (
        np.array(logP_values) - np.mean(logP_values)) / np.std(logP_values)

    targets = (pIC50_weight * IC50_scores_normalized + 
               logP_weight * logP_values_normalized +
               SA_weight * SA_scores_normalized +
               QED_weight * qed_values_normalized +
               cycle_weight * cycle_scores_normalized + 
               sim_weight * sim_values_normalized)
   
    return (IC50_scores, qed_values, logP_values, SA_scores, cycle_scores, sim_values, targets)