Code example #1
from rdkit.Chem import AllChem, RDKFingerprint


def fingerprint(fingerprint_method, keyword_dict, smi):
    # build a fingerprint for a SMILES string with the requested method
    mol = AllChem.MolFromSmiles(smi)
    if fingerprint_method == "Morgan":
        return AllChem.GetMorganFingerprintAsBitVect(mol, **keyword_dict)
    # anything else falls back to RDKit's topological fingerprint
    return RDKFingerprint(mol)
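For reference, a call might look like the following; radius and nBits are standard keyword arguments of RDKit's GetMorganFingerprintAsBitVect, and the benzene SMILES is just an illustrative input, not something taken from this repository:

keyword_dict = {"radius": 2, "nBits": 2048}
fp = fingerprint("Morgan", keyword_dict, "c1ccccc1")
print(fp.GetNumOnBits())  # number of set bits in the 2048-bit vector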
Code example #2
from rdkit.Chem import RDKFingerprint

def rdkit_fingerprint(mol, **kwargs):
    return list(RDKFingerprint(mol, **kwargs).GetOnBits())  # on-bit indices
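A quick sanity check of the output; fpSize is a standard keyword of RDKFingerprint, and the input molecule here is only illustrative:

from rdkit.Chem import MolFromSmiles

mol = MolFromSmiles("CCO")  # ethanol, as a toy input
bits = rdkit_fingerprint(mol, fpSize=2048)
print(len(bits), bits[:5])  # number of on bits and the first few indices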
Code example #3
import os

import networkx as nx
import numpy as np
import pandas as pd
import scipy.stats as sps
import torch
from rdkit.Chem import (QED, Descriptors, MolFromSmiles, RDKFingerprint,
                        rdmolops)
from tqdm import tqdm

# Vocab, JTNNVAE, create_var, SparseGP, sascorer, save_object,
# calculate_pIC50 and similarity_search come from this repository's modules.


def main_bo(vocab_path,
            model_path,
            save_dir,
            descriptor_path,
            sampling=60,
            iterations=2,
            epochs=2,
            hidden_size=450,
            latent_size=56,
            depthT=20,
            depthG=3,
            random_seed=1,
            pIC50_weight=0,
            QED_weight=0,
            logP_weight=1,
            SA_weight=1,
            cycle_weight=1,
            sim_weight=0):
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    with open(vocab_path) as f:
        vocab = [x.strip("\r\n ") for x in f]
    vocab = Vocab(vocab)

    model = JTNNVAE(vocab, hidden_size, latent_size, depthT, depthG)
    model.load_state_dict(torch.load(model_path))
    model = model.cuda()

    if sim_weight != 0:
        df_100 = pd.read_csv(
            '../data/covid/MPro_6wqf_A_ProteaseData_smiles_top100.csv')
        ms_db = [MolFromSmiles(x) for x in df_100['SMILES'].tolist()]
        fps_db = [RDKFingerprint(x) for x in ms_db]

    # We seed the random number generator
    np.random.seed(random_seed)

    # Paths of the precomputed descriptor files
    latent_feature = os.path.join(descriptor_path, 'latent_features.txt')
    target = os.path.join(descriptor_path, 'targets.txt')
    logp_value = os.path.join(descriptor_path, 'logP_values.txt')
    QED_value = os.path.join(descriptor_path, 'QED_values.txt')
    pIC50_value = os.path.join(descriptor_path, 'pIC50_values.txt')
    sa_score = os.path.join(descriptor_path, 'SA_scores.txt')
    cycle_score = os.path.join(descriptor_path, 'cycle_scores.txt')
    sim_score = os.path.join(descriptor_path, 'sim_values.txt')

    # We load the data (y is negated, since the GP minimizes the target)
    X = np.loadtxt(latent_feature)
    y = -np.loadtxt(target)
    y = y.reshape((-1, 1))

    n = X.shape[0]
    permutation = np.random.choice(n, n, replace=False)

    X_train = X[permutation, :][0:int(np.round(0.9 * n)), :]
    X_test = X[permutation, :][int(np.round(0.9 * n)):, :]

    y_train = y[permutation][0:int(np.round(0.9 * n))]
    y_test = y[permutation][int(np.round(0.9 * n)):]

    np.random.seed(random_seed)

    pIC50_values = np.loadtxt(pIC50_value)
    QED_values = np.loadtxt(QED_value)
    logP_values = np.loadtxt(logp_value)
    SA_scores = np.loadtxt(sa_score)
    cycle_scores = np.loadtxt(cycle_score)
    sim_values = np.loadtxt(sim_score)

    iteration = 0
    while iteration < iterations:
        # We fit the GP
        np.random.seed(iteration * random_seed)
        M = 500
        sgp = SparseGP(X_train, 0 * X_train, y_train, M)
        # TODO: test hyperparameters
        sgp.train_via_ADAM(X_train,
                           0 * X_train,
                           y_train,
                           X_test,
                           X_test * 0,
                           y_test,
                           minibatch_size=10 * M,
                           max_iterations=5,
                           learning_rate=0.001)

        pred, uncert = sgp.predict(X_test, 0 * X_test)
        error = np.sqrt(np.mean((pred - y_test)**2))
        testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
        print('Test RMSE: ', error)
        print('Test ll: ', testll)

        pred, uncert = sgp.predict(X_train, 0 * X_train)
        error = np.sqrt(np.mean((pred - y_train)**2))
        trainll = np.mean(
            sps.norm.logpdf(pred - y_train, scale=np.sqrt(uncert)))
        print('Train RMSE: ', error)
        print('Train ll: ', trainll)

        # We pick the next 'sampling' inputs (60 by default) via greedy EI
        next_inputs = sgp.batched_greedy_ei(sampling, np.min(X_train, 0),
                                            np.max(X_train, 0))
        # joblib.dump(next_inputs, './next_inputs.pkl')
        # next_inputs = joblib.load('./next_inputs.pkl')
        valid_smiles = []
        new_features = []
        for i in tqdm(range(sampling)):
            all_vec = next_inputs[i].reshape((1, -1))
            tree_vec, mol_vec = np.hsplit(all_vec, 2)
            tree_vec = create_var(torch.from_numpy(tree_vec).float())
            mol_vec = create_var(torch.from_numpy(mol_vec).float())
            tree_vecs, _ = model.rsample(tree_vec, model.T_mean, model.T_var)
            mol_vecs, _ = model.rsample(mol_vec, model.G_mean, model.G_var)
            s = model.decode(tree_vecs, mol_vecs, prob_decode=False)
            if s is not None:
                valid_smiles.append(s)
                new_features.append(all_vec)

        print(len(valid_smiles), "valid molecules were decoded")
        # Keep only the latent vectors whose decoding succeeded, so that
        # new_features stays aligned with valid_smiles (and with scores).
        if new_features:
            new_features = np.vstack(new_features)
        else:
            new_features = np.empty((0, X_train.shape[1]))
        save_object(
            valid_smiles,
            os.path.join(save_dir, "valid_smiles{}.pkl".format(iteration)))

        scores = []
        if pIC50_weight != 0:
            current_pIC50 = calculate_pIC50(valid_smiles)
            # normalize each predicted pIC50 against the training distribution
            current_pIC50_normalized = [
                (p - np.mean(pIC50_values)) / np.std(pIC50_values)
                for p in current_pIC50
            ]
        else:
            current_pIC50_normalized = [0] * len(valid_smiles)

        for i in range(len(valid_smiles)):
            if sim_weight != 0:
                current_sim_value = similarity_search(fps_db, valid_smiles[i])
                current_sim_value_normalized = (
                    current_sim_value -
                    np.mean(sim_values)) / np.std(sim_values)
            else:
                current_sim_value_normalized = 0

            current_QED_value = QED.qed(MolFromSmiles(valid_smiles[i]))
            current_log_P_value = Descriptors.MolLogP(
                MolFromSmiles(valid_smiles[i]))
            current_SA_score = -sascorer.calculateScore(
                MolFromSmiles(valid_smiles[i]))
            cycle_list = nx.cycle_basis(
                nx.Graph(
                    rdmolops.GetAdjacencyMatrix(MolFromSmiles(
                        valid_smiles[i]))))
            if len(cycle_list) == 0:
                cycle_length = 0
            else:
                cycle_length = max([len(j) for j in cycle_list])
            if cycle_length <= 6:
                cycle_length = 0
            else:
                cycle_length = cycle_length - 6

            current_cycle_score = -cycle_length

            current_SA_score_normalized = (
                current_SA_score - np.mean(SA_scores)) / np.std(SA_scores)

            current_QED_value_normalized = (
                current_QED_value - np.mean(QED_values)) / np.std(QED_values)

            current_log_P_value_normalized = (
                current_log_P_value -
                np.mean(logP_values)) / np.std(logP_values)

            current_cycle_score_normalized = (
                current_cycle_score -
                np.mean(cycle_scores)) / np.std(cycle_scores)

            score = (SA_weight * current_SA_score_normalized +
                     QED_weight * current_QED_value_normalized +
                     logP_weight * current_log_P_value_normalized +
                     cycle_weight * current_cycle_score_normalized +
                     pIC50_weight * current_pIC50_normalized[i] +
                     sim_weight * current_sim_value_normalized)

            scores.append(-score)  # the target is negated, since the GP minimizes

        print(valid_smiles)
        print(scores)

        save_object(scores,
                    os.path.join(save_dir, "scores{}.pkl".format(iteration)))

        if len(new_features) > 0:
            X_train = np.concatenate([X_train, new_features], 0)
            y_train = np.concatenate([y_train, np.array(scores)[:, None]], 0)

        iteration += 1
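Two helpers called by main_bo, save_object and similarity_search, are not shown in these excerpts. Below is a minimal sketch of plausible implementations, inferred from the call sites alone (a pickle dump, and the best Tanimoto similarity against the reference fingerprints); treat both as assumptions rather than the repository's actual code:

import pickle

from rdkit import DataStructs
from rdkit.Chem import MolFromSmiles, RDKFingerprint


def save_object(obj, filename):
    # plain pickle dump, matching the .pkl extensions used above
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)


def similarity_search(fps_db, smi):
    # highest Tanimoto similarity between smi and the reference set
    fp = RDKFingerprint(MolFromSmiles(smi))
    return max(DataStructs.BulkTanimotoSimilarity(fp, list(fps_db)))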
Code example #4
import networkx as nx
import numpy as np
import pandas as pd
from rdkit.Chem import (QED, Descriptors, MolFromSmiles, MolToSmiles,
                        RDKFingerprint, rdmolops)

# sascorer and similarity_search come from this repository's own modules.


def scorer(smiles, pIC50_weight, QED_weight, logP_weight,
           SA_weight, cycle_weight, sim_weight):
    smiles_rdkit = []
    for i in range(len(smiles)):
        smiles_rdkit.append(
            MolToSmiles(MolFromSmiles(smiles[i]), isomericSmiles=True))

    # calculate IC50 of the training set using the MPNN:
    # IC50_scores = calculateScore(smiles_rdkit)

    # ...or read in IC50 of the training set from the database
    IC50_scores = np.loadtxt('../data/covid/ic50-fulltrain.txt').tolist()
    IC50_scores_normalized = (np.array(IC50_scores) -
                              np.mean(IC50_scores)) / np.std(IC50_scores)

    if sim_weight != 0:
        # df_100 = list of molecules to match similarity
        df_100 = pd.read_csv('../data/covid/MPro_6wqf_A_ProteaseData_smiles_top100.csv')
        ms_db = [MolFromSmiles(x) for x in df_100['SMILES'].tolist()]
        fps_db = [RDKFingerprint(x) for x in ms_db]

        sim_values = []
        for i in range(len(smiles)):
            sim_values.append(
                similarity_search(fps_db, smiles_rdkit[i]))
        sim_values_normalized = (
            np.array(sim_values) - np.mean(sim_values)) / np.std(sim_values)
    else:
        # similarity is not used: fill both with zeros of matching length
        sim_values = [0] * len(smiles)
        sim_values_normalized = np.zeros(len(smiles))
    
    logP_values = []
    for i in range(len(smiles)):
        logP_values.append(
            Descriptors.MolLogP(MolFromSmiles(smiles_rdkit[i])))

    qed_values = []
    for i in range(len(smiles)):
        qed_values.append(
            QED.qed(MolFromSmiles(smiles_rdkit[i])))

    SA_scores = []
    for i in range(len(smiles)):
        SA_scores.append(
            -sascorer.calculateScore(MolFromSmiles(smiles_rdkit[i])))

    cycle_scores = []
    for i in range(len(smiles)):
        cycle_list = nx.cycle_basis(
            nx.Graph(
                rdmolops.GetAdjacencyMatrix(MolFromSmiles(smiles_rdkit[i]))))
        if len(cycle_list) == 0:
            cycle_length = 0
        else:
            cycle_length = max([len(j) for j in cycle_list])
        if cycle_length <= 6:
            cycle_length = 0
        else:
            cycle_length = cycle_length - 6
        cycle_scores.append(-cycle_length)

    SA_scores_normalized = (
        np.array(SA_scores) - np.mean(SA_scores)) / np.std(SA_scores)
    qed_values_normalized = (
        np.array(qed_values) - np.mean(qed_values)) / np.std(qed_values)
    cycle_scores_normalized = (
        np.array(cycle_scores) - np.mean(cycle_scores)) / np.std(cycle_scores)
    logP_values_normalized = (
        np.array(logP_values) - np.mean(logP_values)) / np.std(logP_values)

    targets = (pIC50_weight * IC50_scores_normalized + 
               logP_weight * logP_values_normalized +
               SA_weight * SA_scores_normalized +
               QED_weight * qed_values_normalized +
               cycle_weight * cycle_scores_normalized + 
               sim_weight * sim_values_normalized)
   
    return (IC50_scores, qed_values, logP_values, SA_scores, cycle_scores, sim_values, targets)
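For orientation, scorer would typically be run over the training SMILES to produce the *_values.txt / *_scores.txt files that main_bo loads; below is a hypothetical invocation using the same default weights as main_bo (the np.savetxt step is an assumption about how those descriptor files are produced, not code from the repository):

(IC50_scores, qed_values, logP_values, SA_scores,
 cycle_scores, sim_values, targets) = scorer(
    smiles, pIC50_weight=0, QED_weight=0, logP_weight=1,
    SA_weight=1, cycle_weight=1, sim_weight=0)

np.savetxt('targets.txt', targets)          # read back as 'targets.txt' in main_bo
np.savetxt('logP_values.txt', logP_values)  # likewise for the other descriptors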