def calc_properties(smiles_list):
    """Tabulate logP and exact molecular weight for a sequence of SMILES.

    Every entry of ``smiles_list`` is parsed with ``Chem.MolFromSmiles``.
    The result is a ``pandas.DataFrame`` with one row per input, in input
    order, and the columns ``'logp'`` and ``'mw'``.
    """
    rows = []
    for entry in smiles_list:
        molecule = Chem.MolFromSmiles(entry)
        # MolLogP receives True as its second positional argument.
        rows.append((MolLogP(molecule, True), ExactMolWt(molecule)))

    return pd.DataFrame({
        'logp': [logp for logp, _ in rows],
        'mw': [weight for _, weight in rows],
    })
Esempio n. 2
0
    def score(self, smiles):
        """Return the normalised composite score of a SMILES string.

        The score is the sum of three terms, each shifted and scaled by the
        corresponding ``self._*_mean`` / ``self._*_std`` attributes:

        * logP from ``MolLogP`` (``-1000`` when it cannot be computed),
        * the negated result of ``sascorer.calculateScore``,
        * a ring penalty: minus the number of atoms by which the largest
          cycle in the cycle basis exceeds six.

        Parameters
        ----------
        smiles
            SMILES text handed to ``Chem.MolFromSmiles``.

        Returns
        -------
        ``sa_score_norm + logp_norm + cycle_score_norm``
        """
        mol = Chem.MolFromSmiles(smiles)

        try:
            logp = MolLogP(mol)
        except Exception:
            # Sentinel for molecules whose logP cannot be computed. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            logp = -1000

        sa_score = -sascorer.calculateScore(mol)

        # Largest ring size found in the cycle basis of the bond graph.
        cycle_list = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
        if cycle_list:
            largest_cycle = max(len(cycle) for cycle in cycle_list)
        else:
            largest_cycle = 0
        # Rings of up to six atoms are free; larger ones cost one point per
        # extra atom.
        cycle_score = -max(largest_cycle - 6, 0)

        sa_score_norm = (sa_score - self._sa_mean) / self._sa_std
        logp_norm = (logp - self._logp_mean) / self._logp_std
        cycle_score_norm = (cycle_score - self._cycle_mean) / self._cycle_std

        return sa_score_norm + logp_norm + cycle_score_norm
Esempio n. 3
0
    def eval_function(text):
        """Score a generated token sequence by its scaled, min-max normalised logP.

        ``text`` is joined into one string, decoded and sanitised into a
        SMILES string. If decoding or sanitising raises, the score is ``0``.
        Otherwise logP is multiplied by ``factor`` and rescaled with
        ``logp_min`` / ``logp_max``; these three names are free variables
        defined outside this function.

        The generated string and its score are logged at INFO level.
        """
        generated = ''.join(text)
        try:
            decoded = DeepSMILESLanguageModelUtils.decode(generated,
                                                          start='<s>',
                                                          end='</s>')
            # Keep the sanitised SMILES so that decode/sanitize are not run
            # a second time below.
            smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
        except Exception:
            return 0

        # extracted = DeepSMILESLanguageModelUtils.extract(generated)
        # tokenized = DeepSMILESTokenizer(extracted)
        # len_score = len(tokenized.get_tokens()) / (text_length - 1)  # provide more reward for longer text sequences

        mol = Chem.MolFromSmiles(smiles)
        logp = factor * MolLogP(mol)
        # normalize logP between 0 and 1
        logp_score = (logp - logp_min) / (logp_max - logp_min)

        score = logp_score  # (logp_score * 0.5) + (len_score * 0.5)

        logger.info("%s, %s" % (generated, str(score)))
        return score
Esempio n. 4
0
 def save(self,
          force_insert=False,
          force_update=False,
          using=None,
          update_fields=None,
          *args,
          **kwargs):
     """Fill the RDKit-derived fields from ``self.smiles`` and save the row.

     When ``self.smiles`` is truthy the molecule, mol block, exact weight,
     logP, H-bond acceptor/donor counts, TPSA and rotatable-bond count are
     computed and the instance is saved; the formula and ``bfp`` are set
     afterwards. A ``ValueError`` or ``TypeError`` raised during that work
     is reported on stdout and otherwise ignored. The instance is saved
     again at the end in every case.

     NOTE(review): ``force_insert``, ``force_update``, ``using`` and
     ``update_fields`` are accepted but not forwarded; only ``*args`` and
     ``**kwargs`` reach the parent ``save``. Confirm this is intended.
     """
     smiles = self.smiles
     if smiles:
         try:
             self.mol = Chem.MolFromSmiles(smiles)
             self.mol_block = Chem.MolToMolBlock(self.mol)
             self.mol_weight = Descriptors.ExactMolWt(self.mol)
             self.alogp = MolLogP(self.mol)
             self.hba = NumHAcceptors(self.mol)
             self.hbd = NumHDonors(self.mol)
             self.psa = Chem.MolSurf.TPSA(self.mol)
             self.rtb = NumRotatableBonds(self.mol)
             # First save happens before formula/bfp are assigned; they are
             # written by the final save below.
             super(Compound, self).save(*args, **kwargs)
             self.formula = Chem.rdMolDescriptors.CalcMolFormula(self.mol)
             self.bfp = MORGANBV_FP(Value(smiles))
         except (ValueError, TypeError):
             # Parenthesised single-argument form is valid on both Python 2
             # and Python 3 (the bare print statement is not).
             print("Error when storing mol object")
     super(Compound, self).save(*args, **kwargs)
Esempio n. 5
0
    def save(self,
             force_insert=False,
             force_update=False,
             using=None,
             update_fields=None):
        """Set the ChEMBL URL and RDKit-derived fields, then save the row.

        The inspect URL is built from ``self.molecule_chembl_id`` and the
        instance is saved once. When ``self.molecule_smile`` is truthy the
        molecule, mol block, exact weight, logP, H-bond acceptor/donor
        counts, TPSA and rotatable-bond count are computed and the instance
        is saved again; the formula and ``bfp`` are set afterwards. A
        ``ValueError`` or ``TypeError`` during that work is printed and
        otherwise ignored. A final save runs in every case.

        NOTE(review): none of ``force_insert``, ``force_update``, ``using``
        or ``update_fields`` is forwarded to the parent ``save`` calls.
        """
        self.molecule_chembl_id_url = 'https://www.ebi.ac.uk/chembl/compound/inspect/{}'.format(
            self.molecule_chembl_id)
        # Initial save, before any descriptor is computed.
        super(ChEMBL_small_molecule, self).save()

        smiles = self.molecule_smile

        if smiles:
            try:
                self.mol = Chem.MolFromSmiles(smiles)
                self.mol_block = Chem.MolToMolBlock(self.mol)
                self.mol_weight = Descriptors.ExactMolWt(self.mol)
                self.alogp = MolLogP(self.mol)
                self.hba = NumHAcceptors(self.mol)
                self.hbd = NumHDonors(self.mol)
                self.psa = Chem.MolSurf.TPSA(self.mol)
                self.rtb = NumRotatableBonds(self.mol)
                # Saved here before formula/bfp are assigned; those two are
                # written by the final save below.
                super(ChEMBL_small_molecule, self).save()
                self.formula = Chem.rdMolDescriptors.CalcMolFormula(self.mol)
                self.bfp = MORGANBV_FP(Value(smiles))
            except (ValueError, TypeError):
                # Best-effort: report and fall through to the final save.
                print('Error when storing mol object')
                pass
        super(ChEMBL_small_molecule, self).save()
Esempio n. 6
0
def get_logP(smi_list):
    """Return the ``MolLogP`` value of every SMILES string in ``smi_list``.

    The output list has the same length and order as the input.
    """
    return [MolLogP(Chem.MolFromSmiles(smiles)) for smiles in smi_list]
Esempio n. 7
0
    def preprocess(dataset, dir_input):
        """Featurise ``dataset`` and write the resulting arrays under ``dir_input``.

        Reads the ``'SMILES'``, ``'Adducts'`` and ``'CCS'`` entries of
        ``dataset``. For every SMILES without a ``'.'`` it builds
        fingerprints, an adjacency matrix and a ten-value descriptor list,
        then standardises the CCS values with their overall mean and
        standard deviation. Outputs are ``Smiles.txt``, several ``.npy``
        files and ``fingerprint_dict.pickle``.

        ``radius`` and ``fingerprint_dict`` are free variables defined
        outside this function.

        ``dir_input`` is used as a plain string prefix for every output
        path, so it needs to end with a path separator for the files to
        land inside the created directory.
        """

        train_smiles = list(dataset['SMILES'])
        train_adducts = dataset['Adducts']
        train_ccs = list(dataset['CCS'])

        # NOTE(review): adducts are encoded for every row, whereas rows whose
        # SMILES contain '.' are skipped below — the saved 'adducts' array may
        # therefore have more rows than the other outputs; confirm downstream.
        adducts_encoder = AdductToOneHotEncoder()
        adducts_encoder.fit(train_adducts)
        adducts = adducts_encoder.transform(train_adducts)

        Smiles, molecules, adjacencies, properties, descriptors = '', [], [], [], []
        for i, smi in enumerate(train_smiles):
            # Skip SMILES containing '.'.
            if '.' in smi:
                continue
            # Round-trip through RDKit before re-parsing and adding hydrogens.
            smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
            mol = Chem.MolFromSmiles(smi)
            mol = Chem.AddHs(mol)
            atoms = create_atoms(mol)
            i_jbond_dict = create_ijbonddict(mol)

            fingerprints = extract_fingerprints(atoms, i_jbond_dict, radius)
            adjacency = create_adjacency(mol)

            Smiles += smi + '\n'
            molecules.append(fingerprints)
            adjacencies.append(adjacency)
            properties.append([[train_ccs[i]]])
            descriptors.append([
                ExactMolWt(mol),
                MolLogP(mol),
                GetFormalCharge(mol),
                CalcNumRings(mol),
                CalcNumRotatableBonds(mol),
                CalcLogS(mol),
                AcidCount(mol),
                BaseCount(mol),
                APolar(mol),
                BPolar(mol)
            ])

        # Standardise the targets; mean and std are saved so the scaling can
        # be undone later.
        properties = np.array(properties)
        mean, std = np.mean(properties), np.std(properties)
        properties = np.array((properties - mean) / std)

        os.makedirs(dir_input, exist_ok=True)

        with open(dir_input + 'Smiles.txt', 'w') as f:
            f.write(Smiles)
        np.save(dir_input + 'molecules', molecules)
        np.save(dir_input + 'adducts', adducts)
        np.save(dir_input + 'adjacencies', adjacencies)
        np.save(dir_input + 'properties', properties)
        np.save(dir_input + 'descriptors', descriptors)
        np.save(dir_input + 'mean', mean)
        np.save(dir_input + 'std', std)
        dump_dictionary(fingerprint_dict,
                        dir_input + 'fingerprint_dict.pickle')
Esempio n. 8
0
def calc_properties(smi):
    """Compute logP, TPSA, MW and MR for a single molecule.

    ``smi`` must provide ``.numpy()``; that value is parsed with
    ``Chem.MolFromSmiles``. Returns a 4-tuple of NumPy arrays in the order
    (logP, TPSA, MW, MR).
    """
    mol = Chem.MolFromSmiles(smi.numpy())
    raw_values = [fn(mol) for fn in (MolLogP, CalcTPSA, ExactMolWt, MolMR)]
    return tuple(np.asarray(value) for value in raw_values)
Esempio n. 9
0
    def analyze(self, smiles: List[str], only_drugs=True) -> pd.DataFrame:
        """Build a table of descriptors and model scores for ``smiles``.

        The frame has the columns ``key`` (InChIKey), ``smiles``,
        ``weight``, ``logp``, ``hdonors``, ``hacceptors``, ``safety``,
        ``feasibility`` and ``bbbp`` (second entry of each row returned by
        ``self.bbbp.predict_proba``).

        With ``only_drugs`` set, rows are kept only if weight < 500,
        hdonors <= 5, hacceptors <= 10, logp <= 5, safety > 0.75 and
        feasibility > 0.75, and the index is reset.

        Raises:
            ValueError: if ``MolFromSmiles`` yields a falsy result for any
                entry of ``smiles``.
        """
        features = self.preprocessor.transform(smiles)

        # RDKit molecular properties, one tuple per input molecule
        descriptor_rows = []
        for candidate in smiles:
            mol = MolFromSmiles(candidate)
            if not mol:
                raise ValueError("Malformed molecule passed in to analyze")
            descriptor_rows.append((
                MolToInchiKey(mol),
                ExactMolWt(mol),
                MolLogP(mol),
                NumHDonors(mol),
                NumHAcceptors(mol),
            ))

        # Model scores
        safety = self.safety.predict(features)
        feasibility = self.feasibility.predict(features)
        bbbp = self.bbbp.predict_proba(features)

        columns = {
            "key": [row[0] for row in descriptor_rows],
            "smiles": smiles,
            "weight": [row[1] for row in descriptor_rows],
            "logp": [row[2] for row in descriptor_rows],
            "hdonors": [row[3] for row in descriptor_rows],
            "hacceptors": [row[4] for row in descriptor_rows],
            "safety": safety,
            "feasibility": feasibility,
            "bbbp": (i[1] for i in bbbp),
        }
        dataframe = pd.DataFrame(columns)

        if not only_drugs:
            return dataframe

        # Applied one after another, in this order, each on the rows that
        # survived the previous filter.
        row_filters = (
            # Lipinski's rules
            lambda frame: frame.weight < 500,
            lambda frame: frame.hdonors <= 5,
            lambda frame: frame.hacceptors <= 10,
            lambda frame: frame.logp <= 5,
            # Filter too toxic and infeasible compounds
            lambda frame: frame.safety > 0.75,
            lambda frame: frame.feasibility > 0.75,
        )
        for keep_rows in row_filters:
            dataframe = dataframe[keep_rows(dataframe)]

        return dataframe.reset_index(drop=True)
def process_smile(row):
    """Return ``(smiles, logP, length)`` for one line of SMILES text.

    ``row`` is stripped of surrounding whitespace and parsed with
    ``Chem.MolFromSmiles``; ``length`` is the number of characters in the
    stripped SMILES. If any step raises an ``Exception`` the result is
    ``(None, None, None)``.
    """
    try:
        smi = row.strip()
        logP = MolLogP(Chem.MolFromSmiles(smi))
        length = len(smi)
    except Exception:
        # Best-effort parsing: bad rows yield an all-None triple. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return None, None, None
    return smi, logP, length
Esempio n. 11
0
def check_lipinski(mol):
    """Check ``mol.rdmol`` against the descriptor limits and maybe cap it.

    Returns ``(True, False)`` when H-bond donors <= 5, H-bond acceptors <= 5
    and logP < 5, otherwise ``(False, False)``. When the limits are met and
    the molecular weight is at least 450, a random ``'terminal_fg'``
    functional group is joined onto ``mol`` first.

    NOTE(review): the acceptor limit used here is 5, whereas Lipinski's
    rule is usually stated with 10 — confirm this is intentional.
    """
    fgs = load_functional_groups()
    donors = Lipinski.NumHDonors(mol.rdmol)
    acceptors = Lipinski.NumHAcceptors(mol.rdmol)
    logp = MolLogP(mol.rdmol)
    weight = MolWt(mol.rdmol)

    within_limits = donors <= 5 and acceptors <= 5 and logp < 5
    if not within_limits:
        return False, False

    if weight >= 450:
        mol.join(fgs['terminal_fg'].get_random())
    return True, False
Esempio n. 12
0
def read_ZINC_smiles(file_name, num_mol):
    """Read the first ``num_mol`` SMILES of a file and compute their logP.

    Parameters
    ----------
    file_name
        Path of a text file with one SMILES per line.
    num_mol
        Number of leading lines to process; an ``IndexError`` is raised if
        the file has fewer lines.

    Returns
    -------
    (smi_list, logP_list)
        The stripped SMILES strings and a float NumPy array of their
        ``MolLogP`` values.
    """
    # Context manager closes the handle; it was previously left open.
    with open(file_name, 'r') as f:
        contents = f.readlines()

    smi_list = []
    logP_list = []

    for i in tqdm_notebook(range(num_mol), desc='Reading Data'):
        smi = contents[i].strip()
        m = Chem.MolFromSmiles(smi)
        smi_list.append(smi)
        logP_list.append(MolLogP(m))

    logP_list = np.asarray(logP_list).astype(float)

    return smi_list, logP_list
Esempio n. 13
0
def read_ZINC_smiles(num_mol):
    """Read the first ``num_mol`` SMILES of ``ZINC.smiles`` with logP and TPSA.

    The file is looked up in the current working directory and must hold
    one SMILES per line; an ``IndexError`` is raised if it has fewer than
    ``num_mol`` lines.

    Returns
    -------
    (smi_list, logP_list, tpsa_list)
        The stripped SMILES strings and two float NumPy arrays with the
        ``MolLogP`` and ``CalcTPSA`` values.
    """
    # Context manager closes the handle; it was previously left open.
    with open('ZINC.smiles', 'r') as f:
        contents = f.readlines()

    smi_list = []
    logP_list = []
    tpsa_list = []
    for i in range(num_mol):
        smi = contents[i].strip()
        m = Chem.MolFromSmiles(smi)
        smi_list.append(smi)
        logP_list.append(MolLogP(m))
        tpsa_list.append(CalcTPSA(m))

    logP_list = np.asarray(logP_list).astype(float)
    tpsa_list = np.asarray(tpsa_list).astype(float)

    return smi_list, logP_list, tpsa_list
Esempio n. 14
0
def get_global_features(mol):
    """Build the global descriptor vector of a molecule.

    Parameters
    ----------
    mol : rdkit mol

    Returns
    -------
    np.ndarray
        ``float32`` array holding, in order, the results of ``MolWt``,
        ``CalcTPSA``, ``MolLogP`` and ``NumHDonors`` for ``mol``.
    """
    # MW, TPSA, logP, n.hdonors
    descriptor_fns = (MolWt, CalcTPSA, MolLogP, NumHDonors)
    return np.array([fn(mol) for fn in descriptor_fns], dtype=np.float32)
Esempio n. 15
0
def calc_properties(smi):
    """Return standardised (logP, TPSA, MW, MR) for a single molecule.

    ``smi`` must provide ``.numpy()``; that value is parsed with
    ``Chem.MolFromSmiles``. logP is standardised directly with
    ``LOGP_MEAN`` / ``LOGP_STD``. TPSA, MW and MR are first transformed
    with ``log10(x + 1)`` and then standardised with their own module-level
    mean and standard deviation.
    """
    mol = Chem.MolFromSmiles(smi.numpy())

    logp_z = (np.asarray(MolLogP(mol)) - LOGP_MEAN) / LOGP_STD
    tpsa_z = (np.log10(np.asarray(CalcTPSA(mol)) + 1) - TPSA_MEAN) / TPSA_STD
    mw_z = (np.log10(np.asarray(ExactMolWt(mol)) + 1) - MW_MEAN) / MW_STD
    mr_z = (np.log10(np.asarray(MolMR(mol)) + 1) - MR_MEAN) / MR_STD

    return logp_z, tpsa_z, mw_z, mr_z
Esempio n. 16
0
def read_ZINC(num_mol):
    """Read ``num_mol`` molecules from ``ZINC.smiles`` as fingerprints and targets.

    The file is looked up in the current working directory and must hold
    one SMILES per line; an ``IndexError`` is raised if it has fewer than
    ``num_mol`` lines.

    Returns
    -------
    (fps, logP, tpsa)
        NumPy arrays with the Morgan bit-vector fingerprints (second
        argument 2), the ``MolLogP`` values and the ``CalcTPSA`` values.
    """
    # Context manager closes the handle; it was previously left open.
    with open('ZINC.smiles', 'r') as f:
        contents = f.readlines()

    fps = []
    logP = []
    tpsa = []
    for i in range(num_mol):
        smi = contents[i].strip()
        m = Chem.MolFromSmiles(smi)
        fp = AllChem.GetMorganFingerprintAsBitVect(m, 2)
        # Placeholder array that ConvertToNumpyArray fills with the bits.
        arr = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(fp, arr)
        fps.append(arr)
        logP.append(MolLogP(m))
        tpsa.append(CalcTPSA(m))

    fps = np.asarray(fps)
    logP = np.asarray(logP)
    tpsa = np.asarray(tpsa)

    return fps, logP, tpsa
Esempio n. 17
0
def calc_properties(smi):
    """Return standardised descriptors of a single molecule.

    :param smi: object providing ``.numpy()``; that value is parsed with
        ``Chem.MolFromSmiles``
    :return: logP, TPSA, MR, MW — logP is standardised directly, the other
        three after a ``log10(x + 1)`` transform, each with its own
        module-level mean and standard deviation
    """
    def _log_standardise(value, mean, std):
        # log10(x + 1) followed by z-scoring.
        return (np.log10(np.asarray(value) + 1) - mean) / std

    mol = Chem.MolFromSmiles(smi.numpy())

    logp_z = (np.asarray(MolLogP(mol)) - LOGP_MEAN) / LOGP_STD
    tpsa_z = _log_standardise(CalcTPSA(mol), TPSA_MEAN, TPSA_STD)
    mw_z = _log_standardise(ExactMolWt(mol), MW_MEAN, MW_STD)
    mr_z = _log_standardise(MolMR(mol), MR_MEAN, MR_STD)

    # MR is returned before MW.
    return logp_z, tpsa_z, mr_z, mw_z
Esempio n. 18
0
def read_ZINC(num_mol):
    """Read ``num_mol`` SMILES from ``../Data/logP/ZINC.smiles`` with logP and TPSA.

    The file must hold one SMILES per line; an ``IndexError`` is raised if
    it has fewer than ``num_mol`` lines.

    Returns
    -------
    (list_smi, logP, tpsa)
        The stripped SMILES strings and two float NumPy arrays with the
        ``MolLogP`` and ``CalcTPSA`` values.

    NOTE(review): Morgan fingerprints are computed into ``fps`` but are not
    part of the return value.
    """
    # Context manager closes the handle; it was previously left open.
    with open('../Data/logP/ZINC.smiles', 'r') as f:
        contents = f.readlines()

    list_smi = []
    fps = []
    logP = []
    tpsa = []
    for i in tqdm_notebook(range(num_mol)):
        smi = contents[i].strip()
        list_smi.append(smi)
        m = Chem.MolFromSmiles(smi)
        fp = AllChem.GetMorganFingerprintAsBitVect(m, 2)
        # Placeholder array that ConvertToNumpyArray fills with the bits.
        arr = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(fp, arr)
        fps.append(arr)
        logP.append(MolLogP(m))
        tpsa.append(CalcTPSA(m))

    fps = np.asarray(fps).astype(float)
    logP = np.asarray(logP).astype(float)
    tpsa = np.asarray(tpsa).astype(float)

    return list_smi, logP, tpsa
Esempio n. 19
0
    logps = []
    nhdonors = []
    values = []
    dataset = []

    for data in list(LABEL_GUIDE.keys()) + ["cyp"]:
        with open(os.path.join(DATA_PATH, data, f"data_{data}.pt"),
                  "rb") as handle:
            inchis, v = pickle.load(handle)

        values.extend(v)

        for inchi in tqdm(inchis):
            mol = MolFromInchi(inchi)
            mws.append(MolWt(mol))
            logps.append(MolLogP(mol))
            nhdonors.append(NumHDonors(mol))
            dataset.append(DATASET_GUIDE[data])

    df = pd.DataFrame({
        "Molecular weight (gr./mol)": mws,
        r"aLog$P$": logps,
        "No. hydrogen donors": nhdonors,
        "values": values,
        "dataset": dataset,
    })

    f, axs = plt.subplots(1, 3, figsize=(18, 6))

    axs[0].grid(alpha=0.5)
    axs[1].grid(alpha=0.5)
import numpy as np
from utils import *
from rdkit import Chem
from rdkit.Chem.Crippen import MolLogP

# Load the encoded molecules and the character table used to decode them.
molecules = np.load("./inputs/molecules.npy")
char = np.load("./inputs/char.npy")
N = 200000

# Decode the first N molecules to SMILES and compute logP for each.
logP_list = [
    MolLogP(Chem.MolFromSmiles(convert_to_smiles(molecules[i], char)))
    for i in range(N)
]

np.save("./inputs/logP.npy", np.asarray(logP_list))
Esempio n. 21
0
def cal_prop(s):
    """Return descriptors of the SMILES ``s``, or ``None`` if it cannot be parsed.

    On success the result is the tuple (``Chem.MolToSmiles`` output, exact
    molecular weight, logP, H-bond donor count, H-bond acceptor count,
    TPSA).
    """
    mol = Chem.MolFromSmiles(s)
    if mol is None:
        return None
    return (
        Chem.MolToSmiles(mol),
        ExactMolWt(mol),
        MolLogP(mol),
        CalcNumHBD(mol),
        CalcNumHBA(mol),
        CalcTPSA(mol),
    )
Esempio n. 22
0
def convert_to_clogp(SMILES):
    """Return ``MolLogP`` of the molecule that ``MS`` builds from ``SMILES``."""
    return MolLogP(MS(SMILES))
Esempio n. 23
0
def logP_benchmark(smi):
    """Return the logP of ``smi`` as a NumPy array.

    ``smi`` must provide ``.numpy()``; that value is parsed with
    ``Chem.MolFromSmiles`` before ``MolLogP`` is applied.
    """
    return np.asarray(MolLogP(Chem.MolFromSmiles(smi.numpy())))
Esempio n. 24
0
# Load the character table / vocabulary and restore the trained model.
_, _, char, vocab, _, _ = load_data(args.prop_file, args.seq_length)
vocab_size = len(char)

model = GNMTP(vocab_size,
             args
             )
model.restore(args.save_file)

# One row per batch element: the target properties parsed from the
# whitespace-separated command-line string, and the 'X' start token.
target_prop = np.array([[float(p) for p in args.target_prop.split()] for _ in range(args.batch_size)])
start_codon = np.array([np.array(list(map(vocab.get, 'X')))for _ in range(args.batch_size)])

smiles = []
for _ in range(args.num_iteration):
    latent_vector = np.random.normal(args.mean, args.stddev, (args.batch_size, args.latent_size))
    generated = model.sample(latent_vector, target_prop, start_codon, args.seq_length)
    smiles += [convert_to_smiles(sequence, char) for sequence in generated]

print ('number of trial : ', len(smiles))
# Keep the text before the first 'E' and drop duplicates.
smiles = list({s.split('E')[0] for s in smiles})
print ('number of generated smiles : ', len(smiles))
ms = [Chem.MolFromSmiles(s) for s in smiles]
ms = [m for m in ms if m is not None]
print ('number of valid smiles : ', len(ms))
with open(args.result_filename, 'w') as w:
    w.write('smiles\t MW\t LogP\t TPSA\n')
    for m in ms:
        try:
            w.write('%s\t%.3f\t%.3f\t%.3f\n' %(Chem.MolToSmiles(m), ExactMolWt(m), MolLogP(m), CalcTPSA(m)))
        except Exception:
            # Best-effort export: skip molecules whose descriptors fail, but
            # let KeyboardInterrupt/SystemExit through (a bare except did not).
            continue
Esempio n. 25
0
def lipinski_filter(smiles):
    """Return whether the SMILES passes the rule-of-five style limits.

    The checks, evaluated in this order with short-circuiting, are:
    logP <= 5, H-bond acceptors <= 10, H-bond donors <= 5 and
    100 <= exact molecular weight <= 500.
    """
    mol = MolFromSmiles(smiles)
    if not MolLogP(mol) <= 5:
        return False
    if not NumHAcceptors(mol) <= 10:
        return False
    if not NumHDonors(mol) <= 5:
        return False
    return 100 <= ExactMolWt(mol) <= 500
Esempio n. 26
0
def get_logP(mol):
    ''' clogP or LogP: return the MolLogP value of ``mol``. '''
    return MolLogP(mol)
Esempio n. 27
0
        s for s in smiles[:20000] if Chem.MolFromSmiles(s).GetNumAtoms() < 50
    ]

print('Number of smiles:', len(smiles))

# Number of leading SMILES to label, and the list of their logP targets.
num_data = 20000
Y = []

# Time the logP computation over the selected SMILES.
st = time.time()
for s in smiles[:num_data]:
    m = Chem.MolFromSmiles(s)
    logp = MolLogP(m)
    Y.append(logp)
end = time.time()

print(f'Time:{(end-st):.3f}')

#Dataset

from torch.utils.data import Dataset, DataLoader

from rdkit.Chem.rdmolops import GetAdjacencyMatrix


class MolDataset(Dataset):
Esempio n. 28
0
        score = logp_score  # (logp_score * 0.5) + (len_score * 0.5)

        logger.info("%s, %s" % (generated, str(score)))
        return score

    # mcts = LanguageModelMCTSWithUCB1(lm, width, text_length, eval_function)
    mcts = LanguageModelMCTSWithPUCT(lm,
                                     width,
                                     text_length,
                                     eval_function,
                                     cpuct=5)
    state = start_state

    logger.info("beginning search...")
    mcts.search(state, num_simulations)

    best = mcts.get_best_sequence()

    generated_text = ''.join(best[0])
    logger.info("generated text: %s (score: %s, perplexity: %s)" %
                (generated_text, str(best[1]), lm.perplexity(generated_text)))

    decoded = DeepSMILESLanguageModelUtils.decode(generated_text,
                                                  start='<s>',
                                                  end='</s>')
    smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)

    mol = Chem.MolFromSmiles(smiles)
    logp = MolLogP(mol)
    logger.info("SMILES: %s, logP: %s" % (smiles, logp))
Esempio n. 29
0
# Random-sampling baseline: draw 1000 strings from an empty language model
# and keep the valid molecule with the lowest logP.
lm = EmptyDeepSMILESLanguageModel(vocab, n=6)

current_best_score = None
current_best_smiles = None


def beats_current(score):
    """Return True when ``score`` is lower than the best score so far."""
    return score < current_best_score


for i in range(1000):
    generated = lm.generate(num_chars=25, text_seed="<s>")
    try:
        decoded = DeepSMILESLanguageModelUtils.decode(generated,
                                                      start='<s>',
                                                      end='</s>')
        sanitized = DeepSMILESLanguageModelUtils.sanitize(decoded)

        mol = Chem.MolFromSmiles(sanitized)
        logp_score = MolLogP(mol)
    except Exception:
        # Samples that fail to decode, sanitise or score are skipped, but
        # the failure is now visible at debug level instead of being
        # silently discarded.
        logger.debug("discarded: %s", generated, exc_info=True)
        continue

    logger.info("successful: %s , score: %s" %
                (sanitized, str(logp_score)))

    if current_best_score is None or beats_current(logp_score):
        current_best_score = logp_score
        current_best_smiles = sanitized

logger.info("best: %s , score: %s" %
            (current_best_smiles, str(current_best_score)))
Esempio n. 30
0
from rdkit import Chem
from rdkit.Chem.Crippen import MolLogP

# Each input line holds an id and two SMILES; write the id followed by the
# logP of both molecules, skipping lines where either SMILES fails to parse.
with open('../id_smiles.txt') as source, open('data.txt', 'w') as sink:
    for record in source:
        mol_id, smiles_a, smiles_b = record.split()
        mol_a = Chem.MolFromSmiles(smiles_a)
        mol_b = Chem.MolFromSmiles(smiles_b)
        if mol_a is None or mol_b is None:
            continue
        logp_a = MolLogP(mol_a)
        logp_b = MolLogP(mol_b)
        sink.write('\t'.join((mol_id, str(logp_a), str(logp_b))) + '\n')