def __call__(self, smiles: str):
        """Return True when ``smiles`` passes every descriptor and pattern rule.

        The molecule must satisfy, in this order, the inclusive
        ``[min, max]`` ranges stored in ``self.rule_dict`` under the keys
        ``"MW"``, ``"LogP"``, ``"HBD"``, ``"HBA"`` and ``"TPSA"``, and must not
        match any pattern in ``self.rule_list`` more than its allowed count.

        :param smiles: SMILES string of the molecule to test
        :return: ``True`` if all rules pass, ``False`` on the first failed
            rule or when the SMILES cannot be parsed
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # An unparsable SMILES cannot satisfy any rule; reject it instead
            # of crashing inside the descriptor functions.
            return False

        # (rule_dict key, descriptor function) pairs, checked in order so the
        # first failing rule short-circuits the rest.
        descriptor_checks = (
            ("MW", MolWt),
            ("LogP", MolLogP),
            ("HBD", NumHDonors),
            ("HBA", NumHAcceptors),
            ("TPSA", TPSA),
        )
        for key, descriptor in descriptor_checks:
            bounds = self.rule_dict[key]
            if not (bounds[0] <= descriptor(mol) <= bounds[1]):
                return False

        # Each rule_list row is (pattern, maximum allowed matches, description).
        for patt, max_val, _desc in self.rule_list:
            if len(mol.GetSubstructMatches(patt)) > max_val:
                return False

        return True
Beispiel #2
0
 def evaluate(self, lst_in):
     """
     Check a single molecule against the structural alert rules
     :param lst_in: a two-item sequence [SMILES, Name]
     :return: [SMILES, Name, status] followed by six descriptor values
              (MolWt, MolLogP, NumHDonors, NumHAcceptors, TPSA,
              CalcNumRotatableBonds). status is "OK", the first violated
              rule as "<description> > <max>", or 'INVALID' (with -999 for
              every descriptor) when the SMILES cannot be parsed
     """
     smiles, name = lst_in
     mol = Chem.MolFromSmiles(smiles)
     if mol is None:
         return [smiles, name, 'INVALID'] + [-999] * 6

     descriptor_fns = (
         MolWt,
         MolLogP,
         NumHDonors,
         NumHAcceptors,
         TPSA,
         CalcNumRotatableBonds,
     )
     descriptors = [fn(mol) for fn in descriptor_fns]

     # Report only the first rule whose match count exceeds its limit.
     status = "OK"
     for patt, max_val, desc in self.rule_list:
         if len(mol.GetSubstructMatches(patt)) > max_val:
             status = desc + " > %d" % (max_val)
             break
     return [smiles, name, status] + descriptors
Beispiel #3
0
def reward_target_logp(mol, target, ratio=0.5, max=4):
    """
    Reward that peaks when the molecule's logP equals a target value
    :param mol: rdkit mol object
    :param target: float, the logP value to aim for
    :param ratio: float, divisor applied to the deviation from target
                  (smaller values penalise deviations more steeply)
    :param max: float, reward returned when logP equals target
    :return: float (-inf, max]
    """
    scaled_deviation = (MolLogP(mol) - target) / ratio
    return max - np.abs(scaled_deviation)
def worker(line):
    """Build the heavy-atom/logP key line for one ``<smiles> <cid>`` record.

    The first two whitespace-separated fields of ``line`` are taken as the
    SMILES and the compound id. The returned string is
    ``"<smiles> <cid> H<NN><M|P><LLL>\\n"`` where ``NN`` is the heavy-atom
    count capped at 99, the sign letter is ``M`` for a negative logP and ``P``
    otherwise, and ``LLL`` is the absolute value of ``scale_logp_value(logp)``.

    Returns ``None`` when the SMILES does not yield a usable molecule.
    Relies on ``remover`` and ``scale_logp_value`` being defined elsewhere in
    the module.
    """
    smiles, cid = line.strip().split()[:2]
    mol = MolFromSmiles(smiles)
    if not mol:
        return None

    # A '.' in the SMILES marks several fragments; hand those to the remover.
    if '.' in smiles:
        mol = remover.StripMol(mol)

    logp = MolLogP(mol)
    heavy_atoms = min(mol.GetNumHeavyAtoms(), 99)
    sign = 'P'
    if logp < 0.0:
        sign = 'M'
    scaled = abs(scale_logp_value(logp))
    return f'{smiles} {cid} H{heavy_atoms:02}{sign}{scaled:03}\n'
def get_normalized_values(
        fname='/home/bowen/pycharm_deployment_directory/rl_graph_generation/gym-molecule/gym_molecule/dataset/250k_rndm_zinc_drugs_clean.smi'):
    """Compute normalization statistics for SA, logP and cycle scores.

    Reads one SMILES per line from ``fname`` and, for every molecule,
    computes:

    * logP via ``MolLogP``
    * the negated synthetic-accessibility score (``-calculateScore``)
    * the negated cycle penalty: size of the largest ring in the NetworkX
      cycle basis minus 6, or 0 when no ring has more than 6 atoms

    Args:
        fname: path to the SMILES file. Defaults to the location that was
            previously hard-coded, so existing zero-argument calls behave
            as before.

    Returns:
        tuple ``(SA mean, SA std, logP mean, logP std, cycle mean, cycle std)``

    Raises:
        ValueError: if the file contains no lines.

    Note:
        The result of ``Chem.MolFromSmiles`` is not checked, so an
        unparsable SMILES still fails inside the RDKit calls.
    """
    with open(fname) as f:
        smiles = [line.strip() for line in f]
    if not smiles:
        # Statistics over an empty data set are meaningless (mean/std -> nan).
        raise ValueError(f"no SMILES found in {fname!r}")

    logP_values = []
    SA_scores = []
    cycle_scores = []
    for smi in smiles:
        # Keep the canonical-SMILES round trip so every descriptor is computed
        # on the re-parsed canonical molecule, but parse it once per molecule
        # rather than once per descriptor.
        mol = Chem.MolFromSmiles(Chem.MolToSmiles(Chem.MolFromSmiles(smi)))

        logP_values.append(MolLogP(mol))
        SA_scores.append(-calculateScore(mol))

        cycle_list = nx.cycle_basis(
            nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol)))
        largest_cycle = max((len(cycle) for cycle in cycle_list), default=0)
        # Only rings larger than six atoms are penalised.
        cycle_scores.append(-max(largest_cycle - 6, 0))

    return (np.mean(SA_scores), np.std(SA_scores),
            np.mean(logP_values), np.std(logP_values),
            np.mean(cycle_scores), np.std(cycle_scores))
Beispiel #6
0
def get_normalized_values(smi_filename):
    """Compute normalization statistics for SA, logP and cycle scores.

    Reads one SMILES per line from ``smi_filename`` and, for every molecule,
    computes:

    * logP via ``MolLogP``
    * the negated synthetic-accessibility score (``-calculateScore``)
    * the negated cycle penalty: size of the largest ring in the NetworkX
      cycle basis minus 6, or 0 when no ring has more than 6 atoms

    Args:
        smi_filename: path to a text file with one SMILES string per line.

    Returns:
        tuple ``(SA mean, SA std, logP mean, logP std, cycle mean, cycle std)``

    Raises:
        ValueError: if the file contains no lines.

    Note:
        The result of ``Chem.MolFromSmiles`` is not checked, so an
        unparsable SMILES still fails inside the RDKit calls.
    """
    with open(smi_filename) as f:
        smiles = [line.strip() for line in f]
    if not smiles:
        # Statistics over an empty data set are meaningless (mean/std -> nan).
        raise ValueError(f"no SMILES found in {smi_filename!r}")

    logP_values = []
    SA_scores = []
    cycle_scores = []
    for smi in smiles:
        # Keep the canonical-SMILES round trip so every descriptor is computed
        # on the re-parsed canonical molecule, but parse it once per molecule
        # rather than once per descriptor.
        mol = Chem.MolFromSmiles(Chem.MolToSmiles(Chem.MolFromSmiles(smi)))

        logP_values.append(MolLogP(mol))
        SA_scores.append(-calculateScore(mol))

        cycle_list = nx.cycle_basis(
            nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol)))
        largest_cycle = max((len(cycle) for cycle in cycle_list), default=0)
        # Only rings larger than six atoms are penalised.
        cycle_scores.append(-max(largest_cycle - 6, 0))

    return (np.mean(SA_scores), np.std(SA_scores),
            np.mean(logP_values), np.std(logP_values),
            np.mean(cycle_scores), np.std(cycle_scores))
Beispiel #7
0
    def reward_penalized_log_p(mol):
        """
        Penalized logP reward: logP combined with a synthetic-accessibility
        term and a large-ring term, as described in (Kusner et al. 2017).
        Each term is standardised with the mean/std of the
        250k_rndm_zinc_drugs_clean.smi dataset and the three are summed.
        Returns 0 if MolLogP raises ValueError or the SA scorer raises
        ZeroDivisionError.
        Code taken from implementation of:
        You, Jiaxuan, et al. "Graph Convolutional Policy Network for Goal-Directed
        Molecular Graph Generation." arXiv preprint arXiv:1806.02473 (2018).
        https://github.com/bowenliu16/rl_graph_generation
        """
        try:
            log_p = MolLogP(mol)
        except ValueError:
            return 0
        try:
            # SA is negated so that a higher value means easier to synthesise.
            SA = -sascorer.calculateScore(mol)
        except ZeroDivisionError:
            return 0

        # cycle score: penalise only the atoms beyond a six-membered ring
        rings = nx.cycle_basis(
            nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol)))
        largest_ring = max((len(ring) for ring in rings), default=0)
        cycle_score = -max(largest_ring - 6, 0)

        # normalization constants, statistics from 250k_rndm_zinc_drugs_clean.smi
        logP_mean, logP_std = 2.4570953396190123, 1.434324401111988
        SA_mean, SA_std = -3.0525811293166134, 0.8335207024513095
        cycle_mean, cycle_std = -0.0485696876403053, 0.2860212110245455

        return ((log_p - logP_mean) / logP_std
                + (SA - SA_mean) / SA_std
                + (cycle_score - cycle_mean) / cycle_std)
Beispiel #8
0
def smiles_reaction_matrix(smarts, *sources, **kwargs):
    """Lazily yield filtered reaction products as text lines.

    Builds a reaction from ``smarts``, loads one SMILES list per entry in
    ``sources`` and iterates over ``reaction_matrix(reaction, *lists)``.
    A product is emitted only when its molecular weight is ``<= molValue``
    and its logP is ``< logValue``.

    Keyword options:
        sep: field separator of the output line (default ``' '``)
        molValue: molecular-weight ceiling, coerced with ``int`` (default 400)
        logValue: logP ceiling, coerced with ``float`` (default 4.0)

    Yields:
        ``"<smiles><sep><ids><sep><mwt><sep><logp>\\n"`` where ``ids`` is the
        ``_Name`` property of each reactant joined with ``'.'``.
    """
    sep = kwargs.setdefault('sep', ' ')
    max_mol_wt = int(kwargs.get('molValue', 400))
    max_logp = float(kwargs.get('logValue', 4.0))

    reaction = ReactionFromSmarts(smarts)
    smiles_lists = [load_smiles_file(source) for source in sources]

    for reactants, product in reaction_matrix(reaction, *smiles_lists):
        product_id = '.'.join(r.GetProp("_Name") for r in reactants)
        for mol in product:
            smiles = MolToSmiles(mol, isomericSmiles=True)
            mol.UpdatePropertyCache(strict=False)
            # The hydrogen-added copy is not used afterwards; the call itself
            # is retained so any error it raises still surfaces here.
            AddHs(mol, addCoords=True)

            mwt = MolWt(mol)
            if not mwt <= max_mol_wt:
                continue
            logp = MolLogP(mol)
            if not logp < max_logp:
                continue
            fields = (smiles, product_id, str(mwt), str(logp))
            yield sep.join(fields) + "\n"
Beispiel #9
0
def get_task(name_of_task: str):
    """
    Map a task name (e.g. a command-line argument) to its PropertyEvaluator.

    Recognised names:
      * 'qed'         -- evaluator wrapping ``qed``
      * 'sas'         -- synthetic accessibility score of the parsed SMILES
      * 'pen_logp'    -- ``MolLogP`` of the parsed SMILES (as written, no
                         penalty terms are applied here)
      * 'guac_<name>' -- the Guacamol task ``<name>`` looked up via GuacTask

    Raises NotImplementedError for any other name.
    """
    if name_of_task == 'qed':
        return PropertyEvaluator(qed)

    if name_of_task == 'sas':
        return PropertyEvaluator(
            lambda smiles:
            [sascorer.calculateScore(Chem.MolFromSmiles(smiles))])

    if name_of_task == 'pen_logp':
        return PropertyEvaluator(
            lambda smiles: [MolLogP(Chem.MolFromSmiles(smiles))])

    guac_prefix = 'guac_'
    if name_of_task.startswith(guac_prefix):
        guac_name = name_of_task[len(guac_prefix):]
        task = GuacTask.get_name_to_enum()[guac_name]
        return GuacTask.get_guac_property_eval(task)

    raise NotImplementedError(f"{name_of_task} is not implemented.")
Beispiel #10
0
def penalized_logp(mol):
    """
    Penalized logP: logP combined with a synthetic-accessibility term and a
    large-ring term, as described in (Kusner et al. 2017). Each of the three
    terms is standardised with the statistics of the
    250k_rndm_zinc_drugs_clean.smi dataset before they are summed.

    Args:
        mol: Rdkit mol object

    :rtype:
        :class:`float`
    """

    def standardise(value, mean, std):
        return (value - mean) / std

    log_p = MolLogP(mol)
    # Negated so that a higher value means easier to synthesise.
    SA = -calculateScore(mol)

    # cycle score: number of atoms by which the largest ring exceeds six
    ring_sizes = [len(ring) for ring in nx.cycle_basis(nx.Graph(
        Chem.rdmolops.GetAdjacencyMatrix(mol)))]
    excess = max(ring_sizes) - 6 if ring_sizes else 0
    if excess < 0:
        excess = 0
    cycle_score = -excess

    # normalization constants, statistics from 250k_rndm_zinc_drugs_clean.smi
    normalized_log_p = standardise(
        log_p, 2.4570953396190123, 1.434324401111988)
    normalized_SA = standardise(
        SA, -3.0525811293166134, 0.8335207024513095)
    normalized_cycle = standardise(
        cycle_score, -0.0485696876403053, 0.2860212110245455)

    return normalized_log_p + normalized_SA + normalized_cycle
Beispiel #11
0
def cal_prop(q, return_dict_prop):
    """Queue worker: compute molecular properties until 'DONE' is received.

    Repeatedly takes ``(idx, smi)`` items from ``q`` and stores
    ``[logP, SAS, QED, MW, TPSA]`` for the parsed SMILES in
    ``return_dict_prop[idx]``. The loop ends, and the function returns
    ``None``, when the item taken from the queue equals ``'DONE'``.

    :param q: queue-like object whose ``get()`` returns work items
    :param return_dict_prop: mapping that receives the results keyed by idx
    """
    # iter(callable, sentinel) keeps calling q.get() until it yields 'DONE'.
    for idx, smi in iter(q.get, 'DONE'):
        mol = Chem.MolFromSmiles(smi)
        return_dict_prop[idx] = [
            MolLogP(mol),
            sascorer.calculateScore(mol),
            qed(mol),
            MolWt(mol),
            TPSA(mol),
        ]
Beispiel #12
0
def penalized_logp(molecule):
    """Return the penalized logP of a molecule.

    Refactored from
    https://github.com/wengong-jin/icml18-jtnn/blob/master/bo/run_bo.py
    See Junction Tree Variational Autoencoder for Molecular Graph Generation
    https://arxiv.org/pdf/1802.04364.pdf
    Section 3.2

    The value is defined as::

        y(m) = logP(m) - SA(m) - cycle(m)

    where logP(m) is the logP of the molecule, SA(m) its synthetic
    accessibility score and cycle(m) the size of its largest ring minus six,
    floored at zero.

    Args:
      molecule: Chem.Mol. A molecule.

    Returns:
      Float. The penalized logP value.
    """
    log_p = MolLogP(molecule)
    sa_penalty = sascorer.calculateScore(molecule)

    # Rings of six atoms or fewer carry no penalty.
    ring_penalty = get_largest_ring_size(molecule) - 6
    if ring_penalty < 0:
        ring_penalty = 0

    return log_p - sa_penalty - ring_penalty
Beispiel #13
0
    canvas.addCanvasText('%s\r\nMolWt: %g\tTPSA: %g' % (s, MolWt(m), TPSA(m)),
                         pos, font)

    with open('xx' + s + '.png', 'w') as f:
        canvas.flush()
        img.save(f)


# Script entry point: render three example molecules with drawmol(), then exit.
if __name__ == '__main__':
    drawmol('CN1CCC[C@H]1c2cccnc2')
    drawmol('CC(=O)OC1=CC=CC=C1C(=O)O')
    drawmol('O1C=C[C@H]([C@H]1O2)c3c2cc(OC)c4c3OC(=O)C5=C4CCC(=O)5')
    sys.exit(0)

    # NOTE: everything below is unreachable because of the sys.exit(0) above.
    # It is kept as reference snippets only; the names it uses (m, molname,
    # smiles, mol) are not defined anywhere in this block.

    # sample code to use new drawing API (older rdkit do not have DrawString)
    from rdkit.Chem.AllChem import EmbedMolecule
    assert EmbedMolecule(m) >= 0
    x = Draw.rdMolDraw2D.MolDraw2DSVG(200, 250)
    x.DrawMolecule(m)
    x.DrawString('Test String', 20, 200)
    x.FinishDrawing()
    print(x.GetDrawingText())

    # sample code to generate a legend: optional molecule name, then the
    # SMILES and a line each of physicochemical and H-bond/rotor descriptors
    legstr = ''
    if molname:
        legstr += molname + '\n'
    legstr += '%s\nWt=%g LogP=%g TPSA=%g\nHBA=%d HBD=%d RotBond=%d\n' % \
        (smiles, MolWt(mol), MolLogP(mol), TPSA(mol),
         NumHAcceptors(mol), NumHDonors(mol), NumRotatableBonds(mol))
Beispiel #14
0
 def _calculate_phys_chem_property(self, mol):
     """Return the property this component scores: ``MolLogP`` of ``mol``."""
     log_p = MolLogP(mol)
     return log_p
Beispiel #15
0
def calc_logp(smiles_string):
    """Parse a SMILES string (e.g. ``C1CCCCC1``) and return its LogP.

    The parse result is passed straight to ``MolLogP`` without a validity
    check.
    """
    return MolLogP(Chem.MolFromSmiles(smiles_string))
        print('Usage: python rdkit_hlogp_batch.py <smiles> <batch_size>')
        exit()
    
    BATCH_SIZE = int(sys.argv[2])
    hlogp_list = list()
    with open(sys.argv[1]) as smiles_file:
        file_lines = smiles_file.readlines()
        for line in file_lines:
            if line.strip():
                smiles, cid = str(line).strip().split()[:2]
                mol = MolFromSmiles(smiles)
                remover = SaltRemover()
                res, deleted = remover.StripMolWithDeleted(mol)
                if res is not None:
                    res.SetProp('_Name', cid)
                logp = MolLogP(res)
                num_heavy_atoms = res.GetNumHeavyAtoms()
                if num_heavy_atoms > 99:
                    num_heavy_atoms = 99
                scaled_logp = scale_logp_value(logp)
                if logp < 0.0:
                    sign = 'M'
                    #remove the minus sign so it's not printed
                    scaled_logp = scaled_logp * -1
                else:
                    sign = 'P'
                key_string = 'H{:02}{}{:03}'.format(num_heavy_atoms, sign, scaled_logp)

                #store in list up to batch size, then write out to new file
                final_string = '{0} {1} {2}\n'.format(smiles, cid, key_string)
                hlogp_list.append(final_string)
Beispiel #17
0
def get_logp_score(states):
    """Return the ``MolLogP`` value of each state as a list.

    ``states`` may be a list of molecules or a single molecule; anything that
    is not a ``list`` is treated as one state, so the result is always a list.
    """
    batch = states if isinstance(states, list) else [states]
    scores = []
    for state in batch:
        scores.append(MolLogP(state))
    return scores
def calc_logp(smiles_string):
    """Return ``MolLogP`` for the molecule parsed from ``smiles_string``.

    The parse result is passed straight to ``MolLogP`` without a validity
    check.
    """
    parsed = Chem.MolFromSmiles(smiles_string)
    log_p = MolLogP(parsed)
    return log_p
Beispiel #19
0
                check=False
        else:
            print(char1, char2, "error")
            error = True
            break
        X_d[istring+1]=j
        Y_d[istring]=j
        istring+=1
    if error:
        continue
    for i in range(istring,seq_length):
        X_d[i+1]=char_dict['Y']
        Y_d[i]=char_dict['Y']

    m = Chem.MolFromSmiles(smiles)
    logP = MolLogP(m)
    SAS = sascorer.calculateScore(m)
    tpsa0 = TPSA(m)
    Xdata+=[X_d]
    Ydata+=[Y_d]
    Ldata+=[istring+1]
    cdd=[arr[1], logP/10.0, SAS/10.0, tpsa0/150.0]
    Pdata+=[cdd] #affinity classification

# Convert the per-molecule lists accumulated above into NumPy arrays with
# fixed dtypes: X/Y index sequences and lengths as int32, the property
# vectors (label plus scaled logP, SAS and TPSA) as float32.
Xdata = np.asarray(Xdata,dtype="int32")
Ydata = np.asarray(Ydata,dtype="int32")
Ldata = np.asarray(Ldata,dtype="int32")
Pdata = np.asarray(Pdata,dtype="float32")
# Sanity check: print the shape of each array.
print(Xdata.shape,Ydata.shape,Ldata.shape,Pdata.shape)

# presumably the directory the arrays are saved to -- its use is not visible
# in this part of the file; TODO confirm
data_dir2="./data/EGFR_property/"