Example #1
from rdkit import Chem
from read_dataset import readStr_qm9
# train_valid_split is a project-local helper (its module is not shown in this snippet)


def check_drigoni(download_path):

    def conv(smile):
        # Canonicalize a SMILES string via an RDKit round-trip.
        return Chem.MolToSmiles(Chem.MolFromSmiles(smile))

    print('reading data...')
    raw_data = train_valid_split(download_path)
    data = [conv(data_item['smiles']) for data_item in raw_data['valid']]
    data.extend([conv(data_item['smiles']) for data_item in raw_data['train']])


    print('reading dataset')
    dataset = readStr_qm9()

    print('len dataset:' + str(len(dataset)) + "  len data: " + str(len(data)))

    # Count split molecules that are missing from the dataset.
    count = 0
    saw = 0
    for data_item in data:
        if data_item not in dataset:
            count += 1
        saw += 1
    print('end: ' + str(count) + " saw = " + str(saw))


    # Count dataset molecules that are missing from the split.
    count = 0
    saw = 0
    for data_item in dataset:
        if data_item not in data:
            count += 1
        saw += 1
    print('end: ' + str(count) + " saw = " + str(saw))
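The two membership loops above scan a Python list for every item, which is quadratic overall. A minimal sketch of the same check with set arithmetic (assuming data and dataset are the lists built in check_drigoni; note that sets count unique SMILES, so the totals can differ from the loops if the lists contain duplicates):

# hypothetical speed-up, not part of the original function
split_set = set(data)
qm9_set = set(dataset)
print('in split but not in dataset: %d' % len(split_set - qm9_set))
print('in dataset but not in split: %d' % len(qm9_set - split_set))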
Example #2
def createSubSetQM9(n=5000):
    import numpy as np
    from read_dataset import readStr_qm9

    D = readStr_qm9()
    np.random.shuffle(D)
    subSet = D[:n]

    with open("qm9_sub" + str(n) + ".smi", "w") as f:
        for l in subSet:
            f.write(l)
            f.write("\n")
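A short usage sketch (a hypothetical call, assuming read_dataset is on the path): write a 1000-molecule subset, then read it back.

createSubSetQM9(n=1000)
with open("qm9_sub1000.smi") as f:
    subset = [line.strip() for line in f]
print(len(subset))  # expected: 1000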
Example #3
def main():
    decoded_file = GRAMMAR_WEIGHTS.split(".")[0] + "_decRes.txt"
    priors_file = GRAMMAR_WEIGHTS.split(".")[0] + "_priorsRes.txt"
    generation_file = GRAMMAR_WEIGHTS.split(".")[0] + "_generationRes.txt"
    grammar_model = molecule_vae.Qm9GrammarModel(GRAMMAR_WEIGHTS)

    XTE = readStr_qm9()
    XTE = XTE[0:5000]
    # remember to comment/uncomment the corresponding line in the molecule_vae file
    decoded_result = reconstruction(grammar_model, XTE)
    save_decoded_results(XTE, decoded_result, decoded_file)
    # decoded_priors = prior(grammar_model)
    # save_decoded_priors(decoded_priors, priors_file)
    decoded_generation = generation(grammar_model)
    save_decoded_priors(decoded_generation, generation_file)
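GRAMMAR_WEIGHTS.split(".")[0] truncates at the first dot anywhere in the path, which breaks for paths like ./weights/model.h5. A sketch of a more robust variant using os.path.splitext (a hypothetical refactor, not the original code):

import os

base, _ = os.path.splitext(GRAMMAR_WEIGHTS)  # strips only the trailing extension
decoded_file = base + "_decRes.txt"
priors_file = base + "_priorsRes.txt"
generation_file = base + "_generationRes.txt"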
Example #4
def main():
    from att_model_proxy import AttMolProxy as ProxyModel
    from att_model_proxy import cmd_args
    # build the model and compute the decoded results
    model = ProxyModel()
    # update where to save
    decoded_file = cmd_args.save_dir + '/decoded_results.txt'

    # reading smiles test set
    smiles_list = []  # avoids a NameError if smiles_file is neither 'qm9' nor 'zinc'
    if cmd_args.smiles_file == 'qm9':
        smiles_list = readStr_qm9()
    elif cmd_args.smiles_file == 'zinc':
        smiles_list = read_zinc()

    # nb_smiles is a module-level constant in the original file
    XTE = smiles_list[0:nb_smiles]

    decoded_result = reconstruct(model, XTE)
    decoded_result = np.array(decoded_result)
    save_decoded_results(XTE, decoded_result, decoded_file)
Example #5
def train_valid_split_drigoni():
    import numpy as np
    from rdkit import Chem
    from rdkit.Chem import QED
    from read_dataset import readStr_qm9

    print('reading dataset')
    dataset = readStr_qm9()[5000:]

    valid_idx = np.random.randint(0, high=len(dataset), size=round(len(dataset)*0.1))

    raw_data = {'train': [], 'valid': []}  # accumulators for the train/valid splits
    file_count=0
    for i, smiles in enumerate(dataset):
        val = QED.qed(Chem.MolFromSmiles(smiles))
        if i not in valid_idx:
            raw_data['train'].append({'smiles': smiles, 'QED': val})
        else:
            raw_data['valid'].append({'smiles': smiles, 'QED': val})
        file_count += 1
        if file_count % 2000 == 0:
            print('finished reading: %d' % file_count, end='\r')
    return raw_data
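Note that np.random.randint samples with replacement, so valid_idx can contain duplicate indices and the validation split can come out slightly smaller than 10%; the `i not in valid_idx` test against an array is also linear. A duplicate-free alternative (a sketch, not the original code):

valid_idx = set(np.random.choice(len(dataset),
                                 size=int(round(len(dataset) * 0.1)),
                                 replace=False))

With a set, the membership test in the loop is constant time and the split has exactly the requested size.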
Example #6
def qm9StrProve():
    from rdkit import Chem
    from read_dataset import readStr_qm9

    L = readStr_qm9()
    MAX = 120
    count = 0
    countDot = 0
    countAst = 0
    nMolMIn9 = 0
    nMaxAtoms = 0
    nMinAtom = 100

    nMol1 = 0

    for s in L:
        if len(s) > MAX:
            count = count + 1
        if "." in s:
            countDot = countDot + 1
        if "*" in s:
            countAst = countAst + 1
        m = Chem.MolFromSmiles(s)
        atom = m.GetNumAtoms()
        if atom <= 9:
            nMolMIn9 = nMolMIn9 + 1
        if atom > nMaxAtoms:
            nMaxAtoms = atom
        # independent 'if' (the original 'elif' skipped the minimum check
        # whenever the maximum was updated, e.g. on the very first molecule)
        if atom < nMinAtom:
            nMinAtom = atom

        if atom == 1:
            nMol1 = nMol1 + 1

    print("Numero molecole con num. atomi <= 9: {}".format(nMolMIn9))
    print("Numero massimo di atomi: {}".format(nMaxAtoms))
    print("Numero minimo di atomi: {}".format(nMinAtom))
    print("Numero molecole con caratteri superiori a 120: {}".format(count))
    print("Numero molecole con carattere '.' : {}".format(countDot))
    print("Numero molecole con carattere '*' : {}".format(countAst))
    print("Numero di molecole lette: {}".format(len(L)))
    print(
        "Numero di molecole lette formate da un solo atomo: {}".format(nMol1))
    print("-------------- FINE ---------------")
Example #7
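    # Allocate full one-hot/mask arrays, then copy in the per-chunk
    # (one-hot, mask) pairs computed upstream in list_binary.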
    all_onehot = np.zeros((len(L), cmd_args.max_decode_steps, DECISION_DIM),
                          dtype=np.byte)
    all_masks = np.zeros((len(L), cmd_args.max_decode_steps, DECISION_DIM),
                         dtype=np.byte)

    for start, b_pair in zip(range(0, len(L), chunk_size), list_binary):
        all_onehot[start:start + chunk_size, :, :] = b_pair[0]
        all_masks[start:start + chunk_size, :, :] = b_pair[1]

    #f_smiles = '.'.join(cmd_args.smiles_file.split('/')[-1].split('.')[0:-1])
    f_smiles = cmd_args.smiles_file
    out_file = '%s/%s-%d.h5' % (cmd_args.save_dir, f_smiles,
                                cmd_args.skip_deter)
    h5f = h5py.File(out_file, 'w')
    h5f.create_dataset('x', data=all_onehot)
    h5f.create_dataset('masks', data=all_masks)
    h5f.close()


if __name__ == '__main__':

    smiles_list = []

    if cmd_args.smiles_file == 'qm9':
        smiles_list = readStr_qm9()
    elif cmd_args.smiles_file == 'zinc':
        smiles_list = read_zinc()

    train_dataset = smiles_list[5000:]
    run_job(train_dataset)
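To sanity-check the dump, the arrays can be read back with the standard h5py API; a minimal sketch assuming out_file points at the file written above:

import h5py

with h5py.File(out_file, 'r') as h5f:
    onehot = h5f['x'][:]
    masks = h5f['masks'][:]
print(onehot.shape, masks.shape)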
Example #8
import sys, os
from plot_utils import *
from read_dataset import readStr_qm9, read_zinc
from smile_metrics import MolecularMetrics as mm
from utils import save_scores_bias, load_decoded_results, calc_perc

folder = "bias/"

# take params
name = sys.argv[1]
file = sys.argv[2]
dataset = sys.argv[3]

if dataset == "zinc":
    trainingSet = read_zinc()
else:
    trainingSet = readStr_qm9()
trainingSet = trainingSet[5000:]

# make folder
try:
    os.makedirs(folder + name)
except OSError:
    print("Creation of the directory %s failed" % (folder + name))
else:
    print("Successfully created the directory %s " % (folder + name))

# READ SMILES
smi = dict()
smi['smiles'], smi['decoded'] = load_decoded_results(file)
smi['valid'] = []
for line in smi['decoded']:
Example #9
from __future__ import print_function
import nltk
import qm9_grammar
import numpy as np
import h5py
import molecule_vae
import sys, os
sys.path.append('%s/../_utils' % os.path.dirname(os.path.realpath(__file__)))
from read_dataset import readStr_qm9

MAX_LEN = 277

L = readStr_qm9()

NCHARS = len(qm9_grammar.GCFG.productions())


def to_one_hot(smiles):
    """ Encode a list of smiles strings to one-hot vectors """
    assert type(smiles) == list
    prod_map = {}
    for ix, prod in enumerate(qm9_grammar.GCFG.productions()):
        prod_map[prod] = ix
    tokenize = molecule_vae.get_zinc_tokenizer(qm9_grammar.GCFG)
    tokens = map(tokenize, smiles)
    parser = nltk.ChartParser(qm9_grammar.GCFG)
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    indices = [
        np.array([prod_map[prod] for prod in entry], dtype=int)
        for entry in productions_seq
Example #10
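    # Parse the SMILES chunk in parallel slices of length size,
    # then flatten the per-slice result lists.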
    result_list = Parallel(n_jobs=-1)(delayed(parse_many)(chunk[i:i +
                                                                size], grammar)
                                      for i in range(0, len(chunk), size))
    return [_1 for _0 in result_list for _1 in _0]


import cPickle as cp

from tqdm import tqdm

if __name__ == '__main__':
    save_dir = cmd_args.save_dir
    fname = save_dir + (
        cmd_args.smiles_file.split('/')[-1]).split('.')[0] + '.cfg_dump'
    fout = open(fname, 'wb')
    grammar = parser.Grammar(cmd_args.grammar_file)

    smiles = []
    if cmd_args.smiles_file == 'qm9':
        smiles = readStr_qm9()
    elif cmd_args.smiles_file == 'zinc':
        smiles = read_zinc()

    for i in tqdm(range(len(smiles))):
        ts = parser.parse(smiles[i], grammar)
        assert isinstance(ts, list) and len(ts) == 1
        n = AnnotatedTree2MolTree(ts[0])
        cp.dump(n, fout, cp.HIGHEST_PROTOCOL)

    fout.close()
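The mol trees are pickled one after another into a single .cfg_dump stream, so reading them back means calling cp.load until EOFError; a minimal sketch reusing fname from above:

trees = []
with open(fname, 'rb') as fin:
    while True:
        try:
            trees.append(cp.load(fin))
        except EOFError:
            break
print('loaded %d trees' % len(trees))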
Example #11
def main():
    import torch
    import rdkit
    from optparse import OptionParser
    # JTNNVAE, Vocab, sanitize, reconstruction and the save_* helpers
    # come from the surrounding project modules (not shown in this snippet)

    torch.manual_seed(0)
    lg = rdkit.RDLogger.logger()
    lg.setLevel(rdkit.RDLogger.CRITICAL)

    parser = OptionParser()
    parser.add_option("-t", "--test", dest="test_path")
    parser.add_option("-v", "--vocab", dest="vocab_path")
    parser.add_option("-m", "--model", dest="model_path")
    parser.add_option("-w", "--hidden", dest="hidden_size", default=200)
    parser.add_option("-l", "--latent", dest="latent_size", default=56)
    parser.add_option("-d", "--depth", dest="depth", default=3)
    opts, args = parser.parse_args()

    vocab = [x.strip("\r\n ") for x in open(opts.vocab_path)]
    vocab = Vocab(vocab)

    hidden_size = int(opts.hidden_size)
    latent_size = int(opts.latent_size)
    depth = int(opts.depth)

    model = JTNNVAE(vocab, hidden_size, latent_size, depth)
    model.load_state_dict(torch.load(opts.model_path))
    model = model.cuda()

    dataset_name = opts.test_path
    result_file = dataset_name + "_decoded_results.txt"
    priors_file = dataset_name + "_decoded_priors.txt"
    generation_file = dataset_name + "_generation.txt"

    # read dataset
    if dataset_name == "zinc":
        XTE = read_zinc()
    else:
        D = readStr_qm9()
        # skip molecules that contain '.' (multi-fragment SMILES)
        XTE = []
        for mol in D:
            if "." not in mol:
                XTE.append(mol)

    # reconstruction
    XTE = XTE[0:5000]
    # drop single-character SMILES; list() keeps XTE reusable across iterations
    XTE = list(filter(lambda x: len(x) > 1, XTE))
    decoded_result = reconstruction(model, XTE, 20, 1)
    save_decoded_results(XTE, decoded_result, result_file)

    # prior
    # decoded_priors_witherrors = model.sample_prior_eval(True, 1000, 10)
    # decoded_priors = []
    # for i in decoded_priors_witherrors:
    #     decoded_priors.append(sanitize(i))
    # save_decoded_priors(decoded_priors, priors_file)

    # generation
    generation_witherrors = model.sample_prior_eval(True, 20000, 1)
    generation = []
    for i in generation_witherrors:
        generation.append(sanitize(i))
    save_decoded_priors(generation, generation_file)
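A sketch of how the saved pairs could be scored for exact reconstruction accuracy by comparing RDKit canonical SMILES (illustrative only; assumes from rdkit import Chem and that decoded is aligned with inputs, as XTE and decoded_result are above):

def reconstruction_accuracy(inputs, decoded):
    def canonical(s):
        m = Chem.MolFromSmiles(s)
        return Chem.MolToSmiles(m) if m is not None else None
    hits = sum(1 for x, y in zip(inputs, decoded)
               if canonical(x) is not None and canonical(x) == canonical(y))
    return float(hits) / len(inputs)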