Esempio n. 1
0
def get_dataset(dbpath, dataset, dataset_properties=None):
    """
    Get a dataset from the configuration.

    Args:
        dbpath (str): path to the local database
        dataset (str): name of the dataset (matched case-insensitively)
        dataset_properties (list): properties of the dataset

    Returns:
        AtomsData object

    Raises:
        NotImplementedError: if `dataset` is not one of the known names

    """
    dataset = dataset.upper()
    if dataset == 'QM9':
        return QM9(dbpath, properties=dataset_properties)
    elif dataset == 'ISO17':
        return get_iso17(dataset_properties=dataset_properties)
    elif dataset == 'ANI1':
        return get_ani1(dataset_properties=dataset_properties)
    elif dataset == 'MD17':
        return get_md17(dataset_properties=dataset_properties)
    elif dataset == 'MATPROJ':
        return get_matproj(dataset_properties=dataset_properties)
    elif dataset == 'CUSTOM':
        return AtomsData(dbpath, required_properties=dataset_properties)
    else:
        # Include the offending name so the configuration error is actionable.
        raise NotImplementedError('Unknown dataset: {}'.format(dataset))
Esempio n. 2
0
def get_dataset(_log, dbpath, dataset, dataset_properties=None):
    """
    Get a dataset from the configuration.

    Args:
        _log: logger used to report which dataset is being loaded
        dbpath (str): path to the local database
        dataset (str): name of the dataset (matched case-insensitively)
        dataset_properties (list): properties of the dataset

    Returns:
        AtomsData object

    Raises:
        NotImplementedError: if `dataset` is not one of the known names

    """
    dataset = dataset.upper()
    _log.info('Load {} dataset'.format(dataset))
    if dataset == 'QM9':
        return QM9(dbpath, properties=dataset_properties)
    elif dataset == 'ISO17':
        return get_iso17(dataset_properties=dataset_properties)
    elif dataset == 'ANI1':
        return get_ani1(dataset_properties=dataset_properties)
    elif dataset == 'MD17':
        return get_md17(dataset_properties=dataset_properties)
    elif dataset == 'MATPROJ':
        return get_matproj(dataset_properties=dataset_properties)
    elif dataset == 'CUSTOM':
        file, extension = os.path.splitext(dbpath)
        if extension == '.db':
            # Path already points at an ASE .db file; use it directly.
            return AtomsData(dbpath, required_properties=dataset_properties)
        # Otherwise convert the raw input file to an ASE .db alongside it.
        generate_db(db_path=file + '.db', file_path=dbpath)
        return AtomsData(file + '.db',
                         required_properties=dataset_properties)
    else:
        # Previously an unknown name fell through and returned None silently;
        # fail loudly instead (consistent with the sibling get_dataset).
        raise NotImplementedError('Unknown dataset: {}'.format(dataset))
def partition_polar_molecules():
    """Load QM9, take the stored test split and print whether any molecule
    in it has a zero dipole moment."""
    db = "qm9.db"
    dataset = QM9(db)
    # Reuse the stored split file so the test partition is reproducible.
    _, _, test = spk.train_test_split(
        dataset,
        num_train=109000,
        num_val=1000,
        split_file="pst.npz"
    )
    dipoles = [t["dipole_moment"].item() for t in test]
    # BUG FIX: `dipoles == 0.` compared the whole list to a float (always
    # False), and any(False) raises TypeError. Test each element instead.
    print(any(d == 0. for d in dipoles))  # Only gave 8!!
def test_set_stds():
    """Return the per-target standard deviation over the whole QM9 test split."""
    dataset = QM9('qm9.db')
    _, _, test = spk.train_test_split(
        dataset,
        num_train=109000,
        num_val=1000,
        split_file='pst.npz'
    )
    # A single batch the size of the split covers the entire test set.
    loader = spk.AtomsLoader(test, batch_size=len(test))
    batch = next(iter(loader))
    return {name: batch[name].std() for name in TARGET_NAMES}
Esempio n. 5
0
def get_data(args, properties):
    """
    Build the QM9 dataset, split it and wrap each partition in a loader.

    Args:
        args: parsed CLI namespace; reads db, model_dir, split_file, ntr,
            nva, bs and num_workers.
        properties (list): property names to load from the database.

    Returns:
        tuple: (dataset, split_file, train_loader, val_loader, test_loader)

    Raises:
        ValueError: if the stored split sizes disagree with args.ntr/args.nva.
    """
    # Default to a split file stored next to the model unless one is given.
    split_file = os.path.join(args.model_dir, "split.npz") if not args.split_file else args.split_file
    dataset = QM9(args.db, load_only=properties)
    train, val, test = spk.train_test_split(
        dataset,
        num_train=args.ntr,
        num_val=args.nva,
        split_file=split_file
    )
    # Explicit checks instead of `assert`: asserts are stripped under -O,
    # which would silently accept a stale split file of the wrong size.
    if len(train) != args.ntr:
        raise ValueError(
            "split file yields {} training samples, expected {}".format(
                len(train), args.ntr))
    if len(val) != args.nva:
        raise ValueError(
            "split file yields {} validation samples, expected {}".format(
                len(val), args.nva))
    train_loader = spk.AtomsLoader(train, batch_size=args.bs, shuffle=True, num_workers=args.num_workers)
    val_loader = spk.AtomsLoader(val, batch_size=args.bs, num_workers=args.num_workers)
    test_loader = spk.AtomsLoader(test, batch_size=args.bs, num_workers=args.num_workers)
    return dataset, split_file, train_loader, val_loader, test_loader
Esempio n. 6
0
from schnetpack.datasets import QM9
from schnetpack.train import Trainer, CSVHook, ReduceLROnPlateauHook
from schnetpack.metrics import MeanAbsoluteError
from schnetpack.metrics import build_mse_loss as mse_loss


logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

# basic settings
model_dir = "qm9_model"
# exist_ok avoids crashing on re-runs when the model dir already exists.
os.makedirs(model_dir, exist_ok=True)
properties = [QM9.U0]

# data preparation
logging.info("get dataset")
dataset = QM9("data/qm9.db", properties=[QM9.U0])
train, val, test = spk.train_test_split(
    dataset, 1000, 100, os.path.join(model_dir, "split.npz")
)
train_loader = spk.AtomsLoader(train, batch_size=64)
val_loader = spk.AtomsLoader(val, batch_size=64)

# statistics
atomrefs = dataset.get_atomrefs(properties)
means, stddevs = train_loader.get_statistics(
    properties, per_atom=True, atomrefs=atomrefs
)

# model build
logging.info("build model")
representation = spk.SchNet(n_interactions=6)
Esempio n. 7
0
        if not os.path.exists(args.modelpath):
            os.makedirs(args.modelpath)

        to_json(jsonpath, argparse_dict)

        spk.utils.set_random_seed(args.seed)
        train_args = args
    else:
        train_args = read_from_json(jsonpath)

    # will download qm9 if necessary, calculate_triples is required for wACSF angular functions
    logging.info('QM9 will be loaded...')
    qm9 = QM9(args.datapath,
              download=True,
              properties=[train_args.property],
              collect_triples=args.model == 'wacsf',
              remove_uncharacterized=train_args.remove_uncharacterized)
    atomref = qm9.get_atomref(train_args.property)

    # splits the dataset in test, val, train sets
    split_path = os.path.join(args.modelpath, 'split.npz')
    if args.mode == 'train':
        if args.split_path is not None:
            copyfile(args.split_path, split_path)

    logging.info('create splits...')
    data_train, data_val, data_test = qm9.create_splits(*train_args.split,
                                                        split_file=split_path)

    logging.info('load data...')
Created on Sun Dec  6 04:26:29 2020

@author: Chris

Convert QM9 from xyz to useful feature database
"""

from schnetpack.datasets import QM9
from dscribe import descriptors
import pandas as pd
import numpy as np
from tqdm import tqdm

#%% import QM9 database
# Properties to pull from the database; download=True fetches QM9 on first run.
props = ['energy_U0', 'gap', 'heat_capacity']
qm9data = QM9('./qm9.db', download=True, load_only=props)
size = len(qm9data)  # number of structures in the dataset

#%% convert to different representations
# Chemical species and descriptor hyper-parameters.
species = ['H', 'C', 'N', 'O', 'F']
rcut = 6.0  # cutoff radius — NOTE(review): unused by CoulombMatrix; presumably for other descriptors
n_atoms_max = 29  # padding size — presumably the max atom count in QM9; confirm
rep_init = {
    'CoulombMatrix':
    descriptors.CoulombMatrix(n_atoms_max=n_atoms_max, flatten=False)
}
'''
           'ACSF': descriptors.SOAP(species=species, rcut=rcut,
                        g2_params = [[]],
                        g3_params = [],
                        g4_params = [[[]]],
Esempio n. 9
0
    device = torch.device("cuda" if args.cuda else "cpu")

    # define metrics
    metrics = [
        schnetpack.train.metrics.MeanAbsoluteError(train_args.property,
                                                   train_args.property),
        schnetpack.train.metrics.RootMeanSquaredError(train_args.property,
                                                      train_args.property),
    ]

    # build dataset
    logging.info("QM9 will be loaded...")
    qm9 = QM9(
        args.datapath,
        download=True,
        load_only=[train_args.property],
        collect_triples=args.model == "wacsf",
        remove_uncharacterized=train_args.remove_uncharacterized,
    )

    # get atomrefs
    atomref = qm9.get_atomrefs(train_args.property)

    # splits the dataset in test, val, train sets
    split_path = os.path.join(args.modelpath, "split.npz")
    train_loader, val_loader, test_loader = get_loaders(args,
                                                        dataset=qm9,
                                                        split_path=split_path,
                                                        logging=logging)

    if args.mode == "train":
Esempio n. 10
0
import schnetpack.atomistic.model
from schnetpack.datasets import QM9
from schnetpack.train import Trainer, CSVHook, ReduceLROnPlateauHook
from schnetpack.train.metrics import MeanAbsoluteError
from schnetpack.train import build_mse_loss

logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

# basic settings
model_dir = "qm9_model"
# exist_ok avoids crashing on re-runs when the model dir already exists.
os.makedirs(model_dir, exist_ok=True)
properties = [QM9.U0]

# data preparation
logging.info("get dataset")
dataset = QM9("data/qm9.db", load_only=[QM9.U0])
train, val, test = spk.train_test_split(dataset, 1000, 100,
                                        os.path.join(model_dir, "split.npz"))
train_loader = spk.AtomsLoader(train, batch_size=64, shuffle=True)
val_loader = spk.AtomsLoader(val, batch_size=64)

# statistics
atomrefs = dataset.get_atomrefs(properties)
means, stddevs = train_loader.get_statistics(properties,
                                             get_atomwise_statistics=True,
                                             single_atom_ref=atomrefs)

# model build
logging.info("build model")
representation = spk.SchNet(n_interactions=6)
output_modules = [
Esempio n. 11
0
from schnetpack.datasets import QM9
import ase.io
import numpy as np
qm9data = QM9('qm9.db', download=True, remove_uncharacterized=True)
# Per-structure scalar properties to copy into the extxyz info fields.
# NOTE(review): the scraped original had "h**o" censored to "h**o"; the QM9
# property key in schnetpack is "h**o" — restored so the lookup doesn't KeyError.
properties_key = [
    "dipole_moment", "isotropic_polarizability", "h**o", "lumo",
    "electronic_spatial_extent", "zpve", "energy_U0", "energy_U", "enthalpy_H",
    "free_energy", "heat_capacity"
]
# len(qm9data) = all structures
nb_structures = len(qm9data)
frames = ase.io.read('qm9.db', ':' + str(nb_structures))
for idx, frame in enumerate(frames):
    _, struc_property = qm9data.get_properties(idx=idx)
    # Put each molecule in a large box, centered and wrapped.
    frame.cell = np.eye(3) * 100
    frame.center()
    frame.wrap(eps=1e-11)

    for key in properties_key:
        frame.info[key] = float(struc_property[key][0])

ase.io.write('qm9.extxyz', frames)
Esempio n. 12
0
Created on Wed Jul 24 14:23:51 2019

@author: will
"""

#import schnetpack.atomistic.output_modules
import schnetpack as spk
import schnetpack.representation as rep
from schnetpack.datasets import QM9
import pandas as pd
import numpy as np
import pickle
from ase import Atoms

# load qm9 dataset and download if necessary
data = QM9("qm9.db", collect_triples=True)
loader = spk.data.AtomsLoader(data, batch_size=1, num_workers=2)
reps = rep.BehlerSFBlock()

# get wACSF feature, keyed by the molecule's ASE Atoms repr string
reps_dict = {}
for i, x in enumerate(loader):
    # repr() is the idiomatic spelling of the direct .__repr__() dunder call
    reps_dict[repr(data.get_atoms(i))] = reps(x).squeeze(0)


structures = pd.read_csv('/home/will/Desktop/kaggle/QM/Data/structures.csv')
structures_gb = structures.groupby(['molecule_name'])

atoms_structures = {}
for k,v in structures_gb:
    atom_dict = {'positions':v[['x','y','z']].values.tolist(),
Esempio n. 13
0
Created on Mon Dec  7 02:34:32 2020

@author: Chris

extract raw data from QM9 files
"""

from schnetpack.datasets import QM9
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
import json

#%% import QM9 database
# Downloads QM9 on first run; no load_only filter, so all properties are read.
qm9data = QM9('./qm9.db', download=True)

#%% convert to different representations
# Empty DataFrame whose columns mirror the keys of one QM9 sample.
rep = pd.DataFrame(data=None, columns=qm9data[0].keys())
row = {}  # scratch dict reused for each converted sample
i = 0  # counter used by the conversion loop below

for n in tqdm(range(len(qm9data))):
    print(i)
    datum = qm9data[n]

    # convert tensors to numpy arrays
    for k, v in datum.items():
        row[k] = v.numpy()

    # append row to dataframe
Esempio n. 14
0
def qm9_dataset(qm9_dbpath):
    """Open the QM9 database at *qm9_dbpath*, printing whether the file exists."""
    db_present = os.path.exists(qm9_dbpath)
    print(db_present)
    return QM9(qm9_dbpath)