def get_dataset(dbpath, dataset, dataset_properties=None):
    """
    Get a dataset from the configuration.

    Args:
        dbpath (str): path to the local database
        dataset (str): name of the dataset
        dataset_properties (list): properties of the dataset

    Returns:
        AtomsData object
    """
    dataset = dataset.upper()
    if dataset == 'QM9':
        return QM9(dbpath, properties=dataset_properties)
    elif dataset == 'ISO17':
        return get_iso17(dataset_properties=dataset_properties)
    elif dataset == 'ANI1':
        return get_ani1(dataset_properties=dataset_properties)
    elif dataset == 'MD17':
        return get_md17(dataset_properties=dataset_properties)
    elif dataset == 'MATPROJ':
        return get_matproj(dataset_properties=dataset_properties)
    elif dataset == 'CUSTOM':
        return AtomsData(dbpath, required_properties=dataset_properties)
    else:
        raise NotImplementedError

def get_dataset(_log, dbpath, dataset, dataset_properties=None):
    """
    Get a dataset from the configuration.

    Args:
        dbpath (str): path to the local database
        dataset (str): name of the dataset
        dataset_properties (list): properties of the dataset

    Returns:
        AtomsData object
    """
    dataset = dataset.upper()
    _log.info('Load {} dataset'.format(dataset))
    if dataset == 'QM9':
        return QM9(dbpath, properties=dataset_properties)
    elif dataset == 'ISO17':
        return get_iso17(dataset_properties=dataset_properties)
    elif dataset == 'ANI1':
        return get_ani1(dataset_properties=dataset_properties)
    elif dataset == 'MD17':
        return get_md17(dataset_properties=dataset_properties)
    elif dataset == 'MATPROJ':
        return get_matproj(dataset_properties=dataset_properties)
    elif dataset == 'CUSTOM':
        file, extension = os.path.splitext(dbpath)
        if extension == '.db':
            return AtomsData(dbpath, required_properties=dataset_properties)
        else:
            # convert other input formats (e.g. .xyz) to an ASE .db first
            generate_db(db_path=file + '.db', file_path=dbpath)
            return AtomsData(file + '.db', required_properties=dataset_properties)
    else:
        # unlike the variant above, the original silently returned None here
        raise NotImplementedError('Unknown dataset: {}'.format(dataset))

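# Hedged usage sketch for get_dataset above (assumption, not original code):
# the db paths and the custom property names are placeholders.
from schnetpack.datasets import QM9

qm9_data = get_dataset('data/qm9.db', 'qm9', dataset_properties=[QM9.U0])
custom_data = get_dataset('data/custom.db', 'custom',
                          dataset_properties=['energy', 'forces'])
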
def partition_polar_molecules():
    db = "qm9.db"
    dataset = QM9(db)
    _, _, test = spk.train_test_split(
        dataset, num_train=109000, num_val=1000, split_file="pst.npz"
    )
    dipoles = [t["dipole_moment"].item() for t in test]
    # count molecules with exactly zero dipole moment
    # (the original `any(dipoles == 0.)` is a TypeError on a list)
    print(sum(d == 0. for d in dipoles))  # Only gave 8!!

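# Hedged follow-up sketch: actually split the test set by polarity instead of
# only counting. `test` and `dipoles` are assumed to come from the function
# above, and "nonpolar" is taken to mean a dipole moment of exactly zero.
nonpolar_idx = [i for i, d in enumerate(dipoles) if d == 0.]
polar_idx = [i for i, d in enumerate(dipoles) if d != 0.]
nonpolar_molecules = [test[i] for i in nonpolar_idx]
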
def test_set_stds():
    dataset = QM9('qm9.db')
    _, _, test = spk.train_test_split(
        dataset, num_train=109000, num_val=1000, split_file='pst.npz'
    )
    # load the whole test split as a single batch and take that one batch
    test_loader = spk.AtomsLoader(test, batch_size=len(test))
    item = next(iter(test_loader))
    # TARGET_NAMES is assumed to be defined elsewhere as a list of QM9 property names
    outs = {tn: item[tn].std() for tn in TARGET_NAMES}
    return outs

def get_data(args, properties):
    split_file = (
        args.split_file
        if args.split_file
        else os.path.join(args.model_dir, "split.npz")
    )
    dataset = QM9(args.db, load_only=properties)
    train, val, test = spk.train_test_split(
        dataset, num_train=args.ntr, num_val=args.nva, split_file=split_file
    )
    assert len(train) == args.ntr
    assert len(val) == args.nva
    train_loader = spk.AtomsLoader(
        train, batch_size=args.bs, shuffle=True, num_workers=args.num_workers
    )
    val_loader = spk.AtomsLoader(val, batch_size=args.bs, num_workers=args.num_workers)
    test_loader = spk.AtomsLoader(test, batch_size=args.bs, num_workers=args.num_workers)
    return dataset, split_file, train_loader, val_loader, test_loader

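# Hedged usage sketch for get_data: a hypothetical argparse-style namespace
# with the attribute names the function expects; all values are placeholders.
from argparse import Namespace

from schnetpack.datasets import QM9

args = Namespace(model_dir='qm9_model', split_file=None, db='data/qm9.db',
                 ntr=1000, nva=100, bs=64, num_workers=2)
dataset, split_file, train_loader, val_loader, test_loader = get_data(
    args, properties=[QM9.U0]
)
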
import logging
import os

import schnetpack as spk
from schnetpack.datasets import QM9
from schnetpack.train import Trainer, CSVHook, ReduceLROnPlateauHook
from schnetpack.metrics import MeanAbsoluteError
from schnetpack.metrics import build_mse_loss as mse_loss

logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

# basic settings
model_dir = "qm9_model"
os.makedirs(model_dir)
properties = [QM9.U0]

# data preparation
logging.info("get dataset")
dataset = QM9("data/qm9.db", properties=[QM9.U0])
train, val, test = spk.train_test_split(
    dataset, 1000, 100, os.path.join(model_dir, "split.npz")
)
train_loader = spk.AtomsLoader(train, batch_size=64)
val_loader = spk.AtomsLoader(val, batch_size=64)

# statistics
atomrefs = dataset.get_atomrefs(properties)
means, stddevs = train_loader.get_statistics(
    properties, per_atom=True, atomrefs=atomrefs
)

# model build
logging.info("build model")
representation = spk.SchNet(n_interactions=6)

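# Hedged sketch of the usual next steps (output head plus combined model) in
# the SchNetPack 0.3 pattern; keyword names varied between releases, so treat
# this as illustrative rather than a drop-in continuation of the script above.
output_modules = [
    spk.atomistic.Atomwise(
        n_in=representation.n_atom_basis,
        property=QM9.U0,
        mean=means[QM9.U0],
        stddev=stddevs[QM9.U0],
        atomref=atomrefs[QM9.U0],
    )
]
model = spk.AtomisticModel(representation, output_modules)
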
if not os.path.exists(args.modelpath):
    os.makedirs(args.modelpath)
    to_json(jsonpath, argparse_dict)
    spk.utils.set_random_seed(args.seed)
    train_args = args
else:
    train_args = read_from_json(jsonpath)

# will download qm9 if necessary; calculate_triples is required for wACSF angular functions
logging.info('QM9 will be loaded...')
qm9 = QM9(args.datapath, download=True, properties=[train_args.property],
          collect_triples=args.model == 'wacsf',
          remove_uncharacterized=train_args.remove_uncharacterized)
atomref = qm9.get_atomref(train_args.property)

# splits the dataset into train, val, test sets
split_path = os.path.join(args.modelpath, 'split.npz')
if args.mode == 'train':
    if args.split_path is not None:
        copyfile(args.split_path, split_path)

logging.info('create splits...')
data_train, data_val, data_test = qm9.create_splits(*train_args.split,
                                                    split_file=split_path)

logging.info('load data...')

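# Hedged continuation sketch: wrap the splits in loaders, as the surrounding
# script presumably does after 'load data...'. The batch size is a placeholder.
train_loader = spk.data.AtomsLoader(data_train, batch_size=100, shuffle=True)
val_loader = spk.data.AtomsLoader(data_val, batch_size=100)
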
"""
Created on Sun Dec 6 04:26:29 2020

@author: Chris

Convert QM9 from xyz to useful feature database
"""
from schnetpack.datasets import QM9
from dscribe import descriptors
import pandas as pd
import numpy as np
from tqdm import tqdm

# %% import QM9 database
props = ['energy_U0', 'gap', 'heat_capacity']
qm9data = QM9('./qm9.db', download=True, load_only=props)
size = len(qm9data)

# %% convert to different representations
species = ['H', 'C', 'N', 'O', 'F']
rcut = 6.0
n_atoms_max = 29

rep_init = {
    'CoulombMatrix': descriptors.CoulombMatrix(n_atoms_max=n_atoms_max,
                                               flatten=False)
}
'''
# g2/g3/g4_params belong to dscribe's ACSF descriptor (the original said SOAP)
'ACSF': descriptors.ACSF(species=species, rcut=rcut,
                         g2_params=[[]],
                         g3_params=[],
                         g4_params=[[[]]],
device = torch.device("cuda" if args.cuda else "cpu")

# define metrics
metrics = [
    schnetpack.train.metrics.MeanAbsoluteError(train_args.property, train_args.property),
    schnetpack.train.metrics.RootMeanSquaredError(train_args.property, train_args.property),
]

# build dataset
logging.info("QM9 will be loaded...")
qm9 = QM9(
    args.datapath,
    download=True,
    load_only=[train_args.property],
    collect_triples=args.model == "wacsf",
    remove_uncharacterized=train_args.remove_uncharacterized,
)

# get atomrefs
atomref = qm9.get_atomref(train_args.property)

# splits the dataset into train, val, test sets
split_path = os.path.join(args.modelpath, "split.npz")
train_loader, val_loader, test_loader = get_loaders(
    args, dataset=qm9, split_path=split_path, logging=logging
)

if args.mode == "train":
import logging
import os

import schnetpack as spk
import schnetpack.atomistic.model
from schnetpack.datasets import QM9
from schnetpack.train import Trainer, CSVHook, ReduceLROnPlateauHook
from schnetpack.train.metrics import MeanAbsoluteError
from schnetpack.train import build_mse_loss

logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

# basic settings
model_dir = "qm9_model"
os.makedirs(model_dir)
properties = [QM9.U0]

# data preparation
logging.info("get dataset")
dataset = QM9("data/qm9.db", load_only=[QM9.U0])
train, val, test = spk.train_test_split(
    dataset, 1000, 100, os.path.join(model_dir, "split.npz")
)
train_loader = spk.AtomsLoader(train, batch_size=64, shuffle=True)
val_loader = spk.AtomsLoader(val, batch_size=64)

# statistics
atomrefs = dataset.get_atomrefs(properties)
means, stddevs = train_loader.get_statistics(
    properties, get_atomwise_statistics=True, single_atom_ref=atomrefs
)

# model build
logging.info("build model")
representation = spk.SchNet(n_interactions=6)
output_modules = [
from schnetpack.datasets import QM9
import ase.io
import numpy as np

qm9data = QM9('qm9.db', download=True, remove_uncharacterized=True)

properties_key = [
    "dipole_moment",
    "isotropic_polarizability",
    "homo",
    "lumo",
    "electronic_spatial_extent",
    "zpve",
    "energy_U0",
    "energy_U",
    "enthalpy_H",
    "free_energy",
    "heat_capacity",
]

# len(qm9data) = all structures
nb_structures = len(qm9data)
frames = ase.io.read('qm9.db', ':' + str(nb_structures))

for i in range(nb_structures):
    _, struc_property = qm9data.get_properties(idx=i)
    # place each molecule in a large box, center and wrap it
    frames[i].cell = np.eye(3) * 100
    frames[i].center()
    frames[i].wrap(eps=1e-11)
    for key in properties_key:
        property_value = float(struc_property[key][0])
        frames[i].info[key] = property_value

ase.io.write('qm9.extxyz', frames)

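# Hedged sanity check for the export above (assumption, not original code):
# read the first frame back and confirm one stored property survived the
# round trip; the tolerance is a guess at extxyz's float formatting precision.
check = ase.io.read('qm9.extxyz', '0')
_, props0 = qm9data.get_properties(idx=0)
assert abs(check.info['energy_U0'] - float(props0['energy_U0'][0])) < 1e-6
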
"""
Created on Wed Jul 24 14:23:51 2019

@author: will
"""
# import schnetpack.atomistic.output_modules
import schnetpack as spk
import schnetpack.representation as rep
from schnetpack.datasets import QM9
import pandas as pd
import numpy as np
import pickle
from ase import Atoms

# load qm9 dataset and download if necessary
data = QM9("qm9.db", collect_triples=True)
loader = spk.data.AtomsLoader(data, batch_size=1, num_workers=2)
reps = rep.BehlerSFBlock()

# get wACSF feature
reps_dict = {}
for i, x in enumerate(loader):
    reps_dict[data.get_atoms(i).__repr__()] = reps(x).squeeze(0)

structures = pd.read_csv('/home/will/Desktop/kaggle/QM/Data/structures.csv')
structures_gb = structures.groupby(['molecule_name'])

atoms_structures = {}
for k, v in structures_gb:
    atom_dict = {'positions': v[['x', 'y', 'z']].values.tolist(),
"""
Created on Mon Dec 7 02:34:32 2020

@author: Chris

extract raw data from QM9 files
"""
from schnetpack.datasets import QM9
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
import json

# %% import QM9 database
qm9data = QM9('./qm9.db', download=True)

# %% convert to different representations
rep = pd.DataFrame(data=None, columns=qm9data[0].keys())
row = {}
for n in tqdm(range(len(qm9data))):
    datum = qm9data[n]
    # convert tensors to numpy arrays
    for k, v in datum.items():
        row[k] = v.numpy()
    # append row to dataframe
def qm9_dataset(qm9_dbpath):
    print(os.path.exists(qm9_dbpath))
    return QM9(qm9_dbpath)

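# Hedged sketch of how the helper above might be wired into pytest; the
# decorator, the qm9_dbpath fixture, and the test itself are assumptions,
# not part of the original code.
import pytest

@pytest.fixture
def dataset(qm9_dbpath):
    return qm9_dataset(qm9_dbpath)

def test_dataset_not_empty(dataset):
    assert len(dataset) > 0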