def __init__(self, datafile):
    self.fdata = dict()
    for df in datafile:
        adl = ant.anidataloader(df)
        tdata = dict()
        for data in adl:
            tdata.update({data['path'].split('/')[-1]: data})
        adl.cleanup()
        self.fdata[df.split('tsdata_')[-1].split('.h5')[0]] = tdata
def sae_linear_fitting(self, Ekey='energies', energy_unit=1.0, Eax0sum=False):
    from sklearn import linear_model

    print('Performing linear fitting...')

    datadir = self.h5dir
    sae_out = self.netdict['saefile']

    smap = dict()
    for i, Z in enumerate(self.netdict['atomtyp']):
        smap.update({Z: i})

    Na = len(smap)
    files = os.listdir(datadir)

    X = []
    y = []
    for f in files[0:20]:
        print(f)
        adl = pyt.anidataloader(datadir + f)
        for data in adl:
            # print(data['path'])
            S = data['species']

            if data[Ekey].size > 0:
                if Eax0sum:
                    E = energy_unit * np.sum(np.array(data[Ekey], order='C', dtype=np.float64), axis=1)
                else:
                    E = energy_unit * np.array(data[Ekey], order='C', dtype=np.float64)

                S = S[0:data['coordinates'].shape[1]]
                unique, counts = np.unique(S, return_counts=True)
                x = np.zeros(Na, dtype=np.float64)
                for u, c in zip(unique, counts):
                    x[smap[u]] = c

                for e in E:
                    X.append(np.array(x))
                    y.append(np.array(e))

    X = np.array(X)
    y = np.array(y).reshape(-1, 1)

    lin = linear_model.LinearRegression(fit_intercept=False)
    lin.fit(X, y)

    coef = lin.coef_
    print(coef)

    sae = open(sae_out, 'w')
    for i, c in enumerate(coef[0]):
        sae.write(next(key for key, value in smap.items() if value == i) + ',' + str(i) + '=' + str(c) + '\n')
    sae.close()

    print('Linear fitting complete.')
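# The routine above fits per-element self atomic energies (SAEs) by ordinary least
# squares: every conformer contributes one row of element counts and one target
# energy, and the fit has no intercept because the molecular energy is modeled as a
# pure sum of atomic contributions. A minimal standalone sketch of the same idea,
# using made-up species lists and energies instead of the HDF5 store:
import numpy as np
from sklearn import linear_model

conformers = [                            # hypothetical (species, total energy in Ha) pairs
    (['O', 'H', 'H'], -76.38),
    (['C', 'H', 'H', 'H', 'H'], -40.48),
    (['N', 'H', 'H', 'H'], -56.52),
]
smap = {'H': 0, 'C': 1, 'N': 2, 'O': 3}   # element -> design-matrix column

X = np.zeros((len(conformers), len(smap)))
y = np.zeros((len(conformers), 1))
for row, (species, energy) in enumerate(conformers):
    for s in species:
        X[row, smap[s]] += 1              # count atoms of each element
    y[row, 0] = energy

lin = linear_model.LinearRegression(fit_intercept=False)
lin.fit(X, y)
for elem, col in smap.items():
    print(elem, lin.coef_[0][col])        # fitted per-atom self energy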
def compute_test(self, h5file):
    mNa = 100

    # Declare loader
    adl = pyt.anidataloader(h5file)

    # Declare containers
    Eact = []
    Ecmp = []
    Nmt = 0

    for data in adl:
        # Extract the data
        xyz = data['coordinates']
        Eqm = data['energies']
        spc = data['species']

        xyz = xyz.reshape(Eqm.shape[0], len(spc), 3)

        if xyz.shape[0] > 0:
            Nm = xyz.shape[0]
            Na = xyz.shape[1]

            if Na < mNa:
                mNa = Na

            Nat = Na * Nm
            Nit = int(np.ceil(Nat / 65000.0))
            Nmo = int(65000 / Na)
            Nmx = Nm

            for j in range(0, Nit):
                # Setup indices
                i1 = j * Nmo
                i2 = min(j * Nmo + Nmo, Nm)

                # Copy array subset
                Eact_t = Eqm[i1:i2]

                # Set the conformers in NeuroChem
                self.nc.setConformers(confs=xyz[i1:i2], types=list(spc))

                Ecmp_t = self.nc.energy()

                Ecmp.append(np.sum(np.power(hdn.hatokcal * Ecmp_t - hdn.hatokcal * Eact_t, 2)))
                Nmt = Nmt + Ecmp_t.size

                # Eact.append(Eact_t)
                # print(hdn.hatokcal * np.sum(np.abs(Ecmp_t - Eact_t)) / float(Ecmp_t.size))

    Ecmp = np.array(Ecmp, dtype=np.float64)

    return np.sqrt(np.sum(Ecmp) / float(Nmt))
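# compute_test accumulates squared errors batch by batch and returns a single RMSE in
# kcal/mol. Written directly over two flat arrays, the same reduction is just the
# following helper (a small sketch, not part of the original class):
import numpy as np

def rmse(e_pred, e_ref):
    # Root-mean-square error over all conformers, in the units of the inputs.
    e_pred = np.asarray(e_pred, dtype=np.float64)
    e_ref = np.asarray(e_ref, dtype=np.float64)
    return np.sqrt(np.mean((e_pred - e_ref) ** 2))

print(rmse([1.0, 2.0, 3.5], [1.1, 1.9, 3.0]))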
def check_for_outsider(okayl, chckl):
    for i in chckl:
        if i not in okayl:
            return False
    return True

dst = "/home/jujuman/Research/ANI-DATASET/h5data/gdb9-2500-bad_new.h5"
src = "/home/jujuman/Research/ANI-DATASET/GDB-09-Data/gdb9-2500-bad.h5"

# open an HDF5 for compressed storage.
# Note that if the path exists, it will open whatever is there.
dpack = pyt.datapacker(dst)
aload = pyt.anidataloader(src)

at = ['H',
      'C',
      'N',
      'O',
      # 'F',
      # 'S',
      ]

for id, data in enumerate(aload.get_roman_data()):
    xyz = np.asarray(data['coordinates'], dtype=np.float32)
    erg = np.asarray(data['energies'], dtype=np.float64)
    spc = [str(a.decode('ascii')) for a in data['species']]
# Import pyanitools
import pyanitools as pyt

# Path to the store file
store_file = '/home/jujuman/Research/ANI-DATASET/rxn_db_mig.h5'

# Declare the loader, opens the store
loader = pyt.anidataloader(store_file)

# Load the entire store into memory
loader.totalload()

# Loop over store data
for i in range(loader.size()):
    data = loader.getdata(i)
    print(data[0])

# Closes the store file
loader.cleanup()
path = "/home/jujuman/Research/ANI-DATASET/ANI-1_release/data/ani-1_data_c08.h5" wkdir = '/home/jujuman/Research/CrossValidation/' cnstfile = wkdir + 'rHCNO-4.6A_16-3.1A_a4-8.params' saefile = wkdir + 'sae_6-31gd.dat' #------------------------------------------- # Build networks nc = [ pync.conformers(cnstfile, saefile, wkdir + 'cv_c08e_ntw_' + str(l) + '/networks/', 0) for l in range(5) ] # Build loader adl = pyt.anidataloader(path) # Load data adl.load_node("/gdb11_s01/") # Loop for i in range(adl.size()): #print(i, ' of ', adl.size()) data = adl.getdata(i) x = data[0] e = data[1] s = data[2] Nm = e.shape[0] Na = len(s)
def __init__(self, hdf5files, saef, output, storecac, storetest, Naev):
    self.xyz = []
    self.frc = []
    self.Eqm = []
    self.spc = []
    self.idx = []
    self.gid = []
    self.prt = []
    self.Naev = Naev

    self.kid = []  # list to track data kept

    self.nt = []  # total conformers
    self.nc = []  # total kept

    self.of = open(output, 'w')

    self.tf = 0
    for f in hdf5files:
        # Construct the data loader class
        adl = pyt.anidataloader(f)
        print('Loading file:', f)

        # Declare test cache
        if os.path.exists(storetest):
            os.remove(storetest)

        dpack = pyt.datapacker(storetest)

        for i, data in enumerate(adl):
            xyz = data['coordinates']
            frc = data['forces']
            eng = data['energies']
            spc = data['species']
            nme = data['path']

            # Toss out high forces
            Mv = np.max(np.linalg.norm(frc, axis=2), axis=1)
            index = np.where(Mv > 1.75)[0]
            indexk = np.where(Mv <= 1.75)[0]

            # Keep only the low-force conformers
            xyz = xyz[indexk]
            frc = frc[indexk]
            eng = eng[indexk]

            idx = np.random.uniform(0.0, 1.0, eng.size)
            tr_idx = np.asarray(np.where(idx < 0.99))[0]
            te_idx = np.asarray(np.where(idx >= 0.99))[0]
            # print(tr_idx)

            if tr_idx.size > 0:
                self.prt.append(nme)

                self.xyz.append(np.ndarray.astype(xyz[tr_idx], dtype=np.float32))
                self.frc.append(np.ndarray.astype(frc[tr_idx], dtype=np.float32))
                self.Eqm.append(np.ndarray.astype(eng[tr_idx], dtype=np.float64))
                self.spc.append(spc)

                Nd = eng[tr_idx].size
                # print(Nd)

                self.idx.append(np.arange(Nd))
                self.kid.append(np.array([], dtype=np.int64))
                self.gid.append(np.array([], dtype=np.int64))

                self.tf = self.tf + Nd
                self.nt.append(Nd)
                self.nc.append(0)

            # Prepare and store the test data set
            if xyz[te_idx].size != 0:
                # t_xyz = xyz[te_idx].reshape(te_idx.size, xyz[te_idx].shape[1] * xyz[te_idx].shape[2])
                dpack.store_data(nme + '/mol' + str(i),
                                 coordinates=xyz[te_idx],
                                 forces=frc[te_idx],
                                 energies=np.array(eng[te_idx]),
                                 species=spc)

        # Clean up
        adl.cleanup()

    # Clean up
    dpack.cleanup()

    self.nt = np.array(self.nt)
    self.nc = np.array(self.nc)

    self.ts = 0
    self.vs = 0

    self.Nbad = self.tf
    self.saef = saef
    self.storecac = storecac
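# The screen above drops any conformer whose largest per-atom force norm exceeds 1.75
# (in whatever force units the source HDF5 stores). A self-contained illustration of
# the same masking on a random, hypothetical force array:
import numpy as np

frc = np.random.randn(5, 4, 3)                    # 5 conformers, 4 atoms, xyz components
Mv = np.max(np.linalg.norm(frc, axis=2), axis=1)  # largest atomic force norm per conformer

cutoff = 1.75
keep = np.where(Mv <= cutoff)[0]                  # indices of conformers that pass the screen
print('kept', keep.size, 'of', Mv.size, 'conformers')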
]
cachev = [
    cg('_valid', saef, store_dir + str(r) + '/', forcet, chargt, False)
    for r in range(N)
]
testh5 = [
    pyt.datapacker(store_dir + str(r) + '/testset/testset.h5')
    for r in range(N)
]

Nd = np.zeros(N, dtype=np.int32)
Nbf = 0
for f, fn in enumerate(h5files):
    print('Processing file(' + str(f + 1) + ' of ' + str(len(h5files)) + '):', fn)
    adl = pyt.anidataloader(fn)

    To = adl.size()
    Ndc = 0
    Fmt = []
    Emt = []
    for c, data in enumerate(adl):
        if True:
            # Get test store name
            Pn = fn.split('/')[-1].rsplit('.', 1)[0] + data['path']

            # Progress indicator
            sys.stdout.write("\r%d%% %s" % (int(100 * c / float(To)), Pn))
            sys.stdout.flush()
ax.set_ylim([shr1, shr2])

font = {'family': 'Bitstream Vera Sans', 'weight': 'heavy', 'size': 24}

ax.set_ylabel('$E_{cmp}$', fontdict=font)
ax.set_xlabel('$E_{ref}$', fontdict=font)

# Set data fields
# h5file = '/home/jujuman/Research/SingleNetworkTest/cache02/testset/testset.h5'
h5file = '/home/jujuman/Research/DataReductionMethods/models/cache/testset/testset.h5'
# h5file = '/home/jujuman/Research/ANI-DATASET/h5data/ani-gdb-c03.h5'

# Declare loader
adl = pyt.anidataloader(h5file)
nl = adl.get_group_list()
print(nl)

# node = "gdb11_s10"

# Network 1 Files
# wkdir = '/home/jujuman/Scratch/Dropbox/ChemSciencePaper.AER/networks/ANI-SN_CHNOSF-1/'
wkdir = '/home/jujuman/Research/DataReductionMethods/models/train_c08f/'
# wkdir = '/home/jujuman/Dropbox/ChemSciencePaper.AER/networks/ANI-c08f-ntwk/'
# wkdir = '/home/jujuman/Research/GDB-11-wB97X-6-31gd/train_08_9/'
# wkdir = '/home/jujuman/Research/GDB-11-wB97X-6-31gd/train_01/'

cnstfile = wkdir + 'rHCNO-4.6A_16-3.1A_a4-8.params'
saefile = wkdir + 'sae_6-31gd.dat'
def build_strided_training_cache(self, Nblocks, Nvalid, Ntest,
                                 build_test=True, build_valid=False,
                                 forces=True, grad=False,
                                 Fkey='forces', forces_unit=1.0,
                                 Ekey='energies', energy_unit=1.0,
                                 Eax0sum=False, rmhighe=True):
    if not os.path.isfile(self.netdict['saefile']):
        self.sae_linear_fitting(Ekey=Ekey, energy_unit=energy_unit, Eax0sum=Eax0sum)

    h5d = self.h5dir

    store_dir = self.train_root + "cache-data-"
    N = self.Nn
    Ntrain = Nblocks - Nvalid - Ntest

    if Nblocks % N != 0:
        raise ValueError('Error: number of networks must evenly divide number of blocks.')

    Nstride = Nblocks / N

    for i in range(N):
        if not os.path.exists(store_dir + str(i)):
            os.mkdir(store_dir + str(i))

        if build_test:
            if os.path.exists(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5'):
                os.remove(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5')

            if not os.path.exists(store_dir + str(i) + '/../testset'):
                os.mkdir(store_dir + str(i) + '/../testset')

    cachet = [
        cg('_train', self.netdict['saefile'], store_dir + str(r) + '/', False)
        for r in range(N)
    ]
    cachev = [
        cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/', False)
        for r in range(N)
    ]

    if build_test:
        testh5 = [
            pyt.datapacker(store_dir + str(r) + '/../testset/testset' + str(r) + '.h5')
            for r in range(N)
        ]

    if build_valid:
        valdh5 = [
            pyt.datapacker(store_dir + str(r) + '/../testset/valdset' + str(r) + '.h5')
            for r in range(N)
        ]

    if rmhighe:
        dE = []
        for f in self.h5file:
            adl = pyt.anidataloader(h5d + f)
            for data in adl:
                S = data['species']
                E = data['energies']
                X = data['coordinates']

                Esae = hdt.compute_sae(self.netdict['saefile'], S)

                dE.append((E - Esae) / np.sqrt(len(S)))

        dE = np.concatenate(dE)
        cidx = np.where(np.abs(dE) < 15.0)
        std = np.abs(dE[cidx]).std()
        men = np.mean(dE[cidx])

        print(men, std, men + std)
        idx = np.intersect1d(
            np.where(dE >= -np.abs(15 * std + men))[0],
            np.where(dE <= np.abs(11 * std + men))[0])
        cnt = idx.size
        print('DATADIST: ', dE.size, cnt, (dE.size - cnt), 100.0 * ((dE.size - cnt) / dE.size))

    E = []
    data_count = np.zeros((N, 3), dtype=np.int32)
    for f in self.h5file:
        print('Reading data file:', h5d + f)
        adl = pyt.anidataloader(h5d + f)
        for data in adl:
            # print(data['path'], data['energies'].size)
            S = data['species']

            if data[Ekey].size > 0 and (set(S).issubset(self.netdict['atomtyp'])):

                X = np.array(data['coordinates'], order='C', dtype=np.float32)

                # print(np.array(data[Ekey].shape), np.sum(np.array(data[Ekey], order='C', dtype=np.float64), axis=1).shape, data[Fkey].shape)
                if Eax0sum:
                    E = energy_unit * np.sum(np.array(data[Ekey], order='C', dtype=np.float64), axis=1)
                else:
                    E = energy_unit * np.array(data[Ekey], order='C', dtype=np.float64)

                if forces and not grad:
                    F = forces_unit * np.array(data[Fkey], order='C', dtype=np.float32)
                elif forces and grad:
                    F = -forces_unit * np.array(data[Fkey], order='C', dtype=np.float32)
                else:
                    F = 0.0 * X

                if rmhighe:
                    Esae = hdt.compute_sae(self.netdict['saefile'], S)

                    ind_dE = (E - Esae) / np.sqrt(len(S))

                    hidx = np.union1d(
                        np.where(ind_dE < -(15.0 * std + men))[0],
                        np.where(ind_dE > (11.0 * std + men))[0])

                    lidx = np.intersect1d(
                        np.where(ind_dE >= -(15.0 * std + men))[0],
                        np.where(ind_dE <= (11.0 * std + men))[0])

                    if hidx.size > 0:
                        print('  -(' + f + ':' + data['path'] + ') High energies detected:\n    ',
                              (E[hidx] - Esae) / np.sqrt(len(S)))

                    X = X[lidx]
                    E = E[lidx]
                    F = F[lidx]

                # Build random split index
                ridx = np.random.randint(0, Nblocks, size=E.size)
                Didx = [
                    np.argsort(ridx)[np.where(ridx == i)]
                    for i in range(Nblocks)
                ]

                # Build training cache
                for nid, cache in enumerate(cachet):
                    set_idx = np.concatenate([
                        Didx[((bid + nid * int(Nstride)) % Nblocks)]
                        for bid in range(Ntrain)
                    ])
                    if set_idx.size != 0:
                        data_count[nid, 0] += set_idx.size
                        cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                # for nid, cache in enumerate(cachev):
                #     set_idx = np.concatenate([Didx[((1 + bid + nid * int(Nstride)) % Nblocks)] for bid in range(Ntrain)])
                #     if set_idx.size != 0:
                #         data_count[nid, 0] += set_idx.size
                #         cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                # Build validation cache
                for nid, cache in enumerate(cachev):
                    set_idx = np.concatenate([
                        Didx[(Ntrain + bid + nid * int(Nstride)) % Nblocks]
                        for bid in range(Nvalid)
                    ])
                    if set_idx.size != 0:
                        data_count[nid, 1] += set_idx.size
                        cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))
                        if build_valid:
                            valdh5[nid].store_data(f + data['path'],
                                                   coordinates=X[set_idx],
                                                   forces=F[set_idx],
                                                   energies=E[set_idx],
                                                   species=list(S))

                # Build test set
                if build_test:
                    for nid, th5 in enumerate(testh5):
                        set_idx = np.concatenate([
                            Didx[(Ntrain + Nvalid + bid + nid * int(Nstride)) % Nblocks]
                            for bid in range(Ntest)
                        ])
                        if set_idx.size != 0:
                            data_count[nid, 2] += set_idx.size
                            th5.store_data(f + data['path'],
                                           coordinates=X[set_idx],
                                           forces=F[set_idx],
                                           energies=E[set_idx],
                                           species=list(S))

    # Save train and valid meta file and cleanup testh5
    for t, v in zip(cachet, cachev):
        t.makemetadata()
        v.makemetadata()

    if build_test:
        for th in testh5:
            th.cleanup()

    if build_valid:
        for vh in valdh5:
            vh.cleanup()

    print(' Train ', ' Valid ', ' Test ')
    print(data_count)
    print('Training set built.')
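# The strided split above hands each of the N networks a different rotation of the
# Nblocks data blocks: network nid trains on blocks (bid + nid*Nstride) % Nblocks,
# validates on the next Nvalid strided blocks, and tests on the Ntest blocks after
# those. A small sketch with hypothetical sizes (10 blocks, 5 networks, 7/2/1 split)
# makes the assignment visible:
Nblocks, N = 10, 5
Ntrain, Nvalid, Ntest = 7, 2, 1
Nstride = Nblocks // N

for nid in range(N):
    train = [(bid + nid * Nstride) % Nblocks for bid in range(Ntrain)]
    valid = [(Ntrain + bid + nid * Nstride) % Nblocks for bid in range(Nvalid)]
    test = [(Ntrain + Nvalid + bid + nid * Nstride) % Nblocks for bid in range(Ntest)]
    print('network', nid, 'train', train, 'valid', valid, 'test', test)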
def __init__(self, hdf5files, saef, storecac, storetest):
    self.xyz = []
    self.Eqm = []
    self.spc = []
    self.idx = []
    self.prt = []
    self.kid = []  # list to track data kept
    self.nt = []  # total conformers
    self.nc = []  # total kept

    self.tf = 0
    for f in hdf5files:
        # Construct the data loader class
        adl = pyt.anidataloader(f)

        # Declare test cache
        if os.path.exists(storetest):
            os.remove(storetest)

        dpack = pyt.datapacker(storetest)

        for i, data in enumerate(adl):
            xyz = np.array_split(data['coordinates'], 10)
            eng = np.array_split(data['energies'], 10)
            spc = data['species']
            nme = data['parent']

            self.prt.append(nme)

            self.xyz.append(np.concatenate(xyz[0:9]))
            self.Eqm.append(np.concatenate(eng[0:9]))
            self.spc.append(spc)

            Nd = np.concatenate(eng[0:9]).shape[0]

            self.idx.append(np.arange(Nd))
            self.kid.append(np.array([], dtype=np.int64))

            self.tf = self.tf + Nd
            self.nt.append(Nd)
            self.nc.append(0)

            # Prepare and store the test data set
            if xyz[9].size != 0:
                t_xyz = xyz[9].reshape(xyz[9].shape[0], xyz[9].shape[1] * xyz[9].shape[2])
                dpack.store_data(nme + '/mol' + str(i),
                                 coordinates=t_xyz,
                                 energies=np.array(eng[9]),
                                 species=spc)

        # Clean up
        adl.cleanup()

    # Clean up
    dpack.cleanup()

    self.nt = np.array(self.nt)
    self.nc = np.array(self.nc)

    self.ts = 0
    self.vs = 0

    self.Nbad = self.tf
    self.saef = saef
    self.storecac = storecac
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0002.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0003.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0004.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0005.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0006.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0007.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0008.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0009.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0010.h5',
    '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0605.0001.0011.h5',
]

r = re.compile(r'(.+?)(\d+?)')

comb = dict()
for h5 in h5_list:
    adl = pyt.anidataloader(h5)
    for data in adl:
        key = data['path'].split('_')[1].split('/')[0]
        items = r.findall(key)
        # print(key, items, sorted(items))
        # print(data.keys())
        if key in comb:
            comb[key].append(data)
        else:
            comb[key] = [data]

print(len(list(comb.keys())))

for k in comb.keys():
    data = comb[k]
    data_new = dict()
    data_new['energies'] = data
import numpy as np
import hdnntools as gt
import pyanitools as pyt
import os

lfile = '/home/jujuman/DataTesting/gdb9-2500-div-dim.h5'
sfile = '/home/jujuman/DataTesting/gdb9-2500-div-dim_35.h5'

if os.path.exists(sfile):
    os.remove(sfile)

adl = pyt.anidataloader(lfile)
dpk = pyt.datapacker(sfile)

for i, x in enumerate(adl):
    print(i)
    xyz = np.asarray(x['coordinates'], dtype=np.float32)
    erg = x['energies']
    spc = x['species']

    dpk.store_data('/gdb-09-DIV/mol' + str(i),
                   coordinates=xyz.reshape(erg.shape[0], len(spc) * 3),
                   energies=erg,
                   species=spc)

adl.cleanup()
dpk.cleanup()
def MAE(act, pre):
    N = act.shape[0]
    e = (np.abs(pre - act)).sum()
    return e / float(N)

# Set required files for pyNeuroChem
anipath = '/home/jujuman/Research/QM-7TEST/tester/ANI-QM7-ntwk'
cnstfile = anipath + '/rHCNOS-5.0A_16-3.1A_a4-8.params'
saefile = anipath + '/../sae_6-31gd.dat'
nnfdir = anipath + '/networks/'

path = "/home/jujuman/Scratch/Research/QM-7TEST/QM7-test-ho.h5"

datas = pyt.anidataloader(path)
datas.totalload()

# Construct pyNeuroChem class
nc = pync.conformers(cnstfile, saefile, nnfdir, 0)

Ea = np.zeros(datas.size())
Ec = np.zeros(datas.size())
for i in range(datas.size()):
    print(i, ' of ', datas.size())

    data = datas.getdata(i)
    x = data[0]
    e = data[1]
model = Net(dims).to('cuda')
model.load_state_dict(torch.load(args.model))

optimizer = optim.SGD(model.parameters(), lr=args.lr)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# load the data into memory (big)
mcnt = 0  # molecules
ccnt = 0  # conformers
elements = set()
examples = []  # the entire training set loaded into memory
examplesbysize = dict()
for hd5file in sorted(glob.glob('*.h5')):
    for data in pya.anidataloader(hd5file):
        # calculate some statistics
        mcnt += 1
        ccnt += len(data['energies'])
        elements.update(data['species'])

        # molecule types and radii
        types = np.array([typemap[elem] for elem in data['species']], dtype=np.float32)
        radii = np.array([typeradii[int(index)] for index in types], dtype=np.float32)
        sz = len(radii)
        if sz not in examplesbysize:
            examplesbysize[sz] = []

        # create an example for every conformer
        for coord, energy in zip(data['coordinates'], data['energies']):
            c = molgrid.CoordinateSet(coord.astype(np.float32), types, radii, 4)
def build_training_cache(self, forces=True):
    store_dir = self.train_root + "cache-data-"
    N = self.Nn

    for i in range(N):
        if not os.path.exists(store_dir + str(i)):
            os.mkdir(store_dir + str(i))

        if os.path.exists(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5'):
            os.remove(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5')

        if not os.path.exists(store_dir + str(i) + '/../testset'):
            os.mkdir(store_dir + str(i) + '/../testset')

    cachet = [
        cg('_train', self.netdict['saefile'], store_dir + str(r) + '/', False)
        for r in range(N)
    ]
    cachev = [
        cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/', False)
        for r in range(N)
    ]
    testh5 = [
        pyt.datapacker(store_dir + str(r) + '/../testset/testset' + str(r) + '.h5')
        for r in range(N)
    ]

    Nd = np.zeros(N, dtype=np.int32)
    Nbf = 0
    for f, fn in enumerate(self.h5file):
        print('Processing file(' + str(f + 1) + ' of ' + str(len(self.h5file)) + '):', fn)
        adl = pyt.anidataloader(self.h5dir + fn)

        To = adl.size()
        Ndc = 0
        Fmt = []
        Emt = []
        for c, data in enumerate(adl):
            Pn = data['path'] + '_' + str(f).zfill(6) + '_' + str(c).zfill(6)

            # Extract the data
            X = data['coordinates']
            E = data['energies']
            S = data['species']

            # 0.0 forces if the key doesn't exist
            if forces:
                F = data['forces']
            else:
                F = 0.0 * X

            Fmt.append(np.max(np.linalg.norm(F, axis=2), axis=1))
            Emt.append(E)
            Mv = np.max(np.linalg.norm(F, axis=2), axis=1)
            index = np.where(Mv > 10.5)[0]
            indexk = np.where(Mv <= 10.5)[0]
            Nbf += index.size

            # Keep only the low-force conformers
            X = X[indexk]
            F = F[indexk]
            E = E[indexk]

            Esae = hdt.compute_sae(self.netdict['saefile'], S)

            hidx = np.where(np.abs(E - Esae) > 10.0)
            lidx = np.where(np.abs(E - Esae) <= 10.0)

            if hidx[0].size > 0:
                print('  -(' + str(c).zfill(3) + ') High energies detected:\n    ', E[hidx])

            X = X[lidx]
            E = E[lidx]
            F = F[lidx]

            Ndc += E.size

            if (set(S).issubset(self.netdict['atomtyp'])):
                # Random mask
                R = np.random.uniform(0.0, 1.0, E.shape[0])
                idx = np.array([interval(r, N) for r in R])

                # Build random split lists
                split = []
                for j in range(N):
                    split.append([i for i, s in enumerate(idx) if s == j])
                    nd = len([i for i, s in enumerate(idx) if s == j])
                    Nd[j] = Nd[j] + nd

                # Store data
                for i, t, v, te in zip(range(N), cachet, cachev, testh5):
                    ## Store training data
                    X_t = np.array(np.concatenate([X[s] for j, s in enumerate(split) if j != i]),
                                   order='C', dtype=np.float32)
                    F_t = np.array(np.concatenate([F[s] for j, s in enumerate(split) if j != i]),
                                   order='C', dtype=np.float32)
                    E_t = np.array(np.concatenate([E[s] for j, s in enumerate(split) if j != i]),
                                   order='C', dtype=np.float64)

                    if E_t.shape[0] != 0:
                        t.insertdata(X_t, F_t, E_t, list(S))

                    ## Store validation data
                    if np.array(split[i]).size > 0:
                        X_v = np.array(X[split[i]], order='C', dtype=np.float32)
                        F_v = np.array(F[split[i]], order='C', dtype=np.float32)
                        E_v = np.array(E[split[i]], order='C', dtype=np.float64)

                        if E_v.shape[0] != 0:
                            v.insertdata(X_v, F_v, E_v, list(S))

    # Print some stats
    print('Data count:', Nd)
    print('Data split:', 100.0 * Nd / np.sum(Nd), '%')

    # Save train and valid meta file and cleanup testh5
    for t, v, th in zip(cachet, cachev, testh5):
        t.makemetadata()
        v.makemetadata()
        th.cleanup()
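# build_training_cache calls an interval(r, N) helper that is defined elsewhere in the
# codebase; the way it is used implies it maps a uniform random draw in [0, 1) to one
# of N folds. A minimal implementation consistent with that usage (an assumption, not
# the original helper) would be:
def interval(r, N):
    # Map r in [0, 1) to a fold index in {0, ..., N-1}.
    return min(int(r * N), N - 1)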
import pyanitools as pya
import json

fns = glob('../ANI-1_release/ani*.h5')

frames_different_molecule = []
frames_different_molecule_test = []
frames100_conf_per_mol = []

total_molecules = 22057374
seed = 2020

Nstruct = 0
for it, fn in enumerate(fns):
    print(fn)
    adl = pya.anidataloader(fn)

    # Print the species of the data set one by one
    for in_data, data in enumerate(adl):
        # Extract the data
        E = data['energies']
        mol_in_the_block = E.shape[0]
        shifts = np.random.RandomState(seed=seed).permutation(np.arange(mol_in_the_block))
        frames_different_molecule.append(Nstruct + shifts[0])
        frames_different_molecule_test.append(Nstruct + shifts[1])
        Nstruct += mol_in_the_block

    adl.cleanup()

print('Number of molecules in the dataset is ' + str(Nstruct))

frames = {'frames': frames_different_molecule,
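# Because a fresh RandomState seeded with the same value is constructed for every
# molecule block, the two selected frame indices per molecule are reproducible across
# runs. A compact illustration of that property with a toy block size:
import numpy as np

seed = 2020
block = 12  # hypothetical conformer count for one molecule

a = np.random.RandomState(seed=seed).permutation(np.arange(block))
b = np.random.RandomState(seed=seed).permutation(np.arange(block))
print(a[:2], b[:2], np.array_equal(a, b))  # identical picks both times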
# hdf5file = '/home/jujuman/Research/ANI-DATASET/ani-1_data_c03.h5'

storecac = '/home/jujuman/Research/SingleNetworkTest/cache06/'
saef = "/home/jujuman/Research/SingleNetworkTest/sae_6-31gd.dat"
path = "/home/jujuman/Research/SingleNetworkTest/cache06/testset/testset.h5"

# Declare data cache
cachet = cg('_train', saef, storecac, False)
cachev = cg('_valid', saef, storecac, False)

# Declare test cache
dpack = pyt.datapacker(path)

for f in hdf5files:
    # Construct the data loader class
    print(f)
    adl = pyt.anidataloader(f[0])
    print(adl.get_group_list())

    # Loop over data in set
    dc = 0
    for i, data in enumerate(adl):
        # if (i == 2):
        xyz = np.array_split(data['coordinates'], 10)
        eng = np.array_split(data['energies'], 10)
        spc = data['species']
        nme = data['parent']

        # print('Parent: ', nme, eng)

        dc = dc + np.concatenate(eng[0:8]).shape[0]
def build_strided_training_cache(self, Nblocks, Nvalid, Ntest,
                                 build_test=True,
                                 forces=True, grad=False,
                                 Fkey='forces', forces_unit=1.0,
                                 Ekey='energies', energy_unit=1.0,
                                 Eax0sum=False):
    if not os.path.isfile(self.netdict['saefile']):
        self.sae_linear_fitting(Ekey=Ekey, energy_unit=energy_unit, Eax0sum=Eax0sum)

    h5d = self.h5dir

    store_dir = self.train_root + "cache-data-"
    N = self.Nn
    Ntrain = Nblocks - Nvalid - Ntest

    if Nblocks % N != 0:
        raise ValueError('Error: number of networks must evenly divide number of blocks.')

    Nstride = Nblocks / N

    for i in range(N):
        if not os.path.exists(store_dir + str(i)):
            os.mkdir(store_dir + str(i))

        if build_test:
            if os.path.exists(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5'):
                os.remove(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5')

            if not os.path.exists(store_dir + str(i) + '/../testset'):
                os.mkdir(store_dir + str(i) + '/../testset')

    cachet = [
        cg('_train', self.netdict['saefile'], store_dir + str(r) + '/', False)
        for r in range(N)
    ]
    cachev = [
        cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/', False)
        for r in range(N)
    ]

    if build_test:
        testh5 = [
            pyt.datapacker(store_dir + str(r) + '/../testset/testset' + str(r) + '.h5')
            for r in range(N)
        ]

    E = []
    data_count = np.zeros((N, 3), dtype=np.int32)
    for f in self.h5file:
        adl = pyt.anidataloader(h5d + f)
        for data in adl:
            # print(data['path'], data['energies'].size)
            S = data['species']

            if data[Ekey].size > 0 and (set(S).issubset(self.netdict['atomtyp'])):

                X = np.array(data['coordinates'], order='C', dtype=np.float32)

                if Eax0sum:
                    E = energy_unit * np.sum(np.array(data[Ekey], order='C', dtype=np.float64), axis=1)
                else:
                    E = energy_unit * np.array(data[Ekey], order='C', dtype=np.float64)

                # Negate when the stored quantity is a gradient rather than a force
                if forces and not grad:
                    F = forces_unit * np.array(data[Fkey], order='C', dtype=np.float32)
                elif forces and grad:
                    F = -forces_unit * np.array(data[Fkey], order='C', dtype=np.float32)
                else:
                    F = 0.0 * X

                # Build random split index
                ridx = np.random.randint(0, Nblocks, size=E.size)
                Didx = [
                    np.argsort(ridx)[np.where(ridx == i)]
                    for i in range(Nblocks)
                ]

                # Build training cache
                for nid, cache in enumerate(cachet):
                    set_idx = np.concatenate([
                        Didx[((bid + nid * int(Nstride)) % Nblocks)]
                        for bid in range(Ntrain)
                    ])
                    if set_idx.size != 0:
                        data_count[nid, 0] += set_idx.size
                        cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                # Build validation cache
                for nid, cache in enumerate(cachev):
                    set_idx = np.concatenate([
                        Didx[(Ntrain + bid + nid * int(Nstride)) % Nblocks]
                        for bid in range(Nvalid)
                    ])
                    if set_idx.size != 0:
                        data_count[nid, 1] += set_idx.size
                        cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                # Build test set
                if build_test:
                    for nid, th5 in enumerate(testh5):
                        set_idx = np.concatenate([
                            Didx[(Ntrain + Nvalid + bid + nid * int(Nstride)) % Nblocks]
                            for bid in range(Ntest)
                        ])
                        if set_idx.size != 0:
                            data_count[nid, 2] += set_idx.size
                            th5.store_data(f + data['path'],
                                           coordinates=X[set_idx],
                                           forces=F[set_idx],
                                           energies=E[set_idx],
                                           species=list(S))

    # Save train and valid meta file and cleanup testh5
    for t, v in zip(cachet, cachev):
        t.makemetadata()
        v.makemetadata()

    if build_test:
        for th in testh5:
            th.cleanup()

    print(' Train ', ' Valid ', ' Test ')
    print(data_count)
    print('Training set built.')
import hdnntools as hdt
import pyanitools as pyt
import os

file = '/home/jujuman/Research/DataReductionMethods/model6/model0.05me/ani_red_c06.h5'
sdir = '/home/jujuman/Research/GDB-11-AL-wB97x631gd/'

aload = pyt.anidataloader(file)

for data in aload:
    X = data['coordinates']
    S = data['species']
    P = data['path']

    parent = P.split('/')[1]
    index = P.split('/')[2].split('mol')[1].zfill(7)

    path = sdir + parent
    if not os.path.exists(path):
        os.mkdir(path)

    print(path + '/' + parent + '-' + index + '.xyz', 'DATA:', X.shape[0])
    hdt.writexyzfile(path + '/' + parent + '-' + index + '.xyz', X, S)
import os
import pickle

import pyanitools
from neurochem_calculator import NeuroChem, path
import tqdm

neurochem = NeuroChem()

# generate expect for ANI1 subset
mol_count = 0
for i in [1, 2, 3, 4]:
    data_file = os.path.join(
        path, '../../dataset/ani1-up_to_gdb4/ani_gdb_s0{}.h5'.format(i))
    adl = pyanitools.anidataloader(data_file)
    for data in tqdm.tqdm(adl, desc='ANI1: {} heavy atoms'.format(i)):
        coordinates = data['coordinates'][:10, :]
        pickleobj = neurochem(coordinates, data['species'])
        dumpfile = os.path.join(
            path, '../../tests/test_data/ANI1_subset/{}'.format(mol_count))
        with open(dumpfile, 'wb') as f:
            pickle.dump(pickleobj, f)
        mol_count += 1
def shard_generator():
    shard_size = 4096 * 64
    row_idx = 0
    group_idx = 0
    X_cache = []
    y_cache = []
    w_cache = []
    ids_cache = []
    for hdf5file in hdf5files:
        adl = pya.anidataloader(hdf5file)
        for data in adl:

            # Extract the data
            P = data['path']
            R = data['coordinates']
            E = data['energies']
            S = data['species']
            smi = data['smiles']

            if len(S) > 23:
                print("skipping:", smi, "due to atom count.")
                continue

            # Print the data
            print("Processing: ", P)
            print("  Smiles:      ", "".join(smi))
            print("  Symbols:     ", S)
            print("  Coordinates: ", R.shape)
            print("  Energies:    ", E.shape)

            Z_padded = np.zeros((23,), dtype=np.float32)
            nonpadded = convert_species_to_atomic_nums(S)
            Z_padded[:nonpadded.shape[0]] = nonpadded

            if mode == "relative":
                offset = np.amin(E)
            elif mode == "atomization":
                # self-interaction energies taken from
                # https://github.com/isayev/ANI1_dataset README
                atomizationEnergies = {
                    0: 0,
                    1: -0.500607632585,
                    6: -37.8302333826,
                    7: -54.5680045287,
                    8: -75.0362229210
                }
                offset = 0
                for z in nonpadded:
                    offset -= atomizationEnergies[z]
            elif mode == "absolute":
                offset = 0
            else:
                raise Exception("Unsupported mode: ", mode)

            for k in range(len(E)):
                R_padded = np.zeros((23, 3), dtype=np.float32)
                R_padded[:R[k].shape[0], :R[k].shape[1]] = R[k]
                X = np.concatenate([np.expand_dims(Z_padded, 1), R_padded], axis=1)
                y = E[k] - offset

                # Flush a full shard before recording the current conformer so
                # no example is dropped at the shard boundary.
                if len(X_cache) == shard_size:
                    yield np.array(X_cache), np.array(y_cache), np.array(w_cache), np.array(ids_cache)

                    X_cache = []
                    y_cache = []
                    w_cache = []
                    ids_cache = []

                X_cache.append(X)
                y_cache.append(np.array(y).reshape((1,)))
                w_cache.append(np.array(1).reshape((1,)))
                ids_cache.append(row_idx)
                row_idx += 1
                groups.append(group_idx)

            group_idx += 1

    # flush once more at the end
    if len(X_cache) > 0:
        yield np.array(X_cache), np.array(y_cache), np.array(w_cache), np.array(ids_cache)
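# Each conformer above is padded to a fixed 23-atom layout before caching: a length-23
# vector of atomic numbers (zeros mark padding slots) is joined column-wise with a
# 23x3 coordinate block, giving one 23x4 matrix per conformer. A standalone sketch of
# just that padding step, on a hypothetical 5-atom molecule:
import numpy as np

MAX_ATOMS = 23

Z = np.array([6, 1, 1, 1, 1], dtype=np.float32)  # atomic numbers of a methane-like toy molecule
R = np.random.randn(5, 3).astype(np.float32)     # its (Na, 3) coordinates

Z_padded = np.zeros((MAX_ATOMS,), dtype=np.float32)
Z_padded[:Z.shape[0]] = Z

R_padded = np.zeros((MAX_ATOMS, 3), dtype=np.float32)
R_padded[:R.shape[0], :] = R

# One row per atom slot: [Z, x, y, z]
X = np.concatenate([np.expand_dims(Z_padded, 1), R_padded], axis=1)
print(X.shape)  # (23, 4)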
import pyanitools as pyt

adl = pyt.anidataloader(
    '/home/jujuman/Research/ANI-DATASET/h5data/r10_ccsd.h5')

for i, data in enumerate(adl):
    print(data['energies'])
def generate_stats(self, maxe=sys.float_info.max, forces=True, grad=False):
    self.tdata = dict()
    for key in self.tsfiles.keys():
        print('   -Working on', key, '...')

        cdata = dict({
            'Eani': [],
            'Edft': [],
            'Fani': [],
            'Fdft': [],
            'dEani': [],
            'dEdft': [],
            'Na': [],
            'Na2': [],
        })

        for file in self.tsfiles[key]:
            adl = ant.anidataloader(file)
            for i, data in enumerate(adl):
                # if i > 5:
                #     break

                if data['coordinates'].shape[0] != 0:
                    Eani, Fani, sig = self.compute_energyandforce_conformations(
                        data['coordinates'], data['species'], ensemble=False)

                    midx = np.where(data['energies'] - data['energies'].min() < maxe / hdt.hatokcal)[0]
                    Eani = Eani[:, midx]
                    Edft = data['energies'][midx]
                    Fani = Fani[:, midx, :, :]
                    if forces:
                        if grad:
                            Fdft = -data['forces'][midx]
                        else:
                            Fdft = data['forces'][midx]
                    else:
                        Fdft = 0.0 * data['coordinates']

                    # Eestd = np.std(Eani, axis=0) / np.sqrt(len(data['species']))
                    Eeani = np.mean(Eani, axis=0).reshape(1, -1)
                    Feani = np.mean(Fani, axis=0).flatten().reshape(1, -1)
                    Fani = Fani.reshape(Fani.shape[0], -1)

                    Eani = np.vstack([Eani, Eeani])
                    Fani = np.vstack([Fani, Feani])

                    Edft = hdt.hatokcal * Edft
                    Fdft = hdt.hatokcal * Fdft.flatten()

                    cdata['Na'].append(np.full(Edft.size, len(data['species']), dtype=np.int32))

                    cdata['Eani'].append(Eani)
                    cdata['Edft'].append(Edft)

                    cdata['Fani'].append(Fani)
                    cdata['Fdft'].append(Fdft)
                    # cdata['Frmse'].append(np.sqrt(np.mean((Fani - Fdft).reshape(Fdft.shape[0], -1) ** 2, axis=1)))
                    # cdata['Frmae'].append(np.sqrt(np.mean(np.abs((Fani - Fdft).reshape(Fdft.shape[0], -1)), axis=1)))

                    cdata['dEani'].append(hdt.calculateKdmat(self.Nn + 1, Eani))
                    cdata['dEdft'].append(hdt.calculatedmat(Edft))

                    cdata['Na2'].append(np.full(cdata['dEdft'][-1].size, len(data['species']), dtype=np.int32))

                    # cdata['Erani'].append(Eani - Eani.min())
                    # cdata['Erdft'].append(Edft - Edft.min())

        for k in ['Na', 'Na2', 'Edft', 'Fdft', 'dEdft']:
            cdata[k] = np.concatenate(cdata[k])

        for k in ['Eani', 'Fani', 'dEani']:
            cdata[k] = np.hstack(cdata[k])

        self.tdata.update({key: cdata})
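# generate_stats stacks the per-network predictions and appends the ensemble mean as
# one extra row, so downstream statistics can treat the ensemble average as an
# (Nn+1)-th model. In miniature, with hypothetical numbers:
import numpy as np

Eani = np.random.randn(3, 4)                  # 3 networks x 4 conformers of predicted energies
Eeani = np.mean(Eani, axis=0).reshape(1, -1)  # ensemble mean as a 1 x 4 row
Eani = np.vstack([Eani, Eeani])
print(Eani.shape)                             # (4, 4): 3 networks + ensemble mean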
import pyanitools as pya

# Set the HDF5 file containing the data
hdf5file = '../ani_gdb_s01.h5'

# Construct the data loader class
adl = pya.anidataloader(hdf5file)

# Print the species of the data set one by one
for data in adl:

    # Extract the data
    P = data['path']
    X = data['coordinates']
    E = data['energies']
    S = data['species']
    sm = data['smiles']

    # Print the data
    print("Path:   ", P)
    print("  Smiles:      ", "".join(sm))
    print("  Symbols:     ", S)
    print("  Coordinates: ", X)
    print("  Energies:    ", E, "\n")

# Closes the H5 data file
adl.cleanup()
import pyanitools as pyt
# import pyaniasetools as aat
import numpy as np
import hdnntools as hdt
import os

# import matplotlib.pyplot as plt

file_old = '/home/jsmith48/scratch/auto_al/h5files/ANI-AL-0707.0000.0408.h5'
file_new = '/home/jsmith48/scratch/auto_al/h5files_fix/ANI-AL-0707.0000.0408.h5'

print('Working on file:', file_old)
adl = pyt.anidataloader(file_old)

# Data storage
dpack = pyt.datapacker(file_new, mode='w')

for i, data in enumerate(adl):
    # if i == 20:
    #     break

    X = data['coordinates']
    S = data['species']
    Edft = data['energies']
    path = data['path']
    del data['path']

    # Eani, Fani = anicv.compute_energy_conformations(X=np.array(X, dtype=np.float32), S=S)
    Esae = hdt.compute_sae(
        '/home/jsmith48/scratch/auto_al/modelCNOSFCl/sae_wb97x-631gd.dat', S)