xt_data = []
yt_data = []
xv_data = []
yv_data = []

# Iterate over the molecules of the data set one by one
for data in adl.getnextdata():
    # Extract the data; each conformer set is chunked, with the first chunk
    # used for training and the second for validation
    xyz = np.array_split(data['coordinates'], 250)
    erg = np.array_split(data['energies'], 250)
    spc = data['species']

    sae = hdt.compute_sae(
        '/home/jujuman/Research/GDB-11-wB97X-6-31gd/sae_6-31gd.dat', spc)

    print('TShape: ', xyz[0].shape[0], ' VShape: ', xyz[1].shape[0])

    xt_data.append(
        hdt.ncdata(hdt.generatedmatsd3(xyz[0]), spc, xyz[0].shape[1]))
    yt_data.append(erg[0] - sae)

    xv_data.append(
        hdt.ncdata(hdt.generatedmatsd3(xyz[1]), spc, xyz[1].shape[1]))
    yv_data.append(erg[1] - sae)

yt_data = np.concatenate(yt_data)
yv_data = np.concatenate(yv_data)

print('Training Data Shape: ', yt_data.shape)
print('Testing Data Shape: ', yv_data.shape)
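# For clarity: hdt.compute_sae looks up tabulated single-atom energies (SAE)
# for the given species list and returns their sum, so subtracting it above
# leaves an atomization-like energy. A minimal sketch of that idea, with an
# illustrative (not authoritative) two-element table:
def sum_single_atom_energies(sae_table, species):
    """Sum tabulated per-atom reference energies for a list of element symbols."""
    return sum(sae_table[s] for s in species)

# example_table = {'H': -0.5006, 'C': -37.8302}   # illustrative values (Hartree)
# sum_single_atom_energies(example_table, ['C', 'H', 'H', 'H', 'H'])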
def build_strided_training_cache(self, Nblocks, Nvalid, Ntest,
                                 build_test=True, build_valid=False,
                                 forces=True, grad=False,
                                 Fkey='forces', forces_unit=1.0,
                                 Ekey='energies', energy_unit=1.0,
                                 Eax0sum=False, rmhighe=True):
    # Fit the single-atom energies (SAE) first if the SAE file does not exist
    if not os.path.isfile(self.netdict['saefile']):
        self.sae_linear_fitting(Ekey=Ekey, energy_unit=energy_unit, Eax0sum=Eax0sum)

    h5d = self.h5dir

    store_dir = self.train_root + "cache-data-"
    N = self.Nn
    Ntrain = Nblocks - Nvalid - Ntest

    if Nblocks % N != 0:
        raise ValueError('Error: number of networks must evenly divide number of blocks.')

    Nstride = Nblocks // N

    # Create one cache directory (and optional test-set directory) per network
    for i in range(N):
        if not os.path.exists(store_dir + str(i)):
            os.mkdir(store_dir + str(i))

        if build_test:
            if os.path.exists(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5'):
                os.remove(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5')

            if not os.path.exists(store_dir + str(i) + '/../testset'):
                os.mkdir(store_dir + str(i) + '/../testset')

    cachet = [cg('_train', self.netdict['saefile'], store_dir + str(r) + '/', False)
              for r in range(N)]
    cachev = [cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/', False)
              for r in range(N)]

    if build_test:
        testh5 = [pyt.datapacker(store_dir + str(r) + '/../testset/testset' + str(r) + '.h5')
                  for r in range(N)]

    if build_valid:
        valdh5 = [pyt.datapacker(store_dir + str(r) + '/../testset/valdset' + str(r) + '.h5')
                  for r in range(N)]

    if rmhighe:
        # First pass: gather per-atom energy deviations from the SAE baseline
        # to estimate the distribution used for outlier removal
        dE = []
        for f in self.h5file:
            adl = pyt.anidataloader(h5d + f)
            for data in adl:
                S = data['species']
                E = data['energies']
                X = data['coordinates']

                Esae = hdt.compute_sae(self.netdict['saefile'], S)

                dE.append((E - Esae) / np.sqrt(len(S)))

        dE = np.concatenate(dE)
        cidx = np.where(np.abs(dE) < 15.0)
        std = np.abs(dE[cidx]).std()
        men = np.mean(dE[cidx])

        print(men, std, men + std)

        idx = np.intersect1d(np.where(dE >= -np.abs(15 * std + men))[0],
                             np.where(dE <= np.abs(11 * std + men))[0])
        cnt = idx.size
        print('DATADIST: ', dE.size, cnt, (dE.size - cnt),
              100.0 * ((dE.size - cnt) / dE.size))

    E = []
    data_count = np.zeros((N, 3), dtype=np.int32)
    for f in self.h5file:
        print('Reading data file:', h5d + f)
        adl = pyt.anidataloader(h5d + f)
        for data in adl:
            #print(data['path'], data['energies'].size)
            S = data['species']

            if data[Ekey].size > 0 and set(S).issubset(self.netdict['atomtyp']):
                X = np.array(data['coordinates'], order='C', dtype=np.float32)

                if Eax0sum:
                    E = energy_unit * np.sum(np.array(data[Ekey], order='C', dtype=np.float64), axis=1)
                else:
                    E = energy_unit * np.array(data[Ekey], order='C', dtype=np.float64)

                if forces and not grad:
                    F = forces_unit * np.array(data[Fkey], order='C', dtype=np.float32)
                elif forces and grad:
                    F = -forces_unit * np.array(data[Fkey], order='C', dtype=np.float32)
                else:
                    F = 0.0 * X

                if rmhighe:
                    # Second pass: drop conformations whose per-atom energy
                    # deviation lies outside the window estimated above
                    Esae = hdt.compute_sae(self.netdict['saefile'], S)

                    ind_dE = (E - Esae) / np.sqrt(len(S))

                    hidx = np.union1d(np.where(ind_dE < -(15.0 * std + men))[0],
                                      np.where(ind_dE > (11.0 * std + men))[0])

                    lidx = np.intersect1d(np.where(ind_dE >= -(15.0 * std + men))[0],
                                          np.where(ind_dE <= (11.0 * std + men))[0])

                    if hidx.size > 0:
                        print('  -(' + f + ':' + data['path'] + ') High energies detected:\n    ',
                              (E[hidx] - Esae) / np.sqrt(len(S)))

                    X = X[lidx]
                    E = E[lidx]
                    F = F[lidx]

                # Build random split index
                ridx = np.random.randint(0, Nblocks, size=E.size)
                Didx = [np.argsort(ridx)[np.where(ridx == i)] for i in range(Nblocks)]

                # Build training cache
                for nid, cache in enumerate(cachet):
                    set_idx = np.concatenate([Didx[(bid + nid * Nstride) % Nblocks]
                                              for bid in range(Ntrain)])
                    if set_idx.size != 0:
                        data_count[nid, 0] += set_idx.size
                        cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                # for nid, cache in enumerate(cachev):
                #     set_idx = np.concatenate([Didx[((1 + bid + nid * Nstride) % Nblocks)]
                #                               for bid in range(Ntrain)])
                #     if set_idx.size != 0:
                #         data_count[nid, 0] += set_idx.size
                #         cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                # Build validation cache
                for nid, cache in enumerate(cachev):
                    set_idx = np.concatenate([Didx[(Ntrain + bid + nid * Nstride) % Nblocks]
                                              for bid in range(Nvalid)])
                    if set_idx.size != 0:
                        data_count[nid, 1] += set_idx.size
                        cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                        if build_valid:
                            valdh5[nid].store_data(f + data['path'],
                                                   coordinates=X[set_idx],
                                                   forces=F[set_idx],
                                                   energies=E[set_idx],
                                                   species=list(S))

                # Build test set
                if build_test:
                    for nid, th5 in enumerate(testh5):
                        set_idx = np.concatenate([Didx[(Ntrain + Nvalid + bid + nid * Nstride) % Nblocks]
                                                  for bid in range(Ntest)])
                        if set_idx.size != 0:
                            data_count[nid, 2] += set_idx.size
                            th5.store_data(f + data['path'],
                                           coordinates=X[set_idx],
                                           forces=F[set_idx],
                                           energies=E[set_idx],
                                           species=list(S))

    # Save train and valid meta files and clean up the test/validation h5 packers
    for t, v in zip(cachet, cachev):
        t.makemetadata()
        v.makemetadata()

    if build_test:
        for th in testh5:
            th.cleanup()

    if build_valid:
        for vh in valdh5:
            vh.cleanup()

    print(' Train ', ' Valid ', ' Test ')
    print(data_count)
    print('Training set built.')
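# Illustrative, standalone sketch of the block-striding logic used above: each
# ensemble member (nid) sees a different rotation of the Nblocks data blocks,
# so its validation/test blocks fall inside other members' training sets. The
# parameter values below are made up for demonstration only.
Nblocks, Nn, Nvalid, Ntest = 16, 4, 1, 1   # hypothetical settings
Ntrain = Nblocks - Nvalid - Ntest
Nstride = Nblocks // Nn

for nid in range(Nn):
    train = [(bid + nid * Nstride) % Nblocks for bid in range(Ntrain)]
    valid = [(Ntrain + bid + nid * Nstride) % Nblocks for bid in range(Nvalid)]
    test = [(Ntrain + Nvalid + bid + nid * Nstride) % Nblocks for bid in range(Ntest)]
    print('net', nid, 'train:', sorted(train), 'valid:', valid, 'test:', test)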
def build_training_cache(self, forces=True):
    store_dir = self.train_root + "cache-data-"
    N = self.Nn

    # Create one cache directory (and a fresh test-set file) per network
    for i in range(N):
        if not os.path.exists(store_dir + str(i)):
            os.mkdir(store_dir + str(i))

        if os.path.exists(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5'):
            os.remove(store_dir + str(i) + '/../testset/testset' + str(i) + '.h5')

        if not os.path.exists(store_dir + str(i) + '/../testset'):
            os.mkdir(store_dir + str(i) + '/../testset')

    cachet = [cg('_train', self.netdict['saefile'], store_dir + str(r) + '/', False)
              for r in range(N)]
    cachev = [cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/', False)
              for r in range(N)]
    testh5 = [pyt.datapacker(store_dir + str(r) + '/../testset/testset' + str(r) + '.h5')
              for r in range(N)]

    Nd = np.zeros(N, dtype=np.int32)
    Nbf = 0
    for f, fn in enumerate(self.h5file):
        print('Processing file(' + str(f + 1) + ' of ' + str(len(self.h5file)) + '):', fn)
        adl = pyt.anidataloader(self.h5dir + fn)

        To = adl.size()
        Ndc = 0
        Fmt = []
        Emt = []
        for c, data in enumerate(adl):
            Pn = data['path'] + '_' + str(f).zfill(6) + '_' + str(c).zfill(6)

            # Extract the data
            X = data['coordinates']
            E = data['energies']
            S = data['species']

            # Use zero forces when force training is disabled
            if forces:
                F = data['forces']
            else:
                F = 0.0 * X

            Fmt.append(np.max(np.linalg.norm(F, axis=2), axis=1))
            Emt.append(E)
            Mv = np.max(np.linalg.norm(F, axis=2), axis=1)

            index = np.where(Mv > 10.5)[0]
            indexk = np.where(Mv <= 10.5)[0]

            Nbf += index.size

            # Drop conformations whose maximum atomic force exceeds the cutoff
            X = X[indexk]
            F = F[indexk]
            E = E[indexk]

            Esae = hdt.compute_sae(self.netdict['saefile'], S)

            hidx = np.where(np.abs(E - Esae) > 10.0)
            lidx = np.where(np.abs(E - Esae) <= 10.0)

            if hidx[0].size > 0:
                print('  -(' + str(c).zfill(3) + ') High energies detected:\n    ', E[hidx])

            X = X[lidx]
            E = E[lidx]
            F = F[lidx]

            Ndc += E.size

            if set(S).issubset(self.netdict['atomtyp']):
                # Random mask
                R = np.random.uniform(0.0, 1.0, E.shape[0])
                idx = np.array([interval(r, N) for r in R])

                # Build random split lists
                split = []
                for j in range(N):
                    split.append([i for i, s in enumerate(idx) if s == j])
                    nd = len([i for i, s in enumerate(idx) if s == j])
                    Nd[j] = Nd[j] + nd

                # Store data
                for i, t, v, te in zip(range(N), cachet, cachev, testh5):
                    ## Store training data (everything not assigned to split i)
                    X_t = np.array(np.concatenate([X[s] for j, s in enumerate(split) if j != i]),
                                   order='C', dtype=np.float32)
                    F_t = np.array(np.concatenate([F[s] for j, s in enumerate(split) if j != i]),
                                   order='C', dtype=np.float32)
                    E_t = np.array(np.concatenate([E[s] for j, s in enumerate(split) if j != i]),
                                   order='C', dtype=np.float64)

                    if E_t.shape[0] != 0:
                        t.insertdata(X_t, F_t, E_t, list(S))

                    ## Store validation data (split i only)
                    if np.array(split[i]).size > 0:
                        X_v = np.array(X[split[i]], order='C', dtype=np.float32)
                        F_v = np.array(F[split[i]], order='C', dtype=np.float32)
                        E_v = np.array(E[split[i]], order='C', dtype=np.float64)

                        if E_v.shape[0] != 0:
                            v.insertdata(X_v, F_v, E_v, list(S))

    # Print some stats
    print('Data count:', Nd)
    print('Data split:', 100.0 * Nd / np.sum(Nd), '%')

    # Save train and valid meta files and clean up the test h5 packers
    for t, v, th in zip(cachet, cachev, testh5):
        t.makemetadata()
        v.makemetadata()
        th.cleanup()
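# The interval() helper used above is not defined in this excerpt; a minimal
# sketch consistent with how it is called (map a uniform random number in
# [0, 1) to one of S equal bins) might look like the following. This is an
# assumption about its behavior, not the original implementation.
def interval(v, S):
    """Return the index of the bin of width 1/S that contains v."""
    ps = 1.0 / S
    for s in range(S):
        if s * ps <= v < (s + 1) * ps:
            return s
    return S - 1  # guard for v == 1.0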
wkdir2 = '/home/jujuman/Dropbox/ChemSciencePaper.AER/ANI-c08e-ccdissotest1-ntwk/'
cnstfile2 = wkdir2 + 'rHCNO-4.6A_16-3.1A_a4-8.params'
saefile2 = wkdir2 + 'sae_6-31gd.dat'
nnfdir2 = wkdir2 + 'networks/'

# Construct pyNeuroChem classes
nc = pync.conformers(cnstfile, saefile, nnfdir, 1)
nc2 = pync.conformers(cnstfile2, saefile2, nnfdir2, 1)

scan = hdt.readncdat(
    '/home/jujuman/Research/GDB-11-wB97X-6-31gd/dnnts_testdata/waterdimerscan/ethene_diss.dat',
    type=np.float32)
sdat = [hdt.ncdata(hdt.generatedmatsd3(scan[0]), scan[1], scan[0].shape[1])]

sae = hdt.compute_sae(saefile, scan[1])
serg = scan[2] - sae

# Set the conformers in NeuroChem
nc.setConformers(confs=scan[0], types=list(scan[1]))
nc2.setConformers(confs=scan[0], types=list(scan[1]))

x = 0.05 * np.array(range(serg.shape[0]), dtype=np.float64) + 0.6
print(len(x))

popt = np.load('mp_ani_params_test.npz')['param']
fsEc = hdt.buckingham_pot(sdat, *popt)
#fsEc = hdt.src_pot(sdat)

aerg = nc.energy() + fsEc - sae
a2erg = nc2.energy() - sae
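# hdt.buckingham_pot above applies a fitted short-range pairwise correction.
# For reference, a Buckingham pair potential has the general form
# E(r) = A * exp(-B * r) - C / r**6; the standalone sketch below uses
# hypothetical parameters, not the fitted popt values loaded above.
import numpy as np

def buckingham_pair(r, A, B, C):
    """Buckingham pair energy: exponential repulsion plus r^-6 dispersion."""
    return A * np.exp(-B * r) - C / r**6

# Example: energy of a single pair at r = 1.5 (arbitrary units)
# buckingham_pair(1.5, A=1.0e3, B=3.0, C=1.0)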
print("MAX FORCE:", F.max(),S) if F.max() > 0.0: print(np.mean(F.reshape(E.size,F.shape[1]*F.shape[2]),axis=1).shape, E.size) plt.hist(np.max(np.abs(F).reshape(E.size,F.shape[1]*F.shape[2]),axis=1),bins=100) plt.show() plt.scatter(np.max(np.abs(F).reshape(E.size,F.shape[1]*F.shape[2]),axis=1), E) plt.show() ''' Ru = np.random.uniform(0.0, 1.0, E.shape[0]) nidx = np.where(Ru < 1.0) X = X[nidx] F = F[nidx] E = E[nidx] Esae = hdn.compute_sae(saef, S) Hidx = np.where(E - Esae < 4.5) X = X[Hidx] F = F[Hidx] E = E[Hidx] Hidx = np.where(E - Esae > -20.0) X = X[Hidx] F = F[Hidx] E = E[Hidx] Ndc += E.size #for i in range(E.size): # X[i] = X[0]
# Data storage
dpack = pyt.datapacker(file_new, mode='w')

for i, data in enumerate(adl):
    #if i == 20:
    #    break

    X = data['coordinates']
    S = data['species']
    Edft = data['energies']
    path = data['path']
    del data['path']

    #Eani, Fani = anicv.compute_energy_conformations(X=np.array(X, dtype=np.float32), S=S)

    Esae = hdt.compute_sae(
        '/home/jsmith48/scratch/auto_al/modelCNOSFCl/sae_wb97x-631gd.dat', S)

    idx = np.where(np.abs(Edft - Esae) < 5.0)
    bidx = np.where(np.abs(Edft - Esae) >= 5.0)

    if bidx[0].size > 0:
        # SAE check
        print(S)
        print(bidx, np.abs(Edft - Esae))
        #hdt.writexyzfile(file_new + 'file_' + str(i).zfill(5) + '.xyz', X[bidx], S)

    #Eani_m = np.mean(Eani, axis=0)
    #Fani = np.mean(Fani, axis=0)

    #err = Eani_m - Edft
    #pae = np.abs(err) / np.sqrt(float(len(S)))

    #idx = np.where(pae > 0.15)
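# For reference, a minimal standalone sketch of the pyanitools datapacker
# round trip used throughout these scripts (the file name, group key, and
# arrays below are made up for illustration; the store_data/cleanup calls
# mirror the usage seen in the functions above):
import numpy as np
import pyanitools as pyt

dp = pyt.datapacker('/tmp/example_ani.h5', mode='w')              # hypothetical output path
dp.store_data('gdb11_s01/mol0000',                                # group key within the HDF5 file
              coordinates=np.zeros((1, 3, 3), dtype=np.float32),  # placeholder conformer
              energies=np.zeros(1, dtype=np.float64),
              species=['O', 'H', 'H'])
dp.cleanup()  # flush and close the new HDF5 file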