xt_data = []
yt_data = []

xv_data = []
yv_data = []

# Iterate over the molecules in the data set one at a time
for data in adl.getnextdata():

    # Extract the data
    xyz = np.array_split(data['coordinates'], 250)
    erg = np.array_split(data['energies'], 250)
    spc = data['species']

    sae = hdt.compute_sae(
        '/home/jujuman/Research/GDB-11-wB97X-6-31gd/sae_6-31gd.dat', spc)
    print('TShape: ', xyz[0].shape[0], ' VShape: ', xyz[1].shape[0])

    xt_data.append(
        hdt.ncdata(hdt.generatedmatsd3(xyz[0]), spc, xyz[0].shape[1]))
    yt_data.append(erg[0] - sae)

    xv_data.append(
        hdt.ncdata(hdt.generatedmatsd3(xyz[1]), spc, xyz[1].shape[1]))
    yv_data.append(erg[1] - sae)

yt_data = np.concatenate(yt_data)
yv_data = np.concatenate(yv_data)

print('Training Data Shape: ', yt_data.shape)
print('Validation Data Shape: ', yv_data.shape)
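
# A minimal, self-contained sketch (synthetic shapes, not the real data set)
# of the split above: np.array_split cuts the conformers along axis 0 into
# nearly equal chunks, and the loop keeps chunk 0 for training and chunk 1
# for validation.
import numpy as np

coords = np.random.rand(1000, 8, 3)      # 1000 conformers of an 8-atom molecule
chunks = np.array_split(coords, 250)     # 250 chunks of ~4 conformers each
print(chunks[0].shape, chunks[1].shape)  # (4, 8, 3) (4, 8, 3)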
Example #2
    def build_strided_training_cache(self,
                                     Nblocks,
                                     Nvalid,
                                     Ntest,
                                     build_test=True,
                                     build_valid=False,
                                     forces=True,
                                     grad=False,
                                     Fkey='forces',
                                     forces_unit=1.0,
                                     Ekey='energies',
                                     energy_unit=1.0,
                                     Eax0sum=False,
                                     rmhighe=True):
        if not os.path.isfile(self.netdict['saefile']):
            self.sae_linear_fitting(Ekey=Ekey,
                                    energy_unit=energy_unit,
                                    Eax0sum=Eax0sum)
        h5d = self.h5dir
        store_dir = self.train_root + "cache-data-"
        N = self.Nn
        Ntrain = Nblocks - Nvalid - Ntest
        if Nblocks % N != 0:
            raise ValueError(
                'Error: number of networks must evenly divide number of blocks.'
            )
        Nstride = Nblocks // N  # integer stride; divisibility is checked above
        for i in range(N):
            if not os.path.exists(store_dir + str(i)):
                os.mkdir(store_dir + str(i))
            if build_test:
                if os.path.exists(store_dir + str(i) + '/../testset/testset' +
                                  str(i) + '.h5'):
                    os.remove(store_dir + str(i) + '/../testset/testset' +
                              str(i) + '.h5')
                if not os.path.exists(store_dir + str(i) + '/../testset'):
                    os.mkdir(store_dir + str(i) + '/../testset')
        cachet = [
            cg('_train', self.netdict['saefile'], store_dir + str(r) + '/',
               False) for r in range(N)
        ]
        cachev = [
            cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/',
               False) for r in range(N)
        ]

        if build_test:
            testh5 = [
                pyt.datapacker(store_dir + str(r) + '/../testset/testset' +
                               str(r) + '.h5') for r in range(N)
            ]

        if build_valid:
            valdh5 = [
                pyt.datapacker(store_dir + str(r) + '/../testset/valdset' +
                               str(r) + '.h5') for r in range(N)
            ]

        if rmhighe:
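            # First pass over the data: collect each conformer's energy
            # relative to the SAE baseline, scaled by 1/sqrt(Natoms), then
            # estimate the mean/std of that distribution so the second pass
            # below can drop high-energy outliers.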
            dE = []
            for f in self.h5file:
                adl = pyt.anidataloader(h5d + f)
                for data in adl:
                    S = data['species']
                    E = data['energies']
                    X = data['coordinates']

                    Esae = hdt.compute_sae(self.netdict['saefile'], S)

                    dE.append((E - Esae) / np.sqrt(len(S)))

            dE = np.concatenate(dE)
            cidx = np.where(np.abs(dE) < 15.0)
            std = np.abs(dE[cidx]).std()
            men = np.mean(dE[cidx])

            print('dE mean, std, mean+std:', men, std, men + std)
            idx = np.intersect1d(
                np.where(dE >= -np.abs(15 * std + men))[0],
                np.where(dE <= np.abs(11 * std + men))[0])
            cnt = idx.size
            print('DATADIST: ', dE.size, cnt, (dE.size - cnt),
                  100.0 * ((dE.size - cnt) / dE.size))

        E = []
        data_count = np.zeros((N, 3), dtype=np.int32)
        for f in self.h5file:
            print('Reading data file:', h5d + f)
            adl = pyt.anidataloader(h5d + f)
            for data in adl:
                #print(data['path'],data['energies'].size)

                S = data['species']

                if data[Ekey].size > 0 and (set(S).issubset(
                        self.netdict['atomtyp'])):

                    X = np.array(data['coordinates'],
                                 order='C',
                                 dtype=np.float32)

                    #print(np.array(data[Ekey].shape),np.sum(np.array(data[Ekey], order='C', dtype=np.float64),axis=1).shape,data[Fkey].shape)

                    if Eax0sum:
                        E = energy_unit * np.sum(np.array(
                            data[Ekey], order='C', dtype=np.float64),
                                                 axis=1)
                    else:
                        E = energy_unit * np.array(
                            data[Ekey], order='C', dtype=np.float64)

                    if forces and not grad:
                        F = forces_unit * np.array(
                            data[Fkey], order='C', dtype=np.float32)
                    elif forces and grad:
                        F = -forces_unit * np.array(
                            data[Fkey], order='C', dtype=np.float32)
                    else:
                        F = 0.0 * X

                    if rmhighe:
                        Esae = hdt.compute_sae(self.netdict['saefile'], S)

                        ind_dE = (E - Esae) / np.sqrt(len(S))

                        hidx = np.union1d(
                            np.where(ind_dE < -(15.0 * std + men))[0],
                            np.where(ind_dE > (11.0 * std + men))[0])
                        lidx = np.intersect1d(
                            np.where(ind_dE >= -(15.0 * std + men))[0],
                            np.where(ind_dE <= (11.0 * std + men))[0])

                        if hidx.size > 0:
                            print(
                                '  -(' + f + ':' + data['path'] +
                                ')High energies detected:\n    ',
                                (E[hidx] - Esae) / np.sqrt(len(S)))

                        X = X[lidx]
                        E = E[lidx]
                        F = F[lidx]

                    # Build random split index
                    ridx = np.random.randint(0, Nblocks, size=E.size)
                    Didx = [
                        np.argsort(ridx)[np.where(ridx == i)]
                        for i in range(Nblocks)
                    ]

                    # Build training cache
                    for nid, cache in enumerate(cachet):
                        set_idx = np.concatenate([
                            Didx[((bid + nid * int(Nstride)) % Nblocks)]
                            for bid in range(Ntrain)
                        ])
                        if set_idx.size != 0:
                            data_count[nid, 0] += set_idx.size
                            cache.insertdata(X[set_idx], F[set_idx],
                                             E[set_idx], list(S))

                    # for nid,cache in enumerate(cachev):
                    #     set_idx = np.concatenate([Didx[((1+bid+nid*int(Nstride)) % Nblocks)] for bid in range(Ntrain)])
                    #     if set_idx.size != 0:
                    #         data_count[nid,0]+=set_idx.size
                    #         cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                    for nid, cache in enumerate(cachev):
                        set_idx = np.concatenate([
                            Didx[(Ntrain + bid + nid * int(Nstride)) % Nblocks]
                            for bid in range(Nvalid)
                        ])
                        if set_idx.size != 0:
                            data_count[nid, 1] += set_idx.size
                            cache.insertdata(X[set_idx], F[set_idx],
                                             E[set_idx], list(S))
                            if build_valid:
                                valdh5[nid].store_data(f + data['path'],
                                                       coordinates=X[set_idx],
                                                       forces=F[set_idx],
                                                       energies=E[set_idx],
                                                       species=list(S))

                    if build_test:
                        for nid, th5 in enumerate(testh5):
                            set_idx = np.concatenate([
                                Didx[(Ntrain + Nvalid + bid +
                                      nid * int(Nstride)) % Nblocks]
                                for bid in range(Ntest)
                            ])
                            if set_idx.size != 0:
                                data_count[nid, 2] += set_idx.size
                                th5.store_data(f + data['path'],
                                               coordinates=X[set_idx],
                                               forces=F[set_idx],
                                               energies=E[set_idx],
                                               species=list(S))

        # Save train and valid meta file and cleanup testh5
        for t, v in zip(cachet, cachev):
            t.makemetadata()
            v.makemetadata()

        if build_test:
            for th in testh5:
                th.cleanup()

        if build_valid:
            for vh in valdh5:
                vh.cleanup()

        print(' Train ', ' Valid ', ' Test ')
        print(data_count)
        print('Training set built.')
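
# A small illustration (made-up sizes, standalone, outside the class above) of
# the strided block assignment used in build_strided_training_cache: each of
# the N networks gets a rotated view of the same Nblocks data blocks, so every
# network trains, validates, and tests on different blocks.
Nblocks, Nvalid, Ntest, N = 8, 1, 1, 4
Ntrain = Nblocks - Nvalid - Ntest  # 6 training blocks per network
Nstride = Nblocks // N
for nid in range(N):
    train = [(bid + nid * Nstride) % Nblocks for bid in range(Ntrain)]
    valid = [(Ntrain + bid + nid * Nstride) % Nblocks for bid in range(Nvalid)]
    test = [(Ntrain + Nvalid + bid + nid * Nstride) % Nblocks for bid in range(Ntest)]
    print('network', nid, 'train:', train, 'valid:', valid, 'test:', test)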
Example #3
 def build_training_cache(self, forces=True):
     store_dir = self.train_root + "cache-data-"
     N = self.Nn
     for i in range(N):
         if not os.path.exists(store_dir + str(i)):
             os.mkdir(store_dir + str(i))
         if os.path.exists(store_dir + str(i) + '/../testset/testset' +
                           str(i) + '.h5'):
             os.remove(store_dir + str(i) + '/../testset/testset' + str(i) +
                       '.h5')
         if not os.path.exists(store_dir + str(i) + '/../testset'):
             os.mkdir(store_dir + str(i) + '/../testset')
     cachet = [
         cg('_train', self.netdict['saefile'], store_dir + str(r) + '/',
            False) for r in range(N)
     ]
     cachev = [
         cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/',
            False) for r in range(N)
     ]
     testh5 = [
         pyt.datapacker(store_dir + str(r) + '/../testset/testset' +
                        str(r) + '.h5') for r in range(N)
     ]
     Nd = np.zeros(N, dtype=np.int32)
     Nbf = 0
     for f, fn in enumerate(self.h5file):
         print(
             'Processing file(' + str(f + 1) + ' of ' +
             str(len(self.h5file)) + '):', fn)
         adl = pyt.anidataloader(self.h5dir + fn)
         To = adl.size()
         Ndc = 0
         Fmt = []
         Emt = []
         for c, data in enumerate(adl):
             Pn = data['path'] + '_' + str(f).zfill(6) + '_' + str(c).zfill(
                 6)
             # Extract the data
             X = data['coordinates']
             E = data['energies']
             S = data['species']
             # Use zero forces when the forces flag is disabled
             if forces:
                 F = data['forces']
             else:
                 F = 0.0 * X
             Mv = np.max(np.linalg.norm(F, axis=2), axis=1)
             Fmt.append(Mv)
             Emt.append(E)
             index = np.where(Mv > 10.5)[0]
             indexk = np.where(Mv <= 10.5)[0]
             Nbf += index.size
             # Drop conformations whose maximum atomic force exceeds 10.5
             X = X[indexk]
             F = F[indexk]
             E = E[indexk]
             Esae = hdt.compute_sae(self.netdict['saefile'], S)
             hidx = np.where(np.abs(E - Esae) > 10.0)
             lidx = np.where(np.abs(E - Esae) <= 10.0)
             if hidx[0].size > 0:
                 print(
                     '  -(' + str(c).zfill(3) +
                     ')High energies detected:\n    ', E[hidx])
             X = X[lidx]
             E = E[lidx]
             F = F[lidx]
             Ndc += E.size
             if (set(S).issubset(self.netdict['atomtyp'])):
                 # Random mask
                 R = np.random.uniform(0.0, 1.0, E.shape[0])
                 idx = np.array([interval(r, N) for r in R])
                 # Build random split lists
                 split = []
                 for j in range(N):
                     split.append([i for i, s in enumerate(idx) if s == j])
                     nd = len([i for i, s in enumerate(idx) if s == j])
                     Nd[j] = Nd[j] + nd
                 # Store data
                 for i, t, v, te in zip(range(N), cachet, cachev, testh5):
                     ## Store training data
                     X_t = np.array(np.concatenate(
                         [X[s] for j, s in enumerate(split) if j != i]),
                                    order='C',
                                    dtype=np.float32)
                     F_t = np.array(np.concatenate(
                         [F[s] for j, s in enumerate(split) if j != i]),
                                    order='C',
                                    dtype=np.float32)
                     E_t = np.array(np.concatenate(
                         [E[s] for j, s in enumerate(split) if j != i]),
                                    order='C',
                                    dtype=np.float64)
                     if E_t.shape[0] != 0:
                         t.insertdata(X_t, F_t, E_t, list(S))
                     ## Store Validation
                     if np.array(split[i]).size > 0:
                         X_v = np.array(X[split[i]],
                                        order='C',
                                        dtype=np.float32)
                         F_v = np.array(F[split[i]],
                                        order='C',
                                        dtype=np.float32)
                         E_v = np.array(E[split[i]],
                                        order='C',
                                        dtype=np.float64)
                         if E_v.shape[0] != 0:
                             v.insertdata(X_v, F_v, E_v, list(S))
     # Print some stats
     print('Data count:', Nd)
     print('Data split:', 100.0 * Nd / np.sum(Nd), '%')
     # Save train and valid meta file and cleanup testh5
     for t, v, th in zip(cachet, cachev, testh5):
         t.makemetadata()
         v.makemetadata()
         th.cleanup()
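
# The interval() helper used above is not defined in this example; a plausible
# minimal implementation (an assumption, not the library's actual code) maps a
# uniform random number r in [0, 1) to one of N equal-width bins:
def interval(r, N):
    """Return the index 0..N-1 of the bin containing r."""
    return min(int(r * N), N - 1)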
Example #4
wkdir2 = '/home/jujuman/Dropbox/ChemSciencePaper.AER/ANI-c08e-ccdissotest1-ntwk/'
cnstfile2 = wkdir2 + 'rHCNO-4.6A_16-3.1A_a4-8.params'
saefile2 = wkdir2 + 'sae_6-31gd.dat'
nnfdir2 = wkdir2 + 'networks/'

# Construct pyNeuroChem classes
nc = pync.conformers(cnstfile, saefile, nnfdir, 1)
nc2 = pync.conformers(cnstfile2, saefile2, nnfdir2, 1)

scan = hdt.readncdat(
    '/home/jujuman/Research/GDB-11-wB97X-6-31gd/dnnts_testdata/waterdimerscan/ethene_diss.dat',
    type=np.float32)

sdat = [hdt.ncdata(hdt.generatedmatsd3(scan[0]), scan[1], scan[0].shape[1])]

sae = hdt.compute_sae(saefile, scan[1])
serg = scan[2] - sae

# Set the conformers in NeuroChem
nc.setConformers(confs=scan[0], types=list(scan[1]))
nc2.setConformers(confs=scan[0], types=list(scan[1]))

x = 0.05 * np.array(range(serg.shape[0]), dtype=np.float64) + 0.6
print(len(x))

popt = np.load('mp_ani_params_test.npz')['param']
fsEc = hdt.buckingham_pot(sdat, *popt)
#fsEc = hdt.src_pot(sdat)

aerg = nc.energy() + fsEc - sae
a2erg = nc2.energy() - sae
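
# A plausible continuation (an assumption; the original plotting code is not
# shown here): compare the reference scan against the two network potentials.
import matplotlib.pyplot as plt

plt.plot(x, serg, label='DFT - SAE')
plt.plot(x, aerg, label='ANI + Buckingham correction')
plt.plot(x, a2erg, label='ANI (dissociation-test network)')
plt.xlabel('scan coordinate')
plt.ylabel('energy - SAE (Ha)')
plt.legend()
plt.show()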
        print("MAX FORCE:", F.max(),S)

        if F.max() > 0.0:
            print(np.mean(F.reshape(E.size,F.shape[1]*F.shape[2]),axis=1).shape, E.size)
            plt.hist(np.max(np.abs(F).reshape(E.size,F.shape[1]*F.shape[2]),axis=1),bins=100)
            plt.show()
            plt.scatter(np.max(np.abs(F).reshape(E.size,F.shape[1]*F.shape[2]),axis=1), E)
            plt.show()
        '''
        Ru = np.random.uniform(0.0, 1.0, E.shape[0])
        nidx = np.where(Ru < 1.0)
        X = X[nidx]
        F = F[nidx]
        E = E[nidx]

        Esae = hdn.compute_sae(saef, S)
        Hidx = np.where(E - Esae < 4.5)

        X = X[Hidx]
        F = F[Hidx]
        E = E[Hidx]

        Hidx = np.where(E - Esae > -20.0)

        X = X[Hidx]
        F = F[Hidx]
        E = E[Hidx]

        Ndc += E.size
        #for i in range(E.size):
        #    X[i] = X[0]
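
# A self-contained sketch (synthetic numbers) of the energy window applied
# above: keep only conformers whose energy relative to the SAE baseline lies
# in (-20.0, 4.5).
import numpy as np

E = np.array([-76.42, -76.10, -71.50, -96.90])  # total energies
Esae = -76.40                                   # sum of atomic self-energies
keep = np.where((E - Esae > -20.0) & (E - Esae < 4.5))[0]
print(keep)  # [0 1]: conformer 2 is too high in energy, conformer 3 too low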
Example #6
# Data storage
dpack = pyt.datapacker(file_new, mode='w')

for i, data in enumerate(adl):
    #if i == 20:
    #    break
    X = data['coordinates']
    S = data['species']
    Edft = data['energies']
    path = data['path']
    del data['path']

    #Eani, Fani = anicv.compute_energy_conformations(X=np.array(X,dtype=np.float32),S=S)

    Esae = hdt.compute_sae(
        '/home/jsmith48/scratch/auto_al/modelCNOSFCl/sae_wb97x-631gd.dat', S)

    idx = np.where(np.abs(Edft - Esae) < 5.0)
    bidx = np.where(np.abs(Edft - Esae) >= 5.0)
    if bidx[0].size > 0:
        # SAE Check
        print(S)
        print(bidx, np.abs(Edft - Esae))
        #hdt.writexyzfile(file_new+'file_'+str(i).zfill(5)+'.xyz', X[bidx], S)

    #Eani_m = np.mean(Eani, axis=0)
    #Fani = np.mean(Fani, axis=0)

    #err = Eani_m - Edft
    #pae = np.abs(err)/np.sqrt(float(len(S)))
    #idx = np.where(pae > 0.15)
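
    # A plausible continuation (an assumption; the loop body is truncated in
    # this example): repack only the conformers inside the |Edft - Esae| < 5.0
    # window under their original group path, then finalize the new file.
    dpack.store_data(path,
                     coordinates=np.array(X[idx], dtype=np.float32),
                     energies=np.array(Edft[idx], dtype=np.float64),
                     species=list(S))

dpack.cleanup()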