Code Example #1
    def add_bad_data(self, cnstfile, saefile, nnfdir, gpuid, sinet, P, T=0.8, V=0.2, M=0.06):
        atest = anitester(cnstfile, saefile, nnfdir, gpuid, sinet)

        # Declare data cache
        cachet = cg('_train', self.saef, self.storecac, True)
        cachev = cg('_valid', self.saef, self.storecac, True)

        Nidx = 0
        Nbad = 0
        Nadd = 0

        for i, (X, E, S) in enumerate(zip(self.xyz, self.Eqm, self.spc)):

            if self.idx[i].size != 0:
                Nidx = Nidx + self.idx[i].size
                self.idx[i],m,diff = atest.test_for_bad(X,E,S,self.idx[i],M)

                Nbad = Nbad + self.idx[i].shape[0]

                self.idx[i], kat, Nt = self.store_random(cachet, X, E, S, self.idx[i], P, T)
                self.idx[i], kav, Nv = self.store_random(cachev, X, E, S, self.idx[i], P, V)

                self.kid[i] = np.array(np.concatenate([self.kid[i], kat]), dtype=np.int64)

                self.ts = self.ts + Nt
                self.vs = self.vs + Nv

                self.nc[i] = self.nc[i] + Nt + Nv

                Nadd = Nadd + Nt + Nv

                # Add data to the cache
                #if idxt.shape[0] != 0:
                #    cachet.insertdata(X[idxt], E[idxt], list(S))

                #if idxv.shape[0] != 0:
                #    cachev.insertdata(X[idxv], E[idxv], list(S))

        print('\n--------Data health information---------')
        print('   -Full: ', self.tf, 'Percent of full used:',"{:.2f}".format(100.0*(self.ts+self.vs)/float(self.tf))+'%')
        print('   -Used: ', self.ts,':',self.vs, ':', self.ts+self.vs)
        print('   -Added:', Nadd,' bad:',Nbad,'of',Nidx)
        print('-----------------------------------------\n')

        self.Nbad = Nbad

        # Make meta data file for caches
        cachet.makemetadata()
        cachev.makemetadata()
Code Example #2
    def init_dataset(self, P, T=0.9, V=0.1):

        # Declare data cache
        cachet = cg('_train', self.saef, self.storecac, False)
        cachev = cg('_valid', self.saef, self.storecac, False)

        for i, (X, F, E,
                S) in enumerate(zip(self.xyz, self.frc, self.Eqm, self.spc)):

            N = E.shape[0]

            Tp = int(float(T) * float(P) * float(N))
            Vp = int(float(V) * float(P) * float(N))

            # Randomize index
            np.random.shuffle(self.idx[i])

            # get indices
            iix = np.random.uniform(0.0, 1.0, self.idx[i].size)
            tr_idx = np.asarray(np.where(iix < T * P))[0]
            vd_idx = np.asarray(np.where(iix >= 1.0 - (V * P)))[0]

            idxt = self.idx[i][tr_idx].copy()
            idxv = self.idx[i][vd_idx].copy()

            self.kid[i] = np.concatenate([self.kid[i], idxt, idxv])

            self.nc[i] = self.nc[i] + idxt.shape[0] + idxv.shape[0]

            self.ts = self.ts + idxt.shape[0]
            self.vs = self.vs + idxv.shape[0]

            # Update index list
            self.idx[i] = self.idx[i][Tp + Vp + 1:]

            # Add data to the cache
            if idxt.shape[0] != 0:
                cachet.insertdata(X[idxt], F[idxt], E[idxt], list(S))

            if idxv.shape[0] != 0:
                cachev.insertdata(X[idxv], F[idxv], E[idxv], list(S))

        print('Full: ', self.tf)
        print('Used: ', self.ts, ':', self.vs, ':', self.ts + self.vs)

        # Make meta data file for caches
        cachet.makemetadata()
        cachev.makemetadata()
Code Example #3
    def init_dataset(self, P, T=0.8, V=0.2):

        # Declare data cache
        cachet = cg('_train', self.saef, self.storecac, False)
        cachev = cg('_valid', self.saef, self.storecac, False)

        for i,(X,E,S) in enumerate(zip(self.xyz,self.Eqm,self.spc)):
            N = E.shape[0]

            Tp = int(float(T)*float(P)*float(N))
            Vp = int(float(V)*float(P)*float(N))

            # Randomize index
            np.random.shuffle(self.idx[i])

            # get indices
            idxt = self.idx[i][0:Tp].copy()
            idxv = self.idx[i][Tp:Tp+Vp].copy()

            self.kid[i] = np.concatenate([self.kid[i], idxt])

            self.nc[i] = self.nc[i] + idxt.shape[0] + idxv.shape[0]

            self.ts = self.ts + idxt.shape[0]
            self.vs = self.vs + idxv.shape[0]

            # Update index list
            self.idx[i] = self.idx[i][Tp+Vp:]

            # Add data to the cache
            if idxt.shape[0] != 0:
                cachet.insertdata(X[idxt], E[idxt], list(S))

            if idxv.shape[0] != 0:
                cachev.insertdata(X[idxv], E[idxv], list(S))

        print('Full: ', self.tf)
        print('Used: ', self.ts,':',self.vs, ':', self.ts+self.vs)

        # Make meta data file for caches
        cachet.makemetadata()
        cachev.makemetadata()
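Examples #1 through #3 come from an active-learning data selector: `init_dataset` seeds the `_train` and `_valid` caches with a fraction P of each molecule's conformers, and `add_bad_data` later re-tests the remaining conformers against the current model and caches only the ones whose error exceeds M. The constructor of the owning class is not shown in these snippets, so the following is only a hedged driver sketch; the class name `DataSelector`, the `train_ensemble` step, and every file path are placeholders.

# Hypothetical driver for the methods shown above; every name marked
# "placeholder" is an assumption, not part of the original snippets.
ds = DataSelector(hdf5file='/path/to/data.h5',        # placeholder path
                  storecac='/path/to/cache/',         # placeholder path
                  saef='/path/to/sae.dat')            # placeholder path

# Seed the caches: use 10% of the data, split 80/20 between train and valid.
ds.init_dataset(P=0.1, T=0.8, V=0.2)

for cycle in range(5):
    train_ensemble()                                   # placeholder training step
    # Add only conformers the current model still predicts badly
    # (error threshold M, in the model's energy units).
    ds.add_bad_data(cnstfile='/path/to/rHCNO.params',  # placeholder path
                    saefile='/path/to/sae.dat',        # placeholder path
                    nnfdir='/path/to/networks/',       # placeholder path
                    gpuid=0, sinet=False,
                    P=0.5, T=0.8, V=0.2, M=0.06)
    if ds.Nbad == 0:                                   # stop when nothing bad is left to add
        break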
Code Example #4
    def add_bad_data(self,
                     cnstfile,
                     saefile,
                     nnfdir,
                     gpuid,
                     sinet,
                     P,
                     T=0.9,
                     V=0.1,
                     M=0.3):
        atest = anitester(cnstfile, saefile, nnfdir, gpuid, sinet)

        # Declare data cache
        cachet = cg('_train', self.saef, self.storecac, True)
        cachev = cg('_valid', self.saef, self.storecac, True)

        Nbad = 0
        Nadd = 0
        Ngwd = 0
        Ngto = 0

        Nidx = 0
        Nkid = 0
        Ngid = 0

        for i, (X, F, E,
                S) in enumerate(zip(self.xyz, self.frc, self.Eqm, self.spc)):

            if self.idx[i].size != 0:
                #print('Parent:', self.prt[i])
                # Check if any "Good" milk went sour
                tmp_idx1, self.gid[i], mt, difft = atest.test_for_bad(
                    X, E, S, self.gid[i], M)

                # Add the soured milk to the pot
                self.idx[i] = np.array(np.concatenate([tmp_idx1, self.idx[i]]),
                                       dtype=np.int32)

                # Test the pot for good and bad
                self.idx[i], god_idx, m, diff = atest.test_for_bad(
                    X, E, S, self.idx[i], M)

                # Add good to good index
                self.gid[i] = np.array(np.concatenate([self.gid[i], god_idx]),
                                       dtype=np.int32)

                # Add to size of good, good went bad, and total bad
                Ngto = Ngto + self.gid[i].size
                Ngwd = Ngwd + tmp_idx1.size
                Nbad = Nbad + self.idx[i].size

                # Store a random subset of the bad for training
                self.idx[i], kat, Nt = self.store_random(
                    cachet, X, F, E, S, self.idx[i], P, T)
                self.idx[i], kav, Nv = self.store_random(
                    cachev, X, F, E, S, self.idx[i], P, V)

                #self.idx[i], kat, Nt = self.store_diverse(cachet, atest, X, F, E, S, self.idx[i], P, T)
                #self.idx[i], kav, Nv = self.store_diverse(cachev, atest, X, F, E, S, self.idx[i], P, V)

                # Add the training data to kid
                self.kid[i] = np.array(np.concatenate([self.kid[i], kat, kav]),
                                       dtype=np.int64)

                # Count total in the pot
                Nidx = Nidx + self.idx[i].size
                Nkid = Nkid + self.kid[i].size
                Ngid = Ngid + self.gid[i].size

                # Increment training and validation size
                self.ts = self.ts + Nt
                self.vs = self.vs + Nv

                self.nc[i] = self.nc[i] + Nt + Nv

                Nadd = Nadd + Nt + Nv

        self.Nbad = Nbad

        output = '\n--------Data health information---------\n' +\
                 '   -Full: ' + str(self.tf) + ' Percent of full used: ' + "{:.2f}".format(100.0*(self.ts+self.vs)/float(self.tf)) + '%\n' +\
                 '   -Used: ' + str(self.ts) + ' : ' + str(self.vs) + ' : ' + str(self.ts+self.vs) + ' Ngwd: ' + str(Ngwd) + '\n' +\
                 '   -Skip: Ngwd: ' +  str(Ngwd) + ' of ' + str(Ngto) + '\n' +\
                 '   -Size: ' + str(Nkid) + ' : ' + str(Nidx) + ' : ' + str(Ngid) + ' : ' + str(Nkid+Nidx+Ngid) + '\n' +\
                 '   -Added: ' + str(Nadd) + ' bad: ' +str(Nbad) + ' of ' + str(Nidx) + ' ('+"{:.1f}".format(self.get_percent_bad())+'%)' + '\n' +\
                 '-----------------------------------------\n\n'

        print(output)
        self.of.write(output)
        self.of.flush()
        # Make meta data file for caches
        cachet.makemetadata()
        cachev.makemetadata()
Code Example #5
store_dir = wkdir + "cache-data-"

N = 5

for i in range(N):
    if not os.path.exists(store_dir + str(i)):
        os.mkdir(store_dir + str(i))

    if os.path.exists(store_dir + str(i) + '/testset/testset.h5'):
        os.remove(store_dir + str(i) + '/testset/testset.h5')

    if not os.path.exists(store_dir + str(i) + '/testset'):
        os.mkdir(store_dir + str(i) + '/testset')

cachet = [
    cg('_train', saef, store_dir + str(r) + '/', forcet, chargt, False)
    for r in range(N)
]
cachev = [
    cg('_valid', saef, store_dir + str(r) + '/', forcet, chargt, False)
    for r in range(N)
]
testh5 = [
    pyt.datapacker(store_dir + str(r) + '/testset/testset.h5')
    for r in range(N)
]

Nd = np.zeros(N, dtype=np.int32)
Nbf = 0
for f, fn in enumerate(h5files):
    print('Processing file(' + str(f + 1) + ' of ' + str(len(h5files)) + '):',
Code Example #6
hdf5file = '/home/jujuman/Research/ANI-DATASET/ani_data_c08e_gdb09aug.h5'
storecac = '/home/jujuman/Research/GDB-11-wB97X-6-31gd/cache09fsrc/'
saef = "/home/jujuman/Research/GDB-11-wB97X-6-31gd/sae_6-31gd.dat"
path = "/home/jujuman/Research/GDB-11-wB97X-6-31gd/cache09fsrc/testset/c09fsrc-testset.h5"
'''
hdf5file = '/home/jujuman/Research/ANI-DATASET/ani_data_c01test.h5'
storecac = '/home/jujuman/Research/GDB-11-wB97X-6-31gd/cache01_2/'
saef   = "/home/jujuman/Research/GDB-11-wB97X-6-31gd/sae_6-31gd.dat"
path = "/home/jujuman/Research/GDB-11-wB97X-6-31gd/cache01_2/testset/c01-testset.h5"
'''

# Construct the data loader class
adl = pya.anidataloader(hdf5file)

# Declare data cache
cachet = cg('_train', saef, storecac)
cachev = cg('_valid', saef, storecac)

# Declare test cache
dpack = pyt.datapacker(path)

# Load morse parameters
popt = np.load('mp_ani_params_test.npz')['param']

# Loop over data in set
for data in adl.getnextdata():
    loc = data['parent'] + "/" + data['child']
    print(loc)

    xyz = data['coordinates']
    eng = data['energies']
Code Example #7
store_dir = "/home/jujuman/Research/QM-7TEST/tester/"
saef   = "/home/jujuman/Research/QM-7TEST/tester/sae_6-31gd.dat"

data_index = np.array(range(eng.shape[0]))

#unc_file = '/home/jujuman/dataset-qm9/uncharacterized.txt'
#data_index = remove_bad_data(unc_file, data_index)
np.random.shuffle(data_index)
print(data_index.shape)

listt = data_index[:int(0.8*len(data_index))]
listv = data_index[int(0.8*len(data_index)):int(0.9*len(data_index))]
listte = data_index[int(0.9*len(data_index)):]

cachet = cg('_train', saef, store_dir)
cachev = cg('_valid', saef, store_dir)

eng = eng / hdn.hatokcal

print('max: ', eng.max(), ' min: ', eng.min())

for n,i in enumerate(listt):
    print(n)
    x = xyz[i]
    e = eng[i]
    z = spc[i]
    #z = atn[i]

    z = z[~((z == 0))]
    Na = z.shape[0]
Code Example #8
    def build_strided_training_cache(self,
                                     Nblocks,
                                     Nvalid,
                                     Ntest,
                                     build_test=True,
                                     build_valid=False,
                                     forces=True,
                                     grad=False,
                                     Fkey='forces',
                                     forces_unit=1.0,
                                     Ekey='energies',
                                     energy_unit=1.0,
                                     Eax0sum=False,
                                     rmhighe=True):
        if not os.path.isfile(self.netdict['saefile']):
            self.sae_linear_fitting(Ekey=Ekey,
                                    energy_unit=energy_unit,
                                    Eax0sum=Eax0sum)
        h5d = self.h5dir
        store_dir = self.train_root + "cache-data-"
        N = self.Nn
        Ntrain = Nblocks - Nvalid - Ntest
        if Nblocks % N != 0:
            raise ValueError(
                'Error: number of networks must evenly divide number of blocks.'
            )
        Nstride = Nblocks / N
        for i in range(N):
            if not os.path.exists(store_dir + str(i)):
                os.mkdir(store_dir + str(i))
            if build_test:
                if os.path.exists(store_dir + str(i) + '/../testset/testset' +
                                  str(i) + '.h5'):
                    os.remove(store_dir + str(i) + '/../testset/testset' +
                              str(i) + '.h5')
                if not os.path.exists(store_dir + str(i) + '/../testset'):
                    os.mkdir(store_dir + str(i) + '/../testset')
        cachet = [
            cg('_train', self.netdict['saefile'], store_dir + str(r) + '/',
               False) for r in range(N)
        ]
        cachev = [
            cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/',
               False) for r in range(N)
        ]

        if build_test:
            testh5 = [
                pyt.datapacker(store_dir + str(r) + '/../testset/testset' +
                               str(r) + '.h5') for r in range(N)
            ]

        if build_valid:
            valdh5 = [
                pyt.datapacker(store_dir + str(r) + '/../testset/valdset' +
                               str(r) + '.h5') for r in range(N)
            ]

        if rmhighe:
            dE = []
            for f in self.h5file:
                adl = pyt.anidataloader(h5d + f)
                for data in adl:
                    S = data['species']
                    E = data['energies']
                    X = data['coordinates']

                    Esae = hdt.compute_sae(self.netdict['saefile'], S)

                    dE.append((E - Esae) / np.sqrt(len(S)))

            dE = np.concatenate(dE)
            cidx = np.where(np.abs(dE) < 15.0)
            std = np.abs(dE[cidx]).std()
            men = np.mean(dE[cidx])

            print(men, std, men + std)
            idx = np.intersect1d(
                np.where(dE >= -np.abs(15 * std + men))[0],
                np.where(dE <= np.abs(11 * std + men))[0])
            cnt = idx.size
            print('DATADIST: ', dE.size, cnt, (dE.size - cnt),
                  100.0 * ((dE.size - cnt) / dE.size))

        E = []
        data_count = np.zeros((N, 3), dtype=np.int32)
        for f in self.h5file:
            print('Reading data file:', h5d + f)
            adl = pyt.anidataloader(h5d + f)
            for data in adl:
                #print(data['path'],data['energies'].size)

                S = data['species']

                if data[Ekey].size > 0 and (set(S).issubset(
                        self.netdict['atomtyp'])):

                    X = np.array(data['coordinates'],
                                 order='C',
                                 dtype=np.float32)

                    #print(np.array(data[Ekey].shape),np.sum(np.array(data[Ekey], order='C', dtype=np.float64),axis=1).shape,data[Fkey].shape)

                    if Eax0sum:
                        E = energy_unit * np.sum(np.array(
                            data[Ekey], order='C', dtype=np.float64),
                                                 axis=1)
                    else:
                        E = energy_unit * np.array(
                            data[Ekey], order='C', dtype=np.float64)

                    if forces and not grad:
                        F = forces_unit * np.array(
                            data[Fkey], order='C', dtype=np.float32)
                    elif forces and grad:
                        F = -forces_unit * np.array(
                            data[Fkey], order='C', dtype=np.float32)
                    else:
                        F = 0.0 * X

                    if rmhighe:
                        Esae = hdt.compute_sae(self.netdict['saefile'], S)

                        ind_dE = (E - Esae) / np.sqrt(len(S))

                        hidx = np.union1d(
                            np.where(ind_dE < -(15.0 * std + men))[0],
                            np.where(ind_dE > (11.0 * std + men))[0])
                        lidx = np.intersect1d(
                            np.where(ind_dE >= -(15.0 * std + men))[0],
                            np.where(ind_dE <= (11.0 * std + men))[0])

                        if hidx.size > 0:
                            print(
                                '  -(' + f + ':' + data['path'] +
                                ')High energies detected:\n    ',
                                (E[hidx] - Esae) / np.sqrt(len(S)))

                        X = X[lidx]
                        E = E[lidx]
                        F = F[lidx]

                    # Build random split index
                    ridx = np.random.randint(0, Nblocks, size=E.size)
                    Didx = [
                        np.argsort(ridx)[np.where(ridx == i)]
                        for i in range(Nblocks)
                    ]

                    # Build training cache
                    for nid, cache in enumerate(cachet):
                        set_idx = np.concatenate([
                            Didx[((bid + nid * int(Nstride)) % Nblocks)]
                            for bid in range(Ntrain)
                        ])
                        if set_idx.size != 0:
                            data_count[nid, 0] += set_idx.size
                            cache.insertdata(X[set_idx], F[set_idx],
                                             E[set_idx], list(S))

                    # for nid,cache in enumerate(cachev):
                    #     set_idx = np.concatenate([Didx[((1+bid+nid*int(Nstride)) % Nblocks)] for bid in range(Ntrain)])
                    #     if set_idx.size != 0:
                    #         data_count[nid,0]+=set_idx.size
                    #         cache.insertdata(X[set_idx], F[set_idx], E[set_idx], list(S))

                    for nid, cache in enumerate(cachev):
                        set_idx = np.concatenate([
                            Didx[(Ntrain + bid + nid * int(Nstride)) % Nblocks]
                            for bid in range(Nvalid)
                        ])
                        if set_idx.size != 0:
                            data_count[nid, 1] += set_idx.size
                            cache.insertdata(X[set_idx], F[set_idx],
                                             E[set_idx], list(S))
                            if build_valid:
                                valdh5[nid].store_data(f + data['path'],
                                                       coordinates=X[set_idx],
                                                       forces=F[set_idx],
                                                       energies=E[set_idx],
                                                       species=list(S))

                    if build_test:
                        for nid, th5 in enumerate(testh5):
                            set_idx = np.concatenate([
                                Didx[(Ntrain + Nvalid + bid +
                                      nid * int(Nstride)) % Nblocks]
                                for bid in range(Ntest)
                            ])
                            if set_idx.size != 0:
                                data_count[nid, 2] += set_idx.size
                                th5.store_data(f + data['path'],
                                               coordinates=X[set_idx],
                                               forces=F[set_idx],
                                               energies=E[set_idx],
                                               species=list(S))

        # Save train and valid meta file and cleanup testh5
        for t, v in zip(cachet, cachev):
            t.makemetadata()
            v.makemetadata()

        if build_test:
            for th in testh5:
                th.cleanup()

        if build_valid:
            for vh in valdh5:
                vh.cleanup()

        print(' Train ', ' Valid ', ' Test ')
        print(data_count)
        print('Training set built.')
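Example #8 (and the closely related Example #12 below) builds per-network training, validation, and test caches for an ensemble by striding random data blocks across the networks. A hedged usage sketch follows, assuming a `trainer` object that already exposes this method with its `Nn`, `h5dir`, `h5file`, `train_root`, and `netdict` attributes configured; its construction is not shown in these snippets.

# Hedged usage sketch; `trainer` stands in for the (unshown) object
# that owns build_strided_training_cache.
Nn = trainer.Nn                  # networks in the ensemble, e.g. 5
Nblocks = 2 * Nn                 # must be evenly divisible by Nn (checked inside)
trainer.build_strided_training_cache(Nblocks=Nblocks,
                                     Nvalid=1,        # blocks per network for validation
                                     Ntest=1,         # blocks per network for testing
                                     build_test=True,
                                     forces=True,
                                     rmhighe=True)    # drop high-energy outliers first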
Code Example #9
 def build_training_cache(self, forces=True):
     store_dir = self.train_root + "cache-data-"
     N = self.Nn
     for i in range(N):
         if not os.path.exists(store_dir + str(i)):
             os.mkdir(store_dir + str(i))
         if os.path.exists(store_dir + str(i) + '/../testset/testset' +
                           str(i) + '.h5'):
             os.remove(store_dir + str(i) + '/../testset/testset' + str(i) +
                       '.h5')
         if not os.path.exists(store_dir + str(i) + '/../testset'):
             os.mkdir(store_dir + str(i) + '/../testset')
     cachet = [
         cg('_train', self.netdict['saefile'], store_dir + str(r) + '/',
            False) for r in range(N)
     ]
     cachev = [
         cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/',
            False) for r in range(N)
     ]
     testh5 = [
         pyt.datapacker(store_dir + str(r) + '/../testset/testset' +
                        str(r) + '.h5') for r in range(N)
     ]
     Nd = np.zeros(N, dtype=np.int32)
     Nbf = 0
     for f, fn in enumerate(self.h5file):
         print(
             'Processing file(' + str(f + 1) + ' of ' +
             str(len(self.h5file)) + '):', fn)
         adl = pyt.anidataloader(self.h5dir + fn)
         To = adl.size()
         Ndc = 0
         Fmt = []
         Emt = []
         for c, data in enumerate(adl):
             Pn = data['path'] + '_' + str(f).zfill(6) + '_' + str(c).zfill(
                 6)
             # Extract the data
             X = data['coordinates']
             E = data['energies']
             S = data['species']
             # Use 0.0 forces when force data is not requested
             if forces:
                 F = data['forces']
             else:
                 F = 0.0 * X
             Fmt.append(np.max(np.linalg.norm(F, axis=2), axis=1))
             Emt.append(E)
             Mv = np.max(np.linalg.norm(F, axis=2), axis=1)
             index = np.where(Mv > 10.5)[0]
             indexk = np.where(Mv <= 10.5)[0]
             Nbf += index.size
             # Drop conformers whose maximum force norm exceeds 10.5
             X = X[indexk]
             F = F[indexk]
             E = E[indexk]
             Esae = hdt.compute_sae(self.netdict['saefile'], S)
             hidx = np.where(np.abs(E - Esae) > 10.0)
             lidx = np.where(np.abs(E - Esae) <= 10.0)
             if hidx[0].size > 0:
                 print(
                     '  -(' + str(c).zfill(3) +
                     ')High energies detected:\n    ', E[hidx])
             X = X[lidx]
             E = E[lidx]
             F = F[lidx]
             Ndc += E.size
             if (set(S).issubset(self.netdict['atomtyp'])):
                 # Random mask
                 R = np.random.uniform(0.0, 1.0, E.shape[0])
                 idx = np.array([interval(r, N) for r in R])
                 # Build random split lists
                 split = []
                 for j in range(N):
                     split.append([i for i, s in enumerate(idx) if s == j])
                     nd = len([i for i, s in enumerate(idx) if s == j])
                     Nd[j] = Nd[j] + nd
                 # Store data
                 for i, t, v, te in zip(range(N), cachet, cachev, testh5):
                     ## Store training data
                     X_t = np.array(np.concatenate(
                         [X[s] for j, s in enumerate(split) if j != i]),
                                    order='C',
                                    dtype=np.float32)
                     F_t = np.array(np.concatenate(
                         [F[s] for j, s in enumerate(split) if j != i]),
                                    order='C',
                                    dtype=np.float32)
                     E_t = np.array(np.concatenate(
                         [E[s] for j, s in enumerate(split) if j != i]),
                                    order='C',
                                    dtype=np.float64)
                     if E_t.shape[0] != 0:
                         t.insertdata(X_t, F_t, E_t, list(S))
                     ## Store Validation
                     if np.array(split[i]).size > 0:
                         X_v = np.array(X[split[i]],
                                        order='C',
                                        dtype=np.float32)
                         F_v = np.array(F[split[i]],
                                        order='C',
                                        dtype=np.float32)
                         E_v = np.array(E[split[i]],
                                        order='C',
                                        dtype=np.float64)
                         if E_v.shape[0] != 0:
                             v.insertdata(X_v, F_v, E_v, list(S))
     # Print some stats
     print('Data count:', Nd)
     print('Data split:', 100.0 * Nd / np.sum(Nd), '%')
     # Save train and valid meta file and cleanup testh5
     for t, v, th in zip(cachet, cachev, testh5):
         t.makemetadata()
         v.makemetadata()
         th.cleanup()
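Example #9 relies on a helper `interval(r, N)` that is not shown in these snippets. From the way it is used (`idx = np.array([interval(r, N) for r in R])` with `R` drawn uniformly from [0, 1)), it presumably maps a uniform draw to one of N fold indices. A minimal sketch under that assumption:

import numpy as np

def interval(r, N):
    # Assumed behavior: map a uniform draw r in [0, 1) to a bin index 0..N-1,
    # so conformers are assigned (approximately) evenly across the N folds.
    return min(int(r * N), N - 1)

# Quick check that the assignment is close to uniform over 5 folds.
R = np.random.uniform(0.0, 1.0, 10000)
idx = np.array([interval(r, 5) for r in R])
print(np.bincount(idx, minlength=5))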
Code Example #10
    #wkdir + "/h5data/ani-gdb-c08e.h5",
]

store_dir = wkdir + "/cache-c08e-"

#adl.split_load(10)
N = 10

train_idx = [[2, 3, 4, 5, 6, 7, 8, 9], [0, 1, 4, 5, 6, 7, 8, 9],
             [0, 1, 2, 3, 6, 7, 8, 9], [0, 1, 2, 3, 4, 5, 8, 9],
             [0, 1, 2, 3, 4, 5, 6, 7]]

valid_idx = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]

cachet = [
    cg('_train', saef, store_dir + str(r) + '/', False) for r in range(5)
]
cachev = [
    cg('_valid', saef, store_dir + str(r) + '/', False) for r in range(5)
]

for fn in h5files:
    adl = pyt.anidataloader(fn)

    for c, data in enumerate(adl):
        # Print file
        print('Processing file: ', c)

        # Extract the data
        xyz = data['coordinates']
        erg = data['energies']
Code Example #11
store_dir = wkdir + "cache-data-"

N = 5

for i in range(N):
    if not os.path.exists(store_dir + str(i)):
        os.mkdir(store_dir + str(i))

    if os.path.exists(store_dir + str(i) + '/../testset/testset'+str(i)+'.h5'):
        os.remove(store_dir + str(i) + '/../testset/testset'+str(i)+'.h5')

    if not os.path.exists(store_dir + str(i) + '/../testset'):
        os.mkdir(store_dir + str(i) + '/../testset')

cachet = [cg('_train', saef, store_dir + str(r) + '/',False) for r in range(N)]
cachev = [cg('_valid', saef, store_dir + str(r) + '/',False) for r in range(N)]
testh5 = [pyt.datapacker(store_dir + str(r) + '/../testset/testset'+str(r)+'.h5') for r in range(N)]

Nd = np.zeros(N,dtype=np.int32)
Nbf = 0
for f,fn in enumerate(h5files):
    print('Processing file('+ str(f+1) +' of '+ str(len(h5files)) +'):', fn)
    adl = pyt.anidataloader(fn)

    To = adl.size()
    Ndc = 0
    Fmt = []
    Emt = []
    for c, data in enumerate(adl):
        #if c == 2 or c == 2 or c == 2:
Code Example #12
    def build_strided_training_cache(self,
                                     Nblocks,
                                     Nvalid,
                                     Ntest,
                                     build_test=True,
                                     forces=True,
                                     grad=False,
                                     Fkey='forces',
                                     forces_unit=1.0,
                                     Ekey='energies',
                                     energy_unit=1.0,
                                     Eax0sum=False):
        if not os.path.isfile(self.netdict['saefile']):
            self.sae_linear_fitting(Ekey=Ekey,
                                    energy_unit=energy_unit,
                                    Eax0sum=Eax0sum)

        h5d = self.h5dir

        store_dir = self.train_root + "cache-data-"
        N = self.Nn
        Ntrain = Nblocks - Nvalid - Ntest

        if Nblocks % N != 0:
            raise ValueError(
                'Error: number of networks must evenly divide number of blocks.'
            )

        Nstride = Nblocks / N

        for i in range(N):
            if not os.path.exists(store_dir + str(i)):
                os.mkdir(store_dir + str(i))

            if build_test:
                if os.path.exists(store_dir + str(i) + '/../testset/testset' +
                                  str(i) + '.h5'):
                    os.remove(store_dir + str(i) + '/../testset/testset' +
                              str(i) + '.h5')

                if not os.path.exists(store_dir + str(i) + '/../testset'):
                    os.mkdir(store_dir + str(i) + '/../testset')

        cachet = [
            cg('_train', self.netdict['saefile'], store_dir + str(r) + '/',
               False) for r in range(N)
        ]
        cachev = [
            cg('_valid', self.netdict['saefile'], store_dir + str(r) + '/',
               False) for r in range(N)
        ]

        if build_test:
            testh5 = [
                pyt.datapacker(store_dir + str(r) + '/../testset/testset' +
                               str(r) + '.h5') for r in range(N)
            ]

        E = []
        data_count = np.zeros((N, 3), dtype=np.int32)
        for f in self.h5file:
            adl = pyt.anidataloader(h5d + f)
            for data in adl:
                #print(data['path'],data['energies'].size)

                S = data['species']

                if data[Ekey].size > 0 and (set(S).issubset(
                        self.netdict['atomtyp'])):

                    X = np.array(data['coordinates'],
                                 order='C',
                                 dtype=np.float32)

                    if Eax0sum:
                        E = energy_unit * np.sum(np.array(
                            data[Ekey], order='C', dtype=np.float64),
                                                 axis=1)
                    else:
                        E = energy_unit * np.array(
                            data[Ekey], order='C', dtype=np.float64)

                    if forces and not grad:
                        F = forces_unit * np.array(
                            data[Fkey], order='C', dtype=np.float32)
                    elif forces and grad:
                        F = -forces_unit * np.array(
                            data[Fkey], order='C', dtype=np.float32)
                    else:
                        F = 0.0 * X

                    # Build random split index
                    ridx = np.random.randint(0, Nblocks, size=E.size)
                    Didx = [
                        np.argsort(ridx)[np.where(ridx == i)]
                        for i in range(Nblocks)
                    ]

                    # Build training cache
                    for nid, cache in enumerate(cachet):
                        set_idx = np.concatenate([
                            Didx[((bid + nid * int(Nstride)) % Nblocks)]
                            for bid in range(Ntrain)
                        ])
                        if set_idx.size != 0:
                            data_count[nid, 0] += set_idx.size
                            cache.insertdata(X[set_idx], F[set_idx],
                                             E[set_idx], list(S))

                    for nid, cache in enumerate(cachev):
                        set_idx = np.concatenate([
                            Didx[(Ntrain + bid + nid * int(Nstride)) % Nblocks]
                            for bid in range(Nvalid)
                        ])
                        if set_idx.size != 0:
                            data_count[nid, 1] += set_idx.size
                            cache.insertdata(X[set_idx], F[set_idx],
                                             E[set_idx], list(S))

                    if build_test:
                        for nid, th5 in enumerate(testh5):
                            set_idx = np.concatenate([
                                Didx[(Ntrain + Nvalid + bid +
                                      nid * int(Nstride)) % Nblocks]
                                for bid in range(Ntest)
                            ])
                            if set_idx.size != 0:
                                data_count[nid, 2] += set_idx.size
                                th5.store_data(f + data['path'],
                                               coordinates=X[set_idx],
                                               forces=F[set_idx],
                                               energies=E[set_idx],
                                               species=list(S))

        # Save train and valid meta file and cleanup testh5
        for t, v in zip(cachet, cachev):
            t.makemetadata()
            v.makemetadata()

        if build_test:
            for th in testh5:
                th.cleanup()

        print(' Train ', ' Valid ', ' Test ')
        print(data_count)
        print('Training set built.')
Code Example #13
adl = pyt.anidataloader(h5file)

adl.split_load(10)

train_idx = [[2, 3, 4, 5, 6, 7, 8, 9], [0, 1, 4, 5, 6, 7, 8, 9],
             [0, 1, 2, 3, 6, 7, 8, 9], [0, 1, 2, 3, 4, 5, 8, 9],
             [0, 1, 2, 3, 4, 5, 6, 7]]

valid_idx = [[0], [2], [4], [6], [8]]

r = 0
for t, v in zip(train_idx, valid_idx):
    print("Working on index: ", r)

    cachet = cg('_train', saef, store_dir + str(r) + '/')
    cachev = cg('_valid', saef, store_dir + str(r) + '/')
    for i in range(0, adl.size()):
        print("Working on : ", i, ' from set ', r)

        t_data = adl.getdata(i, t)
        v_data = adl.getdata(i, v)

        #cn = 0
        #for x,y in zip(v_data[0],v_data[1]):
        #    print('Element ',cn,': ',x,'\n',y)

        #print(t_data[0].shape, ' : ', t_data[1].shape, ' : ', t_data[2].shape)
        #print(v_data[0].shape, ' : ', v_data[1].shape, ' : ', v_data[2].shape)

        if t_data[0].shape[0] != t_data[1].shape[0]: