def get_data(self):
    """Build nearest-neighbour features and targets for both datasets.

    Relaxed candidates are read from the GA database under ``wkdir``, split
    into a test set and a training set, and fingerprinted with
    ``FeatureGenerator.nearestneighbour_vec``. The ``raw_score`` stored in
    each structure's ``key_value_pairs`` is used as the target.

    Returns a tuple ``(train_features, train_targets, train_atoms,
    test_features, test_targets, test_atoms)``.
    """
    # Open the database written by the GA search and load every relaxed
    # candidate from it.
    db_path = '{}/data/gadb.db'.format(wkdir)
    gadb = DataConnection(db_path)
    print('Getting candidates from the database')
    candidates = gadb.get_all_relaxed_candidates(use_extinct=False)

    # The test set is drawn first; its 'taken' entry is handed on to
    # get_train when the training set is selected.
    testset = get_unique(atoms=candidates, size=test_size, key='raw_score')
    trainset = get_train(
        atoms=candidates,
        size=train_size,
        taken=testset['taken'],
        key='raw_score')
    train_atoms = trainset['atoms']
    test_atoms = testset['atoms']

    # Remove the previously saved 'nnmat' entry from each training structure.
    for atoms in train_atoms:
        del atoms.info['data']['nnmat']

    # Fingerprint both sets with the same generator.
    print('Getting the fingerprints')
    generator = FeatureGenerator()
    train_features = generator.return_vec(
        train_atoms, [generator.nearestneighbour_vec])
    test_features = generator.return_vec(
        test_atoms, [generator.nearestneighbour_vec])

    # Targets are the raw scores attached to each structure.
    train_targets = [
        atoms.info['key_value_pairs']['raw_score'] for atoms in train_atoms
    ]
    test_targets = [
        atoms.info['key_value_pairs']['raw_score'] for atoms in test_atoms
    ]

    return (train_features, train_targets, train_atoms,
            test_features, test_targets, test_atoms)
# In[2]:

# Open the ASE atoms database and load every relaxed candidate stored in it.
gadb = DataConnection('../../data/gadb.db')
all_cand = gadb.get_all_relaxed_candidates(use_extinct=False)

# The candidates are then divided into training data and a holdout test set.

# In[3]:

testset = get_unique(key='raw_score', size=100, atoms=all_cand)
trainset = get_train(
    key='raw_score',
    size=500,
    taken=testset['taken'],
    atoms=all_cand)

trainval, testval = trainset['target'], testset['target']

# With the data divided up, feature sets are generated next. The
# eigenspectrum features are generated first; single transform engineering
# functions are then used to expand the feature space slightly.

# In[4]:

generator = FeatureGenerator(nprocs=1, atom_types=[78, 79])
train_data = generator.return_vec(
    trainset['atoms'], [generator.eigenspectrum_vec])
test_data = generator.return_vec(
    testset['atoms'], [generator.eigenspectrum_vec])
def test_generators(self):
    """Check the output shape of each fingerprint generator.

    Relaxed candidates are loaded from the GA database under ``wkdir`` and
    split into a test set and a training set. Each generator is then run
    and the shape of its output is asserted; for some generators the number
    of names reported by ``return_names`` is checked as well.

    On success the candidates and the concatenated training features are
    stored on the test class as ``all_cand`` and ``data``.
    """
    def check_fp(generator, images, vec_fns, shape, label, names=False):
        """Generate fingerprints, assert their shape and return them.

        When ``names`` is true, the number of feature names returned by
        ``generator.return_names`` must equal the number of columns.
        """
        fp = generator.return_vec(images, vec_fns)
        # assertEqual reports the offending shape when the check fails,
        # unlike assertTrue on a combined boolean expression.
        self.assertEqual(np.shape(fp), shape)
        if names:
            self.assertEqual(
                len(generator.return_names(vec_fns)), shape[1])
        print('passed {}'.format(label))
        return fp

    # Test generic features for Pt then both Pt and Au.
    get_mendeleev_params(atomic_number=78)
    get_mendeleev_params(atomic_number=[78, 79],
                         params=default_params + ['en_ghosh'])

    # Connect database generated by a GA search.
    gadb = DataConnection('{}/data/gadb.db'.format(wkdir))

    # Get all relaxed candidates from the db file.
    print('Getting candidates from the database')
    all_cand = gadb.get_all_relaxed_candidates(use_extinct=False)

    # Setup the test and training datasets.
    testset = get_unique(atoms=all_cand, size=test_size, key='raw_score')
    self.assertEqual(len(testset['atoms']), test_size)
    self.assertEqual(len(testset['taken']), test_size)

    trainset = get_train(atoms=all_cand, size=train_size,
                         taken=testset['taken'], key='raw_score')
    self.assertEqual(len(trainset['atoms']), train_size)
    self.assertEqual(len(trainset['target']), train_size)

    train_atoms = trainset['atoms']
    test_atoms = testset['atoms']

    # Initiate the fingerprint generators with relevant input variables.
    print('Getting the fingerprints')
    f = FeatureGenerator(element_parameters='atomic_radius', nprocs=1)
    f.normalize_features(train_atoms, test_atoms)

    # The first fingerprint seeds the accumulated training feature matrix;
    # later training fingerprints are appended column-wise.
    data = check_fp(f, train_atoms, [f.nearestneighbour_vec],
                    (train_size, 4), 'nearestneighbour_vec', names=True)

    train_fp = check_fp(f, train_atoms, [f.bond_count_vec],
                        (train_size, 52), 'bond_count_vec')
    data = np.concatenate((data, train_fp), axis=1)

    train_fp = check_fp(f, train_atoms, [f.distribution_vec],
                        (train_size, 10), 'distribution_vec')
    data = np.concatenate((data, train_fp), axis=1)

    # EXPENSIVE to calculate. Run on the test set only and not included in
    # the training data.
    check_fp(f, test_atoms, [f.connections_vec],
             (test_size, 26), 'connections_vec')

    train_fp = check_fp(f, train_atoms, [f.rdf_vec],
                        (train_size, 20), 'rdf_vec')
    data = np.concatenate((data, train_fp), axis=1)

    # Start testing the standard fingerprint vector generators.
    train_fp = check_fp(f, train_atoms, [f.element_mass_vec],
                        (train_size, 1), 'element_mass_vec', names=True)
    data = np.concatenate((data, train_fp), axis=1)

    train_fp = check_fp(f, train_atoms, [f.element_parameter_vec],
                        (train_size, 4), 'element_parameter_vec',
                        names=True)
    data = np.concatenate((data, train_fp), axis=1)

    train_fp = check_fp(f, train_atoms, [f.composition_vec],
                        (train_size, 2), 'composition_vec', names=True)
    data = np.concatenate((data, train_fp), axis=1)

    train_fp = check_fp(f, train_atoms, [f.eigenspectrum_vec],
                        (train_size, 147), 'eigenspectrum_vec', names=True)
    data = np.concatenate((data, train_fp), axis=1)

    train_fp = check_fp(f, train_atoms, [f.distance_vec],
                        (train_size, 2), 'distance_vec', names=True)
    data = np.concatenate((data, train_fp), axis=1)

    # Several generators in one call; the result is not added to `data`.
    check_fp(f, train_atoms,
             [f.eigenspectrum_vec, f.element_mass_vec, f.composition_vec],
             (train_size, 150), 'combined generation', names=True)

    # The neighbor vectors are expected to have one column per atom of the
    # first training structure. Feature names are not checked for them.
    n_atoms = len(train_atoms[0])
    check_fp(f, train_atoms, [f.neighbor_sum_vec],
             (train_size, n_atoms), 'neighbor_sum_vec')
    check_fp(f, train_atoms, [f.neighbor_mean_vec],
             (train_size, n_atoms), 'neighbor_mean_vec')

    # Repeat the neighbor checks with max_neighbors='full'.
    f = FeatureGenerator(element_parameters='atomic_radius',
                         max_neighbors='full', nprocs=1)
    f.normalize_features(train_atoms, test_atoms)
    check_fp(f, train_atoms, [f.neighbor_sum_vec],
             (train_size, n_atoms), 'neighbor_sum_vec all neighbors')
    check_fp(f, train_atoms, [f.neighbor_mean_vec],
             (train_size, n_atoms), 'neighbor_mean_vec all neighbors')

    # Do basic check for atomic properties.
    no_prop = []
    an_prop = []
    # EXPENSIVE to calculate. Not included in training data.
    for atoms in test_atoms:
        no_prop.append(neighbor_features(atoms=atoms))
        an_prop.append(
            neighbor_features(atoms=atoms, property=['atomic_number']))
    self.assertEqual(np.shape(no_prop), (test_size, 15))
    self.assertEqual(np.shape(an_prop), (test_size, 30))
    print('passed graph_vec')

    # Keep the candidates and features on the class for later use.
    self.__class__.all_cand = all_cand
    self.__class__.data = data