Exemple #1
0
def get_catlearn_fp(A, Name=False):
    """Return CatLearn fingerprint vectors, or their labels.

    Parameters
    ----------
    A : atoms object or list
        Candidate(s) to featurize. Only consulted for featurization;
        when Name is True the labels are returned instead.
    Name : bool
        If True, return the feature names rather than the vectors.
    """
    if not Name:
        # autogen_info expects a list of images and returns the
        # annotated versions.
        A = autogen_info([A])
    gen = FeatureGenerator()
    # _get_atom_types always takes a list of atoms objects.
    candidates = A if isinstance(A, list) else [A]
    gen._get_atom_types(candidates)

    train_fpv = [gen.mean_chemisorbed_atoms,
                 gen.bulk,
                 gen.term,
                 gen.strain,
                 gen.mean_surf_ligands,
                 gen.mean_site,
                 gen.median_site,
                 gen.max_site,
                 gen.min_site,
                 gen.sum_site,
                 # Additional fingerprinters, currently disabled:
                 # gen.generalized_cn,
                 # gen.bag_cn,
                 # gen.en_difference_ads,
                 # gen.en_difference_chemi,
                 # gen.en_difference_active,
                 # gen.bag_atoms_ads,
                 # gen.bag_connections_ads,
                 gen.count_chemisorbed_fragment]

    if Name:
        return gen.return_names(train_fpv)
    gen.normalize_features(A)
    return gen.return_vec(A, train_fpv)
 def test_adsorption(self):
     """Featurize adsorption images and check labels match columns."""
     images = database_to_list('data/bajdichWO32018_ads.db')
     images = images_connectivity(images)
     slabs = database_to_list('data/bajdichWO32018_slabs.db')
     # Map slab id -> slab atoms object.
     slabs_dict = {slab.info['id']: slab for slab in slabs}
     for index, atoms in enumerate(images):
         species = atoms.info['key_value_pairs']['species']
         atoms.subsets['ads_atoms'] = slab_positions2ads_index(
             atoms, slabs[index], species)
         if 'slab_atoms' not in atoms.subsets:
             atoms.subsets['slab_atoms'] = slab_index(atoms)
         required = ('chemisorbed_atoms', 'site_atoms', 'ligand_atoms')
         if any(key not in atoms.subsets for key in required):
             chemi, site, ligand = info2primary_index(atoms)
             atoms.subsets['chemisorbed_atoms'] = chemi
             atoms.subsets['site_atoms'] = site
             atoms.subsets['ligand_atoms'] = ligand
         attach_cations(atoms, anion_number=8)
         atoms.set_initial_charges(formal_charges(atoms))
     gen = FeatureGenerator(nprocs=1)
     train_fpv = default_fingerprinters(gen, 'chalcogenides')
     matrix = gen.return_vec(images, train_fpv)
     labels = gen.return_names(train_fpv)
     if __name__ == '__main__':
         for i, l in enumerate(labels):
             print(i, l)
         print(atoms.get_initial_charges())
     self.assertTrue(len(labels) == np.shape(matrix)[1])
Exemple #3
0
    def test_ase_api(self):
        """Test the ase api."""
        gadb = DataConnection('{}/data/gadb.db'.format(wkdir))
        all_cand = gadb.get_all_relaxed_candidates()

        # Remember the formula so we can verify it is untouched later.
        formula = all_cand[0].get_chemical_formula()

        extend_atoms_class(all_cand[0])
        self.assertTrue(isinstance(all_cand[0], type(all_cand[1])))

        generator = FeatureGenerator()
        features = generator.composition_vec(all_cand[0])
        all_cand[0].set_features(features)

        self.assertTrue(np.allclose(all_cand[0].get_features(), features))
        self.assertTrue(all_cand[0].get_chemical_formula() == formula)

        # A freshly extended candidate has no features attached.
        extend_atoms_class(all_cand[1])
        self.assertTrue(all_cand[1].get_features() is None)

        graph = ase_to_networkx(all_cand[2])
        all_cand[2].set_graph(graph)

        self.assertTrue(all_cand[2].get_graph() == graph)
        self.assertTrue(all_cand[1].get_graph() is None)
    def test_generator(self):
        """Test the feature generation."""
        atoms = molecule('HCOOH')
        atoms.center(vacuum=5)
        # Attach a connectivity matrix derived from covalent radii.
        atomic_radii = [covalent_radii[number] for number in atoms.numbers]
        atoms.connectivity = ase_connectivity(atoms, atomic_radii)
        gen = FeatureGenerator()
        features = gen.return_vec([atoms], [gen.get_autocorrelation])

        # Compare against the pre-computed reference values.
        np.testing.assert_allclose(features, truth)
    def test_bulk_fp_gen(self):
        """Test the bulk feature generation.

        Asserts the number of feature labels equals the number of
        columns in the generated feature matrix.
        """
        images = self.setup_metal()
        images = images_connectivity(images)

        gen = FeatureGenerator()
        train_fpv = default_fingerprinters(gen, 'bulk') + [gen.bag_edges]
        matrix = gen.return_vec(images, train_fpv)
        labels = gen.return_names(train_fpv)
        # BUG FIX: was `print(np.shape(matrix), print(type(matrix)))` -
        # the nested print() returns None, so the outer call printed
        # `None` instead of the matrix type. Matches sibling tests now.
        print(np.shape(matrix), type(matrix))
        self.assertTrue(len(labels) == np.shape(matrix)[1])
Exemple #6
0
 def test_tags(self):
     """Test the feature generation."""
     images = autogen_info(self.setup_metals())
     print(str(len(images)) + ' training examples.')
     gen = FeatureGenerator(nprocs=1)
     # Default adsorbate fingerprinters plus a few extras.
     fingerprinters = default_fingerprinters(gen, 'adsorbates') + [
         gen.formal_charges, gen.bag_edges_ads, gen.ads_av, gen.ads_sum]
     matrix = gen.return_vec(images, fingerprinters)
     labels = gen.return_names(fingerprinters)
     print(np.shape(matrix), type(matrix))
     if __name__ == '__main__':
         for index, label in enumerate(labels):
             print(index, label)
     self.assertTrue(len(labels) == np.shape(matrix)[1])
Exemple #7
0
 def test_constrained_ads(self):
     """Test the feature generation with constrained slab atoms.

     Atoms in the lower half of the cell are fixed, exercising the
     constraint-aware path of the adsorbate fingerprinters.
     """
     images = self.setup_metals()
     # IDIOM FIX: was `[atoms.set_tags(...) for atoms in images]` - a
     # list comprehension used purely for side effects; a plain loop
     # is the idiomatic form and avoids building a throwaway list.
     for atoms in images:
         atoms.set_tags(np.zeros(len(atoms)))
     for atoms in images:
         # Fix every atom below half the cell height (small tolerance).
         c_atoms = [a.index for a in atoms if
                    a.z < atoms.cell[2, 2] / 2. + 0.1]
         atoms.set_constraint(FixAtoms(c_atoms))
     images = autogen_info(images)
     print(str(len(images)) + ' training examples.')
     gen = FeatureGenerator(nprocs=1)
     train_fpv = default_fingerprinters(gen, 'adsorbates')
     matrix = gen.return_vec(images, train_fpv)
     labels = gen.return_names(train_fpv)
     print(np.shape(matrix), type(matrix))
     if __name__ == '__main__':
         for i, l in enumerate(labels):
             print(i, l)
     self.assertTrue(len(labels) == np.shape(matrix)[1])
Exemple #8
0
 def test_db_ads(self):
     """Test the feature generation for images loaded from an ase-db."""
     images = database_to_list('data/ads_example.db')
     # IDIOM FIX: was `[atoms.set_tags(...) for atoms in images]` - a
     # list comprehension used purely for side effects; a plain loop
     # is the idiomatic form and avoids building a throwaway list.
     for atoms in images:
         atoms.set_tags(np.zeros(len(atoms)))
     images = autogen_info(images)
     layers2ads_index(images[0],
                      images[0].info['key_value_pairs']['species'])
     print(str(len(images)) + ' training examples.')
     gen = FeatureGenerator(nprocs=1)
     train_fpv = default_fingerprinters(gen, 'adsorbates')
     # Test db specific functions.
     train_fpv += [gen.db_size, gen.ctime, gen.dbid, gen.delta_energy]
     # Old CatApp AxBy fingerprints.
     train_fpv += [gen.catapp_AB]
     matrix = gen.return_vec(images, train_fpv)
     labels = gen.return_names(train_fpv)
     print(np.shape(matrix), type(matrix))
     if __name__ == '__main__':
         for i, l in enumerate(labels):
             print(i, l)
     self.assertTrue(len(labels) == np.shape(matrix)[1])
Exemple #9
0
    def get_data(self):
        """Generate features from atoms objects.

        Returns the 6-tuple (train_features, train_targets, train_atoms,
        test_features, test_targets, test_atoms).
        """
        # Connect database generated by a GA search.
        gadb = DataConnection('{}/data/gadb.db'.format(wkdir))

        # Get all relaxed candidates from the db file.
        print('Getting candidates from the database')
        all_cand = gadb.get_all_relaxed_candidates(use_extinct=False)

        # Setup the test and training datasets.
        testset = get_unique(atoms=all_cand, size=test_size, key='raw_score')
        trainset = get_train(atoms=all_cand, size=train_size,
                             taken=testset['taken'], key='raw_score')

        # Clear out some old saved data.
        for candidate in trainset['atoms']:
            del candidate.info['data']['nnmat']

        # Initiate the fingerprint generators with relevant input variables.
        print('Getting the fingerprints')
        generator = FeatureGenerator()

        train_features = generator.return_vec(
            trainset['atoms'], [generator.nearestneighbour_vec])
        test_features = generator.return_vec(
            testset['atoms'], [generator.nearestneighbour_vec])

        # Targets are the stored raw scores for each candidate.
        train_targets = [a.info['key_value_pairs']['raw_score']
                         for a in trainset['atoms']]
        test_targets = [a.info['key_value_pairs']['raw_score']
                        for a in testset['atoms']]

        return (train_features, train_targets, trainset['atoms'],
                test_features, test_targets, testset['atoms'])
Exemple #10
0
 def test_connectivity(self):
     """Featurize atomic pairs for a small two-metal test set."""
     pair_images = images_pair_distances(self.setup_metals(n=2))
     generator = FeatureGenerator(nprocs=1)
     generator.featurize_atomic_pairs(pair_images)
Exemple #11
0
# Split the candidates into a 100-structure test set and a
# 500-structure training set; 'taken' prevents overlap between them.
# NOTE(review): `all_cand`, `get_unique` and `get_train` are defined
# earlier in this script (not visible in this chunk).
testset = get_unique(atoms=all_cand, size=100, key='raw_score')

trainset = get_train(atoms=all_cand,
                     size=500,
                     taken=testset['taken'],
                     key='raw_score')

# Target values (raw scores) for each split.
trainval = trainset['target']
testval = testset['target']

# Once the data is divided up, we then generate some feature sets. The eigenspectrum features are generated and then single transform engineering functions are used to expand the space slightly.

# In[4]:

# Restrict atom types to Pt (78) and Au (79) for the featurizer.
generator = FeatureGenerator(atom_types=[78, 79], nprocs=1)
train_data = generator.return_vec(trainset['atoms'],
                                  [generator.eigenspectrum_vec])
test_data = generator.return_vec(testset['atoms'],
                                 [generator.eigenspectrum_vec])

# Expand the feature space with single-transform engineering.
train_data = single_transform(train_data)
test_data = single_transform(test_data)

# ## Baseline
#
# Initially, a GP is trained on all the features and the error calculated as a baseline.

# In[5]:

kdict = [{
Exemple #12
0
    def test_generators(self):
        """Generate features from atoms objects.

        Exercises each fingerprint vector generator in turn, asserting
        the expected number of rows (train_size) and the documented
        number of feature columns for each generator.
        """
        # Test generic features for Pt then both Pt and Au.
        get_mendeleev_params(atomic_number=78)
        get_mendeleev_params(atomic_number=[78, 79],
                             params=default_params + ['en_ghosh'])

        # Connect database generated by a GA search.
        gadb = DataConnection('{}/data/gadb.db'.format(wkdir))

        # Get all relaxed candidates from the db file.
        print('Getting candidates from the database')
        all_cand = gadb.get_all_relaxed_candidates(use_extinct=False)

        # Setup the test and training datasets.
        testset = get_unique(atoms=all_cand, size=test_size, key='raw_score')
        self.assertTrue(len(testset['atoms']) == test_size)
        self.assertTrue(len(testset['taken']) == test_size)

        trainset = get_train(atoms=all_cand,
                             size=train_size,
                             taken=testset['taken'],
                             key='raw_score')
        self.assertTrue(len(trainset['atoms']) == train_size)
        self.assertTrue(len(trainset['target']) == train_size)

        # Initiate the fingerprint generators with relevant input variables.
        print('Getting the fingerprints')
        f = FeatureGenerator(element_parameters='atomic_radius', nprocs=1)
        f.normalize_features(trainset['atoms'], testset['atoms'])

        data = f.return_vec(trainset['atoms'], [f.nearestneighbour_vec])
        n, d = np.shape(data)
        self.assertTrue(n == train_size and d == 4)
        self.assertTrue(len(f.return_names([f.nearestneighbour_vec])) == d)
        print('passed nearestneighbour_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.bond_count_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        self.assertTrue(n == train_size and d == 52)
        print('passed bond_count_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.distribution_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        self.assertTrue(n == train_size and d == 10)
        print('passed distribution_vec')

        # EXPENSIVE to calculate. Not included in training data.
        train_fp = f.return_vec(testset['atoms'], [f.connections_vec])
        n, d = np.shape(train_fp)
        self.assertTrue(n == test_size and d == 26)
        print('passed connections_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.rdf_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        self.assertTrue(n == train_size and d == 20)
        print('passed rdf_vec')

        # Start testing the standard fingerprint vector generators.
        train_fp = f.return_vec(trainset['atoms'], [f.element_mass_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        self.assertTrue(n == train_size and d == 1)
        self.assertTrue(len(f.return_names([f.element_mass_vec])) == d)
        print('passed element_mass_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.element_parameter_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        # print(f.return_names([f.element_parameter_vec]))
        self.assertTrue(n == train_size and d == 4)
        self.assertTrue(len(f.return_names([f.element_parameter_vec])) == d)
        print('passed element_parameter_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.composition_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        self.assertTrue(n == train_size and d == 2)
        self.assertTrue(len(f.return_names([f.composition_vec])) == d)
        print('passed composition_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.eigenspectrum_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        self.assertTrue(n == train_size and d == 147)
        self.assertTrue(len(f.return_names([f.eigenspectrum_vec])) == d)
        print('passed eigenspectrum_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.distance_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        self.assertTrue(n == train_size and d == 2)
        self.assertTrue(len(f.return_names([f.distance_vec])) == d)
        print('passed distance_vec')

        # Several generators combined should concatenate their columns.
        train_fp = f.return_vec(
            trainset['atoms'],
            [f.eigenspectrum_vec, f.element_mass_vec, f.composition_vec])
        n, d = np.shape(train_fp)
        self.assertTrue(n == train_size and d == 150)
        self.assertTrue(
            len(
                f.return_names([
                    f.eigenspectrum_vec, f.element_mass_vec, f.composition_vec
                ])) == d)
        print('passed combined generation')

        # Neighbor features yield one column per atom in the structure.
        train_fp = f.return_vec(trainset['atoms'], [f.neighbor_sum_vec])
        n, d = np.shape(train_fp)
        self.assertTrue(n == train_size and d == len(trainset['atoms'][0]))
        # self.assertTrue(len(f.return_names([f.distance_vec])) == d)
        print('passed neighbor_sum_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.neighbor_mean_vec])
        n, d = np.shape(train_fp)
        self.assertTrue(n == train_size and d == len(trainset['atoms'][0]))
        # self.assertTrue(len(f.return_names([f.distance_vec])) == d)
        print('passed neighbor_mean_vec')

        # Repeat the neighbor features with all neighbors included.
        f = FeatureGenerator(element_parameters='atomic_radius',
                             max_neighbors='full',
                             nprocs=1)
        f.normalize_features(trainset['atoms'], testset['atoms'])

        train_fp = f.return_vec(trainset['atoms'], [f.neighbor_sum_vec])
        n, d = np.shape(train_fp)
        self.assertTrue(n == train_size and d == len(trainset['atoms'][0]))
        print('passed neighbor_sum_vec all neighbors')

        train_fp = f.return_vec(trainset['atoms'], [f.neighbor_mean_vec])
        n, d = np.shape(train_fp)
        self.assertTrue(n == train_size and d == len(trainset['atoms'][0]))
        print('passed neighbor_mean_vec all neighbors')

        # Do basic check for atomic porperties.
        no_prop = []
        an_prop = []
        # EXPENSIVE to calculate. Not included in training data.
        for atoms in testset['atoms']:
            no_prop.append(neighbor_features(atoms=atoms))
            an_prop.append(
                neighbor_features(atoms=atoms, property=['atomic_number']))
        self.assertTrue(np.shape(no_prop) == (test_size, 15))
        self.assertTrue(np.shape(an_prop) == (test_size, 30))
        print('passed graph_vec')

        # Stash results on the class for use by later tests.
        self.__class__.all_cand = all_cand
        self.__class__.data = data
Exemple #13
0

# Then we can load some data. There is some pre-generated data in an ase-db so first, the atoms objects are loaded into a list. Then they are fed through a feature generator.

# In[2]:


# Connect ase atoms database.
gadb = DataConnection('../../data/gadb.db')

# Get all relaxed candidates from the db file.
all_cand = gadb.get_all_relaxed_candidates(use_extinct=False)
print('Loaded {} atoms objects'.format(len(all_cand)))

# Generate the feature matrix.
fgen = FeatureGenerator()
features = fgen.return_vec(all_cand, [fgen.eigenspectrum_vec])
print('Generated {} feature matrix'.format(np.shape(features)))

# Get the target values (stored raw scores) for every candidate.
targets = [a.info['key_value_pairs']['raw_score'] for a in all_cand]
print('Generated {} target vector'.format(np.shape(targets)))


# It is important to note that the `all_cand` variable is simply a list of atoms objects. There are no constraints on how this should be set up, the above example is just a succinct method for generating the list.
#
# ## Prediction Setup <a name="prediction-setup"></a>
# [(Back to top)](#head)
#
Exemple #14
0
# Divide atoms and targets into training and test partitions.
# NOTE(review): `alist`, `targets` and `train_size` are defined earlier
# in this script (not visible in this chunk).
train_atoms = alist[:train_size]
test_atoms = alist[train_size:]
train_targets = np.asarray(targets[:train_size])
test_targets = np.asarray(targets[train_size:])

print('{} shape training atoms data'.format(np.shape(train_atoms)))
print('{} shape testing atoms data'.format(np.shape(test_atoms)))

# ## Feature Generation <a name="feature-generation"></a>
# [(Back to top)](#head)
#
# It can be necessary to work out the full range of elements that need to be accounted for in the model. The feature generator tries to work out the range of elements to account for based on the maximum composition. However, explicitly specifying this is more robust.

# In[4]:

generator = FeatureGenerator()

# normalize_features inspects both splits so the generator accounts
# for every element/atom count present in the data.
generator.normalize_features(train_candidates=train_atoms,
                             test_candidates=test_atoms)
print('Max number of atom present in data: {}'.format(generator.atom_len))
print('Atom numbers present in data: {}'.format(generator.atom_types))

# We then generate the feature array for all the atoms objects. The `return_vec()` function takes the list of atoms objects and the type of features to generate.

# In[5]:

train_features = generator.return_vec(
    train_atoms, [generator.eigenspectrum_vec, generator.composition_vec])

test_features = generator.return_vec(
    test_atoms, [generator.eigenspectrum_vec, generator.composition_vec])
# In[4]:

# Divide up the data into a test and training set.
# NOTE(review): `alist` and `targets` are defined earlier in this
# script (not visible in this chunk).
train_size = 1000
train_atoms = alist[:train_size]
test_atoms = alist[train_size:]
train_targets = np.asarray(targets[:train_size])
test_targets = np.asarray(targets[train_size:])

print('{} shape training atoms data'.format(np.shape(train_atoms)))
print('{} shape testing atoms data'.format(np.shape(test_atoms)))

# In[5]:

generator = FeatureGenerator(element_parameters=['atomic_number'])

# Inspect both splits so the generator covers every element present.
generator.normalize_features(train_candidates=train_atoms,
                             test_candidates=test_atoms)
print('Max number of atom present in data: {}'.format(generator.atom_len))
print('Atom numbers present in data: {}'.format(generator.atom_types))

# In[6]:

# Build neighbor-sum feature matrices for both splits.
train_features = generator.return_vec(train_atoms,
                                      [generator.neighbor_sum_vec])

test_features = generator.return_vec(test_atoms, [generator.neighbor_sum_vec])

print('{} shape training feature matrix'.format(np.shape(train_features)))
print('{} shape testing feature matrix'.format(np.shape(test_features)))