예제 #1
0
    def get_data(self):
        """Generate features from atoms objects."""
        # Connect database generated by a GA search.
        gadb = DataConnection('{}/data/gadb.db'.format(wkdir))

        # Get all relaxed candidates from the db file.
        print('Getting candidates from the database')
        all_cand = gadb.get_all_relaxed_candidates(use_extinct=False)

        # Setup the test and training datasets.
        testset = get_unique(atoms=all_cand, size=test_size, key='raw_score')

        trainset = get_train(atoms=all_cand,
                             size=train_size,
                             taken=testset['taken'],
                             key='raw_score')

        # Clear out some old saved data.
        for i in trainset['atoms']:
            del i.info['data']['nnmat']

        # Initiate the fingerprint generators with relevant input variables.
        print('Getting the fingerprints')
        f = FeatureGenerator()

        train_features = f.return_vec(trainset['atoms'],
                                      [f.nearestneighbour_vec])
        test_features = f.return_vec(testset['atoms'],
                                     [f.nearestneighbour_vec])

        train_targets = []
        for a in trainset['atoms']:
            train_targets.append(a.info['key_value_pairs']['raw_score'])
        test_targets = []
        for a in testset['atoms']:
            test_targets.append(a.info['key_value_pairs']['raw_score'])

        return train_features, train_targets, trainset['atoms'], \
            test_features, test_targets, testset['atoms']
예제 #2
0
#
# To start with we import some data. For this tutorial, the data for alloyed nanoparticles are used.

# In[2]:

# Connect ase atoms database.
gadb = DataConnection('../../data/gadb.db')

# Get all relaxed candidates from the db file.
all_cand = gadb.get_all_relaxed_candidates(use_extinct=False)

# We then split this data into some training data and a holdout test set.

# In[3]:

testset = get_unique(atoms=all_cand, size=100, key='raw_score')

trainset = get_train(atoms=all_cand,
                     size=500,
                     taken=testset['taken'],
                     key='raw_score')

trainval = trainset['target']
testval = testset['target']

# Once the data is divided up, we then generate some feature sets. The eigenspectrum features are generated and then single transform engineering functions are used to expand the space slightly.

# In[4]:

generator = FeatureGenerator(atom_types=[78, 79], nprocs=1)
train_data = generator.return_vec(trainset['atoms'],
예제 #3
0
    def test_generators(self):
        """Generate features from atoms objects."""
        # Test generic features for Pt then both Pt and Au.
        get_mendeleev_params(atomic_number=78)
        get_mendeleev_params(atomic_number=[78, 79],
                             params=default_params + ['en_ghosh'])

        # Connect database generated by a GA search.
        gadb = DataConnection('{}/data/gadb.db'.format(wkdir))

        # Get all relaxed candidates from the db file.
        print('Getting candidates from the database')
        all_cand = gadb.get_all_relaxed_candidates(use_extinct=False)

        # Setup the test and training datasets.
        testset = get_unique(atoms=all_cand, size=test_size, key='raw_score')
        self.assertTrue(len(testset['atoms']) == test_size)
        self.assertTrue(len(testset['taken']) == test_size)

        trainset = get_train(atoms=all_cand,
                             size=train_size,
                             taken=testset['taken'],
                             key='raw_score')
        self.assertTrue(len(trainset['atoms']) == train_size)
        self.assertTrue(len(trainset['target']) == train_size)

        # Initiate the fingerprint generators with relevant input variables.
        print('Getting the fingerprints')
        f = FeatureGenerator(element_parameters='atomic_radius', nprocs=1)
        f.normalize_features(trainset['atoms'], testset['atoms'])

        data = f.return_vec(trainset['atoms'], [f.nearestneighbour_vec])
        n, d = np.shape(data)
        self.assertTrue(n == train_size and d == 4)
        self.assertTrue(len(f.return_names([f.nearestneighbour_vec])) == d)
        print('passed nearestneighbour_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.bond_count_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        self.assertTrue(n == train_size and d == 52)
        print('passed bond_count_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.distribution_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        self.assertTrue(n == train_size and d == 10)
        print('passed distribution_vec')

        # EXPENSIVE to calculate. Not included in training data.
        train_fp = f.return_vec(testset['atoms'], [f.connections_vec])
        n, d = np.shape(train_fp)
        self.assertTrue(n == test_size and d == 26)
        print('passed connections_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.rdf_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        self.assertTrue(n == train_size and d == 20)
        print('passed rdf_vec')

        # Start testing the standard fingerprint vector generators.
        train_fp = f.return_vec(trainset['atoms'], [f.element_mass_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        self.assertTrue(n == train_size and d == 1)
        self.assertTrue(len(f.return_names([f.element_mass_vec])) == d)
        print('passed element_mass_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.element_parameter_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        # print(f.return_names([f.element_parameter_vec]))
        self.assertTrue(n == train_size and d == 4)
        self.assertTrue(len(f.return_names([f.element_parameter_vec])) == d)
        print('passed element_parameter_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.composition_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        self.assertTrue(n == train_size and d == 2)
        self.assertTrue(len(f.return_names([f.composition_vec])) == d)
        print('passed composition_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.eigenspectrum_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        self.assertTrue(n == train_size and d == 147)
        self.assertTrue(len(f.return_names([f.eigenspectrum_vec])) == d)
        print('passed eigenspectrum_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.distance_vec])
        n, d = np.shape(train_fp)
        data = np.concatenate((data, train_fp), axis=1)
        self.assertTrue(n == train_size and d == 2)
        self.assertTrue(len(f.return_names([f.distance_vec])) == d)
        print('passed distance_vec')

        train_fp = f.return_vec(
            trainset['atoms'],
            [f.eigenspectrum_vec, f.element_mass_vec, f.composition_vec])
        n, d = np.shape(train_fp)
        self.assertTrue(n == train_size and d == 150)
        self.assertTrue(
            len(
                f.return_names([
                    f.eigenspectrum_vec, f.element_mass_vec, f.composition_vec
                ])) == d)
        print('passed combined generation')

        train_fp = f.return_vec(trainset['atoms'], [f.neighbor_sum_vec])
        n, d = np.shape(train_fp)
        self.assertTrue(n == train_size and d == len(trainset['atoms'][0]))
        # self.assertTrue(len(f.return_names([f.distance_vec])) == d)
        print('passed neighbor_sum_vec')

        train_fp = f.return_vec(trainset['atoms'], [f.neighbor_mean_vec])
        n, d = np.shape(train_fp)
        self.assertTrue(n == train_size and d == len(trainset['atoms'][0]))
        # self.assertTrue(len(f.return_names([f.distance_vec])) == d)
        print('passed neighbor_mean_vec')

        f = FeatureGenerator(element_parameters='atomic_radius',
                             max_neighbors='full',
                             nprocs=1)
        f.normalize_features(trainset['atoms'], testset['atoms'])

        train_fp = f.return_vec(trainset['atoms'], [f.neighbor_sum_vec])
        n, d = np.shape(train_fp)
        self.assertTrue(n == train_size and d == len(trainset['atoms'][0]))
        print('passed neighbor_sum_vec all neighbors')

        train_fp = f.return_vec(trainset['atoms'], [f.neighbor_mean_vec])
        n, d = np.shape(train_fp)
        self.assertTrue(n == train_size and d == len(trainset['atoms'][0]))
        print('passed neighbor_mean_vec all neighbors')

        # Do basic check for atomic porperties.
        no_prop = []
        an_prop = []
        # EXPENSIVE to calculate. Not included in training data.
        for atoms in testset['atoms']:
            no_prop.append(neighbor_features(atoms=atoms))
            an_prop.append(
                neighbor_features(atoms=atoms, property=['atomic_number']))
        self.assertTrue(np.shape(no_prop) == (test_size, 15))
        self.assertTrue(np.shape(an_prop) == (test_size, 30))
        print('passed graph_vec')

        self.__class__.all_cand = all_cand
        self.__class__.data = data