Ejemplo n.º 1
0
    def test_distribution(self):
        """Tests if the random sorting obeys a gaussian distribution. Can
        rarely fail when everything is OK.

        The norms of the first two rows of the noise-free "sorted_l2" matrix
        are used as the means of two normal variables. The frequency at which
        the "random" permutation swaps these two rows is then compared with
        the analytic swap probability, using an absolute tolerance of 1e-2.
        """
        # Get the mean values to compare to: the row norms of the matrix
        # created without noise.
        sigma = 5
        desc = CoulombMatrix(n_atoms_max=5, permutation="sorted_l2", flatten=False)
        cm = desc.create(HHe)
        means = np.linalg.norm(cm, axis=1)
        mu2 = means[0]
        mu1 = means[1]

        # Measures how many times the two rows with biggest norm exchange place
        # when random noise is added. This should correspond to the probability
        # P(X > Y), where X = N(\mu_1, \sigma^2), Y = N(\mu_2, \sigma^2). This
        # probability can be reduced to P(X > Y) = P(X-Y > 0) = P(N(\mu_1 -
        # \mu_2, \sigma^2 + sigma^2) > 0). See e.g.
        # https://en.wikipedia.org/wiki/Sum_of_normally_distributed_random_variables
        desc = CoulombMatrix(n_atoms_max=5, permutation="random", sigma=sigma, flatten=False)
        count = 0
        rand_instances = 20000
        for _ in range(rand_instances):
            cm = desc.create(HHe)
            if np.linalg.norm(cm[0]) < np.linalg.norm(cm[1]):
                count += 1

        # The expected probability is calculated from the cumulative
        # distribution function.
        expected = 1 - scipy.stats.norm.cdf(0, mu1 - mu2, np.sqrt(sigma**2 + sigma**2))
        observed = count/rand_instances

        # assertLessEqual reports both operands if the tolerance is exceeded.
        self.assertLessEqual(abs(expected - observed), 1e-2)
Ejemplo n.º 2
0
 def test_exceptions(self):
     """Tests different invalid parameters that should raise an
     exception.

     Covers an unknown permutation name, a negative ``n_atoms_max`` and a
     ``create`` call on systems that do not fit a descriptor built with
     ``n_atoms_max=2``.
     """
     # Unknown permutation option
     with self.assertRaises(ValueError):
         CoulombMatrix(n_atoms_max=5, permutation="unknown")

     # Negative maximum number of atoms
     with self.assertRaises(ValueError):
         CoulombMatrix(n_atoms_max=-1)

     # The descriptor is built outside the context manager so that only
     # create() can satisfy the assertion; a failing constructor would
     # otherwise make this check pass for the wrong reason.
     # NOTE(review): presumably raised because H2O has more than 2 atoms.
     cm = CoulombMatrix(n_atoms_max=2)
     with self.assertRaises(ValueError):
         cm.create([HHe, H2O])
Ejemplo n.º 3
0
    def test_flatten(self):
        """Checks the output shape of the "sorted_l2" permutation with and
        without flattening.
        """
        # (flatten flag, expected output shape), unflattened case first
        cases = (
            (False, (5, 5)),
            (True, (25,)),
        )
        for flatten, expected_shape in cases:
            descriptor = CoulombMatrix(
                n_atoms_max=5, permutation="sorted_l2", flatten=flatten
            )
            output = descriptor.create(H2O)
            self.assertEqual(output.shape, expected_shape)
Ejemplo n.º 4
0
    def test_flatten(self):
        """Checks that the "eigenspectrum" output has shape (5,) whether or
        not flattening is requested.
        """
        # Unflattened first, then flattened; both must give the same shape
        for flatten in (False, True):
            descriptor = CoulombMatrix(
                n_atoms_max=5, permutation="eigenspectrum", flatten=flatten
            )
            spectrum = descriptor.create(H2O)
            self.assertEqual(spectrum.shape, (5,))
Ejemplo n.º 5
0
    def test_sparse(self):
        """Tests the sparse matrix creation.

        With ``sparse=False`` the output type must be exactly
        ``numpy.ndarray``; with ``sparse=True`` it must be exactly
        ``scipy.sparse.coo_matrix``.
        """
        # Dense
        desc = CoulombMatrix(n_atoms_max=5, permutation="none", flatten=False, sparse=False)
        vec = desc.create(H2O)
        # assertIs keeps the exact-type comparison (subclasses are rejected)
        # while reporting the actual type when the check fails.
        self.assertIs(type(vec), np.ndarray)

        # Sparse
        desc = CoulombMatrix(n_atoms_max=5, permutation="none", flatten=True, sparse=True)
        vec = desc.create(H2O)
        self.assertIs(type(vec), scipy.sparse.coo_matrix)
Ejemplo n.º 6
0
    def test_norm_vector(self):
        """Checks that creating a randomly sorted matrix fills in the
        _norm_vector attribute and that a second creation still works.
        """
        descriptor = CoulombMatrix(
            n_atoms_max=5, permutation="random", sigma=100, flatten=False
        )
        first = descriptor.create(H2O)
        self.assertEqual(len(first), 5)

        # The norm vector is not zero padded in this implementation: it has
        # one entry per atom, and all zero-padding is done at the end after
        # the random sorting.
        norm_vector = descriptor._norm_vector
        self.assertEqual(len(norm_vector), 3)

        second = descriptor.create(H2O)
        self.assertEqual(len(second), 5)
Ejemplo n.º 7
0
    def test_match_with_sorted(self):
        """Checks that sorting a randomly permuted Coulomb matrix gives
        exactly the matrix produced by the "sorted_l2" permutation.
        """
        # Random matrix, then sorted with the descriptor's own sort()
        random_desc = CoulombMatrix(
            n_atoms_max=5, permutation="random", sigma=100, flatten=False
        )
        sorted_from_random = random_desc.sort(random_desc.create(H2O))

        # Reference: matrix that is sorted at creation time
        sorted_desc = CoulombMatrix(
            n_atoms_max=5, permutation="sorted_l2", flatten=False
        )
        reference = sorted_desc.create(H2O)

        matches = np.array_equal(reference, sorted_from_random)
        self.assertTrue(matches)
Ejemplo n.º 8
0
    def test_sparse(self):
        """Tests the sparse matrix creation.

        With ``sparse=False`` the output type must be exactly
        ``numpy.ndarray``; with ``sparse=True`` it must be exactly
        ``sparse.COO``.
        """
        # Dense
        desc = CoulombMatrix(
            n_atoms_max=5, permutation="random", sigma=100, flatten=False, sparse=False
        )
        vec = desc.create(H2O)
        # assertIs keeps the exact-type comparison (subclasses are rejected)
        # while reporting the actual type when the check fails.
        self.assertIs(type(vec), np.ndarray)

        # Sparse
        desc = CoulombMatrix(
            n_atoms_max=5, permutation="random", sigma=100, flatten=True, sparse=True
        )
        vec = desc.create(H2O)
        self.assertIs(type(vec), sparse.COO)
Ejemplo n.º 9
0
    def test_features(self):
        """Checks the unpermuted Coulomb matrix of H2O element by element
        against a hand-built reference.
        """
        descriptor = CoulombMatrix(n_atoms_max=5, permutation="none", flatten=False)
        matrix = descriptor.create(H2O)

        # Build the reference directly inside a zero matrix of the padded
        # size; only the leading 3x3 block is filled.
        charges = H2O.get_atomic_numbers()
        coords = H2O.get_positions()
        expected = np.zeros((5, 5))
        for i in range(3):
            for j in range(3):
                if i == j:
                    # Diagonal: 0.5 * Z_i^2.4
                    expected[i, j] = 0.5 * charges[i]**2.4
                else:
                    # Off-diagonal: Z_i * Z_j / |R_i - R_j|
                    separation = np.linalg.norm(coords[i] - coords[j])
                    expected[i, j] = charges[i] * charges[j] / separation

        self.assertTrue(np.array_equal(matrix, expected))
Ejemplo n.º 10
0
    def test_parallel_dense(self):
        """Checks that dense output created for several systems at once,
        serially or in parallel, equals the per-system output.
        """
        systems = [molecule("CO"), molecule("N2O")]
        flat_desc = CoulombMatrix(
            n_atoms_max=5, permutation="none", flatten=True, sparse=False
        )
        n_features = flat_desc.get_number_of_features()

        # Let the number of jobs be derived from the CPU count, first
        # counting all cores and then only the physical ones.
        for physical_only in (False, True):
            flat_desc.create(
                system=systems, n_jobs=-1, only_physical_cores=physical_only
            )

        # Multiple systems: serial job (n_jobs=1), then parallel job (n_jobs=2)
        for n_jobs in (1, 2):
            batch = flat_desc.create(system=systems, n_jobs=n_jobs)
            reference = np.empty((2, n_features))
            for row, system in enumerate(systems):
                reference[row, :] = flat_desc.create(system)
            self.assertTrue(np.allclose(batch, reference))

        # Non-flattened output
        matrix_desc = CoulombMatrix(
            n_atoms_max=5, permutation="none", flatten=False, sparse=False
        )
        batch = matrix_desc.create(system=systems, n_jobs=2)
        reference = np.empty((2, 5, 5))
        for row, system in enumerate(systems):
            reference[row] = matrix_desc.create(system)
        self.assertTrue(np.allclose(np.array(batch), reference))
Ejemplo n.º 11
0
    def test_flatten(self):
        """Checks the output shape of the "random" permutation with and
        without flattening.
        """
        # (flatten flag, expected output shape), unflattened case first
        cases = (
            (False, (5, 5)),
            (True, (1, 25)),
        )
        for flatten, expected_shape in cases:
            descriptor = CoulombMatrix(
                n_atoms_max=5, permutation="random", sigma=100, flatten=flatten
            )
            output = descriptor.create(H2O)
            self.assertEqual(output.shape, expected_shape)
Ejemplo n.º 12
0
    def test_features(self):
        """Checks that the rows of the "sorted_l2" matrix appear in
        non-increasing order of their norm.
        """
        descriptor = CoulombMatrix(n_atoms_max=5, permutation="sorted_l2", flatten=False)
        matrix = descriptor.create(H2O)

        # Compare every row norm with the norm of the row before it
        row_norms = np.linalg.norm(matrix, axis=1)
        for previous, current in zip(row_norms[:-1], row_norms[1:]):
            self.assertTrue(current <= previous)
Ejemplo n.º 13
0
class Global_Descriptor_CM(Global_Descriptor_Base):
    """Global Coulomb Matrix descriptor that wraps DScribe's ``CoulombMatrix``."""

    def __init__(self, desc_spec):
        """
        make a DScribe CM object

        Parameters
        ----------
        desc_spec: a dictionary. Must contain 'type' equal to "CM" and
                   'max_atoms' (passed to ``CoulombMatrix`` as its first
                   argument). May contain 'periodic', which must not be True.

        Raises
        ------
        ValueError: if 'type' is missing or is not "CM", if 'max_atoms' is
                    missing, or if 'periodic' is True.
        """

        from dscribe.descriptors import CoulombMatrix

        if "type" not in desc_spec.keys() or desc_spec["type"] != "CM":
            raise ValueError(
                "Type is not CM or cannot find the type of the descriptor")

        # required
        try:
            self.max_atoms = desc_spec['max_atoms']
        except KeyError as err:
            # Only a missing key means "not enough information"; any other
            # error is left to propagate unchanged.
            raise ValueError(
                "Not enough information to initialize the `Global_Descriptor_CM` object"
            ) from err

        if 'periodic' in desc_spec.keys() and desc_spec['periodic'] == True:
            raise ValueError(
                "Coulomb Matrix cannot be used for periodic systems")

        self.cm = CoulombMatrix(self.max_atoms)
        print("Using CoulombMatrix ...")
        # make an acronym
        self.acronym = "CM" + "-" + str(self.max_atoms)

    def create(self, frame):
        """
        compute the CM descriptor vector for a frame

        Parameters
        ----------
        frame: ASE atom object. Coordinates of a frame.

        Returns
        -------
        desc_dict: a dictionary with the keys 'acronym' (the string built in
                   the constructor, "CM-<max_atoms>") and 'descriptors' (the
                   output of ``CoulombMatrix.create`` for the frame).
        atomic_desc_dict : {} (always empty, this is a global descriptor)

        Raises
        ------
        ValueError: if the frame has more positions than ``max_atoms``.
        """
        if len(frame.get_positions()) > self.max_atoms:
            raise ValueError(
                'the size of the system is larger than the max_atoms of the CM descriptor'
            )
        # notice that we return an empty dictionary for "atomic descriptors"
        return {
            'acronym': self.acronym,
            'descriptors': self.cm.create(frame, n_jobs=1)
        }, {}
Ejemplo n.º 14
0
    def test_features(self):
        """Checks the "eigenspectrum" output: its shape, the ordering of the
        eigenvalues by absolute value and the zero padding.
        """
        descriptor = CoulombMatrix(n_atoms_max=5, permutation="eigenspectrum")
        spectrum = descriptor.create(H2O)
        n_atoms = len(H2O)

        self.assertEqual(spectrum.shape, (5,))

        # Eigenvalues must be in decreasing order of absolute value; the
        # first one is compared against infinity.
        previous = float("Inf")
        for current in spectrum[:n_atoms]:
            self.assertTrue(abs(current) <= abs(previous))
            previous = current

        # Everything after the eigenvalues must be zero padding
        padding = spectrum[n_atoms:]
        self.assertTrue(np.array_equal(padding, [0, 0]))
Ejemplo n.º 15
0
    def test_periodicity(self):
        """Checks that the Coulomb matrix uses the plain distance between the
        stored positions even when the system has periodic boundaries.
        """
        # Two hydrogen atoms in a periodic cell of side 5
        hydrogens = Atoms(
            symbols=["H", "H"],
            scaled_positions=[[0.1, 0, 0], [0.9, 0, 0]],
            cell=[5, 5, 5],
            pbc=True,
        )
        descriptor = CoulombMatrix(n_atoms_max=5, permutation="none", flatten=False)
        matrix = descriptor.create(hydrogens)

        # Expected off-diagonal element: Z_1 * Z_2 / |R_1 - R_2| with Z = 1
        first, second = hydrogens.get_positions()
        expected = 1 * 1 / np.linalg.norm(first - second)
        self.assertEqual(matrix[0, 1], expected)
Ejemplo n.º 16
0
def ML_potential(config, data):
    """Predict the energy of one O2H5 configuration with the fitted model.

    The model is read from ``data['metadata'][3]['best_model_fitted']`` and
    the descriptor kind from ``data['metadata'][1]['descriptor_type']``.
    For 'Coulomb_matrix' a flattened, "sorted_l2" Coulomb matrix for up to 7
    atoms is used; for 'PIV' the callable stored under
    ``data['metadata'][1]['descriptor']`` is used.

    Returns ``model.predict(X)[0][0]``, or None when the descriptor type is
    neither of the two.
    """
    metadata = data['metadata']
    model = metadata[3]['best_model_fitted']
    descriptor_type = metadata[1]['descriptor_type']

    # Pick the callable that turns an Atoms object into model input
    if descriptor_type == 'Coulomb_matrix':
        featurize = CoulombMatrix(n_atoms_max=7,
                                  flatten=True,
                                  permutation='sorted_l2').create
    elif descriptor_type == 'PIV':
        featurize = metadata[1]['descriptor']
    else:
        return None

    atoms = Atoms('O2H5', positions=config)
    prediction = model.predict(featurize(atoms))
    return prediction[0][0]
Ejemplo n.º 17
0
def setupDescs(structs, indexs, level, descname, chemsyms_uniques, n_atoms,
               steve, v):
    """
    Setup descriptor and run it for ASE structures.
    Return DataFrame with given structures as descriptors

    Parameters
    ----------
    structs: structures passed to the descriptor's ``create``; ``structs.shape[0]``
             is read when ``v`` is truthy.
    indexs: index used for the returned DataFrame.
    level: label that is only used in the verbose message.
    descname: "CM", "MBTR" or "SOAP".
    chemsyms_uniques: species passed to MBTR and SOAP.
    n_atoms: ``n_atoms_max`` of the Coulomb matrix.
    steve: number of parallel jobs (``n_jobs``) used by ``create``.
    v: when truthy, print a summary.

    Returns
    -------
    (descs_df, n_feat): the descriptors as a DataFrame and the number of
    features reported by the descriptor.

    Raises
    ------
    ValueError: if ``descname`` is not one of the supported names.
    """
    # choose the descriptor
    if descname == "CM":
        # permutation = 'sorted_l2' is default
        desc = CoulombMatrix(n_atoms_max=n_atoms, flatten=True)
    elif descname == "MBTR":
        desc = MBTR(species=chemsyms_uniques,
                    k1=mk1,
                    k2=mk2,
                    k3=mk3,
                    periodic=False,
                    normalization="l2_each",
                    flatten=True)
    elif descname == "SOAP":
        desc = SOAP(species=chemsyms_uniques,
                    periodic=False,
                    rcut=srcut,
                    nmax=snmax,
                    lmax=slmax,
                    average=True)  # Averaging for global
    else:
        # Fail with a clear message instead of an UnboundLocalError on `desc`
        raise ValueError(
            "Unknown descriptor name {!r}; expected 'CM', 'MBTR' or 'SOAP'".format(
                descname))

    n_feat = desc.get_number_of_features()

    # Create descriptors
    descs = desc.create(structs, n_jobs=steve)  # Parallel

    # Create a DF of returned `list` of `arrays` of descs
    descs_df = pd.DataFrame(descs, index=indexs)

    if v:
        print("""🔘 Created {}-descriptors for all {} {}-structures.
    Number of features in {}: {}""".format(descname, structs.shape[0], level,
                                           descname, n_feat))

    return descs_df, n_feat
Ejemplo n.º 18
0
    def test_parallel_sparse(self):
        """Checks that sparse output created for several systems at once,
        serially or in parallel, equals the per-system output.
        """
        systems = [molecule("CO"), molecule("N2O")]
        flat_desc = CoulombMatrix(
            n_atoms_max=5, permutation="none", flatten=True, sparse=True
        )
        n_features = flat_desc.get_number_of_features()

        # Multiple systems: serial job (n_jobs=1), then parallel job (n_jobs=2)
        for n_jobs in (1, 2):
            batch = flat_desc.create(system=systems, n_jobs=n_jobs).toarray()
            reference = np.empty((2, n_features))
            for row, system in enumerate(systems):
                reference[row, :] = flat_desc.create(system).toarray()
            self.assertTrue(np.allclose(batch, reference))

        # Non-flattened output: the batch is converted to dense item by item
        matrix_desc = CoulombMatrix(
            n_atoms_max=5, permutation="none", flatten=False, sparse=True
        )
        batch = [
            item.toarray()
            for item in matrix_desc.create(system=systems, n_jobs=2)
        ]
        reference = np.empty((2, 5, 5))
        for row, system in enumerate(systems):
            reference[row] = matrix_desc.create(system).toarray()
        self.assertTrue(np.allclose(np.array(batch), reference))
Ejemplo n.º 19
0
# Example script: create Coulomb matrices for single and multiple molecules,
# first with the default settings and then with flatten=False.

# NOTE(review): none of the four values below is used by any statement
# visible in this script; they look like leftovers from a SOAP example.
atomic_numbers = [1, 8]
rcut = 6.0
nmax = 8
lmax = 6

# Setting up the CM descriptor for systems with at most 6 atoms
cm = CoulombMatrix(n_atoms_max=6, )

# Creation
from ase.build import molecule

# Molecule created as an ASE.Atoms
methanol = molecule("CH3OH")

# Create CM output for the system
cm_methanol = cm.create(methanol)

# Show the output and its shape
# NOTE(review): the "flattened" label presumes flatten defaults to True.
print(cm_methanol)
print("flattened", cm_methanol.shape)

# Create output for multiple systems; the second call overwrites the result
# of the first one and only differs in the number of jobs.
samples = [molecule("H2O"), molecule("NO2"), molecule("CO2")]
coulomb_matrices = cm.create(samples)  # Serial
coulomb_matrices = cm.create(samples, n_jobs=2)  # Parallel

# No flattening: rebuild the descriptor and recreate the methanol output
cm = CoulombMatrix(n_atoms_max=6, flatten=False)
cm_methanol = cm.create(methanol)

print(cm_methanol)
print("not flattened", cm_methanol.shape)
Ejemplo n.º 20
0
 def create(system):
     """Return the flattened, unpermuted Coulomb matrix (at most 3 atoms)
     of ``system``.
     """
     settings = {"n_atoms_max": 3, "permutation": "none", "flatten": True}
     return CoulombMatrix(**settings).create(system)
Ejemplo n.º 21
0
def main(fxyz, dictxyz, prefix, output, max_atoms, stride):
    """

    Generate the Coulomb Matrix descriptors.

    Parameters
    ----------
    fxyz: string giving location of xyz file ('none' to skip)
    dictxyz: string giving location of xyz file that is used as a dictionary
             ('none' to skip)
    prefix: string giving the filename prefix
    output: 'xyz': append the representations to extended xyz file;
            'matrix': output as a standalone matrix (".desc" file)
    max_atoms: int: Max number of atoms in the Coulomb Matrix
    stride: compute descriptor each X frames

    Raises
    ------
    ValueError: if a frame is processed and ``output`` is neither 'matrix'
                nor 'xyz'.
    """

    fframes = []
    dictframes = []

    # read frames
    if fxyz != 'none':
        fframes = read(fxyz, slice(0, None, stride))
        nfframes = len(fframes)
        print("read xyz file:", fxyz, ", a total of", nfframes, "frames")
    # read frames in the dictionary
    if dictxyz != 'none':
        dictframes = read(dictxyz, ':')
        ndictframes = len(dictframes)
        print("read xyz file used for a dictionary:", dictxyz, ", a total of",
              ndictframes, "frames")

    # dictionary frames come first, and periodicity is switched off for all
    frames = dictframes + fframes
    nframes = len(frames)
    global_species = []
    for frame in frames:
        global_species.extend(frame.get_atomic_numbers())
        frame.set_pbc([False, False, False])
    global_species = np.unique(global_species)
    print("a total of", nframes, "frames, with elements: ", global_species)

    rep_atomic = CoulombMatrix(max_atoms)
    foutput = prefix + "-max_atoms" + str(max_atoms)
    desc_name = "CM" + "-max_atoms" + str(max_atoms)

    # prepare for the output: move results of a previous run out of the way
    if os.path.isfile(foutput + ".xyz"):
        os.rename(foutput + ".xyz", "bck." + foutput + ".xyz")
    if os.path.isfile(foutput + ".desc"):
        os.rename(foutput + ".desc", "bck." + foutput + ".desc")

    for frame in frames:
        fnow = rep_atomic.create(frame, n_jobs=8)

        frame.info[desc_name] = fnow
        # save
        if output == 'matrix':
            # One descriptor row per frame is appended to the ".desc" file.
            # The former second write to an undefined `fatomic` handle
            # raised NameError and has been removed.
            with open(foutput + ".desc", "ab") as f:
                np.savetxt(f, frame.info[desc_name][None])
        elif output == 'xyz':
            # write the frame, including its info entry, to the xyz file
            write(foutput + ".xyz", frame, append=True)
        else:
            raise ValueError('Cannot find the output format')
Ejemplo n.º 22
0
def _append_feature_columns(data, featurize):
    """Apply ``featurize`` to every configuration and append the results to ``data`` as columns."""
    configurations = data['configuration'].tolist()
    features = [featurize(configuration) for configuration in configurations]
    return pd.concat([data, pd.DataFrame(features)], axis=1)


def _record_feature_columns(descriptor_metadata, n_features):
    """Store the feature column names 0..n_features-1 and the 'energy' label in the metadata."""
    features = list(range(n_features))
    labels = ['energy']
    descriptor_metadata['features'] = features
    descriptor_metadata['labels'] = labels
    descriptor_metadata['features_and_labels'] = labels + features


def select_descriptor(data, descriptor_metadata):
    """Compute the descriptor named by ``descriptor_metadata['descriptor_type']``.

    For every recognised type one feature vector is computed per entry of
    ``data['configuration']``, the vectors are appended to ``data`` as extra
    columns, the feature and label column names are recorded in
    ``descriptor_metadata`` and the metadata is stored at
    ``data['metadata'].at[1]``. For an unrecognised type a message is printed
    and the type is replaced by 'NO_DESCRIPTOR_SELECTED'.

    Parameters
    ----------
    data : table with the columns 'configuration' and 'metadata'
        NOTE(review): presumably a pandas DataFrame whose configurations are
        ASE Atoms objects of an O2H5 system (two O first, then five H);
        the hard-coded index ranges below rely on that - confirm.
    descriptor_metadata : dict
        Must contain 'descriptor_type'. The 'Coulomb_matrix' type also
        requires 'scaling' and 'permutation'.

    Returns
    -------
    The table with the feature columns appended and the metadata updated.
    """
    print('creating descriptor....')
    from dscribe.descriptors import CoulombMatrix

    descriptor_type = descriptor_metadata['descriptor_type']

    if descriptor_type == 'Coulomb_matrix':
        # Not used below; the lookup is kept so that metadata without a
        # 'scaling' entry still fails with KeyError as it did before.
        scaling = descriptor_metadata['scaling']
        descriptor = CoulombMatrix(
            # len() replaces the deprecated get_number_of_atoms() accessor
            n_atoms_max=len(data['configuration'][0]),
            flatten=True,
            permutation=descriptor_metadata['permutation'])

        rounding = 5
        configurations = data['configuration'].tolist()
        features = [
            descriptor.create(configuration)
            for configuration in configurations
        ]
        features = np.round(np.array(features), rounding)
        data = pd.concat([data, pd.DataFrame(features)], axis=1)

        descriptor_metadata['descriptor'] = descriptor
        _record_feature_columns(descriptor_metadata,
                                descriptor.get_number_of_features())

    elif descriptor_type == 'PIV':

        # All switching functions are currently the identity. The disabled
        # form was 1/(1+(x/d0)**n) with n = 8 and d0 = 4.5 (O-O) or
        # d0 = 2.3 (O-H+).
        def switching_OO(x):
            return x

        def switching_OH_plus(x):
            return x

        def switching_HH(x):
            return x

        def switching_HH_plus(x):
            return switching_HH(x)

        def switching_OH(x):
            return switching_HH(x)

        def PIV(configuration):
            # Keep only the strict upper triangle of the distance matrix
            distances = configuration.get_all_distances()
            np.fill_diagonal(distances, 0)
            distances[np.tril_indices(distances.shape[0], -1)] = 0

            # Group the distances by pair type
            OO = distances[0, 1].flatten()
            OH_plus = distances[0:2, 2].flatten()
            OH = distances[0:2, 3:7].flatten()
            HH_plus = distances[2, 3:7].flatten()
            HH = distances[3:6, 4:7].flatten()
            # drop the entries of the 3x3 block that lie below the diagonal
            HH = np.delete(HH, (3, 6, 7))

            # Switch and sort every group separately
            OO = np.sort(switching_OO(OO))
            OH_plus = np.sort(switching_OH_plus(OH_plus))
            OH = np.sort(switching_OH(OH))
            HH_plus = np.sort(switching_HH_plus(HH_plus))
            HH = np.sort(switching_HH(HH))

            return np.concatenate((OO, OH_plus, OH, HH_plus, HH), axis=None)

        data = _append_feature_columns(data, PIV)

        _record_feature_columns(descriptor_metadata, 21)
        descriptor_metadata['switching_OO'] = switching_OO
        descriptor_metadata['switching_OH_plus'] = switching_OH_plus
        descriptor_metadata['switching_OH'] = switching_OH
        descriptor_metadata['switching_HH_plus'] = switching_HH_plus
        descriptor_metadata['switching_HH'] = switching_HH
        descriptor_metadata['descriptor'] = PIV

    elif descriptor_type == 'CM_with_PIV_sorting':

        def PIV(configuration):
            def w_diag(x):
                return 0.5 * np.power(x, 2.4)

            distances = configuration.get_all_distances()
            # Put ones on the diagonal before inverting; the diagonal is
            # overwritten with w_diag below.
            np.fill_diagonal(distances, 1)
            distances = 1 / distances

            # Diagonal: w_diag of the nuclear charge (8 for O, 1 for H)
            distances[0, 0] = w_diag(8)
            distances[1, 1] = w_diag(8)
            for i in range(2, 7):
                distances[i, i] = w_diag(1)

            # Off-diagonal weights: 8*8 for O-O and 8*1 for O-H
            distances[0, 1] = distances[0, 1] * 64
            distances[1, 0] = distances[1, 0] * 64
            distances[0:2, 2:7] = distances[0:2, 2:7] * 8
            distances[2:7, 0:2] = distances[2:7, 0:2] * 8

            OO = distances[0:2, 0:2].flatten()
            HH1 = distances[0:7, 2:7].flatten()
            HH2 = distances[2:7, 0:2].flatten()
            HH = np.append(HH2, HH1)

            return np.concatenate((np.sort(OO), np.sort(HH)), axis=None)

        data = _append_feature_columns(data, PIV)
        _record_feature_columns(descriptor_metadata, 49)

    elif descriptor_type == 'PIV_without_H_plus':

        def PIV(configuration):
            # Zero the lower triangle, then drop every zero entry below
            distances = configuration.get_all_distances()
            distances[np.tril_indices(distances.shape[0], -1)] = 0

            OO = distances[0:2, 0:2].flatten()
            HH1 = distances[0:7, 2:7].flatten()
            HH2 = distances[2:7, 0:2].flatten()
            HH = np.append(HH2, HH1)

            OO = np.sort(OO)
            HH = np.sort(HH)
            OO = OO[OO != 0]
            HH = HH[HH != 0]
            return np.concatenate((OO, HH), axis=None)

        data = _append_feature_columns(data, PIV)
        _record_feature_columns(descriptor_metadata, 21)

    elif descriptor_type == 'PIV_with_CM_diagonal_and_weighting':

        def PIV(configuration):
            def w_diag(x):
                return 0.5 * np.power(x, 2.4)

            # This PIV has 28 elements
            distances = configuration.get_all_distances()
            # Put ones on the diagonal before inverting; the diagonal is
            # overwritten with w_diag below.
            np.fill_diagonal(distances, 1)
            distances = 1 / distances

            # Diagonal: w_diag of the nuclear charge (8 for O, 1 for H)
            distances[0, 0] = w_diag(8)
            distances[1, 1] = w_diag(8)
            for i in range(2, 7):
                distances[i, i] = w_diag(1)

            # Zero the lower triangle, then drop every zero entry below
            distances[np.tril_indices(distances.shape[0], -1)] = 0

            OO = distances[0:2, 0:2].flatten()
            HH1 = distances[0:7, 2:7].flatten()
            HH2 = distances[2:7, 0:2].flatten()
            HH = np.append(HH2, HH1)

            OO = np.sort(OO)
            HH = np.sort(HH)
            OO = OO[OO != 0]
            HH = HH[HH != 0]
            return np.concatenate((OO, HH), axis=None)

        data = _append_feature_columns(data, PIV)
        _record_feature_columns(descriptor_metadata, 28)

    else:
        print('NO DESCRIPTOR SELECTED!')
        descriptor_metadata['descriptor_type'] = 'NO_DESCRIPTOR_SELECTED'

    # Every branch ends by storing the (updated) metadata in the table
    data['metadata'].at[1] = descriptor_metadata
    return data
Ejemplo n.º 23
0
from dscribe.descriptors import SOAP
from dscribe.descriptors import CoulombMatrix
from ase.build import molecule

# Example script: build a Coulomb matrix and a SOAP descriptor for water.

# Define geometry: a water molecule from ASE's molecule builder
mol = molecule("H2O")

# Setup descriptors: a Coulomb matrix for at most 3 atoms with the
# "sorted_l2" permutation, and SOAP for the atomic numbers 1 (H) and 8 (O)
cm_desc = CoulombMatrix(n_atoms_max=3, permutation="sorted_l2")
soap_desc = SOAP(atomic_numbers=[1, 8], rcut=5, nmax=8, lmax=6, crossover=True)

# Create descriptors as numpy arrays or scipy sparse matrices
input_cm = cm_desc.create(mol)
# NOTE(review): positions=[0] presumably selects atom index 0 as the only
# SOAP center - confirm against the installed DScribe version.
input_soap = soap_desc.create(mol, positions=[0])
Ejemplo n.º 24
0
# Dataset statistics, read from the `stats` mapping
atomic_numbers = stats["atomic_numbers"]
max_atomic_number = stats["max_atomic_number"]
min_atomic_number = stats["min_atomic_number"]
min_distance = stats["min_distance"]

cm_desc = CoulombMatrix(
    n_atoms_max=29,  # maximum number of atoms in a molecule that occurs in dataset
    permutation="sorted_l2",
    # sparse=True
)

time_start = time.time()
cm_start = time.time()
# Create CM for data and record how long the creation took (in seconds,
# rounded to three decimals)
cm = cm_desc.create(ase_mol)
cm_end = time.time()
cm_time = np.round(cm_end - cm_start, decimals=3)

# Split CM and homo array into 5 different parts: first shuffle both with
# one common permutation so that rows and targets stay aligned.

# define index: one entry per row of the CM array
index = np.arange(np.shape(cm)[0])
# shuffle index in place
np.random.shuffle(index)
# return shuffled cm matrix
shuffled_cm = cm[index, :]
# return shuffled homo array; the conversion to an array allows indexing
# with the shuffled index
homo = np.array(homo)
shuffled_homo = homo[index]
Ejemplo n.º 25
0
 def create(system):
     """Return the flattened, randomly sorted Coulomb matrix (at most 3
     atoms, sigma of 0.000001) of ``system``.
     """
     settings = {
         "n_atoms_max": 3,
         "permutation": "random",
         "sigma": 0.000001,
         "flatten": True,
     }
     return CoulombMatrix(**settings).create(system)
Ejemplo n.º 26
0
from ase.build import molecule
from dscribe.descriptors import CoulombMatrix

# Example script: Coulomb matrices for one molecule and for a list of
# molecules.

# Define atomic structures: three triatomic molecules from ASE's builder
samples_mol = [molecule("H2O"), molecule("NO2"), molecule("CO2")]

# Setup descriptor: at most 3 atoms, "sorted_l2" permutation
cm_desc = CoulombMatrix(n_atoms_max=3, permutation="sorted_l2")

# Create descriptor for a single system (the first sample, water)
water = samples_mol[0]
coulomb_matrix = cm_desc.create(water)

print("Coulomb matrix for water:\n", coulomb_matrix)

# Create multiple descriptors by passing the whole list to create()
coulomb_matrices = cm_desc.create(samples_mol)

print("List of Coulomb matrices:\n", coulomb_matrices)
Ejemplo n.º 27
0
import numpy as np
from ase.build import molecule
from dscribe.descriptors import SOAP
from dscribe.descriptors import CoulombMatrix

# Example script: Coulomb matrix and SOAP descriptors for single and
# multiple systems, plus SOAP derivatives.

# Define atomic structures: three triatomic molecules from ASE's builder
samples = [molecule("H2O"), molecule("NO2"), molecule("CO2")]

# Setup descriptors: a Coulomb matrix for at most 3 atoms and SOAP for the
# four elements that occur in the samples
cm_desc = CoulombMatrix(n_atoms_max=3, permutation="sorted_l2")
soap_desc = SOAP(species=["C", "H", "O", "N"], rcut=5, nmax=8, lmax=6, crossover=True)

# Create descriptors as numpy arrays or sparse arrays
water = samples[0]
coulomb_matrix = cm_desc.create(water)
# NOTE(review): positions=[0] presumably selects atom index 0 as the only
# SOAP center - confirm against the installed DScribe version.
soap = soap_desc.create(water, positions=[0])

# Easy to use also on multiple systems, can be parallelized across processes
# (the second call overwrites the result of the first and only adds n_jobs)
coulomb_matrices = cm_desc.create(samples)
coulomb_matrices = cm_desc.create(samples, n_jobs=3)
# Per sample, the indices of the atoms whose atomic number is 8 (oxygen);
# they are passed to SOAP as the second positional argument
oxygen_indices = [np.where(x.get_atomic_numbers() == 8)[0] for x in samples]
oxygen_soap = soap_desc.create(samples, oxygen_indices, n_jobs=3)

# Some descriptors also allow calculating derivatives with respect to atomic
# positions; with return_descriptor=True two values are returned and
# unpacked here as (der, des)
der, des = soap_desc.derivatives(samples, method="auto", return_descriptor=True)
Ejemplo n.º 28
0
def plotDescs(structs,
              indexs,
              level,
              descname,
              chemsyms,
              n_atoms,
              steve,
              v,
              path_output,
              save=True):
    """
    Plot the descriptor selected by ``descname`` for ``structs``.

    Parameters:
        structs: system(s) handed to the descriptor's ``create`` method.
        indexs: label used in the plot titles; its last four characters
            are stripped when building output file names (presumably a
            file name with a three-letter extension -- confirm with caller).
        level: not used by this function.
        descname: ``"CM"``, ``"MBTR"`` or ``"SOAP"``. Any other value
            produces no plot.
        chemsyms: chemical symbols; used as heatmap tick labels for CM and,
            de-duplicated, as the species list for MBTR and SOAP.
        n_atoms: ``n_atoms_max`` of the Coulomb matrix.
        steve: forwarded as ``n_jobs`` to ``create``.
        v: when truthy, print a completion message.
        path_output: directory that receives the saved figures.
        save: when truthy, write each figure to ``path_output``.

    The MBTR branch reads the module-level settings ``mk1``, ``mk2`` and
    ``mk3``; the SOAP branch reads ``srcut``, ``snmax`` and ``slmax``.
    """
    if descname == "CM":
        desc = CoulombMatrix(
            n_atoms_max=n_atoms, flatten=False,
            permutation='none')  # permutation = 'sorted_l2' is default
        descs = desc.create(structs, n_jobs=steve)  # Parallel
        # Heatmap of the (unflattened) Coulomb matrix
        sns.heatmap(descs,
                    cmap='Spectral',
                    robust=True,
                    xticklabels=chemsyms,
                    yticklabels=chemsyms)
        plt.title("CM of {}".format(indexs))
        if save:
            plt.savefig("{}/{}_CM.png".format(path_output, indexs[:-4]))

    elif descname == "MBTR":
        desc = MBTR(species=list(set(chemsyms)),
                    k1=mk1,
                    k2=mk2,
                    k3=mk3,
                    periodic=False,
                    normalization="l2_each",
                    flatten=False)
        descs = desc.create(structs, n_jobs=steve)  # Parallel
        n_elements = len(desc.species)
        # Map each species index of the output to its chemical symbol
        # (index -> atomic number -> symbol).
        imap = desc.index_to_atomic_number
        smap = {
            index: ase.data.chemical_symbols[number]
            for index, number in imap.items()
        }

        # Plot k=1: one curve per species
        x1 = desc.get_k1_axis()  # from fullmetalfelix/ML-CSC-tutorial
        _, ax = plt.subplots()
        for i in range(n_elements):
            plt.plot(x1, descs["k1"][i, :], label="{}".format(smap[i]))
        ax.set_xlabel("Atomic number")
        ax.set_ylabel("k1 values (arbitrary units)")
        plt.legend()
        plt.title("MBTR k1 of {}".format(indexs))
        if save:
            plt.savefig("{}/{}_MBTR_k1.png".format(path_output, indexs[:-4]))

        # Plot k=2: one curve per unordered species pair (j >= i)
        x2 = desc.get_k2_axis()  # from fullmetalfelix/ML-CSC-tutorial
        _, ax = plt.subplots()
        for i in range(n_elements):
            for j in range(i, n_elements):
                plt.plot(x2,
                         descs["k2"][i, j, :],
                         label="{}-{}".format(smap[i], smap[j]))
        ax.set_xlabel("Inverse distance (1/angstrom)")
        ax.set_ylabel("k2 values (arbitrary units)")
        plt.legend()
        plt.title("MBTR k2 of {}".format(indexs))
        if save:
            plt.savefig("{}/{}_MBTR_k2.png".format(path_output, indexs[:-4]))

        # Plot k=3: species triples with k >= j >= i; only triples whose
        # last element is sulfur ("S") are drawn.
        x3 = desc.get_k3_axis()  # from fullmetalfelix/ML-CSC-tutorial
        _, ax = plt.subplots()
        for i in range(n_elements):
            for j in range(i, n_elements):
                for k in range(j, n_elements):
                    if smap[k] == "S":
                        plt.plot(x3,
                                 descs["k3"][i, j, k, :],
                                 label="{}-{}-{}".format(
                                     smap[i], smap[j], smap[k]))
        ax.set_xlabel("cos(angle)")
        ax.set_ylabel("k3 values (arbitrary units)")
        plt.legend()
        plt.title("MBTR k3 of {}".format(indexs))
        if save:
            plt.savefig("{}/{}_MBTR_k3.png".format(path_output, indexs[:-4]))

    elif descname == "SOAP":
        desc = SOAP(species=list(set(chemsyms)),
                    periodic=False,
                    rcut=srcut,
                    nmax=snmax,
                    lmax=slmax,
                    average=False)  # Averaging for global
        descs = desc.create(structs, n_jobs=steve)

        # One curve per unordered species pair, taken from the first row of
        # the SOAP output.
        chemsyms_combos = list(combinations_with_replacement(desc.species, 2))
        for combo in chemsyms_combos:
            # get_location gives the slice of the output that belongs to
            # this species pair.
            pairloc = desc.get_location(combo)
            plt.plot(descs[0, pairloc],
                     label="{}-{}".format(combo[0], combo[1]))
        plt.legend()
        plt.xlabel("N of features for an atom pair")
        plt.ylabel("Output value of SOAPs")
        plt.title("SOAPs of {}".format(indexs))
        if save:
            plt.savefig("{}/{}_SOAP.png".format(path_output, indexs[:-4]))

    if v:
        print("🔘 Plotting {} done.".format(descname))
Ejemplo n.º 29
0
from ase.build import molecule

# Settings defined up front; the lines below do not reference them
atomic_numbers = [1, 8]
rcut, nmax, lmax = 6.0, 8, 6

# Coulomb matrix descriptor with default options, sized for six atoms
cm = CoulombMatrix(n_atoms_max=6)

# Methanol as an ase.Atoms object
methanol = molecule("CH3OH")
print(methanol)

# Descriptor output with the default settings
cm_methanol = cm.create(methanol)

print(cm_methanol)
print("flattened", cm_methanol.shape)

# Same system with flattening switched off
cm = CoulombMatrix(flatten=False, n_atoms_max=6)
cm_methanol = cm.create(methanol)

print(cm_methanol)
print("not flattened", cm_methanol.shape)

# A larger n_atoms_max introduces zero-padding in the output
cm = CoulombMatrix(flatten=False, n_atoms_max=10)
cm_methanol = cm.create(methanol)
Ejemplo n.º 30
0
def f(x):
    """
    Return the 5-fold cross-validated MAE of a kernel ridge regression
    model trained on Coulomb matrices.

    Parameters:
        x: 2-D indexable; ``x[0][0]`` and ``x[0][1]`` are the negated
           base-10 exponents of the KRR ``alpha`` and ``gamma``
           (``alpha = 10 ** -x[0][0]``, ``gamma = 10 ** -x[0][1]``).

    Returns:
        The mean of the per-fold mean absolute errors.

    Raises:
        ValueError: if the data set has fewer samples than folds.

    Side effects:
        * writes ``alpha`` and ``gamma`` to ``variables.in``;
        * reads ``../data/data_train_1k.json``;
        * appends one row to (or creates) ``results/df_results_cm.json``;
        * prints progress information to stdout.

    The samples are shuffled with NumPy's global random state, so the folds
    differ between calls unless that state is seeded by the caller.
    """
    n_folds = 5
    results_path = 'results/df_results_cm.json'

    iteration_start = time.time()

    # KRR hyperparameters
    alpha_exp = -x[0][0]
    gamma_exp = -x[0][1]
    alpha = 10**alpha_exp
    gamma = 10**gamma_exp

    # Write the current hyperparameters to file
    with open('variables.in', 'w') as var_file:
        var_file.write(str(alpha))
        var_file.write("\n")
        var_file.write(str(gamma))

    # Load data
    data = pd.read_json("../data/data_train_1k.json")

    # Collect the target values and concatenate all xyz blocks into one
    # in-memory file.
    homo_values = []
    out_mol = StringIO()
    for _, row in data.iterrows():
        # NOTE(review): presumably row[0] is a pair whose second entry is
        # the HOMO energy -- confirm against the data file.
        homo_values.append(float(row[0][1]))
        out_mol.write("".join(row.molecule))
    homo = np.array(homo_values)

    # Rewind the buffer: after the writes above the stream position is at
    # the end, and a reader that does not seek by itself would find nothing.
    out_mol.seek(0)
    ase_mol = list(ase.io.iread(out_mol, format="xyz"))

    cm_desc = CoulombMatrix(
        n_atoms_max=29,
        permutation="sorted_l2",
    )

    # Create the Coulomb matrices and time the step
    cm_start = time.time()
    cm = cm_desc.create(ase_mol)
    cm_end = time.time()
    cm_time = np.round(cm_end - cm_start, decimals=3)

    # Shuffle the sample indices and cut them into n_folds near-equal parts
    # (exactly 200 each for the 1000-sample data set).
    n_samples = np.shape(cm)[0]
    if n_samples < n_folds:
        raise ValueError(
            "need at least {} samples for {}-fold cross-validation, got {}".
            format(n_folds, n_folds, n_samples))
    index = np.arange(n_samples)
    np.random.shuffle(index)
    folds = np.array_split(index, n_folds)

    MAE_list = []
    cv_time_list = []

    # KRR: each fold is the validation set once; the remaining folds form
    # the training set in rotating order (k+1, ..., n, 1, ..., k-1).
    for k, val_idx in enumerate(folds):
        train_idx = np.concatenate(folds[k + 1:] + folds[:k])
        cm_train_k = cm[train_idx, :]
        homo_train_k = homo[train_idx]
        cm_val_k = cm[val_idx, :]
        homo_val_k = homo[val_idx]

        if k == 0:
            print("cm_train_1:", cm_train_k)
            print("Length cm_train:", len(cm_train_k))
            print("Shape cm_train:", cm_train_k.shape)

        cv_start = time.time()

        model = KernelRidge(alpha=alpha, kernel='laplacian', gamma=gamma)
        model.fit(cm_train_k, homo_train_k)
        y_pred = model.predict(cm_val_k)
        MAE = mean_absolute_error(homo_val_k, y_pred)

        cv_end = time.time()
        cv_time_list.append(np.round(cv_end - cv_start, decimals=3))
        MAE_list.append(MAE)
        print("MAE:", MAE)

    avg_MAE = np.mean(MAE_list)
    avg_cv_time = np.mean(cv_time_list)

    iteration_end = time.time()
    iteration_time = np.round(iteration_end - iteration_start, decimals=3)
    print("iteration time:", iteration_time)

    # Append this iteration to the results table, creating it on first use
    if os.path.isfile(results_path):
        df_results = pd.read_json(results_path, orient='split')
        iteration = len(df_results) + 1
        print("iteration:", iteration)
        df_results.loc[len(df_results)] = [
            iteration, avg_MAE, iteration_time, cm_time, avg_cv_time, alpha,
            gamma
        ]
        df_results.to_json(results_path, orient='split')
        print(df_results)
    else:
        df_results = pd.DataFrame(
            [[1, avg_MAE, iteration_time, cm_time, avg_cv_time, alpha, gamma]],
            columns=[
                'iteration', 'avg_MAE', 'iteration_time', 'cm_time',
                'avg_cv_time', 'alpha', 'gamma'
            ])
        df_results.to_json(results_path, orient='split')

    return avg_MAE