Beispiel #1
0
def test_krr_cmat():
    """Kernel ridge regression on row-norm-sorted Coulomb matrices.

    Builds Coulomb-matrix representations for the first 1000 (sorted by
    filename) QM7 entries, shuffles them with a fixed seed, trains a
    Laplacian-kernel model on 700 molecules and asserts that the
    mean absolute error on 300 held-out molecules is below 6.0.
    """
    here = os.path.dirname(os.path.realpath(__file__))

    # Heats of formation keyed by xyz filename (PBE0/def2-TZVP reference data)
    energies = get_energies(here + "/data/hof_qm7.txt")

    # One qml.Compound per xyz file, carrying its property and representation
    compounds = []
    for name in sorted(energies.keys())[:1000]:
        compound = qml.Compound(xyz=here + "/qm7/" + name)
        compound.properties = energies[name]
        compound.generate_coulomb_matrix(size=23, sorting="row-norm")
        compounds.append(compound)

    # Fixed seed keeps the train/test split reproducible
    np.random.seed(666)
    np.random.shuffle(compounds)

    n_train, n_test = 700, 300
    train_set = compounds[:n_train]
    test_set = compounds[-n_test:]

    # Representations and properties for both sets
    X = np.array([c.representation for c in train_set])
    Y = np.array([c.properties for c in train_set])
    Xs = np.array([c.representation for c in test_set])
    Ys = np.array([c.properties for c in test_set])

    # Hyper-parameters: kernel width and diagonal regularisation
    sigma = 10**(4.2)
    llambda = 10**(-10.0)

    # Train: regularise the kernel diagonal, then solve for alpha
    K = laplacian_kernel(X, X, sigma)
    K[np.diag_indices_from(K)] += llambda
    alpha = cho_solve(K, Y)

    # Predict: kernel between training and test representations
    Ks = laplacian_kernel(X, Xs, sigma)
    Yss = np.dot(Ks.T, alpha)

    mae = np.mean(np.abs(Ys - Yss))

    assert mae < 6.0, "ERROR: Too high MAE!"
Beispiel #2
0
def krr(kernel, properties, rcond=1e-9, solver="cho"):
    """Solve for kernel-ridge-regression coefficients.

    With ``solver == "cho"`` the system is handed to ``cho_solve`` and
    ``rcond`` is forwarded as its ``l2reg`` argument; any other value of
    ``solver`` selects ``svd_solve`` with ``rcond`` forwarded as ``rcond``.
    The chosen solver's result is returned unchanged.
    """
    if solver != "cho":
        return svd_solve(kernel, properties, rcond=rcond)

    return cho_solve(kernel, properties, l2reg=rcond)
Beispiel #3
0
def get_alphas(X, Y, sigma, llambda):
    """Return the regression coefficients alpha of a Laplacian-kernel model.

    The kernel of ``X`` with itself is computed with width ``sigma``,
    ``llambda`` is added to the diagonal of a copy of it, and the resulting
    system is solved against ``Y`` with ``cho_solve``.
    """
    kernel = laplacian_kernel(X, X, sigma)

    # Regularise a copy so the raw kernel itself is left untouched
    regularised = deepcopy(kernel)
    diagonal = np.diag_indices_from(regularised)
    regularised[diagonal] += llambda

    return cho_solve(regularised, Y)
Beispiel #4
0
def cross_validation(X, Y, sigmas, llambdas, Ntot):
    """Find the hyper-parameters sigma and lambda with the lowest CV error.

    For every sigma a Laplacian kernel of ``X`` with itself is built once.
    For every (sigma, lambda) pair, five random 80/20 splits of the indices
    ``0 .. Ntot-1`` are evaluated and the mean absolute errors of all five
    splits are averaged.

    Args:
        X: representations passed to ``laplacian_kernel``.
        Y: properties; must support indexing with a list of indices.
        sigmas: candidate kernel widths.
        llambdas: candidate diagonal regularisers.
        Ntot: number of samples taking part in the splits.

    Returns:
        Tuple ``(llambda, sigma)`` of the pair with the lowest averaged MAE
        (the first such pair if several tie).

    Raises:
        ValueError: if ``sigmas`` or ``llambdas`` is empty, so that no
            parameter pair was evaluated.
    """
    n_splits = 5
    parameters = []
    random.seed(666)

    for sigma in sigmas:
        K = laplacian_kernel(X, X, sigma)

        for llambda in llambdas:
            # Collect the MAE of every split for this parameter pair. The
            # list must live outside the split loop, otherwise only the last
            # split would contribute to the average.
            maes = []

            for _ in range(n_splits):
                # random.shuffle needs a mutable sequence; a bare range()
                # object cannot be shuffled on Python 3.
                split = list(range(Ntot))
                random.shuffle(split)

                train = int(len(split) * 0.8)
                test = int(Ntot - train)

                training_index = split[:train]
                test_index = split[-test:]

                y_train = Y[training_index]
                y_test = Y[test_index]

                # Training block of the kernel with a regularised diagonal
                C = deepcopy(K[training_index][:, training_index])
                C[np.diag_indices_from(C)] += llambda

                alpha = cho_solve(C, y_train)

                # Predict the held-out samples from the train/test block
                y_est = np.dot((K[training_index][:, test_index]).T, alpha)

                maes.append(np.mean(np.abs(y_est - y_test)))

            parameters.append([llambda, sigma, np.mean(maes)])

    maes = [entry[2] for entry in parameters]
    index = maes.index(min(maes))

    print("minimum MAE after CV: ", min(maes))

    return parameters[index][0], parameters[index][1]
Beispiel #5
0
    def do_qml_gaussian_kernel(self):
        """Train and evaluate a Gaussian-kernel ridge regression model.

        Reads ``self.x_training``, ``self.y_training``, ``self.x_test``,
        ``self.y_test``, ``self.sigma`` and ``self.lamda``. Stores the mean
        absolute error in ``self.mae`` and the predictions in
        ``self.test_predicted_results``.
        """
        # Training kernel with lambda added on the diagonal
        train_kernel = gaussian_kernel(self.x_training, self.x_training,
                                       self.sigma)
        diagonal = np.diag_indices_from(train_kernel)
        train_kernel[diagonal] += self.lamda

        # Regression coefficients from the built-in Cholesky solver
        alpha = cho_solve(train_kernel, self.y_training)

        # Kernel between test and training points, then the prediction
        test_kernel = gaussian_kernel(self.x_test, self.x_training,
                                      self.sigma)
        predictions = np.dot(test_kernel, alpha)

        # Mean absolute error against the reference test values
        self.mae = np.mean(np.abs(predictions - self.y_test))
        self.test_predicted_results = predictions
Beispiel #6
0
def get_learning_curve(X, X_test, Y, Y_test, sigma, llambda, Ntot):
    """Print learning-curve data (training-set size vs. mean MAE).

    Training-set sizes are 10, 20, 40, ... (all values below ``Ntot``)
    followed by ``Ntot`` itself. For every size, ten models are trained on
    random subsets of the indices ``0 .. Ntot-1`` and evaluated on the full
    test set; one line ``"<size>\\t<mean MAE>"`` is printed per size.

    Args:
        X: training representations passed to ``laplacian_kernel``.
        X_test: test representations passed to ``laplacian_kernel``.
        Y: training properties; must support indexing with a list of indices.
        Y_test: test properties compared against the predictions.
        sigma: kernel width.
        llambda: value added to the diagonal of each training kernel.
        Ntot: number of training samples available for sub-sampling.

    Returns:
        None. Results are only printed.
    """
    K = laplacian_kernel(X, X, sigma)
    K_test = laplacian_kernel(X, X_test, sigma)

    # Doubling training-set sizes, always finishing with the full set
    N = []
    j = 10
    while j < Ntot:
        N.append(j)
        j *= 2
    N.append(Ntot)

    random.seed(667)

    for train in N:
        maes = []

        for _ in range(10):
            # random.shuffle needs a mutable sequence; a bare range()
            # object cannot be shuffled on Python 3.
            split = list(range(Ntot))
            random.shuffle(split)

            training_index = split[:train]

            y = Y[training_index]

            # Training block of the kernel with a regularised diagonal
            C = deepcopy(K[training_index][:, training_index])
            C[np.diag_indices_from(C)] += llambda

            alpha = cho_solve(C, y)

            # Rows of K_test picked by the training indices, then transposed
            Yss = np.dot(K_test[training_index].T, alpha)

            maes.append(np.mean(np.abs(Yss - Y_test)))

        print(str(train) + "\t" + str(sum(maes)/len(maes)))
Beispiel #7
0
        K_test = gaussian_kernel(X, X_test, sigmas[j])

        for train in N:
            test = total - train
            maes = []
            for i in range(nModels):
                split = list(range(total))
                random.shuffle(split)

                training_index = split[:train]
                test_index = split[-test:]

                Y = Yprime[training_index]
                Ys = Yprime[test_index]

                C = deepcopy(K[training_index][:, training_index])
                C[np.diag_indices_from(C)] += 10.0**(-7.0)

                alpha = cho_solve(C, Y)

                Yss = np.dot((K_test[training_index]).T, alpha)
                diff = Yss - Ytest
                mae = np.mean(np.abs(diff))
                maes.append(mae)
                rms = sqrt(mean_squared_error(Yss, Ytest))
            s = np.std(maes) / np.sqrt(nModels)

            print(
                str(sigmas[j]) + "\t" + str(train) + "\t" +
                str(sum(maes) / len(maes)) + "\t" + str(s) + "\t" + str(rms))
Beispiel #8
0
        reps.append(rep)
        energies.append(energy)
    except:
        print(ani['molecule'])
        print(ani['species'])
        print(ani['coordinates'])
        print(ani['energy'])

# Keep at most the first 5000 representations and their energies for training
X = np.array(reps)[:5000]
y = np.array(energies)[:5000]

# Kernel width; get_local_kernels takes a list of widths and the first
# (only) returned kernel is used here
sigma = 2.5
K = get_local_kernels(X, X, [sigma], cut_distance=10.0)[0]
# Add a small regulariser to the kernel diagonal before solving
K[np.diag_indices_from(K)] += 1e-8

# Solve for the regression coefficients and store them on disk
alpha = cho_solve(K, y)
# NOTE(review): np.save appends ".npy" when the file name does not already end
# with it, so this presumably writes "alpha-sig25-5k.npz.npy" - confirm that
# whatever loads the coefficients uses that name.
np.save('/ihome/ghutchison/dlf57/ml-benchmark/alpha-sig25-5k.npz', alpha)

data = []
for out in sorted(glob.iglob(
        '/ihome/ghutchison/dlf57/ml-benchmark/molecules/stretch/*/sdf/*.sdf'),
                  key=numericalSort):
    name = out.split('stretch/')[1].split('/sdf')[0]
    pt = out.split('sdf/')[1].split('.')[0]
    if name != 'HF':
        mol = Molecule(out)
        coords = mol.xyz
        at_num = mol.at_num

        rep = generate_representation(coords, at_num, max_size=45)
        rep = np.array([rep])
Beispiel #9
0
def test_krr_fchl_local():
    """Check the FCHL local kernels and a kernel-ridge-regression fit.

    Uses the first 100 (sorted by filename) QM7 entries. Verifies that the
    symmetric and the general local kernel agree, that none of the kernels
    contains a NaN entry, and that the test-set MAE lies within 1.0 of 2.
    """

    # Test that all kernel arguments work
    kernel_args = {
        "cut_distance": 1e6,
        "cut_start": 0.5,
        "two_body_width": 0.1,
        "two_body_scaling": 2.0,
        "two_body_power": 6.0,
        "three_body_width": 3.0,
        "three_body_scaling": 2.0,
        "three_body_power": 3.0,
        "alchemy": "periodic-table",
        "alchemy_period_width": 1.0,
        "alchemy_group_width": 1.0,
        "fourier_order": 2,
    }

    test_dir = os.path.dirname(os.path.realpath(__file__))

    # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames
    data = get_energies(test_dir + "/data/hof_qm7.txt")

    # Generate a list of qml.Compound() objects
    mols = []

    for xyz_file in sorted(data.keys())[:100]:

        # Initialize the qml.Compound() objects
        mol = qml.Compound(xyz=test_dir + "/qm7/" + xyz_file)

        # Associate a property (heat of formation) with the object
        mol.properties = data[xyz_file]

        # Generate the FCHL representation of the molecule
        mol.generate_fchl_representation(cut_distance=1e6)
        mols.append(mol)

    # Shuffle molecules
    np.random.seed(666)
    np.random.shuffle(mols)

    # Make training and test sets
    n_test = len(mols) // 3
    n_train = len(mols) - n_test

    training = mols[:n_train]
    test = mols[-n_test:]

    X = np.array([mol.representation for mol in training])
    Xs = np.array([mol.representation for mol in test])

    # List of properties
    Y = np.array([mol.properties for mol in training])
    Ys = np.array([mol.properties for mol in test])

    # Set hyper-parameters
    sigma = 2.5
    llambda = 1e-8

    K_symmetric = get_local_symmetric_kernels(X, [sigma], **kernel_args)[0]
    K = get_local_kernels(X, X, [sigma], **kernel_args)[0]

    assert np.allclose(K, K_symmetric), "Error in FCHL symmetric local kernels"
    # A single NaN entry must fail the test, so check with any(), not all()
    assert not np.any(
        np.isnan(K_symmetric)), "FCHL local symmetric kernel contains NaN"
    assert not np.any(np.isnan(K)), "FCHL local kernel contains NaN"

    # Solve alpha
    K[np.diag_indices_from(K)] += llambda
    alpha = cho_solve(K, Y)

    # Calculate prediction kernel
    Ks = get_local_kernels(Xs, X, [sigma], **kernel_args)[0]
    assert not np.any(np.isnan(Ks)), "FCHL local testkernel contains NaN"

    Yss = np.dot(Ks, alpha)

    mae = np.mean(np.abs(Ys - Yss))
    assert abs(2 - mae) < 1.0, "Error in FCHL local kernel-ridge regression"
Beispiel #10
0
def test_krr_fchl_global():
    """Check the FCHL global kernels and a kernel-ridge-regression fit.

    Uses the first 100 (sorted by filename) QM7 entries. Verifies that the
    symmetric and the general global kernel agree, that none of the kernels
    contains a NaN entry, and that the test-set MAE lies within 1.0 of 2.
    """

    test_dir = os.path.dirname(os.path.realpath(__file__))

    # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames
    data = get_energies(test_dir + "/data/hof_qm7.txt")

    # Generate a list of qml.Compound() objects
    mols = []

    for xyz_file in sorted(data.keys())[:100]:

        # Initialize the qml.Compound() objects
        mol = qml.Compound(xyz=test_dir + "/qm7/" + xyz_file)

        # Associate a property (heat of formation) with the object
        mol.properties = data[xyz_file]

        # Build the representation from coordinates and nuclear charges
        mol.representation = generate_representation(
            mol.coordinates, mol.nuclear_charges, cut_distance=1e6)
        mols.append(mol)

    # Shuffle molecules
    np.random.seed(666)
    np.random.shuffle(mols)

    # Make training and test sets
    n_test = len(mols) // 3
    n_train = len(mols) - n_test

    training = mols[:n_train]
    test = mols[-n_test:]

    X = np.array([mol.representation for mol in training])
    Xs = np.array([mol.representation for mol in test])

    # List of properties
    Y = np.array([mol.properties for mol in training])
    Ys = np.array([mol.properties for mol in test])

    # Set hyper-parameters
    sigma = 100.0
    llambda = 1e-8

    K_symmetric = get_global_symmetric_kernels(X, [sigma])[0]
    K = get_global_kernels(X, X, [sigma])[0]

    assert np.allclose(K,
                       K_symmetric), "Error in FCHL symmetric global kernels"
    # A single NaN entry must fail the test, so check with any(), not all()
    assert not np.any(
        np.isnan(K_symmetric)), "FCHL global symmetric kernel contains NaN"
    assert not np.any(np.isnan(K)), "FCHL global kernel contains NaN"

    # Solve alpha
    K[np.diag_indices_from(K)] += llambda
    alpha = cho_solve(K, Y)

    # Calculate prediction kernel
    Ks = get_global_kernels(Xs, X, [sigma])[0]
    assert not np.any(np.isnan(Ks)), "FCHL global testkernel contains NaN"

    Yss = np.dot(Ks, alpha)

    mae = np.mean(np.abs(Ys - Yss))
    assert abs(2 - mae) < 1.0, "Error in FCHL global kernel-ridge regression"
Beispiel #11
0
    for mol in compounds:

        mol.generate_coulomb_matrix(size=23, sorting="row-norm")
        # mol.generate_bob(size=23, asize={"O":3, "C":7, "N":3, "H":16, "S":1})

    # Make a big 2D array with all the representations
    X = np.array([mol.representation for mol in compounds])
    # X = np.array([mol.bob for mol in compounds])

    # Print all representations
    print("Representations:")
    print(X)

    # Assign 1000 first molecules to the training set
    X_training = X[:1000]
    Y_training = energy_pbe0[:1000]

    sigma = 4000.0
    K = gaussian_kernel(X_training, X_training, sigma)
    print("Gaussian kernel:")
    print(K)

    # Add a small lambda to the diagonal of the kernel matrix
    K[np.diag_indices_from(K)] += 1e-8

    # Use the built-in Cholesky-decomposition to solve
    alpha = cho_solve(K, Y_training)

    print("Alphas:")
    print(alpha)
Beispiel #12
0
            for lamb in lambda1:
                maes = []
                for ibin in bins:
                    split = np.array(list(range(N_train_val)))
                    random.shuffle(split)
                    training_index = split[:int(xxx * train)]
                    val_index = split[-int(train / nbins):]
                    # Assign bin for training
                    Y_train = Y_train_val[training_index]
                    # Assign remaining bins to cross validation
                    Y_val = Y_train_val[val_index]

                    C = deepcopy(K[training_index][:, training_index])
                    # Add a small lambda to the diagonal of the kernel matrix
                    C[np.diag_indices_from(C)] += lamb

                    # Use the built-in Cholesky-decomposition to solve
                    alpha = cho_solve(C, Y_train)

                    # Validate
                    Yss = np.dot((K[training_index][:, val_index]).T, alpha)
                    diff = Yss - Y_val
                    mae = np.mean(np.abs(diff))
                    maes.append(mae)

                # Calculate mean-absolute-error (MAE):
                s = np.std(maes) / np.sqrt(nbins)
                print(sig, lamb, train, sum(maes) / len(maes), s)
                #print(sig, lamb, train, sum(maes)/len(maes), s, file=open("direkt_S1_TZVP_osc_no_neg.dat", "a"))
                #print(sig, lamb, train, sum(maes)/len(maes), s, file=open("direkt_S1_TZVP_no_neg.dat", "a"))
Beispiel #13
0
def get_alphas(kernel, properties):
    """Return the coefficients obtained from ``qml_math.cho_solve``.

    ``kernel`` and ``properties`` are forwarded unchanged.
    """
    return qml_math.cho_solve(kernel, properties)
Beispiel #14
0
# Entries 500..749 of the filename list form the test set
test_filenames = filenames[500:750]

# Hyper-parameters scanned below
sigmas = [1.0, 10.0, 10.0**2, 10.0**3]
cutoffs = [2.0, 3.0, 4.0]
llambda = 1e-8  # doesn't usually need to be changed

# Try 3 different cutoffs; descriptors are regenerated for every cutoff
for cutoff in cutoffs:
    train_x, train_y = get_descriptor_and_property(train_filenames, atype,
                                                   cutoff)
    test_x, test_y = get_descriptor_and_property(test_filenames, atype, cutoff)
    # For each cutoff, try out 4 different values of sigma
    for sigma in sigmas:
        # Kernel between all descriptors in the training set, with llambda
        # added on the diagonal as regularisation
        K = laplacian_kernel(train_x, train_x,
                             sigma) + llambda * np.identity(train_x.shape[0])

        # Solve for the KRR coefficients; this is the training step of the
        # kernel ridge regression model
        alpha = cho_solve(K, train_y)

        # Kernel between all descriptors in the test set and all in the
        # training set
        Ks = laplacian_kernel(test_x, train_x, sigma)

        # Predict values of y for the test set
        y_pred = np.dot(Ks, alpha)

        # Report the error returned by calc_mae for this (sigma, cutoff) pair
        print(
            "predicted MAE of %.4f for sigma: %.4g, cutoff: %.1f and %d training points"
            % (calc_mae(y_pred, test_y), sigma, cutoff, len(train_x)))
Beispiel #15
0
def test_krr_gaussian_local_cmat():
    """Check the local Gaussian kernel on atomic Coulomb matrices.

    Compares the kernel from ``get_local_kernels_gaussian`` against stored
    reference files and against the ``get_atomic_kernels_gaussian`` wrapper,
    then runs a kernel-ridge-regression fit (200 training / 100 test
    molecules) and asserts that the test-set MAE lies within 1.0 of 19.0.
    """

    test_dir = os.path.dirname(os.path.realpath(__file__))

    # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames
    data = get_energies(test_dir + "/data/hof_qm7.txt")

    # Generate a list of qml.Compound() objects
    mols = []

    for xyz_file in sorted(data.keys())[:1000]:

        # Initialize the qml.Compound() objects
        mol = qml.Compound(xyz=test_dir + "/qm7/" + xyz_file)

        # Associate a property (heat of formation) with the object
        mol.properties = data[xyz_file]

        # This is an atomic Coulomb matrix sorted by row norm
        mol.generate_atomic_coulomb_matrix(size=23, sorting="row-norm")

        mols.append(mol)

    # Shuffle molecules
    np.random.seed(666)
    np.random.shuffle(mols)

    # Make training and test sets
    n_test = 100
    n_train = 200

    training = mols[:n_train]
    test = mols[-n_test:]

    # Per-molecule representations are stacked into one array per set
    X = np.concatenate([mol.representation for mol in training])
    Xs = np.concatenate([mol.representation for mol in test])

    # Number of atoms of every molecule, passed alongside the stacked arrays
    N = np.array([mol.natoms for mol in training])
    Ns = np.array([mol.natoms for mol in test])

    # List of properties
    Y = np.array([mol.properties for mol in training])
    Ys = np.array([mol.properties for mol in test])

    # Set hyper-parameters
    sigma = 724.0
    llambda = 10**(-6.5)

    K = get_local_kernels_gaussian(X, X, N, N, [sigma])[0]
    assert np.allclose(K, K.T), "Error in local Gaussian kernel symmetry"

    K_test = np.loadtxt(test_dir + "/data/K_local_gaussian.txt")
    assert np.allclose(
        K, K_test), "Error in local Gaussian kernel (vs. reference)"

    K_test = get_atomic_kernels_gaussian(training, training, [sigma])[0]
    assert np.allclose(K,
                       K_test), "Error in local Gaussian kernel (vs. wrapper)"

    # Solve alpha
    K[np.diag_indices_from(K)] += llambda
    alpha = cho_solve(K, Y)

    # Calculate prediction kernel
    Ks = get_local_kernels_gaussian(Xs, X, Ns, N, [sigma])[0]

    Ks_test = np.loadtxt(test_dir + "/data/Ks_local_gaussian.txt")
    # Sometimes a few Coulomb matrices differ because of parallel sorting and
    # numerical error. Allow fewer than 5 rows of the kernel (test molecules)
    # to differ from the supplied reference. The absolute difference is
    # required here: without it, deviations below the reference would never
    # be counted.
    differing_rows = np.where(np.abs(Ks - Ks_test) > 1e-7)[0]
    differences_count = len(set(differing_rows))
    assert differences_count < 5, "Error in local Gaussian kernel (vs. reference)"

    Ks_test = get_atomic_kernels_gaussian(test, training, [sigma])[0]
    assert np.allclose(Ks,
                       Ks_test), "Error in local Gaussian kernel (vs. wrapper)"

    Yss = np.dot(Ks, alpha)

    mae = np.mean(np.abs(Ys - Yss))
    assert abs(19.0 -
               mae) < 1.0, "Error in local Gaussian kernel-ridge regression"
Beispiel #16
0
    def __init__(self, wds, ia1, ia2, coeff=1.0, llambda=1.e-4):
        """
        Fit Gaussian-kernel ridge regression models for the `dm` sub-block
        belonging to one pair of atoms and print their errors.

        wds      -- passed to SLATM() as its first argument
        ia1, ia2 -- atomic index, starting from 0, of the two target atoms
                    (looked up in the last entry of `coords`)
        coeff    -- scale factor applied to the kernel width sigma
        llambda  -- value added to the diagonal of each training kernel

        For every unique value in `nhass`, a model is trained on all collected
        entries whose `nhass` value does not exceed it and evaluated on the
        last collected entry; one line "<n_train> <mae> <rmse>" is printed
        per model.
        """

        s1 = SLATM(wds, 'out', regexp='', properties='AE', M='slatm', \
                local=True, igroup=False, ow=False, nproc=1, istart=0, \
                slatm_params = { 'nbody':3, 'dgrids': [0.03,0.03],  'sigmas':[0.05,0.05],\
                                 'rcut':4.8, 'rpower2':6, 'ws':[1.,1.,1.], \
                                 'rpower3': 3, 'isf':0, 'kernel':'g', 'intc':3 }, \
                iY=False)
        fs = s1.fs
        coords = s1.coords
        # Cumulative atom counts give the end offset of every entry; shifting
        # them by one (with a leading 0) gives the start offsets. The original
        # code referenced an undefined name `kas2` here.
        iast2 = s1.nas.cumsum()
        iast1 = np.array([
            0,
        ] + list(iast2[:-1]))

        objs = []
        ds = []
        for i, f in enumerate(fs):
            obj = wfn(f)
            obj.get_dm()
            objs.append(obj)
            # NOTE(review): `self.nm` is read here but never assigned in this
            # method - confirm it is provided by the class (or should be
            # `s1.nm` / `len(fs)`).
            if i < self.nm - 1:
                ds.append(ssd.cdist(coords[i], coords[self.nm - 1]))

        ## specify target atom pairs!!
        #ia1, ia2 = 0, 1
        #coeff = 1.0; llambda = 1e-6

        cia1 = coords[-1][ia1]
        cia2 = coords[-1][ia2]
        xs = []
        ys = []
        nhass = []
        for i, f in enumerate(fs):
            # NOTE(review): `ds` only receives entries for i < self.nm - 1
            # above, so this lookup presumably fails for the last file -
            # confirm the intended loop range.
            dsi = ds[i]
            jas = np.arange(dsi.shape[0])
            # Atoms of entry i lying within 0.01 of the target atoms
            filt1 = (dsi[:, ia1] <= 0.01)
            filt2 = (dsi[:, ia2] <= 0.01)
            if np.any(filt1) and np.any(filt2):
                nhass.append(s1.nhass[i])
                obj = objs[i]
                ja1 = jas[filt1]
                ja2 = jas[filt2]
                p, q, r, s = obj.ibs1[ja1], obj.ibs2[ja1], obj.ibs1[
                    ja2], obj.ibs2[ja2]
                dmij = obj.dm[p:q, r:s].ravel()
                ys.append(dmij)

                iat1 = iast1[i] + ja1
                iat2 = iast1[i] + ja2
                x1 = s1.X[iat1]
                x2 = s1.X[iat2]
                xs.append(np.concatenate((x1, x2), axis=0))

        # NOTE(review): `dmij` is only bound if at least one entry matched in
        # the loop above.
        nprop = len(dmij)

        nt = len(nhass)
        nhass = np.array(nhass)
        tidxs = np.arange(nt)
        nhass_u = np.unique(nhass)
        nu = len(nhass_u)
        xs = np.array(xs)
        ys = np.array(ys)
        # The last collected entry serves as the prediction target
        xs2 = np.array([xs[-1]])
        ys2 = np.array([ys[-1]])
        for j in range(nu):
            jdxs = tidxs[nhass <= nhass_u[j]]
            xs1 = xs[jdxs, :]
            ys1 = ys[jdxs, :]
            # Kernel width is derived from the largest pairwise distance in
            # the training subset. The original code referenced an undefined
            # name `X1` here.
            ds1 = qd.l2_distance(xs1, xs1)  # ssd.pdist(x1, metric='euclidean')
            dmax = max(ds1.ravel())
            sigma = coeff * dmax / np.sqrt(2.0 * np.log(2.0))
            K1 = qk.gaussian_kernel(xs1, xs1, sigma)
            assert np.allclose(K1,
                               K1.T), "Error in local Gaussian kernel symmetry"

            K1[np.diag_indices_from(K1)] += llambda
            alpha = np.array([cho_solve(K1, ys1)]).T

            K2 = qk.gaussian_kernel(xs2, xs1, sigma)
            ys2_est = np.dot(K2, alpha)
            error = np.squeeze(ys2_est) - ys2
            mae = np.sum(np.abs(error)) / nprop
            rmse = np.sqrt(np.sum(error**2) / nprop)
            print('%4d %12.8f %12.8f' % (len(xs1), mae, rmse))
Beispiel #17
0
                        #copy relevant rows&columns from K for learning
                        C = deepcopy(K[training_indices][:, training_indices])

                        #add slight alteration lambda

                        C[np.diag_indices_from(C)] += lamda

                        #further info
                        m_c.x_training = X_list[rep][training_indices]
                        m_c.x_test = X_list[rep][test_indices]

                        m_c.y_training = Y_energy_list[training_indices]
                        m_c.y_test = Y_energy_list[test_indices]

                        #solve for alphas
                        alphas = cho_solve(C, m_c.y_training)

                        K_test = m_c.laplacian_kernel_matrix(
                            x_training=m_c.x_training, x_test=m_c.x_test)

                        m_c.test_predicted_results = np.dot(K_test, alphas)

                        mae = m_c.calculate_mae()

                        mae_nmodels += mae

                    print("mae_nmodels ", mae_nmodels)
                    avg_mae = mae_nmodels / float(nModels)
                    print("avg mae:", avg_mae)
                    m_c.mae = avg_mae
                    learning_list.append(m_c)
Beispiel #18
0
    d_power = 4.0
   
    # Width for Gaussians in the spectrum
    d_width = 0.1

    # Gaussian-kernel width
    sigma = 50.0


    K  = boss_kernel(X, X, n_train, n_train, sigma, d_width, d_power)
    Ks = boss_kernel(X, Xs, n_train, n_test, sigma, d_width, d_power)
    
    print K

    K[np.diag_indices_from(K)] += 1e-8
    alpha = cho_solve(K,Y)

    # Calculate prediction kernel
    Yss = np.dot(Ks.transpose(), alpha)

    mae = np.mean(np.abs(Ys - Yss))

    print "boss"
    print mae
    
    for mol in training:
        mol.generate_bob()
    
    for mol in test:
        mol.generate_bob()
Beispiel #19
0
def train_KRR_qml(X, y, sigma=1e3, llambda=1e-8):
    """Return kernel-ridge-regression coefficients for ``X`` and ``y``.

    The kernel of ``X`` with itself is built by ``compute_kernel_qml`` with
    width ``sigma``, ``llambda`` is added to its diagonal, and the system is
    solved against ``y`` with ``cho_solve``.
    """
    kernel = compute_kernel_qml(X, X, sigma=sigma)

    # Regularise the diagonal in place before solving
    diagonal = np.diag_indices_from(kernel)
    kernel[diagonal] += llambda

    return cho_solve(kernel, y)