Exemple #1
0
def run_test(feat1, output_file, alpha, feat2=None, Gmax=0):
    """Train a ridge model on the full training set and log train/test MAE.

    The molecules are read from the module-level arrays ``train_anum``,
    ``train_coor``, ``train_energy`` and their ``test_*`` counterparts.

    Parameters
    ----------
    feat1 : str
        Key of the primary feature transformer (see ``feats`` below).
    output_file : str
        Path of the log file; results are appended to it.
    alpha : float
        Regularisation strength passed to ``Ridge``.
    feat2 : str, optional
        Key of a second transformer. When given, its features are
        concatenated column-wise with those of ``feat1``.
    Gmax : int, optional
        Passed as ``max_depth`` to every ``EncodedBond`` transformer.

    Raises
    ------
    KeyError
        If ``feat1`` or ``feat2`` is not a key of ``feats``.
    """
    feats = {
        "1": Connectivity(n_jobs=-1),
        "1c": Connectivity(n_jobs=-1, use_coordination=True, depth=1),
        "2": Connectivity(n_jobs=-1, depth=2),
        "2b": Connectivity(n_jobs=-1, depth=2, use_bond_order=True),
        "2NP": EncodedBond(n_jobs=-1, smoothing='norm', max_depth=Gmax),
        "2NC": EncodedBond(n_jobs=-1, smoothing='norm_cdf', max_depth=Gmax),
        "2LP": EncodedBond(n_jobs=-1, smoothing='expit_pdf', max_depth=Gmax),
        "2LC": EncodedBond(n_jobs=-1, smoothing='expit', max_depth=Gmax),
        "2SP": EncodedBond(n_jobs=-1, smoothing='spike', max_depth=Gmax),
        "2SC": EncodedBond(n_jobs=-1, smoothing='zero_one', max_depth=Gmax),
    }
    # Factor applied to both reported errors.
    # NOTE(review): 627.509 is the usual Hartree -> kcal/mol conversion;
    # presumably the energies are stored in Hartree -- confirm.
    unit_scale = 627.509

    # Each sample is an (atomic numbers, coordinates) pair.
    Xin_train_final = list(zip(train_anum, train_coor))
    Xin_test_final = list(zip(test_anum, test_coor))
    y_train_final = train_energy
    y_test_final = test_energy

    # Fit each transformer on the training inputs only, then reuse the fitted
    # transformer on the test inputs.
    X_train_final = feats[feat1].fit_transform(Xin_train_final)
    X_test_final = feats[feat1].transform(Xin_test_final)
    if feat2 is not None:
        X_train2 = feats[feat2].fit_transform(Xin_train_final)
        X_test2 = feats[feat2].transform(Xin_test_final)
        X_train_final = np.concatenate((X_train_final, X_train2), axis=1)
        X_test_final = np.concatenate((X_test_final, X_test2), axis=1)

    clf = Ridge(alpha=alpha)
    clf.fit(X_train_final, y_train_final)
    train_error = MAE(clf.predict(X_train_final), y_train_final) * unit_scale
    test_error = MAE(clf.predict(X_test_final), y_test_final) * unit_scale

    with open(output_file, 'a') as f:
        print("Test set", file=f)
        print(feat1, file=f)
        if feat2 is not None:
            print(feat2, file=f)
        # Previously this line went to stdout, leaving the log record
        # without its Gmax value.
        print("Gmax: %d" % Gmax, file=f)
        print("Train MAE: %.6f    Test MAE: %.6f" % (train_error, test_error),
              file=f)
Exemple #2
0
def run_trial(feat1, output_file, feat2=None, Gmax=0):
    """Run 4-fold cross-validation of a ridge model over a grid of alphas.

    The training data are read from the module-level ``train_anum``,
    ``train_coor``, ``train_energy`` and ``train_set_size``. For every alpha
    in the grid, one record (feature names, Gmax, alpha, mean and population
    standard deviation of the per-fold train/test MAE) is appended to
    ``output_file``.

    Parameters
    ----------
    feat1 : str
        Key of the primary feature transformer (see ``feats`` below).
    output_file : str
        Path of the log file; results are appended to it.
    feat2 : str, optional
        Key of a second transformer. When given, its features are
        concatenated column-wise with those of ``feat1``.
    Gmax : int, optional
        Passed as ``max_depth`` to every ``EncodedBond`` transformer.

    Raises
    ------
    KeyError
        If ``feat1`` or ``feat2`` is not a key of ``feats``.
    ValueError
        If ``train_set_size`` is not divisible by the number of folds
        (raised by the ``reshape`` that builds the fold table).
    """
    feats = {
        "1": Connectivity(n_jobs=-1),
        "1c": Connectivity(n_jobs=-1, use_coordination=True, depth=1),
        "2": Connectivity(n_jobs=-1, depth=2),
        "2b": Connectivity(n_jobs=-1, depth=2, use_bond_order=True),
        "2NP": EncodedBond(n_jobs=-1, smoothing='norm', max_depth=Gmax),
        "2NC": EncodedBond(n_jobs=-1, smoothing='norm_cdf', max_depth=Gmax),
        "2LP": EncodedBond(n_jobs=-1, smoothing='expit_pdf', max_depth=Gmax),
        "2LC": EncodedBond(n_jobs=-1, smoothing='expit', max_depth=Gmax),
        "2SP": EncodedBond(n_jobs=-1, smoothing='spike', max_depth=Gmax),
        "2SC": EncodedBond(n_jobs=-1, smoothing='zero_one', max_depth=Gmax),
    }
    n_folds = 4
    alpha_range = [1, 0, 0.1, 0.01, 0.001, 0.0001]
    # Factor applied to every reported error.
    # NOTE(review): 627.509 is the usual Hartree -> kcal/mol conversion;
    # presumably the energies are stored in Hartree -- confirm.
    unit_scale = 627.509

    # Fold table: one row of sample indices per fold. The row order and the
    # order inside each row are shuffled; row membership itself is fixed by
    # the reshape (contiguous index ranges).
    train_validation = np.arange(0, train_set_size, dtype=int).reshape(
        n_folds, int(train_set_size / n_folds))
    np.random.shuffle(train_validation)
    for row in range(n_folds):
        np.random.shuffle(train_validation[row])

    # errors[alpha index][fold]
    train_errors = [[None] * n_folds for _ in alpha_range]
    test_errors = [[None] * n_folds for _ in alpha_range]

    # The features depend only on the fold, not on alpha, so each fold is
    # featurised once and every alpha is then evaluated on those matrices.
    for fold in range(n_folds):
        train_folds = [x for x in range(n_folds) if x != fold]
        train_idxs = np.ravel(train_validation[train_folds])
        test_idxs = np.ravel(train_validation[fold])

        Xin_train = list(zip(train_anum[train_idxs], train_coor[train_idxs]))
        Xin_test = list(zip(train_anum[test_idxs], train_coor[test_idxs]))
        y_train = train_energy[train_idxs]
        y_test = train_energy[test_idxs]

        X_train = feats[feat1].fit_transform(Xin_train)
        X_test = feats[feat1].transform(Xin_test)
        if feat2 is not None:
            X_train2 = feats[feat2].fit_transform(Xin_train)
            X_test2 = feats[feat2].transform(Xin_test)
            # concatenate feature vectors for combined features
            X_train = np.concatenate((X_train, X_train2), axis=1)
            X_test = np.concatenate((X_test, X_test2), axis=1)

        for a_idx, alpha in enumerate(alpha_range):
            # Linear ridge regression
            clf = Ridge(alpha=alpha)
            clf.fit(X_train, y_train)
            train_errors[a_idx][fold] = (
                MAE(clf.predict(X_train), y_train) * unit_scale)
            test_errors[a_idx][fold] = (
                MAE(clf.predict(X_test), y_test) * unit_scale)

    # Output the results to file, one record per alpha in grid order.
    with open(output_file, 'a') as f:
        for a_idx, alpha in enumerate(alpha_range):
            print(feat1, file=f)
            if feat2 is not None:
                print(feat2, file=f)
            print("Gmax = %d" % Gmax, file=f)
            print("alpha: %.4f" % alpha, file=f)
            print("Avg Train MAE: %.6f Avg Test MAE: %.6f" %
                  (statistics.mean(train_errors[a_idx]),
                   statistics.mean(test_errors[a_idx])),
                  file=f)
            # Blank separator belongs in the log file, not on stdout.
            print(file=f)
            # Calculate the standard deviation across folds.
            print(
                "Train Standard Deviation: %.4f, Test Standard Deviation: %.4f"
                % (statistics.pstdev(train_errors[a_idx]),
                   statistics.pstdev(test_errors[a_idx])),
                file=f)
Exemple #3
0
# complex (iron is not in the constants).
# Maybe at some point, molml will include more constants, but it seems outside
# of the scope of this library.

if __name__ == '__main__':
    elements = ['Fe', 'H', 'H', 'H', 'H', 'H', 'H']
    coords = np.array([
        [0., 0., 0.],
        [1.46, 0., 0.],
        [0., 1.46, 0.],
        [0., 0., 1.46],
        [-1.46, 0., 0.],
        [0., -1.46, 0.],
        [0., 0., -1.46],
    ])
    feat = Connectivity(depth=2)
    # Notice the warning about missing elements.
    print(feat.fit_transform([(elements, coords)]))

    # 1) Modify the values in the constants module before your script.
    BOND_LENGTHS['Fe'] = {'1': 1.32}
    print(feat.fit_transform([(elements, coords)]))
    del BOND_LENGTHS['Fe']

    # 2) Include connectivity information in your data. The other instances
    # where constants are used (electronegativity, element symbols, atomic
    # numbers).
    connections = {
        0: {1: '1', 2: '1', 3: '1', 4: '1', 5: '1', 6: '1'},
        1: {0: '1'},
        2: {0: '1'},
Exemple #4
0
if __name__ == "__main__":
    # Boilerplate: fetch the train/test inputs and targets.
    Xin_train, Xin_test, y_train, y_test = load_qm7()

    # Raise this to run the transformations in parallel.
    # Values less than 1 will use the number of cores the CPU has.
    N_JOBS = 1
    parallel = {"n_jobs": N_JOBS}

    # A small sample of the available feature transformers.
    tfs = [
        EncodedBond(**parallel),
        EncodedBond(spacing="inverse", **parallel),
        BagOfBonds(**parallel),
        CoulombMatrix(**parallel),
        Connectivity(depth=1, **parallel),
        Connectivity(depth=2, use_bond_order=True, **parallel),
        Connectivity(depth=3, use_coordination=True, **parallel),
    ]

    for transformer in tfs:
        print(transformer)
        # Fit on the training inputs only; reuse the fit for the test inputs.
        X_train = transformer.fit_transform(Xin_train)
        X_test = transformer.transform(Xin_test)

        # No hyperparameter search here, to keep the example simple.
        model = Ridge()
        model.fit(X_train, y_train)

        predicted_train = model.predict(X_train)
        predicted_test = model.predict(X_test)
        errors = (MAE(predicted_train, y_train), MAE(predicted_test, y_test))
        print("Train MAE: %.4f Test MAE: %.4f" % errors)
Exemple #5
0
from sklearn.metrics import mean_absolute_error as MAE
from molml.features import Connectivity, EncodedBond
import statistics
import numpy as np
from utils import load_qm7

# Script entry point: sets up several feature transformers and iterates over
# five data folds. NOTE(review): the loop body appears truncated in this view;
# only the data loading step is visible.
if __name__ == "__main__":
    # Running totals; not updated in the visible part of the script.
    train_error = 0
    test_error = 0
    # One slot per fold (five folds, matching the range(5) loop below).
    train_error_temp = [None] * 5
    test_error_temp = [None] * 5
    #feat = CoulombMatrix(n_jobs = -1)
    #feats2 = [Connectivity(n_jobs = -1, depth = 2, use_bond_order = True),
    #EncodedBond(n_jobs = -1, smoothing = 'expit_pdf',max_depth = 1)]
    #feat2 = Connectivity(n_jobs = -1, depth = 2)
    # Candidate feature transformers. Note feat1 uses a single job while the
    # others use n_jobs=-1.
    feat2b = Connectivity(n_jobs=-1, depth=2, use_bond_order=True)
    #feat2s = [(Connectivity(n_jobs = -1, depth = 2)),
    #EncodedBond(n_jobs = -1, max_depth = 1)]
    feat1 = Connectivity(n_jobs=1)
    featLC = EncodedBond(n_jobs=-1, smoothing='expit', max_depth=1)
    featNP = EncodedBond(n_jobs=-1, smoothing='norm', max_depth=2)
    #feat1b = (Connectivity(n_jobs = 1, use_coordination = True))

    # Loop to test each fold.
    for x in range(5):
        # Load the train/test inputs and targets for this iteration.
        # Presumably the argument selects fold x -- verify against
        # utils.load_qm7.
        Xin_train, Xin_test, y_train, y_test = load_qm7(x)
        #for feat in featNP:
        #X_train = feat1.fit_transform(Xin_train)
        #X_test = feat1.transform(Xin_test)
        #for feat2 in feats2:
Exemple #6
0
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.pipeline import FeatureUnion

from molml.features import EncodedBond, Connectivity

from utils import load_qm7

# Script entry point: combines several molml feature transformers with a
# FeatureUnion and fits two regressors on the combined features.
# NOTE(review): the script appears truncated in this view; nothing is done
# with the fitted models after clf.fit.
if __name__ == "__main__":
    # This is just boilerplate code to load the data
    Xin_train, Xin_test, y_train, y_test = load_qm7()

    # (name, transformer) pairs in the form FeatureUnion expects.
    feats = [
        ("encoded_bond", EncodedBond(n_jobs=-1, max_depth=3)),
        ("atom_count", Connectivity(depth=1, n_jobs=-1)),
        ("angle_count", Connectivity(depth=3, use_coordination=True,
                                     n_jobs=-1)),
    ]

    # Fit all transformers on the training inputs, then apply the fitted
    # union to the test inputs.
    full_feat = FeatureUnion(feats)
    X_train = full_feat.fit_transform(Xin_train)
    X_test = full_feat.transform(Xin_test)

    # One linear and one RBF-kernel ridge model, with fixed hyperparameters.
    clfs = [
        Ridge(alpha=0.01),
        KernelRidge(alpha=1e-9, gamma=1e-5, kernel="rbf"),
    ]
    for clf in clfs:
        print(clf)
        clf.fit(X_train, y_train)
Exemple #7
0
from molml.features import Connectivity#, EncodedBond, CoulombMatrix
import statistics
from utils import load_qm7

# Script entry point: fits an RBF kernel ridge model on Connectivity features
# for each of five data folds.
# NOTE(review): the loop body appears truncated in this view; no error is
# computed or stored after clf.fit.
# NOTE(review): KernelRidge is not among the imports visible here -- confirm
# it is imported elsewhere in the file.
if __name__ == "__main__":
    # Running totals; not updated in the visible part of the script.
    train_error = 0
    test_error= 0
    # One slot per fold (five folds, matching the range(5) loop below).
    train_error_temp = [None] * 5
    test_error_temp = [None] * 5
    # Hyperparameters passed to KernelRidge below.
    gamma = 1e-3
    alpha = 1e-7
    #kern = AtomKernel(gamma=gamma,transformer = Connectivity(n_jobs = -1), n_jobs=-1)
    #feat = CoulombMatrix(n_jobs = 1)
    #feats = [Connectivity(n_jobs = -1, depth = 3),
                 #EncodedBond(n_jobs = -1)]
    feat = (Connectivity(n_jobs = 1))
    #feat = (Connectivity(n_jobs = 1, use_coordination = True))

    # Loop to test each fold.
    for x in range(5):
       # Load the train/test inputs and targets for this iteration.
       # Presumably the argument selects fold x -- verify against
       # utils.load_qm7.
       Xin_train, Xin_test,y_train, y_test = load_qm7(x)
       #for feat in feats:

       #K_train = kern.fit_transform(Xin_train)
       #K_test = kern.transform(Xin_test)
       # Fit the transformer on the training inputs only, then reuse it on
       # the test inputs.
       X_train = feat.fit_transform(Xin_train)
       X_test = feat.transform(Xin_test)
       clf = KernelRidge(alpha = alpha,gamma = gamma, kernel = "rbf")
       clf.fit(X_train, y_train)
       
Exemple #8
0
# Flat 4-fold cross-validation script over the module-level training arrays
# (train_anum, train_coor, train_energy, train_set_size).
# NOTE(review): the fold loop appears truncated in this view; it stops after
# the training features are computed.

# Fold table: one row of sample indices per fold. The reshape requires
# train_set_size to be divisible by 4.
train_validation = np.arange(0, train_set_size,
                             dtype=int).reshape(4, int(train_set_size / 4))
# Shuffle the order of the rows, then the order of the indices inside each
# row. Row membership itself is fixed by the reshape above.
np.random.shuffle(train_validation)
for i in range(4):
    np.random.shuffle(train_validation[i])
#train_validation = train_validation.tolist()

# Running totals; not updated in the visible part of the script.
train_error = 0
test_error = 0
# One slot per fold.
train_error_temp = [None] * 4
test_error_temp = [None] * 4

#feats  = [Connectivity(n_jobs = -1, depth = 2, use_bond_order = True),
#EncodedBond(n_jobs = -1, smoothing = 'expit_pdf',max_depth = 1)]
#feat = Connectivity(n_jobs = -1, depth = 2, use_bond_order = True)
feat = Connectivity(n_jobs=-1)
# Hyperparameters; not used in the visible part of the script.
alpha = 0.1
gamma = 1

for fold in range(4):
    # Hold out row `fold` for evaluation and train on the other three rows.
    train_folds = [x for x in range(4) if x != fold]
    train_idxs = np.ravel(train_validation[train_folds])
    test_idxs = np.ravel(train_validation[fold])

    # Each sample is an (atomic numbers, coordinates) pair.
    Xin_train = list(zip(train_anum[train_idxs], train_coor[train_idxs]))
    Xin_test = list(zip(train_anum[test_idxs], train_coor[test_idxs]))
    y_train = train_energy[train_idxs]
    y_test = train_energy[test_idxs]

    #for feat in feats:
    X_train = feat.fit_transform(Xin_train)
Exemple #9
0
# complex (iron is not in the constants).
# Maybe at some point, molml will include more constants, but it seems outside
# of the scope of this library.

if __name__ == '__main__':
    elements = ['Fe', 'H', 'H', 'H', 'H', 'H', 'H']
    coords = np.array([
        [0., 0., 0.],
        [1.46, 0., 0.],
        [0., 1.46, 0.],
        [0., 0., 1.46],
        [-1.46, 0., 0.],
        [0., -1.46, 0.],
        [0., 0., -1.46],
    ])
    feat = Connectivity(depth=2)
    # Notice the warning about missing elements.
    print(feat.fit_transform([(elements, coords)]))

    # 1) Modify the values in the constants module before your script.
    BOND_LENGTHS['Fe'] = {'1': 1.32}
    print(feat.fit_transform([(elements, coords)]))
    del BOND_LENGTHS['Fe']

    # 2) Include connectivity information in your data. The other instances
    # where constants are used (electronegativity, element symbols, atomic
    # numbers).
    connections = {
        0: {
            1: '1',
            2: '1',