def run_test(feat1, output_file, alpha, feat2=None, Gmax=0):
    """Fit a ridge model on the whole training set and log train/test MAE.

    The molecules are read from the module-level arrays ``train_anum``,
    ``train_coor``, ``train_energy``, ``test_anum``, ``test_coor`` and
    ``test_energy``.  The selected featurizer(s) are fitted on the training
    molecules, a ``Ridge`` model is trained on the resulting features, and a
    short report is appended to ``output_file``.

    Parameters
    ----------
    feat1 : str
        Key of the primary featurizer: one of "1", "1c", "2", "2b", "2NP",
        "2NC", "2LP", "2LC", "2SP", "2SC".
    output_file : str
        Path of the report file; it is opened in append mode.
    alpha : float
        Regularization strength passed to ``Ridge``.
    feat2 : str, optional
        Key of a second featurizer.  When given, its feature columns are
        concatenated after those of ``feat1``.
    Gmax : int, optional
        Value used as ``max_depth`` for every ``EncodedBond`` featurizer.

    Raises
    ------
    KeyError
        If ``feat1`` or ``feat2`` is not one of the known keys.
    """
    # Every reported error is scaled by this factor (presumably
    # Hartree -> kcal/mol; confirm against the units of the energy arrays).
    error_scale = 627.509

    feats = {
        "1": Connectivity(n_jobs=-1),
        "1c": Connectivity(n_jobs=-1, use_coordination=True, depth=1),
        "2": Connectivity(n_jobs=-1, depth=2),
        "2b": Connectivity(n_jobs=-1, depth=2, use_bond_order=True),
        "2NP": EncodedBond(n_jobs=-1, smoothing='norm', max_depth=Gmax),
        "2NC": EncodedBond(n_jobs=-1, smoothing='norm_cdf', max_depth=Gmax),
        "2LP": EncodedBond(n_jobs=-1, smoothing='expit_pdf', max_depth=Gmax),
        "2LC": EncodedBond(n_jobs=-1, smoothing='expit', max_depth=Gmax),
        "2SP": EncodedBond(n_jobs=-1, smoothing='spike', max_depth=Gmax),
        "2SC": EncodedBond(n_jobs=-1, smoothing='zero_one', max_depth=Gmax),
    }

    # Each molecule is an (atomic numbers, coordinates) pair.
    Xin_train_final = list(zip(train_anum, train_coor))
    Xin_test_final = list(zip(test_anum, test_coor))
    y_train_final = train_energy
    y_test_final = test_energy

    X_train_final = feats[feat1].fit_transform(Xin_train_final)
    X_test_final = feats[feat1].transform(Xin_test_final)
    if feat2 is not None:
        # Combined features: append the second featurizer's columns.
        X_train2 = feats[feat2].fit_transform(Xin_train_final)
        X_test2 = feats[feat2].transform(Xin_test_final)
        X_train_final = np.concatenate((X_train_final, X_train2), axis=1)
        X_test_final = np.concatenate((X_test_final, X_test2), axis=1)

    clf = Ridge(alpha=alpha)
    clf.fit(X_train_final, y_train_final)
    train_error = MAE(clf.predict(X_train_final), y_train_final) * error_scale
    test_error = MAE(clf.predict(X_test_final), y_test_final) * error_scale

    with open(output_file, 'a') as f:
        print("Test set", file=f)
        print(feat1, file=f)
        if feat2 is not None:
            print(feat2, file=f)
        # This line previously lacked file=f and leaked to stdout.
        print("Gmax: %d" % Gmax, file=f)
        print("Train MAE: %.6f Test MAE: %.6f" % (train_error, test_error),
              file=f)
def run_trial(feat1, output_file, feat2=None, Gmax=0):
    """Run 4-fold cross-validation of ridge regression over a grid of alphas.

    The training molecules are read from the module-level arrays
    ``train_anum``, ``train_coor`` and ``train_energy``; their count is read
    from the module-level ``train_set_size``.  For every alpha in the grid a
    ``Ridge`` model is fitted on three folds and evaluated on the fourth, and
    the mean and population standard deviation of the four train/test MAEs
    are appended to ``output_file``.

    The folds are the four contiguous quarters of the index range: the
    shuffles below only permute the order of the folds and the order of the
    indices inside each fold, they do not move indices between folds.

    Parameters
    ----------
    feat1 : str
        Key of the primary featurizer: one of "1", "1c", "2", "2b", "2NP",
        "2NC", "2LP", "2LC", "2SP", "2SC".
    output_file : str
        Path of the report file; it is opened in append mode once per alpha.
    feat2 : str, optional
        Key of a second featurizer.  When given, its feature columns are
        concatenated after those of ``feat1``.
    Gmax : int, optional
        Value used as ``max_depth`` for every ``EncodedBond`` featurizer.

    Raises
    ------
    KeyError
        If ``feat1`` or ``feat2`` is not one of the known keys.
    ValueError
        If ``train_set_size`` is not divisible by four (the reshape fails).
    """
    n_folds = 4
    # Every reported error is scaled by this factor (presumably
    # Hartree -> kcal/mol; confirm against the units of the energy array).
    error_scale = 627.509

    feats = {
        "1": Connectivity(n_jobs=-1),
        "1c": Connectivity(n_jobs=-1, use_coordination=True, depth=1),
        "2": Connectivity(n_jobs=-1, depth=2),
        "2b": Connectivity(n_jobs=-1, depth=2, use_bond_order=True),
        "2NP": EncodedBond(n_jobs=-1, smoothing='norm', max_depth=Gmax),
        "2NC": EncodedBond(n_jobs=-1, smoothing='norm_cdf', max_depth=Gmax),
        "2LP": EncodedBond(n_jobs=-1, smoothing='expit_pdf', max_depth=Gmax),
        "2LC": EncodedBond(n_jobs=-1, smoothing='expit', max_depth=Gmax),
        "2SP": EncodedBond(n_jobs=-1, smoothing='spike', max_depth=Gmax),
        "2SC": EncodedBond(n_jobs=-1, smoothing='zero_one', max_depth=Gmax),
    }

    # Cross-validation index table: one row of sample indices per fold.
    train_validation = np.arange(0, train_set_size, dtype=int).reshape(
        n_folds, int(train_set_size / n_folds))
    np.random.shuffle(train_validation)
    for i in range(n_folds):
        np.random.shuffle(train_validation[i])

    alpha_range = [1, 0, 0.1, 0.01, 0.001, 0.0001]

    # The features do not depend on alpha, so build them once per fold
    # instead of once per (alpha, fold) pair.
    fold_data = []
    for fold in range(n_folds):
        train_folds = [x for x in range(n_folds) if x != fold]
        train_idxs = np.ravel(train_validation[train_folds])
        test_idxs = np.ravel(train_validation[fold])

        Xin_train = list(zip(train_anum[train_idxs], train_coor[train_idxs]))
        Xin_test = list(zip(train_anum[test_idxs], train_coor[test_idxs]))

        X_train = feats[feat1].fit_transform(Xin_train)
        X_test = feats[feat1].transform(Xin_test)
        if feat2 is not None:
            # Concatenate feature vectors for combined features.
            X_train2 = feats[feat2].fit_transform(Xin_train)
            X_test2 = feats[feat2].transform(Xin_test)
            X_train = np.concatenate((X_train, X_train2), axis=1)
            X_test = np.concatenate((X_test, X_test2), axis=1)

        fold_data.append((X_train, train_energy[train_idxs],
                          X_test, train_energy[test_idxs]))

    for alpha in alpha_range:
        train_error_temp = []
        test_error_temp = []
        for X_train, y_train, X_test, y_test in fold_data:
            # Linear ridge regression.
            clf = Ridge(alpha=alpha)
            clf.fit(X_train, y_train)
            train_error_temp.append(
                MAE(clf.predict(X_train), y_train) * error_scale)
            test_error_temp.append(
                MAE(clf.predict(X_test), y_test) * error_scale)

        # Append this alpha's summary to the report file.
        with open(output_file, 'a') as f:
            print(feat1, file=f)
            if feat2 is not None:
                print(feat2, file=f)
            print("Gmax = %d" % Gmax, file=f)
            print("alpha: %.4f" % alpha, file=f)
            print("Avg Train MAE: %.6f Avg Test MAE: %.6f" %
                  (statistics.mean(train_error_temp),
                   statistics.mean(test_error_temp)), file=f)
            # The blank separator previously went to stdout (no file=f).
            print(file=f)
            # Spread of the per-fold errors (population standard deviation).
            print(
                "Train Standard Deviation: %.4f, Test Standard Deviation: %.4f"
                % (statistics.pstdev(train_error_temp),
                   statistics.pstdev(test_error_temp)),
                file=f)
# complex (iron is not in the constants). # Maybe at some point, molml will include more constants, but it seems outside # of the scope of this library. if __name__ == '__main__': elements = ['Fe', 'H', 'H', 'H', 'H', 'H', 'H'] coords = np.array([ [0., 0., 0.], [1.46, 0., 0.], [0., 1.46, 0.], [0., 0., 1.46], [-1.46, 0., 0.], [0., -1.46, 0.], [0., 0., -1.46], ]) feat = Connectivity(depth=2) # Notice the warning about missing elements. print(feat.fit_transform([(elements, coords)])) # 1) Modify the values in the constants module before your script. BOND_LENGTHS['Fe'] = {'1': 1.32} print(feat.fit_transform([(elements, coords)])) del BOND_LENGTHS['Fe'] # 2) Include connectivity information in your data. The other instances # where constants are used (electronegativity, element symbols, atomic # numbers). connections = { 0: {1: '1', 2: '1', 3: '1', 4: '1', 5: '1', 6: '1'}, 1: {0: '1'}, 2: {0: '1'},
if __name__ == "__main__":
    # Boilerplate: fetch the train/test split.
    Xin_train, Xin_test, y_train, y_test = load_qm7()

    # Parallelism of the transformations.  Per the original note, values
    # below 1 use as many jobs as the CPU has cores.
    N_JOBS = 1

    # A handful of example featurizers to compare.
    transformers = [
        EncodedBond(n_jobs=N_JOBS),
        EncodedBond(spacing="inverse", n_jobs=N_JOBS),
        BagOfBonds(n_jobs=N_JOBS),
        CoulombMatrix(n_jobs=N_JOBS),
        Connectivity(depth=1, n_jobs=N_JOBS),
        Connectivity(depth=2, use_bond_order=True, n_jobs=N_JOBS),
        Connectivity(depth=3, use_coordination=True, n_jobs=N_JOBS),
    ]

    for transformer in transformers:
        print(transformer)
        X_train = transformer.fit_transform(Xin_train)
        X_test = transformer.transform(Xin_test)

        # For simplicity there is no hyperparameter search: a Ridge model
        # with default settings is used for every featurizer.
        model = Ridge()
        model.fit(X_train, y_train)

        train_error = MAE(model.predict(X_train), y_train)
        test_error = MAE(model.predict(X_test), y_test)
        summary = "Train MAE: %.4f Test MAE: %.4f" % (train_error, test_error)
        print(summary)
import statistics

import numpy as np
from sklearn.metrics import mean_absolute_error as MAE

from molml.features import Connectivity, EncodedBond
from utils import load_qm7

if __name__ == "__main__":
    # Error accumulators, plus one slot per fold for the individual errors.
    train_error = test_error = 0
    train_error_temp = [None for _ in range(5)]
    test_error_temp = [None for _ in range(5)]

    # Candidate featurizers.  Disabled alternatives kept for reference:
    #   feat = CoulombMatrix(n_jobs=-1)
    #   feats2 = [Connectivity(n_jobs=-1, depth=2, use_bond_order=True),
    #             EncodedBond(n_jobs=-1, smoothing='expit_pdf', max_depth=1)]
    #   feat2 = Connectivity(n_jobs=-1, depth=2)
    #   feat2s = [Connectivity(n_jobs=-1, depth=2),
    #             EncodedBond(n_jobs=-1, max_depth=1)]
    #   feat1b = Connectivity(n_jobs=1, use_coordination=True)
    feat2b = Connectivity(n_jobs=-1, depth=2, use_bond_order=True)
    feat1 = Connectivity(n_jobs=1)
    featLC = EncodedBond(n_jobs=-1, smoothing='expit', max_depth=1)
    featNP = EncodedBond(n_jobs=-1, smoothing='norm', max_depth=2)

    # One pass per fold.
    for x in range(5):
        # Load the data for this pass (presumably x selects the fold used
        # as the test split; confirm against load_qm7).
        Xin_train, Xin_test, y_train, y_test = load_qm7(x)
        # Disabled featurization steps kept for reference:
        #   for feat in featNP:
        #   X_train = feat1.fit_transform(Xin_train)
        #   X_test = feat1.transform(Xin_test)
        #   for feat2 in feats2:
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.pipeline import FeatureUnion

from molml.features import Connectivity, EncodedBond
from utils import load_qm7

if __name__ == "__main__":
    # Boilerplate: fetch the train/test split.
    Xin_train, Xin_test, y_train, y_test = load_qm7()

    # Combine three featurizers into a single transformer whose outputs are
    # joined by FeatureUnion.
    full_feat = FeatureUnion([
        ("encoded_bond", EncodedBond(n_jobs=-1, max_depth=3)),
        ("atom_count", Connectivity(depth=1, n_jobs=-1)),
        ("angle_count",
         Connectivity(depth=3, use_coordination=True, n_jobs=-1)),
    ])
    X_train = full_feat.fit_transform(Xin_train)
    X_test = full_feat.transform(Xin_test)

    # Fit a linear and an RBF-kernel ridge model on the combined features.
    models = (
        Ridge(alpha=0.01),
        KernelRidge(alpha=1e-9, gamma=1e-5, kernel="rbf"),
    )
    for model in models:
        print(model)
        model.fit(X_train, y_train)
import statistics

from molml.features import Connectivity  # , EncodedBond, CoulombMatrix
from utils import load_qm7

if __name__ == "__main__":
    # Error accumulators, plus one slot per fold for the individual errors.
    train_error = test_error = 0
    train_error_temp = [None for _ in range(5)]
    test_error_temp = [None for _ in range(5)]

    # Kernel ridge hyperparameters.
    gamma = 1e-3
    alpha = 1e-7

    # Featurizer in use.  Disabled alternatives kept for reference:
    #   kern = AtomKernel(gamma=gamma, transformer=Connectivity(n_jobs=-1),
    #                     n_jobs=-1)
    #   feat = CoulombMatrix(n_jobs=1)
    #   feats = [Connectivity(n_jobs=-1, depth=3), EncodedBond(n_jobs=-1)]
    #   feat = Connectivity(n_jobs=1, use_coordination=True)
    feat = Connectivity(n_jobs=1)

    # One pass per fold.
    for x in range(5):
        # Load the data for this pass (presumably x selects the fold used
        # as the test split; confirm against load_qm7).
        Xin_train, Xin_test, y_train, y_test = load_qm7(x)

        # Disabled kernel-based variant kept for reference:
        #   for feat in feats:
        #   K_train = kern.fit_transform(Xin_train)
        #   K_test = kern.transform(Xin_test)
        X_train = feat.fit_transform(Xin_train)
        X_test = feat.transform(Xin_test)

        # RBF kernel ridge regression on the fold's training features.
        clf = KernelRidge(alpha=alpha, gamma=gamma, kernel="rbf")
        clf.fit(X_train, y_train)
# Index table for 4-fold cross-validation: one row of sample indices per
# fold.  The first shuffle permutes the rows (fold order); the loop then
# permutes the indices inside each row.
train_validation = np.arange(0, train_set_size, dtype=int).reshape(
    4, int(train_set_size / 4))
np.random.shuffle(train_validation)
for i in range(4):
    np.random.shuffle(train_validation[i])
# train_validation = train_validation.tolist()

# Error accumulators, plus one slot per fold for the individual errors.
train_error = test_error = 0
train_error_temp = [None for _ in range(4)]
test_error_temp = [None for _ in range(4)]

# Featurizer in use.  Disabled alternatives kept for reference:
#   feats = [Connectivity(n_jobs=-1, depth=2, use_bond_order=True),
#            EncodedBond(n_jobs=-1, smoothing='expit_pdf', max_depth=1)]
#   feat = Connectivity(n_jobs=-1, depth=2, use_bond_order=True)
feat = Connectivity(n_jobs=-1)

alpha = 0.1
gamma = 1

for fold in range(4):
    # Train on every fold except the held-out one.
    train_folds = [x for x in range(4) if x != fold]
    train_idxs = np.ravel(train_validation[train_folds])
    test_idxs = np.ravel(train_validation[fold])

    # Each molecule is an (atomic numbers, coordinates) pair.
    Xin_train = list(zip(train_anum[train_idxs], train_coor[train_idxs]))
    Xin_test = list(zip(train_anum[test_idxs], train_coor[test_idxs]))
    y_train = train_energy[train_idxs]
    y_test = train_energy[test_idxs]

    # for feat in feats:
    X_train = feat.fit_transform(Xin_train)
# complex (iron is not in the constants). # Maybe at some point, molml will include more constants, but it seems outside # of the scope of this library. if __name__ == '__main__': elements = ['Fe', 'H', 'H', 'H', 'H', 'H', 'H'] coords = np.array([ [0., 0., 0.], [1.46, 0., 0.], [0., 1.46, 0.], [0., 0., 1.46], [-1.46, 0., 0.], [0., -1.46, 0.], [0., 0., -1.46], ]) feat = Connectivity(depth=2) # Notice the warning about missing elements. print(feat.fit_transform([(elements, coords)])) # 1) Modify the values in the constants module before your script. BOND_LENGTHS['Fe'] = {'1': 1.32} print(feat.fit_transform([(elements, coords)])) del BOND_LENGTHS['Fe'] # 2) Include connectivity information in your data. The other instances # where constants are used (electronegativity, element symbols, atomic # numbers). connections = { 0: { 1: '1', 2: '1',