def test_properties(self):
    """Used to test that changing the setup through properties works as
    intended.
    """
    # Test changing species
    a = MBTR(
        k1=default_k1,
        k2=default_k2,
        k3=default_k3,
        periodic=False,
        species=[1, 8],
        sparse=False,
        flatten=True,
    )
    nfeat1 = a.get_number_of_features()
    vec1 = a.create(H2O)
    a.species = ["C", "H", "O"]
    nfeat2 = a.get_number_of_features()
    vec2 = a.create(molecule("CH3OH"))
    self.assertTrue(nfeat1 != nfeat2)
    self.assertTrue(vec1.shape[1] != vec2.shape[1])

    # Test changing geometry function and grid setup
    a.k1 = {
        "geometry": {"function": "atomic_number"},
        "grid": {"min": 5, "max": 6, "sigma": 0.1, "n": 50},
    }
    vec3 = a.create(H2O)
    self.assertTrue(not np.allclose(vec2, vec3))

    a.k2 = {
        "geometry": {"function": "distance"},
        "grid": {"min": 0, "max": 10, "sigma": 0.1, "n": 50},
        "weighting": {"function": "exponential", "scale": 0.6, "cutoff": 1e-2},
    }
    vec4 = a.create(H2O)
    self.assertTrue(not np.allclose(vec3, vec4))

    a.k3 = {
        "geometry": {"function": "angle"},
        "grid": {"min": 0, "max": 180, "sigma": 5, "n": 50},
        "weighting": {"function": "exponential", "scale": 0.6, "cutoff": 1e-2},
    }
    vec5 = a.create(H2O)
    self.assertTrue(not np.allclose(vec4, vec5))
def test_properties(self):
    """Used to test that changing the setup through properties works as
    intended.
    """
    # Test changing species
    a = MBTR(
        k=[1, 2, 3],
        grid=default_grid,
        periodic=False,
        species=[1, 8],
        sparse=False,
    )
    nfeat1 = a.get_number_of_features()
    vec1 = a.create(H2O)
    a.species = ["C", "H", "O"]
    nfeat2 = a.get_number_of_features()
    vec2 = a.create(molecule("CH3OH"))
    self.assertTrue(nfeat1 != nfeat2)
    self.assertTrue(vec1.shape[1] != vec2.shape[1])
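A small usage sketch of the property interface that the two tests above exercise: on the newer k1/k2/k3-style constructor, reassigning a term dict on an existing MBTR reconfigures the descriptor in place, without rebuilding it. The sigma values scanned below are illustrative, not taken from the test suite.

# Hypothetical sigma scan using the mutable k2 property exercised above;
# setup mirrors the first test, grid values are illustrative only.
mbtr = MBTR(
    k1=default_k1,
    k2=default_k2,
    k3=default_k3,
    periodic=False,
    species=["H", "O"],
    flatten=True,
)
vectors = []
for sigma in (0.05, 0.1, 0.2):
    k2 = dict(default_k2)  # shallow copy, then swap in a new grid
    k2["grid"] = {"min": 0, "max": 10, "sigma": sigma, "n": 50}
    mbtr.k2 = k2  # reconfigure in place; the feature layout is recomputed
    vectors.append(mbtr.create(H2O))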
def test_parallel_sparse(self):
    """Tests creating sparse output in parallel."""
    # Test indices
    samples = [molecule("CO"), molecule("N2O")]
    desc = MBTR(
        species=[6, 7, 8],
        k={1, 2},
        grid={
            "k1": {"min": 1, "max": 8, "sigma": 0.1, "n": 100},
            "k2": {"min": 0, "max": 1 / 0.7, "sigma": 0.1, "n": 100},
        },
        weighting={
            "k2": {"function": "exponential", "scale": 0.5, "cutoff": 1e-2},
        },
        periodic=False,
        flatten=True,
        sparse=True,
    )
    n_features = desc.get_number_of_features()

    # Multiple systems, serial job
    output = desc.create(
        system=samples,
        n_jobs=1,
    ).toarray()
    assumed = np.empty((2, n_features))
    assumed[0, :] = desc.create(samples[0]).toarray()
    assumed[1, :] = desc.create(samples[1]).toarray()
    self.assertTrue(np.allclose(output, assumed))

    # Multiple systems, parallel job
    output = desc.create(
        system=samples,
        n_jobs=2,
    ).toarray()
    assumed = np.empty((2, n_features))
    assumed[0, :] = desc.create(samples[0]).toarray()
    assumed[1, :] = desc.create(samples[1]).toarray()
    self.assertTrue(np.allclose(output, assumed))
def create(data):
    """This is the function that is called by each process, but with
    different parts of the data.
    """
    i_part = data[0]
    samples = data[1]

    mbtr = MBTR(
        atomic_numbers=atomic_numbers,
        k=[1, 2],
        periodic=True,
        grid={
            "k1": {
                "min": min(atomic_numbers) - 1,
                "max": max(atomic_numbers) + 1,
                "sigma": 0.1,
                "n": 100,
            },
            "k2": {
                "min": 0,
                "max": 1 / min_distance,
                "sigma": 0.01,
                "n": 100,
            },
        },
        weighting={
            "k2": {
                "function": lambda x: np.exp(-0.5 * x),
                "threshold": 1e-3,
            },
        },
        flatten=True,
    )
    n_samples = len(samples)
    n_features = int(mbtr.get_number_of_features())
    mbtr_inputs = lil_matrix((n_samples, n_features))

    # Create descriptors for the dataset
    for i_sample, sample in enumerate(samples):
        system = sample.value
        mbtr_mat = mbtr.create(system)
        mbtr_inputs[i_sample, :] = mbtr_mat

    # Return the list of features for each sample
    return {
        "part": i_part,
        "mbtr": mbtr_inputs,
    }
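A minimal driver sketch for the per-process create function above, assuming samples_all is a hypothetical stand-in for the full list of wrapped samples and that atomic_numbers and min_distance are defined as in the surrounding script; the pool size and chunking are illustrative. The "part" index returned by each worker lets the chunks be reassembled in their original order.

import multiprocessing

import numpy as np
from scipy.sparse import vstack

n_proc = 4  # illustrative pool size, not from the original script
# samples_all is a hypothetical name for the full dataset of wrapped samples
chunks = np.array_split(samples_all, n_proc)
jobs = list(enumerate(chunks))  # (i_part, samples) tuples, as create() expects

with multiprocessing.Pool(processes=n_proc) as pool:
    results = pool.map(create, jobs)

# Reassemble in the original order using the returned part index
results.sort(key=lambda r: r["part"])
mbtr_all = vstack([r["mbtr"] for r in results]).tocsr()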
"scale": 0.5, "cutoff": 1e-3 }, }, periodic=False, normalization="l2_each", ) #.create(ase_train_cv) ############# create MBTR for training data ############################################################################## # Split the data into roughly equivalent chunks for each process n_proc = 24 # How many processes are spawned k, m = divmod(len(ase_mol), n_proc) atoms_split = (ase_mol[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n_proc)) n_features = int(mbtr_desc.get_number_of_features()) # Initialize a pool of processes, and tell each process in the pool to # handle a different part of the data mbtr_start = time.time() with multiprocessing.Pool(processes=n_proc) as pool: res = pool.map(create, atoms_split) # pool.map keeps the order mbtr_end = time.time() mbtr_time = np.round(mbtr_end - mbtr_start, decimals=3) # Save results n_samples = len(ase_mol) mbtr_mol = lil_matrix((n_samples, n_features)) i_sample = 0 for i, i_res in enumerate(res):
def test_number_of_features(self):
    """Tests that the reported number of features is correct."""
    # K=1
    n = 100
    atomic_numbers = [1, 8]
    n_elem = len(atomic_numbers)
    mbtr = MBTR(
        species=atomic_numbers,
        k1={
            "geometry": {"function": "atomic_number"},
            "grid": {"min": 1, "max": 8, "sigma": 0.1, "n": n},
        },
        periodic=False,
        flatten=True,
    )
    n_features = mbtr.get_number_of_features()
    expected = n_elem * n
    self.assertEqual(n_features, expected)

    # K=2
    mbtr = MBTR(
        species=atomic_numbers,
        k1={
            "geometry": {"function": "atomic_number"},
            "grid": {"min": 1, "max": 8, "sigma": 0.1, "n": n},
        },
        k2={
            "geometry": {"function": "inverse_distance"},
            "grid": {"min": 0, "max": 1 / 0.7, "sigma": 0.1, "n": n},
            "weighting": {"function": "exponential", "scale": 0.5, "cutoff": 1e-2},
        },
        periodic=False,
        flatten=True,
    )
    n_features = mbtr.get_number_of_features()
    expected = n_elem * n + 1 / 2 * n_elem * (n_elem + 1) * n
    self.assertEqual(n_features, expected)

    # K=3
    mbtr = MBTR(
        species=atomic_numbers,
        k1={
            "geometry": {"function": "atomic_number"},
            "grid": {"min": 1, "max": 8, "sigma": 0.1, "n": n},
        },
        k2={
            "geometry": {"function": "inverse_distance"},
            "grid": {"min": 0, "max": 1 / 0.7, "sigma": 0.1, "n": n},
            "weighting": {"function": "exponential", "scale": 0.5, "cutoff": 1e-2},
        },
        k3={
            "geometry": {"function": "cosine"},
            "grid": {"min": -1, "max": 1, "sigma": 0.1, "n": n},
            "weighting": {"function": "exponential", "scale": 0.5, "cutoff": 1e-2},
        },
        periodic=False,
        flatten=True,
    )
    n_features = mbtr.get_number_of_features()
    expected = (n_elem * n
                + 1 / 2 * n_elem * (n_elem + 1) * n
                + n_elem * 1 / 2 * n_elem * (n_elem + 1) * n)
    self.assertEqual(n_features, expected)
def test_number_of_features(self):
    """Tests that the reported number of features is correct."""
    # K = 1
    n = 100
    atomic_numbers = [1, 8]
    n_elem = len(atomic_numbers)
    mbtr = MBTR(
        atomic_numbers=atomic_numbers,
        k=[1],
        grid={
            "k1": {"min": 1, "max": 8, "sigma": 0.1, "n": n},
        },
        periodic=False,
        flatten=True,
    )
    n_features = mbtr.get_number_of_features()
    expected = n_elem * n
    self.assertEqual(n_features, expected)

    # K = 2
    mbtr = MBTR(
        atomic_numbers=atomic_numbers,
        k={1, 2},
        grid={
            "k1": {"min": 1, "max": 8, "sigma": 0.1, "n": n},
            "k2": {"min": 0, "max": 1 / 0.7, "sigma": 0.1, "n": n},
        },
        weighting={"k2": {"function": "exponential", "scale": 0.5, "cutoff": 1e-2}},
        periodic=False,
        flatten=True,
    )
    n_features = mbtr.get_number_of_features()
    expected = n_elem * n + 1 / 2 * n_elem * (n_elem + 1) * n
    self.assertEqual(n_features, expected)

    # K = 3
    mbtr = MBTR(
        atomic_numbers=atomic_numbers,
        k={1, 2, 3},
        grid={
            "k1": {"min": 1, "max": 8, "sigma": 0.1, "n": n},
            "k2": {"min": 0, "max": 1 / 0.7, "sigma": 0.1, "n": n},
            "k3": {"min": -1, "max": 1, "sigma": 0.1, "n": n},
        },
        periodic=False,
        flatten=True,
    )
    n_features = mbtr.get_number_of_features()
    expected = (n_elem * n
                + 1 / 2 * n_elem * (n_elem + 1) * n
                + n_elem * 1 / 2 * n_elem * (n_elem + 1) * n)
    self.assertEqual(n_features, expected)
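Both versions of test_number_of_features above rely on the same counting argument: with flattening, k=1 contributes one grid of n values per element, k=2 one grid per unordered element pair, and k=3 one grid per (element, unordered pair) combination. A quick check of the arithmetic the assertions expect:

# Worked example of the expected feature counts asserted above,
# for n_elem = 2 elements and n = 100 grid points.
n_elem, n = 2, 100
k1 = n_elem * n                               # 200
k2 = n_elem * (n_elem + 1) // 2 * n           # 300: 3 unordered pairs
k3 = n_elem * n_elem * (n_elem + 1) // 2 * n  # 600: 6 element triples
assert (k1, k2, k1 + k2 + k3) == (200, 300, 1100)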
def f(x):
    iteration_start = time.time()

    # KRR hyperparameters
    alpha_exp = -x[0][0]
    gamma_exp = -x[0][1]
    alpha = 10**alpha_exp
    gamma = 10**gamma_exp

    # MBTR hyperparameters
    sigma2_exp = -x[0][2]
    sigma3_exp = -x[0][3]
    sigma1 = 0.2
    sigma2 = 10**sigma2_exp
    sigma3 = 10**sigma3_exp

    # Scaling for the weighting function
    s2 = x[0][4]
    s3 = x[0][5]

    # Write the KRR variables to file for the CV jobs
    with open('variables.in', 'w') as variables_file:
        variables_file.write(str(alpha))
        variables_file.write("\n")
        variables_file.write(str(gamma))

    time_cv_array = []
    mbtr_start = time.time()

    # Load training data
    data = pd.read_json("../data/data_train_1k.json")

    # Defined as a global so that multiprocessing can pickle it by name
    global create

    def create(i_samples):
        """This is the function that is called by each process, but with
        different parts of the data.
        """
        n_i_samples = len(i_samples)
        i_res = lil_matrix((n_i_samples, n_features))
        for i, i_sample in enumerate(i_samples):
            feat = mbtr_desc.create(i_sample)
            i_res[i, :] = feat
            # print("{} %".format((i + 1) / n_i_samples * 100))
        return i_res

    # Extract xyz coordinates and HOMO energies from the dataframe
    homo_array = []
    out_mol = StringIO()
    for i, row in data.iterrows():
        homo = row[0][1]
        homo_array.append(homo)
        xyz = "".join(row.molecule)
        # print("xyz:", xyz)
        out_mol.write(xyz)
    homo = np.array(homo_array)
    homo = [float(v) for v in homo]
    # print(homo)
    out_mol.seek(0)  # rewind the buffer before reading the structures back
    ase_mol = list(ase.io.iread(out_mol, format="xyz"))

    # Load statistics from the dataset
    stats = system_stats(ase_mol)
    atomic_numbers = stats["atomic_numbers"]
    max_atomic_number = stats["max_atomic_number"]
    min_atomic_number = stats["min_atomic_number"]
    min_distance = stats["min_distance"]

    # Define the MBTR descriptor
    mbtr_desc = MBTR(
        species=atomic_numbers,
        k1={
            "geometry": {"function": "atomic_number"},
            "grid": {
                "min": min_atomic_number,
                "max": max_atomic_number,
                "n": 200,
                "sigma": sigma1,
            },
        },
        k2={
            "geometry": {"function": "inverse_distance"},
            "grid": {"min": 0, "max": 1, "n": 200, "sigma": sigma2},
            "weighting": {"function": "exponential", "scale": s2, "cutoff": 1e-3},
        },
        k3={
            "geometry": {"function": "cosine"},
            "grid": {"min": -1, "max": 1, "n": 200, "sigma": sigma3},
            "weighting": {"function": "exponential", "scale": s3, "cutoff": 1e-3},
        },
        periodic=False,
        normalization="l2_each",
    )  # .create(ase_train_cv)

    ############# create MBTR for data #############

    # Split the data into roughly equivalent chunks for each process
    n_proc = 24  # How many processes are spawned
    k, m = divmod(len(ase_mol), n_proc)
    atoms_split = (ase_mol[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
                   for i in range(n_proc))
    n_features = int(mbtr_desc.get_number_of_features())

    # Initialize a pool of processes, and tell each process in the pool to
    # handle a different part of the data
    with multiprocessing.Pool(processes=n_proc) as pool:
        res = pool.map(create, atoms_split)  # pool.map keeps the order

    # Gather the results into one sparse matrix
    n_samples = len(ase_mol)
    mbtr_mol = lil_matrix((n_samples, n_features))
    i_sample = 0
    for i, i_res in enumerate(res):
        i_n_samples = i_res.shape[0]
        mbtr_mol[i_sample:i_sample + i_n_samples, :] = i_res
        i_sample += i_n_samples

    # Split the MBTR matrix and the HOMO array into 5 different parts
    mbtr = mbtr_mol.tocsr()
    # Shuffle the samples with a random index permutation
    index = np.arange(np.shape(mbtr)[0])
    np.random.shuffle(index)
    shuffled_mbtr = mbtr[index, :]
    homo = np.array(homo)
    shuffled_homo = homo[index]

    # Split the data into 5 equal parts
    select_ind_1 = np.arange(0, 200, 1)
    mbtr_1 = shuffled_mbtr[select_ind_1, :]
    homo_1 = shuffled_homo[select_ind_1]
    select_ind_2 = np.arange(200, 400, 1)
    mbtr_2 = shuffled_mbtr[select_ind_2, :]
    homo_2 = shuffled_homo[select_ind_2]
    select_ind_3 = np.arange(400, 600, 1)
    mbtr_3 = shuffled_mbtr[select_ind_3, :]
    homo_3 = shuffled_homo[select_ind_3]
    select_ind_4 = np.arange(600, 800, 1)
    mbtr_4 = shuffled_mbtr[select_ind_4, :]
    homo_4 = shuffled_homo[select_ind_4]
    select_ind_5 = np.arange(800, 1000, 1)
    mbtr_5 = shuffled_mbtr[select_ind_5, :]
    homo_5 = shuffled_homo[select_ind_5]

    # Arrange the data into training and validation sets
    mbtr_train_1 = vstack((mbtr_2, mbtr_3, mbtr_4, mbtr_5))
    mbtr_val_1 = mbtr_1
    homo_train_1 = np.concatenate((homo_2, homo_3, homo_4, homo_5), axis=0)
    homo_val_1 = homo_1
    mbtr_train_2 = vstack((mbtr_3, mbtr_4, mbtr_5, mbtr_1))
    mbtr_val_2 = mbtr_2
    homo_train_2 = np.concatenate((homo_3, homo_4, homo_5, homo_1), axis=0)
    homo_val_2 = homo_2
    mbtr_train_3 = vstack((mbtr_4, mbtr_5, mbtr_1, mbtr_2))
    mbtr_val_3 = mbtr_3
    homo_train_3 = np.concatenate((homo_4, homo_5, homo_1, homo_2), axis=0)
    homo_val_3 = homo_3
    mbtr_train_4 = vstack((mbtr_5, mbtr_1, mbtr_2, mbtr_3))
    mbtr_val_4 = mbtr_4
    homo_train_4 = np.concatenate((homo_5, homo_1, homo_2, homo_3), axis=0)
    homo_val_4 = homo_4
    mbtr_train_5 = vstack((mbtr_1, mbtr_2, mbtr_3, mbtr_4))
    mbtr_val_5 = mbtr_5
    homo_train_5 = np.concatenate((homo_1, homo_2, homo_3, homo_4), axis=0)
    homo_val_5 = homo_5

    print("Finished building MBTR")
    mbtr_end = time.time()
    mbtr_time = np.round(mbtr_end - mbtr_start, decimals=3)

    # Pickle the training and validation splits for the CV jobs
    splits = {
        'mbtr_train': [mbtr_train_1, mbtr_train_2, mbtr_train_3, mbtr_train_4, mbtr_train_5],
        'mbtr_val': [mbtr_val_1, mbtr_val_2, mbtr_val_3, mbtr_val_4, mbtr_val_5],
        'homo_train': [homo_train_1, homo_train_2, homo_train_3, homo_train_4, homo_train_5],
        'homo_val': [homo_val_1, homo_val_2, homo_val_3, homo_val_4, homo_val_5],
    }
    for name, parts in splits.items():
        for i, part in enumerate(parts, start=1):
            with open('%s_%d.pkl' % (name, i), 'wb') as fh:
                pickle.dump(part, fh)

    subprocess.call('submit_cv.sh')

    pp_start = time.time()

    def read_cv_results():
        """Read the MAE and timing files written by the CV jobs."""
        maes = []
        times = []
        for i in range(1, 6):
            with open('mae%d.txt' % i, 'r') as f_mae:
                mae = f_mae.read()
            maes.append(mae)
            output.write("mae%d:%s\n" % (i, mae))
            with open('cv_time_%d.txt' % i, 'r') as f_time:
                times.append(f_time.read())
        return maes, times

    # Poll until all five CV rounds have written their results
    maes, times = read_cv_results()
    while not all(maes + times):
        output.write("Waiting for all cv rounds to finish..." + "\n")
        time.sleep(5)
        maes, times = read_cv_results()

    MAE_list = maes
    output.write("All cv rounds finished." + "\n")
    output.write("MAEs of CV rounds:" + str(MAE_list) + "\n")
    MAE_list = np.array(MAE_list).astype(float)
    avg_MAE = np.mean(MAE_list)
    output.write("Average MAE: %s eV" % avg_MAE + "\n")

    output.write("BREAKDOWN OF TIMINGS" + "\n")
    output.write("Time to load data and build MBTRs: %f s" % mbtr_time + "\n")
    cv_time_list = times
    output.write("CV timings:" + str(cv_time_list) + "\n")
    cv_time_list = np.array(cv_time_list).astype(float)
    avg_cv_time = np.mean(cv_time_list)
    output.write("Average time for CV loop: %f s" % avg_cv_time + "\n")

    pp_end = time.time()
    pp_time = np.round(pp_end - pp_start, decimals=3)
    output.write("Postprocessing time: %f s" % pp_time + "\n")

    iteration_end = time.time()
    iteration_time = np.round(iteration_end - iteration_start, decimals=3)
    output.write("Total iteration time: %f s" % iteration_time + "\n")
    output.close()

    # Append this iteration's results to the running results table
    if os.path.isfile('results/df_results_mbtr.json'):
        df_results = pd.read_json('results/df_results_mbtr.json', orient='split')
        iteration = len(df_results) + 1
        print("iteration:", iteration)
        row = [iteration, avg_MAE, iteration_time, mbtr_time, avg_cv_time,
               alpha, gamma, sigma2, sigma3]
        df_results.loc[len(df_results)] = row
        df_results.to_json('results/df_results_mbtr.json', orient='split')
    else:
        df_results = pd.DataFrame(
            [[1, avg_MAE, iteration_time, mbtr_time, avg_cv_time,
              alpha, gamma, sigma2, sigma3]],
            columns=['iteration', 'avg_MAE', 'iteration_time', 'mbtr_time',
                     'avg_cv_time', 'alpha', 'gamma', 'sigma2', 'sigma3'],
        )
        df_results.to_json('results/df_results_mbtr.json', orient='split')

    return avg_MAE
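The x[0][i] indexing in f suggests an optimizer that passes candidate points as a 2D array of shape (1, n_dims). A hypothetical driver in that style, assuming a GPyOpt-like interface; the variable bounds below are illustrative and not taken from the original script.

# Hypothetical Bayesian-optimization driver for f; assumes a GPyOpt-style
# optimizer that calls f with a 2D array x of shape (1, 6), matching the
# x[0][i] indexing above. Bounds are illustrative only.
import GPyOpt

domain = [
    {"name": "alpha_exp", "type": "continuous", "domain": (0, 12)},
    {"name": "gamma_exp", "type": "continuous", "domain": (0, 12)},
    {"name": "sigma2_exp", "type": "continuous", "domain": (0, 5)},
    {"name": "sigma3_exp", "type": "continuous", "domain": (0, 5)},
    {"name": "s2", "type": "continuous", "domain": (0, 2)},
    {"name": "s3", "type": "continuous", "domain": (0, 2)},
]

opt = GPyOpt.methods.BayesianOptimization(f=f, domain=domain)
opt.run_optimization(max_iter=50)
print("Best parameters:", opt.x_opt, "best MAE:", opt.fx_opt)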
def test_parallel_dense(self):
    """Tests creating dense output in parallel."""
    samples = [molecule("CO"), molecule("N2O")]
    desc = MBTR(
        species=[6, 7, 8],
        k={1, 2},
        grid={
            "k1": {"min": 1, "max": 8, "sigma": 0.1, "n": 100},
            "k2": {"min": 0, "max": 1 / 0.7, "sigma": 0.1, "n": 100},
        },
        weighting={
            "k2": {"function": "exponential", "scale": 0.5, "cutoff": 1e-2},
        },
        periodic=False,
        flatten=True,
        sparse=False,
    )
    n_features = desc.get_number_of_features()

    # Multiple systems, serial job
    output = desc.create(
        system=samples,
        n_jobs=1,
    )
    assumed = np.empty((2, n_features))
    assumed[0, :] = desc.create(samples[0])
    assumed[1, :] = desc.create(samples[1])
    self.assertTrue(np.allclose(output, assumed))

    # Multiple systems, parallel job
    output = desc.create(
        system=samples,
        n_jobs=2,
    )
    assumed = np.empty((2, n_features))
    assumed[0, :] = desc.create(samples[0])
    assumed[1, :] = desc.create(samples[1])
    self.assertTrue(np.allclose(output, assumed))

    # Non-flattened output
    desc._flatten = False
    output = desc.create(
        system=samples,
        n_jobs=2,
    )
    assumed = []
    assumed.append(desc.create(samples[0]))
    assumed.append(desc.create(samples[1]))
    for i, val in enumerate(output):
        for key in val.keys():
            i_tensor = val[key]
            j_tensor = assumed[i][key]
            self.assertTrue(np.allclose(i_tensor, j_tensor))