Esempio n. 1
0
    for d in diff5:
        EVOM_error += d * d

    print("Difference between Compounds by representation:")
    print("BOB", BOB_error)
    print("CM", CM_error)
    print("EVCM", EVCM_error)
    print("OM", OM_error)
    print("EVOM", EVOM_error)

if do_derivative_calculation:
    # Read the database and keep a stored copy of the compound list.
    mols, compounds = datprep.read_xyz_energies(database)
    datprep.store_compounds(compounds, database + "compounds.pickle")

    # Representations whose numerical derivatives are computed below;
    # entry i is paired with namelist[i] when the output file is named.
    replist = [
        ZRNrep.Coulomb_Matrix,
        ZRNrep.Eigenvalue_Coulomb_Matrix,
        ZRNrep.Bag_of_Bonds,
        ZRNrep.Overlap_Matrix,
        ZRNrep.Eigenvalue_Overlap_Matrix,
    ]

    # One derivative run and one result file per representation.
    for i, rep_func in enumerate(replist):
        results, resultaddition = jader.calculate_num_der(rep_func, compounds)
        res_file = database + namelist[i] + "der_results.pickle"
        datprep.store_compounds(results, res_file)
        print("results were successfully stored to ", res_file)

if do_plot_derivatives:
    yvals = []
    norms_nuc = []
Esempio n. 2
0
### Analytical derivative (disabled alternative to the numerical one below):
#results = jader.calculate_eigenvalues('CM_EV', compound_ls)

### Numerical derivative: uses the entry at index 2 of
### numerical_representations, evaluated for every compound in compound_ls.
### Only `results` is used further down in this excerpt.
results, resultaddition = jader.calculate_num_der(numerical_representations[2],
                                                  compound_ls)

### B)
### Store the list of results in results_file_OM.
# NOTE: the triple-quoted block below is a bare string literal used as a
# comment; it is evaluated and discarded, nothing in it is executed.
'''
#If you want to plot from multiple pickle results file, use this code:
#result_file = result_folder + "results_%i-%i.pickle" %(init, end)
'''
# Debug output: show the results and how many there are before storing them.
print("results:", results)
print("len of results:", len(results))
datprep.store_compounds(results, results_file_OM)

#C)

#If you want to plot from multiple pickle results file, use this code:
#these are used as file identifiers for the results_%i-%i.pickle files
#numbers = ["0-200", "200-400", "400-600", "600-800", "800-1000", "1000-1200", "1200-1400", "1400-1600", "1600-1800", "1800-2000", "2000-2200", "2200-2400", "2400-2600", "2600-2800", "2800-3000", "3000-3200", "3200-3400", "3400-3600", "3600-3800", "3800-3993"]
'''
#read list of compounds from data file
full_compound_ls = datprep.read_compounds(results_file)
#print(len(full_compound_ls), " compounds in full data file")

'''
###use if only part of dataset should be processed
#try:
#	compound_ls = full_compound_ls[init : end]
Esempio n. 3
0
                        K_test = m_c.laplacian_kernel_matrix(
                            x_training=m_c.x_training, x_test=m_c.x_test)

                        m_c.test_predicted_results = np.dot(K_test, alphas)

                        mae = m_c.calculate_mae()

                        mae_nmodels += mae

                    print("mae_nmodels ", mae_nmodels)
                    avg_mae = mae_nmodels / float(nModels)
                    print("avg mae:", avg_mae)
                    m_c.mae = avg_mae
                    learning_list.append(m_c)

                    mae_list.append(avg_mae)

                    print("totally tested:", total_tested)

                name = rep_names[
                    rep]  #"sigma: %.2e, lambda: %.2e" %(sigma, lamda)
                curve = datprep.CurveObj(name)
                curve.xnparray = training_no
                curve.ynparray = np.array(mae_list)
                curve_list.append(curve)

        final_file = "./tmp/Kernel_Results/" + final_file_list[rep]

    datprep.store_compounds(curve_list, final_curve_file)
Esempio n. 4
0

# Load the stored compound list from small_data_file.
compounds = datprep.read_compounds(small_data_file)

# Debug switch: when set to True, the CM eigenvalues and one sorted
# derivative are computed and printed for each compound individually.
single = False
if single:
    for c in compounds:
        # ev / vectors are only referenced by the disabled print below.
        ev, vectors = jrep.CM_ev(c.Z, c.R, c.N)
        print("name of compound:", c.filename)
        #print("eigenvalue repro:\n", ev)
    
        # Arguments 2, "R", "R" select which derivative is evaluated
        # (presumably second order with respect to R twice - TODO confirm
        # against jader.sort_derivative).
        derivative = jader.sort_derivative('CM_EV', c.Z, c.R, c.N, 2, "R", "R")
        print(derivative)

# Evaluate 'CM_EV' for all compounds; only `results` is stored,
# `fractions` is not used within this excerpt.
results, fractions = jader.calculate_eigenvalues('CM_EV', compounds)
datprep.store_compounds(results, CM_ev_result_file)

# Debug output: type of a single result entry (requires a non-empty list).
print(type(results[0]))

#y-axis information: one list per derivative kind, filled further below
dZ_percentages = []
dR_percentages = []
dZdZ_percentages = []
dRdR_percentages = []
dZdR_percentages = []

#x-axis information
norms = []

#C)
#get all the data from our results list
Esempio n. 5
0
def full_kernel_ridge(fingerprint_list,
                      property_list,
                      result_file,
                      set_sizes,
                      sigmas=None,
                      lambdas=None,
                      rep_no=1,
                      upperlimit=12,
                      Choose_Folder=False,
                      representation="CM"):
    ''' Kernel ridge regression model
    y(X') = sum_i alpha_i K(X', X_i)

    For every repetition and every (sigma, lambda) combination, one model is
    trained and tested per entry of set_sizes. The mean absolute errors of
    one combination are collected in a single LearningResults object.

    Input
    -----
    fingerprint_list :  list of fingerprints
    property_list : list of learning data, e.g. energy values corresponding to the fingerprints
    result_file : file where data is stored with pickle.
    set_sizes : sequence of training set sizes; every entry is handed to
                    make_training_test and yields one point of the learning curve
    sigmas : iterable of fitting coefficients (kernel width); None or empty means no models are trained
    lambdas : iterable of fitting coefficients (regularization); None or empty means no models are trained
    rep_no : int, number of times the whole sigma/lambda/set_sizes scan is repeated
    upperlimit : int, total of training + test set. Can be used if more data is available than is being used or to bootstrap
    Choose_Folder: boolean, if True, file is directly stored to result_file.
                    if not, result file is stored in ./Pickled/Kernel_Results folder
                    and "_<rep_no>reps" is appended to its name
    representation: str, abbreviation for fingerprint used (prefix of the raw data file names)


    Return
    ------
    learning_list : list of LearningResults Objects
    raw_data_files : list of names of files where raw data was stored to


    Stored
    ------
    raw data is stored to raw_data_file entries (in ./tmp/), learning_list is stored to result_file
    '''
    # None replaces the former mutable list defaults; an omitted argument
    # still behaves like an empty list.
    sigmas = [] if sigmas is None else sigmas
    lambdas = [] if lambdas is None else lambdas

    # NOTE: start and t1..t4 are timing checkpoints for optional profiling.
    # The tic() calls are kept because tic is defined outside this function
    # and may carry state of its own.
    start = tic()

    learning_list = []
    raw_data_files = []

    if not Choose_Folder:
        print("your results are stored to ./Pickled/Kernel_Results/")
        result_file = "./Pickled/Kernel_Results/" + result_file + "_" + str(
            rep_no) + "reps"

    #loop over learning defined by number of repetition, sigmas, and lamdas
    for rep in range(rep_no):
        for sigma in sigmas:
            for lamda in lambdas:
                #for every rep, sigma, lamda combination, a new Learning Object is created and stored to the learning list
                maes = []

                for set_size in set_sizes:
                    t1 = tic()

                    #make training and test list:
                    training_indices, test_indices = make_training_test(
                        len(fingerprint_list), set_size, upperlim=upperlimit)

                    # idx (not the repetition counter) indexes the data, so
                    # the repetition number used for the file name and the
                    # progress message below can never be overwritten.
                    tr_fingerprints = [
                        fingerprint_list[idx] for idx in training_indices
                    ]
                    tr_properties = [
                        property_list[idx] for idx in training_indices
                    ]
                    tr_size = len(training_indices)

                    tst_fingerprints = [
                        fingerprint_list[idx] for idx in test_indices
                    ]
                    tst_properties = [
                        property_list[idx] for idx in test_indices
                    ]

                    t2 = tic()

                    K = build_kernel_matrix(tr_fingerprints, tr_size, sigma)
                    t3 = tic()

                    #get alpha coefficients
                    alphas = get_alphas(K, tr_properties, tr_size, lamda)
                    t4 = tic()

                    #profiling: t2 - t1 training/test split, t3 - t2 kernel
                    #matrix, t4 - t3 alphas calculation

                    #predict properties of test set
                    results, errors = predict_new(sigma, alphas,
                                                  tr_fingerprints,
                                                  tst_fingerprints,
                                                  tst_properties)
                    # errors must support element-wise abs() (e.g. a numpy
                    # array) and must not be empty.
                    mae = sum(abs(errors)) / (len(errors))
                    maes.append(mae)

                    #save raw data
                    filename = './tmp/%srawdata_rep%i_sigma%s_lamda%f_set%i.dat' % (
                        representation, rep, str(sigma), lamda, set_size)
                    raw_data_files.append(filename)
                    save_raw_data(filename, tr_properties, training_indices,
                                  tst_properties, results, test_indices)

                #add learning result to list
                learning_list.append(
                    LearningResults(lamda, sigma, np.array(set_sizes),
                                    np.array(maes)))
        print("round %i successfully finished" % (rep + 1))

    #save maes with data so it can be plotted
    datprep.store_compounds(learning_list, result_file)

    return (learning_list, raw_data_files)
Esempio n. 6
0
This script converts XYZ files into database_preparation.compounds class objects.
'''

# Path to the folder containing the XYZ files.
database = "../Databases/QM9_XYZ/"

# Path of the file the full compound list is stored to.
database_file = "../Databases/Pickled/qm9.pickle"

# Path of the file that receives the subset of molecules with fewer heavy
# atoms than the molecules in database_file.
dat_ha_file = "../Databases/Pickled/qm7.pickle"

# Read all compounds found under `database` and convert them to datprep
# class objects.
mol_ls, compound_ls = datprep.read_xyz_energies(database)

# Store the compounds to database_file.
datprep.store_compounds(compound_ls, database_file)

# Write the compounds selected by the heavy-atom threshold 7 from
# database_file to dat_ha_file.
# NOTE(review): whether the limit of 7 is inclusive depends on
# datprep.sortby_heavyatoms - confirm.
datprep.sortby_heavyatoms(database_file, dat_ha_file, 7)

# Read the filtered list back from dat_ha_file.
ha_compounds = datprep.read_compounds(dat_ha_file)

# Print the heavy-atom value of every compound in the full list ...
for c in compound_ls:
    print(c.heavy_atoms())

# ... and of every compound in the filtered list, for comparison.
print("now all heavy atoms in sorted list")
for i in ha_compounds:
    print(i.heavy_atoms())
Esempio n. 7
0
    if rep == 1:
        M = ZRN_rep.Eigenvalue_Coulomb_Matrix_h(compound.Z, compound.R)
        mol.representation = M.flatten()

    if rep == 2:
        mol.generate_bob(asize={'C': 7, 'H': 16, 'N': 6, 'O': 4, 'F': 4})

    if rep == 3:
        M = ZRN_rep.Overlap_Matrix_h(compound.Z, compound.R)
        mol.representation = M.flatten()

    if rep == 4:
        M = ZRN_rep.Eigenvalue_Overlap_Matrix_h(compound.Z, compound.R)
        mol.representation = M.flatten()

    #add representation array to X_list
    X_list[rep].append(mol.representation)

# Prepare a raw Kernel_Result instance (no training/test split, no sigma,
# no lamda): it only carries the representation name, the representation
# arrays collected in X_list[rep] and the energies in Y_energy_list.

m = datprep.Kernel_Result()
m.representation_name = rep_names[rep]
m.x = X_list[rep]
m.y = Y_energy_list

# NOTE: CM_list receives the instance for whichever representation `rep`
# selects, not only for the Coulomb matrix the name suggests.
CM_list.append(m)

# Store the list under ./tmp/, with the file name built from the
# representation name.
datprep.store_compounds(CM_list,
                        "./tmp/%s_raw_Kernel_Results" % rep_names[rep])