Beispiel #1
0
                "800-820", "820-840", "840-860", "880-900",\
                "920-940", "940-960", "980-1000", "1000-1020"]

        # File-name suffixes "0-100", "100-200", ..., "3700-3800".
        range_1000 = range(0, 3800, 100)
        srt_numbers = ["%i-%i" % (j, j + 100) for j in range_1000]
        print(srt_numbers)
        number_ends = [srt_numbers, unsrt_numbers]
        print("number of molecules total:", 20*len(unsrt_numbers))
        # NOTE(review): the loop length comes from unsrt_numbers while the
        # suffix is read from number_ends[i-1]; confirm both lists have the
        # same length whenever number_ends[i-1] is srt_numbers.
        for k in range(len(unsrt_numbers)):#range(0, 4000, 100):
            partialfilename = filename + number_ends[i-1][k]
            print("file: ", partialfilename)

            # Partial files that do not exist are reported and skipped.
            if os.path.isfile(partialfilename):
                results = datprep.read_compounds(partialfilename)
            else:
                print(partialfilename, "was not found")
                continue

            xdata, ydata, newresults = prepresults(results, rep = repro,
                    dwhich = which_d, repno = 2,
                    norm = xnorm, yval = yvalues,
                    with_whichd = False)

            #datprep.store_compounds(newresults, partialfilename)
            # NOTE(review): `j` is not bound by the comprehension above in
            # Python 3, so it has to come from the enclosing scope - confirm.
            for yd in ydata:
                ax.scatter(xdata, yd[0], c = colorlist[i], label = repro if j == 0 else "")
                #np.savez(outfile, xdata = x, yd[0] = y)

            # Release the data only after every series has been plotted.
            # Deleting inside the loop above made the second iteration fail
            # with a NameError on xdata.
            del xdata
            del ydata
Beispiel #2
0
###A)
###store compounds to database_file
#datprep.store_compounds(compound_ls, database_file)

###A) supplement: you don't need to do this step
###take info from database_file and extract all molecules with less than 7 heavy atoms to dat_ha_file
#max_atoms = datprep.sortby_heavyatoms(database_file, dat_ha_file, 7)

###A) supplement: if in doubt, just choose 23 for QM9 dataset. max_atoms is just the maximal size of your representation
###max_atoms is maximal number of atoms in file. needed to set size of CM
#print("all CM should have size " , max_atoms)
#input("Press enter once you have made sure the size of the unsorted CM matrix has been adapted accordingly")

###B)
###read list of compounds from data file
full_compound_ls = datprep.read_compounds(data_file)
print(len(full_compound_ls), " compounds in full data file")

###B)

#If you want to plot only part of all compounds, use this code:
# Slicing never raises IndexError (out-of-range bounds are silently clamped),
# so the requested window has to be validated explicitly.
if not 0 <= init <= end <= len(full_compound_ls):
    print("Your indices were out of bound, restart. min: 0, max: ",
          len(full_compound_ls))
    # non-zero exit status so that calling scripts notice the failure
    raise SystemExit(1)

compound_ls = full_compound_ls[init:end]

print(len(compound_ls), " of which are being processed")

###B)
Beispiel #3
0
def cleanup_results(result_file,
                    multiple_runs=False,
                    Choose_Folder=False,
                    rep_no=1):
    ''' gets data from result_file and returns plottable curve objects

    Variables
    ---------
    result_file : string, path to (or base name of) the file that is read
                    with datprep.read_compounds. Every entry has to provide
                    the attributes lamda, sigma, set_sizes and maes.
    multiple_runs : if True, calculate mean of runs with same lamda and sigma.
                    Forced to True whenever rep_no > 1.
    Choose_Folder : boolean, if True, result_file is used exactly as given.
                    if not, it is expanded to
                    ./Pickled/Kernel_Results/<result_file>_<rep_no>reps
    rep_no : int, number of repetitions; part of the expanded file name

    Returns
    -------
    plottable_curves : list of CurveObj.
                    Without averaging: one curve per entry of the file.
                    With averaging: one curve per (lamda, sigma) pair that
                    occurs in the file, in order of first occurrence, with
                    the mean in ynparray and its error in yerror.
    '''

    if not Choose_Folder:
        #print("your results were stored to ./Pickled/Kernel_Results/")
        result_file = "./Pickled/Kernel_Results/" + result_file + "_" + str(
            rep_no) + "reps"

    if rep_no > 1:
        multiple_runs = True

    results_list = datprep.read_compounds(result_file)

    plottable_curves = []

    if not multiple_runs:
        #every stored result becomes its own curve
        for result in results_list:
            curve = CurveObj(curve_name(result.sigma, result.lamda))
            curve.xnparray = result.set_sizes
            curve.ynparray = result.maes
            plottable_curves.append(curve)

        return plottable_curves

    #Group the runs by the (lamda, sigma) pairs that really occur. Looping
    #over every combination of all lamdas with all sigmas hits pairs without
    #any run, for which there is nothing to average and no x values to take.
    runs_by_parameters = {}
    for result in results_list:
        key = (result.lamda, result.sigma)
        runs_by_parameters.setdefault(key, []).append(result)

    for (lamda, sigma), runs in runs_by_parameters.items():
        av_ylist, yerror = jmath.calculate_mean([run.maes for run in runs])
        print("the calculated mean and it's error are:\n mean:",
              av_ylist, "\n error:", yerror)

        curve = CurveObj(curve_name(sigma, lamda))
        #x values are taken from the first run of the group
        curve.xnparray = runs[0].set_sizes
        curve.ynparray = av_ylist
        curve.yerror = yerror

        plottable_curves.append(curve)

    return plottable_curves
Beispiel #4
0
# Five panels in grid row 1, each spanning two adjacent grid columns.
axb1, axb2, axb3, axb4, axb5 = [
    fig.add_subplot(gs[1, first_col:first_col + 2])
    for first_col in range(0, 10, 2)
]

# Only the horizontal spacing is changed; everything else keeps its default.
plt.subplots_adjust(wspace=3)

database = "../Databases/Pickled/qm7.pickle"

compoundlist = datprep.read_compounds(database)
#print("length of compoundlist:", len(compoundlist))

# Empty accumulators that are filled further down.
Zlist, atomlen, halist, atomtypes = [], [], [], []

# 18 independent empty lists
atomfrequencylist = [[] for _ in range(18)]

# Visit every compound that was read from the database file.
for compound in compoundlist:
    # presumably the nuclear charges of the compound - TODO confirm
    Z = compound.Z
    # number of entries in Z
    atoms = len(Z)
    # how often each distinct value occurs in Z
    atomscount = Counter(list(Z))
    # same tally as a plain dict
    atomscountdict = dict(atomscount)
Beispiel #5
0
repno = 1

#final_file_list = ["./Results/trial.obj"]
final_file_list = [
    'CM_QM7',
    'EVCM_QM7',
    'BOB_QM7',
    'OM_QM7',
    'EVOM_QM7',
]

repnames = ["CM", "EVCM", "BOB", "OM", "EVOM"]
max_no = 3993  #150

#for i in [0, 1, 2]:
#    results = kproc.kernel_learning(datapath, final_file_list[i], representation_no = i, maxnumber = max_no, repno = repno)

# The curves are read back from an existing file.
final_file = "./tmp/Curves.pickle"
curve_list = datprep.read_compounds(final_file)
'''
#curves = kplot.cleanup_results(final_file, rep_no = repno)
for curve in curves:
    curve.name = repnames[i] + curve.name
    curve_list.append(curve)

        #print("curve", curve)
    #kplot.plot_curves(curves, file_title = final_file[11:], plottitle = final_file + "Learning on 200 QM7 datapoints")
'''

# Hand all curves to the plotting routine in a single call.
kplot.plot_curves(
    curve_list,
    file_title="ML_trial",
    plottitle="Learning of Molecular Energies on QM7 Dataset",
)
Beispiel #6
0
    # For each of the five entries of replist: run jader.calculate_num_der on
    # the compounds and store the first returned value in its own file.
    for i in range(5):
        # resultaddition is not used in the visible part of the script
        results, resultaddition = jader.calculate_num_der(
            replist[i], compounds)
        # file name: <database><name>der_results.pickle
        res_file = database + namelist[i] + "der_results.pickle"
        datprep.store_compounds(results, res_file)
        print("results were successfully stored to ", res_file)

if do_plot_derivatives:
    # y series of all representations, and one x list per y series
    yvals = []
    norms_nuc = []

    for i in range(5):
        # same file name scheme as used when the results were stored
        resfile = database + namelist[i] + "der_results.pickle"

        res_list = datprep.read_compounds(resfile)

        xlist, ylist, results = pltder.prepresults(results=res_list,
                                                   rep=namelist[i],
                                                   repno=i,
                                                   yval="perc",
                                                   with_whichd=True)

        yvals.extend(ylist)
        # Pair every y series with the shared x values. The former inner
        # loop reused the name `i` and thereby overwrote the outer loop index.
        norms_nuc.extend(xlist for _ in ylist)

    print(yvals)

    pltder.plot_percentage_zeroEV([1,2], yvals, title = "4 C Atoms",\
Beispiel #7
0
              1e-5]  #optimal lamdas for every representation

#get maximum number of compounds for which representations need to be calculated

final_file_list = [
    'CM_QM7',
    'EVCM_QM7',
    'BOB_QM7',
    'OM_QM7',
    'EVOM_QM7',
]

final_curve_file = "./tmp/Curves.pickle"
filepath_thisjob = "./tmp/trial"

if plot_scatter:
    # One scatter plot per training set size; result i belongs to size i.
    results = datprep.read_compounds(filepath_thisjob)

    for i, n_training in enumerate(training_no):
        name = "%i Training Instances, OM representation" % n_training
        this_result = results[i]

        pltker.plot_scatter(this_result.y_test,
                            this_result.test_predicted_results,
                            title=name,
                            figuretitle="Scatterplot_OM_%i" % i)

if plot_learning:
    # The learning results are taken from the same file.
    results = datprep.read_compounds(filepath_thisjob)
    print("trying to plot learning from existing file")
Beispiel #8
0
import representation_ZRN as ZRNrep
import jax_representation as jrep
import database_preparation as datprep
from time import time as tic
import statistics
import numpy as np

data_file = "../Databases/Pickled/qm7.pickle"

# Representation functions and their short names, in matching order.
repros = [
    ZRNrep.Coulomb_Matrix,
    ZRNrep.Eigenvalue_Coulomb_Matrix,
    ZRNrep.Overlap_Matrix,
    ZRNrep.Eigenvalue_Overlap_Matrix,
    ZRNrep.Bag_of_Bonds,
]
repronames = ["CM", "EVCM", "OM", "EVOM", "BOB"]

# Read the compounds and keep only the first one.
compounds = datprep.read_compounds(data_file)[:1]
print("number of compounds:", len(compounds))

# One list of single-call durations per representation, plus the totals.
one_times = [[] for _ in range(5)]
total_times = []

for i in range(5):
    start = tic()
    for c in compounds:
        call_started = tic()
        M = repros[i](c.Z, c.R)
        # duration of this single call
        one_times[i].append(tic() - call_started)
Beispiel #9
0
import jax_representation as jrep
import jax.numpy as jnp
import plot_derivative as pltder

#define path to folder containing xyz files. All files are considered.
datapath = "../Database/QM9/"

#where do you want these compounds to be saved to?
small_data_file = "../Database/Pickled/compounds.pickle"
CM_ev_result_file = "/home/linux-miriam/Uniqueness_QML/Pickled/fourcompounds_res.pickle"

# The compounds are read from the pickled file. An unfinished
# `compounds = ` assignment used to stand above and made the whole script a
# SyntaxError.
# NOTE(review): datprep and jader are not among the imports visible in this
# part of the file - confirm that they are imported.
compounds = datprep.read_compounds(small_data_file)

# set to True to print the derivative of every single compound
single = False
if single:
    for c in compounds:
        ev, vectors = jrep.CM_ev(c.Z, c.R, c.N)
        print("name of compound:", c.filename)
        #print("eigenvalue repro:\n", ev)

        derivative = jader.sort_derivative('CM_EV', c.Z, c.R, c.N, 2, "R", "R")
        print(derivative)

# results for all compounds; only `results` is stored, `fractions` is unused here
results, fractions = jader.calculate_eigenvalues('CM_EV', compounds)
datprep.store_compounds(results, CM_ev_result_file)

print(type(results[0]))
Beispiel #10
0
def kernel_learning(datapath,
                    final_file,
                    representation_no=0,
                    maxnumber=3993,
                    repno=1):
    '''
    Metafunction to simplify a ML run: reads the compounds, calculates their
    representation and hands everything to kler.full_kernel_ridge

    datapath : path to a file containing a pickled list of compound instances.
            Every compound has to provide the attributes energy, Z, R and N
    final_file : passed on to kler.full_kernel_ridge
            (where the results are dumped with pickle)
    representation_no : [0,1,2,3,4] stand for [CM, EVCM, BOB, OM, EVOM] respectively
    maxnumber : number of compounds from datapath file to be considered
    repno : how many times the learning is done, passed on as rep_no

    returns:
    -------
    results : first of the two values returned by kler.full_kernel_ridge
    '''
    #name and (hashed) representation function, in matching order
    representation_names = ["CM", "EVCM", "BOB", "OM", "EVOM"]
    representation_functions = [ZRN_rep.Coulomb_Matrix_h,
                                ZRN_rep.Eigenvalue_Coulomb_Matrix_h,
                                ZRN_rep.Bag_of_Bonds_h,
                                ZRN_rep.Overlap_Matrix_h,
                                ZRN_rep.Eigenvalue_Overlap_Matrix_h]

    #sigmas and lambdas that are tried for every representation
    repro_sigmas = [[80], [120], [120], [15, 20, 120], [30, 150]]
    repro_lambdas = [[1e-15], [1e-15], [1e-15], [1e-15, 1e-13], [1e-14, 1e-13]]

    repro_name = representation_names[representation_no]
    repro = representation_functions[representation_no]

    #define parameters for learning
    #not bigger than the total number of instances in set
    set_sizes = [5, 120, 600, 1500, 3000]
    #how tight is the fit? needs to be tested depending on data, varies widely
    sigmas = repro_sigmas[representation_no]
    #how much variation to the initial data is introduced? 1e-17 - 1e-13 good to try
    lambdas = repro_lambdas[representation_no]

    #unpack pickled data to compounds list, shortened to make stuff faster
    compounds = datprep.read_compounds(datapath)[:maxnumber]

    #for compounds create list of energies and list of fingerprints in "repro" representation
    #NOTE(review): the learning target is c.energy exactly as stored. An
    #atomization energy used to be calculated here with
    #datprep.atomization_energy but was never used; confirm which of the two
    #is intended.
    energylist = []
    fingerprintlist = []

    for c in compounds:
        energylist.append(float(c.energy))
        #calculate fingerprint of molecule
        fingerprintlist.append(repro(c.Z, c.R, c.N))

    #run learning; the second return value is not needed here
    results, _ = kler.full_kernel_ridge(fingerprintlist,
                                        energylist,
                                        final_file,
                                        set_sizes,
                                        sigmas,
                                        lambdas,
                                        rep_no=repno,
                                        upperlimit=maxnumber,
                                        representation=repro_name)

    return results
Beispiel #11
0
#
#    results.extend(compoundlist[0])

#print("number of compounds: ", len(results))

#datprep.store_compounds(results, result_file)

#data has now been stored to resultfile

#C)
#read data from result file

#If you want to plot from multiple pickle results file, use this code:
#result_file = resultfile

results_EV = datprep.read_compounds(results_file)
'''
results_EV = datprep.read_compounds(result_file_EV)
results_CM = datprep.read_compounds(result_file_CM)
'''

#C)
#prepare plotting
#y-axis information: five independent empty lists that are filled later
(dZ_percentages_EV,
 dR_percentages_EV,
 dZdZ_percentages_EV,
 dRdR_percentages_EV,
 dZdR_percentages_EV) = ([] for _ in range(5))
'''
dZ_percentages_CM = []
Beispiel #12
0
import qml
import numpy as np
import database_preparation as datprep
import kernel_learning as kler
import plot_kernel as pltker
import representation_ZRN as ZRN_rep
"""
This file creates pickled lists of Kernel_Result class objects with the represented information.
"""

calculated = datprep.read_compounds("./tmp/BOB_raw_Kernel_Results")

# Quick look at what was read: size of the list and details of its first entry.
print("len of list: ", len(calculated))
first_entry = calculated[0]
print("first element:", first_entry)
print("CM of first element:", first_entry.representation_name)
print(first_entry.x[0])
print("energy:", first_entry.y[0])

# path to a pickled list of compound instances
datapath = "./Pickled/qm7.pickle"

# representation to be considered, given as index into rep_names
# (0 = CM, 1 = EVCM, 2 = BOB, 3 = OM, 4 = EVOM)
rep = 4

rep_names = ["CM", "EVCM", "BOB", "OM", "EVOM"]

# maximum number of compounds for which representations need to be calculated
total = 3993

# unpickle list of compounds
compound_list = datprep.read_compounds(datapath)
Beispiel #13
0
         ZRNrep.Bag_of_Bonds, ZRNrep.Overlap_Matrix, \
        ZRNrep.Eigenvalue_Overlap_Matrix]

#which representation should be computed? 0 = CM, 1 = EVCM, 2 = BOB, 3 = OM, 4 = EVOM
which_rep = 0

# start and end index come from the command line; if they are missing there,
# they are asked for interactively
try:
    init, end = int(sys.argv[1]), int(sys.argv[2])
except IndexError:
    init = int(input("starting point"))
    end = int(input("end point"))

name = str(init) + "-" + str(end)

###read list of compounds from data file
full_compound_ls = datprep.read_compounds(data_file)
print(len(full_compound_ls), " compounds in full data file")

###B)

#If you want to plot only part of all compounds, use this code:
# Slicing never raises IndexError (out-of-range bounds are silently clamped),
# so the requested window has to be validated explicitly.
if not 0 <= init <= end <= len(full_compound_ls):
    print("Your indices were out of bound, restart. min: 0, max: ", len(full_compound_ls))
    # non-zero exit status so that calling scripts notice the failure
    raise SystemExit(1)

compound_ls = full_compound_ls[init:end]

#print("you are going to calculate the repro on a list of compounds of length:")
#print(len(compound_ls))

t1 = tic()