Ejemplo n.º 1
0
def read_xyz_n_dm(fxyz, design_matrix, use_atomic_descriptors, only_use_species, peratom):
    """
    Load an ASAPXYZ object and/or a descriptor (design) matrix.

    Parameters
    ----------
    fxyz: location of the xyz file; None or 'none' skips reading it.
    design_matrix: sequence of descriptor tags handed to ASAPXYZ. If its first
        entry is the path of an existing file, the matrix is read from that
        file instead and overrides whatever was read from the xyz file.
    use_atomic_descriptors: if true, call `get_atomic_descriptors` rather
        than `get_descriptors`.
    only_use_species: forwarded to `get_atomic_descriptors`.
    peratom: forwarded to `get_descriptors`.

    Returns
    -------
    (asapxyz, dm, dm_atomic): `asapxyz` is None when no xyz file was given;
    `dm` and `dm_atomic` stay empty lists when nothing could be loaded.

    Raises
    ------
    ValueError: the standalone descriptor file exists but cannot be parsed.
    """
    import os

    dm = []
    dm_atomic = []
    asapxyz = None

    # try to read the xyz file
    if fxyz is not None and fxyz != 'none':
        from asaplib.data import ASAPXYZ
        asapxyz = ASAPXYZ(fxyz)
        if use_atomic_descriptors:
            dm = asapxyz.get_atomic_descriptors(design_matrix, only_use_species)
        else:
            dm, dm_atomic = asapxyz.get_descriptors(design_matrix, peratom)
    else:
        print("Did not provide the xyz file. We can only output descriptor matrix.")

    # we can also load the descriptor matrix from a standalone file;
    # the length check keeps an empty tag list from raising IndexError
    if len(design_matrix) > 0 and os.path.isfile(design_matrix[0]):
        import numpy as np
        try:
            dm = np.genfromtxt(design_matrix[0], dtype=float)
        except Exception as err:
            # keep the original cause attached so parse errors stay debuggable
            raise ValueError('Cannot load the descriptor matrix from file') from err
        print("loaded the descriptor matrix from file: ", design_matrix[0])
    return asapxyz, dm, dm_atomic
Ejemplo n.º 2
0
def main():
    """
    Smoke test for sparse kernel ridge regression.

    Takes no arguments. Descriptors tagged 'SOAP-n4-l3-c1.9-g0.23' and the
    property 'dft_formation_energy_per_atom_in_eV' are read from
    'small_molecules-SOAP.xyz', looked up in the directory of this script.
    A linear-kernel SPARSE_KRR_Wrapper is fitted through a Design_Matrix,
    a learning curve is computed, the Design_Matrix state is saved under
    the prefix "test-skrr" and the plots are shown.
    """
    # fixed settings of this test
    data_dir = os.path.split(__file__)[0]
    fxyz = os.path.join(data_dir, 'small_molecules-SOAP.xyz')
    descriptor_tags = ['SOAP-n4-l3-c1.9-g0.23']
    target_name = 'dft_formation_energy_per_atom_in_eV'
    prefix = "test-skrr"
    test_ratio = 0.05
    learning_curve_options = dict(lc_points=8,
                                  lc_repeats=8,
                                  randomseed=42,
                                  verbose=False)

    # descriptors and target values both come from the xyz file
    asapxyz = ASAPXYZ(fxyz)
    desc, _ = asapxyz.get_descriptors(descriptor_tags, False)
    y_all = asapxyz.get_property(target_name)

    dm = Design_Matrix(X=desc, y=y_all, whiten=True, test_ratio=test_ratio)

    # noise level: 0.1% of the spread of the target values
    noise = 0.001 * np.std(y_all)
    linear_kernel = {'k0': {"type": "linear"}}
    regressor = SPARSE_KRR_Wrapper(linear_kernel,
                                   KRRSparse(0., None, noise),
                                   sparse_mode="fps",
                                   n_sparse=-1)

    # fit the model
    dm.compute_fit(regressor, 'skrr', store_results=True, plot=True)

    # learning curve
    if learning_curve_options['lc_points'] > 1:
        dm.compute_learning_curve(regressor, 'ridge_regression',
                                  **learning_curve_options)

    dm.save_state(prefix)
    plt.show()
Ejemplo n.º 3
0
def main():
    """
    Smoke test for ridge regression.

    Takes no arguments. Descriptors tagged 'SOAP-n4-l3-c1.9-g0.23' and the
    property 'dft_formation_energy_per_atom_in_eV' are read from
    'small_molecules-SOAP.xyz', looked up in the directory of this script.
    A RidgeRegression model is fitted through a Design_Matrix, a learning
    curve is computed, the Design_Matrix state is saved under the prefix
    "test-rr" and the plots are shown.
    """
    # fixed settings of this test
    data_dir = os.path.split(__file__)[0]
    fxyz = os.path.join(data_dir, 'small_molecules-SOAP.xyz')
    descriptor_tags = ['SOAP-n4-l3-c1.9-g0.23']
    target_name = 'dft_formation_energy_per_atom_in_eV'
    prefix = "test-rr"
    test_ratio = 0.05
    learning_curve_options = dict(lc_points=8,
                                  lc_repeats=8,
                                  randomseed=42,
                                  verbose=False)

    # descriptors and target values both come from the xyz file
    asapxyz = ASAPXYZ(fxyz)
    desc, _ = asapxyz.get_descriptors(descriptor_tags, False)
    y_all = asapxyz.get_property(target_name)

    dm = Design_Matrix(X=desc, y=y_all, whiten=True, test_ratio=test_ratio)

    # regularisation: 0.1% of the spread of the target values
    regressor = RidgeRegression(0.001 * np.std(y_all))

    # fit the model
    dm.compute_fit(regressor, 'ridge_regression', store_results=True, plot=True)

    # learning curve
    if learning_curve_options['lc_points'] > 1:
        dm.compute_learning_curve(regressor, 'ridge_regression',
                                  **learning_curve_options)

    dm.save_state(prefix)
    plt.show()
Ejemplo n.º 4
0
def main():
    """
    Smoke test for frame selection.

    Takes no arguments. Reads the descriptors tagged 'SOAP-n4-l3-c1.9-g0.23'
    from 'small_molecules-SOAP.xyz' (looked up in the directory of this
    script) and, for each of the algorithms 'random', 'cur' and 'fps',
    selects 10 frames with a Sparsifier.

    For every algorithm two outputs are written, both named
    "test-frame-select-<algorithm>-n-10":
    - "<name>.index": one 0/1 flag per frame, 1 marking a selected frame
    - the selected frames themselves, via `asapxyz.write`
    """
    nkeep = 10
    prefix = "test-frame-select"
    fmat = ['SOAP-n4-l3-c1.9-g0.23']
    fxyz = os.path.join(os.path.split(__file__)[0], 'small_molecules-SOAP.xyz')

    # all algorithms work on the same descriptor matrix
    asapxyz = ASAPXYZ(fxyz)
    desc, _ = asapxyz.get_descriptors(fmat)
    print("shape of the descriptor matrix: ", np.shape(desc),
          "number of descriptors: ", np.shape(desc[0]))

    for algorithm in ('random', 'cur', 'fps'):
        chosen = Sparsifier(algorithm).sparsify(desc, nkeep)
        out_stem = "{}-{}-n-{}".format(prefix, algorithm, nkeep)

        # 0/1 mask over all frames
        mask = np.zeros(asapxyz.get_num_frames(), dtype=int)
        for frame_index in chosen:
            mask[frame_index] = 1

        np.savetxt(out_stem + '.index', mask, fmt='%d')
        asapxyz.write(out_stem, chosen)
Ejemplo n.º 5
0
def main(fmat, fxyz, fy, prefix, scale, test_ratio, sigma, lc_points,
         lc_repeats):
    """
    Fit a ridge regression on a descriptor matrix and plot a learning curve.

    Parameters
    ----------
    fmat: Sequence whose first entry is either the location of a descriptor matrix file or
          the name of the tags in ase xyz file. You can use gen_descriptors.py to compute it.
    fxyz: Location of xyz file for reading the properties, or 'none'.
    fy: Location of property list (1D-array of floats); if it cannot be read as a file,
        it is used as the name of a property stored in the xyz file.
    prefix: filename prefix for learning curve figure (not used in this function body)
    scale: Scale the coordinates (True/False). Scaling highly recommended.
    test_ratio: train/test ratio
    sigma: noise level in kernel ridge regression; a negative value selects
           0.1% of the standard deviation of the data.
    lc_points : number of points on the learning curve
    lc_repeats : number of sub-sampling when compute the learning curve

    Returns
    -------
    None. The fit and the learning curve are plotted.

    Raises
    ------
    ValueError: no descriptors could be obtained, the descriptor file cannot
                be parsed, or the property cannot be read from any source.
    """

    # NOTE(review): `scale` is normalised here but never used below;
    # Design_Matrix is always built with whiten=True. Confirm intent.
    scale = bool(scale)

    # Defaults so that the checks below fail with a clear ValueError rather
    # than a NameError when no xyz file is supplied.
    asapxyz = None
    desc = []

    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, _ = asapxyz.get_descriptors(fmat)
    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
        except Exception as err:
            raise ValueError('Cannot load the descriptor matrix from file') from err
        print("loaded the descriptor matrix from file: ", fmat)
    if len(desc) == 0:
        raise ValueError(
            'Please supply descriptor in a xyz file or a standlone descriptor matrix'
        )
    print("shape of the descriptor matrix: ", np.shape(desc),
          "number of descriptors: ", np.shape(desc[0]))

    # read in the properties to be predicted: first as a file, then as a
    # property name inside the xyz file
    try:
        y_all = np.genfromtxt(fy, dtype=float)
    except Exception as err:
        if asapxyz is None:
            raise ValueError(
                'Cannot read the property from a file and no xyz file was supplied'
            ) from err
        y_all = asapxyz.get_property(fy)

    dm = Design_Matrix(X=desc, y=y_all, whiten=True, test_ratio=test_ratio)

    # if sigma is not set...
    if sigma < 0:
        sigma = 0.001 * np.std(y_all)
    rr = RidgeRegression(sigma)

    # fit the model
    dm.compute_fit(rr, 'ridge_regression', store_results=True, plot=True)

    # learning curve
    if lc_points > 1:
        lc_scores = dm.compute_learning_curve(rr,
                                              'ridge_regression',
                                              lc_points=lc_points,
                                              lc_repeats=lc_repeats,
                                              randomseed=42,
                                              verbose=False)
        # make plot
        lc_scores.plot_learning_curve()
    plt.show()
def main(fmat, fxyz, ftags, prefix, dimension, pc1, pc2, adtext):
    """
    Kernel density estimation on low-dimensional coordinates, with a density map plot.

    Parameters
    ----------
    fmat: Location of low-dimensional coordinate file, or the descriptor name
          passed to ASAPXYZ.get_descriptors when an xyz file is given.
    fxyz: Location of the xyz file, or 'none'.
    ftags: Location of tags for the first M samples, or 'none'.
    prefix: Filename prefix.
    dimension: Number of the first X dimensions to keep
    pc1: First principal axis (int)
    pc2: Second principal axis (int)
    adtext: Boolean giving whether to adjust text or not.

    Returns
    -------
    None. Writes "<prefix>-kde.dat" and "kde_4_<prefix>.png" and shows the plot.

    Raises
    ------
    ValueError: no descriptors could be obtained or the file cannot be parsed.
    RuntimeError: the density model could not be fitted.
    """

    # Default so that the emptiness check below raises the intended
    # ValueError (not a NameError) when neither source provides descriptors.
    desc = []

    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, _ = asapxyz.get_descriptors(fmat)
    if os.path.isfile(fmat):
        try:
            desc = np.genfromtxt(fmat, dtype=float)
        except Exception as err:
            raise ValueError('Cannot load the descriptor matrix from file') from err
        print("loaded the descriptor matrix from file: ", fmat)
    if len(desc) == 0:
        raise ValueError('Please supply descriptor in a xyz file or a standlone descriptor matrix')
    print("loaded", fmat, " with shape", np.shape(desc))
    # load tags if any
    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")
        ndict = len(tags)

    # keep only the leading `dimension` columns
    proj = np.asmatrix(desc)[:, 0:dimension]
    density_model = KDE_internal() # KDE_sklearn(bandwidth=1) # KDE_scipy()
    # fit density model to data
    try:
        density_model.fit(proj)
    except Exception as err:
        raise RuntimeError('KDE did not work. Try smaller dimension.') from err

    rho = density_model.evaluate_density(proj)
    # save the density
    np.savetxt(prefix + "-kde.dat", np.transpose([np.arange(len(rho)), rho]),
               header='index log_of_kernel_density_estimation', fmt='%d %4.8f')

    # color scheme: points are colored by their density value
    plotcolor = rho
    colorlabel = 'Log of density for every point'
    [plotcolormin, plotcolormax] = [np.min(plotcolor), np.max(plotcolor)]

    # make plot
    plot_styles.set_nice_font()
    # density plot
    fig, ax = plot_styles.plot_density_map(np.asarray(proj[:, [pc1, pc2]]), plotcolor,
                                           xlabel='Princple Axis ' + str(pc1), ylabel='Princple Axis ' + str(pc2),
                                           clabel=colorlabel, label=None,
                                           xaxis=True, yaxis=True,
                                           centers=None,
                                           psize=None,
                                           out_file=None,
                                           title='KDE for: ' + prefix,
                                           show=False, cmap='gnuplot',
                                           remove_tick=False,
                                           use_perc=False,
                                           rasterized=True,
                                           fontsize=15,
                                           vmax=plotcolormax,
                                           vmin=plotcolormin)

    fig.set_size_inches(18.5, 10.5)
    if ftags != 'none':
        texts = []
        # mark and label every sample that carries a non-empty tag
        for i in range(ndict):
            if tags[i] != 'None' and tags[i] != 'none' and tags[i] != '':
                ax.scatter(proj[i, pc1], proj[i, pc2], marker='^', c='black')
                texts.append(ax.text(proj[i, pc1], proj[i, pc2], tags[i],
                                     ha='center', va='center', fontsize=15, color='red'))
        if adtext:
            from adjustText import adjust_text
            adjust_text(texts, on_basemap=True,  # only_move={'points':'', 'text':'x'},
                        expand_text=(1.01, 1.05), expand_points=(1.01, 1.05),
                        force_text=(0.03, 0.5), force_points=(0.01, 0.25),
                        ax=ax, precision=0.01,
                        arrowprops=dict(arrowstyle="-", color='black', lw=1, alpha=0.8))

    plt.show()
    fig.savefig('kde_4_' + prefix + '.png')
Ejemplo n.º 7
0
def main(fmat, fxyz, ftags, fcolor, colorscol, prefix, output, kpca_d, pc1,
         pc2, adtext):
    """
    Kernel PCA on a precomputed kernel matrix, with output and a density map plot.

    Parameters
    ----------
    fmat: location of the kernel matrix file
    fxyz: location of the xyz file, or 'none'
    ftags: location of a tag file, or 'none'; when the loaded tags have more
           than one dimension only the first column is used
    fcolor: forwarded to set_color_function; also part of the figure filename
    colorscol: forwarded to set_color_function
    prefix: filename prefix for the outputs
    output: 'matrix' writes "<prefix>-kpca-d<kpca_d>.coord"; 'xyz' stores the
            projection in the xyz object and writes it. Without an xyz file
            'xyz' falls back to 'matrix'.
    kpca_d: number of dimensions
    pc1: index of the projection column plotted on the first axis
    pc2: index of the projection column plotted on the second axis
    adtext: whether to adjust the tag labels with adjustText

    Returns
    -------
    None. Writes the projection and "KPCA_4_<prefix>-c-<fcolor>.png" and shows the plot.

    Raises
    ------
    ValueError: the kernel matrix cannot be loaded.
    """
    foutput = prefix + "-kpca-d" + str(kpca_d)
    # load the kernel matrix
    try:
        kNN = np.genfromtxt(fmat, dtype=float)
    except Exception as err:
        # chain the cause so the underlying I/O or parse error is not lost
        raise ValueError('Cannot load the kernel matrix') from err

    print("loaded", fmat)
    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")
        if tags.ndim > 1:
            tags = tags[:, 0]
        ndict = len(tags)

    asapxyz = None
    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
    elif output == 'xyz':
        print(
            "Did not provide the xyz file. We can only output descriptor matrix."
        )
        output = 'matrix'

    # main thing
    proj = KernelPCA(kpca_d).fit_transform(kNN)

    # save
    if output == 'matrix':
        np.savetxt(foutput + ".coord",
                   proj,
                   fmt='%4.8f',
                   header='low D coordinates of samples')
    elif output == 'xyz':
        # keep a backup of an existing output file before it is replaced
        if os.path.isfile(foutput + ".xyz"):
            os.rename(foutput + ".xyz", "bck." + foutput + ".xyz")
        asapxyz.set_descriptors(proj, 'kpca_coord')
        asapxyz.write(foutput)

    # color scheme
    plotcolor, colorlabel, colorscale = set_color_function(
        fcolor, asapxyz, colorscol, len(proj))

    # make plot
    plot_styles.set_nice_font()

    fig, ax = plot_styles.plot_density_map(proj[:, [pc1, pc2]],
                                           plotcolor,
                                           xlabel='Principal Axis ' + str(pc1),
                                           ylabel='Principal Axis ' + str(pc2),
                                           clabel=colorlabel,
                                           label=None,
                                           xaxis=True,
                                           yaxis=True,
                                           centers=None,
                                           psize=None,
                                           out_file=None,
                                           title='KPCA for: ' + prefix,
                                           show=False,
                                           cmap='gnuplot',
                                           remove_tick=False,
                                           use_perc=True,
                                           rasterized=True,
                                           fontsize=15,
                                           vmax=colorscale[1],
                                           vmin=colorscale[0])

    fig.set_size_inches(18.5, 10.5)

    if ftags != 'none':
        texts = []
        # mark and label every sample that carries a non-empty tag
        for i in range(ndict):
            if tags[i] != 'None' and tags[i] != 'none' and tags[i] != '':
                ax.scatter(proj[i, pc1], proj[i, pc2], marker='^', c='black')
                texts.append(
                    ax.text(proj[i, pc1],
                            proj[i, pc2],
                            tags[i],
                            ha='center',
                            va='center',
                            fontsize=15,
                            color='red'))
        if adtext:
            from adjustText import adjust_text
            adjust_text(
                texts,
                on_basemap=True,  # only_move={'points':'', 'text':'x'},
                expand_text=(1.01, 1.05),
                expand_points=(1.01, 1.05),
                force_text=(0.03, 0.5),
                force_points=(0.01, 0.25),
                ax=ax,
                precision=0.01,
                arrowprops=dict(arrowstyle="-", color='black', lw=1,
                                alpha=0.8))

    plt.show()
    fig.savefig('KPCA_4_' + prefix + '-c-' + fcolor + '.png')
Ejemplo n.º 8
0
parser.add_argument("--stride", "-s", type=int,
                    help="stride for asap gen_desc command; this function will be deprecated")
parser.add_argument("--method", "-m", type=str, default='fps',
                    help="method, 3 options: 'random', 'cur', 'fps'")
args = parser.parse_args()

# NOTE(review): `args.input` and `args.number` are read below but their
# add_argument calls are not part of this excerpt - confirm they exist above.
fxyz = args.input
fmat = ['*']
nkeep = args.number
prefix = "test-frame-select"

# read the xyz file
asapxyz = ASAPXYZ(fxyz)
# every selection algorithm works on the descriptor matrix
desc, _ = asapxyz.get_descriptors(fmat)
print("shape of the descriptor matrix: ", np.shape(desc), "number of descriptors: ", np.shape(desc[0]))

algorithm = args.method  # 3 options: 'random', 'cur', 'fps'

sparsifier = Sparsifier(algorithm)
sbs = sparsifier.sparsify(desc, nkeep)
sbs.sort()
if args.stride is not None:
    # Scale the selected indices element-wise. np.asarray guards against
    # `sbs` being a plain list, where `*` would repeat the list instead.
    sbs = np.asarray(sbs) * args.stride
def main(fxyz, dictxyz, prefix, output, peratom, fsoap_param, soap_rcut, soap_g, soap_n, soap_l, zeta_list, kernel_type, element_wise, soap_periodic, stride):
    """

    Generate the SOAP descriptors.

    Parameters
    ----------
    fxyz: string giving location of xyz file
    dictxyz: not referenced anywhere in this function body
    prefix: string giving the filename prefix
    output: [xyz]: append the SOAP descriptors to extended xyz file; [matrix]: output as a standalone matrix
            (the code compares against the literal strings 'xyz' and 'matrix')
    peratom: in 'matrix' mode, also write the atomic descriptor matrix
    fsoap_param: use (possibly multiple sets) of SOAP descriptors using parameters specified in fsoap_param file (json format);
                 if it is not an existing file it is passed to universal_soap_hyper instead
    soap_rcut: float giving the cutoff radius, default value is 3.0
    soap_g: float giving the atom width
    soap_n: int giving the maximum radial label
    soap_l: int giving the maximum angular label. Must be less than or equal to 9
    zeta_list : get the global descriptor from atomic ones of zeta th power
    kernel_type: type of operations to get global descriptors from the atomic soap vectors
    element_wise: consider different species separately when computing global descriptors from the atomic soap vectors
    soap_periodic: string (True or False) indicating whether the system is periodic
    stride: compute descriptor each X frames (not referenced in this function body)

    Raises
    ------
    IOError: the json parameter file exists but cannot be loaded
    ValueError: `output` is neither 'matrix' nor 'xyz'
    """

    # read frames
    asapxyz = ASAPXYZ(fxyz)

    if fsoap_param is not None:
        import json
        # load the parameter from json file
        if os.path.isfile(fsoap_param):
            try:
                with open(fsoap_param, 'r') as soapfile:
                    soap_js = json.load(soapfile)
            except:
                raise IOError('Cannot load the json file for soap parameters')

        # use the default parameters
        # NOTE(review): `global_species` is not defined in this function; it
        # must come from module scope, or lines are missing from this excerpt.
        else: 
            soap_js = universal_soap_hyper(global_species, fsoap_param, dump=True)

        # make descriptors: one SOAP object per entry of the parameter dictionary
        soap_desc_atomic = []
        for element in soap_js.keys():
            soap_param = soap_js[element]
            [species_now, cutoff_now, g_now, n_now, l_now] = [soap_param['species'], soap_param['cutoff'],
                                                              soap_param['atom_gaussian_width'], soap_param['n'],
                                                              soap_param['l']]
            soap_desc_atomic.append(SOAP(species=species_now, rcut=cutoff_now, nmax=n_now, lmax=l_now,
                                         sigma=g_now, rbf="gto", crossover=False, average=False,
                                         periodic=soap_periodic))

        foutput = prefix + "-soapparam" + '-' + fsoap_param
        desc_name = "SOAPPARAM" + '-' + fsoap_param

    else:
        # single SOAP object built from the explicit soap_* arguments
        soap_desc_atomic = [SOAP(species=global_species, rcut=soap_rcut, nmax=soap_n, lmax=soap_l,
                                 sigma=soap_g, rbf="gto", crossover=False, average=False, periodic=soap_periodic)]
        foutput = prefix + "-n" + str(soap_n) + "-l" + str(soap_l) + "-c" + str(soap_rcut) + "-g" + str(soap_g)
        desc_name = "SOAP" + "-n" + str(soap_n) + "-l" + str(soap_l) + "-c" + str(soap_rcut) + "-g" + str(soap_g)
    # NOTE(review): `frames` is not defined in this function - confirm where
    # it is supposed to come from (module scope or the ASAPXYZ object).
    for i, frame in enumerate(frames):
        # atomic descriptors of this frame; results of further SOAP objects
        # are appended along axis 1
        fnow = soap_desc_atomic[0].create(frame, n_jobs=8)

        for soap_desc_atomic_now in soap_desc_atomic[1:]:
            fnow = np.append(fnow, soap_desc_atomic_now.create(frame, n_jobs=8), axis=1)
            
        # the key under which the global descriptor is stored in frame.info
        # depends on kernel_type and element_wise
        if kernel_type == 'average' and element_wise == False and len(zeta_list)==1 and zeta_list[0]==1:
            # this is the vanilla situation. We just take the average soap for all atoms
            frame.info[desc_name] = Atomic_2_Global_Descriptor_By_Species(fnow, [], [], kernel_type, zeta_list)
        elif element_wise == False:
            frame.info[desc_name+'-'+kernel_type] = Atomic_2_Global_Descriptor_By_Species(fnow, [], [], kernel_type, zeta_list)
        else:
            frame.info[desc_name+'-'+kernel_type+'-elementwise'] = Atomic_2_Global_Descriptor_By_Species(fnow, frame.get_atomic_numbers(), global_species, kernel_type, zeta_list)

        # save
        # NOTE(review): this step is indented inside the per-frame loop, so it
        # runs once per frame - confirm that is intended. `nframes` is also
        # not defined in this function.
        if output == 'matrix':
            asapxyz.write_descriptor_matrix(desc_name, desc_name)
            if peratom or nframes == 1:
                asapxyz.write_atomic_descriptor_matrix(desc_name, desc_name)
        elif output == 'xyz':
           asapxyz.write(foutput)
        else:
            raise ValueError('Cannot find the output format')
Ejemplo n.º 10
0
def load_asapxyz(data_spec):
    """
    Build an ASAPXYZ object from a data specification mapping.

    The values stored under 'fxyz', 'stride', 'periodic' and 'fxyz_format'
    are passed to the ASAPXYZ constructor positionally, in that order.
    """
    from asaplib.data import ASAPXYZ
    constructor_args = [
        data_spec[key]
        for key in ('fxyz', 'stride', 'periodic', 'fxyz_format')
    ]
    return ASAPXYZ(*constructor_args)
Ejemplo n.º 11
0
def main(fmat, fxyz, fy, prefix, test_ratio, jitter, n_sparse, sigma,
         lc_points, lc_repeats):
    """
    Sparse kernel ridge regression on a precomputed kernel matrix.

    Parameters
    ----------
    fmat: Location of kernel matrix file.
    fxyz: Location of the xyz file, or 'none'; only used to read the property
          when `fy` cannot be loaded as a file.
    fy: Location of property list (1D-array of floats), or the name of a property in the xyz file
    prefix: filename prefix for the output files
    test_ratio: train/test ratio; when not positive, the full data set is used for both training and testing
    jitter: jitter level, default is 1e-10
    n_sparse: number of representative samples; 0 selects 5% of the training data,
              a negative value disables sparsification
    sigma: noise level in kernel ridge regression; a negative value selects
           0.1% of the standard deviation of the training data.
    lc_points : number of points on the learning curve
    lc_repeats : number of sub-sampling when compute the learning curve

    Returns
    -------
    None. Writes "KRR_train_test_errors_4<prefix>.json", optionally
    "KRR_learning_curve_4<prefix>.dat", and "KRR_4_<prefix>.png"; shows the plots.

    Raises
    ------
    Exception: the kernel matrix or the xyz file could not be loaded.
    ValueError: the number of properties does not match the number of samples,
                or `n_sparse` is not smaller than the training set size.
    """

    # if it has been computed before we can simply load it
    try:
        K_all = np.genfromtxt(fmat, dtype=float)
    except OSError as err:
        raise Exception(
            'fmat file could not be loaded. Please check the filename') from err
    print("loaded", fmat)

    # read in the properties to be predicted: first as a file, then as a
    # property name inside the xyz file
    y_all = []
    try:
        y_all = np.genfromtxt(fy, dtype=float)
    except Exception:
        try:
            # try to read the xyz file
            if fxyz != 'none':
                asapxyz = ASAPXYZ(fxyz)
                y_all = asapxyz.get_property(fy)
        except OSError as err:
            raise Exception(
                'property vector file could not be loaded. Please check the filename'
            ) from err

    if len(y_all) != len(K_all):
        raise ValueError(
            'Length of the vector of properties is not the same as number of samples'
        )

    # train test split
    if test_ratio > 0:
        K_train, K_test, y_train, y_test, _, _ = kernel_random_split(
            K_all, y_all, test_ratio)
    else:
        K_train = K_test = K_all
        y_train = y_test = y_all
    n_train = len(K_train)
    n_test = len(K_test)

    # set default value of n_sparse
    if n_sparse == 0:
        n_sparse = n_train // 20
    # sparsification
    if n_sparse >= n_train:
        # Previously this case only printed a message and then failed with a
        # NameError because K_MM / K_NM / K_TM were never assigned.
        raise ValueError(
            "the number of representative structure is too large, please select n < {}"
            .format(n_train))
    elif n_sparse > 0:
        # select representative samples and restrict the kernel blocks to them
        ifps, dfps = fps(K_train, n_sparse, 0)
        K_MM = K_train[:, ifps][ifps]
        K_NM = K_train[:, ifps]
        K_TM = K_test[:, ifps]
    else:
        print("it's usually better to use some sparsification")
        K_MM = K_train
        K_NM = K_train
        K_TM = K_test

    # if sigma is not set...
    if sigma < 0:
        sigma = 0.001 * np.std(y_train)

    delta = np.std(y_train) / (np.trace(K_MM) / len(K_MM))
    krr = KRRSparse(jitter, delta, sigma)
    # fit the model
    krr.fit(K_MM, K_NM, y_train)

    # get the predictions for train set
    # NOTE(review): this result is overwritten by the next call; kept in case
    # predict() has side effects - confirm and drop if it does not.
    y_pred = krr.predict(K_NM)
    # compute the CV score for the dataset
    y_pred, y_pred_test, fit_error = krr.get_train_test_error(K_NM,
                                                              y_train,
                                                              K_TM,
                                                              y_test,
                                                              verbose=True,
                                                              return_pred=True)
    # dump to file
    import json
    with open('KRR_train_test_errors_4' + prefix + '.json', 'w') as fp:
        json.dump(fit_error, fp)

    # the learning curve is only computed (and plotted) with sparsification
    make_learning_curve = lc_points > 1 and n_sparse > 0

    # learning curve
    # decide train sizes
    if make_learning_curve:
        train_sizes = exponential_split(n_sparse, n_train - n_test, lc_points)
        print("Learning curves using train sizes: ", train_sizes)
        lc_stats = lc_repeats * np.ones(lc_points, dtype=int)
        lc = LCSplit(ShuffleSplit,
                     n_repeats=lc_stats,
                     train_sizes=train_sizes,
                     test_size=n_test,
                     random_state=10)

        lc_scores = LC_SCOREBOARD(train_sizes)
        for lctrain, _ in lc.split(y_train):
            Ntrain = len(lctrain)
            lc_K_NM = K_NM[lctrain, :]
            lc_y_train = y_train[lctrain]
            # here we always use the same test set
            # otherwise, one can do `lc_K_test = K_NM[lctest,:]; lc_y_test = y_train[lctest]`
            krr.fit(K_MM, lc_K_NM, lc_y_train)
            # here we always use the same test set
            _, lc_score_now = krr.fit_predict_error(K_MM, lc_K_NM, lc_y_train,
                                                    K_TM, y_test)
            lc_scores.add_score(Ntrain, lc_score_now)

        sc_name = 'RMSE'  #     MAE, RMSE, SUP, R2, CORR
        lc_results = lc_scores.fetch(sc_name)
        # output learning curve
        np.savetxt("KRR_learning_curve_4" + prefix + ".dat", lc_results)

    plot_styles.set_nice_font()

    # two panels when there is a learning curve to show, one otherwise
    if make_learning_curve:
        fig = plt.figure(figsize=(8 * 2.1, 8))
        ax = fig.add_subplot(121)
    else:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(111)
    ax.plot(y_train, y_pred, 'b.', label='train')
    ax.plot(y_test, y_pred_test, 'r.', label='test')
    ax.legend()
    ax.set_title('KRR for: ' + fy)
    ax.set_xlabel('actual y')
    ax.set_ylabel('predicted y')

    if make_learning_curve:
        ax2 = fig.add_subplot(122)
        ax2.errorbar(lc_results[:, 0],
                     lc_results[:, 1],
                     yerr=lc_results[:, 2],
                     linestyle='',
                     uplims=True,
                     lolims=True)
        ax2.set_title('Learning curve')
        ax2.set_xlabel('Number of training samples')
        ax2.set_ylabel('Test {}'.format(sc_name))
        ax2.set_xscale('log')
        ax2.set_yscale('log')

    plt.show()
    fig.savefig('KRR_4_' + prefix + '.png')
Ejemplo n.º 12
0
# Integer class labels: 100 samples labelled 1, 250 labelled 0, 75 labelled 2.
r1_tag = np.ones(100).astype(int)
r3_tag = (np.ones(250) * 0).astype(int)
ppv_tag = (np.ones(75) * 2).astype(int)
# 350 labels (first two groups only)
r1_r3_tag = np.concatenate((r1_tag, r3_tag))

# 425 labels (all three groups)
r1_r3_ppv_tag = np.concatenate((r1_tag, r3_tag, ppv_tag))

# write both label vectors to the current directory, one integer per line
np.savetxt('tag', r1_r3_tag, fmt='%d')
np.savetxt('tag_ppv', r1_r3_ppv_tag, fmt='%d')

#fmat = 'pca_coord'

# name of the descriptor to read from the xyz file
fmat = 'skpca-d-10'
#fmat = '[*]'
# NOTE(review): `fxyz` is not assigned in this excerpt - it must be defined earlier.
asapxyz = ASAPXYZ(fxyz)

dm, _ = asapxyz.get_descriptors(fmat, False)
# presumably the second argument is an atomic number (12 = Mg, 8 = O,
# 14 = Si, matching the variable names) - confirm against ASAPXYZ
dm_mg = asapxyz.get_atomic_descriptors(fmat, 12)
dm_oxygen = asapxyz.get_atomic_descriptors(fmat, 8)
dm_silicon = asapxyz.get_atomic_descriptors(fmat, 14)

plotcolor_volume, _, _, _ = set_color_function('volume', asapxyz)
# element-wise conversion: 29.889703 / volume / 3
# NOTE(review): the meaning and units of these constants are not stated in
# the source - document them.
plotcolor_density = np.zeros(len(plotcolor_volume))
for i in range(len(plotcolor_volume)):
    plotcolor_density[i] = 29.889703 / plotcolor_volume[i] / 3.

#tags = np.loadtxt('ice-54-labels.dat', dtype="str")[:,0]

#iceornot_hydrogen, _, _, _ = set_color_function('ice-or-not.tag', asapxyz, 0, 0, False, True, 1, False)
iceornot_oxygen, _, _, _ = set_color_function('tag', asapxyz, 0, 0, False,
Ejemplo n.º 13
0
def main(fmat, fxyz, ftags, fcolor, colorscol, prefix, output, peratom, keepraw, scale, pca_d, pc1, pc2, projectatomic, plotatomic,
         adtext):
    """
    Project the descriptors with PCA, save the low-dimensional coordinates and plot the map.

    Parameters
    ----------
    fmat: Location of descriptor matrix file or name of the tags in ase xyz file. You can use gen_descriptors.py to compute it.
          It is indexed as a sequence: fmat[0] is also tried as the path of a standalone matrix file.
    fxyz: Location of xyz file for reading the properties ('none' to skip).
    ftags: Location of tags for the first M samples ('none' to skip). Plot the tags on the PCA map.
    fcolor: Location of a file or name of the tags in ase xyz file. It should contain properties for all samples (N floats) used to color the scatterplot
    colorscol: The column number of the properties used for the coloring. Starts from 0.
    prefix: Filename prefix, default is ASAP
    output: The format for output files ([xyz], [matrix]). Default is xyz. Forced to 'matrix' when no xyz file is given.
    peratom: Whether to output per atom pca coordinates (True/False)
    keepraw: Whether to keep the high dimensional descriptor when output is an xyz file (True/False)
    scale: Scale the coordinates (True/False). Scaling highly recommended.
    pca_d: Number of the principal components to keep
    pc1: Index of the principal axis plotted along x
    pc2: Index of the principal axis plotted along y
    projectatomic: build the projection using the (big) atomic descriptor matrix
    plotatomic: Plot the PCA coordinates of all atomic environments (True/False)
    adtext: Whether to adjust the texts (True/False)

    Returns
    -------
    None. Coordinates are written to disk and the figure is shown.

    Raises
    ------
    ValueError: if the standalone matrix file cannot be parsed, if no descriptor matrix
                is available at all, or if per-atom output is requested without atomic descriptors.
    """

    foutput = prefix + "-pca-d" + str(pca_d)
    use_atomic_desc = (peratom or plotatomic or projectatomic)
    # `and` binds tighter than `or`; the parentheses only spell out what the
    # expression already meant. The per-atom projection is needed whenever it is
    # written to disk (peratom) or plotted separately from the main projection.
    need_atomic_proj = peratom or (plotatomic and not projectatomic)

    # start empty so that a missing input reaches the sanity check below
    # instead of failing with a NameError
    desc = []
    desc_atomic = []

    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, desc_atomic = asapxyz.get_descriptors(fmat, use_atomic_desc)
        if projectatomic:
            desc = desc_atomic.copy()
    else:
        asapxyz = None
        print("Did not provide the xyz file. We can only output descriptor matrix.")
        output = 'matrix'
    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
        except Exception as err:
            # keep the original cause attached for debugging
            raise ValueError('Cannot load the descriptor matrix from file') from err
        print("loaded the descriptor matrix from file: ", fmat)
    # sanity check
    if len(desc) == 0:
        raise ValueError('Please supply descriptor in a xyz file or a standlone descriptor matrix')
    print("shape of the descriptor matrix: ", np.shape(desc), "number of descriptors: ", np.shape(desc[0]))
    if need_atomic_proj and len(desc_atomic) == 0:
        raise ValueError('Per-atom coordinates were requested but no atomic descriptors are available')

    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")[:]
    else:
        tags = []

    # A single PCA step. Other reducers (e.g. 'UMAP', or a chain such as
    # PCA followed by 'TSNE') can be configured with the same dictionary layout.
    reduce_dict = {
        "pca": {"type": 'PCA', 'parameter': {"n_components": pca_d, "scalecenter": scale}}
    }
    dreducer = Dimension_Reducers(reduce_dict)

    proj = dreducer.fit_transform(desc)
    if need_atomic_proj:
        proj_atomic_all = dreducer.transform(desc_atomic)

    # save
    if output == 'matrix':
        np.savetxt(foutput + ".coord", proj, fmt='%4.8f', header='low D coordinates of samples')
        if peratom:
            np.savetxt(foutput + "-atomic.coord", proj_atomic_all, fmt='%4.8f', header='low D coordinates of samples')
    if output == 'xyz':
        asapxyz.set_descriptors(proj, 'pca_coord')
        if peratom:
            asapxyz.set_atomic_descriptors(proj_atomic_all, 'pca_coord')
        # remove the raw descriptors
        if not keepraw:
            asapxyz.remove_descriptors(fmat)
            asapxyz.remove_atomic_descriptors(fmat)
        asapxyz.write(foutput)

    # color scheme
    plotcolor, plotcolor_peratom, colorlabel, colorscale = set_color_function(fcolor, asapxyz, colorscol, 0, (peratom or plotatomic), projectatomic)

    if plotatomic:
        outfile = 'PCA_4_' + prefix + '-c-' + fcolor + '-plotatomic.png'
    else:
        outfile = 'PCA_4_' + prefix + '-c-' + fcolor + '.png'

    fig_spec_dict = {
        'outfile': outfile,
        'show': False,
        'title': None,
        'xlabel': 'Principal Axis 1',
        'ylabel': 'Principal Axis 2',
        'xaxis': True,  'yaxis': True,
        'remove_tick': False,
        'rasterized': True,
        'fontsize': 16,
        'components': {
            "first_p": {"type": 'scatter', 'clabel': colorlabel},
            "second_p": {"type": 'annotate', 'adtext': adtext}
        }
    }
    asap_plot = Plotters(fig_spec_dict)
    # the rows are passed in reversed order, as in the other plotting scripts
    asap_plot.plot(proj[::-1, [pc1, pc2]], plotcolor[::-1], [], tags)
    if need_atomic_proj:
        asap_plot.plot(proj_atomic_all[::-1, [pc1, pc2]], plotcolor_peratom[::-1], [], [])
    plt.show()
Ejemplo n.º 14
0
# make tags

# `ele`, `get_nframes`, `fxyz_recal_om8`, `fxyz_recal_pv` and `fxyz` are not
# defined in this fragment; they must be set before this point.
# NOTE(review): `tags` is only assigned when ele is 14 or 12 -- confirm that
# no other value of `ele` reaches code that reads `tags`.
if ele == 14 or ele == 12:
    # hard-coded atom count multiplied into the tag length
    # NOTE(review): presumably the number of atoms of this element per frame -- confirm
    n_atoms = 32
    n_frames = get_nframes(fxyz_recal_om8, 'xyz')
    # label 0, repeated (sum of the frame counts) * n_atoms times
    tag0 = sum(np.array(n_frames).astype(int)) * n_atoms * [0]

    n_frames = get_nframes(fxyz_recal_pv, 'xyz')
    # label 1, repeated (sum of the frame counts) * n_atoms times
    tag1 = sum(np.array(n_frames).astype(int)) * n_atoms * [1]
    # first-file tags followed by second-file tags
    tags = tag0 + tag1
#    tag0=(n_atoms*9 + n_atoms*250)*[0] + n_atoms*100*[1]
###########
########### The following is equivalent to above and have
###########

asapxyz = ASAPXYZ(fxyz)

# specification of the reduction steps, keyed by step name;
# the first step has type 'SCALE' and takes no parameters
reduce_dict = {}
reduce_dict["preprocessing"] = {"type": 'SCALE', 'parameter': None}
reduce_dict['skpca'] = {
    "type": 'SPARSE_KPCA',
    'parameter': {
        "n_components": 3,
        "n_sparse": -1,  # no sparsification
        #                                     "scale":True,
        "kernel": {
            "first_kernel": {
                "type": 'linear'
            }
        }
    }
Ejemplo n.º 15
0
def main(fxyz, prefix):
    """
    Smoke test for the descriptor computation.

    Computes a Coulomb-matrix ("CM") descriptor plus SOAP, ACSF and LMBTR-K2
    atomic descriptors (each combined through the same two reducers), then
    writes the results and saves the state.

    Parameters
    ----------
    fxyz: string giving location of xyz file
    prefix: string giving the filename prefix
    """

    # --- atomic descriptor specifications -------------------------------
    soap_spec = {
        'type': 'SOAP',
        'cutoff': 2.0,
        'n': 2,
        'l': 2,
        'atom_gaussian_width': 0.2,
        'rbf': 'gto',
        'crossover': False
    }
    acsf_spec = {
        'type': 'ACSF',
        'cutoff': 2.0,
        'g2_params': [[1, 1], [1, 2], [1, 3]],
        'g4_params': [[1, 1, 1], [1, 2, 1], [1, 1, -1], [1, 2, -1]]
    }
    k2_term = {
        "geometry": {"function": "distance"},
        "grid": {"min": 0, "max": 2, "n": 10, "sigma": 0.1},
        "weighting": {"function": "exponential", "scale": 0.5, "cutoff": 1e-3}
    }
    lmbtr_spec = {
        'type': 'LMBTR_K2',
        'k2': k2_term,
        'periodic': False,
        'normalization': "l2_each"
    }

    # --- reducers: one shared dictionary used by every atomic descriptor ---
    reducers = {
        'k1': {'reducer_type': 'moment_average', 'zeta': 2, 'element_wise': False},
        'k2': {'reducer_type': 'sum', 'element_wise': True}
    }

    # --- full specification, keyed by output name ------------------------
    spec = {'test_cm': {'type': "CM"}}
    atomic_specs = (
        ('test_soap', {'soap1': soap_spec}),
        ('test_acsf', {'acsf1': acsf_spec}),
        ('test_k2', {'lmbtr-k2': lmbtr_spec}),
    )
    for out_name, atomic in atomic_specs:
        spec[out_name] = {
            'atomic_descriptor': atomic,
            'reducer_function': reducers
        }

    want_peratom = True
    state_tag = 'test'

    # read frames; the trailing False marks the system as not periodic
    frames = ASAPXYZ(fxyz, 1, False)

    # compute the descriptors
    frames.compute_global_descriptors(spec, [], want_peratom, state_tag)

    # write out a subset of the computed descriptors, then everything
    frames.write_computed_descriptors(prefix, ['test_cm', 'test_soap'], [0])
    frames.write(prefix)
    frames.save_state(state_tag)
Ejemplo n.º 16
0
def main(fmat, kmat, fxyz, ftags, prefix, fcolor, colorscol, dimension, pc1, pc2, algorithm, projectatomic, adtext):
    """
    Cluster the samples with a density-based algorithm and plot the clusters on a PCA map.

    Parameters
    ----------
    fmat: Location of descriptor matrix file or name of the tags in ase xyz file. You can use gen_descriptors.py to compute it.
          It is indexed as a sequence: fmat[0] is also tried as the path of a standalone matrix file.
    kmat: Location of the kernel matrix ('none' to skip).
    fxyz: Location of xyz file for reading the properties ('none' to skip).
    ftags: Location of tags for the first M samples ('none' to skip). Loaded but currently not plotted.
    prefix: Filename prefix. Default is ASAP.
    fcolor: Properties for all samples (N floats) used to color the scatter plot,[filename/rho/cluster]
    colorscol: The column number of the properties used for the coloring. Starts from 0.
    dimension: The number of principal components to keep
    pc1: int, default is 0, which principal axis to plot the projection on
    pc2: int, default is 1, which principal axis to plot the projection on
    algorithm: the algorithm for density-based clustering options are: ([dbscan], [fdb])
    projectatomic: build the projection using the (big) atomic descriptor matrix
    adtext: Whether to adjust the text (True/False). Currently unused.

    Returns
    -------
    None. Cluster labels are written to "<prefix>-cluster-label.dat" and the PCA plot is produced.

    Raises
    ------
    ValueError: if neither fmat nor kmat is given, if an input file cannot be parsed,
                if no descriptors are available, or if `algorithm` is unknown.
    """

    if fmat == 'none' and kmat == 'none':
        raise ValueError('Must provide either the low-dimensional coordinates fmat or the kernel matrix kmat')

    # start empty so that a missing input reaches the sanity check below
    # instead of failing later with a NameError
    desc = []

    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, desc_atomic = asapxyz.get_descriptors(fmat, projectatomic)
        if projectatomic:
            desc = desc_atomic.copy()
    else:
        asapxyz = None
        print("Did not provide the xyz file. We can only output descriptor matrix.")

    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
        except Exception as err:
            raise ValueError('Cannot load the descriptor matrix from file') from err
        print("loaded the descriptor matrix from file: ", fmat)

    # a kernel matrix, when given, replaces the descriptors by the derived distances
    if kmat != 'none':
        try:
            kNN = np.genfromtxt(kmat, dtype=float)
            print("loaded kernal matrix", kmat, "with shape", np.shape(kNN))
            desc = kerneltodis(kNN)
        except Exception as err:
            raise ValueError('Cannot load the coordinates') from err

    # sanity check
    if len(desc) == 0:
        raise ValueError('Please supply descriptors via an xyz file, a standalone descriptor matrix or a kernel matrix')

    if ftags != 'none':
        # NOTE: the tags are not plotted at the moment; loading still validates the file
        tags = np.loadtxt(ftags, dtype="str")

    # now we do the clustering
    if algorithm == 'dbscan':
        # we compute the characteristic bandwidth of the data
        n_samples = len(desc)
        # first select a random subset of at most 50 structures
        # (capped, because sampling without replacement cannot exceed n_samples)
        sbs = np.random.choice(np.arange(n_samples), min(50, n_samples), replace=False)
        # the characteristic bandwidth of the data; the percentile is capped at
        # 100, which 100*10/n_samples would exceed for fewer than 10 samples
        sigma_kij = np.percentile(cdist(desc[sbs], desc, 'euclidean'), min(100., 100 * 10. / n_samples))
        trainer = sklearn_DB(sigma_kij, 5, 'euclidean')  # adjust the parameters here!
        do_clustering = DBCluster(trainer)
        do_clustering.fit(desc)

    elif algorithm == 'fdb' or algorithm == 'FDB':
        trainer = LAIO_DB()
        do_clustering = DBCluster(trainer)
        do_clustering.fit(desc)
    else:
        raise ValueError('Please select from fdb or dbscan')

    print(do_clustering.pack())
    #with open("clustering_results_4_" + prefix + ".json", 'w') as fp:
    #    json.dump(do_clustering.pack(), fp, cls=NpEncoder)

    labels_db = do_clustering.get_cluster_labels()
    n_clusters = do_clustering.get_n_cluster()

    # store the labels per atom or per frame, matching what was clustered
    if asapxyz is not None and projectatomic:
        asapxyz.set_atomic_descriptors(labels_db, 'cluster_label')
    elif asapxyz is not None:
        asapxyz.set_descriptors(labels_db, 'cluster_label')

    # save
    np.savetxt(prefix + "-cluster-label.dat", np.transpose([np.arange(len(labels_db)), labels_db]),
               header='index cluster_label', fmt='%d %d')

    # 2D map used only for visualisation
    if fmat != 'none':
        pca = PCA(dimension, True)
        proj = pca.fit_transform(desc)
    elif kmat != 'none':
        proj = KernelPCA(dimension).fit_transform(kNN)

    # color scheme
    if fcolor == 'cluster_label':
        plotcolor = labels_db
        colorlabel = 'cluster_label'
    else:
        # set_color_function yields four values (global colors, per-atom colors,
        # label, scale) at every other call site, so unpack four here as well
        if projectatomic:
            _, plotcolor, colorlabel, colorscale = set_color_function(fcolor, asapxyz, colorscol, 0, True)
        else:
            plotcolor, _, colorlabel, colorscale = set_color_function(fcolor, asapxyz, colorscol, len(proj), False)

    print(labels_db[::-1])

    outfile = 'Clustering_4_' + prefix + '.png'
    # make plot
    fig_spec_dict = {
        'outfile': outfile,
        'show': True,
        'title': None,
        'xlabel': 'Principal Axis 1',
        'ylabel': 'Principal Axis 2',
        'xaxis': True,  'yaxis': True,
        'remove_tick': False,
        'rasterized': True,
        'fontsize': 16,
        'components': {
            "first_p": {"type": 'scatter', 'clabel': colorlabel},
            #"second_p": {"type": 'annotate', 'adtext': adtext},
            "third_p": {"type": 'cluster', 'w_label': True, 'circle_size': 20}
        }
    }
    asap_plot = Plotters(fig_spec_dict)
    asap_plot.plot(proj[::-1, [pc1, pc2]], plotcolor[::-1], labels_db[::-1], [])  # tags)
Ejemplo n.º 17
0
def main(fmat, fxyz, ftags, fcolor, colorscol, prefix, output, peratom, keepraw, scale, tsne_d, dim1, dim2, perplexity,
         projectatomic, plotatomic, adtext):
    """
    Embed the descriptors with t-SNE, save the coordinates and plot the map.

    Parameters
    ----------
    fmat: Location of descriptor matrix file or name of the tags in ase xyz file. You can use gen_descriptors.py to compute it.
          It is indexed as a sequence: fmat[0] is also tried as the path of a standalone matrix file.
    fxyz: Location of xyz file for reading the properties ('none' to skip).
    ftags: Location of tags for the first M samples ('none' to skip). Plot the tags on the t-SNE map.
    fcolor: Location of a file or name of the tags in ase xyz file. It should contain properties for all samples (N floats) used to color the scatterplot
    colorscol: The column number of the properties used for the coloring. Starts from 0.
    prefix: Filename prefix, default is ASAP
    output: The format for output files ([xyz], [matrix]). Default is xyz. Forced to 'matrix' when no xyz file is given.
    peratom: Whether to output per atom t-SNE coordinates (True/False). Not implemented.
    keepraw: Whether to keep the high dimensional descriptor when output is an xyz file (True/False)
    scale: Scale the coordinates (True/False). Scaling highly recommended.
    tsne_d: Dimension of the embedded space.
    dim1: Index of the embedded axis plotted along x
    dim2: Index of the embedded axis plotted along y
    perplexity: Perplexity setting for t-SNE: Typical values between 5 and 50.
    projectatomic: build the projection using the (big) atomic descriptor matrix
    plotatomic: Plot the coordinates of all atomic environments (True/False). Only supported together with projectatomic.
    adtext: Whether to adjust the texts (True/False)

    Returns
    -------
    None. Coordinates are written to disk and the figure is shown and saved.

    Raises
    ------
    NotImplementedError: if peratom is set, or plotatomic is set without projectatomic.
    ValueError: if the standalone matrix file cannot be parsed or no descriptor matrix is available.
    """

    # Projecting atomic environments that were not part of the fit is not
    # implemented. Fail before doing any expensive work; this is the same
    # condition that used to be checked only after the t-SNE fit.
    if peratom or (plotatomic and not projectatomic):
        raise NotImplementedError('per-atom t-SNE coordinates are only available with projectatomic')

    # NOTE: the "-pca-d" suffix and the 'pca_coord' key below are kept for
    # compatibility with existing output names, although this is a t-SNE map
    foutput = prefix + "-pca-d" + str(tsne_d)
    use_atomic_desc = (peratom or plotatomic or projectatomic)

    # start empty so that a missing input reaches the sanity check below
    # instead of failing with a NameError
    desc = []

    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, desc_atomic = asapxyz.get_descriptors(fmat, use_atomic_desc)
        if projectatomic:
            desc = desc_atomic.copy()
    else:
        asapxyz = None
        print("Did not provide the xyz file. We can only output descriptor matrix.")
        output = 'matrix'
    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
        except Exception as err:
            raise ValueError('Cannot load the descriptor matrix from file') from err
        print("loaded the descriptor matrix from file: ", fmat)
    # sanity check
    if len(desc) == 0:
        raise ValueError('Please supply descriptor in a xyz file or a standlone descriptor matrix')
    print("shape of the descriptor matrix: ", np.shape(desc), "number of descriptors: ", np.shape(desc[0]))

    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")[:]
    else:
        tags = []

    # scale & center
    if scale:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        print('Shape of descriptor matrix is {}'.format(desc.shape))
        print(scaler.fit(desc))
        desc = scaler.transform(desc)  # normalizing the features

    # fit t-SNE

    if desc.shape[1] >= 50:
        # pre-process with PCA when there are 50 or more features
        # suggested here: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html

        pca = PCA(n_components=50)
        desc = pca.fit_transform(desc)
        print('Shape of processed descriptor matrix after applying PCA is {}'.format(desc.shape))

    tsne = TSNE(n_components=tsne_d, perplexity=perplexity)
    proj = tsne.fit_transform(desc)

    # save
    if output == 'matrix':
        np.savetxt(foutput + ".coord", proj, fmt='%4.8f', header='low D coordinates of samples')
    if output == 'xyz':
        # keep a backup of an existing output file
        if os.path.isfile(foutput + ".xyz"):
            os.rename(foutput + ".xyz", "bck." + foutput + ".xyz")
        asapxyz.set_descriptors(proj, 'pca_coord')
        # remove the raw descriptors
        if not keepraw:
            asapxyz.remove_descriptors(fmat)
            asapxyz.remove_atomic_descriptors(fmat)
        asapxyz.write(foutput)

    # color scheme
    plotcolor, plotcolor_peratom, colorlabel, colorscale = set_color_function(fcolor, asapxyz, colorscol, 0, (peratom or plotatomic), projectatomic)

    # make plot
    plot_styles.set_nice_font()
    fig, ax = plt.subplots()
    # notice that we reverse the list of coordinates, in order to make the structures in the dictionary more obvious
    fig, ax = plot_styles.plot_density_map(proj[::-1, [dim1, dim2]], plotcolor[::-1], fig, ax,
                                           xlabel='Principal Axis ' + str(dim1), ylabel='Principal Axis ' + str(dim2),
                                           clabel=colorlabel, label=None,
                                           xaxis=True, yaxis=True,
                                           centers=None,
                                           psize=None,
                                           out_file=None,
                                           title='t-SNE for: ' + prefix,
                                           show=False, cmap='gnuplot',
                                           remove_tick=False,
                                           use_perc=False,
                                           rasterized=True,
                                           fontsize=15,
                                           vmax=colorscale[1],
                                           vmin=colorscale[0])

    # NOTE(review): 160.5 x 80.5 inches is an unusually large canvas -- confirm
    # that this is intended and not a typo for a ten times smaller figure
    fig.set_size_inches(160.5, 80.5)

    if ftags != 'none':
        texts = []
        for i, tag in enumerate(tags):
            # skip placeholder entries
            if tag != 'None' and tag != 'none' and tag != '':
                ax.scatter(proj[i, dim1], proj[i, dim2], marker='^', c='black')
                texts.append(ax.text(proj[i, dim1], proj[i, dim2], tag,
                                     ha='center', va='center', fontsize=10, color='red'))
        if adtext:
            from adjustText import adjust_text
            adjust_text(texts, on_basemap=True,  # only_move={'points':'', 'text':'x'},
                        expand_text=(1.01, 1.05), expand_points=(1.01, 1.05),
                        force_text=(0.03, 0.5), force_points=(0.01, 0.25),
                        ax=ax, precision=0.01,
                        arrowprops=dict(arrowstyle="-", color='black', lw=1, alpha=0.8))

    plt.show()
    if plotatomic:
        fig.savefig('t-SNE_4_' + prefix + '-c-' + fcolor + '-plotatomic.png')
    else:
        fig.savefig('t-SNE_4_' + prefix + '-c-' + fcolor + '.png')
Ejemplo n.º 18
0
def main(fxyz, fy, prefix, nkeep, algorithm, fmat, fkde, reweight_lambda):
    r"""
    Select frames from the supplied xyz file (fxyz) using one of the following algorithms:

    1. random: random selection
    2. fps: farthest point sampling selection. Need to supply a kernel matrix or descriptor matrix using -fmat
    3. sortmax/sortmin: select the nkeep frames with the largest/smallest value. Need to supply the vector of
       properties using -fy
    4. cur: CUR decomposition
    5. reweight: keep each frame with a probability proportional to exp(f/\lambda) / exp(f),
       where f is the precomputed log kernel density of the frame read from fkde.

    Parameters
    ----------
    fxyz: Path to xyz file.
    fy: Path to the list of properties (N floats) or name of the tags in ase xyz file ('none' to skip)
    prefix: Filename prefix, default is ASAP
    nkeep: The number of representative samples to select (0 means all frames)
    algorithm: the algorithm for selecting frames ([random], [fps], [cur], [sortmax], [sortmin], [reweight])
    fmat: Location of descriptor or kernel matrix file. Needed if you select [fps] or [cur].
          You can use gen_kmat.py to compute it.
    fkde: Location of the file holding the (log of) kernel density of each sample in its second column.
          Needed if you select [reweight].
    reweight_lambda: the \lambda used by [reweight], see above.

    Returns
    -------
    None. Writes "<prefix>-<algorithm>-n-<nkeep>.index" (0/1 flag per frame), the selected
    properties when fy is given, and the selected frames.

    Raises
    ------
    ValueError: on inconsistent input sizes, missing required inputs or an unknown algorithm.
    IOError: if the kernel density file cannot be parsed.
    """

    # read the xyz file
    asapxyz = ASAPXYZ(fxyz)
    nframes = asapxyz.get_num_frames()

    if nkeep == 0:
        nkeep = nframes

    if fy != 'none':
        try:
            y_all = np.genfromtxt(fy, dtype=float)
        except Exception:
            # fy is not a readable file: treat it as the name of a property in the xyz file
            y_all = asapxyz.get_property(fy)
        if len(y_all) != nframes:
            raise ValueError(
                'Length of the vector of properties is not the same as number of samples'
            )

    if algorithm == 'random' or algorithm == 'RANDOM':
        sbs = np.random.choice(np.arange(nframes), nkeep, replace=False)

    elif algorithm == 'sortmax' or algorithm == 'sortmin':
        if fy == 'none':
            raise ValueError(
                'must supply the vector of properties for sorting')

        # frame indices ordered by increasing property value
        ascending = np.argsort(np.asarray(y_all), kind='stable')
        if algorithm == 'sortmax':
            # the nkeep largest values, largest first
            sbs = ascending[::-1][:nkeep].tolist()
        else:
            # the nkeep smallest values, smallest first
            sbs = ascending[:nkeep].tolist()

    elif algorithm == 'fps' or algorithm == 'FPS' or algorithm == 'cur' or algorithm == 'CUR':
        # for both algo we read in the descriptor matrix
        desc, _ = asapxyz.get_descriptors(fmat)
        if os.path.isfile(fmat):
            try:
                desc = np.genfromtxt(fmat, dtype=float)
            except Exception as err:
                raise ValueError('Cannot load the kernel matrix') from err
        print("shape of the descriptor matrix: ", np.shape(desc),
              "number of descriptors: ", np.shape(desc[0]))

        if algorithm == 'fps' or algorithm == 'FPS':
            # FPS
            sbs, dmax_remain = fps(desc, nkeep, 0)
            print("Making farthest point sampling selection")
            np.savetxt(prefix + "-" + algorithm + "-n-" + str(nkeep) +
                       '.error',
                       dmax_remain,
                       fmt='%4.8f',
                       header='the maximum remaining distance in FPS')
        else:
            # CUR decomposition
            desc = np.asmatrix(desc)
            cov = np.dot(desc, desc.T)
            print("Making CUR selection")
            print("shape of the covariance matrix:", np.shape(cov))
            sbs, rcov_error = CUR_deterministic(cov, nkeep)
            np.savetxt(prefix + "-" + algorithm + "-n-" + str(nkeep) +
                       '.error',
                       rcov_error,
                       fmt='%4.8f',
                       header='the remaining error of the covariance matrix')

    elif algorithm == 'reweight':
        if not os.path.isfile(fkde):
            raise ValueError(
                'must suply the (log of) kernel density for each sample')
        try:
            logkde = np.genfromtxt(fkde, dtype=float)[:, 1]
        except Exception as err:
            raise IOError(
                'Cannot load the (log of) kernel density for each sample') from err
        if len(logkde) != nframes:
            raise ValueError(
                'mismatch of number of frames and kernel densities')

        # unnormalised acceptance weight of every frame
        new_kde = np.exp(logkde / reweight_lambda) / np.exp(logkde)
        # compute the normalization factor so we expect to select n samples in the end
        new_kde *= nkeep / np.sum(new_kde)
        # accept each frame independently with probability new_kde[i]
        randomchoice = np.random.rand(nframes)
        sbs = np.flatnonzero(randomchoice < new_kde).tolist()
        algorithm = algorithm + "-lambda-" + str(reweight_lambda)

    else:
        # without this branch an unknown name only surfaced as a NameError on `sbs`
        raise ValueError(
            'Please select the algorithm from random, fps, cur, sortmax, sortmin or reweight')

    # save
    selection = np.zeros(nframes, dtype=int)
    selection[np.asarray(sbs, dtype=int)] = 1
    np.savetxt(prefix + "-" + algorithm + "-n-" + str(nkeep) + '.index',
               selection,
               fmt='%d')
    if fy != 'none':
        np.savetxt(prefix + "-" + algorithm + "-n-" + str(nkeep) + '-' + fy,
                   np.asarray(y_all)[sbs],
                   fmt='%4.8f')
    asapxyz.write(prefix + "-" + algorithm + "-n-" + str(nkeep), sbs)
Ejemplo n.º 19
0
def main():
    """
    Smoke test for the dimensionality reduction.

    Reads the SOAP descriptors stored in 'small_molecules-SOAP.xyz' next to
    this file, reduces them with a SCALE step followed by a sparse kernel PCA
    with a normalized linear kernel, writes the coordinates back as
    'pca_coord' and plots the first two components.
    """
    # --- settings ---------------------------------------------------------
    here = os.path.split(__file__)[0]
    xyz_path = os.path.join(here, 'small_molecules-SOAP.xyz')
    descriptor_names = ['SOAP-n4-l3-c1.9-g0.23']
    fcolor = 'dft_formation_energy_per_atom_in_eV'
    n_components = 10
    prefix = "test-dimensionality-reduction"
    out_name = "{}-pca-d{}".format(prefix, n_components)

    # --- load the descriptors from the xyz file ---------------------------
    frames = ASAPXYZ(xyz_path)
    desc, _ = frames.get_descriptors(descriptor_names, False)
    print(desc)

    # --- reducer chain ----------------------------------------------------
    # Other chains (plain 'PCA', 'UMAP' after 'SCALE', or 'PCA' followed by
    # 'TSNE') use the same dictionary layout.
    linear_kernel = {"first_kernel": {"type": 'linear', "normalize": True}}
    reduce_dict = {
        "preprocessing": {"type": 'SCALE', 'parameter': None},
        "skpca": {
            "type": 'SPARSE_KPCA',
            'parameter': {"n_components": n_components, "kernel": linear_kernel}
        }
    }
    proj = Dimension_Reducers(reduce_dict).fit_transform(desc)

    # --- save -------------------------------------------------------------
    frames.set_descriptors(proj, 'pca_coord')
    frames.write(out_name)

    # --- plot -------------------------------------------------------------
    # only the global colors and the label are needed here
    plotcolor, _, colorlabel, _ = set_color_function(fcolor, frames)

    plot_components = {
        "first_p": {"type": 'scatter', 'clabel': colorlabel},
        "second_p": {"type": 'annotate', 'adtext': False}
    }
    fig_spec_dict = {
        'outfile': 'PCA_4_' + prefix + '-c-' + fcolor + '.png',
        'show': False,
        'title': None,
        'xlabel': 'Principal Axis 1',
        'ylabel': 'Principal Axis 2',
        'xaxis': True,
        'yaxis': True,
        'remove_tick': False,
        'rasterized': True,
        'fontsize': 16,
        'components': plot_components
    }
    plotter = Plotters(fig_spec_dict)
    # rows are passed in reversed order
    plotter.plot(proj[::-1, [0, 1]], plotcolor[::-1], [], [])
    plt.show()
Ejemplo n.º 20
0
def main(fmat, fxyz, ftags, fcolor, colorscol, prefix, output, peratom,
         keepraw, sparse_mode, n_sparse, power, kpca_d, pc1, pc2,
         projectatomic, plotatomic, adjusttext):
    """
    Build a sparse kernel PCA map of a descriptor matrix, save the
    low-dimensional coordinates and plot two of the principal axes.

    Parameters
    ----------
    fmat: Location of descriptor matrix file or name of the tags in ase xyz file. You can use gen_descriptors.py to compute it.
    fxyz: Location of xyz file for reading the properties ('none' to skip; the output is then forced to 'matrix').
    ftags: Location of tags for the first M samples ('none' to skip). Plot the tags on the (k)PCA map.
    fcolor: Location of a file or name of the tags in ase xyz file. It should contain properties for all samples (N floats) used to color the scatterplot.
    colorscol: The column number of the properties used for the coloring. Starts from 0.
    prefix: Filename prefix used for the coordinate output and the figure name.
    output: The format for output files ('xyz' or 'matrix').
    peratom: Whether to compute per atom pca coordinates (True/False)
    keepraw: Whether to keep the high dimensional descriptor when output is an xyz file (True/False). Not referenced by this function.
    sparse_mode: Sparsification method, 'fps'/'FPS' or 'cur'/'CUR'.
    n_sparse: Number of representative samples. 0 selects the default max(10, N // 20); a negative value, or a value >= N, uses every sample.
    power: Degree of the polynomial kernel. Not used while the kernel specification below is fixed to 'cosine'.
    kpca_d: Number of the principal components to keep
    pc1: Index of the principal axis plotted on the x axis
    pc2: Index of the principal axis plotted on the y axis
    projectatomic: build the projection using the (big) atomic descriptor matrix
    plotatomic: Plot the PCA coordinates of all atomic environments (True/False)
    adjusttext: Whether to adjust the annotation texts (True/False)

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the standalone descriptor file cannot be parsed, if no descriptor
        is available at all, if atomic coordinates are requested without
        atomic descriptors, or if `sparse_mode` is not recognised.
    """

    use_atomic_desc = (peratom or plotatomic or projectatomic)
    # Same truth table as the original un-parenthesised expression
    # `peratom or plotatomic and not projectatomic`.
    # NOTE(review): `(peratom or plotatomic) and not projectatomic` may have
    # been intended -- confirm before changing.
    project_atoms_separately = peratom or (plotatomic and not projectatomic)

    # Start from empty containers so a missing descriptor source is reported
    # by the sanity check below instead of surfacing as a NameError.
    desc = []
    desc_atomic = []

    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, desc_atomic = asapxyz.get_descriptors(fmat, use_atomic_desc)
        if projectatomic:
            desc = desc_atomic.copy()
    else:
        asapxyz = None
        print(
            "Did not provide the xyz file. We can only output descriptor matrix."
        )
        output = 'matrix'

    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
        except Exception as err:
            # keep the original cause attached for easier debugging
            raise ValueError(
                'Cannot load the descriptor matrix from file') from err
        print("loaded the descriptor matrix from file: ", fmat)

    # sanity check
    if len(desc) == 0:
        raise ValueError(
            'Please supply descriptor in a xyz file or a standlone descriptor matrix'
        )
    if project_atoms_separately and len(desc_atomic) == 0:
        raise ValueError(
            'Atomic descriptors are required for the per-atom projection; '
            'please supply them through an xyz file')

    print("shape of the descriptor matrix: ", np.shape(desc),
          "number of descriptors: ", np.shape(desc[0]))

    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")[:]
    else:
        tags = []

    # sparsification
    n_sample = len(desc)
    # set default value of n_sparse
    if n_sparse == 0:
        n_sparse = max(10, n_sample // 20)

    if n_sparse < 0:
        print("Not using any sparsification")
        ifps = np.arange(n_sample)
    elif n_sparse >= n_sample:
        print(
            "the number of representative structure is too large, please select n < ",
            n_sample)
        # Fall back to every sample so the (default) request on a small data
        # set still produces a map instead of failing on an undefined index.
        ifps = np.arange(n_sample)
    elif sparse_mode in ('fps', 'FPS'):
        ifps, _ = fps(desc, n_sparse, 0)
    elif sparse_mode in ('cur', 'CUR'):
        cov = np.dot(np.asmatrix(desc), np.asmatrix(desc).T)
        ifps, _ = CUR_deterministic(cov, n_sparse)
    else:
        raise ValueError('Cannot find the specified sparsification mode')

    k_spec = {
        'k0': {
            "type": "cosine"
        }
    }  # alternative: { 'k1': {"type": "polynomial", "d": power}}
    k_transform = Descriptors_to_Kernels(k_spec)

    kNN = k_transform.compute(desc[ifps])
    kMN = k_transform.compute(desc, desc[ifps])
    print("Shape of the kNN matrix: ", np.shape(kNN),
          ", and shape of the kMN matrix:", np.shape(kMN))

    # main thing
    kpca = KernelPCA(kpca_d)
    kpca.fit(kNN)
    proj = kpca.transform(kMN)
    if project_atoms_separately:
        # The atomic environments must go through the same kernel the KPCA
        # was fitted with, otherwise their coordinates are not comparable.
        kNT = k_transform.compute(desc_atomic, desc[ifps])
        proj_atomic_all = kpca.transform(kNT)

    # save
    foutput = prefix + "-kpca-d" + str(kpca_d)
    if output == 'matrix':
        np.savetxt(foutput + ".coord",
                   proj,
                   fmt='%4.8f',
                   header='low D coordinates of samples')
    elif output == 'xyz':
        # keep a backup of a previous result before it is overwritten
        if os.path.isfile(foutput + ".xyz"):
            os.rename(foutput + ".xyz", "bck." + foutput + ".xyz")
        asapxyz.set_descriptors(proj, 'kpca_coord')
        asapxyz.write(foutput)

    # color scheme
    plotcolor, plotcolor_peratom, colorlabel, colorscale = set_color_function(
        fcolor, asapxyz, colorscol, 0, (peratom or plotatomic), projectatomic)

    # make plot
    suffix = '-plotatomic.png' if plotatomic else '.png'
    outfile = 'KPCA_4_' + prefix + '-c-' + fcolor + suffix

    fig_spec_dict = {
        'outfile': outfile,
        'show': False,
        'title': None,
        'xlabel': 'Principal Axis 1',
        'ylabel': 'Principal Axis 2',
        'xaxis': True,
        'yaxis': True,
        'remove_tick': False,
        'rasterized': True,
        'fontsize': 16,
        'components': {
            "first_p": {
                "type": 'scatter',
                'clabel': colorlabel
            },
            "second_p": {
                "type": 'annotate',
                'adtext': adjusttext
            }
        }
    }
    asap_plot = Plotters(fig_spec_dict)
    asap_plot.plot(proj[::-1, [pc1, pc2]], plotcolor[::-1], [], tags)
    if project_atoms_separately:
        asap_plot.plot(proj_atomic_all[::-1, [pc1, pc2]],
                       plotcolor_peratom[::-1], [], [])
    plt.show()