def get_best_parameterization(config_fn, data_fn, metric_name='d_metric', o_config=None, o_data=None):
    """Return the best parameterization in a data file, ranked by a scoring metric.

    Args:
        config_fn (str): path to the pyposmat configuration file.
        data_fn (str): path to the pyposmat data file.
        metric_name (str): name of the scoring metric; only 'd_metric' is supported.
        o_config: optional preloaded configuration object (currently unused).
        o_data: optional preloaded data object (currently unused).

    Returns:
        collections.OrderedDict: free parameter name -> value of the single
        best-scoring parameterization.

    Raises:
        PyposmatUnsupportedPotentialScoringMetric: if metric_name is not supported.
    """
    _analyzer = PyposmatDataAnalyzer()
    _analyzer.read_configuration_file(filename=config_fn)
    _analyzer.read_data_file(filename=data_fn)

    # calculate the scoring metric
    # BUG FIX: compare strings with '==', not 'is' -- 'is' tests object
    # identity, which is not guaranteed for equal string values and silently
    # routed valid metric names into the unsupported-metric branch.
    if metric_name == 'd_metric':
        _df = _analyzer.calculate_d_metric(df=_analyzer.datafile.df)
    else:
        s = "The metric name {} is unsupported"
        s = s.format(metric_name)
        raise PyposmatUnsupportedPotentialScoringMetric(s)

    # re-read the data file and attach the scored dataframe, then keep the
    # single best row by d_metric
    _data = PyposmatDataFile()
    _data.read(filename=data_fn)
    _data.df = _df
    _data.subselect_by_score(score_name='d_metric', n=1)

    _free_parameter_names = _analyzer.configuration.free_parameter_names
    _parameter_best_dict = OrderedDict()
    for pn in _free_parameter_names:
        _parameter_best_dict[pn] = _data.sub_parameter_df.iloc[0][pn]
    return _parameter_best_dict
def merge_files(self, i_iteration):
    """Merge per-rank result files (plus any previous KDE file) for one iteration.

    Collects 'pyposmat.results.out' from each rank_<i> directory, prepends the
    iteration's KDE file if it exists, concatenates them into a single data
    file, and writes 'pyposmat.results.<i_iteration>.out' into the data
    directory (creating that directory on demand).

    Args:
        i_iteration (int): the iteration whose outputs are being merged.
    """
    _dir = self.data_directory
    _n_ranks = self.mpi_size

    # filename of old kde file
    _filename_kde = os.path.join(_dir, 'pyposmat.kde.{}.out'.format(i_iteration))
    print('Looking for previous kde file')
    print(' {}'.format(_filename_kde))

    datafile_fns = []
    # os.path.isfile() already implies existence, so one check suffices
    if os.path.isfile(_filename_kde):
        datafile_fns.append(_filename_kde)
    for i_rank in range(_n_ranks):
        rank_fn = os.path.join('rank_{}'.format(i_rank), 'pyposmat.results.out')
        datafile_fns.append(rank_fn)

    names = ['sim_id'] \
        + self.parameter_names \
        + self.qoi_names \
        + self.error_names
    types = ['sim_id'] \
        + ['param'] * len(self.parameter_names) \
        + ['qoi'] * len(self.qoi_names) \
        + ['err'] * len(self.error_names)

    # read every contributing file; keep only the known columns, in order
    dataframes = OrderedDict()
    for fn in datafile_fns:
        datafile = PyposmatDataFile()
        datafile.read(fn)
        dataframes[fn] = datafile.df[names]

    df = pd.concat(dataframes).reset_index(drop=True)

    datafile = PyposmatDataFile()
    datafile.df = df
    datafile.parameter_names = self.parameter_names
    datafile.error_names = self.error_names
    datafile.qoi_names = self.qoi_names
    datafile.names = names
    datafile.types = types

    fn_out = os.path.join(_dir, 'pyposmat.results.{}.out'.format(i_iteration))
    try:
        datafile.write(filename=fn_out)
    except FileNotFoundError:
        # the data directory may not exist yet on the first write
        if not os.path.exists(self.data_directory):
            os.mkdir(self.data_directory)
            # BUG FIX: original called datafile.write(filename_fn_out), a
            # NameError -- the intended argument is filename=fn_out
            datafile.write(filename=fn_out)
        else:
            raise
def merge_pypospack_datafiles(datafile_fns):
    """Merge several pyposmat data files into one PyposmatDataFile.

    The first file provides the returned object; the dataframes of the
    remaining files are concatenated onto it with duplicate rows dropped
    and the index rebuilt.

    Args:
        datafile_fns (list of str): paths of the data files to merge.

    Returns:
        PyposmatDataFile: the first file's object carrying the merged dataframe.
    """
    merged = PyposmatDataFile()
    merged.read(filename=datafile_fns[0])

    combined_df = merged.df
    for fn in datafile_fns[1:]:
        print("merging {}...".format(fn))
        next_file = PyposmatDataFile()
        next_file.read(filename=fn)
        combined_df = pd.concat(
            [combined_df, next_file.df]
        ).drop_duplicates().reset_index(drop=True)

    merged.df = combined_df
    return merged
def get_parameter_variance(config_fn, data_fn, metric_name='d_metric', n=100, o_config=None, o_data=None):
    """Return the standard deviation of each parameter over the n best candidates.

    Args:
        config_fn (str): path to the pyposmat configuration file.
        data_fn (str): path to the pyposmat data file.
        metric_name (str): (default: d_metric)
        n (int): the number of best metric values.
        o_config (pypospack.pyposmat.data.PyposmatConfigurationFile): optional,
            currently unused.
        o_data (pypospack.pyposmat.data.PyposmatDataFile): optional,
            currently unused.

    Returns:
        collections.OrderedDict: parameter name -> standard deviation.

    Raises:
        PyposmatUnsupportedPotentialScoringMetric: if metric_name is unsupported.
    """
    _analyzer = PyposmatDataAnalyzer()
    _analyzer.read_configuration_file(filename=config_fn)
    _analyzer.read_data_file(filename=data_fn)

    # calculate the scoring metric
    # BUG FIX: compare strings with '==', not 'is' (identity comparison on a
    # string literal is not guaranteed to hold for equal values)
    if metric_name == 'd_metric':
        _df = _analyzer.calculate_d_metric(df=_analyzer.datafile.df)
    else:
        s = "The metric name {} is unsupported"
        s = s.format(metric_name)
        raise PyposmatUnsupportedPotentialScoringMetric(s)

    _data = PyposmatDataFile()
    _data.read(filename=data_fn)
    _data.df = _df
    _data.subselect_by_score(score_name='d_metric', n=n)

    # column-wise standard deviation over the n best rows; convert to a plain
    # dict once, outside the loop (original re-ran .to_dict() per parameter)
    _param_std_dict = _data.sub_parameter_df.std(axis=0).to_dict()
    _parameter_std_dict = OrderedDict()
    for pn in _analyzer.parameter_names:
        _parameter_std_dict[pn] = _param_std_dict[pn]
    return _parameter_std_dict
# Pareto-set extraction fragment: compute the non-dominated set of the
# absolute QOI errors and optionally score/subselect it for rug plots.
# NOTE(review): relies on names defined elsewhere in this file:
# 'copy', 'OrderedDict', 'PyposmatDataFile', 'datafile', 'make_rugplots',
# 'qoi_targets', '_n_potentials' -- confirm against the full file.
from pypospack.pareto import pareto

# deep-copy so the added abs-error columns don't pollute the source dataframe
df = copy.deepcopy(datafile.df)
nr, nc = df.shape
_nsimulations = OrderedDict()
_nsimulations['start'] = nr  # record how many rows we started with

# add an absolute-error column for every quantity of interest
abs_error_names = ["{}.abserr".format(q) for q in datafile.qoi_names]
for q in datafile.qoi_names:
    qe = "{}.err".format(q)
    qne = "{}.abserr".format(q)
    df[qne] = df[qe].abs()

names = list(df.columns.values)
# NOTE(review): abs_err_idx is computed but never used below
abs_err_idx = [names.index(n) for n in abs_error_names]

# row indices of the Pareto-optimal (non-dominated) parameterizations
pareto_idx = pareto(df[abs_error_names].values.tolist())
datafile.df = df.loc[pareto_idx, datafile.names]
datafile.write("results.pareto.out")
#pareto_idx = pareto_bruteforce(df[abs_error_names].values.tolist())
#print(pareto_set)

if make_rugplots:
    # re-read the pareto set, score it by the d-metric against the QOI
    # targets, and keep the best _n_potentials candidates for rug plots
    datafile = PyposmatDataFile()
    datafile.read("results.pareto.out")
    datafile.qoi_references = OrderedDict()
    datafile.qoi_references['TARGET'] = copy.deepcopy(qoi_targets)
    datafile.score_by_d_metric(scaling_factors='TARGET')
    datafile.subselect_by_score(score_name='d_metric', n=_n_potentials)
    subselect_fn = datafile.write_subselect()
def run_kde_sampling(self, n_samples, filename_in, cluster_id=None, kde_bw_type='chiu1999'):
    """ sample from a KDE distribution

    Builds a Gaussian KDE over the free parameters found in filename_in,
    draws n_samples candidate parameterizations from it, fills in
    constrained parameters, evaluates each candidate, and writes results
    (or the triggering exception) to the output/bad-parameter files.

    Args:
        n_samples(int): the number of samples to draw from the KDE
            distribution
        filename_in(str): the path to the datafile from which the
            parameters will be drawn from
        cluster_id(int): if we need to use a specific cluster_id, we
            specify it here. otherwise, it will be drawn from all
            parameters contained within the set.
        kde_bw_type(str): the method of estimating the optimal bandwidth
    """
    _datafile_in = PyposmatDataFile()
    _datafile_in.read(filename_in)

    if cluster_id is None:
        # use the full dataset; parameter matrix is (n_params, n_rows)
        _free_parameter_names = [str(v) for v in self.free_parameter_names]
        _X = _datafile_in.df[_free_parameter_names].values.T
    else:
        # subselect the dataframe by the cluster_id of interest
        # NOTE(review): the .loc filter here makes the second cluster_id
        # filter on the next statement redundant (the df is already reduced)
        _datafile_in.df = _datafile_in.df.loc[_datafile_in.df['cluster_id'] == cluster_id]
        _X = _datafile_in.df[self.free_parameter_names].loc[
            _datafile_in.df['cluster_id'] == cluster_id].values.T
        # self.log.write("cluster_id {c} _X.shape={x}".format(c=cluster_id, x=_X.shape))

    # estimate the KDE bandwidth, then build the sampling distribution
    kde_bw = self.determine_kde_bandwidth(X=_X, kde_bw_type=kde_bw_type)
    _rv_generator = scipy.stats.gaussian_kde(_X, kde_bw)

    self.write_data_out_header()
    self.write_badparameters_header()

    time_start_iteration = time.time()
    _n_errors = 0
    for i_sample in range(n_samples):
        # determine sim_id
        sim_id = self.get_sim_id(i=i_sample)

        # new OrderedDict to hold in parameter values
        _parameters = OrderedDict([(p, None) for p in self.parameter_names])

        # generate free parameters for ordered dictionary
        _free_parameters = _rv_generator.resample(1)
        for i, v in enumerate(self.free_parameter_names):
            _parameters[v] = float(_free_parameters[i, 0])

        # determine parameters determined from equality constraints
        for p in self.constrained_parameter_names:
            _constraint_type = self.parameter_distribution_definition[p][0]
            if _constraint_type == 'equals':
                # this condition is for fitting EoS for EAM function which
                # requires a reference ground state crystal structure
                if p.endswith('latticetype'):
                    _v = self.parameter_distribution_definition[p][1]
                    _parameters[p] = _v
                # process evaluation strings
                elif type(self.parameter_distribution_definition[p][1]) is not list:
                    _str_eval = str(self.parameter_distribution_definition[p][1])
                    # replace string values with numerical values
                    for fp in self.free_parameter_names:
                        if fp in _str_eval:
                            _str_eval = _str_eval.replace(fp, str(_parameters[fp]))
                    # evaluate the string into a float
                    # NOTE(review): eval() on configuration-supplied strings is
                    # only safe because the configuration file is trusted input
                    _parameters[p] = eval(_str_eval)
                else:
                    raise ValueError("oops")

        # second pass: list-valued 'equals' constraints (e.g. EAM densities)
        for p in self.constrained_parameter_names:
            if self.parameter_distribution_definition[p][0] == 'equals':
                # some EAM potentials have a normalizing equilibrium density
                # which have to be determined based upon the parameterization of
                # the electron density function
                if type(self.parameter_distribution_definition[p][1]) is list:
                    if self.parameter_distribution_definition[p][1][0] == 'equilibrium_density':
                        a0 = self.parameter_distribution_definition[p][1][1]
                        latt = self.parameter_distribution_definition[p][1][2]
                        _parameters[p] = self.calculate_equilibrium_density(
                            a0, latt, _parameters)

        try:
            # now we check parameter inequality constraints
            for k, v in self.parameter_constraints.items():
                _eval_str = v
                for pn, pv in _parameters.items():
                    _eval_str = _eval_str.replace(pn, str(pv))
                if eval(_eval_str) is False:
                    s = 'parameter constraint failed, {}'.format(k)
                    raise PyposmatBadParameterError(s, parameters=_parameters)
            # run the (potentially expensive) evaluation of this candidate
            _results = self.evaluate_parameter_set(parameters=_parameters)
        except PyposmatBadParameterError as e:
            self.pyposmat_badparameters.write_simulation_exception(
                sim_id=sim_id, exception=e)
            _n_errors += 1
        except LammpsSimulationError as e:
            assert isinstance(self.pyposmat_badparameters, PyposmatBadParametersFile)
            assert isinstance(self.pyposmat_badparameters.parameter_names, list)
            self.pyposmat_badparameters.write_simulation_exception(
                sim_id=sim_id, exception=e)
            _n_errors += 1
        except PypospackTaskManagerError as e:
            self.pyposmat_badparameters.write_simulation_exception(
                sim_id=sim_id, exception=e)
            _n_errors += 1
        except PypospackBadEamEosError as e:
            self.pyposmat_badparameters.write_simulation_exception(
                sim_id=sim_id, exception=e)
            _n_errors += 1
        else:
            # determine sim_id
            # NOTE(review): _sim_id is assigned but never used; the write
            # below passes the raw i_sample instead -- confirm intent
            _sim_id = int(i_sample)
            self.pyposmat_datafile_out.write_simulation_results(
                filename=self.pyposmat_data_out_filename,
                sim_id=i_sample,
                cluster_id=cluster_id,
                results=_results)
        finally:
            # print out summaries every 10 solutions
            if (i_sample + 1) % 10 == 0:
                n_samples_completed = i_sample + 1
                time_end = time.time()
                time_total = time_end - time_start_iteration
                avg_time = time_total / n_samples_completed
                _str_msg = 'R{}:{} samples completed in {:.4f}s. Avg_time = {:.4f}. n_errors = {}'.format(
                    self.mpi_rank, n_samples_completed, time_total, avg_time, _n_errors)
                self.log(_str_msg)

    # record the bandwidth actually used for this sampling run
    d = OrderedDict()
    d['kde_bandwidth'] = OrderedDict()
    d['kde_bandwidth']['type'] = self.kde_bw_type
    d['kde_bandwidth']['h'] = self.kde_bw
# Sensitivity-analysis driver: score the Ni EAM pareto dataset by the d-metric
# and report parameter statistics of the best candidates.
# NOTE(review): relies on PyposmatDataAnalyzer and PyposmatDataFile being
# imported elsewhere in this file -- confirm against the full file.
import os
import pypospack.utils

# locate the example dataset shipped inside the pypospack source tree
_pypospack_root = pypospack.utils.get_pypospack_root_directory()
_data_in_directory = os.path.join(
    _pypospack_root, 'examples', 'Ni__eam__born_exp_fs__sensitivityanalysis',
    'data__from_pareto_optimization')
_pyposmat_data_fn = os.path.join(_data_in_directory, 'pyposmat.kde.6.out')
_pyposmat_config_fn = os.path.join(_data_in_directory, 'pyposmat.config.in')

# score every candidate parameterization with the d-metric
analyzer = PyposmatDataAnalyzer()
analyzer.read_configuration_file(filename=_pyposmat_config_fn)
analyzer.read_data_file(filename=_pyposmat_data_fn)
df = analyzer.calculate_d_metric(df=analyzer.datafile.df)

data = PyposmatDataFile()
data.read(filename=_pyposmat_data_fn)
data.df = df

# statistics over the 100 best parameterizations
data.subselect_by_score(score_name="d_metric", n=100)
# print(data.sub_df)
param_stdev_df = data.sub_parameter_df.std(axis=0)
param_mean_df = data.sub_parameter_df.mean(axis=0)
print("parameter standard deviations:\n{}".format(param_stdev_df))
print("parameter means:\n{}".format(param_mean_df))

# the single best parameterization
data.subselect_by_score(score_name="d_metric", n=1)
print("best parameterization by d_metric:\n{}".format(data.sub_parameter_df))
# GMM cluster analysis fragment: pick the maximum-likelihood parameterization
# from each Gaussian mixture component and write the winners out.
# NOTE(review): relies on names defined elsewhere in this file: 'o' (the GMM
# analysis object), 'n_components', 'data_fn', 'np', 'stats',
# 'PyposmatDataFile' -- confirm against the full file.
best_sim_ids = []
for i in range(n_components):
    qoi_names = o.configuration.qoi_names
    qoi_indices = [o.names.index(k) for k in qoi_names]
    # component mean and covariance restricted to the QOI dimensions
    mean = o.model.means_[i][qoi_indices]
    covar = o.model.covariances_[i][np.ix_(qoi_indices, qoi_indices)]
    # rows belonging to this cluster, and their QOI values
    df = o.data.df.loc[o.data.df['cluster_id'] == i]
    X = o.data.df[qoi_names].loc[o.data.df['cluster_id'] == i]
    # NOTE(review): assigning a column into a .loc slice can trigger pandas'
    # SettingWithCopyWarning -- confirm 'df' is intended to be a copy here
    df['score'] = stats.multivariate_normal.pdf(X, mean, covar)
    # keep the sim_id of the highest-likelihood row in this component
    best_sim_ids += list(df.nlargest(1, 'score')['sim_id'].values)

data = PyposmatDataFile()
data.read(filename=data_fn)
data.df['cluster_id'] = o.data.df['cluster_id']
data.df = data.df.loc[o.data.df['sim_id'].isin(best_sim_ids)]
data.write(filename='gmm_best_mle.out')

#---------------
#
#---------------
# NOTE(review): this second pass is identical to the first apart from the
# output filename -- confirm whether that duplication is intentional
data = PyposmatDataFile()
data.read(filename=data_fn)
data.df['cluster_id'] = o.data.df['cluster_id']
data.df = data.df.loc[o.data.df['sim_id'].isin(best_sim_ids)]
data.write(filename='gmm_best_potentials.out')

qoi_names = o.configuration.qoi_names
qoi_targets = o.configuration.qoi_targets
# NOTE(review): the body of this loop continues beyond this chunk
for iqn, qn in enumerate(qoi_names):
    en = "{}.err".format(qn)
    nen = "{}.nerr".format(qn)
if __name__ == "__main__":
    # Pre-filter a KDE data file: keep only rows whose Al fcc lattice constant
    # and cohesive energy fall within loose windows around the QOI targets.
    # NOTE(review): relies on PyposmatDataFile / PyposmatConfigurationFile
    # being imported elsewhere in this file.
    data_fn = "../preconditioning_3.5NN/data/pyposmat.kde.3.out"
    config_fn = "../preconditioning_3.5NN/data/pyposmat.config.in"

    o_data = PyposmatDataFile()
    o_data.read(filename=data_fn)
    o_config = PyposmatConfigurationFile()
    o_config.read(filename=config_fn)

    # acceptance windows: target +/- 0.5 Angstrom for a0, +/- 2 eV for E_coh
    a0_max = o_config.qoi_targets["Al_fcc.a0"] + 0.5
    a0_min = o_config.qoi_targets["Al_fcc.a0"] - 0.5
    e_coh_max = o_config.qoi_targets["Al_fcc.E_coh"] + 2
    e_coh_min = o_config.qoi_targets["Al_fcc.E_coh"] - 2

    print("n points initial: {}".format(len(o_data.df)))
    o_data.df = o_data.df[o_data.df["Al_fcc.a0"] > a0_min]
    o_data.df = o_data.df[o_data.df["Al_fcc.a0"] < a0_max]
    o_data.df = o_data.df[o_data.df["Al_fcc.E_coh"] > e_coh_min]
    o_data.df = o_data.df[o_data.df["Al_fcc.E_coh"] < e_coh_max]
    print("n points final: {}".format(len(o_data.df)))
    print()
    print("a0 target: {}".format(o_config.qoi_targets["Al_fcc.a0"]))
    print("a0 min: {}".format(o_data.df["Al_fcc.a0"].min()))
    print("a0 max: {}".format(o_data.df["Al_fcc.a0"].max()))
    print()
    print("E_coh target: {}".format(o_config.qoi_targets["Al_fcc.E_coh"]))
    print("E_coh min: {}".format(o_data.df["Al_fcc.E_coh"].min()))
    print("E_coh max: {}".format(o_data.df["Al_fcc.E_coh"].max()))

    # NOTE(review): out_fn is assigned but not used within this chunk; the
    # write presumably happens in code beyond this view -- confirm
    out_fn = "./data/pyposmat.kde.0.out"
# Rebuild an output data file row by row, deriving Ni_fcc.B and Ni_fcc.G from
# the elastic constants when those QOIs are missing from the input file.
# NOTE(review): relies on names defined elsewhere in this file: 'data_in',
# 'data_out', 'calculate_bulk_modulus', 'qoi_targets', '_fn_results_out',
# 'pd', 'OrderedDict' -- confirm against the full file.
data_out_lists = []
for i, row in data_in.df.iterrows():
    in_row_results = row.to_dict(into=OrderedDict)
    out_row_results = OrderedDict()

    # copy the identifier and parameter columns through unchanged
    for k in (["sim_id"] + data_out.parameter_names):
        out_row_results[k] = in_row_results[k]

    # copy QOIs, deriving the missing ones from the elastic constants
    for k in (data_out.qoi_names):
        try:
            out_row_results[k] = in_row_results[k]
        except KeyError as e:
            if k == 'Ni_fcc.B':
                c11 = in_row_results['Ni_fcc.c11']
                c12 = in_row_results['Ni_fcc.c12']
                c44 = in_row_results['Ni_fcc.c44']
                out_row_results[k] = calculate_bulk_modulus(c11, c12, c44)
            elif k == 'Ni_fcc.G':
                c11 = in_row_results['Ni_fcc.c11']
                c12 = in_row_results['Ni_fcc.c12']
                c44 = in_row_results['Ni_fcc.c44']
                # NOTE(review): likely copy/paste bug -- the shear modulus G
                # is computed with calculate_bulk_modulus; a shear-modulus
                # helper is presumably intended here. TODO: confirm and fix.
                out_row_results[k] = calculate_bulk_modulus(c11, c12, c44)
            else:
                raise

    # error columns are the signed difference from the QOI targets
    for k in (data_out.qoi_names):
        out_row_results["{}.err".format(k)] = out_row_results[k] - qoi_targets[k]

    data_out_lists.append([out_row_results[k] for k in data_out.names])

data_out.df = pd.DataFrame(data_out_lists, columns=data_out.names)
data_out.write(_fn_results_out)
import os, copy, argparse
from collections import OrderedDict
from pypospack.pyposmat.data import PyposmatDataFile
from pypospack.pyposmat.data import PyposmatConfigurationFile
from pypospack.pyposmat.data import PyposmatDataAnalyzer

if __name__ == "__main__":
    # Compute the pareto set from the iteration-0 results and write it out,
    # then read it back as a sanity check.
    _fn_config = os.path.join("resources", "pyposmat.config.in")
    _fn_data = os.path.join("resources", "pyposmat.results.0.out")
    _fn_pareto_out = os.path.join("pyposmat.pareto.out")

    pda = PyposmatDataAnalyzer(fn_config=_fn_config, fn_data=_fn_data)
    pareto_df = pda.calculate_pareto_set()

    datafile = PyposmatDataFile()
    datafile.df = pareto_df
    datafile.parameter_names = pda.parameter_names
    datafile.qoi_names = pda.qoi_names
    datafile.error_names = pda.error_names
    datafile.names = ['sim_id'] \
        + datafile.parameter_names \
        + datafile.qoi_names \
        + datafile.error_names
    # BUG FIX: the per-column type tags must follow the 'param'/'qoi'/'err'
    # convention used by the data-file writer elsewhere in this codebase;
    # the original wrote 'qoi_names'/'error_names', corrupting the header
    datafile.types = ['sim_id'] \
        + len(datafile.parameter_names) * ['param'] \
        + len(datafile.qoi_names) * ['qoi'] \
        + len(datafile.error_names) * ['err']
    datafile.write(_fn_pareto_out)
    datafile.read(_fn_pareto_out)
def merge_data_files(self, i_iteration, last_datafile_fn=None, new_datafile_fn=None):
    """ merge the pyposmat data files

    Concatenates the per-rank 'pyposmat.results.out' files, optionally
    relabels sim_ids as <iteration>_<row>, and merges the result with the
    previous iteration's dataset (unless sampling is 'from_file').

    Args:
        i_iteration(int): the current iteration which just finished
        last_datafile_fn(str,optional): the filename of the last dataset
            in the data directory.
            NOTE(review): the default points at the CURRENT iteration's
            kde file ('pyposmat.kde.{i_iteration}.out') -- confirm this is
            intended rather than i_iteration - 1.
        new_datafile_fn(str,optional): where to output the file results
    """
    if last_datafile_fn is None:
        last_datafile_fn = os.path.join(
            self.data_directory, 'pyposmat.kde.{}.out'.format(i_iteration))
    if new_datafile_fn is None:
        new_datafile_fn = os.path.join(
            self.data_directory, 'pyposmat.results.{}.out'.format(i_iteration))

    # collect the per-rank results files
    rank_dirs = [
        v for v in os.listdir(self.root_directory) if v.startswith('rank_')
    ]
    filenames = [
        os.path.join(self.root_directory, v, 'pyposmat.results.out')
        for v in rank_dirs
    ]

    # concatenate all rank dataframes onto the first file's object
    data = None
    for i, v in enumerate(filenames):
        if i == 0:
            data = PyposmatDataFile()
            data.read(filename=v)
        else:
            data_new = PyposmatDataFile()
            data_new.read(filename=v)
            data.df = pd.concat([data.df, data_new.df])

    nrows = len(data.df)
    # relabel sim_ids unless the iteration sampled from a fixed file
    # (CLEANUP: the original built the same id list twice and left one unused)
    if self.configuration.sampling_type[i_iteration]['type'] != 'from_file':
        sim_id_fmt = '{:0>2}_{:0>6}'
        data.df['sim_id'] = [
            sim_id_fmt.format(i_iteration, i) for i in range(nrows)
        ]

    if self.configuration.sampling_type[i_iteration]['type'] == "from_file":
        # pass-through: write the merged rank data using the first file's
        # metadata (names/types) as a template
        data_new = PyposmatDataFile()
        data_new.read(filename=filenames[0])
        data_new.df = data.df
        data_new.write(filename=new_datafile_fn)
    else:
        self.log("merging with candidates from previous simulations")
        self.log("\tfilename:{}".format(last_datafile_fn))
        data_old = PyposmatDataFile()
        try:
            data_old.read(filename=last_datafile_fn)
            data_old.df = pd.concat([data_old.df, data.df])
            data_old.write(filename=new_datafile_fn)
        except FileNotFoundError:
            # no previous dataset exists on the first iteration
            if i_iteration == 0:
                data.write(filename=new_datafile_fn)
            else:
                raise
# Z-normalized-error filtering fragment: keep the best 'percentile' fraction
# of rows by Euclidean distance in z-normalized error space, then plot a
# histogram of the distances.
# NOTE(review): relies on names defined elsewhere in this file: 'data_fn',
# 'config', 'percentile', 'filter_by_znormalized_errors', 'np',
# 'PyposmatDataFile' -- confirm against the full file.
data = PyposmatDataFile()
data.read(filename=data_fn)
qoi_names = config.qoi_names

# NOTE(review): 'df' is computed here but never used below; the same
# filtering appears to be re-implemented manually on '_df'. TODO: confirm
# whether the helper call or the manual version is the intended path.
df = filter_by_znormalized_errors(data.df, percentile, qoi_names)

_df = data.df
for qn in qoi_names:
    en = "{}.err".format(qn)
    z_en = "{}.zerr".format(qn)
    # z-normalize each error column by its own standard deviation
    _df[z_en] = (_df[en]) / _df[en].std()
zerror_names = ["{}.zerr".format(q) for q in qoi_names]
# Euclidean distance of each row in z-error space
_df['z_err_dist'] = np.sqrt(np.square(_df[zerror_names]).sum(axis=1))

nr0, nc0 = _df.shape
print("total_number_of_rows:{}".format(nr0))
# number of rows to keep (floor of percentile * n_rows)
nr1 = int(percentile * nr0 // 1)
print("keeping_number_of_rows:{}".format(nr1))
_df = _df.nsmallest(nr1, 'z_err_dist').reset_index(drop=True)
nr2, nc2 = _df.shape
print("current_df_size:{},{}".format(nr2, nc2))

data.df = _df.copy(deep=True)
nr3, nc3 = data.df.shape
print("current_df_size:{},{}".format(nr3, nc3))

import matplotlib.pyplot as plt
plt.figure()
_df['z_err_dist'].plot.hist()
plt.show()
# PCA visualization setup fragment: add abs-value helper columns, project the
# parameter space onto its first two principal components, and prepare for a
# subsampling study (the trailing loop continues beyond this chunk).
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# imports for graphics
import matplotlib.pyplot as plt

from pypospack.pyposmat.data import PyposmatDataFile

# load data
# NOTE(review): relies on 'pd' and 'OrderedDict' being imported elsewhere in
# this file. Also, DataFrame.as_matrix() was removed in pandas 1.0 --
# .to_numpy() is the modern replacement; confirm the pinned pandas version.
data_fn = "pyposmat.results.0.out"
data = PyposmatDataFile()
data.read(filename=data_fn)
data.df = pd.concat([
    data.df,
    pd.DataFrame(data.df['MgO_NaCl.p11'].abs().as_matrix(),
                 columns=['MgO_NaCl.p11.abs']),
    pd.DataFrame(data.df['MgO_NaCl.a0.err'].abs().as_matrix(),
                 columns=['MgO_NaCl.a0.nerr'])
], axis=1)

# project the parameter space onto its first two principal components and
# record the bounding box of the projection
pca = PCA(n_components=2)
pca.fit(data.df[data.parameter_names])
_pca = pca.transform(data.df[data.parameter_names])
_pca_1_min = _pca[:, 0].min()
_pca_1_max = _pca[:, 0].max()
_pca_2_min = _pca[:, 1].min()
_pca_2_max = _pca[:, 1].max()

sample_sizes = [50000, 10000, 5000, 1000]
pca_p11_data = OrderedDict()
# NOTE(review): loop body continues beyond this chunk
for i in sample_sizes: