class FileSampler():
    """Run a simulation workflow once for every row of a pyposmat data file.

    Each row of the data file provides a ``sim_id`` and one value for every
    parameter name listed in the configuration.  ``run`` creates a directory
    named after the ``sim_id``, changes into it and executes the workflow
    there.

    Args:
        configuration (str or PyposmatConfigurationFile): path to a
            configuration file, or an already loaded configuration object.
        data (str or PyposmatDataFile): path to a data file, or an already
            loaded data file object.
        structure_name (str): forwarded to the workflow as ``structure_name``.
        structure_path (str): forwarded to the workflow as ``structure_path``.
        workflow_type (str): only ``'lmps_thermal_expansion'`` is handled.
        workflow_definition (dict): additional keyword arguments for the
            workflow constructor.
    """

    def __init__(self, configuration, data, structure_name, structure_path,
                 workflow_type, workflow_definition):
        self.initialize_configuration(configuration)
        self.initialize_data(data)

        self.structure_name = structure_name
        self.structure_path = structure_path
        # The attribute used to be misspelled; the old spelling is kept as an
        # alias so code reading ``strucutre_path`` keeps working.
        self.strucutre_path = structure_path

        self.workflow_type = workflow_type
        self.workflow_definition = workflow_definition
        self.potential_definition = self.configuration.potential

    def initialize_configuration(self, configuration):
        """Store ``configuration``, reading it from disk when given a path.

        Raises:
            TypeError: if ``configuration`` is neither a str nor a
                PyposmatConfigurationFile.
        """
        if isinstance(configuration, PyposmatConfigurationFile):
            self.configuration = configuration
        elif isinstance(configuration, str):
            self.configuration = PyposmatConfigurationFile()
            self.configuration.read(filename=configuration)
        else:
            msg = ("configuration must be a path to a configuration file or an "
                   "instance of the PyposmatConfigurationFile,")
            raise TypeError(msg)

    def initialize_data(self, data):
        """Store ``data``, reading it from disk when given a path.

        Raises:
            TypeError: if ``data`` is neither a str nor a PyposmatDataFile.
        """
        if isinstance(data, PyposmatDataFile):
            self.data = data
        elif isinstance(data, str):
            self.data = PyposmatDataFile()
            self.data.read(filename=data)
        else:
            msg = ("data must be a path to a data file or an instance of "
                   "PyposmatDataFile.")
            raise TypeError(msg)

    def run(self):
        """Execute the workflow for every row of the data file.

        Raises:
            ValueError: if ``workflow_type`` is not supported.  This is
                checked before any directory is created.
        """
        if self.workflow_type != 'lmps_thermal_expansion':
            raise ValueError(
                'unsupported workflow_type:{}'.format(self.workflow_type))

        for index, row in self.data.df.iterrows():
            sim_id = row['sim_id']
            print('working on sim_id:{}'.format(sim_id))

            parameters = OrderedDict(
                [(k, row[k]) for k in self.configuration.parameter_names])

            original_path = os.getcwd()
            # sim_id may come back from the dataframe as a number
            sim_directory = str(sim_id)
            os.mkdir(sim_directory)
            os.chdir(sim_directory)
            try:
                workflow = LammpsThermalExpansion(
                    structure_name=self.structure_name,
                    structure_path=self.structure_path,
                    **self.workflow_definition)
                workflow.create_task_configurations()
                workflow.create_tasks()
                workflow.prepare_tasks(
                    potential_definition=self.potential_definition,
                    potential_parameters=parameters)
                workflow.run()
            finally:
                # always return to the starting directory, even when a
                # simulation fails, so later rows are not nested
                os.chdir(original_path)
def get_best_parameterization(config_fn,data_fn,metric_name='d_metric',o_config=None,o_data=None):
    """Return the free parameter values of the best-scoring parameterization.

    Args:
        config_fn (str): path of the configuration file
        data_fn (str): path of the data file
        metric_name (str): scoring metric; only 'd_metric' is supported
        o_config: accepted for interface compatibility; not used here
        o_data: accepted for interface compatibility; not used here

    Returns:
        collections.OrderedDict: free parameter name -> value taken from the
        first row of the sub-selected parameter dataframe

    Raises:
        PyposmatUnsupportedPotentialScoringMetric: for any other metric name
    """
    _analyzer = PyposmatDataAnalyzer()
    _analyzer.read_configuration_file(filename=config_fn)
    _analyzer.read_data_file(filename=data_fn)

    # calculate the scoring metric
    # NOTE: compare by value; ``is`` on a string literal tests identity and
    # only works by accident of interning.
    if metric_name == 'd_metric':
        _df = _analyzer.calculate_d_metric(df=_analyzer.datafile.df)
    else:
        s = "The metric name {} is unsupported"
        s = s.format(metric_name)
        raise PyposmatUnsupportedPotentialScoringMetric(s)

    _data = PyposmatDataFile()
    _data.read(filename=data_fn)
    _data.df = _df
    _data.subselect_by_score(score_name='d_metric',n=1)

    _best_row = _data.sub_parameter_df.iloc[0]
    _parameter_best_dict = OrderedDict()
    for pn in _analyzer.configuration.free_parameter_names:
        _parameter_best_dict[pn] = _best_row[pn]
    return _parameter_best_dict
def test__read__wo_named_arguments():
    """Read the MgO data file positionally and verify every name list."""
    datafile = PyposmatDataFile()
    datafile.read(MgO_datafile)

    # (attribute on the data file, expected module-level list)
    expectations = (
        ('names', expected_names),
        ('parameter_names', parameter_names),
        ('qoi_names', qoi_names),
        ('error_names', error_names),
    )
    for attribute, expected in expectations:
        actual = getattr(datafile, attribute)
        assert type(actual) is list
        assert len(expected) == len(actual)
        for position in range(len(expected)):
            assert expected[position] == actual[position]

    assert type(datafile.df) is pd.DataFrame
def test__write_header_section():
    """Write a header section, read it back and compare the name lists."""
    cleanup_test()

    parameter_names = ['param{}'.format(i + 1) for i in range(3)]
    qoi_names = ['qoi{}'.format(i + 1) for i in range(5)]
    error_names = ['err{}'.format(i + 1) for i in range(5)]

    datafile = PyposmatDataFile()
    datafile.write_header_section(parameter_names=parameter_names,
                                  qoi_names=qoi_names,
                                  error_names=error_names,
                                  filename=datafile_out_fn)
    assert os.path.isfile(datafile_out_fn)

    datafile_read = PyposmatDataFile()
    datafile_read.read(filename=datafile_out_fn)

    assert len(datafile_read.parameter_names) == len(parameter_names)
    for i, v in enumerate(parameter_names):
        assert datafile_read.parameter_names[i] == v

    assert len(datafile_read.qoi_names) == len(qoi_names)
    for i, v in enumerate(qoi_names):
        assert datafile_read.qoi_names[i] == v

    # the error names must be compared with the error names that were written
    # (the previous check used the qoi list, which only matched by chance)
    assert len(datafile_read.error_names) == len(error_names)
    for i, v in enumerate(error_names):
        assert datafile_read.error_names[i] == v

    cleanup_test()
def dev__get_descriptive_statistics__from_kde_file():
    """Print descriptive statistics computed from the testing set's KDE file."""
    rule = 80 * '-'
    title = 'method -> get_descriptive_statistics__from_kde_file'
    print(rule)
    print('{:^80}'.format(title))

    testing_set = get_testing_set()
    config_fn = testing_set['config_fn']
    results_data_fn = testing_set['results_fn']
    kde_data_fn = testing_set['kde_fn']

    # every input file has to exist before the analyzer is built
    for required_fn in (config_fn, results_data_fn, kde_data_fn):
        assert os.path.isfile(required_fn)

    analyzer = PyposmatDataAnalyzer(config_fn=config_fn,
                                    results_data_fn=results_data_fn)

    kde_data = PyposmatDataFile()
    kde_data.read(filename=kde_data_fn)

    statistics = analyzer.get_descriptive_statistics(df=kde_data.df)
    report = analyzer.str__descriptive_statistics(
        descriptive_statistics=statistics)
    print(report)
    print(kde_data.df.shape)
def test__write_simulation_results__no_filename():
    """Write one result row without passing a filename, then re-read the file."""
    cleanup_test()

    parameter_names = ['param{}'.format(number) for number in range(1, 4)]
    qoi_names = ['qoi{}'.format(number) for number in range(1, 6)]
    error_names = ['err{}'.format(number) for number in range(1, 6)]

    datafile = PyposmatDataFile()
    datafile.write_header_section(
        parameter_names=parameter_names,
        qoi_names=qoi_names,
        error_names=error_names,
        filename=datafile_out_fn)

    sim_id = "test_id"
    results = OrderedDict([
        ('parameters', OrderedDict((name, 1.) for name in parameter_names)),
        ('qois', OrderedDict((name, 2.) for name in qoi_names)),
        ('errors', OrderedDict((name, 3.0) for name in error_names)),
    ])

    # no filename argument: the data file is expected to reuse the one given
    # to write_header_section
    datafile.write_simulation_results(sim_id, results)
    assert os.path.isfile(datafile_out_fn)

    datafile_read = PyposmatDataFile()
    datafile_read.read(filename=datafile_out_fn)
def calculate_kld(data_1_fn,data_2_fn,names,n_samples=2000):
    """Compute a divergence between the distributions in two data files.

    A kernel density estimate is built over the columns ``names`` of each
    file and both are passed to ``kullbach_lieber_divergence``.

    Args:
        data_1_fn (str): path of the first data file
        data_2_fn (str): path of the second data file
        names (list): column names used to build the density estimates
        n_samples (int): forwarded to ``kullbach_lieber_divergence``

    Returns:
        the value returned by ``kullbach_lieber_divergence``

    Raises:
        AssertionError: if an argument has the wrong type or either file
            does not exist
    """
    assert isinstance(data_1_fn,str)
    assert isinstance(data_2_fn,str)
    assert isinstance(n_samples,int)
    assert os.path.isfile(data_1_fn)
    # the second file has to be checked as well (the first one used to be
    # checked twice)
    assert os.path.isfile(data_2_fn)

    data_1 = PyposmatDataFile()
    data_1.read(filename=data_1_fn)
    data_2 = PyposmatDataFile()
    data_2.read(filename=data_2_fn)

    samples_1 = data_1.df[names].T
    samples_2 = data_2.df[names].T

    # a negative eigenvalue of the covariance matrix selects the alternative
    # KDE implementation
    w1,v1 = linalg.eig(np.cov(samples_1))
    w2,v2 = linalg.eig(np.cov(samples_2))
    cov1_ill_conditioned = any([k < 0 for k in w1.tolist()])
    cov2_ill_conditioned = any([k < 0 for k in w2.tolist()])

    if cov1_ill_conditioned or cov2_ill_conditioned:
        print('using ill-conditioned kde')
        kde_1 = GaussianKde(samples_1)
        print(kde_1.n, kde_1.d)
        kde_2 = GaussianKde(samples_2)
    else:
        kde_1 = gaussian_kde(samples_1)
        kde_2 = gaussian_kde(samples_2)

    kld = kullbach_lieber_divergence(kde_1,kde_2,n_samples)
    return kld
def test__attribute__names__after_reading_file():
    """After reading a data file, its ``names`` attribute is a plain list."""
    datafile_in_fn = "../../../../../data/MgO_pareto_data/culled_004.out"

    reader = PyposmatDataFile()
    reader.read(datafile_in_fn)

    column_names = reader.names
    assert type(column_names) is list
def make_latex_table(config, data, qoi_type=None, param_type=None):
    """Load configuration and data and select the normalized error columns.

    Args:
        config (str or PyposmatConfigurationFile): path or loaded object
        data (str or PyposmatDataFile): path or loaded object
        qoi_type (str): when 'by_qoi_target', normalized errors are created
            on the data file and selected into a dataframe
        param_type: overwritten below before it is read

    Raises:
        AssertionError: if ``config`` or ``data`` has an unsupported type
        TypeError: same condition, when assertions are disabled
    """
    qoi_types = ['by_qoi_target']
    # NOTE(review): this discards the caller's ``param_type`` argument and the
    # selected dataframe below is never returned in the visible code - confirm
    # whether that is intended.
    param_type = []

    config_is_path = isinstance(config, str)
    data_is_path = isinstance(data, str)
    assert config_is_path or isinstance(config, PyposmatConfigurationFile)
    assert data_is_path or isinstance(data, PyposmatDataFile)

    if config_is_path:
        o_config = PyposmatConfigurationFile()
        o_config.read(filename=config)
    elif isinstance(config, PyposmatConfigurationFile):
        o_config = config
    else:
        raise TypeError()

    if data_is_path:
        o_data = PyposmatDataFile()
        o_data.read(filename=data)
    elif isinstance(data, PyposmatDataFile):
        o_data = data
    else:
        raise TypeError()

    if qoi_type == 'by_qoi_target':
        o_data.create_normalized_errors(
            normalize_type='by_qoi_target',
            qoi_targets=o_config.qoi_targets)
        df = o_data.df[o_data.normalized_error_names]
def merge_files(self, i_iteration):
    """Merge the KDE file and the per-rank result files of one iteration.

    The files read are ``pyposmat.kde.<i_iteration>.out`` in the data
    directory (only if it exists) followed by
    ``rank_<i>/pyposmat.results.out`` for every MPI rank.  The concatenated
    rows are written to ``pyposmat.results.<i_iteration>.out`` in the data
    directory.

    Args:
        i_iteration (int): iteration number used in the file names
    """
    _dir = self.data_directory
    _n_ranks = self.mpi_size

    # filename of old kde file
    _filename_kde = os.path.join(_dir,
                                 'pyposmat.kde.{}.out'.format(i_iteration))
    print('Looking for previous kde file')
    print(' {}'.format(_filename_kde))

    datafile_fns = []
    # isfile() already implies that the path exists
    if os.path.isfile(_filename_kde):
        datafile_fns.append(_filename_kde)
    for i_rank in range(_n_ranks):
        rank_fn = os.path.join('rank_{}'.format(i_rank),
                               'pyposmat.results.out')
        datafile_fns.append(rank_fn)

    names = ['sim_id']\
            + self.parameter_names\
            + self.qoi_names\
            + self.error_names
    types = ['sim_id']\
            + ['param']*len(self.parameter_names)\
            + ['qoi']*len(self.qoi_names)\
            + ['err']*len(self.error_names)

    # sim_id values are kept as read; they are not prefixed with the
    # iteration or rank number
    dataframes = OrderedDict()
    for fn in datafile_fns:
        datafile = PyposmatDataFile()
        datafile.read(fn)
        dataframes[fn] = datafile.df[names]

    df = pd.concat(dataframes).reset_index(drop=True)

    datafile = PyposmatDataFile()
    datafile.df = df
    datafile.parameter_names = self.parameter_names
    datafile.error_names = self.error_names
    datafile.qoi_names = self.qoi_names
    datafile.names = names
    datafile.types = types

    fn_out = os.path.join(_dir,
                          'pyposmat.results.{}.out'.format(i_iteration))
    try:
        datafile.write(filename=fn_out)
    except FileNotFoundError:
        # only recover when the failure is a missing data directory
        if os.path.exists(self.data_directory):
            raise
        os.makedirs(self.data_directory, exist_ok=True)
        # the retry previously referenced an undefined name
        datafile.write(filename=fn_out)
def dev__read():
    """Read the testing set's results data file and print its sim_id column."""
    results_fn = get_testing_set()['results_data_fn']

    datafile = PyposmatDataFile()
    datafile.read(filename=results_fn)

    sim_ids = datafile.df['sim_id']
    print(sim_ids)
def test__read_datafile():
    """The rugplot object exposes the data it read as a pandas DataFrame."""
    from pypospack.pyposmat.data import PyposmatDataFile

    # reading the file directly first shows the file itself is readable
    reference = PyposmatDataFile()
    reference.read(filename=datafile_fn)

    rugplot = PyposmatParetoRugplot()
    rugplot.read_datafile(filename=datafile_fn)

    import pandas as pd
    loaded = rugplot.data.df
    assert type(loaded) is pd.DataFrame
def gmm_analysis(config_fn, data_fn, names, output_directory='gmm_analysis', max_components=20):
    """Fit Gaussian mixture models and plot the AIC/BIC selection.

    One model is fitted for every component count in
    ``range(1, max_components)``.  The AIC/BIC curves are written to
    ``aic_bic_plot.jpg`` and the model with the lowest BIC is plotted to
    ``gmm_analysis.jpg``, both inside ``output_directory``.

    Args:
        config_fn (str): path of the configuration file
        data_fn (str): path of the data file
        names (list): columns of the data file used for the fit
        output_directory (str): created if it does not exist
        max_components (int): exclusive upper bound on the component count

    Raises:
        AssertionError: if a filename is not a str or does not exist
    """
    assert isinstance(config_fn, str)
    assert isinstance(data_fn, str)
    assert os.path.isfile(config_fn)
    assert os.path.isfile(data_fn)

    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    o_config = PyposmatConfigurationFile()
    o_config.read(filename=config_fn)

    o_data = PyposmatDataFile()
    o_data.read(filename=data_fn)
    o_data.create_normalized_errors(normalize_type='by_qoi_target',
                                    qoi_targets=o_config.qoi_targets)
    o_data.df['score'] = o_data.df[o_config.normalized_error_names].abs().sum(
        axis=1)

    data = o_data.df[names]

    n_components = np.arange(1, max_components)
    models = [
        GaussianMixture(n_components=n,
                        covariance_type='full',
                        random_state=0).fit(data) for n in n_components
    ]

    # evaluate each criterion once; the index of the minimum is the position
    # in ``models``, the matching entry of ``n_components`` is the count
    aic_criteria = [m.aic(data) for m in models]
    aic_idx = min(range(len(aic_criteria)), key=aic_criteria.__getitem__)
    aic_n_components = n_components[aic_idx]

    bic_criteria = [m.bic(data) for m in models]
    bic_idx = min(range(len(bic_criteria)), key=bic_criteria.__getitem__)
    bic_n_components = n_components[bic_idx]

    # plot the criteria
    print('bic_n_components:{}'.format(bic_n_components))
    print('aic_n_components:{}'.format(aic_n_components))
    plot_fn = os.path.join(output_directory, 'aic_bic_plot.jpg')
    plot_gmm_aic_bic(filename=plot_fn,
                     n_components=n_components,
                     aic_criteria=aic_criteria,
                     bic_criteria=bic_criteria,
                     aic_n_components=aic_n_components,
                     bic_n_components=bic_n_components)

    # honour output_directory (it used to be hard-coded) and select the model
    # by list position; indexing with the component count picked the wrong
    # model and could run past the end of the list
    filename = os.path.join(output_directory, 'gmm_analysis.jpg')
    plot_gmm(models[bic_idx], data, filename=filename)
def covariance_analysis(data_fn,names):
    """Print eigenvalues and eigenvectors of the covariance of ``names``.

    Args:
        data_fn (str): path of the data file
        names (list): columns whose covariance matrix is analysed
    """
    assert isinstance(data_fn,str)
    assert isinstance(names,list)

    datafile = PyposmatDataFile()
    datafile.read(filename=data_fn)

    selected_columns = datafile.df[names]
    eigenvalues, eigenvectors = linalg.eig(np.cov(selected_columns.T))

    print("eigenvalues:\n",eigenvalues)
    print("eigenvectors:\n",eigenvectors)
def merge_pypospack_datafiles(datafile_fns):
    """Merge several data files into one, dropping duplicated rows.

    Args:
        datafile_fns (list of str): paths of the data files; the first file
            provides the returned object

    Returns:
        PyposmatDataFile: the object read from the first file, with ``df``
        replaced by the merged dataframe when more than one file is given
    """
    d0 = PyposmatDataFile()
    d0.read(filename=datafile_fns[0])

    frames = [d0.df]
    for fn in datafile_fns[1:]:
        print("merging {}...".format(fn))
        d = PyposmatDataFile()
        d.read(filename=fn)
        frames.append(d.df)

    # One concatenation followed by one de-duplication keeps the same first
    # occurrences as merging pairwise, without re-copying the accumulated
    # frame for every file.
    if len(frames) > 1:
        d0.df = pd.concat(frames).drop_duplicates().reset_index(drop=True)
    return d0
class PyposmatPostProcessorTestHarness(object):
    """Hold a configuration and a data file for post-processor tests.

    Args:
        configuration_fn (str or None): path of the configuration file; when
            None, ``configuration`` stays None
        datafile_fn (str or None): path of the data file; when None,
            ``datafile`` stays None
    """

    def __init__(self, configuration_fn, datafile_fn):
        self.configuration_fn = configuration_fn
        self.datafile_fn = datafile_fn

        # always define both attributes so they can be inspected even when
        # no filename was supplied
        self.configuration = None
        self.datafile = None

        if configuration_fn is not None:
            self.configuration = PyposmatConfigurationFile()
            self.configuration.read(configuration_fn)

        if datafile_fn is not None:
            self.datafile = PyposmatDataFile()
            self.datafile.read(filename=datafile_fn)

    def get_parameter_names(self):
        """Return the parameter names of the loaded configuration."""
        return self.configuration.parameter_names
def show_qoi_targets(config_fn, data_fn):
    """Print each qoi target next to the mean of that qoi in the data file.

    Args:
        config_fn (str): path of the configuration file
        data_fn (str): path of the data file
    """
    configuration = PyposmatConfigurationFile()
    configuration.read(filename=config_fn)

    datafile = PyposmatDataFile()
    datafile.read(filename=data_fn)

    row_format = "{:20} {:10} {:10}"
    for qoi_name, qoi_target in configuration.qoi_targets.items():
        try:
            qoi_avg = datafile.df[qoi_name].mean()
        except KeyError:
            # the qoi has no column in the data file
            qoi_avg = 'no value'
        print(row_format.format(qoi_name,qoi_target,qoi_avg))
class BaseAnalysis(object):
    """Common setup for analyses: configuration, data and an output path.

    Args:
        configuration (str or PyposmatConfigurationFile): path or object
        data (str or PyposmatDataFile): path or object; an object is
            deep-copied before normalized errors are added to it
        output_path (str or None): directory that is (re)created empty
    """

    def __init__(self, configuration, data, output_path=None):
        self.configuration = None
        self.data = None
        self.output_path = None

        self._initialize_configuration(configuration=configuration)
        self._initialize_data(data=data)
        self._initialize_output_path(path=output_path)

    def _initialize_configuration(self, configuration):
        """Set ``self.configuration`` from a path or a configuration object."""
        if isinstance(configuration, str):
            assert os.path.isfile(configuration)
            self.configuration = PyposmatConfigurationFile()
            self.configuration.read(filename=configuration)
            return
        if isinstance(configuration, PyposmatConfigurationFile):
            self.configuration = configuration
            return
        type_name = str(type(configuration))
        raise TypeError('configuration cannot be type:{}'.format(type_name))

    def _initialize_data(self, data):
        """Set ``self.data`` and add errors normalized by the qoi targets."""
        if isinstance(data, str):
            assert os.path.isfile(data)
            self.data = PyposmatDataFile()
            self.data.read(filename=data)
        elif isinstance(data, PyposmatDataFile):
            # work on a copy so the caller's object is left untouched
            self.data = deepcopy(data)
        else:
            type_name = str(type(data))
            raise TypeError('data cannot be type:{}'.format(type_name))

        qoi_targets = self.configuration.qoi_targets
        self.data.create_normalized_errors(
            normalize_type='by_qoi_target',
            qoi_targets=qoi_targets)

    def _initialize_output_path(self, path):
        """Create ``path`` as an empty directory, removing an existing one."""
        if path is None:
            self.output_path = None
            return
        if not isinstance(path, str):
            raise TypeError

        if os.path.isdir(path):
            shutil.rmtree(path)
        os.mkdir(path)
        self.output_path = path
def get_parameter_variance(
        config_fn,data_fn,
        metric_name='d_metric',
        n=100,
        o_config=None,
        o_data=None):
    """Return the spread of each parameter over the ``n`` best candidates.

    The value stored for every parameter is the result of
    ``DataFrame.std(axis=0)`` on the sub-selected parameter dataframe.

    Args:
        config_fn (str): path of the configuration file
        data_fn (str): path of the data file
        metric_name (str): (default:d_metric) only 'd_metric' is supported
        n (int): the number of best metric values
        o_config (pypospack.config.data.PyposmatConfigurationFile):
            accepted for interface compatibility; not used here
        o_data (pypospack.config.data.PyposmatDataFile):
            accepted for interface compatibility; not used here
    Returns:
        collections.OrderedDict: parameter name -> standard deviation
    Raises:
        PyposmatUnsupportedPotentialScoringMetric: for any other metric name
    """
    _analyzer = PyposmatDataAnalyzer()
    _analyzer.read_configuration_file(filename=config_fn)
    _analyzer.read_data_file(filename=data_fn)

    # calculate the scoring metric
    # NOTE: compare by value; ``is`` on a string literal tests identity and
    # only works by accident of interning.
    if metric_name == 'd_metric':
        _df = _analyzer.calculate_d_metric(df=_analyzer.datafile.df)
    else:
        s = "The metric name {} is unsupported"
        s = s.format(metric_name)
        raise PyposmatUnsupportedPotentialScoringMetric(s)

    _data = PyposmatDataFile()
    _data.read(filename=data_fn)
    _data.df = _df
    _data.subselect_by_score(score_name='d_metric',n=n)

    # convert once instead of once per parameter
    _param_std = _data.sub_parameter_df.std(axis=0).to_dict()

    _parameter_std_dict = OrderedDict()
    for pn in _analyzer.parameter_names:
        _parameter_std_dict[pn] = _param_std[pn]
    return _parameter_std_dict
def make_rug_plot(config_fn, data_fn, ax=None, plot_fn='rugplot.png'):
    """Draw a rug plot of the normalized qoi errors and save it.

    Every qoi gets one horizontal row of tick markers located at
    ``error / target``.

    Args:
        config_fn (str): path of the configuration file (qoi targets)
        data_fn (str): path of the data file
        ax (matplotlib axes or None): axes to draw on; a new figure is
            created when None
        plot_fn (str): filename the figure is saved to
    """
    o_config = PyposmatConfigurationFile()
    o_config.read(filename=config_fn)

    o_data = PyposmatDataFile()
    o_data.read(filename=data_fn)

    qoi_targets = o_config.qoi_targets
    error_names = o_data.error_names
    qoi_names = o_data.qoi_names

    # create normalized error
    df = copy.deepcopy(o_data.df[error_names])
    for qn in qoi_names:
        en = "{}.err".format(qn)
        nen = "{}.nerr".format(qn)
        q = qoi_targets[qn]
        # error relative to the target; the previous expression additionally
        # subtracted the target from this ratio
        df[nen] = o_data.df[en] / q

    _nrows = o_data.df.shape[0]

    if ax is None:
        fig, ax = plt.subplots(nrows=1,ncols=1)
    else:
        # a caller-supplied axes already belongs to a figure; without this
        # ``fig`` was undefined at save time
        fig = ax.get_figure()

    for iq,qn in enumerate(qoi_names):
        ax.scatter(
            df["{}.nerr".format(qn)],
            _nrows*[iq+1],
            marker='|',
            s=100.,
            color='k'
        )

    plt.sca(ax)
    plt.yticks(range(len(qoi_names)+1),['']+qoi_names)
    fig.savefig(plot_fn)
# Locate the Si Stillinger-Weber example inputs inside the pypospack tree.
pypospack_root_dir = pypospack.utils.get_pypospack_root_directory()
config_fn = os.path.join(
    pypospack_root_dir,
    'data','Si__sw__data','pareto_optimization_unconstrained',
    'pyposmat.config.in')
data_fn = os.path.join(
    pypospack_root_dir,
    'data','Si__sw__data','pareto_optimization_unconstrained',
    'pyposmat.kde.20.out')

# Load the configuration and the data file.
o_config = PyposmatConfigurationFile()
o_config.read(filename=config_fn)
o_data = PyposmatDataFile()
o_data.read(filename=data_fn)

# Settings for the manifold learning step.
manifold_learn_config = OrderedDict()
manifold_learn_config['manifold_type'] = 'tsne'
manifold_learn_config['pypospack_config_fn'] = config_fn
manifold_learn_config['pypospack_data_fn'] = data_fn

# one row of three axes
fig,ax = plt.subplots(1,3)

# With manifold_type set to 'tsne' above, this branch is not executed.
# NOTE(review): ``manifold`` is not assigned anywhere in the visible code;
# this branch would fail unless it is defined elsewhere - confirm.
if manifold_learn_config['manifold_type'] == 'mds':
    manifold['config'] = OrderedDict()
    manifold['config']['n_components'] = 2
    manifold['config']['max_iter'] = 1000
    manifold['config']['n_init'] = 1
import os

_n_potentials = 30
# Build the paths with os.path.join: the previous literals embedded a
# backslash, which is an invalid escape sequence ("\p") and only worked as a
# separator on Windows.
_data_fn = os.path.join("data__Ni__eam__born_exp_bjs_01",
                        "pyposmat.results.0.out")
#_data_fn = "results.temp.out"
_config_fn = os.path.join("data__Ni__eam__born_exp_bjs_01",
                          "pyposmat.config.in")
_plot_fn = "rugplot.png"
make_rugplots = False

print(80*'-')
print("reading the configuration file {}...".format(_config_fn))
config=PyposmatConfigurationFile()
config.read(filename=_config_fn)
qoi_targets=get_qoi_targets(config)

print("reading the data file {}...".format(_data_fn))
datafile=PyposmatDataFile()
datafile.read(filename=_data_fn)

from pypospack.pareto import pareto

# work on a copy so the dataframe held by the data file stays unchanged
df = copy.deepcopy(datafile.df)
nr,nc = df.shape
_nsimulations = OrderedDict()
_nsimulations['start'] = nr

# add one absolute-error column per qoi
abs_error_names = ["{}.abserr".format(q) for q in datafile.qoi_names]
for q in datafile.qoi_names:
    qe = "{}.err".format(q)
    qne = "{}.abserr".format(q)
    df[qne] = df[qe].abs()

names = list(df.columns.values)
abs_err_idx = [names.index(n) for n in abs_error_names]
# the absolute errors are handed to pareto() as a list of rows
pareto_idx = pareto(df[abs_error_names].values.tolist())
class PyposmatParallelCoordinates(object):
    """Hold a configuration and a data file for a parallel coordinates plot.

    ``configuration`` and ``data`` are properties whose setters assert the
    type of the assigned object.
    """

    def __init__(self):
        self._configuration = PyposmatConfigurationFile()
        self._data = PyposmatDataFile()

    def set_configuration(self, configuration):
        """Set the configuration from a path (str) or a configuration object.

        Raises:
            TypeError: for any other argument type
        """
        if isinstance(configuration, str):
            self.set_configuration_by_path(path=configuration)
        elif isinstance(configuration, PyposmatConfigurationFile):
            # this used to call a method name that does not exist
            self.set_configuration_by_obj(config_obj=configuration)
        else:
            raise TypeError

    def set_data(self, data):
        """Set the data from a path (str) or a data file object.

        Raises:
            TypeError: for any other argument type
        """
        if isinstance(data, str):
            self.data = PyposmatDataFile()
            self.data.read(data)
        elif isinstance(data, PyposmatDataFile):
            self.data = data
        else:
            raise TypeError

    def set_configuration_by_path(self, path):
        """Read the configuration file at ``path``."""
        assert isinstance(path, str)
        self.configuration = PyposmatConfigurationFile()
        self.configuration.read(path)

    def set_configuration_by_obj(self, config_obj):
        """Use an already loaded configuration object."""
        assert isinstance(config_obj, PyposmatConfigurationFile)
        self.configuration = config_obj

    # name used by earlier code in set_configuration; kept as an alias
    set_configuration_by_object = set_configuration_by_obj

    def set_data_by_path(self, path):
        """Read the data file at ``path``."""
        assert isinstance(path, str)
        self.data = PyposmatDataFile()
        self.data.read(path)

    def set_data_by_obj(self, data_obj):
        """Use an already loaded data file object."""
        assert isinstance(data_obj, PyposmatDataFile)
        self.data = data_obj

    @property
    def configuration(self):
        return self._configuration

    @configuration.setter
    def configuration(self, configuration):
        assert isinstance(configuration, PyposmatConfigurationFile)
        self._configuration = configuration

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data):
        # validate the value being assigned (the check used to reference an
        # undefined name)
        assert isinstance(data, PyposmatDataFile)
        self._data = data

    def plot(self, path):
        """Validate the output path; no plotting is implemented here."""
        assert isinstance(path, str)
config.read(filename=config_fn) except FileNotFoundError as e: msg = "Cannot find pyposmat configuration file:{}".format(config_fn) message_out(msg) if pyposmat_config_script is not None: # run configuration script pass else: msg = "cannot find pyposmat configuration script because pyposmat_config_script variable was not set" message_out(msg) raise # read the data file datafile = PyposmatDataFile() datafile.read(filename=datafile_fn) (nrows, ncols) = datafile.df.shape msg = "reading data file....\n" msg += "\t{}\n".format(datafile_fn) msg += "the data file has...\n" msg += "\t{} nrows\n".format(nrows) msg += "\t{} ncols\n".format(ncols) message_out(msg) # set the plot filename plot_fn = "rugplots_MgO_buck.png" # check to see if we have excluded names qoi_excluded_names = [] qoi_names = [q for q in config.qoi_names if q not in qoi_excluded_names]
import numpy as np
import pandas as pd
from pypospack.pyposmat.data import PyposmatDataFile

data_fn = "pyposmat.results.0.out"

# A pandas dataframe can be converted into a numpy array (in MATLAB this is
# a pain).
# TODO: figure out how to do a PCA plot; look for examples in the
# scikit-learn package (in a different file).
data = PyposmatDataFile()
data.read(filename=data_fn)

# TODO: look up how to do a kernel density estimate; heat maps are a good way
# to show it.
# wiki.materialsexmachina.com/index.php/Kernel_Density_Estimate

# PCA chooses axes that are linear combinations of the parameters.  The first
# axis is the direction of greatest variance; the second axis is orthogonal to
# the first and describes the greatest amount of variance in a direction
# orthogonal to the first PCA vector.

# data.df is a pandas dataframe; this is the data structure.  Open question:
# how to get the smallest values, so the data can be searched with that
# criterion.

# Data information
print('Data structure =')
print(type(data.df))
print('Shape =')
print(data.df.shape)
print('Columns = ')
print(list(data.df.columns.values))
print('Parameter names =')
print(data.parameter_names)
print('QOI names =')
def run_file_sampling(self, filename_in):
    """Evaluate the parameter sets stored in a data file.

    Rows are assigned to MPI ranks round-robin by row position.  For every
    row handled by this rank the parameters are read, equilibrium-density
    parameters are recomputed, the parameter constraints are checked and
    the parameter set is evaluated.  Failures of the handled exception
    types are written to the bad-parameters file; successful results are
    written to the output data file.

    Args:
        filename_in (str): path of the data file holding the parameter sets
    """
    _datafile_in = PyposmatDataFile(filename=filename_in)
    _datafile_in.read()

    self.write_data_out_header()
    self.write_badparameters_header()

    time_start_iteration = time.time()
    _n_errors = 0
    # number of rows this rank has processed; kept separate from the row
    # position, which alone decides the rank assignment (a single shared
    # counter used to be advanced twice per processed row)
    n_samples_completed = 0

    for i_row, (_, row) in enumerate(_datafile_in.df.iterrows()):
        if self.mpi_rank != i_row % self.mpi_size:
            continue

        _parameters = OrderedDict([(p, row[p]) for p in self.parameter_names])
        _sim_id = row['sim_id']

        # parameters defined as ['equals', ['equilibrium_density', a0, latt]]
        # are recomputed from the other parameters
        for p in self.constrained_parameter_names:
            _definition = self.parameter_distribution_definition[p]
            if _definition[0] != 'equals':
                continue
            if type(_definition[1]) is not list:
                continue
            if _definition[1][0] == 'equilibrium_density':
                a0 = _definition[1][1]
                latt = _definition[1][2]
                _parameters[p] = self.calculate_equilibrium_density(
                    a0, latt, _parameters)

        try:
            # check constraints
            # NOTE(review): the constraint strings are passed to eval();
            # they must come from a trusted configuration.
            for k, v in self.parameter_constraints.items():
                _eval_str = v
                for pn, pv in _parameters.items():
                    _eval_str = _eval_str.replace(pn, str(pv))
                if eval(_eval_str) is False:
                    raise PyposmatBadParameterError()

            _results = self.evaluate_parameter_set(parameters=_parameters)
        except (PyposmatBadParameterError,
                LammpsSimulationError,
                PypospackTaskManagerError,
                PypospackBadEamEosError) as e:
            # report under the sim_id read from the row (the handlers used
            # to reference an undefined name)
            self.pyposmat_badparameters.write_simulation_exception(
                sim_id=_sim_id, exception=e)
            _n_errors += 1
        else:
            if type(_sim_id) is float:
                _sim_id = int(_sim_id)

            self.pyposmat_datafile_out.write_simulation_results(
                filename=self.pyposmat_data_out_filename,
                sim_id=_sim_id,
                results=_results)
        finally:
            # print out summaries every 10 solutions
            n_samples_completed += 1
            if n_samples_completed % 10 == 0:
                time_end = time.time()
                time_total = time_end - time_start_iteration
                avg_time = time_total / n_samples_completed
                _str_msg = '{} samples completed in {:.4f}s. Avg_time = {:.4f}. n_errors = {}'.format(
                    n_samples_completed, time_total, avg_time, _n_errors)
                print('rank{}:'.format(self.mpi_rank) + _str_msg)
def run_kde_sampling(self, n_samples, filename_in, cluster_id=None, kde_bw_type='chiu1999'):
    """ sample from a KDE distribution

    A Gaussian KDE is built over the free parameters found in the data
    file.  For each sample the free parameters are drawn from the KDE, the
    constrained parameters are derived, the inequality constraints are
    checked and the parameter set is evaluated.  Results are written to the
    output data file; handled failures are written to the bad-parameters
    file.

    Args:
        n_samples(int): the number of samples to draw from the KDE distribution
        filename_in(str): the path to the datafile from which the parameters will be drawn from
        cluster_id(int): if we need to use a specific cluster_id, we specify it here.
            otherwise, it will be drawn from all parameters contained within the set.
        kde_bw_type(str): the method of estimating the optimal bandwidth
    """
    # NOTE(review): the block structure below was reconstructed from source
    # whose indentation was lost; verify the nesting against the original.
    _datafile_in = PyposmatDataFile()
    _datafile_in.read(filename_in)

    if cluster_id is None:
        _free_parameter_names = [str(v) for v in self.free_parameter_names]
        _X = _datafile_in.df[_free_parameter_names].values.T
    else:
        # subselect the dataframe by the cluster_id of interest
        _datafile_in.df = _datafile_in.df.loc[_datafile_in.df['cluster_id'] == cluster_id]
        _X = _datafile_in.df[self.free_parameter_names].loc[
            _datafile_in.df['cluster_id'] == cluster_id].values.T
        # self.log.write("cluster_id {c} _X.shape={x}".format(c=cluster_id, x=_X.shape))

    # the bandwidth is passed as the second positional argument of
    # gaussian_kde
    kde_bw = self.determine_kde_bandwidth(X=_X, kde_bw_type=kde_bw_type)
    _rv_generator = scipy.stats.gaussian_kde(_X, kde_bw)

    self.write_data_out_header()
    self.write_badparameters_header()

    time_start_iteration = time.time()
    _n_errors = 0

    for i_sample in range(n_samples):
        # determine sim_id
        sim_id = self.get_sim_id(i=i_sample)

        # new OrderedDict to hold in parameter values
        _parameters = OrderedDict([(p, None) for p in self.parameter_names])

        # generate free parameters for ordered dictionary
        _free_parameters = _rv_generator.resample(1)
        for i, v in enumerate(self.free_parameter_names):
            _parameters[v] = float(_free_parameters[i, 0])

        # determine parameters determined from equality constraints
        for p in self.constrained_parameter_names:
            _constraint_type = self.parameter_distribution_definition[p][0]
            if _constraint_type == 'equals':
                # this condition is for fitting EoS for EAM function which
                # requires a reference ground state crystal structure
                if p.endswith('latticetype'):
                    _v = self.parameter_distribution_definition[p][1]
                    _parameters[p] = _v
                # process evaluation strings
                elif type(self.parameter_distribution_definition[p][1]) is not list:
                    _str_eval = str(
                        self.parameter_distribution_definition[p][1])
                    # replace string values with numerical values
                    for fp in self.free_parameter_names:
                        if fp in _str_eval:
                            _str_eval = _str_eval.replace(
                                fp, str(_parameters[fp]))
                    # evaluate the string into a float
                    # NOTE(review): eval() on a configuration string; the
                    # configuration must be trusted.
                    _parameters[p] = eval(_str_eval)
            # NOTE(review): this ``else`` is attached to the constraint-type
            # check because list-valued 'equals' definitions are handled by
            # the next loop; confirm against the original indentation.
            else:
                raise ValueError("oops")

        for p in self.constrained_parameter_names:
            if self.parameter_distribution_definition[p][0] == 'equals':
                # some EAM potentials have a normalizing equilibrium density
                # which have to be determined based upon the parameterization of
                # the electron density function
                if type(self.parameter_distribution_definition[p][1]) is list:
                    if self.parameter_distribution_definition[p][1][0] == 'equilibrium_density':
                        a0 = self.parameter_distribution_definition[p][1][1]
                        latt = self.parameter_distribution_definition[p][1][2]
                        _parameters[p] = self.calculate_equilibrium_density(
                            a0, latt, _parameters)

        try:
            # now we check parameter inequality constraints
            for k, v in self.parameter_constraints.items():
                _eval_str = v
                # substitute every parameter name with its value before the
                # constraint expression is evaluated
                for pn, pv in _parameters.items():
                    _eval_str = _eval_str.replace(pn, str(pv))
                if eval(_eval_str) is False:
                    s = 'parameter constraint failed, {}'.format(k)
                    raise PyposmatBadParameterError(s, parameters=_parameters)
            _results = self.evaluate_parameter_set(parameters=_parameters)
        except PyposmatBadParameterError as e:
            self.pyposmat_badparameters.write_simulation_exception(
                sim_id=sim_id, exception=e)
            _n_errors += 1
        except LammpsSimulationError as e:
            assert isinstance(self.pyposmat_badparameters,
                              PyposmatBadParametersFile)
            assert isinstance(self.pyposmat_badparameters.parameter_names,
                              list)
            self.pyposmat_badparameters.write_simulation_exception(
                sim_id=sim_id, exception=e)
            _n_errors += 1
        except PypospackTaskManagerError as e:
            self.pyposmat_badparameters.write_simulation_exception(
                sim_id=sim_id, exception=e)
            _n_errors += 1
        except PypospackBadEamEosError as e:
            self.pyposmat_badparameters.write_simulation_exception(
                sim_id=sim_id, exception=e)
            _n_errors += 1
        else:
            # determine sim_id
            # NOTE(review): ``_sim_id`` is not used afterwards, and results
            # are written under ``i_sample`` while failures are reported
            # under ``sim_id`` from get_sim_id - confirm this is intended.
            _sim_id = int(i_sample)
            self.pyposmat_datafile_out.write_simulation_results(
                filename=self.pyposmat_data_out_filename,
                sim_id=i_sample,
                cluster_id=cluster_id,
                results=_results)
        finally:
            # print out summaries every 10 solutions
            if (i_sample + 1) % 10 == 0:
                n_samples_completed = i_sample + 1
                time_end = time.time()
                time_total = time_end - time_start_iteration
                avg_time = time_total / n_samples_completed
                _str_msg = 'R{}:{} samples completed in {:.4f}s. Avg_time = {:.4f}. n_errors = {}'.format(
                    self.mpi_rank, n_samples_completed, time_total,
                    avg_time, _n_errors)
                self.log(_str_msg)

    # NOTE(review): ``d`` is built from attributes on self but is neither
    # returned nor stored in the visible code.
    d = OrderedDict()
    d['kde_bandwidth'] = OrderedDict()
    d['kde_bandwidth']['type'] = self.kde_bw_type
    d['kde_bandwidth']['h'] = self.kde_bw
class PyposmatPipeline(object):
    """Run a configured sequence of analysis segments over one dataframe.

    Every step spawns a segment object, hands it the current dataframe and
    name lists, calls the configured functions on it and copies the
    segment's dataframe and name lists back onto the pipeline.
    """

    # attributes exchanged between the pipeline and a segment, in the order
    # in which they are copied
    _SEGMENT_STATE = (
        'df',
        'parameter_names',
        'error_names',
        'qoi_names',
        'n_parameter_names',
        'n_error_names',
        'n_qoi_names',
        'pca_names',
        'manifold_names',
    )

    def __init__(self, o_logger=None, configuration_fn=None, data_fn=None, df=None):
        self.o_logger = o_logger  # logging file object

        self.configuration_fn = configuration_fn
        self.configuration = None

        self.data_fn = data_fn
        self.data = None
        self.df = df

        self.parameter_names = None
        self.error_names = None
        self.qoi_names = None
        # normalized counterparts of the three lists above
        self.n_parameter_names = None
        self.n_error_names = None
        self.n_qoi_names = None
        self.pca_names = None
        self.manifold_names = None

    def read_configuration(self, filename):
        """Load the pipeline configuration from a YAML file."""
        with open(filename, 'r') as f:
            self.configuration = yaml.load(f, OrderedDictYAMLLoader)

    def write_configuration(self, filename, d):
        """Dump ``d`` to ``filename`` as block-style YAML."""
        with open(filename, 'w') as f:
            yaml.dump(d, f, default_flow_style=False)

    def read_data(self, filename):
        """Read a data file and take over its dataframe and name lists."""
        datafile = PyposmatDataFile()
        self.data = datafile
        datafile.read(filename)
        self.df = datafile.df
        self.parameter_names = datafile.parameter_names
        self.error_names = datafile.error_names
        self.qoi_names = datafile.qoi_names

    def log(self, msg):
        """Write ``msg`` to the logger, or print it when there is none."""
        if self.o_logger is not None:
            self.o_logger.write(msg)
        else:
            print(msg)

    def reset(self, o_segment):
        """Copy the dataframe and name lists of a segment onto the pipeline."""
        assert isinstance(o_segment, BasePipeSegment)
        for attribute in self._SEGMENT_STATE:
            setattr(self, attribute, getattr(o_segment, attribute))

    def spawn_pipeline_segment(self, segment_type):
        """Create a segment of the given type, primed with the pipeline state.

        Raises:
            ValueError: if ``segment_type`` is not known
        """
        # the segment modules are imported only when they are needed
        if segment_type == 'preprocess':
            from pypospack.pyposmat.data.preprocess import PyposmatPreprocessor as segment_class
        elif segment_type == 'pca':
            from pypospack.pyposmat.data.pca_analysis import PyposmatPcaAnalysis as segment_class
        elif segment_type == 'cluster':
            from pypospack.pyposmat.data.cluster_analysis import SeatonClusterAnalysis as segment_class
        elif segment_type == 'manifold':
            from pypospack.pyposmat.data.manifold_analysis import PyposmatManifoldAnalysis as segment_class
        elif segment_type == 'plot':
            from pypospack.pyposmat.data.plotting import PyposmatPlotter as segment_class
        else:
            raise ValueError("unknown segment type")

        o_segment = segment_class()
        o_segment.o_logger = self.o_logger
        for attribute in self._SEGMENT_STATE:
            setattr(o_segment, attribute, getattr(self, attribute))
        return o_segment

    def make_function_calls(self, o_segment, calls):
        """Call each configured function on the segment with its kwargs."""
        for index in calls:
            function_name = calls[index]['function']
            self.log("calling function {}".format(function_name))
            func = getattr(o_segment, calls[index]['function'])
            kwargs = calls[index]['args']
            func(**kwargs)

    def run(self):
        """Execute every configured step and log the elapsed times."""
        pipeline_start_time = time.time()

        for index in self.configuration:
            # +1 so that the first step is reported as step 1
            step_number = index + 1
            self.log("starting step {} of {}".format(
                step_number, len(self.configuration)))
            step_start_time = time.time()

            step = self.configuration[index]
            o_segment = self.spawn_pipeline_segment(step['segment_type'])
            self.make_function_calls(
                o_segment=o_segment,
                calls=self.configuration[index]['function_calls'])
            self.reset(o_segment)

            step_delta = round(time.time() - step_start_time, 4)
            self.log("step {} complete in {} seconds".format(
                index + 1, step_delta))

        pipeline_delta = round(time.time() - pipeline_start_time, 4)
        self.log("pipeline complete in {} seconds\n".format(pipeline_delta))
def dev__read():
    """Development helper: read a pyposmat data file and discard the result.

    The path comes from the name ``configuration_filename``, which is not
    defined inside this function and therefore has to exist at module scope
    when the helper is called.
    """
    from pypospack.pyposmat.data import PyposmatDataFile as DataFile

    DataFile().read(filename=configuration_filename)
class PyposmatBokehVisualizer(object):
    """Bokeh application with two linked scatter plots of a pyposmat data file.

    One plot shows a pair of parameter columns, the other a pair of error
    columns.  Both plots draw from the same data source.  Drop-downs choose
    the plotted columns, text boxes set the axis limits, and the rows of the
    current selection are written to ``selected_points.txt``.

    Typical use: ``read_data(...)`` followed by ``start_bokeh_server()``.
    """

    def __init__(self):
        bokeh_tools = ['box_select', 'reset', 'box_zoom', 'pan']
        self.bokeh_tools = ', '.join(bokeh_tools)

    def read_configuration(self, filename):
        """Read the pyposmat configuration file ``filename``."""
        self.configuration = PyposmatConfigurationFile()
        self.configuration.read(filename=filename)

    def read_data(self, filename):
        """Read the data file and build the per-category and combined frames.

        Sets the name lists (``parameter_names``/``param_names``,
        ``qoi_names``, ``error_names``/``err_names``), prints each list, and
        stores copies of the matching columns in ``param_df``, ``qoi_df``,
        ``err_df`` and their column-wise concatenation ``total_df``.
        """
        self.datafile = PyposmatDataFile()
        self.datafile.read(filename=filename)

        self.parameter_names = list(self.datafile.parameter_names)
        self.qoi_names = list(self.datafile.qoi_names)
        self.error_names = list(self.datafile.error_names)

        # Short aliases; these are the names the plotting code works with.
        self.param_names = list(self.datafile.parameter_names)
        self.err_names = list(self.datafile.error_names)

        listings = (
            ("parameter names", self.param_names),
            ("qoi names", self.qoi_names),
            ("error_names", self.err_names),
        )
        for title, names in listings:
            print(title)
            print(type(names))
            for i, v in enumerate(names):
                print("{:3} {:<20}".format(i, v))

        # generate pandas dataframes
        self.param_df = copy.deepcopy(self.datafile.df[self.param_names])
        self.qoi_df = copy.deepcopy(self.datafile.df[self.qoi_names])
        self.err_df = copy.deepcopy(self.datafile.df[self.err_names])
        self.total_df = pd.concat(
            [self.param_df, self.qoi_df, self.err_df], axis=1)

    def update_data(self, param_x, param_y, err_x, err_y):
        """Point the four plotted series at the given ``total_df`` columns.

        The chosen columns are copied into the ``param_x``, ``param_y``,
        ``err_x`` and ``err_y`` columns of ``total_df`` and pushed to the
        data source.
        """
        self.total_df['param_x'] = self.total_df[param_x]
        self.total_df['param_y'] = self.total_df[param_y]
        self.total_df['err_x'] = self.total_df[err_x]
        self.total_df['err_y'] = self.total_df[err_y]
        self.source.data = dict(param_x=self.total_df['param_x'],
                                param_y=self.total_df['param_y'],
                                err_x=self.total_df['err_x'],
                                err_y=self.total_df['err_y'])

    def nix(self, val, lst):
        """Return a copy of ``lst`` without any element equal to ``val``."""
        return [x for x in lst if x != val]

    def _build_graph(self, names, x_column, y_column):
        """Create the widgets and figure of one scatter plot.

        Args:
            names: selectable column names; at least two are required because
                the first two are the initial x and y choices.
            x_column: data-source column drawn on the x axis.
            y_column: data-source column drawn on the y axis.

        Returns:
            dict holding the selects, min/max text inputs, plot settings,
            figure and glyph under the keys used throughout this class.
        """
        graph = {}
        # Each select leaves out the initial choice of the other axis.
        graph['obj_x_select'] = Select(value=names[0],
                                       options=self.nix(names[1], names))
        graph['obj_y_select'] = Select(value=names[1],
                                       options=self.nix(names[0], names))
        graph['x_min_entry'] = TextInput(placeholder='Min X Value', value='')
        graph['x_max_entry'] = TextInput(placeholder='Max X Value', value='')
        graph['y_min_entry'] = TextInput(placeholder='Min Y Value', value='')
        graph['y_max_entry'] = TextInput(placeholder='Max Y Value', value='')
        graph['plot_width'] = 610
        graph['plot_height'] = 400
        graph['tools'] = self.bokeh_tools

        x_name = graph['obj_x_select'].value
        y_name = graph['obj_y_select'].value
        graph['obj_figure'] = figure(plot_width=graph['plot_width'],
                                     plot_height=graph['plot_height'],
                                     tools=graph['tools'],
                                     title=x_name + ' vs. ' + y_name)
        graph['obj_figure'].xaxis.axis_label = x_name
        graph['obj_figure'].yaxis.axis_label = y_name
        graph['obj_glyph'] = Circle(x=x_column, y=y_column, size=1,
                                    fill_color='#5F77D5',
                                    line_color='#5F77D5')
        graph['obj_figure'].add_glyph(self.source, graph['obj_glyph'])
        return graph

    @staticmethod
    def _build_pane(graph):
        """Stack the selects, figure and axis-limit inputs of one graph."""
        selects = bokeh.layouts.row(graph['obj_x_select'],
                                    graph['obj_y_select'])
        x_entry = bokeh.layouts.row(graph['x_min_entry'],
                                    graph['x_max_entry'])
        y_entry = bokeh.layouts.row(graph['y_min_entry'],
                                    graph['y_max_entry'])
        return bokeh.layouts.column(selects, graph['obj_figure'],
                                    x_entry, y_entry)

    def _connect_select_callbacks(self, graph, x_column, y_column):
        """Re-plot and re-label a graph whenever one of its selects changes."""

        def x_select_change(attrname, old, new):
            self.source.data[x_column] = self.total_df[new]
            graph['obj_figure'].title.text = (
                new + ' vs. ' + graph['obj_y_select'].value)
            graph['obj_figure'].xaxis.axis_label = new

        def y_select_change(attrname, old, new):
            self.source.data[y_column] = self.total_df[new]
            graph['obj_figure'].title.text = (
                graph['obj_x_select'].value + ' vs. ' + new)
            graph['obj_figure'].yaxis.axis_label = new

        graph['obj_x_select'].on_change('value', x_select_change)
        graph['obj_y_select'].on_change('value', y_select_change)

    @staticmethod
    def _make_range_callback(graph, range_name, bound):
        """Build a callback that sets one bound of one axis range of a graph.

        ``range_name`` is ``'x_range'`` or ``'y_range'`` and ``bound`` is
        ``'start'`` or ``'end'``.  The entered text is converted with
        ``float``, so non-numeric input raises ValueError inside the callback.
        """

        def range_callback(attrname, old, new):
            axis_range = getattr(graph['obj_figure'], range_name)
            setattr(axis_range, bound, float(new))

        return range_callback

    def _connect_range_callbacks(self, graph):
        """Wire the four min/max text inputs of a graph to its axis ranges."""
        bindings = (
            ('x_min_entry', 'x_range', 'start'),
            ('x_max_entry', 'x_range', 'end'),
            ('y_min_entry', 'y_range', 'start'),
            ('y_max_entry', 'y_range', 'end'),
        )
        for entry_key, range_name, bound in bindings:
            graph[entry_key].on_change(
                'value',
                self._make_range_callback(graph, range_name, bound))

    def _write_selected_points(self, indices, filename='selected_points.txt'):
        """Write the selected rows of ``total_df`` as a space-delimited table.

        The header and every row cover the parameter, qoi and error columns,
        in that order.  The helper columns added by ``update_data`` are left
        out.

        Args:
            indices: positional row indices of the selected points.
            filename: output path; overwritten on every call.
        """
        columns = (list(self.param_names)
                   + list(self.qoi_names)
                   + list(self.err_names))
        selected = self.total_df.iloc[list(indices)][columns]
        with open(filename, 'w') as f:
            f.write(' '.join(columns) + '\n')
            for values in selected.values.tolist():
                f.write(' '.join(str(v) for v in values) + '\n')

    def setup_bokeh_frame(self, doc):
        """Build the widgets, plots and callbacks and add them to ``doc``."""
        self.source = ColumnDataSource(
            data=dict(param_x=[], param_y=[], err_x=[], err_y=[]))
        self.source_static = ColumnDataSource(
            data=dict(param_x=[], param_y=[], err_x=[], err_y=[]))

        self.param_graph = self._build_graph(
            self.param_names, 'param_x', 'param_y')
        self.err_graph = self._build_graph(
            self.err_names, 'err_x', 'err_y')

        layout = bokeh.layouts.row(self._build_pane(self.param_graph),
                                   self._build_pane(self.err_graph))
        doc.add_root(layout)

        # Fill the data source with the initially selected columns.
        self.update_data(self.param_graph['obj_x_select'].value,
                         self.param_graph['obj_y_select'].value,
                         self.err_graph['obj_x_select'].value,
                         self.err_graph['obj_y_select'].value)

        # callback functions
        self._connect_select_callbacks(self.param_graph, 'param_x', 'param_y')
        self._connect_select_callbacks(self.err_graph, 'err_x', 'err_y')

        def source_callback(attrname, old, new):
            self._write_selected_points(list(new['1d']['indices']))

        # NOTE(review): the 'selected' property with a ['1d']['indices']
        # payload looks like the legacy Bokeh selection interface -- confirm
        # it matches the Bokeh version this project is pinned to.
        self.source.on_change('selected', source_callback)

        self._connect_range_callbacks(self.param_graph)
        self._connect_range_callbacks(self.err_graph)

    def start_bokeh_server(self):
        """Serve the application at '/' and block in the server's IO loop."""
        self.bokeh_app = Application(FunctionHandler(self.setup_bokeh_frame))
        self.bokeh_server = Server({'/': self.bokeh_app}, num_procs=1)
        self.bokeh_server.start()

        # start io loop for bokeh_server
        self.bokeh_server.io_loop.add_callback(self.bokeh_server.show, '/')
        self.bokeh_server.io_loop.start()