def get_pd(self, chemsys=None):
    """
    Rebuild the phase data object from the current seed_data.

    Args:
        chemsys (str): chemical system used to restrict the seed data
            so that only a partial phase diagram is built. Dashes are
            stripped before the string is parsed as a composition.
            If falsy, all of seed_data is used.

    Returns:
        The freshly populated PhaseData, which is also stored on
        self.pd

    """
    self.pd = PhaseData()

    # Narrow the seed data to the requested chemical system, if any
    if not chemsys:
        relevant = self.seed_data
    else:
        system_comp = Composition(chemsys.replace('-', ''))
        relevant = filter_dataframe_by_composition(
            self.seed_data, system_comp)

    # One phase per seed row, labelled by the row index
    compound_phases = []
    for label, record in relevant.iterrows():
        compound_phases.append(
            Phase(
                record["Composition"],
                energy=record["delta_e"],
                per_atom=True,
                description=label,
            )
        )

    # Every element in ELEMENTS is added as a zero-energy phase
    elemental_phases = [Phase(el, 0.0, per_atom=True) for el in ELEMENTS]

    self.pd.add_phases(compound_phases + elemental_phases)
    return self.pd
def test_simulated(self):
    """
    Run a simulated ProtoDFTCampaign with an AdaBoost stability agent.

    The seed is the Mn-Ni-O-Sb subset of the ICSD-based dataset plus 30
    randomly sampled unrelated rows, and the campaign is expected to
    leave a finalized hull plot in the scratch directory.
    """
    exp_dataframe = pd.read_pickle(
        os.path.join(CAMD_TEST_FILES, "mn-ni-o-sb.pickle"))
    experiment = ATFSampler(exp_dataframe)
    # Candidates are every column except the last 11
    candidate_data = exp_dataframe.iloc[:, :-11]

    # Set up agents and loop parameters
    agent = AgentStabilityAdaBoost(
        model=MLPRegressor(hidden_layer_sizes=(84, 50)),
        n_query=2,
        hull_distance=0.2,
        exploit_fraction=1.0,
        uncertainty=True,
        alpha=0.5,
        diversify=True,
        n_estimators=20)
    analyzer = StabilityAnalyzer(hull_distance=0.2)

    # Reduce seed_data
    icsd_data = load_dataframe("oqmd1.2_exp_based_entries_featurized_v2")
    seed_data = filter_dataframe_by_composition(icsd_data, "MnNiOSb")
    leftover = ~icsd_data.index.isin(seed_data.index)

    # Add some random other data to test compositional flexibility.
    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    seed_data = pd.concat([seed_data, icsd_data.loc[leftover].sample(30)])
    del icsd_data

    with ScratchDir('.'):
        campaign = ProtoDFTCampaign(
            candidate_data=candidate_data,
            agent=agent,
            experiment=experiment,
            analyzer=analyzer,
            seed_data=seed_data,
            heuristic_stopper=5)
        campaign.autorun()
        # Checked inside the scratch dir, before it is cleaned up
        self.assertTrue(os.path.isfile('hull_finalized.png'))
def test_plot_hull(self):
    """
    Check that StabilityAnalyzer.plot_hull writes an image file for a
    two-element (Ti-O) and a three-element (Ti-N-O) dataset.
    """
    csv_path = os.path.join(CAMD_TEST_FILES, "test_df_analysis.csv")
    df = pd.read_csv(csv_path, index_col="id")
    df['Composition'] = df['formula']

    # Test 2D
    with ScratchDir('.'):
        analyzer = StabilityAnalyzer(hull_distance=0.1)
        binary_data = filter_dataframe_by_composition(df, "TiO")
        analyzer.plot_hull(
            binary_data,
            new_result_ids=["mp-685151", "mp-755875"],
            filename="hull.png",
        )
        self.assertTrue(os.path.isfile("hull.png"))

    # Test 3D, reusing the analyzer with a tighter hull distance
    with ScratchDir('.'):
        analyzer.hull_distance = 0.05
        ternary_data = filter_dataframe_by_composition(df, "TiNO")
        analyzer.plot_hull(
            ternary_data,
            new_result_ids=["mp-776280", "mp-30998"],
            filename="hull.png",
        )
        self.assertTrue(os.path.isfile("hull.png"))
def test_simulated(self):
    """
    Smoke-test a simulated ProtoDFTCampaign driven by a RandomAgent,
    seeded with the Mn-Ni-O-Sb subset of the ICSD-based dataset.
    """
    pickle_path = os.path.join(CAMD_TEST_FILES, "mn-ni-o-sb.pickle")
    exp_dataframe = pd.read_pickle(pickle_path)
    experiment = ATFSampler(exp_dataframe)
    # Candidates are every column except the last 11
    candidate_data = exp_dataframe.iloc[:, :-11]

    agent = RandomAgent(n_query=2)
    analyzer = StabilityAnalyzer(hull_distance=0.2)

    # Reduce seed_data
    full_seed = load_dataframe("oqmd1.2_exp_based_entries_featurized_v2")
    seed_data = filter_dataframe_by_composition(full_seed, "MnNiOSb")

    campaign_kwargs = dict(
        candidate_data=candidate_data,
        agent=agent,
        experiment=experiment,
        analyzer=analyzer,
        seed_data=seed_data,
        heuristic_stopper=5,
    )
    with ScratchDir('.'):
        campaign = ProtoDFTCampaign(**campaign_kwargs)
        campaign.autorun()
def test_analyze(self):
    """
    Check stabilities computed by StabilityAnalyzer.analyze for two
    held-out Ti-N-O entries after the rest of the data has been
    analyzed as the seed.
    """
    csv_path = os.path.join(CAMD_TEST_FILES, "test_df_analysis.csv")
    df = pd.read_csv(csv_path, index_col="id")
    df['Composition'] = df['formula']
    analyzer = StabilityAnalyzer(hull_distance=0.1)

    # TODO: resolve drop_duplicates filtering mp data
    tino_data = filter_dataframe_by_composition(df, "TiNO")
    tino_data = tino_data.drop_duplicates(keep='last').dropna()

    # Hold two entries back to play the role of new experimental results
    held_out_ids = ["mp-30998", "mp-572822"]
    held_out = tino_data.loc[held_out_ids]
    remaining = tino_data.drop(index=held_out_ids)

    # First pass: analyze everything else against an empty seed
    _, analyzed_seed = analyzer.analyze(
        new_experimental_results=remaining,
        seed_data=pd.DataFrame(),
    )
    # Second pass: add the held-out entries on top of that seed
    _, new_seed = analyzer.analyze(
        new_experimental_results=held_out,
        seed_data=analyzed_seed,
    )

    self.assertAlmostEqual(new_seed.loc['mp-30998', 'stability'], 0)
    self.assertAlmostEqual(
        new_seed.loc["mp-572822", 'stability'], 0.52784795)
    self.assertTrue(new_seed.loc['mp-30998', 'is_stable'])
    self.assertFalse(new_seed.loc["mp-572822", 'is_stable'])
def update_run_w_structure(folder, hull_distance=0.2, parallel=True):
    """
    Updates a campaign grouped in directories with structure analysis

    Recomputes stabilities for the campaign stored in ``folder``, writes
    "hull_finalized.png", and, when the aggregated results contain a
    "structure" column, also writes "discovered_unique_structures.json"
    and "structure_report.log" into that folder.

    Args:
        folder (str): directory of the campaign; it must contain
            "seed_data.pickle" (and "experiment.pickle") for any
            analysis to happen
        hull_distance (float): hull distance passed to StabilityAnalyzer
        parallel (bool): parallel flag passed to StabilityAnalyzer

    Returns:
        None

    """
    with cd(folder):
        required_files = ["seed_data.pickle"]
        if os.path.isfile("error.json"):
            error = loadfn("error.json")
            print("{} ERROR: {}".format(folder, error))

        if not all(os.path.isfile(fn) for fn in required_files):
            # The placeholder was previously printed unformatted
            print("{} ERROR: no seed data, no analysis to be done".format(
                folder))
            return

        # NOTE: pickle.load can execute arbitrary code; only run this on
        # campaign folders that come from a trusted source.
        with open("seed_data.pickle", "rb") as f:
            df = pickle.load(f)

        with open("experiment.pickle", "rb") as f:
            experiment = pickle.load(f)
            # Hack to update agg_history
            experiment.update_current_data(None)

        all_submitted, all_results = experiment.agg_history
        # Rows of the seed that did not come from this campaign's results
        old_results = df.drop(all_results.index, errors='ignore')
        new_results = df.drop(old_results.index)
        st_a = StabilityAnalyzer(
            hull_distance=hull_distance,
            parallel=parallel,
            entire_space=False,
            plot=False,
        )
        _, new_seed = st_a.analyze(new_results, old_results)

        # Having calculated stabilities again, we plot the overall hull.
        # Filter by chemsys
        new_comp = new_results['Composition'].sum()
        filtered = filter_dataframe_by_composition(new_seed, new_comp)
        st_a.plot_hull(
            filtered,
            all_submitted.index,
            filename="hull_finalized.png",
            finalize=True,
        )

        stable_discovered = new_seed[new_seed["is_stable"].fillna(False)]

        # Analyze structures if present in experiment
        if "structure" in all_results.columns:
            s_a = AnalyzeStructures()
            s_a.analyze_vaspqmpy_jobs(
                all_results, against_icsd=True, use_energies=True)

            # Keep structures that are both unique and stable
            unique_s_dict = {}
            for i, structure in enumerate(s_a.structures):
                structure_id = s_a.structure_ids[i]
                if s_a.structure_is_unique[i] and (
                        structure_id in stable_discovered.index):
                    unique_s_dict[structure_id] = structure

            with open("discovered_unique_structures.json", "w") as f:
                json.dump(
                    {k: s.as_dict() for k, s in unique_s_dict.items()}, f)

            # Counts are written in the same order as the header columns
            report_counts = [
                len(all_submitted),
                len(stable_discovered),
                len(unique_s_dict),
                len(s_a.structures) - sum(s_a._not_duplicate),
                sum(not i for i in s_a._icsd_filter),
            ]
            with open("structure_report.log", "w") as f:
                f.write(
                    "consumed discovery unique_discovery duplicate in_icsd \n"
                )
                f.write(" ".join(str(count) for count in report_counts))
def plot_hull(self, df, new_result_ids, filename=None, finalize=False):
    """
    Plot the convex hull for the given data and mark the new results
    on top of it.

    Args:
        df (DataFrame): dataframe with 'delta_e' and 'Composition'
            columns
        new_result_ids ([]): list of new result ids (i. e. indexes
            in the updated dataframe)
        filename (str): filename to output, if None, no file output
            is produced
        finalize (bool): if True, the prior hull is drawn dashed
            without stable labels, and a hull built from all
            entries (new results included) is overlaid and used to
            measure the stability of the new results

    Returns:
        None. The figure is saved when filename is given and is then
        closed. Returns early, without plotting, when the data spans
        more than four elements or when every entry is a new result.

    """
    # The sum of the Composition column defines the overall system
    system_comp = Composition(df['Composition'].sum())
    if len(system_comp) > 4:
        warnings.warn(
            "Number of elements too high for phase diagram plotting")
        return None

    hull_data = filter_dataframe_by_composition(df, system_comp)
    hull_data = hull_data[['delta_e', 'Composition']].dropna()

    # Build one ComputedEntry per row; energies are un-normalized by
    # multiplying delta_e by the number of atoms in the composition
    computed_entries = []
    for entry_id, record in hull_data.iterrows():
        comp = Composition(record["Composition"])
        computed_entries.append(
            ComputedEntry(
                comp,
                record["delta_e"] * comp.num_atoms,
                entry_id=entry_id,
            )
        )
    hull_data["entry"] = computed_entries

    prior_ids = list(set(hull_data.index) - set(new_result_ids))
    if not prior_ids:
        warnings.warn(
            "No prior data, prior phase diagram cannot be constructed")
        return None

    # Phase diagram from everything that predates the current run
    prior_entries = hull_data.loc[prior_ids]["entry"].dropna()
    hull_elements = sorted(system_comp.keys())
    phase_diagram = PhaseDiagram(prior_entries, elements=hull_elements)

    plot_options = {
        "markerfacecolor": "white",
        "markersize": 7,
        "linewidth": 2,
        "linestyle": "--" if finalize else "-",
    }
    plotter = PDPlotter(phase_diagram, backend='matplotlib', **plot_options)
    if finalize:
        plot = plotter.get_plot(label_stable=False)
    else:
        plot = plotter.get_plot()

    # Only new results that survived the filtering can be plotted
    valid_results = [
        result_id for result_id in new_result_ids
        if result_id in hull_data.index
    ]

    if finalize:
        # If finalize, the phase diagram is rebuilt from all entries so
        # that stabilities are measured wrt. the ultimate hull, and it
        # is drawn over the existing plot.
        phase_diagram = PhaseDiagram(
            hull_data["entry"].values, elements=hull_elements)
        plotter = PDPlotter(
            phase_diagram,
            backend="matplotlib",
            markersize=0,
            linestyle="-",
            linewidth=2,
        )
        plot = plotter.get_plot(plt=plot)

    for entry in hull_data["entry"][valid_results]:
        _, e_hull = phase_diagram.get_decomp_and_e_above_hull(
            entry, allow_negative=True)
        # Green circle within hull_distance, red cross otherwise
        if e_hull < self.hull_distance:
            color, marker = "g", "o"
        else:
            color, marker = "r", "x"

        # Get coords: atomic fractions of all but the first element
        fractions = [
            entry.composition.get_atomic_fraction(el)
            for el in phase_diagram.elements
        ]
        coords = fractions[1:]
        dim = phase_diagram.dim
        if dim == 2:
            coords = coords + [phase_diagram.get_form_energy_per_atom(entry)]
        elif dim == 3:
            coords = triangular_coord(coords)
        elif dim == 4:
            coords = tet_coord(coords)

        plot.plot(
            *coords,
            marker=marker,
            markeredgecolor=color,
            markerfacecolor="None",
            markersize=11,
            markeredgewidth=1,
        )

    if filename is not None:
        plot.savefig(filename, dpi=70)
    plot.close()
def analyze(self, new_experimental_results, seed_data):
    """
    Merge new experimental results into the seed data and compute
    stabilities for the combined set.

    Args:
        new_experimental_results (DataFrame): new experimental
            results to be added to the seed; rows without a
            'delta_e' value are discarded
        seed_data (DataFrame): seed to be augmented via
            the new_experimental_results

    Returns:
        (DataFrame): summary of the process, i. e. of
            the increment or experimental results
        (DataFrame): augmented seed data, i. e. "new"
            seed data according to the experimental results

    """
    # Check for new results. The composition sum is taken before rows
    # lacking delta_e are dropped.
    new_comp = new_experimental_results['Composition'].sum()
    new_experimental_results = new_experimental_results.dropna(
        subset=['delta_e'])

    # pd.concat is used because DataFrame.append was removed in pandas 2.0
    new_seed = pd.concat([seed_data, new_experimental_results])

    # Aggregate seed_data and new experimental results
    include_columns = ["Composition", "delta_e"]
    filtered = new_seed[include_columns].drop_duplicates(
        keep="last").dropna()

    if not self.entire_space:
        # Constrains the phase space to that of the target compounds.
        # More efficient when searching in a specified chemistry,
        # less efficient if larger spaces are without specified chemistry.
        filtered = filter_dataframe_by_composition(filtered, new_comp)

    space = self.get_phase_space(filtered)
    new_phases = [
        p for p in space.phases if p.description in filtered.index
    ]

    space.compute_stabilities(phases=new_phases, ncpus=self.parallel)

    # Compute new stabilities and update new seed, note that pandas will
    # complain if the index is not explicit due to multiple types
    # (e. g. ints for OQMD and strs for prototypes)
    new_data = pd.DataFrame(
        {"stability": [phase.stability for phase in new_phases]},
        index=[phase.description for phase in new_phases])
    new_data["is_stable"] = new_data["stability"] <= self.hull_distance

    # TODO: This is implicitly adding "stability", and "is_stable" columns
    #       but could be handled more gracefully
    if "stability" not in new_seed.columns:
        new_seed = pd.concat([new_seed, new_data], axis=1, sort=False)
    else:
        new_seed.update(new_data)

    # Write hull figure to disk
    if self.plot:
        self.plot_hull(filtered, new_experimental_results.index,
                       filename="hull.png")

    # Compute summary metrics
    summary = self.get_summary(
        new_seed,
        new_experimental_results.index,
        initial_seed_indices=self.initial_seed_indices,
    )

    # Drop excess columns from experiment
    new_seed = new_seed.drop([
        'path', 'status', 'start_time', 'jobId', 'jobName', 'jobArn',
        'result', 'error', 'elapsed_time'
    ], axis="columns", errors="ignore")
    return summary, new_seed