def test_mp_loop(self):
    """Run a seeded RandomAgent loop on a slice of the test data, then resume.

    Six iterations are run; after each one a ``hull_<iteration>.png`` file
    must exist, and from the second iteration on ``report.png`` must exist
    too.  A second Loop is then built without seed data and is expected to
    report itself as initialized at iteration 6 and to advance to 7.
    """
    csv_path = os.path.join(CAMD_TEST_FILES, 'test_df_analysis.csv')
    df = pd.read_csv(csv_path)

    # Strip the "mp-" / "mvc-" prefixes so that ids become plain integers.
    df['id'] = [
        int(raw_id.replace("mp-", "").replace('mvc-', ''))
        for raw_id in df['id']
    ]
    # NOTE(review): the return value is discarded and set_index is not
    # in-place by default, so df keeps its original index -- confirm
    # whether that is intended.
    df.set_index("id")
    df['Composition'] = df['formula']

    # Just use the Ti-O-N chemsys
    seed_data = df.iloc[:38]
    candidate_data = df.iloc[38:209]

    # This many new candidates are "calculated with DFT"
    # (i.e. requested from Oracle -- DFT)
    n_query = 20

    agent, analyzer, experiment = RandomAgent, AnalyzeStability, ATFSampler
    # Keyword arguments shared by the first loop and its continuation.
    loop_kwargs = dict(
        agent_params={'n_query': n_query},
        analyzer_params={'hull_distance': 0.05},
        experiment_params={'dataframe': df},
    )

    new_loop = Loop(candidate_data, agent, experiment, analyzer,
                    seed_data=seed_data, **loop_kwargs)
    new_loop.initialize()
    self.assertFalse(new_loop.create_seed)

    for iteration in range(6):
        new_loop.run()
        hull_plot = "hull_{}.png".format(iteration)
        self.assertTrue(os.path.isfile(hull_plot))
        if iteration >= 1:
            self.assertTrue(os.path.isfile("report.png"))

    # Testing the continuation
    new_loop = Loop(candidate_data, agent, experiment, analyzer,
                    **loop_kwargs)
    self.assertTrue(new_loop.initialized)
    self.assertEqual(new_loop.iteration, 6)
    self.assertEqual(new_loop.loop_state, None)

    new_loop.run()
    self.assertTrue(True)
    self.assertEqual(new_loop.iteration, 7)
def test_generate_final_report(self):
    """Check that generate_report_plot writes ``report.png``.

    The plot is generated inside a scratch directory from the
    ``report.log`` file shipped with the test files.
    """
    with ScratchDir('.'):
        test_csv = os.path.join(CAMD_TEST_FILES, 'test_df.csv')
        report_log = os.path.join(CAMD_TEST_FILES, "report.log")
        df = pd.read_csv(test_csv)

        # Construct the loop; it is neither initialized nor run here.
        new_loop = Loop(
            df,
            AgentStabilityML5,
            ATFSampler,
            AnalyzeStability,
            agent_params={},
            create_seed=True,
            analyzer_params={},
            experiment_params={"dataframe": df},
        )
        new_loop.generate_report_plot("report.png", report_log)
        self.assertTrue(os.path.isfile("report.png"))
def test_sync(self):
    """Initialize a loop with an s3 prefix and check the uploaded iteration.

    After ``initialize()`` the object ``test/iteration.json`` in
    ``CAMD_S3_BUCKET`` is fetched and expected to decode to ``0``.
    """
    with ScratchDir('.'):
        df = pd.read_csv(os.path.join(CAMD_TEST_FILES, 'test_df.csv'))

        # Construct and start loop
        new_loop = Loop(
            df,
            AgentStabilityML5,
            ATFSampler,
            AnalyzeStability,
            agent_params={},
            create_seed=10,
            analyzer_params={},
            experiment_params={"dataframe": df},
            s3_prefix="test",
        )
        new_loop.initialize()

        # Read back the iteration counter stored under the "test" prefix.
        iteration_obj = boto3.resource('s3').Object(
            CAMD_S3_BUCKET, "test/iteration.json")
        payload = iteration_obj.get()['Body'].read()
        loaded = json.loads(payload)
        self.assertEqual(loaded, 0)
def test_random_agent_loop(self):
    """Run a RandomAgent loop with a generated seed, then resume it.

    Six iterations are run on the test dataframe.  A second Loop built
    with the same arguments is then expected to report itself as
    initialized at iteration 6 and to advance to 7 after one more run.
    """
    df = pd.read_csv(os.path.join(CAMD_TEST_FILES, 'test_df.csv'))
    candidate_data = df

    n_seed = 200  # Starting sample size
    # This many new candidates are "calculated with DFT"
    # (i.e. requested from Oracle -- DFT)
    n_query = 10

    agent, analyzer, experiment = RandomAgent, AnalyzeStability, ATFSampler
    # Keyword arguments shared by the first loop and its continuation.
    loop_kwargs = dict(
        agent_params={'n_query': n_query},
        analyzer_params={'hull_distance': 0.05},
        experiment_params={'dataframe': df},
        create_seed=n_seed,
    )

    new_loop = Loop(candidate_data, agent, experiment, analyzer,
                    **loop_kwargs)
    new_loop.initialize()
    self.assertFalse(new_loop.create_seed)

    for _ in range(6):
        new_loop.run()
        self.assertTrue(True)

    # Testing the continuation
    new_loop = Loop(candidate_data, agent, experiment, analyzer,
                    **loop_kwargs)
    self.assertTrue(new_loop.initialized)
    self.assertEqual(new_loop.iteration, 6)
    self.assertEqual(new_loop.loop_state, None)

    new_loop.run()
    self.assertTrue(True)
    self.assertEqual(new_loop.iteration, 7)
def update_run(folder):
    """
    Updates an existing run folder to include hull and report plots.

    For every iteration directory (0-24) that exists together with its
    predecessor, a ``hull_<iteration>.png`` plot is produced through
    ``AnalyzeStability.present``; afterwards the report plot is
    regenerated.  Nothing is plotted when ``seed_data.pickle`` or
    ``report.log`` is missing from the folder.

    Args:
        folder (str): path of the run directory to process

    Returns:
        None
    """
    required_files = ["seed_data.pickle", "report.log"]
    with cd(folder):
        # A recorded error is reported but does not stop the update.
        if os.path.isfile("error.json"):
            error = loadfn("error.json")
            print("{} ERROR: {}".format(folder, error))

        if not all(os.path.isfile(fn) for fn in required_files):
            print("{} ERROR: no seed data, no analysis to be done".format(
                folder))
            return

        analyzer = AnalyzeStability(hull_distance=0.2)

        # Generate report plots
        for iteration in range(25):
            print("{}: {}".format(folder, iteration))
            current_dir = str(iteration)
            previous_dir = str(iteration - 1)
            # Each plot needs data from this iteration and the one before.
            if not (os.path.isdir(current_dir)
                    and os.path.isdir(previous_dir)):
                continue

            # NOTE: pickle.load can execute arbitrary code; only use this
            # on run folders from a trusted source.
            with open(os.path.join(current_dir, "seed_data.pickle"),
                      "rb") as f:
                result_df = pickle.load(f)
            all_result_ids = loadfn(
                os.path.join(previous_dir, "consumed_candidates.json"))
            new_result_ids = loadfn(
                os.path.join(previous_dir,
                             "submitted_experiment_requests.json"))
            analyzer.present(
                df=result_df,
                new_result_ids=new_result_ids,
                all_result_ids=all_result_ids,
                filename="hull_{}.png".format(iteration),
                finalize=False,
            )

        Loop.generate_report_plot()
def run_atf_campaign(chemsys):
    """
    A very simple test campaign: a RandomAgent loop over the test
    dataframe, run for three iterations under an s3 prefix derived
    from the chemical system.

    Args:
        chemsys (str): chemical system, used only to build the s3 prefix

    Returns:
        True
    """
    s3_prefix = "oqmd-atf/runs/{}".format(chemsys)
    df = pd.read_csv(os.path.join(CAMD_TEST_FILES, 'test_df.csv'))
    n_seed = 200  # Starting sample size
    # This many new candidates are "calculated with DFT"
    # (i.e. requested from Oracle -- DFT)
    n_query = 10
    agent = RandomAgent
    # The query-size key is spelled 'n_query', matching every other agent
    # configuration in this code (it was 'N_query' here).
    # NOTE(review): confirm that RandomAgent also accepts 'hull_distance'.
    agent_params = {
        'hull_distance': 0.05,
        'n_query': n_query,
    }
    analyzer = AnalyzeStability
    analyzer_params = {'hull_distance': 0.05}
    experiment = ATFSampler
    experiment_params = {'dataframe': df}
    candidate_data = df

    new_loop = Loop(
        candidate_data, agent, experiment, analyzer,
        agent_params=agent_params,
        analyzer_params=analyzer_params,
        experiment_params=experiment_params,
        create_seed=n_seed,
        s3_prefix=s3_prefix,
    )
    new_loop.initialize()
    for _ in range(3):
        new_loop.run()

    return True
def test_svgp_loop(self):
    """Run three automatic iterations with SVGProcessStabilityAgent.

    Only rows with ``N_species <= 3`` of the test dataframe are used,
    both as candidates and as the ATFSampler dataframe.
    """
    df = pd.read_csv(os.path.join(CAMD_TEST_FILES, 'test_df.csv'))
    df_sub = df[df['N_species'] <= 3]
    n_seed = 200  # Starting sample size
    # This many new candidates are "calculated with DFT"
    # (i.e. requested from Oracle -- DFT)
    n_query = 10
    agent = SVGProcessStabilityAgent
    agent_params = {
        'n_query': n_query,
        # Distance to hull to consider a finding as discovery (eV/atom)
        'hull_distance': 0.05,
        # Fraction of std to include in expected improvement
        'alpha': 0.5,
        # number of inducing points for SVGP
        'M': 100,
    }
    analyzer = AnalyzeStability
    analyzer_params = {'hull_distance': 0.05}
    experiment = ATFSampler
    experiment_params = {'dataframe': df_sub}
    candidate_data = df_sub

    new_loop = Loop(
        candidate_data, agent, experiment, analyzer,
        agent_params=agent_params,
        analyzer_params=analyzer_params,
        experiment_params=experiment_params,
        create_seed=n_seed,
    )
    new_loop.initialize()
    self.assertTrue(new_loop.initialized)

    new_loop.auto_loop(3)
    # Reaching this point means auto_loop finished without raising.
    self.assertTrue(True)
def test_random_agent_loop(self):
    """Run six RandomAgent iterations on the featurized OQMD dataframe.

    The dataframe is obtained through ``load_dataframe``; a seed of 5000
    entries is requested and 200 candidates are queried per iteration.
    """
    df = load_dataframe("oqmd1.2_exp_based_entries_featurized_v2")
    n_seed = 5000
    n_query = 200
    agent = RandomAgent
    agent_params = {
        'hull_distance': 0.05,
        'n_query': n_query,
    }
    analyzer = AnalyzeStability
    analyzer_params = {'hull_distance': 0.05}
    experiment = ATFSampler
    # NOTE(review): 'dataframe' is nested under 'params' here -- confirm
    # this matches the ATFSampler signature in use.
    experiment_params = {'params': {'dataframe': df}}
    candidate_data = df

    new_loop = Loop(
        candidate_data, agent, experiment, analyzer,
        agent_params=agent_params,
        analyzer_params=analyzer_params,
        experiment_params=experiment_params,
        create_seed=n_seed,
    )
    new_loop.initialize()
    self.assertFalse(new_loop.create_seed)

    for _ in range(6):
        new_loop.run()
        # Reaching this point means the iteration ran without raising.
        self.assertTrue(True)
def test_adaboost_loop(self):
    """Run six automatic iterations with AgentStabilityAdaBoost.

    Only rows with ``N_species <= 3`` of the test dataframe are used,
    both as candidates and as the ATFSampler dataframe.
    """
    df = pd.read_csv(os.path.join(CAMD_TEST_FILES, 'test_df.csv'))
    df_sub = df[df['N_species'] <= 3]
    n_seed = 200  # Starting sample size
    # This many new candidates are "calculated with DFT"
    # (i.e. requested from Oracle -- DFT)
    n_query = 10
    agent = AgentStabilityAdaBoost
    agent_params = {
        'ml_algorithm': MLPRegressor,
        'ml_algorithm_params': {'hidden_layer_sizes': (84, 50)},
        'n_query': n_query,
        # Distance to hull to consider a finding as discovery (eV/atom)
        'hull_distance': 0.05,
        # Fraction to exploit (rest will be explored -- randomly picked)
        'exploit_fraction': 1.0,
        # Fraction of std to include in expected improvement
        'alpha': 0.5,
        'n_estimators': 10,
    }
    analyzer = AnalyzeStability
    analyzer_params = {'hull_distance': 0.05}
    experiment = ATFSampler
    experiment_params = {'dataframe': df_sub}
    candidate_data = df_sub

    new_loop = Loop(
        candidate_data, agent, experiment, analyzer,
        agent_params=agent_params,
        analyzer_params=analyzer_params,
        experiment_params=experiment_params,
        create_seed=n_seed,
    )
    new_loop.initialize()
    self.assertTrue(new_loop.initialized)

    new_loop.auto_loop(6)
    # Reaching this point means auto_loop finished without raising.
    self.assertTrue(True)
def run_proto_dft_campaign(chemsys, s3_prefix="proto-dft-2"):
    """
    Runs a structure-discovery DFT campaign for one chemical system.

    A start marker is written and synced first.  The candidate domain is
    then generated, dumped to pickle files and handed to a Loop driven by
    AgentStabilityAdaBoost.  Any exception raised during setup or while
    the loop runs is caught and recorded in ``error.json`` and
    ``job_status.json`` instead of being propagated.

    Args:
        chemsys (str): chemical system for the campaign
        s3_prefix (str): s3 prefix to sync to

    Returns:
        (bool): True if run exits
    """
    s3_prefix = "{}/runs/{}".format(s3_prefix, chemsys)

    # Initialize s3
    dumpfn({"started": datetime.now().isoformat(),
            "version": __version__}, "start.json")
    s3_sync(s3_bucket=CAMD_S3_BUCKET, s3_prefix=s3_prefix, sync_path='.')

    try:
        # Get structure domain
        element_list = chemsys.split('-')
        g_max, charge_balanced = heuristic_setup(element_list)
        domain = StructureDomain.from_bounds(
            element_list,
            charge_balanced=charge_balanced,
            n_max_atoms=20,
            grid=range(1, g_max),
        )
        candidate_data = domain.candidates()
        structure_dict = domain.hypo_structures_dict

        # Dump structure/candidate data
        with open('candidate_data.pickle', 'wb') as f:
            pickle.dump(candidate_data, f)
        with open('structure_dict.pickle', 'wb') as f:
            pickle.dump(structure_dict, f)

        # Set up agents and loop parameters
        agent = AgentStabilityAdaBoost
        agent_params = {
            'ml_algorithm': MLPRegressor,
            'ml_algorithm_params': {'hidden_layer_sizes': (84, 50)},
            'n_query': 10,
            # Distance to hull to consider a finding as discovery (eV/atom)
            'hull_distance': 0.2,
            # Fraction to exploit (rest will be explored -- randomly picked)
            'exploit_fraction': 1.0,
            'uncertainty': True,
            'alpha': 0.5,
            'diversify': True,
            'n_estimators': 20,
        }
        analyzer = AnalyzeStability
        # analysis criterion (need not be exactly same as agent's goal)
        analyzer_params = {'hull_distance': 0.2}
        experiment = OqmdDFTonMC1
        experiment_params = {
            'structure_dict': structure_dict,
            'candidate_data': candidate_data,
            'timeout': 30000,
        }
        finalizer = FinalizeQqmdCampaign
        finalizer_params = {'hull_distance': 0.2}
        n_max_iter = n_max_iter_heuristics(len(candidate_data), 10)

        # Construct and start loop
        new_loop = Loop(
            candidate_data, agent, experiment, analyzer,
            agent_params=agent_params,
            analyzer_params=analyzer_params,
            experiment_params=experiment_params,
            finalizer=finalizer,
            finalizer_params=finalizer_params,
            heuristic_stopper=5,
            s3_prefix=s3_prefix,
        )
        new_loop.auto_loop_in_directories(
            n_iterations=n_max_iter,
            timeout=10,
            monitor=True,
            initialize=True,
            with_icsd=True,
        )
    except Exception as e:
        # Top-level boundary: record the failure so it gets synced with
        # the rest of the run instead of aborting the worker.
        error_msg = {"error": str(e),
                     "traceback": traceback.format_exc()}
        dumpfn(error_msg, "error.json")
        dumpfn({"status": "error"}, "job_status.json")

    s3_sync(s3_bucket=CAMD_S3_BUCKET, s3_prefix=s3_prefix, sync_path='.')

    return True
# Campaign size settings.
# Starting sample size - a seed of this size will be randomly chosen.
n_seed = 5000
# This many new candidates are "calculated with DFT"
# (i.e. requested from Oracle -- DFT)
n_query = 200

# Candidates are sampled from, and "measured" against, the same dataframe.
candidate_data = df
experiment = ATFSampler
experiment_params = dict(dataframe=df)

analyzer = AnalyzeStability
analyzer_params = dict(hull_distance=0.05)

# Query-by-committee agent backed by a small MLP regressor.
agent = QBCStabilityAgent
agent_params = dict(
    ml_algorithm=MLPRegressor,
    ml_algorithm_params=dict(hidden_layer_sizes=(84, 50)),
    n_query=n_query,
    n_members=10,        # Committee size in QBC
    hull_distance=0.05,  # Distance to hull to consider a finding as discovery (eV/atom)
    frac=0.5,            # Fraction of data to choose to form a committee member
)

# Assemble the loop and run four iterations, initializing it first.
new_loop = Loop(
    candidate_data,
    agent,
    experiment,
    analyzer,
    agent_params=agent_params,
    analyzer_params=analyzer_params,
    experiment_params=experiment_params,
    create_seed=n_seed,
)
new_loop.auto_loop(n_iterations=4, timeout=5, initialize=True)
from sklearn.neural_network import MLPRegressor

# Search domain: the Ir-Fe-O ternary, limited to structures with at most
# 10 atoms and to integer stoichiometry coefficients in [1, 4).
domain = StructureDomain.from_bounds(
    ["Ir", "Fe", "O"],
    charge_balanced=True,
    n_max_atoms=10,
    grid=range(1, 4),
)
candidate_data = domain.candidates()
structure_dict = domain.hypo_structures_dict

# Experiment: OQMD-compatible DFT on AWS-MC1. Its parameters include the
# hypothetical structures generated above.
experiment = OqmdDFTonMC1
experiment_params = dict(
    structure_dict=structure_dict,
    candidate_data=candidate_data,
    timeout=30000,
)

# Analyzer for stability.
analyzer = AnalyzeStability
analyzer_params = dict(hull_distance=0.1)

# Agent: query-by-committee (QBC) with a simple fully connected neural
# network as the regressor.
agent = QBCStabilityAgent
agent_params = dict(
    ml_algorithm=MLPRegressor,
    ml_algorithm_params=dict(hidden_layer_sizes=(84, 50)),
    n_query=3,          # Agent is allowed 3 experiments per iteration
    n_members=10,       # Committee size for QBC
    hull_distance=0.1,  # Distance to hull to consider a finding as discovery (eV/atom)
    frac=0.5,
)

# The Loop class puts all the above pieces together.
new_loop = Loop(
    candidate_data,
    agent,
    experiment,
    analyzer,
    agent_params=agent_params,
    analyzer_params=analyzer_params,
    experiment_params=experiment_params,
)

# Start the campaign.
new_loop.auto_loop_in_directories(
    n_iterations=3,
    timeout=1,
    monitor=True,
    initialize=True,
    with_icsd=True,
)