Ejemplo n.º 1
0
def process_run():
    """
    Summarize the campaign run stored in the current working directory.

    If ``error.json`` is present its contents are printed.  If
    ``seed_data.pickle`` is missing an error message is printed and no
    analysis is done.  Otherwise a summary table is assembled for the ids
    found in ``discovered_unique_structures.json``, combining the
    ``Composition`` and ``delta_e`` columns of the seed data, the analyzer
    stabilities, the ``bandgap`` and ``url`` values fetched from the
    ``dft-results`` API endpoint, the chemical system of each composition
    and the output of ``get_structure_data`` for each structure.

    Returns:
        (pandas.DataFrame): the summary table, or None when the seed data
            file is missing

    """
    folder = os.getcwd()
    if os.path.isfile("error.json"):
        error = loadfn("error.json")
        print("{} ERROR: {}".format(folder, error))

    required_files = ['seed_data.pickle']
    if not all(os.path.isfile(fn) for fn in required_files):
        # The placeholder must be filled in, otherwise a literal "{}" is
        # printed and the message does not say which run failed
        print("{} ERROR: no seed data, no analysis to be done".format(folder))
        return None

    analyzer = StabilityAnalyzer(hull_distance=0.2, parallel=True)
    # NOTE: unpickling executes arbitrary code; only use on trusted run output
    with open("seed_data.pickle", "rb") as f:
        result_df = pickle.load(f)

    unique_structures = loadfn("discovered_unique_structures.json")
    all_result_ids = list(unique_structures.keys())

    summary = result_df.loc[all_result_ids]
    # Work on an explicit copy so the column assignments below never target
    # a temporary selection of result_df
    summary = summary[['Composition', 'delta_e']].copy()
    analyzer.analyze(result_df,
                     all_result_ids=all_result_ids,
                     new_result_ids=all_result_ids)
    # Add stabilities
    summary['stabilities'] = pd.Series(analyzer.stabilities)

    # The chemical system is taken from the name of the run folder
    chemsys = os.path.split(folder)[-1]
    # Get all DFT data
    response = requests.get('{}/synthesis-discovery/{}/dft-results'.format(
        API_URL, chemsys))
    data = json.loads(response.content.decode('utf-8'))
    data = pd.DataFrame(data)
    # Merge the per-entry result dicts into one mapping keyed by result id
    aggregated = {}
    for result in data['dft_results']:
        aggregated.update(result)
    simulation_data = pd.DataFrame.from_dict(aggregated, orient='index')
    summary['bandgap'] = simulation_data['bandgap']
    # Apply garcia correction
    summary['bandgap_garcia_exp'] = 1.358 * summary['bandgap'] + 0.904
    summary['structure'] = pd.Series(unique_structures)
    summary['chemsys'] = [
        '-'.join(sorted(Composition(comp).as_dict()))
        for comp in summary['Composition']
    ]

    # Add structure data
    symmetry_data = {
        key: get_structure_data(structure)
        for key, structure in unique_structures.items()
    }
    symmetry_df = pd.DataFrame.from_dict(symmetry_data, orient='index')
    summary = pd.concat([summary, symmetry_df], axis=1)
    summary['url'] = simulation_data['url']
    return summary
Ejemplo n.º 2
0
    def test_gp_bagging(self):
        """
        Smoke test for a campaign driven by the bagged Gaussian process
        stability agent: the campaign must report itself initialized and
        ``auto_loop(6)`` must finish without raising.
        """
        full_df = pd.read_csv(os.path.join(CAMD_TEST_FILES, 'test_df.csv'))
        # Restrict the data to rows whose N_species is at most 3
        subset = full_df[full_df['N_species'] <= 3]
        seed_size = 200  # Starting sample size

        agent_settings = {
            "n_query": 10,
            "hull_distance": 0.05,
            # Fraction of std to include in expected improvement
            "alpha": 0.5,
            "n_estimators": 2,
            "max_samples": 195,
            "parallel": False,
        }
        bagged_agent = BaggedGaussianProcessStabilityAgent(**agent_settings)
        stability_analyzer = StabilityAnalyzer(
            hull_distance=0.05, parallel=False)
        sampler = ATFSampler(subset)

        campaign = Campaign(
            subset, bagged_agent, sampler, stability_analyzer,
            create_seed=seed_size)
        campaign.initialize()
        self.assertTrue(campaign.initialized)

        campaign.auto_loop(6)
        # Reaching this point means the loop ran without an exception
        self.assertTrue(True)
Ejemplo n.º 3
0
    def test_sync(self):
        """
        Run a three-iteration campaign with ``s3_prefix="test"`` and check
        the ``iteration.json`` objects found under that prefix in the
        CAMD S3 bucket.
        """
        with ScratchDir('.'):
            data = pd.read_csv(os.path.join(CAMD_TEST_FILES, 'test_df.csv'))

            # Construct and start campaign
            ml_agent = AgentStabilityML5()
            sampler = ATFSampler(data)
            stability_analyzer = StabilityAnalyzer()
            campaign = Campaign(
                data, ml_agent, sampler, stability_analyzer,
                create_seed=10, s3_prefix="test")
            campaign.auto_loop(
                n_iterations=3, save_iterations=True, initialize=True)

        s3_resource = boto3.resource('s3')

        def read_json(key):
            # Download one object from the CAMD bucket and decode it as JSON
            body = s3_resource.Object(CAMD_S3_BUCKET, key).get()['Body']
            return json.loads(body.read())

        # Test iteration read
        self.assertEqual(read_json("test/iteration.json"), 2)

        # Test save directories
        for iteration in (-1, 0, 1, 2):
            self.assertEqual(
                read_json(f"test/{iteration}/iteration.json"), iteration)
Ejemplo n.º 4
0
    def test_initialize_and_update(self):
        """
        Reserve a meta-agent campaign name, check that a second reserve
        call with the same name raises ValueError, and verify the size of
        the pickled agent pool before and after ``update_agent_pool``.
        """
        campaign_name = "test_meta_agent"
        pool = ParameterTable(TEST_AGENT_PARAMS)
        oqmd_df = get_oqmd_data_by_chemsys("Fe-O")
        candidates, seed_df = partition_intercomp(oqmd_df, n_elements=1)
        stability_analyzer = StabilityAnalyzer()
        simulation = LocalAgentSimulation(
            candidates,
            iterations=5,
            analyzer=stability_analyzer,
            seed_data=seed_df,
        )

        MetaAgentCampaign.reserve(
            name=campaign_name,
            experiment=simulation,
            agent_pool=pool,
            analyzer=stability_analyzer,
        )
        # A second reservation under the same name is expected to fail
        with self.assertRaises(ValueError):
            MetaAgentCampaign.reserve(campaign_name, oqmd_df, pool, None)

        loaded_pool, _, _ = MetaAgentCampaign.load_pickled_objects(
            campaign_name)
        self.assertEqual(len(loaded_pool), 12)

        MetaAgentCampaign.update_agent_pool(campaign_name, TEST_AGENT_PARAMS)
        loaded_pool, _, _ = MetaAgentCampaign.load_pickled_objects(
            campaign_name)
        self.assertEqual(len(loaded_pool), 12)
    def test_mp_loop(self):
        """
        Run a random-agent campaign on the analysis test data with an
        explicit seed/candidate split, check the files produced by each
        run, then construct a second campaign and check it continues from
        iteration 6.
        """
        data = pd.read_csv(
            os.path.join(CAMD_TEST_FILES, 'test_df_analysis.csv'))
        # Strip the "mp-" / "mvc-" prefixes so ids become plain integers
        data['id'] = [
            int(raw_id.replace("mp-", "").replace('mvc-', ''))
            for raw_id in data['id']
        ]
        # NOTE(review): the frame returned by set_index is discarded, so
        # the index is left unchanged -- confirm whether re-indexing by
        # "id" was intended
        data.set_index("id")
        data['Composition'] = data['formula']

        # First 38 rows form the seed, rows 38 through 208 the candidates
        seed_rows = data.iloc[:38]
        candidate_rows = data.iloc[38:209]
        random_agent = RandomAgent(n_query=20)
        stability_analyzer = StabilityAnalyzer(
            hull_distance=0.05, parallel=False)
        sampler = ATFSampler(dataframe=data)
        campaign = Campaign(
            candidate_rows, random_agent, sampler, stability_analyzer,
            seed_data=seed_rows)

        campaign.initialize()

        for step in range(6):
            campaign.run()
            self.assertTrue(os.path.isfile("hull.png"))
            if step < 1:
                continue
            # The history file is only checked from the second run onwards
            self.assertTrue(os.path.isfile("history.pickle"))

        # Testing the continuation
        campaign = Campaign(
            data, random_agent, sampler, stability_analyzer)
        self.assertTrue(campaign.initialized)
        self.assertEqual(campaign.iteration, 6)
        self.assertEqual(campaign.loop_state, None)

        campaign.run()
        self.assertTrue(True)
        self.assertEqual(campaign.iteration, 7)
Ejemplo n.º 6
0
 def test_simulated(self):
     """
     Run a ProtoDFTCampaign with an AdaBoost stability agent against
     pre-computed data and check that ``hull_finalized.png`` is written.

     The seed is the Mn-Ni-O-Sb subset of the ICSD-based OQMD frame plus
     30 randomly sampled rows from the remainder of that frame.
     """
     exp_dataframe = pd.read_pickle(
         os.path.join(CAMD_TEST_FILES, "mn-ni-o-sb.pickle"))
     experiment = ATFSampler(exp_dataframe)
     # Candidates are the experiment frame without its last 11 columns
     candidate_data = exp_dataframe.iloc[:, :-11]
     # Set up agents and loop parameters
     agent = AgentStabilityAdaBoost(
         model=MLPRegressor(hidden_layer_sizes=(84, 50)),
         n_query=2,
         hull_distance=0.2,
         exploit_fraction=1.0,
         uncertainty=True,
         alpha=0.5,
         diversify=True,
         n_estimators=20)
     analyzer = StabilityAnalyzer(hull_distance=0.2)
     # Reduce seed_data
     icsd_data = load_dataframe("oqmd1.2_exp_based_entries_featurized_v2")
     seed_data = filter_dataframe_by_composition(icsd_data, "MnNiOSb")
     leftover = ~icsd_data.index.isin(seed_data.index)
     # Add some random other data to test compositional flexibility.
     # DataFrame.append was removed in pandas 2.0; pd.concat gives the
     # same row-wise result on old and new pandas versions.
     seed_data = pd.concat(
         [seed_data, icsd_data.loc[leftover].sample(30)])
     # Release the full frame before the campaign starts
     del icsd_data
     with ScratchDir('.'):
         campaign = ProtoDFTCampaign(candidate_data=candidate_data,
                                     agent=agent,
                                     experiment=experiment,
                                     analyzer=analyzer,
                                     seed_data=seed_data,
                                     heuristic_stopper=5)
         campaign.autorun()
         self.assertTrue(os.path.isfile('hull_finalized.png'))
Ejemplo n.º 7
0
    def test_random_agent_loop(self):
        """
        Run six iterations of a random-agent campaign on the default ATF
        data, then build a second campaign with the same arguments and
        check that it continues from iteration 6.
        """
        data = load_default_atf_data()
        seed_size = 200  # Starting sample size
        random_agent = RandomAgent(n_query=10)
        stability_analyzer = StabilityAnalyzer(
            hull_distance=0.05, parallel=False)
        sampler = ATFSampler(dataframe=data)
        # Both campaigns below are built from the same positional arguments
        campaign_args = (data, random_agent, sampler, stability_analyzer)

        campaign = Campaign(*campaign_args, create_seed=seed_size)
        campaign.initialize()
        self.assertFalse(campaign.create_seed)

        for _ in range(6):
            campaign.run()
            self.assertTrue(True)

        # Testing the continuation
        campaign = Campaign(*campaign_args, create_seed=seed_size)
        self.assertTrue(campaign.initialized)
        self.assertEqual(campaign.iteration, 6)
        self.assertEqual(campaign.loop_state, None)

        campaign.run()
        self.assertTrue(True)
        self.assertEqual(campaign.iteration, 7)
Ejemplo n.º 8
0
    def from_chemsys(cls, chemsys):
        """
        Class factory method for constructing campaign from
        chemsys.

        Writes ``start.json`` and ``candidate_data.pickle`` to the current
        directory and syncs the directory to the campaign's s3 prefix
        before the campaign object is built.

        Args:
            chemsys (str): chemical system for the campaign, with
                elements separated by dashes

        Returns:
            (ProtoDFTCampaign): Standard proto-dft campaign from
                the chemical system

        """
        s3_prefix = "proto-dft-2/runs/{}".format(chemsys)

        # Initialize s3
        dumpfn({
            "started": datetime.now().isoformat(),
            "version": __version__
        }, "start.json")
        s3_sync(s3_bucket=CAMD_S3_BUCKET, s3_prefix=s3_prefix, sync_path='.')

        # Get structure domain
        element_list = chemsys.split('-')
        max_coeff, charge_balanced = heuristic_setup(element_list)
        domain = StructureDomain.from_bounds(element_list,
                                             charge_balanced=charge_balanced,
                                             n_max_atoms=20,
                                             grid=range(1, max_coeff))
        candidate_data = domain.candidates()

        # Dump structure/candidate data
        with open('candidate_data.pickle', 'wb') as f:
            pickle.dump(candidate_data, f)

        # Set up agents and loop parameters
        agent = AgentStabilityAdaBoost(
            model=MLPRegressor(hidden_layer_sizes=(84, 50)),
            n_query=10,
            hull_distance=0.2,
            exploit_fraction=1.0,
            uncertainty=True,
            alpha=0.5,
            diversify=True,
            n_estimators=20)
        analyzer = StabilityAnalyzer(hull_distance=0.2)
        experiment = OqmdDFTonMC1(timeout=30000)
        seed_data = load_dataframe("oqmd1.2_exp_based_entries_featurized_v2")

        # Construct and start loop.  The campaign must use the same prefix
        # that start.json was synced to above; a separately spelled
        # "proto-dft/runs/..." prefix would split one run across two
        # s3 locations.
        return cls(candidate_data=candidate_data,
                   agent=agent,
                   experiment=experiment,
                   analyzer=analyzer,
                   seed_data=seed_data,
                   heuristic_stopper=5,
                   s3_prefix=s3_prefix)
Ejemplo n.º 9
0
def update_run(folder):
    """
    Regenerate the hull plots and the report plot for an existing run.

    All file checks and plotting happen inside ``cd(folder)``.  If
    ``error.json`` exists its contents are printed.  Plots are only
    generated when both ``seed_data.pickle`` and ``report.log`` exist;
    otherwise an error message is printed.

    Args:
        folder (str): path of the run folder to update

    Returns:
        None

    """
    required_files = ["seed_data.pickle", "report.log"]
    with cd(folder):
        if os.path.isfile("error.json"):
            error = loadfn("error.json")
            print("{} ERROR: {}".format(folder, error))

        if not all(os.path.isfile(fn) for fn in required_files):
            # Fill in the folder so the message identifies the failing run
            # instead of printing a literal "{}"
            print("{} ERROR: no seed data, no analysis to be done".format(
                folder))
        else:
            analyzer = StabilityAnalyzer(hull_distance=0.2)

            # Generate report plots
            for iteration in range(0, 25):
                print("{}: {}".format(folder, iteration))
                # An iteration is plotted only when both its own directory
                # and the previous iteration's directory exist
                if not os.path.isdir(str(iteration)) or not os.path.isdir(
                        str(iteration - 1)):
                    continue
                with open(os.path.join(str(iteration), "seed_data.pickle"),
                          "rb") as f:
                    result_df = pickle.load(f)
                all_result_ids = loadfn(
                    os.path.join(str(iteration - 1),
                                 "consumed_candidates.json"))
                new_result_ids = loadfn(
                    os.path.join(str(iteration - 1),
                                 "submitted_experiment_requests.json"))
                analyzer.plot_hull(df=result_df,
                                   new_result_ids=new_result_ids,
                                   all_result_ids=all_result_ids,
                                   filename="hull_{}.png".format(iteration),
                                   finalize=False)

            Campaign.generate_report_plot()
Ejemplo n.º 10
0
 def test_run(self):
     """
     Smoke test: submit a single RandomAgent to a LocalAgentSimulation
     on Fe-O data, monitor it and fetch the results without error.
     """
     with ScratchDir('.'):
         oqmd_df = get_oqmd_data_by_chemsys("Fe-O")
         candidates, seed_df = partition_intercomp(oqmd_df, n_elements=1)
         agent_table = pd.DataFrame({"agent": [RandomAgent()]})
         sim = LocalAgentSimulation(
             candidates,
             iterations=5,
             seed_data=seed_df,
             analyzer=StabilityAnalyzer(),
         )
         sim.submit(agent_table)
         sim.monitor()
         # The results are fetched only to check that the call succeeds
         _ = sim.get_results()
         self.assertTrue(True)
Ejemplo n.º 11
0
 def test_analyze(self):
     """
     Analyze the TiNO subset of the analysis test data in two steps
     (everything except two held-out entries first, then the held-out
     entries) and check the stability values of the held-out entries.
     """
     csv_path = os.path.join(CAMD_TEST_FILES, "test_df_analysis.csv")
     data = pd.read_csv(csv_path, index_col="id")
     data['Composition'] = data['formula']
     stability_analyzer = StabilityAnalyzer(hull_distance=0.1)
     tino_data = filter_dataframe_by_composition(data, "TiNO")
     # TODO: resolve drop_duplicates filtering mp data
     tino_data = tino_data.drop_duplicates(keep='last').dropna()

     held_out_ids = ["mp-30998", "mp-572822"]
     held_out = tino_data.loc[held_out_ids]
     tino_data = tino_data.drop(index=held_out_ids)

     # First pass: build a seed from an empty frame
     _, initial_seed = stability_analyzer.analyze(
         new_experimental_results=tino_data,
         seed_data=pd.DataFrame(),
     )
     # Second pass: add the two held-out entries on top of that seed
     _, new_seed = stability_analyzer.analyze(
         new_experimental_results=held_out,
         seed_data=initial_seed,
     )

     stable_id, unstable_id = held_out_ids
     self.assertAlmostEqual(new_seed.loc[stable_id, 'stability'], 0)
     self.assertAlmostEqual(
         new_seed.loc[unstable_id, 'stability'], 0.52784795)
     self.assertTrue(new_seed.loc[stable_id, 'is_stable'])
     self.assertFalse(new_seed.loc[unstable_id, 'is_stable'])
Ejemplo n.º 12
0
    def test_svgp_loop(self):
        """
        Smoke test for a campaign driven by SVGProcessStabilityAgent: the
        campaign must report itself initialized and ``auto_loop(3)`` must
        finish without raising.
        """
        full_df = pd.read_csv(os.path.join(CAMD_TEST_FILES, 'test_df.csv'))
        # Restrict the data to rows whose N_species is at most 3
        subset = full_df[full_df['N_species'] <= 3]
        seed_size = 200  # Starting sample size
        svgp_agent = SVGProcessStabilityAgent(
            n_query=10,
            hull_distance=0.05,
            alpha=0.5,
            M=100,
        )
        stability_analyzer = StabilityAnalyzer(
            hull_distance=0.05, parallel=False)
        sampler = ATFSampler(subset)

        campaign = Campaign(
            subset, svgp_agent, sampler, stability_analyzer,
            create_seed=seed_size)
        campaign.initialize()
        self.assertTrue(campaign.initialized)

        campaign.auto_loop(3)
        self.assertTrue(True)
Ejemplo n.º 13
0
    def test_sync(self):
        """
        Initialize a campaign with ``s3_prefix="test"`` and check that
        ``test/iteration.json`` in the CAMD S3 bucket contains 0.
        """
        with ScratchDir('.'):
            data = pd.read_csv(os.path.join(CAMD_TEST_FILES, 'test_df.csv'))

            # Construct and start campaign
            ml_agent = AgentStabilityML5()
            sampler = ATFSampler(data)
            stability_analyzer = StabilityAnalyzer()
            campaign = Campaign(
                data, ml_agent, sampler, stability_analyzer,
                create_seed=10, s3_prefix="test")
            campaign.initialize()

        # Read the iteration marker back from the bucket
        s3_resource = boto3.resource('s3')
        marker = s3_resource.Object(CAMD_S3_BUCKET, "test/iteration.json")
        payload = marker.get()['Body'].read()
        self.assertEqual(json.loads(payload), 0)
Ejemplo n.º 14
0
    def test_qbc_agent_loop(self):
        """
        Smoke test for a campaign driven by QBCStabilityAgent with an MLP
        regressor: the campaign must report itself initialized and
        ``auto_loop(3)`` must finish without raising.
        """
        full_df = pd.read_csv(os.path.join(CAMD_TEST_FILES, 'test_df.csv'))
        # Restrict the data to rows whose N_species is at most 3
        subset = full_df[full_df['N_species'] <= 3]
        seed_size = 200  # Starting sample size
        regressor = MLPRegressor(hidden_layer_sizes=(84, 50))
        qbc_agent = QBCStabilityAgent(
            model=regressor,
            n_query=10,
            hull_distance=0.05,
            alpha=0.5,
        )
        stability_analyzer = StabilityAnalyzer(
            hull_distance=0.05, parallel=False)
        sampler = ATFSampler(dataframe=subset)

        campaign = Campaign(
            subset, qbc_agent, sampler, stability_analyzer,
            create_seed=seed_size)
        campaign.initialize()
        self.assertTrue(campaign.initialized)

        campaign.auto_loop(3)
        self.assertTrue(True)
Ejemplo n.º 15
0
    def test_random_agent_loop(self):
        """
        Smoke test: run six iterations of a random-agent campaign on the
        ICSD-based OQMD frame with a created seed of 5000 entries.
        """
        data = load_dataframe("oqmd1.2_exp_based_entries_featurized_v2")
        seed_size = 5000
        random_agent = RandomAgent(n_query=200)
        stability_analyzer = StabilityAnalyzer(
            hull_distance=0.05, parallel=False)
        sampler = ATFSampler(dataframe=data)

        campaign = Campaign(
            data, random_agent, sampler, stability_analyzer,
            create_seed=seed_size)
        campaign.initialize()
        self.assertFalse(campaign.create_seed)

        iterations_left = 6
        while iterations_left > 0:
            campaign.run()
            self.assertTrue(True)
            iterations_left -= 1
Ejemplo n.º 16
0
    def test_simple_gp_loop(self):
        """
        Smoke test for a campaign driven by GaussianProcessStabilityAgent:
        the campaign must report itself initialized and ``auto_loop(2)``
        must finish without raising.
        """
        full_df = pd.read_csv(os.path.join(CAMD_TEST_FILES, 'test_df.csv'))
        # Restrict the data to rows whose N_species is at most 3
        subset = full_df[full_df['N_species'] <= 3]
        seed_size = 200  # Starting sample size
        # This many new candidates are "calculated with DFT"
        # (i.e. requested from Oracle -- DFT)
        query_size = 10
        gp_agent = GaussianProcessStabilityAgent(
            n_query=query_size,
            hull_distance=0.05,
            alpha=0.5,
            parallel=False,
        )
        stability_analyzer = StabilityAnalyzer(
            hull_distance=0.05, parallel=False)
        sampler = ATFSampler(dataframe=subset)

        campaign = Campaign(
            subset, gp_agent, sampler, stability_analyzer,
            create_seed=seed_size)
        campaign.initialize()
        self.assertTrue(campaign.initialized)

        campaign.auto_loop(2)
        self.assertTrue(True)
Ejemplo n.º 17
0
 def test_simulated(self):
     """
     Smoke test: run a ProtoDFTCampaign with a RandomAgent against
     pre-computed Mn-Ni-O-Sb data inside a scratch directory.
     """
     pickle_path = os.path.join(CAMD_TEST_FILES, "mn-ni-o-sb.pickle")
     precomputed = pd.read_pickle(pickle_path)
     sampler = ATFSampler(precomputed)
     # Candidates are the experiment frame without its last 11 columns
     candidates = precomputed.iloc[:, :-11]
     random_agent = RandomAgent(n_query=2)
     stability_analyzer = StabilityAnalyzer(hull_distance=0.2)
     # Reduce seed_data
     seed_df = filter_dataframe_by_composition(
         load_dataframe("oqmd1.2_exp_based_entries_featurized_v2"),
         "MnNiOSb",
     )
     with ScratchDir('.'):
         ProtoDFTCampaign(
             candidate_data=candidates,
             agent=random_agent,
             experiment=sampler,
             analyzer=stability_analyzer,
             seed_data=seed_df,
             heuristic_stopper=5,
         ).autorun()
Ejemplo n.º 18
0
 def from_chemsys(cls, chemsys):
     """
     Build a random-agent campaign on the CAMD test dataframe.

     Args:
         chemsys (str): chemical system; only used to build the s3
             prefix ``oqmd-atf/runs/<chemsys>``

     Returns:
         Instance of ``cls`` configured with a RandomAgent, a
         StabilityAnalyzer, an ATFSampler over the test dataframe and a
         created seed of 200 entries
     """
     test_df = pd.read_csv(os.path.join(CAMD_TEST_FILES, 'test_df.csv'))
     seed_size = 200  # Starting sample size
     # This many new candidates are "calculated with DFT"
     # (i.e. requested from Oracle -- DFT)
     query_size = 10
     random_agent = RandomAgent(n_query=query_size)
     stability_analyzer = StabilityAnalyzer(hull_distance=0.05)
     sampler = ATFSampler(dataframe=test_df)
     return cls(
         test_df,
         random_agent,
         sampler,
         stability_analyzer,
         create_seed=seed_size,
         s3_prefix="oqmd-atf/runs/{}".format(chemsys),
     )
Ejemplo n.º 19
0
 def test_run(self):
     """
     Smoke test: reserve a meta-agent campaign on Fe-O data and run it
     to completion from the reserved name with a random meta agent.
     """
     campaign_name = "test_meta_agent"
     pool = ParameterTable(RANDOM_TEST_AGENT_PARAMS)
     # Construct experiment
     oqmd_df = get_oqmd_data_by_chemsys("Fe-O")
     candidates, seed_df = partition_intercomp(oqmd_df, n_elements=1)
     simulation = LocalAgentSimulation(
         atf_candidate_data=candidates,
         seed_data=seed_df,
         analyzer=StabilityAnalyzer(),
         iterations=10,
     )
     campaign_analyzer = StabilityCampaignAnalyzer(
         checkpoint_indices=[2, 5, 10])
     MetaAgentCampaign.reserve(
         name=campaign_name,
         experiment=simulation,
         agent_pool=pool,
         analyzer=campaign_analyzer,
     )
     with ScratchDir('.'):
         print("Testing meta agent")
         meta_campaign = MetaAgentCampaign.from_reserved_name(
             campaign_name,
             meta_agent=RandomAgent(n_query=1),
         )
         meta_campaign.autorun()
     # Reaching this point means the campaign ran without an exception
     self.assertTrue(True)
Ejemplo n.º 20
0
    def test_plot_hull(self):
        """
        Check that ``StabilityAnalyzer.plot_hull`` writes ``hull.png`` for
        the "TiO" and the "TiNO" subsets of the analysis test data.
        """
        csv_path = os.path.join(CAMD_TEST_FILES, "test_df_analysis.csv")
        data = pd.read_csv(csv_path, index_col="id")
        data['Composition'] = data['formula']
        output_name = "hull.png"

        # Test 2D
        with ScratchDir('.'):
            analyzer = StabilityAnalyzer(hull_distance=0.1)
            binary_subset = filter_dataframe_by_composition(data, "TiO")
            analyzer.plot_hull(
                binary_subset,
                new_result_ids=["mp-685151", "mp-755875"],
                filename=output_name,
            )
            self.assertTrue(os.path.isfile(output_name))

        # Test 3D, reusing the analyzer with a tighter hull distance
        with ScratchDir('.'):
            analyzer.hull_distance = 0.05
            ternary_subset = filter_dataframe_by_composition(data, "TiNO")
            analyzer.plot_hull(
                ternary_subset,
                new_result_ids=["mp-776280", "mp-30998"],
                filename=output_name,
            )
            self.assertTrue(os.path.isfile(output_name))
Ejemplo n.º 21
0
    def from_chemsys(cls,
                     chemsys,
                     prefix="proto-dft-2/runs",
                     n_max_atoms=20,
                     agent=None,
                     analyzer=None,
                     experiment=None,
                     log_file="campaign.log",
                     cloudwatch_group="/camd/worker/dev/"):
        """
        Factory that builds a campaign for one chemical system.

        Sets up a logger with file, CloudWatch and stream handlers, writes
        ``start.json`` and ``candidate_data.pickle`` to the current
        directory, syncs the directory to ``<prefix>/<chemsys>`` in the
        CAMD bucket and fills in default agent, analyzer and experiment
        objects for any that were not supplied.

        Args:
            chemsys (str): chemical system for the campaign, with
                elements separated by dashes
            prefix (str): s3 prefix under which the run is stored
            n_max_atoms (int): maximum number of atoms passed to the
                structure domain
            agent (Agent): agent for the campaign; an
                AgentStabilityAdaBoost is created when falsy
            analyzer (Analyzer): analyzer for the campaign; a
                StabilityAnalyzer is created when falsy
            experiment: experiment for the campaign; an OqmdDFTonMC1 is
                created when falsy
            log_file (str): filename for the file log handler
            cloudwatch_group (str): cloudwatch group to log to

        Returns:
            (ProtoDFTCampaign): Standard proto-dft campaign from
                the chemical system

        """
        camd_logger = logging.Logger("camd")
        camd_logger.setLevel("INFO")
        to_file = logging.FileHandler(log_file)
        to_cloudwatch = CloudWatchLogHandler(log_group=cloudwatch_group,
                                             stream_name=chemsys)
        for handler in (to_file, to_cloudwatch, logging.StreamHandler()):
            camd_logger.addHandler(handler)

        camd_logger.info(
            "Starting campaign factory from_chemsys {}".format(chemsys))
        s3_prefix = "{}/{}".format(prefix, chemsys)

        # Initialize s3 with a start marker
        start_record = {
            "started": datetime.now().isoformat(),
            "version": __version__
        }
        dumpfn(start_record, "start.json")
        s3_sync(s3_bucket=CAMD_S3_BUCKET, s3_prefix=s3_prefix, sync_path='.')

        # Get structure domain
        elements = chemsys.split('-')
        max_coeff, charge_balanced = heuristic_setup(elements)
        structure_domain = StructureDomain.from_bounds(
            elements,
            charge_balanced=charge_balanced,
            n_max_atoms=n_max_atoms,
            grid=range(1, max_coeff))
        candidate_data = structure_domain.candidates()

        # Dump structure/candidate data and push it to s3
        with open('candidate_data.pickle', 'wb') as candidate_file:
            pickle.dump(candidate_data, candidate_file)
        s3_sync(s3_bucket=CAMD_S3_BUCKET, s3_prefix=s3_prefix, sync_path='.')
        camd_logger.info("Candidates generated")

        # Set up agents and loop parameters, keeping any supplied objects
        if not agent:
            agent = AgentStabilityAdaBoost(
                model=MLPRegressor(hidden_layer_sizes=(84, 50)),
                n_query=10,
                hull_distance=0.2,
                exploit_fraction=1.0,
                uncertainty=True,
                alpha=0.5,
                diversify=True,
                n_estimators=20)
        if not analyzer:
            analyzer = StabilityAnalyzer(hull_distance=0.2)
        if not experiment:
            experiment = OqmdDFTonMC1(timeout=30000)
        seed_data = load_dataframe("oqmd1.2_exp_based_entries_featurized_v2")

        # Construct and start loop
        campaign_kwargs = dict(
            candidate_data=candidate_data,
            agent=agent,
            experiment=experiment,
            analyzer=analyzer,
            seed_data=seed_data,
            heuristic_stopper=5,
            s3_prefix=s3_prefix,
            logger=camd_logger,
        )
        return cls(**campaign_kwargs)
Ejemplo n.º 22
0
    def from_chemsys(cls,
                     chemsys,
                     prefix="proto-dft-2/runs",
                     n_max_atoms=20,
                     agent=None,
                     analyzer=None,
                     experiment=None,
                     log_file="campaign.log",
                     cloudwatch_group="/camd/worker/dev/"):
        """
        Factory that builds a campaign for one chemical system, reusing
        cached candidates and cached experiments where available.

        Sets up a logger with file, CloudWatch and stream handlers, writes
        ``start.json`` and ``candidate_data.pickle`` to the current
        directory and syncs the directory to ``<prefix>/<chemsys>`` in the
        CAMD bucket.  Candidates are read from the protosearch cache key
        in the bucket when it exists, otherwise they are generated and
        written to that key.  Experiments returned by
        ``experiment.fetch_cached`` are analyzed into the seed data and
        removed from the candidates.

        Args:
            chemsys (str): chemical system for the campaign, with
                elements separated by dashes
            prefix (str): s3 prefix under which the run is stored
            n_max_atoms (int): maximum number of atoms passed to the
                structure domain; also part of the cache key
            agent (Agent): agent for the campaign; an
                AgentStabilityAdaBoost is created when falsy
            analyzer (Analyzer): analyzer for the campaign; a
                StabilityAnalyzer is created when falsy
            experiment: experiment for the campaign; an OqmdDFTonMC1 is
                created when falsy
            log_file (str): filename for the file log handler
            cloudwatch_group (str): cloudwatch group to log to

        Returns:
            (ProtoDFTCampaign): Standard proto-dft campaign from
                the chemical system

        """
        camd_logger = logging.Logger("camd")
        camd_logger.setLevel("INFO")
        to_file = logging.FileHandler(log_file)
        to_cloudwatch = CloudWatchLogHandler(log_group=cloudwatch_group,
                                             stream_name=chemsys)
        for handler in (to_file, to_cloudwatch, logging.StreamHandler()):
            camd_logger.addHandler(handler)

        camd_logger.info(
            "Starting campaign factory from_chemsys {}".format(chemsys))
        s3_prefix = "{}/{}".format(prefix, chemsys)

        # Initialize s3 with a start marker
        start_record = {
            "started": datetime.now().isoformat(),
            "version": __version__
        }
        dumpfn(start_record, "start.json")
        s3_sync(s3_bucket=CAMD_S3_BUCKET, s3_prefix=s3_prefix, sync_path='.')

        # Get structure domain
        # Check cache
        cache_key = "protosearch_cache/v1/{}/{}/candidates.pickle".format(
            chemsys, n_max_atoms)
        # TODO: create test of isfile
        if not s3_key_exists(bucket=CAMD_S3_BUCKET, key=cache_key):
            camd_logger.info(
                "Generating domain with max {} atoms.".format(n_max_atoms))
            elements = chemsys.split('-')
            max_coeff, charge_balanced = heuristic_setup(elements)
            structure_domain = StructureDomain.from_bounds(
                elements,
                charge_balanced=charge_balanced,
                n_max_atoms=n_max_atoms,
                grid=range(1, max_coeff))
            candidate_data = structure_domain.candidates()
            camd_logger.info("Candidates generated")
            # Store the generated candidates under the cache key
            candidate_data.to_pickle("s3://{}/{}".format(
                CAMD_S3_BUCKET, cache_key))
            camd_logger.info(
                "Cached protosearch domain at {}.".format(cache_key))
        else:
            camd_logger.info("Found cached protosearch domain.")
            candidate_data = pd.read_pickle("s3://{}/{}".format(
                CAMD_S3_BUCKET, cache_key))
            camd_logger.info("Loaded cached {}.".format(cache_key))

        # Dump structure/candidate data
        candidate_data.to_pickle("candidate_data.pickle")
        s3_sync(s3_bucket=CAMD_S3_BUCKET, s3_prefix=s3_prefix, sync_path='.')

        # Set up agents and loop parameters, keeping any supplied objects
        if not agent:
            agent = AgentStabilityAdaBoost(
                model=MLPRegressor(hidden_layer_sizes=(84, 50)),
                n_query=10,
                hull_distance=0.2,
                exploit_fraction=1.0,
                uncertainty=True,
                alpha=0.5,
                diversify=True,
                n_estimators=20)
        if not analyzer:
            analyzer = StabilityAnalyzer(hull_distance=0.2)
        if not experiment:
            experiment = OqmdDFTonMC1(timeout=30000,
                                      prefix_append="proto-dft")
        seed_data = load_dataframe("oqmd1.2_exp_based_entries_featurized_v2")

        # Load cached experiments
        camd_logger.info("Loading cached experiments")
        cached_experiments = experiment.fetch_cached(candidate_data)
        camd_logger.info(
            "Found {} experiments.".format(len(cached_experiments)))
        if len(cached_experiments) > 0:
            # Only the updated seed is kept; the summary is not used
            _, seed_data = analyzer.analyze(cached_experiments, seed_data)
            # Remove cached experiments from candidate_data
            remaining_ids = candidate_data.index.difference(
                cached_experiments.index, sort=False)
            candidate_data = candidate_data.loc[remaining_ids.tolist()]
            camd_logger.info("Cached experiments added to seed.")

        # Construct and start loop
        campaign_kwargs = dict(
            candidate_data=candidate_data,
            agent=agent,
            experiment=experiment,
            analyzer=analyzer,
            seed_data=seed_data,
            heuristic_stopper=5,
            s3_prefix=s3_prefix,
            logger=camd_logger,
        )
        return cls(**campaign_kwargs)
Ejemplo n.º 23
0
from sklearn.neural_network import MLPRegressor
from camd.agent.stability import AgentStabilityML5
from camd.analysis import StabilityAnalyzer
from camd.experiment.base import ATFSampler
from camd.utils.data import load_default_atf_data

# Campaign is used below but was missing from the imports of this script.
# NOTE(review): module path assumed to be camd.campaigns.base -- confirm
# against the installed camd version.
from camd.campaigns.base import Campaign

##########################################################
# Load the default ATF dataset (no further filtering is applied here)
##########################################################
df = load_default_atf_data()

## Epsilon-Greedy
n_seed = 5000  # Starting sample size - a seed of this size will be randomly chosen.
n_query = 200  # This many new candidates are "calculated with DFT" (i.e. requested from Oracle -- DFT)
agent = AgentStabilityML5(model=MLPRegressor(hidden_layer_sizes=(84, 50)),
                          n_query=n_query,
                          hull_distance=0.05,
                          exploit_fraction=0.5)
analyzer = StabilityAnalyzer(hull_distance=0.05)
experiment = ATFSampler(dataframe=df)
candidate_data = df
##########################################################
# Build the campaign and run four iterations, initializing it first
##########################################################
new_loop = Campaign(candidate_data,
                    agent,
                    experiment,
                    analyzer,
                    create_seed=n_seed)

new_loop.auto_loop(n_iterations=4, initialize=True)