Python filter_dataframe_by_composition 예제들, camd.utils.data.filter_dataframe_by_composition Python 예제들

예제 #1

0

파일 보기

    def get_pd(self, chemsys=None):
        """
        Rebuild the phase data from the seed data and store it on self.pd

        Args:
            chemsys (str): dash-delimited chemical system; when given,
                the seed data is filtered by the composition built from
                it, so only a partial phase diagram is produced. When
                falsy, all of the seed data is used.

        Returns:
            the newly populated PhaseData object (same object as self.pd)
        """
        self.pd = PhaseData()

        # Narrow the seed data down to the requested chemsys, if any
        if chemsys:
            seed_subset = filter_dataframe_by_composition(
                self.seed_data, Composition(chemsys.replace('-', '')))
        else:
            seed_subset = self.seed_data

        # One phase per seed row, labelled with the row's index
        phases = []
        for label, record in seed_subset.iterrows():
            phases.append(
                Phase(
                    record["Composition"],
                    energy=record["delta_e"],
                    per_atom=True,
                    description=label,
                ))

        # Every entry of ELEMENTS is added as a zero-energy phase
        for element in ELEMENTS:
            phases.append(Phase(element, 0.0, per_atom=True))

        self.pd.add_phases(phases)
        return self.pd

예제 #2

0

파일 보기

 def test_simulated(self):
     """
     Run a simulated ProtoDFTCampaign with an AdaBoost stability agent
     and check that 'hull_finalized.png' exists after the run.
     """
     exp_dataframe = pd.read_pickle(
         os.path.join(CAMD_TEST_FILES, "mn-ni-o-sb.pickle"))
     experiment = ATFSampler(exp_dataframe)
     # Candidate data is the experimental frame minus its last 11 columns
     candidate_data = exp_dataframe.iloc[:, :-11]
     # Set up agents and loop parameters
     agent = AgentStabilityAdaBoost(
         model=MLPRegressor(hidden_layer_sizes=(84, 50)),
         n_query=2,
         hull_distance=0.2,
         exploit_fraction=1.0,
         uncertainty=True,
         alpha=0.5,
         diversify=True,
         n_estimators=20)
     analyzer = StabilityAnalyzer(hull_distance=0.2)
     # Reduce seed_data
     icsd_data = load_dataframe("oqmd1.2_exp_based_entries_featurized_v2")
     seed_data = filter_dataframe_by_composition(icsd_data, "MnNiOSb")
     leftover = ~icsd_data.index.isin(seed_data.index)
     # Add some random other data to test compositional flexibility.
     # pd.concat is used because DataFrame.append was removed in pandas 2.0.
     seed_data = pd.concat(
         [seed_data, icsd_data.loc[leftover].sample(30)], sort=False)
     # Drop the reference to the full dataframe before running the campaign
     del icsd_data
     with ScratchDir('.'):
         campaign = ProtoDFTCampaign(candidate_data=candidate_data,
                                     agent=agent,
                                     experiment=experiment,
                                     analyzer=analyzer,
                                     seed_data=seed_data,
                                     heuristic_stopper=5)
         campaign.autorun()
         self.assertTrue(os.path.isfile('hull_finalized.png'))

예제 #3

0

파일 보기

    def test_plot_hull(self):
        """
        Check that plot_hull writes 'hull.png' for a 2D and a 3D case
        """
        csv_path = os.path.join(CAMD_TEST_FILES, "test_df_analysis.csv")
        data = pd.read_csv(csv_path, index_col="id")
        # plot_hull reads the 'Composition' column, so mirror 'formula'
        data['Composition'] = data['formula']

        # 2D case: data filtered with "TiO"
        with ScratchDir('.'):
            analyzer = StabilityAnalyzer(hull_distance=0.1)
            binary = filter_dataframe_by_composition(data, "TiO")
            analyzer.plot_hull(
                binary,
                new_result_ids=["mp-685151", "mp-755875"],
                filename="hull.png",
            )
            self.assertTrue(os.path.isfile("hull.png"))

        # 3D case: same analyzer, tighter hull distance, "TiNO" filter
        with ScratchDir('.'):
            analyzer.hull_distance = 0.05
            ternary = filter_dataframe_by_composition(data, "TiNO")
            analyzer.plot_hull(
                ternary,
                new_result_ids=["mp-776280", "mp-30998"],
                filename="hull.png",
            )
            self.assertTrue(os.path.isfile("hull.png"))

예제 #4

0

파일 보기

 def test_simulated(self):
     """
     Run a simulated ProtoDFTCampaign driven by a RandomAgent
     """
     pickle_path = os.path.join(CAMD_TEST_FILES, "mn-ni-o-sb.pickle")
     exp_dataframe = pd.read_pickle(pickle_path)
     experiment = ATFSampler(exp_dataframe)
     # Candidate data is the experimental frame minus its last 11 columns
     candidate_data = exp_dataframe.iloc[:, :-11]
     agent = RandomAgent(n_query=2)
     analyzer = StabilityAnalyzer(hull_distance=0.2)

     # Reduce seed_data
     full_seed = load_dataframe("oqmd1.2_exp_based_entries_featurized_v2")
     seed_data = filter_dataframe_by_composition(full_seed, "MnNiOSb")

     with ScratchDir('.'):
         campaign = ProtoDFTCampaign(
             candidate_data=candidate_data,
             agent=agent,
             experiment=experiment,
             analyzer=analyzer,
             seed_data=seed_data,
             heuristic_stopper=5,
         )
         campaign.autorun()

예제 #5

0

파일 보기

 def test_analyze(self):
     """
     Analyze the "TiNO"-filtered test data in two passes and verify the
     stability values assigned to the two held-out entries
     """
     csv_path = os.path.join(CAMD_TEST_FILES, "test_df_analysis.csv")
     frame = pd.read_csv(csv_path, index_col="id")
     frame['Composition'] = frame['formula']
     analyzer = StabilityAnalyzer(hull_distance=0.1)

     pool = filter_dataframe_by_composition(frame, "TiNO")
     # TODO: resolve drop_duplicates filtering mp data
     pool = pool.drop_duplicates(keep='last').dropna()

     # Hold two entries out to act as the "new" experimental results
     held_out_ids = ["mp-30998", "mp-572822"]
     held_out = pool.loc[held_out_ids]
     pool = pool.drop(index=held_out_ids)

     # First pass: everything else is analyzed against an empty seed
     _, seeded = analyzer.analyze(
         new_experimental_results=pool,
         seed_data=pd.DataFrame(),
     )
     # Second pass: the held-out entries are added to that seed
     _, new_seed = analyzer.analyze(
         new_experimental_results=held_out,
         seed_data=seeded,
     )

     self.assertAlmostEqual(new_seed.loc['mp-30998', 'stability'], 0)
     self.assertAlmostEqual(new_seed.loc["mp-572822", 'stability'], 0.52784795)
     self.assertTrue(new_seed.loc['mp-30998', 'is_stable'])
     self.assertFalse(new_seed.loc["mp-572822", 'is_stable'])

예제 #6

0

파일 보기

파일: analysis.py 프로젝트: sailfish009/CAMD

def update_run_w_structure(folder, hull_distance=0.2, parallel=True):
    """
    Updates a campaign grouped in directories with structure analysis

    Works inside ``folder``: reloads the pickled seed data and experiment,
    recomputes stabilities, writes "hull_finalized.png" and, when the
    experiment results contain a "structure" column, also writes
    "discovered_unique_structures.json" and "structure_report.log".

    Args:
        folder (str): campaign directory to enter for the analysis
        hull_distance (float): forwarded to StabilityAnalyzer
        parallel (bool): forwarded to StabilityAnalyzer

    Returns:
        None
    """
    with cd(folder):
        required_files = ["seed_data.pickle"]
        if os.path.isfile("error.json"):
            error = loadfn("error.json")
            print("{} ERROR: {}".format(folder, error))

        if not all(os.path.isfile(fn) for fn in required_files):
            # The original message never filled in its "{}" placeholder
            print("{} ERROR: no seed data, no analysis to be done".format(
                folder))
            return

        # NOTE(review): pickle.load can execute arbitrary code; only run
        # this on campaign directories from a trusted source.
        with open("seed_data.pickle", "rb") as f:
            df = pickle.load(f)

        # NOTE(review): "experiment.pickle" is not in required_files, so a
        # missing file raises here rather than printing the message above.
        with open("experiment.pickle", "rb") as f:
            experiment = pickle.load(f)
            # Hack to update agg_history
            experiment.update_current_data(None)

        all_submitted, all_results = experiment.agg_history
        # Rows whose index is not among the experiment results are "old";
        # everything else in the seed data is treated as new.
        old_results = df.drop(all_results.index, errors='ignore')
        new_results = df.drop(old_results.index)
        st_a = StabilityAnalyzer(hull_distance=hull_distance,
                                 parallel=parallel,
                                 entire_space=False,
                                 plot=False)
        summary, new_seed = st_a.analyze(new_results, old_results)

        # Having calculated stabilities again, we plot the overall hull.
        # Filter by chemsys
        new_comp = new_results['Composition'].sum()
        filtered = filter_dataframe_by_composition(new_seed, new_comp)
        st_a.plot_hull(
            filtered,
            all_submitted.index,
            filename="hull_finalized.png",
            finalize=True,
        )

        # Missing "is_stable" values count as not stable
        stable_discovered = new_seed[new_seed["is_stable"].fillna(False)]

        # Analyze structures if present in experiment
        if "structure" not in all_results.columns:
            return

        s_a = AnalyzeStructures()
        s_a.analyze_vaspqmpy_jobs(all_results,
                                  against_icsd=True,
                                  use_energies=True)

        # Keep structures flagged unique whose id is among the stable ones
        unique_s_dict = {}
        for i, structure in enumerate(s_a.structures):
            structure_id = s_a.structure_ids[i]
            if s_a.structure_is_unique[i] and (
                    structure_id in stable_discovered.index):
                unique_s_dict[structure_id] = structure

        with open("discovered_unique_structures.json", "w") as f:
            json.dump({k: s.as_dict() for k, s in unique_s_dict.items()}, f)

        # Counts written in the same order as the header columns
        report_counts = [
            len(all_submitted),
            len(stable_discovered),
            len(unique_s_dict),
            len(s_a.structures) - sum(s_a._not_duplicate),
            sum(not i for i in s_a._icsd_filter),
        ]
        with open("structure_report.log", "w") as f:
            f.write(
                "consumed discovery unique_discovery duplicate in_icsd \n"
            )
            f.write(" ".join(str(count) for count in report_counts))

예제 #7

0

파일 보기

파일: analysis.py 프로젝트: sailfish009/CAMD

    def plot_hull(self, df, new_result_ids, filename=None, finalize=False):
        """
        Generate plots of convex hulls for each of the runs

        The hull is built from the entries that are not in
        new_result_ids; the new results are then drawn on top as green
        circles (energy above hull below self.hull_distance) or red
        crosses (otherwise).

        Args:
            df (DataFrame): dataframe with formation energies and formulas
                (the 'delta_e' and 'Composition' columns are used)
            new_result_ids ([]): list of new result ids (i. e. indexes
                in the updated dataframe)
            filename (str): filename to output, if None, no file output
                is produced
            finalize (bool): flag indicating whether to include all new
                results; if True, the prior hull is drawn dashed and the
                hull of all entries is overlaid and used to measure
                stabilities

        Returns:
            None: the figure is saved (when filename is given) and then
                closed. The method also returns None early, after a
                warning, when there are more than 4 elements or when no
                entries exist outside new_result_ids.
        """
        # Generate all entries
        total_comp = Composition(df['Composition'].sum())
        if len(total_comp) > 4:
            warnings.warn(
                "Number of elements too high for phase diagram plotting")
            return None
        filtered = filter_dataframe_by_composition(df, total_comp)
        filtered = filtered[['delta_e', 'Composition']]
        filtered = filtered.dropna()

        # Create computed entry column with un-normalized energies
        filtered["entry"] = [
            ComputedEntry(
                Composition(row["Composition"]),
                row["delta_e"] * Composition(row["Composition"]).num_atoms,
                entry_id=index,
            ) for index, row in filtered.iterrows()
        ]

        ids_prior_to_run = list(set(filtered.index) - set(new_result_ids))
        if not ids_prior_to_run:
            warnings.warn(
                "No prior data, prior phase diagram cannot be constructed")
            return None

        # Create phase diagram based on everything prior to current run
        entries = filtered.loc[ids_prior_to_run]["entry"].dropna()

        pg_elements = sorted(total_comp.keys())
        # Named phase_diagram (not pd) to avoid shadowing the pandas alias
        phase_diagram = PhaseDiagram(entries, elements=pg_elements)
        plotkwargs = {
            "markerfacecolor": "white",
            "markersize": 7,
            "linewidth": 2,
            # The prior hull is dashed when the final hull is overlaid
            "linestyle": "--" if finalize else "-",
        }
        plotter = PDPlotter(phase_diagram, backend='matplotlib', **plotkwargs)

        getplotkwargs = {"label_stable": False} if finalize else {}
        plot = plotter.get_plot(**getplotkwargs)

        # Get valid results
        valid_results = [
            new_result_id for new_result_id in new_result_ids
            if new_result_id in filtered.index
        ]

        if finalize:
            # If finalize, we'll reset the phase diagram to all entries at
            # this point to measure stabilities wrt. the ultimate hull.
            phase_diagram = PhaseDiagram(filtered["entry"].values,
                                         elements=pg_elements)
            plotter = PDPlotter(phase_diagram,
                                backend="matplotlib",
                                markersize=0,
                                linestyle="-",
                                linewidth=2)
            plot = plotter.get_plot(plt=plot)

        for entry in filtered["entry"][valid_results]:
            _, e_hull = phase_diagram.get_decomp_and_e_above_hull(
                entry, allow_negative=True)
            if e_hull < self.hull_distance:
                color, marker = "g", "o"
            else:
                color, marker = "r", "x"

            # Get coords; the first element's fraction is dropped
            coords = [
                entry.composition.get_atomic_fraction(el)
                for el in phase_diagram.elements
            ][1:]
            if phase_diagram.dim == 2:
                coords = coords + [
                    phase_diagram.get_form_energy_per_atom(entry)
                ]
            elif phase_diagram.dim == 3:
                coords = triangular_coord(coords)
            elif phase_diagram.dim == 4:
                coords = tet_coord(coords)
            plot.plot(*coords,
                      marker=marker,
                      markeredgecolor=color,
                      markerfacecolor="None",
                      markersize=11,
                      markeredgewidth=1)

        if filename is not None:
            plot.savefig(filename, dpi=70)
        plot.close()

예제 #8

0

파일 보기

파일: analysis.py 프로젝트: sailfish009/CAMD

    def analyze(self, new_experimental_results, seed_data):
        """
        Add new experimental results to the seed data and compute the
        stability of the resulting phases

        Args:
            new_experimental_results (DataFrame): new experimental
                results to be added to the seed
            seed_data (DataFrame): seed to be augmented via
                the new_experimental_results

        Returns:
            (DataFrame): summary of the process, i. e. of
                the increment or experimental results
            (DataFrame): augmented seed data, i. e. "new"
                seed data according to the experimental results

        """
        # Check for new results; the total composition is taken before
        # rows without a 'delta_e' value are dropped
        new_comp = new_experimental_results['Composition'].sum()
        new_experimental_results = new_experimental_results.dropna(
            subset=['delta_e'])
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed
        new_seed = pd.concat([seed_data, new_experimental_results],
                             sort=False)

        # Aggregate seed_data and new experimental results
        include_columns = ["Composition", "delta_e"]
        filtered = new_seed[include_columns].drop_duplicates(
            keep="last").dropna()

        if not self.entire_space:
            # Constrains the phase space to that of the target compounds.
            # More efficient when searching in a specified chemistry,
            # less efficient if larger spaces are without specified chemistry.
            filtered = filter_dataframe_by_composition(filtered, new_comp)

        space = self.get_phase_space(filtered)
        new_phases = [
            p for p in space.phases if p.description in filtered.index
        ]

        space.compute_stabilities(phases=new_phases, ncpus=self.parallel)

        # Compute new stabilities and update new seed, note that pandas will complain
        # if the index is not explicit due to multiple types (e. g. ints for OQMD
        # and strs for prototypes)
        new_data = pd.DataFrame(
            {"stability": [phase.stability for phase in new_phases]},
            index=[phase.description for phase in new_phases])
        new_data["is_stable"] = new_data["stability"] <= self.hull_distance

        # TODO: This is implicitly adding "stability", and "is_stable" columns
        #       but could be handled more gracefully
        if "stability" not in new_seed.columns:
            new_seed = pd.concat([new_seed, new_data], axis=1, sort=False)
        else:
            new_seed.update(new_data)

        # Write hull figure to disk
        if self.plot:
            self.plot_hull(filtered,
                           new_experimental_results.index,
                           filename="hull.png")

        # Compute summary metrics
        summary = self.get_summary(
            new_seed,
            new_experimental_results.index,
            initial_seed_indices=self.initial_seed_indices,
        )
        # Drop excess columns from experiment
        new_seed = new_seed.drop([
            'path', 'status', 'start_time', 'jobId', 'jobName', 'jobArn',
            'result', 'error', 'elapsed_time'
        ],
                                 axis="columns",
                                 errors="ignore")
        return summary, new_seed