import pandas as pd
import tqdm
from pymatgen import MPRester   # import location varies across pymatgen versions
from gaspy.utils import read_rc


def _prepare_grouped_df(adsorbate1, adsorbate2):
    """
    This function takes the adsorbates of interest
    and make a dataframe that will be used for making 2D plots.
    Each row in the dataframe is grouped by unique surface.
    A surface is defined with mpid, Miller index, top, and shift.

    Args:
        adsorbate1      adsorbate1, who's energy is going on x-axis
        adsorbate1      adsorbate2, who's energy is going on y-axis

    Returns:
        adsorbate1_df   dataframe of adsorbate1
        adsorbate2_df   dataframe of adsorbate2
        grouped_df      dataframe used for plotting
    """
    surface_fp = ['mpid', 'millerx', 'millery', 'millerz', 'top', 'shift']
    adsorbate1_df = _make_df_from_docs(adsorbate1, surface_fp)
    adsorbate2_df = _make_df_from_docs(adsorbate2, surface_fp)

    # merge them together based on unique surface
    grouped_results = pd.merge(adsorbate1_df, adsorbate2_df,
                               on=surface_fp).dropna()
    # Drop rows where both adsorbates' energies are ML predictions
    grouped_results = grouped_results.drop(
        grouped_results[(grouped_results['{}_DFT'.format(adsorbate1)] == False)
                        & (grouped_results['{}_DFT'.format(adsorbate2)] ==
                           False)].index).reset_index()

    # Add formula to the dataframe based on mpid
    mpids = set(grouped_results['mpid'])
    compositions_by_mpid = {}
    print('Beginning to pull data from the Materials Project...')
    with MPRester(read_rc()['matproj_api_key']) as mat_proj:
        for mpid in tqdm.tqdm_notebook(mpids):
            try:
                entry = mat_proj.get_entry_by_material_id({'task_ids': mpid})
                formula, _ = entry.composition.get_reduced_formula_and_factor()
                compositions_by_mpid[mpid] = formula
            except IndexError:
                compositions_by_mpid[mpid] = ""
    data = list(compositions_by_mpid.items())
    df_new = pd.DataFrame(data, columns=['mpid', 'formula'])
    grouped_df = pd.merge(grouped_results, df_new, on='mpid')

    return grouped_df
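
# Hypothetical usage sketch: build the plotting dataframe for CO (x-axis)
# vs. OH (y-axis). This assumes `_make_df_from_docs` (defined elsewhere in
# this module) returns one row per adsorption result with the surface
# fingerprint columns plus `{adsorbate}_DFT` boolean flags, and that a
# Materials Project API key is configured in .gaspyrc.
grouped_df = _prepare_grouped_df('CO', 'OH')
print(grouped_df[['mpid', 'formula']].head())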
Example #2

import glob
import os
def fetch_tar_file(launch_id, verify_tars=True):
    '''
    Back up the output of a FireWorks launch as a tarball in the launches
    backup directory, rsyncing it from the cluster it ran on if a valid
    backup is not already present. Assumes a module-level FireWorks
    LaunchPad object named `lpad`.
    '''
    backup_directory = read_rc()['launches_backup_directory']
    backup_loc = backup_directory + '/%d.tar.gz' % launch_id

    # If a backup tarball already exists, verify its integrity and delete
    # it if it turns out to be corrupted
    if len(glob.glob(backup_loc)) > 0 and verify_tars:
        output = os.system('tar -tzf %s >/dev/null' % backup_loc)
        if output != 0:
            os.remove(backup_loc)

    # Fetch the launch if no valid backup exists
    if len(glob.glob(backup_loc)) == 0:
        try:
            launch = lpad.get_launch_by_id(launch_id)
        except Exception:
            print('could not find launch id %d' % launch_id)
            return

        if launch.state != 'COMPLETED':
            print(launch.fw_id)
            return

        cluster = launch.fworker.name
        if cluster == 'gilgamesh':
            os.system("rsync -azqP --max-size=100M gilgamesh:%s /tmp/%s" % (launch.launch_dir, launch_id))
        elif 'arjuna' in cluster:
            os.system("rsync -azqP --max-size=100M arjuna:%s /tmp/%s" % (launch.launch_dir, launch_id))
        elif 'Cori' in cluster or '/global/project/projectdirs/m2755/' in launch.launch_dir:
            launch_dir = launch.launch_dir
            if '/global/u2/z/zulissi' in launch_dir:
                launch_dir = '/global/project/projectdirs/m2755/fireworks_zu/fireworks/' + launch_dir[30:]

            os.system("rsync -azqP --max-size=100M %s /tmp/%s" % (launch_dir, launch_id))
        else:
            print('unknown cluster!: %s' % cluster)
            print(launch.fw_id)
        launch_dir_folder = launch.launch_dir.split('/')[-1]
        os.system("(cd /tmp/%s/%s && tar -czf %s *)" % (launch_id, launch_dir_folder, backup_loc))
        os.system("rm -r /tmp/%s" % launch_id)
    def _get_compositions_by_mpid(self):
        '''
        We use the Materials Project's python API to find the composition of
        various materials given their MPIDs. This can take awhile though, so we
        also cache the results and modify the cache as necessary.

        Resulting attribute:
            compositions_by_mpid_   A dictionary whose keys are MPIDs and whose
                                    values are lists of strings for each
                                    element that is present in the
                                    corresponding material. This object is
                                    cached and therefore may have extra
                                    key:value pairings that you may not need.
        '''
        # Find the current cache of compositions.
        try:
            with open(CACHE_LOCATION + 'mp_comp_data.pkl',
                      'rb') as file_handle:
                compositions_by_mpid = pickle.load(file_handle)

        # If the cache is not there, then create it
        except FileNotFoundError:
            compositions_by_mpid = {}

            # Figure out which compositions we still need to figure out
            catalog_docs = get_catalog_docs()
            mpids = {
                doc['mpid']
                for doc in self.adsorption_docs + catalog_docs
            }

            # Each MP document may contain several MPIDs. Here we get every
            # single document whose list of associated MPIDs matches anything
            # in our list of missing MPIDs.
            with MPRester(read_rc('matproj_api_key')) as rester:
                query = {'task_ids': {'$elemMatch': {'$in': list(mpids)}}}
                properties = ['elements', 'task_ids']
                mp_docs = rester.query(criteria=query, properties=properties)

            # Match the MP documents to our missing MPIDs.
            for mpid in mpids:
                for doc in mp_docs:
                    if mpid in set(doc['task_ids']):
                        compositions_by_mpid[mpid] = doc['elements']
                        break

            # Save the updated cache
            with open(CACHE_LOCATION + 'mp_comp_data.pkl',
                      'wb') as file_handle:
                pickle.dump(compositions_by_mpid, file_handle)

        self.compositions_by_mpid_ = compositions_by_mpid
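
    # A hypothetical companion sketch, not part of the original class: the
    # docstring above says the cache may be modified as necessary, but the
    # code only queries the Materials Project when the cache file is missing
    # entirely. This method would fill in MPIDs absent from an existing
    # cache, reusing the same query pattern.
    def _update_compositions_cache(self):
        compositions_by_mpid = self.compositions_by_mpid_
        catalog_docs = get_catalog_docs()
        mpids = {doc['mpid'] for doc in self.adsorption_docs + catalog_docs}
        missing_mpids = mpids - set(compositions_by_mpid)
        if missing_mpids:
            with MPRester(read_rc('matproj_api_key')) as rester:
                query = {'task_ids': {'$elemMatch': {'$in': list(missing_mpids)}}}
                mp_docs = rester.query(criteria=query,
                                       properties=['elements', 'task_ids'])
            for mpid in missing_mpids:
                for doc in mp_docs:
                    if mpid in set(doc['task_ids']):
                        compositions_by_mpid[mpid] = doc['elements']
                        break
            with open(CACHE_LOCATION + 'mp_comp_data.pkl', 'wb') as file_handle:
                pickle.dump(compositions_by_mpid, file_handle)
        self.compositions_by_mpid_ = compositions_by_mpid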
__author__ = 'Kevin Tran'
__email__ = '*****@*****.**'

from collections import defaultdict
import warnings
from abc import ABC, abstractmethod
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
with warnings.catch_warnings():
    warnings.filterwarnings('ignore', message='numpy.dtype size changed')
    import mendeleev
from gaspy.atoms_operators import get_stoich_from_mpid
from gaspy.utils import read_rc
from gaspy.gasdb import get_catalog_docs

CACHE_LOCATION = read_rc('gasdb_path')


class Fingerprinter(ABC, BaseEstimator, TransformerMixin):
    '''
    This is a template fingerprinter that is meant to be extended before
    use; subclasses must implement a `transform` method.

    The especially useful attributes that this class has are `dummy_fp_`,
    `max_num_species_`, `median_adsorption_energies_`, and `mendeleev_data_`.
    For more details on what they are, refer to the respective methods.

    Refer to Tran & Ulissi (Nature Catalysis, 2018) for even more details.
    '''
    def __init__(self):
        pass
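
# A minimal hypothetical extension: the class only requires subclasses to
# implement `transform`, so a trivial subclass could look like this. Real
# subclasses would build physical features from the cached attributes,
# e.g. `mendeleev_data_`.
class DummyFingerprinter(Fingerprinter):
    def transform(self, docs):
        # One constant feature per document, purely for illustration
        return np.array([[1.] for _ in docs])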
Example #5
from collections import defaultdict
from datetime import datetime
import pickle
from atomicwrites import atomic_write
import numpy as np
from pymongo import UpdateOne
import plotly.plotly as plotly   # legacy (pre-v4) Plotly API; provides the sign_in/plot calls used below
import plotly.graph_objs as go
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tpot import TPOTRegressor
from gaspy.utils import read_rc, multimap_method
from gaspy.gasdb import get_catalog_docs, get_adsorption_docs, get_mongo_collection
from gaspy_regress import fingerprinters

GASDB_LOCATION = read_rc('gasdb_path')
PREDICTION_CACHES = {
    ('model0', 'CO'): GASDB_LOCATION + '/predictions_tpot_CO.pkl',
    ('model0', 'H'): GASDB_LOCATION + '/predictions_tpot_H.pkl',
    ('model0', 'N'): GASDB_LOCATION + '/predictions_tpot_N.pkl',
    ('model0', 'O'): GASDB_LOCATION + '/predictions_tpot_O.pkl',
    ('model0', 'OH'): GASDB_LOCATION + '/predictions_tpot_OH.pkl',
    ('model0', 'OOH'): GASDB_LOCATION + '/predictions_tpot_OOH.pkl'
}
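
# These caches are plain pickles, so reading one back is straightforward.
# A sketch; the pickled structure is whatever the prediction step saves:
with open(PREDICTION_CACHES[('model0', 'CO')], 'rb') as file_handle:
    co_predictions = pickle.load(file_handle)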


def fit_model0_adsorption_energies(adsorbate):
    '''
    Create and save a modeling pipeline to predict adsorption energies.

    Arg:
        adsorbate   A string indicating which adsorbate to fit the model for
    '''
def plot_2D_plot(adsorbate1, adsorbate2, adsorbate1_correction,
                 adsorbate2_correction):
    """
    When called, this function will return a 2D plotly plot with x-aixs
    being adsorbate1, y-axis being adsorbate2.
    In addition, it will also cache the adsorption configuration to AWS.
    The plot can be accessed via
        http://ulissigroup.cheme.cmu.edu/gaspy_plots/index.html?plotly=(plot# here)

    Args:
        adsorbate1,     adsorbate1, who's energy is going on x-axis
        adsorbate1,     adsorbate2, who's energy is going on y-axis
        adsorbateX_correction: energy correction for adsorbate on X-axis
                                 e.g. we have dE for adsorption energy, if
                                 we want to convert to dG.
                                 default is 0 if we plot dE
          adsorbateY_correction: energy correction for adsorbate on Y-axis
                                 e.g. we have dE for adsorption energy, if
                                 we want to convert to dG.
                                 default is 0 if we plot dE

    """
    grouped_df = _prepare_grouped_df(adsorbate1, adsorbate2)

    puremetal_mpid = [
        'mp-101', 'mp-23', 'mp-2', 'mp-91', 'mp-1080711', 'mp-1061133',
        'mp-58', 'mp-13', 'mp-124', 'mp-30', 'mp-32', 'mp-72', 'mp-142',
        'mp-81', 'mp-79', 'mp-1056037', 'mp-94', 'mp-126', 'mp-146',
        'mp-1055994', 'mp-104', 'mp-117', 'mp-35', 'mp-90', 'mp-75',
        'mp-10172', 'mp-632250', 'mp-1056438', 'mp-8', 'mp-74', 'mp-20483',
        'mp-54', 'mp-48', 'mp-134', 'mp-129', 'mp-11', 'mp-1056486', 'mp-149',
        'mp-45', 'mp-49', 'mp-96', 'mp-85', 'mp-568584', 'mp-754514', 'mp-33',
        'mp-999498', 'mp-127', 'mp-25', 'mp-132', 'mp-570747', 'mp-14',
        'mp-154', 'mp-672234', 'mp-672233'
    ]

    # DFT_adsorbate1, DFT_adsorbate2
    DFT_ads1_ads2 = grouped_df.loc[
        (grouped_df['{}_DFT'.format(adsorbate1)] == True)
        & (grouped_df['{}_DFT'.format(adsorbate2)] == True)]
    metal_DFT_ads1_ads2 = DFT_ads1_ads2.loc[DFT_ads1_ads2['mpid'].isin(
        puremetal_mpid)]
    intermetallics_DFT_ads1_ads2 = DFT_ads1_ads2[
        ~DFT_ads1_ads2.isin(metal_DFT_ads1_ads2)].dropna()

    # DFT_adsorbate1, ML_adsorbate2
    DFT_ads1_ML_ads2 = grouped_df.loc[
        (grouped_df['{}_DFT'.format(adsorbate1)] == True)
        & (grouped_df['{}_DFT'.format(adsorbate2)] == False)]
    metal_DFT_ads1_ML_ads2 = DFT_ads1_ML_ads2.loc[
        DFT_ads1_ML_ads2['mpid'].isin(puremetal_mpid)]
    intermetallics_DFT_ads1_ML_ads2 = DFT_ads1_ML_ads2[
        ~DFT_ads1_ML_ads2.isin(metal_DFT_ads1_ML_ads2)].dropna()

    # ML_adsorbate1, DFT_adsorbate2
    ML_ads1_DFT_ads2 = grouped_df.loc[
        (grouped_df['{}_DFT'.format(adsorbate1)] == False)
        & (grouped_df['{}_DFT'.format(adsorbate2)] == True)]
    metal_ML_ads1_DFT_ads2 = ML_ads1_DFT_ads2.loc[
        ML_ads1_DFT_ads2['mpid'].isin(puremetal_mpid)]
    intermetallics_ML_ads1_DFT_ads2 = ML_ads1_DFT_ads2[
        ~ML_ads1_DFT_ads2.isin(metal_ML_ads1_DFT_ads2)].dropna()

    # get scatter points for plotting
    print('Plotting DFT_{}, DFT_{}'.format(adsorbate1, adsorbate2))
    # DFT_adsorbate1, DFT_adsorbate2
    data = _make_scatter_points(
        metal_DFT_ads1_ads2, adsorbate1, adsorbate2,
        'monometallic DFT {} & {}'.format(adsorbate1, adsorbate2), 'square',
        'red', 'red', adsorbate1_correction, adsorbate2_correction)
    data += _make_scatter_points(
        intermetallics_DFT_ads1_ads2, adsorbate1, adsorbate2,
        'intermetallic DFT {} & {}'.format(adsorbate1, adsorbate2), 'circle',
        'white', 'red', adsorbate1_correction, adsorbate2_correction)

    # DFT_adsorbate1, ML_adsorbate2
    print('Plotting DFT_{}, ML_{}'.format(adsorbate1, adsorbate2))
    data += _make_scatter_points(
        metal_DFT_ads1_ML_ads2, adsorbate1, adsorbate2,
        'monometallic DFT {} & ML {}'.format(adsorbate1, adsorbate2), 'square',
        'yellowgreen', 'yellowgreen', adsorbate1_correction,
        adsorbate2_correction)
    data += _make_scatter_points(
        intermetallics_DFT_ads1_ML_ads2, adsorbate1, adsorbate2,
        'intermetallic DFT {} & ML {}'.format(adsorbate1,
                                              adsorbate2), 'circle', 'white',
        'yellowgreen', adsorbate1_correction, adsorbate2_correction)

    # ML_adsorbate1, DFT_adsorbate2
    print('Plotting ML_{}, DFT_{}'.format(adsorbate1, adsorbate2))
    data += _make_scatter_points(
        metal_ML_ads1_DFT_ads2, adsorbate1, adsorbate2,
        'monometallic ML {} & DFT {}'.format(adsorbate1, adsorbate2), 'square',
        'cornflowerblue', 'cornflowerblue', adsorbate1_correction,
        adsorbate2_correction)
    data += _make_scatter_points(
        intermetallics_ML_ads1_DFT_ads2, adsorbate1, adsorbate2,
        'intermetallic ML {} & DFT {}'.format(adsorbate1,
                                              adsorbate2), 'circle', 'white',
        'cornflowerblue', adsorbate1_correction, adsorbate2_correction)

    # Label the axes as dE or dG, depending on whether energy corrections
    # were applied. Bail out if only one axis was corrected, because the
    # two axes would then be inconsistent with each other.
    if adsorbate1_correction == 0 and adsorbate2_correction == 0:
        energy_label = 'dE'
    elif adsorbate1_correction != 0 and adsorbate2_correction != 0:
        energy_label = 'dG'
    else:
        print('You added a correction to only one of the adsorbate '
              'energies. Please correct both energies or neither to keep '
              'them consistent.')
        return

    fig = go.Figure(data=data,
                    layout=go.Layout(
                        hovermode='closest',
                        xaxis=dict(title='{}_{} [eV]'.format(energy_label,
                                                             adsorbate1),
                                   titlefont=dict(size=25)),
                        yaxis=dict(title='{}_{} [eV]'.format(energy_label,
                                                             adsorbate2),
                                   titlefont=dict(size=25))))

    print('Your figure location is in Plotly 2D_plots/{}_{}_bySurface'.format(
        adsorbate1, adsorbate2))
    plotly.sign_in(**read_rc('plotly_login_info'))
    plotly.plot(fig,
                filename='2D_plots/{}_{}_bySurface'.format(
                    adsorbate1, adsorbate2))
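
# Hypothetical usage sketch: plot raw adsorption energies (dE) by passing
# zero corrections, or free energies (dG) by correcting both axes. The
# 0.2/0.3 eV values are placeholders, not published corrections.
plot_2D_plot('CO', 'OH', 0, 0)
plot_2D_plot('CO', 'OH', adsorbate1_correction=0.2, adsorbate2_correction=0.3)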
Example #7

import random
from collections import defaultdict
import numpy as np
import plotly.plotly as plotly   # legacy (pre-v4) Plotly API; provides sign_in/plot
import plotly.graph_objs as go
from gaspy.utils import read_rc
from gaspy.gasdb import get_adsorption_docs
from gaspy.atoms_operators import get_stoich_from_mpid
def create_gridplot(adsorbate, targets, filename, hovertext_labels=None):
    '''
    This function will create and save a gridplot of our adsorption energy
    data.

    Args:
        adsorbate           A string indicating which adsorbate you want to
                            plot the data for
        targets             A 2-tuple of floats indicating the low and high
                            range of the adsorption energies you consider to be
                            good, respectively.
        filename            A string indicating where you want to save the plot
                            (within the Plotly account)
        hovertext_labels    A sequence of strings indicating which
                            data you want displayed in the hovertext.
                            Possible strings include everything in the
                            `gaspy.defaults.adsorption_projections` dictionary
                            in addition to `stoichiometry` and `date`.
    Returns:
        url     The URL for the plot you just created
    '''
    # Python doesn't like mutable defaults
    if hovertext_labels is None:
        hovertext_labels = {'coordination', 'energy', 'stoichiometry', 'date',
                            'mpid', 'miller'}

    # Get and preprocess all the documents we have now
    extra_projections = {'atoms': '$atoms',
                         'date': '$calculation_dates.slab+adsorbate'}
    all_docs = get_adsorption_docs(adsorbate, extra_projections)
    mpids = {doc['mpid'] for doc in all_docs}
    comps = {mpid: get_stoich_from_mpid(mpid) for mpid in mpids}
    for doc in all_docs:
        doc['composition'] = tuple(comps[doc['mpid']].keys())
    all_elements = {element for doc in all_docs
                    for element in comps[doc['mpid']]}

    # Organize all of our documents according to their bimetallic composition
    docs_by_comp = defaultdict(list)
    for doc in all_docs:
        comp = doc['composition']
        if len(comp) == 2:
            docs_by_comp[tuple(comp)].append(doc)
            docs_by_comp[tuple(reversed(comp))].append(doc)
        elif len(comp) == 1:
            docs_by_comp[tuple([comp[0], comp[0]])].append(doc)

    # Create the local coordinates for each set of bimetallics
    max_radius = 0
    for (element_i, element_j), docs in docs_by_comp.items():
        n = len(docs)
        width = np.sqrt(n)

        # Add `x` dimension to documents, which is uniform random sorted by
        # adsorption energy
        X = np.random.uniform(-width/2, width/2, n)
        X.sort()
        docs.sort(key=lambda doc: doc['energy'])
        for doc, x in zip(docs, X):
            doc['x'] = x

        # Add `y` dimension to documents, which is uniform random sorted by
        # composition
        Y = np.random.uniform(-width/2, width/2, n)
        Y.sort()
        for doc in docs:
            symbol_counts = doc['atoms']['symbol_counts']
            n_atoms = symbol_counts[element_i] + symbol_counts[element_j]
            ratio = symbol_counts[element_i] / n_atoms
            doc['ratio'] = ratio
            doc['stoichiometry'] = {element_i: symbol_counts[element_i],
                                    element_j: symbol_counts[element_j]}
        docs.sort(key=lambda doc: doc['ratio'])

        # Shuffle the y values within each ratio so that we get squares
        # instead of lines. Note that slicing a numpy array returns a view,
        # so shuffling `ys` shuffles the corresponding section of `Y` in
        # place.
        ratios = [doc['ratio'] for doc in docs]
        unique_ratios = sorted(set(ratios))
        shuffle_counter = 0
        for ratio in unique_ratios:
            ratio_count = ratios.count(ratio)
            ys = Y[shuffle_counter:shuffle_counter+ratio_count]
            random.shuffle(ys)
            shuffle_counter += ratio_count

        # Concatenate the appropriately shuffled uniform distribution with
        # documents
        for doc, y in zip(docs, Y):
            doc['y'] = y

        # Recalculate the size of the biggest square. We use this to scale
        # everything.
        max_radius = max([max_radius] + [max(doc['x'], doc['y']) for doc in docs])
    max_width = max_radius * 2

    # Settings for interactive image
    marker_size = 4
    font_size = 24
    font = dict(family='Arial', color='black')
    width = 900
    height = 800
    axis_font_size = 12

    # Set thresholds for energy
    low_energy = targets[0]
    high_energy = targets[1]
    good_energy = (low_energy + high_energy) / 2

    # We need the max and min energies to make sure the color mapping in all
    # our squares map to each other
    all_energies = [doc['energy'] for doc in all_docs]
    energy_min = min(all_energies)
    energy_max = max(all_energies)

    # Plotly lets you set colors only based on their normalized values, so we
    # need to normalize our energies before mapping colors onto them.
    energy_bandwidth = energy_max - energy_min
    low_energy_normalized = (low_energy - energy_min) / energy_bandwidth
    good_energy_normalized = (good_energy - energy_min) / energy_bandwidth
    high_energy_normalized = (high_energy - energy_min) / energy_bandwidth

    # Make our colorscale
    low_color = 'rgb(0, 0, 0)'
    good_color = 'rgb(175, 0, 255)'
    high_color = 'rgb(255, 200, 200)'
    colorscale = [(0., low_color),
                  (low_energy_normalized, low_color),
                  (good_energy_normalized, good_color),
                  (high_energy_normalized, high_color),
                  (1., high_color)]

    # Sort the elements according to how many good calculations we have
    n_calcs_by_element = defaultdict(int)
    for element_i in all_elements:
        for element_j in all_elements:
            docs = [doc for doc in docs_by_comp[element_i, element_j]
                    if low_energy <= doc['energy'] <= high_energy]
            n_calcs_by_element[element_i] += len(docs)
    elements_sorted = [element for element, count in
                       sorted(n_calcs_by_element.items(),
                              key=lambda kv: kv[1],
                              reverse=True)]

    # Initialize the data structures we'll plot
    Xs = np.array([])
    Ys = np.array([])
    energies = []
    hovertexts = []

    # Figure out the spacings between each square in the grid
    for i, element_i in enumerate(elements_sorted):
        x_offset = (i+1) * max_width
        for j, element_j in enumerate(elements_sorted):
            y_offset = (j+1) * max_width

            # If we have an empty square, move on. (`docs_by_comp` is a
            # defaultdict, so plain indexing would never raise KeyError; it
            # would silently insert an empty list instead.)
            docs = docs_by_comp.get((element_i, element_j), [])
            if not docs:
                continue

            # Get all the data out of the documents
            _Xs = np.array([doc['x'] for doc in docs]) + x_offset
            _Ys = np.array([doc['y'] for doc in docs]) + y_offset
            _energies = [doc['energy'] for doc in docs]
            _hovertexts = [doc_to_hovertext(doc, hovertext_labels) for doc in docs]

            # Push the data for this elemental combination into the larger
            # dataset
            Xs = np.append(Xs, _Xs)
            Ys = np.append(Ys, _Ys)
            energies.extend(_energies)
            hovertexts.extend(_hovertexts)

    # Make the graphical object trace for each data set, along with all of the
    # appropriate formatting
    bimetallic_trace = go.Scattergl(x=Xs, y=Ys,
                                    mode='markers',
                                    marker=dict(size=marker_size,
                                                color=energies,
                                                colorscale=colorscale,
                                                cmin=energy_min,
                                                cmax=energy_max),
                                    text=hovertexts)

    # Add a trace for the colorbar
    colorbar_trace = go.Scattergl(x=[0, 0], y=[0, 0],
                                  mode='markers',
                                  marker=dict(size=0.1,
                                              color=[energy_min, energy_max],
                                              colorscale=[(0., low_color),
                                                          (0.5, good_color),
                                                          (1., high_color)],
                                              cmin=low_energy,
                                              cmax=high_energy,
                                              showscale=True),
                                  hoverinfo='none')

    # Concatenate traces
    traces = [bimetallic_trace, colorbar_trace]

    # Format the x and y axes
    axes_labels = dict(ticks='',
                       tickmode='array',
                       tickvals=np.linspace(max_width,
                                            len(all_elements)*max_width,
                                            len(all_elements)),
                       ticktext=elements_sorted,
                       tick0=max_width/2,
                       dtick=max_width,
                       tickfont=dict(size=axis_font_size),
                       showgrid=False)

    # Format the plot itself
    layout = go.Layout(title=filename.split('/')[-1],
                       titlefont=font,
                       xaxis=axes_labels,
                       yaxis=axes_labels,
                       showlegend=False,
                       width=width, height=height,
                       font=dict(size=font_size))

    # Save it online
    plotly.sign_in(**read_rc('plotly_login_info'))
    url = plotly.plot(go.Figure(data=traces, layout=layout), filename=filename)
    return url
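
# `doc_to_hovertext` is not defined in this snippet. A minimal sketch that
# matches how it is called above (one "label: value" line per requested
# hovertext label) might be:
def doc_to_hovertext(doc, hovertext_labels):
    return '<br>'.join('{}: {}'.format(label, doc[label])
                       for label in hovertext_labels
                       if label in doc)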