def _prepare_grouped_df(adsorbate1, adsorbate2):
    """
    This function takes the adsorbates of interest and makes a dataframe that
    will be used for making 2D plots. Each row in the dataframe is grouped by
    unique surface. A surface is defined by mpid, Miller index, top, and
    shift.

    Args:
        adsorbate1  adsorbate whose energy goes on the x-axis
        adsorbate2  adsorbate whose energy goes on the y-axis
    Returns:
        grouped_df  dataframe used for plotting, with one row per unique
                    surface and a 'formula' column added from the Materials
                    Project
    """
    surface_fp = ['mpid', 'millerx', 'millery', 'millerz', 'top', 'shift']
    adsorbate1_df = _make_df_from_docs(adsorbate1, surface_fp)
    adsorbate2_df = _make_df_from_docs(adsorbate2, surface_fp)

    # Merge them together based on unique surface
    grouped_results = pd.merge(adsorbate1_df, adsorbate2_df, on=surface_fp).dropna()

    # Drop rows that have ML predictions for both adsorbates
    grouped_results = grouped_results.drop(
        grouped_results[(grouped_results['{}_DFT'.format(adsorbate1)] == False) &
                        (grouped_results['{}_DFT'.format(adsorbate2)] == False)].index).reset_index()

    # Add the formula to the dataframe based on mpid
    mpids = set(grouped_results['mpid'])
    compositions_by_mpid = {}
    print('Beginning to pull data from the Materials Project...')
    with MPRester(read_rc()['matproj_api_key']) as mat_proj:
        for mpid in tqdm.tqdm_notebook(mpids):
            try:
                entry = mat_proj.get_entry_by_material_id({'task_ids': mpid})
                compositions_by_mpid[mpid] = entry.composition.get_reduced_formula_and_factor()[0]
            except IndexError:
                compositions_by_mpid[mpid] = ""
    data = list(compositions_by_mpid.items())
    df_new = pd.DataFrame(data, columns=['mpid', 'formula'])
    grouped_df = pd.merge(grouped_results, df_new, on='mpid')
    return grouped_df
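# Example (a hedged sketch): build the per-surface dataframe for a CO/OH pair
# and peek at it. The '{}_DFT' column names follow the convention used above;
# the adsorbate pair itself is just illustrative.
def _example_prepare_grouped_df():
    grouped_df = _prepare_grouped_df('CO', 'OH')
    print(grouped_df[['mpid', 'formula', 'CO_DFT', 'OH_DFT']].head())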
def fetch_tar_file(launch_id, verify_tars=True):
    '''
    Fetch the tarred results of a FireWorks launch from the cluster it ran on
    and store them in the launches backup directory. If a backup already
    exists and `verify_tars` is `True`, the tarball is checked for corruption
    and re-fetched if the check fails.
    '''
    backup_directory = read_rc()['launches_backup_directory']
    backup_loc = backup_directory + '/%d.tar.gz' % launch_id

    # If a backup already exists, verify its integrity and delete it if it is
    # corrupted
    if len(glob.glob(backup_loc)) > 0 and verify_tars:
        output = os.system('tar -tzf %s >/dev/null' % backup_loc)
        if output != 0:
            os.remove(backup_loc)

    if len(glob.glob(backup_loc)) == 0:
        try:
            launch = lpad.get_launch_by_id(launch_id)
        except Exception:
            print('could not find launch id %d' % launch_id)
            return
        if launch.state != 'COMPLETED':
            print(launch.fw_id)
            return

        # Copy the launch directory from whichever cluster ran the job
        cluster = launch.fworker.name
        if cluster == 'gilgamesh':
            os.system("rsync -azqP --max-size=100M gilgamesh:%s /tmp/%s" % (launch.launch_dir, launch_id))
        elif 'arjuna' in cluster:
            os.system("rsync -azqP --max-size=100M arjuna:%s /tmp/%s" % (launch.launch_dir, launch_id))
        elif 'Cori' in cluster or '/global/project/projectdirs/m2755/' in launch.launch_dir:
            launch_dir = launch.launch_dir
            if '/global/u2/z/zulissi' in launch_dir:
                launch_dir = '/global/project/projectdirs/m2755/fireworks_zu/fireworks/' + launch_dir[30:]
            os.system("rsync -azqP --max-size=100M %s /tmp/%s" % (launch_dir, launch_id))
        else:
            print('unknown cluster!: %s' % cluster)
            print(launch.fw_id)

        # Tar the fetched directory into the backup location, then clean up
        launch_dir_folder = launch.launch_dir.split('/')[-1]
        os.system("(cd /tmp/%s/%s && tar -czf %s *)" % (launch_id, launch_dir_folder, backup_loc))
        os.system("rm -r /tmp/%s" % launch_id)
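# Example (a hedged sketch): back up a handful of launches without
# re-verifying any existing tarballs. The launch IDs are hypothetical
# placeholders; in practice they would come from the FireWorks LaunchPad.
def _example_fetch_tar_files():
    for launch_id in [10001, 10002, 10003]:
        fetch_tar_file(launch_id, verify_tars=False)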
def _get_compositions_by_mpid(self):
    '''
    We use the Materials Project's python API to find the composition of
    various materials given their MPIDs. This can take a while though, so we
    also cache the results and update the cache as necessary.

    Resulting attribute:
        compositions_by_mpid_   A dictionary whose keys are MPIDs and whose
                                values are lists of strings for each element
                                that is present in the corresponding
                                material. This object is cached and therefore
                                may have extra key:value pairings that you
                                may not need.
    '''
    # Find the current cache of compositions. If the cache is not there, then
    # initialize it as an empty dictionary.
    try:
        with open(CACHE_LOCATION + 'mp_comp_data.pkl', 'rb') as file_handle:
            compositions_by_mpid = pickle.load(file_handle)
    except FileNotFoundError:
        compositions_by_mpid = {}

    # Figure out which MPIDs we are still missing compositions for
    catalog_docs = get_catalog_docs()
    mpids = ({doc['mpid'] for doc in self.adsorption_docs + catalog_docs} -
             set(compositions_by_mpid))

    # Each MP document may contain several MPIDs. Here we get every single
    # document whose list of associated MPIDs matches anything in our list of
    # missing MPIDs.
    with MPRester(read_rc('matproj_api_key')) as rester:
        query = {'task_ids': {'$elemMatch': {'$in': list(mpids)}}}
        properties = ['elements', 'task_ids']
        mp_docs = rester.query(criteria=query, properties=properties)

    # Match the MP documents to our missing MPIDs
    for mpid in mpids:
        for doc in mp_docs:
            if mpid in set(doc['task_ids']):
                compositions_by_mpid[mpid] = doc['elements']
                break

    # Save the updated cache
    with open(CACHE_LOCATION + 'mp_comp_data.pkl', 'wb') as file_handle:
        pickle.dump(compositions_by_mpid, file_handle)

    self.compositions_by_mpid_ = compositions_by_mpid
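# Example (a hedged sketch): inspect the composition cache directly, without
# going through the fingerprinter. The key/value shapes follow the docstring
# above, e.g., {'mp-30': ['Cu'], ...}; the specific contents are whatever
# happens to be cached.
def _example_inspect_composition_cache():
    with open(CACHE_LOCATION + 'mp_comp_data.pkl', 'rb') as file_handle:
        compositions_by_mpid = pickle.load(file_handle)
    print('%d cached materials' % len(compositions_by_mpid))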
__author__ = 'Kevin Tran'
__email__ = '*****@*****.**'

from collections import defaultdict
import warnings
from abc import ABC, abstractmethod
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
with warnings.catch_warnings():
    warnings.filterwarnings('ignore', message='numpy.dtype size changed')
    import mendeleev
from gaspy.atoms_operators import get_stoich_from_mpid
from gaspy.utils import read_rc
from gaspy.gasdb import get_catalog_docs

CACHE_LOCATION = read_rc('gasdb_path')


class Fingerprinter(ABC, BaseEstimator, TransformerMixin):
    '''
    This is a template fingerprinter that is meant to be extended before
    using. Any child class needs to implement a `transform` method.

    The especially useful attributes that this class has are `dummy_fp_`,
    `max_num_species_`, `median_adsorption_energies_`, and `mendeleev_data_`.
    For more details on what they are, refer to the respective methods.
    Refer to Tran & Ulissi (Nature Catalysis, 2018) for even more details.
    '''
    def __init__(self):
        pass
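# Example (a hedged sketch, not part of the original module): the shape of a
# child class. The single placeholder feature below is purely illustrative;
# the real subclasses build the `dummy_fp_`-padded feature vectors described
# in the class docstring.
class _ExampleFingerprinter(Fingerprinter):
    def transform(self, docs):
        # One toy feature per document: the number of atoms in the adsorption
        # site's first coordination shell, parsed from the 'Cu-Cu-Cu'-style
        # coordination strings used in GASpy documents.
        return np.array([[len(doc['coordination'].split('-'))]
                         for doc in docs])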
from collections import defaultdict
from atomicwrites import atomic_write
from datetime import datetime
import pickle
import numpy as np
from pymongo import UpdateOne
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tpot import TPOTRegressor
from gaspy.utils import read_rc, multimap_method
from gaspy.gasdb import get_catalog_docs, get_adsorption_docs, get_mongo_collection
from gaspy_regress import fingerprinters

GASDB_LOCATION = read_rc('gasdb_path')
PREDICTION_CACHES = {('model0', 'CO'): GASDB_LOCATION + '/predictions_tpot_CO.pkl',
                     ('model0', 'H'): GASDB_LOCATION + '/predictions_tpot_H.pkl',
                     ('model0', 'N'): GASDB_LOCATION + '/predictions_tpot_N.pkl',
                     ('model0', 'O'): GASDB_LOCATION + '/predictions_tpot_O.pkl',
                     ('model0', 'OH'): GASDB_LOCATION + '/predictions_tpot_OH.pkl',
                     ('model0', 'OOH'): GASDB_LOCATION + '/predictions_tpot_OOH.pkl'}


def fit_model0_adsorption_energies(adsorbate):
    '''
    Create and save a modeling pipeline to predict adsorption energies.

    Arg:
        adsorbate   A string indicating which adsorbate you want to fit the
                    model for
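# Example (a hedged sketch): load a cached set of predictions. The keys of
# PREDICTION_CACHES are (model_name, adsorbate) tuples, so any of the six
# pairings defined above would work here.
def _example_load_predictions():
    cache_path = PREDICTION_CACHES[('model0', 'CO')]
    with open(cache_path, 'rb') as file_handle:
        predictions = pickle.load(file_handle)
    return predictions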
def plot_2D_plot(adsorbate1, adsorbate2, adsorbate1_correction, adsorbate2_correction):
    """
    When called, this function will return a 2D plotly plot with the x-axis
    being adsorbate1 and the y-axis being adsorbate2. In addition, it will
    also cache the adsorption configurations to AWS. The plot can be accessed
    via http://ulissigroup.cheme.cmu.edu/gaspy_plots/index.html?plotly=(plot# here)

    Args:
        adsorbate1              adsorbate whose energy goes on the x-axis
        adsorbate2              adsorbate whose energy goes on the y-axis
        adsorbate1_correction   energy correction for the adsorbate on the
                                x-axis, e.g., to convert adsorption energy
                                (dE) to free energy (dG). Use 0 to plot dE.
        adsorbate2_correction   energy correction for the adsorbate on the
                                y-axis, e.g., to convert adsorption energy
                                (dE) to free energy (dG). Use 0 to plot dE.
    """
    grouped_df = _prepare_grouped_df(adsorbate1, adsorbate2)
    puremetal_mpid = ['mp-101', 'mp-23', 'mp-2', 'mp-91', 'mp-1080711',
                      'mp-1061133', 'mp-58', 'mp-13', 'mp-124', 'mp-30',
                      'mp-32', 'mp-72', 'mp-142', 'mp-81', 'mp-79',
                      'mp-1056037', 'mp-94', 'mp-126', 'mp-146', 'mp-1055994',
                      'mp-104', 'mp-117', 'mp-35', 'mp-90', 'mp-75',
                      'mp-10172', 'mp-632250', 'mp-1056438', 'mp-8', 'mp-74',
                      'mp-20483', 'mp-54', 'mp-48', 'mp-134', 'mp-129',
                      'mp-11', 'mp-1056486', 'mp-149', 'mp-45', 'mp-49',
                      'mp-96', 'mp-85', 'mp-568584', 'mp-754514', 'mp-33',
                      'mp-999498', 'mp-127', 'mp-25', 'mp-132', 'mp-570747',
                      'mp-14', 'mp-154', 'mp-672234', 'mp-672233']

    # DFT_adsorbate1, DFT_adsorbate2
    DFT_ads1_ads2 = grouped_df.loc[(grouped_df['{}_DFT'.format(adsorbate1)] == True) &
                                   (grouped_df['{}_DFT'.format(adsorbate2)] == True)]
    metal_DFT_ads1_ads2 = DFT_ads1_ads2.loc[DFT_ads1_ads2['mpid'].isin(puremetal_mpid)]
    intermetallics_DFT_ads1_ads2 = DFT_ads1_ads2[~DFT_ads1_ads2.isin(metal_DFT_ads1_ads2)].dropna()

    # DFT_adsorbate1, ML_adsorbate2
    DFT_ads1_ML_ads2 = grouped_df.loc[(grouped_df['{}_DFT'.format(adsorbate1)] == True) &
                                      (grouped_df['{}_DFT'.format(adsorbate2)] == False)]
    metal_DFT_ads1_ML_ads2 = DFT_ads1_ML_ads2.loc[DFT_ads1_ML_ads2['mpid'].isin(puremetal_mpid)]
    intermetallics_DFT_ads1_ML_ads2 = DFT_ads1_ML_ads2[~DFT_ads1_ML_ads2.isin(metal_DFT_ads1_ML_ads2)].dropna()

    # ML_adsorbate1, DFT_adsorbate2
    ML_ads1_DFT_ads2 = grouped_df.loc[(grouped_df['{}_DFT'.format(adsorbate1)] == False) &
                                      (grouped_df['{}_DFT'.format(adsorbate2)] == True)]
    metal_ML_ads1_DFT_ads2 = ML_ads1_DFT_ads2.loc[ML_ads1_DFT_ads2['mpid'].isin(puremetal_mpid)]
    intermetallics_ML_ads1_DFT_ads2 = ML_ads1_DFT_ads2[~ML_ads1_DFT_ads2.isin(metal_ML_ads1_DFT_ads2)].dropna()

    # Get scatter points for plotting
    # DFT_adsorbate1, DFT_adsorbate2
    print('Plotting DFT_{}, DFT_{}'.format(adsorbate1, adsorbate2))
    data = _make_scatter_points(metal_DFT_ads1_ads2, adsorbate1, adsorbate2,
                                'monometallic DFT {} & {}'.format(adsorbate1, adsorbate2),
                                'square', 'red', 'red',
                                adsorbate1_correction, adsorbate2_correction)
    data += _make_scatter_points(intermetallics_DFT_ads1_ads2, adsorbate1, adsorbate2,
                                 'intermetallic DFT {} & {}'.format(adsorbate1, adsorbate2),
                                 'circle', 'white', 'red',
                                 adsorbate1_correction, adsorbate2_correction)

    # DFT_adsorbate1, ML_adsorbate2
    print('Plotting DFT_{}, ML_{}'.format(adsorbate1, adsorbate2))
    data += _make_scatter_points(metal_DFT_ads1_ML_ads2, adsorbate1, adsorbate2,
                                 'monometallic DFT {} & ML {}'.format(adsorbate1, adsorbate2),
                                 'square', 'yellowgreen', 'yellowgreen',
                                 adsorbate1_correction, adsorbate2_correction)
    data += _make_scatter_points(intermetallics_DFT_ads1_ML_ads2, adsorbate1, adsorbate2,
                                 'intermetallic DFT {} & ML {}'.format(adsorbate1, adsorbate2),
                                 'circle', 'white', 'yellowgreen',
                                 adsorbate1_correction, adsorbate2_correction)

    # ML_adsorbate1, DFT_adsorbate2
    print('Plotting ML_{}, DFT_{}'.format(adsorbate1, adsorbate2))
    data += _make_scatter_points(metal_ML_ads1_DFT_ads2, adsorbate1, adsorbate2,
                                 'monometallic ML {} & DFT {}'.format(adsorbate1, adsorbate2),
                                 'square', 'cornflowerblue', 'cornflowerblue',
                                 adsorbate1_correction, adsorbate2_correction)
    data += _make_scatter_points(intermetallics_ML_ads1_DFT_ads2, adsorbate1, adsorbate2,
                                 'intermetallic ML {} & DFT {}'.format(adsorbate1, adsorbate2),
                                 'circle', 'white', 'cornflowerblue',
                                 adsorbate1_correction, adsorbate2_correction)

    # Label the axes as dE when no corrections are applied and as dG when
    # both corrections are applied. Mixed corrections are ambiguous, so we
    # fail loudly instead of continuing with an undefined figure.
    if adsorbate1_correction == 0 and adsorbate2_correction == 0:
        fig = go.Figure(data=data,
                        layout=go.Layout(hovermode='closest',
                                         xaxis=dict(title='dE_{} [eV]'.format(adsorbate1),
                                                    titlefont=dict(size=25)),
                                         yaxis=dict(title='dE_{} [eV]'.format(adsorbate2),
                                                    titlefont=dict(size=25))))
    elif adsorbate1_correction != 0 and adsorbate2_correction != 0:
        fig = go.Figure(data=data,
                        layout=go.Layout(hovermode='closest',
                                         xaxis=dict(title='dG_{} [eV]'.format(adsorbate1),
                                                    titlefont=dict(size=25)),
                                         yaxis=dict(title='dG_{} [eV]'.format(adsorbate2),
                                                    titlefont=dict(size=25))))
    else:
        raise ValueError('You added a correction to only one of the adsorbate '
                         'energies. Please keep the energies consistent.')

    print('Your figure location is in Plotly 2D_plots/{}_{}_bySurface'.format(adsorbate1, adsorbate2))
    plotly.sign_in(**read_rc('plotly_login_info'))
    plotly.plot(fig, filename='2D_plots/{}_{}_bySurface'.format(adsorbate1, adsorbate2))
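# Example (a hedged sketch): plot raw adsorption energies (dE) for a CO/OH
# pair by passing zero corrections. Passing matched non-zero corrections
# would relabel the axes as free energies (dG) instead.
def _example_plot_2D():
    plot_2D_plot('CO', 'OH', adsorbate1_correction=0, adsorbate2_correction=0)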
def create_gridplot(adsorbate, targets, filename, hovertext_labels=None):
    '''
    This function will create and save a gridplot of our adsorption energy
    data.

    Args:
        adsorbate           A string indicating which adsorbate you want to
                            plot the data for
        targets             A 2-tuple of floats indicating the low and high
                            range of the adsorption energies [eV] that you
                            consider to be good, respectively.
        filename            A string indicating where you want to save the
                            plot (within the Plotly account)
        hovertext_labels    A sequence of strings indicating which data you
                            want displayed in the hovertext. Possible strings
                            include everything in the
                            `gaspy.defaults.adsorption_projections` dictionary
                            in addition to `stoichiometry` and `date`.
    Returns:
        url     The URL for the plot you just created
    '''
    # Python doesn't like mutable defaults
    if hovertext_labels is None:
        hovertext_labels = {'coordination', 'energy', 'stoichiometry',
                            'date', 'mpid', 'miller'}

    # Get and preprocess all the documents we have now
    extra_projections = {'atoms': '$atoms',
                         'date': '$calculation_dates.slab+adsorbate'}
    all_docs = get_adsorption_docs(adsorbate, extra_projections)
    mpids = {doc['mpid'] for doc in all_docs}
    comps = {mpid: get_stoich_from_mpid(mpid) for mpid in mpids}
    for doc in all_docs:
        doc['composition'] = tuple(comps[doc['mpid']].keys())
    all_elements = {element for doc in all_docs
                    for element in comps[doc['mpid']]}

    # Organize all of our documents according to their bimetallic
    # composition. Monometallics are treated as "bimetallics" of identical
    # elements, and each true bimetallic shows up under both (A, B) and (B, A).
    docs_by_comp = defaultdict(list)
    for doc in all_docs:
        comp = doc['composition']
        if len(comp) == 2:
            docs_by_comp[tuple(comp)].append(doc)
            docs_by_comp[tuple(reversed(comp))].append(doc)
        elif len(comp) == 1:
            docs_by_comp[(comp[0], comp[0])].append(doc)

    # Create the local coordinates for each set of bimetallics
    max_radius = 0
    for (element_i, element_j), docs in docs_by_comp.items():
        n = len(docs)
        width = np.sqrt(n)

        # Add `x` dimension to documents, which is uniform random sorted by
        # adsorption energy
        X = np.random.uniform(-width/2, width/2, n)
        X.sort()
        docs.sort(key=lambda doc: doc['energy'])
        for doc, x in zip(docs, X):
            doc['x'] = x

        # Add `y` dimension to documents, which is uniform random sorted by
        # composition
        Y = np.random.uniform(-width/2, width/2, n)
        Y.sort()
        for doc in docs:
            symbol_counts = doc['atoms']['symbol_counts']
            n_atoms = symbol_counts[element_i] + symbol_counts[element_j]
            ratio = symbol_counts[element_i] / n_atoms
            doc['ratio'] = ratio
            doc['stoichiometry'] = {element_i: symbol_counts[element_i],
                                    element_j: symbol_counts[element_j]}
        docs.sort(key=lambda doc: doc['ratio'])

        # Shuffle the y values within each ratio so that we get squares
        # instead of lines. Note that each `ys` slice is a numpy view, so
        # shuffling it permutes the corresponding section of `Y` in place.
        ratios = [doc['ratio'] for doc in docs]
        unique_ratios = sorted(set(ratios))
        shuffle_counter = 0
        for ratio in unique_ratios:
            ratio_count = ratios.count(ratio)
            ys = Y[shuffle_counter:shuffle_counter+ratio_count]
            random.shuffle(ys)
            shuffle_counter += ratio_count

        # Concatenate the appropriately shuffled uniform distribution with
        # documents
        for doc, y in zip(docs, Y):
            doc['y'] = y

        # Recalculate the size of the biggest square. We use this to scale
        # everything.
        max_radius = max([max_radius] + [max(doc['x'], doc['y'])
                                         for doc in docs])
    max_width = max_radius * 2

    # Settings for the interactive image
    marker_size = 4
    font_size = 24
    font = dict(family='Arial', color='black')
    width = 900
    height = 800
    axis_font_size = 12

    # Set thresholds for energy
    low_energy = targets[0]
    high_energy = targets[1]
    good_energy = (low_energy + high_energy) / 2

    # We need the max and min energies to make sure the color mappings in all
    # our squares map to each other
    all_energies = [doc['energy'] for doc in all_docs]
    energy_min = min(all_energies)
    energy_max = max(all_energies)

    # Plotly lets you set colors only based on their normalized values, so we
    # need to normalize our energies before mapping colors onto them.
    energy_bandwidth = energy_max - energy_min
    low_energy_normalized = (low_energy - energy_min) / energy_bandwidth
    good_energy_normalized = (good_energy - energy_min) / energy_bandwidth
    high_energy_normalized = (high_energy - energy_min) / energy_bandwidth

    # Make our colorscale
    low_color = 'rgb(0, 0, 0)'
    good_color = 'rgb(175, 0, 255)'
    high_color = 'rgb(255, 200, 200)'
    colorscale = [(0., low_color),
                  (low_energy_normalized, low_color),
                  (good_energy_normalized, good_color),
                  (high_energy_normalized, high_color),
                  (1., high_color)]

    # Sort the elements according to how many good calculations we have.
    # We use `dict.get` here so that we don't mutate the defaultdict by
    # probing for element pairings that don't exist.
    n_calcs_by_element = defaultdict(int)
    for element_i in all_elements:
        for element_j in all_elements:
            docs = [doc for doc in docs_by_comp.get((element_i, element_j), [])
                    if low_energy <= doc['energy'] <= high_energy]
            n_calcs_by_element[element_i] += len(docs)
    elements_sorted = [element for element, count in
                       sorted(n_calcs_by_element.items(),
                              key=lambda kv: kv[1], reverse=True)]

    # Initialize the data structures we'll plot
    Xs = np.array([])
    Ys = np.array([])
    energies = []
    hovertexts = []

    # Figure out the spacings between each square in the grid
    for i, element_i in enumerate(elements_sorted):
        x_offset = (i+1) * max_width
        for j, element_j in enumerate(elements_sorted):
            y_offset = (j+1) * max_width

            # If we have an empty square, move on. `docs_by_comp` is a
            # defaultdict, so we test for contents explicitly rather than
            # catching a KeyError that would never be raised.
            docs = docs_by_comp.get((element_i, element_j), [])
            if not docs:
                continue

            # Get all the data out of the documents
            _Xs = np.array([doc['x'] for doc in docs]) + x_offset
            _Ys = np.array([doc['y'] for doc in docs]) + y_offset
            _energies = [doc['energy'] for doc in docs]
            _hovertexts = [doc_to_hovertext(doc, hovertext_labels)
                           for doc in docs]

            # Push the data for this elemental combination into the larger
            # dataset
            Xs = np.append(Xs, _Xs)
            Ys = np.append(Ys, _Ys)
            energies.extend(_energies)
            hovertexts.extend(_hovertexts)

    # Make the graphical object trace for each data set, along with all of
    # the appropriate formatting
    bimetallic_trace = go.Scattergl(x=Xs, y=Ys,
                                    mode='markers',
                                    marker=dict(size=marker_size,
                                                color=energies,
                                                colorscale=colorscale,
                                                cmin=energy_min,
                                                cmax=energy_max),
                                    text=hovertexts)

    # Add a dummy trace for the colorbar. `hoverinfo='none'` suppresses hover
    # text on this trace.
    colorbar_trace = go.Scattergl(x=[0, 0], y=[0, 0],
                                  mode='markers',
                                  marker=dict(size=0.1,
                                              color=[energy_min, energy_max],
                                              colorscale=[(0., low_color),
                                                          (0.5, good_color),
                                                          (1., high_color)],
                                              cmin=low_energy,
                                              cmax=high_energy,
                                              showscale=True),
                                  hoverinfo='none')

    # Concatenate traces
    traces = [bimetallic_trace, colorbar_trace]

    # Format the x and y axes
    axes_labels = dict(ticks='',
                       tickmode='array',
                       tickvals=np.linspace(max_width,
                                            len(all_elements)*max_width,
                                            len(all_elements)),
                       ticktext=elements_sorted,
                       tick0=max_width/2,
                       dtick=max_width,
                       tickfont=dict(size=axis_font_size),
                       showgrid=False)

    # Format the plot itself
    layout = go.Layout(title=filename.split('/')[-1],
                       titlefont=font,
                       xaxis=axes_labels,
                       yaxis=axes_labels,
                       showlegend=False,
                       width=width,
                       height=height,
                       font=dict(size=font_size))

    # Save it online
    plotly.sign_in(**read_rc('plotly_login_info'))
    url = plotly.plot(go.Figure(data=traces, layout=layout),
                      filename=filename)
    return url
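# Example (a hedged sketch): make a gridplot of CO adsorption energies,
# counting energies within +/-0.2 eV of -0.67 eV as "good". Both the target
# window and the Plotly filename are illustrative placeholders.
def _example_create_gridplot():
    url = create_gridplot('CO',
                          targets=(-0.87, -0.47),
                          filename='gridplots/CO_example',
                          hovertext_labels={'energy', 'mpid', 'miller'})
    print(url)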