def run(mpfile, nmax=None):
    #print(json.dumps(mpfile.document, indent=4))
    datasource = mpfile.document['general'].pop('Datasource')
    subdir = os.path.abspath(os.path.join(
        datasource['work_dir'], datasource['directory']
    ))
    # TODO Potentially we have to insert a preprocessing step, probably in msp
    scandata_f = msp.read_scans(subdir, datacounter="Counter 1")
    scan_groups = scandata_f.groupby(datasource['group_by'].split())
    process_template = mpfile.document['general'].pop('process_template')
    translate = get_translate(datasource['work_dir'])
    keys = sorted(scan_groups.groups.keys())
    for i, g in enumerate(tqdm(keys, leave=True)):
        # TODO: Group information is saved into the output. Rethink?
        comp, sx, sy = translate(g)
        composition = normalize_root_level(comp)[1]
        process_template_copy = copy.deepcopy(process_template)
        process_template_copy['position'] = {'x': sx, 'y': sy}
        mpfile.document.rec_update(nest_dict(
            process_template_copy, [composition, 'process_chain']
        ))
        sg = scan_groups.get_group(g)
        for process_chain_name in process_template.keys():
            scan_params = mpfile.document[composition]['process_chain'][process_chain_name]
            xmcd_frame = treat_xmcd(sg, scan_params, xas_process.process_dict)
            mpfile.add_data_table(
                composition, xmcd_frame[['Energy', 'XAS', 'XMCD']],
                '_'.join(['data', process_chain_name])
            )
        if nmax is not None and i > nmax:
            break
def run(mpfile, **kwargs):
    url = mpfile.hdata.general['url']
    dbfile = os.path.join(os.environ['HOME'], 'work', url.rsplit('/')[-1])
    if not os.path.exists(dbfile):
        # was urllib.URLopener().retrieve(url, dbfile) in the Python-2 original
        urllib.request.urlretrieve(url, dbfile)

    con = ase.db.connect(dbfile)
    nr_mpids = con.count(selection='mpid')
    for idx, row in enumerate(con.select('mpid')):
        if idx and not idx % 10:
            print('added', idx, '/', nr_mpids, 'materials')
        mpid = 'mp-' + str(row.mpid)
        d = RecursiveDict()
        # Kohn-Sham band gap
        d[u'ΔE-KS'] = RecursiveDict([
            ('indirect', clean_value(row.gllbsc_ind_gap - row.gllbsc_disc, 'eV')),
            ('direct', clean_value(row.gllbsc_dir_gap - row.gllbsc_disc, 'eV'))
        ])
        # derivative discontinuity
        d['C'] = clean_value(row.gllbsc_disc, 'eV')
        # quasi-particle band gap
        d[u'ΔE-QP'] = RecursiveDict([
            ('indirect', clean_value(row.gllbsc_ind_gap, 'eV')),
            ('direct', clean_value(row.gllbsc_dir_gap, 'eV'))
        ])
        mpfile.add_hierarchical_data(nest_dict(d, ['data']), identifier=mpid)
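# A sketch of the record shape the importer above contributes per material;
# the numeric values below are hypothetical, and RecursiveDict behaves like a
# plain dict for this purpose. nest_dict(d, ['data']) wraps the quantities
# under a top-level 'data' key:
#
#   {'data': {'ΔE-KS': {'indirect': '1.23 eV', 'direct': '1.45 eV'},
#             'C': '0.56 eV',
#             'ΔE-QP': {'indirect': '1.79 eV', 'direct': '2.01 eV'}}}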
def add_structure(self, source, name=None, identifier=None, fmt=None):
    """add a structure to the mpfile"""
    from pymatgen import Structure, MPRester
    if isinstance(source, Structure):
        structure = source
    elif isinstance(source, dict):
        structure = Structure.from_dict(source)
    elif os.path.exists(source):
        structure = Structure.from_file(source, sort=True)
    elif isinstance(source, six.string_types):
        if fmt is None:
            raise ValueError("Need fmt to get structure from string!")
        structure = Structure.from_str(source, fmt, sort=True)
    else:
        raise ValueError(source, "not supported!")

    if name is not None:
        if not isinstance(name, six.string_types):
            raise ValueError("structure name needs to be a string")
        elif "." in name:
            raise ValueError("structure name cannot contain dots (.)")

    mpr = MPRester()
    if not mpr.api_key:
        raise ValueError(
            "API key not set. Run `pmg config --add PMG_MAPI_KEY <USER_API_KEY>`."
        )
    matched_mpids = mpr.find_structure(structure)
    formula = get_composition_from_string(structure.composition.formula)
    if not matched_mpids:
        if identifier is None:
            identifier = formula
            print(
                "Structure not found in MP! Please submit via MPComplete to "
                "obtain mp-id or manually choose an anchor mp-id! Continuing "
                "with {} as identifier!".format(identifier))
        else:
            print("Structure not found in MP! Forcing {} as identifier!".format(identifier))
    elif identifier is None:
        identifier = matched_mpids[0]
        if len(matched_mpids) > 1:
            print("Multiple matching structures found in MP. Using", identifier)
    elif identifier not in matched_mpids:
        msg = "Structure does not match {} but instead {}!".format(
            identifier, matched_mpids)
        raise ValueError(msg)

    idx = len(self.document.get(identifier, {}).get(mp_level01_titles[3], {}))
    sub_key = formula if name is None else name
    if sub_key in self.document.get(identifier, {}).get(mp_level01_titles[3], {}):
        sub_key += "_{}".format(idx)
    self.document.rec_update(
        nest_dict(structure.as_dict(), [identifier, mp_level01_titles[3], sub_key]))
    return identifier
def concat(self, mpfile):
    """concatenate single-section MPFile with this MPFile"""
    try:
        if len(mpfile.document) > 1:
            raise ValueError('concatenation only possible with single-section files')
    except AttributeError:
        raise ValueError('Provide an MPFile to concatenate')
    mp_cat_id = list(mpfile.document.keys())[0]
    general_title = mp_level01_titles[0]
    if general_title in mpfile.document[mp_cat_id]:
        general_data = mpfile.document[mp_cat_id].pop(general_title)
        if general_title not in self.document:
            self.document.rec_update(nest_dict(general_data, [general_title]))
    self.document.rec_update(nest_dict(
        mpfile.document.pop(mp_cat_id),
        [self.get_unique_mp_cat_id(mp_cat_id)]
    ))
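# Usage sketch for concat (variable names hypothetical): each `single` must be
# an MPFile whose document holds exactly one root-level section; its optional
# `general` sub-section is hoisted into the target's shared `general` section
# the first time one is encountered.
for single in single_section_mpfiles:
    target.concat(single)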
def from_string(data):
    # use archieml-python parse to import data
    rdct = RecursiveDict(archieml.loads(data))
    rdct.rec_update()

    # post-process internal representation of file contents
    for key in list(rdct.keys()):
        is_general, root_key = normalize_root_level(key)

        if is_general:
            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if mp_level01_titles[0] not in rdct:
                rdct.insert_before(list(rdct.keys())[0],
                                   (mp_level01_titles[0], RecursiveDict()))
            rdct.rec_update(nest_dict(rdct.pop(key),
                                      [mp_level01_titles[0], root_key]))
        else:
            # normalize identifier key (pop & insert)
            # using rec_update since we're looping over all entries
            # also: support data in bare tables (marked-up only by
            # root-level identifier) by nesting under 'data'
            value = rdct.pop(key)
            keys = [root_key]
            if isinstance(value, list):
                keys.append('table')
            rdct.rec_update(nest_dict(value, keys))

            # Note: CSV section is marked with 'data ' prefix during iterate()
            for k, v in rdct[root_key].iterate():
                if isinstance(k, six.string_types) and \
                        k.startswith(mp_level01_titles[1]):
                    # k = table name (incl. data prefix)
                    # v = csv string from ArchieML free-form arrays
                    table_name = k[len(mp_level01_titles[1] + '_'):]
                    pd_obj = read_csv(v)
                    rdct[root_key].pop(table_name)
                    rdct[root_key].rec_update(nest_dict(pd_obj.to_dict(), [k]))
                    rdct[root_key].insert_default_plot_options(pd_obj, k)

            # convert CIF strings into pymatgen structures
            if mp_level01_titles[3] in rdct[root_key]:
                from pymatgen.io.cif import CifParser
                for name in list(rdct[root_key][mp_level01_titles[3]].keys()):
                    cif = rdct[root_key][mp_level01_titles[3]].pop(name)
                    parser = CifParser.from_string(cif)
                    structure = parser.get_structures(primitive=False)[0]
                    rdct[root_key][mp_level01_titles[3]].rec_update(
                        nest_dict(structure.as_dict(), [name]))

    return MPFile.from_dict(rdct)
def run(mpfile, **kwargs):
    input_file = mpfile.document['_hdata'].pop('input_file')
    zip_path = os.path.join(os.environ['HOME'], 'work', input_file)
    if not os.path.exists(zip_path):
        return 'Please upload', zip_path
    zip_file = ZipFile(zip_path, 'r')
    composition_table_dict = mpfile.document['_hdata']['composition_table']
    conc_funcs = get_concentration_functions(composition_table_dict)

    for info in zip_file.infolist():
        print(info.filename)
        d = RecursiveDict()

        # positions.x/y from filename, <scan-id>_<meas-element>_<X>_<Y>.csv
        element, x, y = os.path.splitext(info.filename)[0].rsplit('_', 4)
        d['position'] = RecursiveDict(
            (k, clean_value(v, 'mm')) for k, v in zip(['x', 'y'], [x, y])
        )

        # composition
        d['composition'] = RecursiveDict(
            (el, clean_value(f(x, y), convert_to_percent=True))
            for el, f in conc_funcs.items()
        )

        # identifier
        identifier = get_composition_from_string(''.join([
            '{}{}'.format(el, int(round(Decimal(comp.split()[0]))))
            for el, comp in d['composition'].items()
        ]))

        # load csv file
        try:
            csv = zip_file.read(info.filename)
        except KeyError:
            print('ERROR: Did not find %s in zip file' % info.filename)
            continue  # skip this file instead of reusing a stale `csv`

        # read csv to pandas DataFrame and add to MPFile
        df = read_csv(csv)
        df = df[['Energy', 'XAS', 'XMCD']]

        # min and max
        d.rec_update(RecursiveDict(
            (y, RecursiveDict([
                ('min', df[y].min()),
                ('max', df[y].max())
            ])) for y in ['XAS', 'XMCD']
        ))

        # add data to MPFile
        mpfile.add_hierarchical_data(nest_dict(d, ['data']), identifier=identifier)
        mpfile.add_data_table(identifier, df, name=element)
def insert_general_section(self, general_mpfile):
    """insert general section from `general_mpfile` into this MPFile"""
    if general_mpfile is None:
        return
    general_title = mp_level01_titles[0]
    general_data = general_mpfile.document[general_title]
    root_key = list(self.document.keys())[0]
    for key, value in general_data.items():
        if key in self.document[root_key]:
            self.document.rec_update(nest_dict(value, [root_key, key]))
        else:
            self.document[root_key][key] = value
    for key in reversed(general_data.keys()):
        self.document[root_key].move_to_end(key, last=False)
def __init__(self, document):
    from pymatgen import Structure
    super(HierarchicalData, self).__init__()
    scope = []
    for key, value in document.iterate():
        if isinstance(value, Table) or isinstance(value, Structure):
            continue
        level, key = key
        level_reduction = bool(level < len(scope))
        if level_reduction:
            del scope[level:]
        if value is None:
            scope.append(key)
        elif mp_level01_titles[2] not in scope:
            self.rec_update(nest_dict({key: value}, scope))
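# The scope-tracking loop above is easier to see in isolation. A minimal,
# self-contained sketch (plain dicts instead of RecursiveDict; the
# (level, key, value) triples mimic what document.iterate() yields):
def _nest(items):
    root, scope = {}, []
    for level, key, value in items:
        if level < len(scope):  # outdent: drop the deeper scope levels
            del scope[level:]
        if value is None:       # a section header opens a new scope level
            scope.append(key)
        else:                   # a leaf lands under the current scope path
            d = root
            for k in scope:
                d = d.setdefault(k, {})
            d[key] = value
    return root

# _nest([(0, 'a', None), (1, 'x', 1), (1, 'b', None), (2, 'y', 2), (0, 'z', 3)])
# -> {'a': {'x': 1, 'b': {'y': 2}}, 'z': 3}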
def add_data_table(self, identifier, dataframe, name, plot_options=None):
    """add a datatable to the root-level section

    Args:
        identifier (str): MP category ID (`mp_cat_id`)
        dataframe (pandas.DataFrame): tabular data as Pandas DataFrame
        name (str): table name, optional if only one table in section
        plot_options (dict): options for according plotly graph
    """
    # TODO: optional table name, required if multiple tables per root-level section
    name = "".join([replacements.get(c, c) for c in name])
    self.document.rec_update(
        nest_dict(Table(dataframe).to_dict(), [identifier, name]))
    self.document[identifier].insert_default_plot_options(
        dataframe, name, update_plot_options=plot_options)
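# Minimal usage sketch for add_data_table; assumes an MPFile instance named
# `mpfile`, and the identifier and values are hypothetical. The table lands
# under document['mp-1234']['xmcd'] with default plotly options attached:
import pandas as pd
df = pd.DataFrame({'Energy': [770.0, 771.0], 'XAS': [0.10, 0.12], 'XMCD': [0.01, 0.02]})
mpfile.add_data_table('mp-1234', df, 'xmcd')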
def __init__(self, doc):
    super(HierarchicalData, self).__init__()
    document = RecursiveDict(doc)
    scope = []
    for key, value in document.iterate():
        level, key = key
        if key in mp_level01_titles:
            continue
        level_reduction = bool(level < len(scope))
        if level_reduction:
            del scope[level:]
        if value is None:
            scope.append(key)
        else:
            d = nest_dict(value, scope + [key])
            self.rec_update(d, overwrite=True)
def split(self):
    general_mpfile = (self.pop_first_section()
                      if mp_level01_titles[0] in self.document.keys() else None)
    if not self.document:
        raise ValueError("No contributions in MPFile! Either the file is"
                         " empty or only contains shared (meta-)data not"
                         " correlated to core identifier.")
    while True:
        try:
            mpfile_single = self.pop_first_section()
            mpid_orig = mpfile_single.ids[0]
            if "--" in mpid_orig:
                mpid = mpid_orig.split("--")[0]
                mpfile_single.document.rec_update(
                    nest_dict(mpfile_single.document.pop(mpid_orig), [mpid]))
            if general_mpfile is not None:
                mpfile_single.insert_general_section(general_mpfile)
            yield mpfile_single
        except KeyError:
            break
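# Usage sketch: split() is a generator yielding one single-contribution MPFile
# per root-level identifier, each with the shared `general` section re-inserted:
for mpfile_single in mpfile.split():
    print(mpfile_single.ids[0])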
def run(mpfile, **kwargs):
    from pymatgen import MPRester, Composition
    import numpy as np
    import pandas as pd

    input_file = mpfile.document['_hdata'].pop('input_file')
    file_path = os.path.join(os.environ['HOME'], 'work', input_file)
    if not os.path.exists(file_path):
        return 'Please upload', file_path
    df_dct = pd.read_excel(file_path)
    columns_units = [
        ('A-Site', ''), ('B-Site', ''), ('a', 'Å'),
        ('Eᶠ|ABO₃', 'eV'), ('Eᶠ|Yᴮ', 'eV'), ('Eᶠ|Vᴼ', 'eV'),
        ('Eᶠ|Hᵢ', 'eV'), ('ΔEᵢ|Yᴮ-Hᵢ', 'eV')
    ]
    columns = df_dct.columns
    mpr = MPRester(endpoint="http://materialsproject.org:8080/rest/v2")

    for row_idx, row in df_dct.iterrows():
        formula = '{}{}O3'.format(row[columns[0]], row[columns[1]])
        comp = Composition(formula)
        crit = {"reduced_cell_formula": comp.to_reduced_dict, "nsites": 5}
        docs = mpr.query(criteria=crit, properties=["task_id", "volume"])
        if len(docs) > 1:
            # pick the entry whose volume is closest to a³ from the spreadsheet
            volume = row[columns[2]]**3
            volumes = np.array([r['volume'] for r in docs])
            idx = np.abs(volumes - volume).argmin()
            identifier = docs[idx]['task_id']
        elif not docs:
            print(formula, 'not found on MP')
            continue
        else:
            identifier = docs[0]['task_id']
        print(formula, '->', identifier)
        d = RecursiveDict()
        for col, (key, unit) in zip(columns, columns_units):
            d[key] = clean_value(row[col], unit)
        mpfile.add_hierarchical_data(nest_dict(d, ['data']), identifier=identifier)
def add_hierarchical_data(self, dct, identifier=mp_level01_titles[0]):
    if len(self.ids) >= self.max_contribs:
        raise StopIteration("Reached max. number of contributions in MPFile")
    self.document.rec_update(nest_dict(RecursiveDict(dct), [identifier]))
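# Usage sketch (identifier hypothetical): nest_dict wraps the payload under a
# key path before it is merged, so the call below yields
# document['mp-1234'] == {'data': {'formula': 'GaAs'}}:
mpfile.add_hierarchical_data(nest_dict({'formula': 'GaAs'}, ['data']), identifier='mp-1234')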
def run(mpfile, hosts=None, download=False, **kwargs):
    #mpfile.unique_mp_cat_ids = False
    from pymatgen import MPRester
    mpr = MPRester()
    fpath = os.path.join(os.environ['HOME'], 'work', 'dilute_solute_diffusion.xlsx')

    if download or not os.path.exists(fpath):
        figshare_id = mpfile.hdata.general['info']['figshare_id']
        url = 'https://api.figshare.com/v2/articles/{}'.format(figshare_id)
        print('get figshare article {}'.format(figshare_id))
        r = requests.get(url)
        figshare = json.loads(r.content)
        mpfile.document['_hdata']['version'] = figshare['version']

        print('read excel from figshare into DataFrame')
        df_dct = None
        for d in figshare['files']:
            if 'xlsx' in d['name']:
                # Dict of DataFrames is returned, with keys representing sheets
                df_dct = read_excel(d['download_url'], sheet_name=None)
                break
        if df_dct is None:
            print('no excel sheet found on figshare')
            return

        print('save excel to disk')
        writer = ExcelWriter(fpath)
        for sheet, df in df_dct.items():
            df.to_excel(writer, sheet)
        writer.save()
    else:
        df_dct = read_excel(fpath, sheet_name=None)

    print(len(df_dct), 'sheets loaded.')
    print('looping hosts ...')
    host_info = df_dct['Host Information']
    host_info.set_index(host_info.columns[0], inplace=True)
    host_info.dropna(inplace=True)

    for idx, host in enumerate(host_info):
        if hosts is not None:
            if isinstance(hosts, int) and idx + 1 > hosts:
                break
            elif isinstance(hosts, list) and host not in hosts:
                continue

        print('get mp-id for {}'.format(host))
        mpid = None
        for doc in mpr.query(criteria={'pretty_formula': host},
                             properties={'task_id': 1, 'sbxd': 1}):  # 'sbxd' needed below
            if doc['sbxd'][0]['decomposes_to'] is None:
                mpid = doc['task_id']
                break
        if mpid is None:
            print('mp-id for {} not found'.format(host))
            continue

        print('add host info for {}'.format(mpid))
        hdata = host_info[host].to_dict(into=RecursiveDict)
        for k in list(hdata.keys()):
            v = hdata.pop(k)
            ks = k.split()
            if ks[0] not in hdata:
                hdata[ks[0]] = RecursiveDict()
            unit = ks[-1][1:-1] if ks[-1].startswith('[') else ''
            subkey = '_'.join(ks[1:-1] if unit else ks[1:]).split(',')[0]
            if subkey == "lattice_constant":
                unit = u'Å'
            try:
                hdata[ks[0]][subkey] = clean_value(v, unit.replace('angstrom', u'Å'))
            except ValueError:
                hdata[ks[0]][subkey] = v
        hdata['formula'] = host
        df = df_dct['{}-X'.format(host)]
        rows = list(isnull(df).any(1).nonzero()[0])
        if rows:
            cells = df.iloc[rows].dropna(how='all').dropna(axis=1)[df.columns[0]]
            note = cells.iloc[0].replace('following', cells.iloc[1])[:-1]
            hdata['note'] = note
            df.drop(rows, inplace=True)
        mpfile.add_hierarchical_data(nest_dict(hdata, ['data']), identifier=mpid)

        print('add table for D₀/Q data for {}'.format(mpid))
        df.set_index(df['Solute element number'], inplace=True)
        df.drop('Solute element number', axis=1, inplace=True)
        df.columns = df.iloc[0]
        df.index.name = 'index'
        df.drop('Solute element name', inplace=True)
        df = df.T.reset_index()
        if str(host) == 'Fe':
            df_D0_Q = df[['Solute element name',
                          'Solute D0, paramagnetic [cm^2/s]',
                          'Solute Q, paramagnetic [eV]']]
        elif hdata['Host']['crystal_structure'] == 'HCP':
            df_D0_Q = df[['Solute element name',
                          'Solute D0 basal [cm^2/s]',
                          'Solute Q basal [eV]']]
        else:
            df_D0_Q = df[['Solute element name', 'Solute D0 [cm^2/s]', 'Solute Q [eV]']]
        df_D0_Q.columns = ['El.', 'D₀ [cm²/s]', 'Q [eV]']
        mpfile.add_data_table(mpid, df_D0_Q, 'D₀_Q')

        if hdata['Host']['crystal_structure'] == 'BCC':
            print('add table for hop activation barriers for {} (BCC)'.format(mpid))
            columns_E = [
                'Hop activation barrier, E_{} [eV]'.format(i) for i in range(2, 5)
            ] + [
                "Hop activation barrier, E'_{} [eV]".format(i) for i in range(3, 5)
            ] + [
                "Hop activation barrier, E''_{} [eV]".format(i) for i in range(3, 5)
            ] + [
                'Hop activation barrier, E_{} [eV]'.format(i) for i in range(5, 7)
            ]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + [
                'E{} [eV]'.format(i) for i in ['₂', '₃', '₄']
            ] + [
                'E`{} [eV]'.format(i) for i in ['₃', '₄']
            ] + [
                'E``{} [eV]'.format(i) for i in ['₃', '₄']
            ] + [
                'E{} [eV]'.format(i) for i in ['₅', '₆']
            ]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print('add table for hop attempt frequencies for {} (BCC)'.format(mpid))
            columns_v = [
                'Hop attempt frequency, v_{} [THz]'.format(i) for i in range(2, 5)
            ] + [
                "Hop attempt frequency, v'_{} [THz]".format(i) for i in range(3, 5)
            ] + [
                "Hop attempt frequency, v''_{} [THz]".format(i) for i in range(3, 5)
            ] + [
                'Hop attempt frequency, v_{} [THz]'.format(i) for i in range(5, 7)
            ]
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + [
                'v{} [THz]'.format(i) for i in ['₂', '₃', '₄']
            ] + [
                'v`{} [THz]'.format(i) for i in ['₃', '₄']
            ] + [
                'v``{} [THz]'.format(i) for i in ['₃', '₄']
            ] + [
                'v{} [THz]'.format(i) for i in ['₅', '₆']
            ]
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

        elif hdata['Host']['crystal_structure'] == 'FCC':
            print('add table for hop activation barriers for {} (FCC)'.format(mpid))
            columns_E = ['Hop activation barrier, E_{} [eV]'.format(i) for i in range(5)]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + [
                'E{} [eV]'.format(i) for i in ['₀', '₁', '₂', '₃', '₄']
            ]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print('add table for hop attempt frequencies for {} (FCC)'.format(mpid))
            columns_v = ['Hop attempt frequency, v_{} [THz]'.format(i) for i in range(5)]
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + [
                'v{} [THz]'.format(i) for i in ['₀', '₁', '₂', '₃', '₄']
            ]
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

        elif hdata['Host']['crystal_structure'] == 'HCP':
            print('add table for hop activation barriers for {} (HCP)'.format(mpid))
            columns_E = [
                "Hop activation barrier, E_X [eV]", "Hop activation barrier, E'_X [eV]",
                "Hop activation barrier, E_a [eV]", "Hop activation barrier, E'_a [eV]",
                "Hop activation barrier, E_b [eV]", "Hop activation barrier, E'_b [eV]",
                "Hop activation barrier, E_c [eV]", "Hop activation barrier, E'_c [eV]"
            ]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + [
                'Eₓ [eV]', 'E`ₓ [eV]', 'Eₐ [eV]', 'E`ₐ [eV]',
                'E_b [eV]', 'E`_b [eV]', 'E_c [eV]', 'E`_c [eV]'
            ]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print('add table for hop attempt frequencies for {} (HCP)'.format(mpid))
            columns_v = ['Hop attempt frequency, v_a [THz]'] + ['Hop attempt frequency, v_X [THz]']
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + ['vₐ [THz]'] + ['vₓ [THz]']
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

    print(mpfile)
    print('DONE')
def run(mpfile, hosts=None, download=False):
    mpr = MPRester()
    fpath = f"{project}.xlsx"  # `project` assumed defined at module level

    if download or not os.path.exists(fpath):
        figshare_id = 1546772
        url = "https://api.figshare.com/v2/articles/{}".format(figshare_id)
        print("get figshare article {}".format(figshare_id))
        r = requests.get(url)
        figshare = json.loads(r.content)
        print("version =", figshare["version"])  # TODO set manually in "other"?

        print("read excel from figshare into DataFrame")
        df_dct = None
        for d in figshare["files"]:
            if "xlsx" in d["name"]:
                # Dict of DataFrames is returned, with keys representing sheets
                df_dct = read_excel(d["download_url"], sheet_name=None)
                break
        if df_dct is None:
            print("no excel sheet found on figshare")
            return

        print("save excel to disk")
        writer = ExcelWriter(fpath)
        for sheet, df in df_dct.items():
            df.to_excel(writer, sheet)
        writer.save()
    else:
        df_dct = read_excel(fpath, sheet_name=None)

    print(len(df_dct), "sheets loaded.")
    print("looping hosts ...")
    host_info = df_dct["Host Information"]
    host_info.set_index(host_info.columns[0], inplace=True)
    host_info.dropna(inplace=True)

    for idx, host in enumerate(host_info):
        if hosts is not None:
            if isinstance(hosts, int) and idx + 1 > hosts:
                break
            elif isinstance(hosts, list) and host not in hosts:
                continue

        print("get mp-id for {}".format(host))
        mpid = None
        for doc in mpr.query(criteria={"pretty_formula": host},
                             properties={"task_id": 1, "sbxd": 1}):  # "sbxd" needed below
            if "decomposes_to" not in doc["sbxd"][0]:
                mpid = doc["task_id"]
                break
        if mpid is None:
            print("mp-id for {} not found".format(host))
            continue

        print("add host info for {}".format(mpid))
        hdata = host_info[host].to_dict(into=RecursiveDict)
        for k in list(hdata.keys()):
            v = hdata.pop(k)
            ks = k.split()
            if ks[0] not in hdata:
                hdata[ks[0]] = RecursiveDict()
            unit = ks[-1][1:-1] if ks[-1].startswith("[") else ""
            subkey = "_".join(ks[1:-1] if unit else ks[1:]).split(",")[0]
            if subkey == "lattice_constant":
                unit = "Å"
            try:
                hdata[ks[0]][subkey] = clean_value(v, unit.replace("angstrom", "Å"))
            except ValueError:
                hdata[ks[0]][subkey] = v
        hdata["formula"] = host
        df = df_dct["{}-X".format(host)]
        rows = list(isnull(df).any(1).nonzero()[0])
        if rows:
            cells = df.iloc[rows].dropna(how="all").dropna(axis=1)[df.columns[0]]
            note = cells.iloc[0].replace("following", cells.iloc[1])[:-1]
            hdata["note"] = note
            df.drop(rows, inplace=True)
        mpfile.add_hierarchical_data(nest_dict(hdata, ["data"]), identifier=mpid)

        print("add table for D₀/Q data for {}".format(mpid))
        df.set_index(df["Solute element number"], inplace=True)
        df.drop("Solute element number", axis=1, inplace=True)
        df.columns = df.iloc[0]
        df.index.name = "index"
        df.drop("Solute element name", inplace=True)
        df = df.T.reset_index()
        if str(host) == "Fe":
            df_D0_Q = df[[
                "Solute element name",
                "Solute D0, paramagnetic [cm^2/s]",
                "Solute Q, paramagnetic [eV]",
            ]]
        elif hdata["Host"]["crystal_structure"] == "HCP":
            df_D0_Q = df[[
                "Solute element name",
                "Solute D0 basal [cm^2/s]",
                "Solute Q basal [eV]",
            ]]
        else:
            df_D0_Q = df[["Solute element name", "Solute D0 [cm^2/s]", "Solute Q [eV]"]]
        df_D0_Q.columns = ["Solute", "D₀ [cm²/s]", "Q [eV]"]
        # z: element symbol -> atomic number mapping (assumed defined at module level)
        anums = [z[el] for el in df_D0_Q["Solute"]]
        df_D0_Q.insert(0, "Z", Series(anums, index=df_D0_Q.index))
        df_D0_Q.sort_values("Z", inplace=True)
        df_D0_Q.reset_index(drop=True, inplace=True)
        mpfile.add_data_table(mpid, df_D0_Q, "D₀_Q")

        if hdata["Host"]["crystal_structure"] == "BCC":
            print("add table for hop activation barriers for {} (BCC)".format(mpid))
            columns_E = ([
                "Hop activation barrier, E_{} [eV]".format(i) for i in range(2, 5)
            ] + [
                "Hop activation barrier, E'_{} [eV]".format(i) for i in range(3, 5)
            ] + [
                "Hop activation barrier, E''_{} [eV]".format(i) for i in range(3, 5)
            ] + [
                "Hop activation barrier, E_{} [eV]".format(i) for i in range(5, 7)
            ])
            df_E = df[["Solute element name"] + columns_E]
            df_E.columns = (["Solute"] +
                            ["E{} [eV]".format(i) for i in ["₂", "₃", "₄"]] +
                            ["E`{} [eV]".format(i) for i in ["₃", "₄"]] +
                            ["E``{} [eV]".format(i) for i in ["₃", "₄"]] +
                            ["E{} [eV]".format(i) for i in ["₅", "₆"]])
            mpfile.add_data_table(mpid, df_E, "hop_activation_barriers")

            print("add table for hop attempt frequencies for {} (BCC)".format(mpid))
            columns_v = ([
                "Hop attempt frequency, v_{} [THz]".format(i) for i in range(2, 5)
            ] + [
                "Hop attempt frequency, v'_{} [THz]".format(i) for i in range(3, 5)
            ] + [
                "Hop attempt frequency, v''_{} [THz]".format(i) for i in range(3, 5)
            ] + [
                "Hop attempt frequency, v_{} [THz]".format(i) for i in range(5, 7)
            ])
            df_v = df[["Solute element name"] + columns_v]
            df_v.columns = (["Solute"] +
                            ["v{} [THz]".format(i) for i in ["₂", "₃", "₄"]] +
                            ["v`{} [THz]".format(i) for i in ["₃", "₄"]] +
                            ["v``{} [THz]".format(i) for i in ["₃", "₄"]] +
                            ["v{} [THz]".format(i) for i in ["₅", "₆"]])
            mpfile.add_data_table(mpid, df_v, "hop_attempt_frequencies")

        elif hdata["Host"]["crystal_structure"] == "FCC":
            print("add table for hop activation barriers for {} (FCC)".format(mpid))
            columns_E = ["Hop activation barrier, E_{} [eV]".format(i) for i in range(5)]
            df_E = df[["Solute element name"] + columns_E]
            df_E.columns = ["Solute"] + [
                "E{} [eV]".format(i) for i in ["₀", "₁", "₂", "₃", "₄"]
            ]
            mpfile.add_data_table(mpid, df_E, "hop_activation_barriers")

            print("add table for hop attempt frequencies for {} (FCC)".format(mpid))
            columns_v = ["Hop attempt frequency, v_{} [THz]".format(i) for i in range(5)]
            df_v = df[["Solute element name"] + columns_v]
            df_v.columns = ["Solute"] + [
                "v{} [THz]".format(i) for i in ["₀", "₁", "₂", "₃", "₄"]
            ]
            mpfile.add_data_table(mpid, df_v, "hop_attempt_frequencies")

        elif hdata["Host"]["crystal_structure"] == "HCP":
            print("add table for hop activation barriers for {} (HCP)".format(mpid))
            columns_E = [
                "Hop activation barrier, E_X [eV]", "Hop activation barrier, E'_X [eV]",
                "Hop activation barrier, E_a [eV]", "Hop activation barrier, E'_a [eV]",
                "Hop activation barrier, E_b [eV]", "Hop activation barrier, E'_b [eV]",
                "Hop activation barrier, E_c [eV]", "Hop activation barrier, E'_c [eV]",
            ]
            df_E = df[["Solute element name"] + columns_E]
            df_E.columns = ["Solute"] + [
                "Eₓ [eV]", "E`ₓ [eV]", "Eₐ [eV]", "E`ₐ [eV]",
                "E_b [eV]", "E`_b [eV]", "E_c [eV]", "E`_c [eV]",
            ]
            mpfile.add_data_table(mpid, df_E, "hop_activation_barriers")

            print("add table for hop attempt frequencies for {} (HCP)".format(mpid))
            columns_v = ["Hop attempt frequency, v_a [THz]"] + ["Hop attempt frequency, v_X [THz]"]
            df_v = df[["Solute element name"] + columns_v]
            df_v.columns = ["Solute"] + ["vₐ [THz]"] + ["vₓ [THz]"]
            mpfile.add_data_table(mpid, df_v, "hop_attempt_frequencies")

    print("DONE")
def run(mpfile, **kwargs):
    # extract data from json files
    keys = ['pretty_formula', 'volume']
    input_dir = mpfile.hdata.general['input_dir']
    for idx, obj in enumerate(scandir(input_dir)):
        mpid = obj.name.split('.', 1)[0].rsplit('_', 1)[1]
        print(mpid)
        input_file = gzip.open(obj.path, 'rb')
        try:
            data = json.loads(input_file.read())

            # filter out metals
            if 'GGA' not in data['gap'] or data['gap']['GGA'] < 0.1:
                print('GGA gap < 0.1 -> skip')
                continue

            # add hierarchical data (nested key-values)
            # TODO: extreme values for power factor, zT, effective mass
            # TODO: add a text for the description of each table
            hdata = RecursiveDict((k, data[k]) for k in keys)
            hdata['volume'] = u'{:g} ų'.format(hdata['volume'])
            cond_eff_mass = u'mₑᶜᵒⁿᵈ'
            hdata[cond_eff_mass] = RecursiveDict()
            names = [u'e₁', u'e₂', u'e₃', u'<m>']
            if 'GGA' not in data:
                print('no GGA key for', mpid)
                continue
            for dt, d in data['GGA']['cond_eff_mass'].items():
                eff_mass = d['300']['1e+18']
                eff_mass.append(np.mean(eff_mass))
                hdata[cond_eff_mass][dt] = RecursiveDict(
                    (names[idx], u'{:.2f} mₑ'.format(x))
                    for idx, x in enumerate(eff_mass))

            seebeck_fix_dop_temp = "Seebeck"
            hdata[seebeck_fix_dop_temp] = RecursiveDict()
            cols = [u'e₁', u'e₂', u'e₃', 'temperature', 'doping']
            for doping_type in ['p', 'n']:
                sbk = [
                    float(i) for i in
                    data['GGA']['seebeck_doping'][doping_type]['300']['1e+18']['eigs']
                ]
                vals = [u'{:.2e} μV/K'.format(s) for s in sbk] + [
                    u'{} K'.format('300'), u'{} cm⁻³'.format('1e+18')
                ]
                hdata[seebeck_fix_dop_temp][doping_type] = RecursiveDict(
                    (k, v) for k, v in zip(cols, vals))

            # build data and max values for seebeck, conductivity and kappa
            # max/min values computed using numpy. It may be better to code it in pure python.
            cols = ['value', 'temperature', 'doping']
            for prop_name in ['seebeck_doping', 'cond_doping', 'kappa_doping']:
                # TODO install Symbola font if you see squares here
                # (https://fonts2u.com/symbola.font) and select it as standard font
                # in your browser (leave other fonts as is, esp. fixed width)
                if prop_name[0] == 's':
                    lbl, unit = u"Sₘₐₓ", u"μV/K"
                elif prop_name[0] == 'c':
                    lbl, unit = u"σₘₐₓ", u"(Ωms)⁻¹"
                elif prop_name[0] == 'k':
                    lbl, unit = u"κₑ₋ₘᵢₙ", u"W/(mKs)"
                hdata[lbl] = RecursiveDict()

                for doping_type in ['p', 'n']:
                    prop = data['GGA'][prop_name][doping_type]
                    prop_averages, dopings, columns = [], None, ['T (K)']
                    temps = sorted(map(int, prop.keys()))
                    for temp in temps:
                        row = [temp]
                        if dopings is None:
                            dopings = sorted(map(float, prop[str(temp)].keys()))
                        for doping in dopings:
                            doping_str = '%.0e' % doping
                            if len(columns) <= len(dopings):
                                columns.append(doping_str + u' cm⁻³')
                            eigs = prop[str(temp)][doping_str]['eigs']
                            row.append(np.mean(eigs))
                        prop_averages.append(row)

                    arr_prop_avg = np.array(prop_averages)[:, 1:]
                    max_v = np.max(arr_prop_avg)
                    if prop_name[0] == 's' and doping_type == 'n':
                        max_v = np.min(arr_prop_avg)
                    if prop_name[0] == 'k':
                        max_v = np.min(arr_prop_avg)
                    arg_max = np.argwhere(arr_prop_avg == max_v)[0]
                    vals = [
                        u'{:.2e} {}'.format(max_v, unit),
                        u'{:.2e} K'.format(temps[arg_max[0]]),
                        u'{:.2e} cm⁻³'.format(dopings[arg_max[1]])
                    ]
                    hdata[lbl][doping_type] = RecursiveDict(
                        (k, v) for k, v in zip(cols, vals))

            mpfile.add_hierarchical_data(nest_dict(hdata, ['data']),
                                         identifier=data['mp_id'])
        finally:
            input_file.close()
def run(mpfile, **kwargs):
    from pymatgen import Structure

    reference_project = None
    input_data, input_keys, extra = RecursiveDict(), RecursiveDict(), RecursiveDict()
    input_urls = mpfile.document['_hdata'].pop('input_urls')

    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d')  # TODO 3d for Jarvis
        dbfile = os.path.join(os.environ['HOME'], 'work', input_url.rsplit('/')[-1])
        if not os.path.exists(dbfile):
            print('downloading', dbfile, '...')
            # was urllib.urlretrieve in the Python-2 original
            urllib.request.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'parent_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'eV')),
            ('op_gap', ('ΔE|optB88vdW', 'eV')),
            ('mbj_gap', ('ΔE|mbj', 'eV')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []
        print(len(input_data[project]), 'materials loaded for', project)

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in identifiers:
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula)
                data[project]['id'] = input_urls[project]['detail'].format(
                    d[input_keys[project][0]])
                Ex = d[input_keys[project][1]]
                if project == reference_project:
                    Ex *= 1000.
                data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']), identifier=identifier)
        for project, structure in structures.items():
            name = '{}_{}'.format(data['formula'], project)
            try:
                mpfile.add_structure(structure, name=name, identifier=identifier)
            except Exception as ex:
                print(str(ex))
def from_string(data):
    # use archieml-python parse to import data
    rdct = RecursiveDict(loads(data))
    rdct.rec_update()

    # post-process internal representation of file contents
    for key in list(rdct.keys()):
        is_general, root_key = normalize_root_level(key)

        if is_general:
            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if mp_level01_titles[0] not in rdct:
                rdct[mp_level01_titles[0]] = RecursiveDict()
                rdct.move_to_end(mp_level01_titles[0], last=False)

        # normalize identifier key (pop & insert)
        # using rec_update since we're looping over all entries
        # also: support data in bare tables (marked-up only by
        # root-level identifier) by nesting under 'data'
        value = rdct.pop(key)
        keys = [mp_level01_titles[0]] if is_general else []
        keys.append(root_key)
        if isinstance(value, list):
            keys.append("table")
        rdct.rec_update(nest_dict(value, keys))

        # reference to section to iterate or parse as CIF
        section = (rdct[mp_level01_titles[0]][root_key]
                   if is_general else rdct[root_key])

        # iterate to find CSV sections to parse
        # also parse propnet quantities
        if isinstance(section, dict):
            scope = []
            for k, v in section.iterate():
                level, key = k
                key = "".join([replacements.get(c, c) for c in key])
                level_reduction = bool(level < len(scope))
                if level_reduction:
                    del scope[level:]
                if v is None:
                    scope.append(key)
                elif isinstance(v, list) and isinstance(v[0], dict):
                    table = ""
                    for row_dct in v:
                        table = "\n".join([table, row_dct["value"]])
                    pd_obj = read_csv(table)
                    d = nest_dict(pd_obj.to_dict(), scope + [key])
                    section.rec_update(d, overwrite=True)
                    if not is_general and level == 0:
                        section.insert_default_plot_options(pd_obj, key)
                elif (Quantity is not None
                      and isinstance(v, six.string_types) and " " in v):
                    quantity = Quantity.from_key_value(key, v)
                    d = nest_dict(quantity.as_dict(), scope + [key])  # TODO quantity.symbol.name
                    section.rec_update(d, overwrite=True)

        # convert CIF strings into pymatgen structures
        if mp_level01_titles[3] in section:
            from pymatgen.io.cif import CifParser
            for name in list(section[mp_level01_titles[3]].keys()):
                cif = section[mp_level01_titles[3]].pop(name)
                parser = CifParser.from_string(cif)
                structure = parser.get_structures(primitive=False)[0]
                section[mp_level01_titles[3]].rec_update(
                    nest_dict(structure.as_dict(), [name]))

    return MPFile.from_dict(rdct)
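# Usage sketch: from_string is the ArchieML entry point; file contents read
# from disk (path hypothetical) are parsed, normalized, and returned as an
# MPFile instance:
with open('contribution.txt') as f:
    mpf = MPFile.from_string(f.read())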
def run(mpfile, **kwargs):
    from pymatgen import Structure

    reference_project = None
    input_data, input_keys, extra = RecursiveDict(), RecursiveDict(), RecursiveDict()
    input_urls = mpfile.document['_hdata'].pop('input_urls')

    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d')  # TODO 3d for Jarvis
        dbfile = os.path.join(os.environ['HOME'], 'work', input_url.rsplit('/')[-1])
        if not os.path.exists(dbfile):
            print('downloading', dbfile, '...')
            # was urllib.urlretrieve in the Python-2 original
            urllib.request.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'parent_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'meV/atom')),
            ('op_gap', ('ΔE|optB88vdW', 'meV/atom')),
            ('mbj_gap', ('ΔE|mbj', 'meV/atom')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []
        print(len(input_data[project]), 'materials loaded for', project)

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in identifiers:
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula)
                data[project]['id'] = input_urls[project]['detail'].format(
                    d[input_keys[project][0]])
                Ex = d[input_keys[project][1]]
                if project == reference_project:
                    Ex *= 1000.
                data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']), identifier=identifier)
        for project, structure in structures.items():
            name = '{}_{}'.format(data['formula'], project)
            try:
                mpfile.add_structure(structure, name=name, identifier=identifier)
            except Exception as ex:
                print(str(ex))
def run(mpfile, **kwargs):
    # extract data from json files
    input_dir = mpfile.hdata.general['input_dir']
    for idx, obj in enumerate(scandir(input_dir)):
        mpid = obj.name.split('.', 1)[0].rsplit('_', 1)[1]
        print(mpid)
        input_file = gzip.open(obj.path, 'rb')
        try:
            data = json.loads(input_file.read())

            # filter out metals
            if 'GGA' not in data or 'GGA' not in data['gap'] or data['gap']['GGA'] < 0.1:
                print('GGA gap < 0.1 -> skip')
                continue

            # add hierarchical data (nested key-values)
            hdata = RecursiveDict()
            T, lvl, S2 = '300', '1e+18', None
            pf_key = 'S²σ'
            hdata['temperature'] = T + ' K'
            hdata['doping_level'] = lvl + ' cm⁻³'
            variables = [
                {'key': 'cond_eff_mass', 'name': 'mₑᶜᵒⁿᵈ', 'unit': 'mₑ'},
                {'key': 'seebeck_doping', 'name': 'S', 'unit': 'μV/K'},
                {'key': 'cond_doping', 'name': 'σ', 'unit': '(Ωms)⁻¹'},
            ]
            eigs_keys = ['ε₁', 'ε₂', 'ε₃', '<ε>']

            for v in variables:
                hdata[v['name']] = RecursiveDict()
                for doping_type in ['p', 'n']:
                    if doping_type in data['GGA'][v['key']]:
                        d = data['GGA'][v['key']][doping_type][T][lvl]
                        # list() needed: map() is a one-shot iterator in Python 3
                        eigs = list(map(float, d if isinstance(d, list) else d['eigs']))
                        hdata[v['name']][doping_type] = RecursiveDict(
                            (eigs_keys[neig], clean_value(eig, v['unit']))
                            for neig, eig in enumerate(eigs))
                        hdata[v['name']][doping_type][eigs_keys[-1]] = clean_value(
                            np.mean(eigs), v['unit'])
                        if v['key'] == 'seebeck_doping':
                            S2 = np.dot(d['tensor'], d['tensor'])
                        elif v['key'] == 'cond_doping':
                            pf = np.mean(np.linalg.eigh(np.dot(S2, d['tensor']))[0]) * 1e-8
                            if pf_key not in hdata:
                                hdata[pf_key] = RecursiveDict()
                            hdata[pf_key][doping_type] = {
                                eigs_keys[-1]: clean_value(pf, 'μW/(cmK²s)')}

            mpfile_data = nest_dict(hdata, ['data'])

            # build data and max values for seebeck, conductivity and kappa
            # max/min values computed using numpy. It may be better to code it in pure python.
            keys = ['pretty_formula', 'volume']
            hdata = RecursiveDict((k, data[k]) for k in keys)
            hdata['volume'] = clean_value(hdata['volume'], 'ų')
            hdata['bandgap'] = clean_value(data['gap']['GGA'], 'eV')
            cols = ['value', 'temperature', 'doping']
            tables = RecursiveDict()
            props = RecursiveDict()
            props['seebeck_doping'] = ['S', 'μV/K']
            props['cond_doping'] = ['σ', '(Ωms)⁻¹']
            props['kappa_doping'] = ['κₑ', 'W/(mKs)']

            for prop_name, (lbl, unit) in props.items():
                # TODO install Symbola font if you see squares here
                # (https://fonts2u.com/symbola.font) and select it as standard font
                # in your browser (leave other fonts as is, esp. fixed width)
                tables[lbl] = RecursiveDict()
                hlbl = lbl + '₋' if len(lbl) > 1 else lbl
                hlbl += 'ₑₓₜᵣ'
                hdata[hlbl] = RecursiveDict()

                for doping_type in ['p', 'n']:
                    prop = data['GGA'][prop_name][doping_type]
                    prop_averages, dopings, columns = [], None, ['T [K]']
                    temps = sorted(map(int, prop.keys()))
                    for temp in temps:
                        row = [temp]
                        if dopings is None:
                            dopings = sorted(map(float, prop[str(temp)].keys()))
                        for doping in dopings:
                            doping_str = '%.0e' % doping
                            if len(columns) <= len(dopings):
                                columns.append('{} cm⁻³ [{}]'.format(doping_str, unit))
                            eigs = prop[str(temp)][doping_str]['eigs']
                            row.append(np.mean(eigs))
                        prop_averages.append((temp, row))

                    tables[lbl][doping_type] = Table.from_items(
                        prop_averages, orient='index', columns=columns)

                    arr_prop_avg = np.array([item[1] for item in prop_averages])[:, 1:]
                    max_v = np.max(arr_prop_avg)
                    if prop_name[0] == 's' and doping_type == 'n':
                        max_v = np.min(arr_prop_avg)
                    if prop_name[0] == 'k':
                        max_v = np.min(arr_prop_avg)
                    arg_max = np.argwhere(arr_prop_avg == max_v)[0]
                    vals = [
                        clean_value(max_v, unit),
                        clean_value(temps[arg_max[0]], 'K'),
                        clean_value(dopings[arg_max[1]], 'cm⁻³')
                    ]
                    hdata[hlbl][doping_type] = RecursiveDict(
                        (k, v) for k, v in zip(cols, vals))

            mpfile_data.rec_update(nest_dict(hdata, ['extra_data']))
            mpfile.add_hierarchical_data(mpfile_data, identifier=data['mp_id'])
            for lbl, dct in tables.items():
                for doping_type, table in dct.items():
                    mpfile.add_data_table(
                        data['mp_id'], table, name='{}({})'.format(lbl, doping_type))
        finally:
            input_file.close()
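# The power factor S²σ in the snippet above is formed from the full 3×3
# tensors: S² = S·S, then the eigenvalues of S²·σ are averaged and scaled by
# the same 1e-8 factor. A self-contained sketch with hypothetical diagonal
# tensors:
import numpy as np
S = np.diag([100.0, 110.0, 120.0])   # Seebeck tensor [μV/K]
sigma = np.diag([1e4, 1e4, 1e4])     # conductivity tensor [(Ωms)⁻¹]
S2 = S @ S
pf = np.mean(np.linalg.eigh(S2 @ sigma)[0]) * 1e-8  # averaged eigenvalues of S²·σ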
def run(mpfile, **kwargs):
    from pymatgen import Structure

    reference_project = None
    input_data, input_keys, extra = RecursiveDict(), RecursiveDict(), RecursiveDict()
    #input_urls = mpfile.document['_hdata'].pop('input_urls')
    input_urls = {
        'NUS': {
            "file": "http://www.2dmatpedia.org/static/db.json.gz",
            "detail": "http://www.2dmatpedia.org/2dmaterials/doc/{}"
        },
        'JARVIS': {
            "file": "https://www.ctcms.nist.gov/~knc6/jdft_{}.json.tgz",
            "detail": "https://www.ctcms.nist.gov/~knc6/jsmol/{}.html"
        }
    }

    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d')  # TODO 3d for Jarvis
        #dbfile = os.path.join(os.environ['HOME'], 'work', input_url.rsplit('/')[-1])
        dbfile = input_url.rsplit('/')[-1]
        if not os.path.exists(dbfile):
            print('downloading', dbfile, '...')
            urllib.request.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'source_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'meV/atom')),
            ('op_gap', ('ΔE|optB88vdW', 'meV/atom')),
            ('mbj_gap', ('ΔE|mbj', 'meV/atom')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []
        print(len(input_data[project]), 'materials loaded for', project)

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in set(identifiers):
        print(identifier)
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula)
                data[project]['id'] = input_urls[project]['detail'].format(
                    d[input_keys[project][0]])
                if input_keys[project][1] in d:
                    Ex = d[input_keys[project][1]]
                    if project == reference_project:
                        Ex *= 1000.
                    data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']), identifier=identifier)
        #r = db.contributions.update_one(
        #    {'identifier': identifier, 'project': 'jarvis_dft'},
        #    {'$set': {'content.data': mpfile.document[identifier]['data']}},
        #    upsert=True
        #)
        #print(r.matched_count, r.modified_count, r.upserted_id)
        doc = db.contributions.find_one(
            {'identifier': identifier, 'project': 'jarvis_dft'},
            {'_id': 1, 'content.structures': 1})
        if 'structures' in doc['content']:
            print('structures already added for', identifier)
            continue
        print(doc['_id'])

        inserted_ids = []
        for project, structure in structures.items():
            try:
                mpfile.add_structure(structure, name=project, identifier=identifier)
                sdct = mpfile.document[identifier]['structures'][project]
                sdct.pop('@module')
                sdct.pop('@class')
                if sdct['charge'] is None:
                    sdct.pop('charge')
                sdct['identifier'] = identifier
                sdct['project'] = 'jarvis_dft'
                sdct['name'] = project
                sdct['cid'] = doc['_id']
                r = db.structures.insert_one(sdct)
                inserted_ids.append(r.inserted_id)
            except Exception as ex:
                print(str(ex))
        print(inserted_ids)
        r = db.contributions.update_one(
            {'_id': doc['_id']},
            {'$set': {'content.structures': inserted_ids}})
        print(r.matched_count, r.modified_count)
def run(mpfile, **kwargs):
    # TODO clone solar_perovskite if needed, abort if insufficient permissions
    import solar_perovskite
    from solar_perovskite.core import GetExpThermo
    from solar_perovskite.init.find_structures import FindStructures
    from solar_perovskite.init.import_data import Importdata
    from solar_perovskite.modelling.from_theo import EnthTheo

    input_file = mpfile.hdata.general['input_file']
    input_file = os.path.join(os.path.dirname(solar_perovskite.__file__), input_file)
    table = read_csv(open(input_file, 'r').read().replace(';', ','))
    dct = super(Table, table).to_dict(orient='records', into=RecursiveDict)

    shomate = pd.read_csv(os.path.abspath(os.path.join(
        os.path.dirname(solar_perovskite.__file__), "datafiles", "shomate.csv"
    )), index_col=0)
    shomate_dct = RecursiveDict()
    for col in shomate.columns:
        key = col.split('.')[0]
        if key not in shomate_dct:
            shomate_dct[key] = RecursiveDict()
        d = shomate[col].to_dict(into=RecursiveDict)
        subkey = '{}-{}'.format(int(d.pop('low')), int(d.pop('high')))
        shomate_dct[key][subkey] = RecursiveDict(
            (k, clean_value(v, max_dgts=6)) for k, v in d.items()
        )
    mpfile.add_hierarchical_data(nest_dict(shomate_dct, ['shomate']))

    for row in dct:
        sample_number = int(row['sample_number'])
        identifier = row['closest phase MP (oxidized)'].replace('n.a.', '')
        if not identifier.startswith('mp-'):
            continue
        if not identifier:
            identifier = get_composition_from_string(row['composition oxidized phase'])
        print(identifier)

        print('add hdata ...')
        d = RecursiveDict()
        d['tolerance_factor'] = row['tolerance_factor']
        d['solid_solution'] = row['type of solid solution']
        d['oxidized_phase'] = RecursiveDict()
        d['oxidized_phase']['composition'] = row['composition oxidized phase']
        d['oxidized_phase']['crystal-structure'] = row['crystal structure (fully oxidized)']
        d['reduced_phase'] = RecursiveDict()
        d['reduced_phase']['composition'] = row['composition reduced phase']
        d['reduced_phase']['closest-MP'] = row['closest phase MP (reduced)'].replace('n.a.', '')
        d = nest_dict(d, ['data'])
        d['pars'] = get_fit_pars(sample_number)
        d['pars']['theo_compstr'] = row['theo_compstr']
        try:
            fs = FindStructures(compstr=row['theo_compstr'])
            theo_redenth = fs.find_theo_redenth()
            imp = Importdata()
            splitcomp = imp.split_comp(row['theo_compstr'])
            conc_act = imp.find_active(mat_comp=splitcomp)[1]
            et = EnthTheo(comp=row['theo_compstr'])
            dh_max, dh_min = et.calc_dh_endm()
            red_enth_mean_endm = (conc_act * dh_min) + ((1 - conc_act) * dh_max)
            difference = theo_redenth - red_enth_mean_endm
            d['pars']['dh_min'] = clean_value(dh_min + difference, max_dgts=8)
            d['pars']['dh_max'] = clean_value(dh_max + difference, max_dgts=8)
        except Exception as ex:
            print('error in dh_min/max!')
            print(str(ex))
        mpfile.add_hierarchical_data(d, identifier=identifier)

        print('add ΔH ...')
        exp_thermo = GetExpThermo(sample_number, plotting=False)
        enthalpy = exp_thermo.exp_dh()
        table = get_table(enthalpy, 'H')
        mpfile.add_data_table(identifier, table, name='enthalpy')

        print('add ΔS ...')
        entropy = exp_thermo.exp_ds()
        table = get_table(entropy, 'S')
        mpfile.add_data_table(identifier, table, name='entropy')

        print('add raw data ...')
        tga_results = os.path.join(
            os.path.dirname(solar_perovskite.__file__), 'tga_results')
        for path in glob(os.path.join(
                tga_results, 'ExpDat_JV_P_{}_*.csv'.format(sample_number))):
            print(path.split('_{}_'.format(sample_number))[-1].split('.')[0], '...')
            body = open(path, 'r').read()
            cols = ['Time [min]', 'Temperature [C]', 'dm [%]', 'pO2']
            table = read_csv(body, lineterminator=os.linesep, usecols=cols, skiprows=5)
            table = table[cols].iloc[::100, :]
            # scale/shift for better graphs
            T, dm, p = [pd.to_numeric(table[col]) for col in cols[1:]]
            T_min, T_max, dm_min, dm_max, p_max = T.min(), T.max(), dm.min(), dm.max(), p.max()
            rT, rdm = abs(T_max - T_min), abs(dm_max - dm_min)
            table[cols[2]] = (dm - dm_min) * rT / rdm
            table[cols[3]] = p * rT / p_max
            table.rename(columns={
                'dm [%]': '(dm [%] + {:.4g}) * {:.4g}'.format(-dm_min, rT / rdm),
                'pO2': 'pO₂ * {:.4g}'.format(rT / p_max)
            }, inplace=True)
            mpfile.add_data_table(identifier, table, name='raw')