def run(mpfile, **kwargs):
    import pymatgen
    import pandas as pd
    from mpcontribs.users.swf.rest.rester import SwfRester

    # load data from google sheet
    google_sheet = mpfile.document[mp_level01_titles[0]].pop('google_sheet')
    google_sheet += '/export?format=xlsx'
    df_dct = pd.read_excel(google_sheet, sheet_name=None)

    # rename sheet columns
    elements = ['Fe', 'V', 'Co']
    df_dct['IP Energy Product'].columns = ['IP_Energy_product'] + elements
    df_dct['total'].columns = elements
    df_dct['MOKE'].columns = elements + ['thickness', 'MOKE_IP_Hc']
    df_dct['VSM'].columns = elements + ['thickness', 'VSM_IP_Hc']
    df_dct['formula'].columns = elements
    df_dct['Kondorsky'].columns = ['angle', 'Kondorsky_Model', 'Experiment']

    # round all compositions to 100%
    for sheet, df in df_dct.items():
        if sheet != 'Kondorsky':
            for idx, row in df.iterrows():
                df.loc[idx:idx, elements] = round_to_100_percent(row[elements])

    row5 = df_dct['formula'].iloc[0]
    formula5 = get_composition_from_string(
        pymatgen.Composition(10 * row5).formula.replace(' ', ''))
    dct = dict((k, clean_value(v, '%')) for k, v in row5.to_dict().items())
    mpfile.add_hierarchical_data({'data': dct}, identifier=formula5)
    mpfile.add_data_table(formula5, df_dct['Kondorsky'],
                          name='Angular Dependence of Switching Field')

    for sheet, df in df_dct.items():
        if sheet == 'formula' or sheet == 'Kondorsky' or sheet == 'total':
            continue
        for idx, row in df.iterrows():
            composition = pymatgen.Composition(row[elements] * 10)
            formula = get_composition_from_string(
                composition.formula.replace(' ', ''))
            dct = dict((k, clean_value(v, '%'))
                       for k, v in row[elements].to_dict().items())
            mpfile.add_hierarchical_data({'data': dct}, identifier=formula)
            columns = [x for x in row.index if x not in elements]
            if columns:
                data = row[columns].round(decimals=1)
                dct = dict((k, clean_value(v)) for k, v in data.to_dict().items())
                mpfile.add_hierarchical_data({'data': dct}, identifier=formula)
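# `round_to_100_percent` above is a helper defined elsewhere in the source repo. A
# minimal sketch of one common implementation (largest-remainder rounding, so the
# integer percentages always sum to exactly 100) -- an assumption, not the repo's code:
import numpy as np

def round_to_100_percent_sketch(values):
    """Scale `values` to percentages and round so they total exactly 100."""
    vals = np.asarray(values, dtype=float)
    pcts = 100.0 * vals / vals.sum()
    floored = np.floor(pcts).astype(int)
    # hand the leftover percentage points to the entries with the largest remainders
    for i in np.argsort(pcts - floored)[::-1][:int(100 - floored.sum())]:
        floored[i] += 1
    return floored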
def add_structure(self, source, name=None, identifier=None, fmt=None):
    """add a structure to the mpfile"""
    from pymatgen import Structure, MPRester

    if isinstance(source, Structure):
        structure = source
    elif isinstance(source, dict):
        structure = Structure.from_dict(source)
    elif os.path.exists(source):
        structure = Structure.from_file(source, sort=True)
    elif isinstance(source, six.string_types):
        if fmt is None:
            raise ValueError("Need fmt to get structure from string!")
        structure = Structure.from_str(source, fmt, sort=True)
    else:
        raise ValueError(source, "not supported!")

    if name is not None:
        if not isinstance(name, six.string_types):
            raise ValueError("structure name needs to be a string")
        elif "." in name:
            raise ValueError("structure name cannot contain dots (.)")

    mpr = MPRester()
    if not mpr.api_key:
        raise ValueError(
            "API key not set. Run `pmg config --add PMG_MAPI_KEY <USER_API_KEY>`."
        )
    matched_mpids = mpr.find_structure(structure)
    formula = get_composition_from_string(structure.composition.formula)
    if not matched_mpids:
        if identifier is None:
            identifier = formula
            print(
                "Structure not found in MP! Please submit via MPComplete to "
                "obtain mp-id or manually choose an anchor mp-id! Continuing "
                "with {} as identifier!".format(identifier))
        else:
            print("Structure not found in MP! Forcing {} as identifier!".format(
                identifier))
    elif identifier is None:
        identifier = matched_mpids[0]
        if len(matched_mpids) > 1:
            print("Multiple matching structures found in MP. Using", identifier)
    elif identifier not in matched_mpids:
        msg = "Structure does not match {} but instead {}!".format(
            identifier, matched_mpids)
        raise ValueError(msg)

    idx = len(self.document.get(identifier, {}).get(mp_level01_titles[3], {}))
    sub_key = formula if name is None else name
    if sub_key in self.document.get(identifier, {}).get(mp_level01_titles[3], {}):
        sub_key += "_{}".format(idx)
    self.document.rec_update(
        nest_dict(structure.as_dict(),
                  [identifier, mp_level01_titles[3], sub_key]))
    return identifier
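# Hypothetical usage of add_structure: anchor a contribution to an MP record via a
# CIF string (file name and data key invented for illustration; requires a configured
# PMG_MAPI_KEY and an existing MPFile instance `mpfile`):
cif_str = open('my_structure.cif').read()  # hypothetical local file
identifier = mpfile.add_structure(cif_str, name='exp_phase', fmt='cif')
mpfile.add_hierarchical_data({'data': {'method': 'XRD refinement'}},
                             identifier=identifier)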
def run(mpfile, **kwargs):
    input_dir = mpfile.hdata['_hdata']['input_dir']
    identifier = get_composition_from_string('PbZr20Ti80O3')
    print(identifier)

    # 'SP128_NSO_LPFM0000.ibw' too big to display in notebook
    files = ['BR_60016 (1).ibw', 'SP128_NSO_VPFM0000.ibw']
    for f in files:
        file_name = os.path.join(input_dir, f)
        df = load_data(file_name)
        name = f.split('.')[0]
        mpfile.add_data_table(identifier, df, name)
        print('imported', f)

    xrd_file = os.path.join(input_dir,
                            'Program6_JA_6_2th0m Near SRO (002)_2.xrdml.xml')
    data = read_xrdml(xrd_file)
    df = DataFrame(np.stack((data['2Theta'], data['data']), 1),
                   columns=['2Theta', 'Intensity'])
    opts = {'yaxis': {'type': 'log'}}  # see plotly docs
    mpfile.add_data_table(identifier, df, 'NearSRO', plot_options=opts)
    print('imported', os.path.basename(xrd_file))

    rsm_file = os.path.join(input_dir, 'JA 42 RSM 103 STO 001.xrdml.xml')
    rvals, df = load_RSM(rsm_file)
    mpfile.add_hierarchical_data({'rsm_range': {
        'x': '{} {}'.format(rvals[0], rvals[1]),
        'y': '{} {}'.format(rvals[2], rvals[3]),
    }}, identifier=identifier)
    mpfile.add_data_table(identifier, df, 'RSM')
    print('imported', os.path.basename(rsm_file))
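# `plot_options` above is forwarded to the plotly layout for the table's default
# graph. A sketch of a slightly richer options dict following plotly's layout schema
# (the 'title' key is an assumption, not taken from the source):
opts = {'yaxis': {'type': 'log', 'title': 'Intensity [counts]'}}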
def run(mpfile, **kwargs):
    input_file = mpfile.document['_hdata'].pop('input_file')
    zip_path = os.path.join(os.environ['HOME'], 'work', input_file)
    if not os.path.exists(zip_path):
        return 'Please upload', zip_path
    zip_file = ZipFile(zip_path, 'r')

    composition_table_dict = mpfile.document['_hdata']['composition_table']
    conc_funcs = get_concentration_functions(composition_table_dict)

    for info in zip_file.infolist():
        print(info.filename)
        d = RecursiveDict()

        # positions.x/y from filename, <scan-id>_<meas-element>_<X>_<Y>.csv
        element, x, y = os.path.splitext(info.filename)[0].rsplit('_', 4)
        d['position'] = RecursiveDict(
            (k, clean_value(v, 'mm')) for k, v in zip(['x', 'y'], [x, y])
        )

        # composition
        d['composition'] = RecursiveDict(
            (el, clean_value(f(x, y), convert_to_percent=True))
            for el, f in conc_funcs.items()
        )

        # identifier
        identifier = get_composition_from_string(''.join([
            '{}{}'.format(el, int(round(Decimal(comp.split()[0]))))
            for el, comp in d['composition'].items()
        ]))

        # load csv file
        try:
            csv = zip_file.read(info.filename)
        except KeyError:
            print('ERROR: Did not find %s in zip file' % info.filename)
            continue  # nothing to import for this member

        # read csv to pandas DataFrame and add to MPFile
        df = read_csv(csv)
        df = df[['Energy', 'XAS', 'XMCD']]

        # min and max
        d.rec_update(RecursiveDict(
            (y, RecursiveDict([('min', df[y].min()), ('max', df[y].max())]))
            for y in ['XAS', 'XMCD']
        ))

        # add data to MPFile
        mpfile.add_hierarchical_data(nest_dict(d, ['data']), identifier=identifier)
        mpfile.add_data_table(identifier, df, name=element)
def run(mpfile, **kwargs): input_file = mpfile.document["_hdata"].pop("input_file") zip_path = os.path.join(os.environ["HOME"], "work", input_file) if not os.path.exists(zip_path): return "Please upload", zip_path zip_file = ZipFile(zip_path, "r") composition_table_dict = mpfile.document["_hdata"]["composition_table"] conc_funcs = get_concentration_functions(composition_table_dict) for info in zip_file.infolist(): print info.filename d = RecursiveDict() # positions.x/y from filename, <scan-id>_<meas-element>_<X>_<Y>.csv element, x, y = os.path.splitext(info.filename)[0].rsplit("_", 4) d["position"] = RecursiveDict( (k, clean_value(v, "mm")) for k, v in zip(["x", "y"], [x, y])) # composition d["composition"] = RecursiveDict( (el, clean_value(f(x, y), convert_to_percent=True)) for el, f in conc_funcs.items()) # identifier identifier = get_composition_from_string("".join([ "{}{}".format(el, int(round(Decimal(comp.split()[0])))) for el, comp in d["composition"].items() ])) # load csv file try: csv = zip_file.read(info.filename) except KeyError: print "ERROR: Did not find %s in zip file" % info.filename # read csv to pandas DataFrame and add to MPFile df = read_csv(csv) df = df[["Energy", "XAS", "XMCD"]] # min and max d.rec_update( RecursiveDict( (y, RecursiveDict([("min", df[y].min()), ("max", df[y].max())])) for y in ["XAS", "XMCD"])) # add data to MPFile mpfile.add_hierarchical_data(nest_dict(d, ["data"]), identifier=identifier) mpfile.add_data_table(identifier, df, name=element)
def run(mpfile, **kwargs): input_file = mpfile.document['_hdata'].pop('input_file') zip_path = os.path.join(os.environ['HOME'], 'work', input_file) if not os.path.exists(zip_path): return 'Please upload', zip_path zip_file = ZipFile(zip_path, 'r') composition_table_dict = mpfile.document['_hdata']['composition_table'] conc_funcs = get_concentration_functions(composition_table_dict) for info in zip_file.infolist(): print info.filename d = RecursiveDict() # positions.x/y from filename, <scan-id>_<meas-element>_<X>_<Y>.csv element, x, y = os.path.splitext(info.filename)[0].rsplit('_', 4) d['position'] = RecursiveDict( (k, clean_value(v, 'mm')) for k, v in zip(['x', 'y'], [x, y])) # composition d['composition'] = RecursiveDict( (el, clean_value(f(x, y), convert_to_percent=True)) for el, f in conc_funcs.items()) # identifier identifier = get_composition_from_string(''.join([ '{}{}'.format(el, int(round(Decimal(comp.split()[0])))) for el, comp in d['composition'].items() ])) # load csv file try: csv = zip_file.read(info.filename) except KeyError: print 'ERROR: Did not find %s in zip file' % info.filename # read csv to pandas DataFrame and add to MPFile df = read_csv(csv) df = df[['Energy', 'XAS', 'XMCD']] # min and max d.rec_update( RecursiveDict( (y, RecursiveDict([('min', df[y].min()), ('max', df[y].max())])) for y in ['XAS', 'XMCD'])) # add data to MPFile mpfile.add_hierarchical_data(nest_dict(d, ['data']), identifier=identifier) mpfile.add_data_table(identifier, df, name=element)
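# For each CSV member, the hierarchical data added above ends up shaped roughly like
# this (a sketch; element symbols and numbers are illustrative, not from the source):
# {'data': {'position': {'x': '12 mm', 'y': '34 mm'},
#           'composition': {'Fe': '70%', 'Co': '20%', 'V': '10%'},
#           'XAS': {'min': 0.01, 'max': 1.23},
#           'XMCD': {'min': -0.05, 'max': 0.04}}}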
def run(mpfile, dup_check_test_site=True):
    from pymatgen import MPRester

    existing_identifiers = {}
    #for b in [False, True]:
    #    with DlrVietenRester(test_site=b) as mpr:
    #        for doc in mpr.query_contributions():
    #            existing_identifiers[doc['mp_cat_id']] = doc['_id']
    #    if not dup_check_test_site:
    #        break

    google_sheet = mpfile.document[mp_level01_titles[0]].pop('google_sheet')
    google_sheet += '/export?format=xlsx'
    df_dct = pd.read_excel(google_sheet, sheet_name=None)

    mpr = MPRester()
    update = 0
    for sheet in df_dct.keys():
        print(sheet)
        df = df_dct[sheet]

        sheet_split = sheet.split()
        composition = sheet_split[0]
        identifier = get_composition_from_string(composition)
        if len(sheet_split) > 1 and mp_id_pattern.match(sheet_split[1]):
            identifier = sheet_split[1]
        print('identifier = {}'.format(identifier))

        if 'CIF' in sheet_split:
            print('adding CIF ...')
            df.columns = [df.columns[0]] + [''] * (df.shape[1] - 1)
            cif = df.to_csv(na_rep='', index=False, sep='\t',
                            quoting=csv.QUOTE_NONE)
            mpfile.add_structure(cif, identifier=identifier, fmt='cif')
        else:
            print('adding data ...')
            mpfile.add_hierarchical_data({'composition': composition},
                                         identifier=identifier)
            mpfile.add_data_table(identifier, df, name='dH_dS')

        if identifier in existing_identifiers:
            cid = existing_identifiers[identifier]
            mpfile.insert_id(identifier, cid)
            update += 1

    print(len(mpfile.ids), 'contributions to submit.')
    if update > 0:
        print(update, 'contributions to update.')
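# `mp_id_pattern`, used above and in several other snippets, is a module-level regex
# in the source repo for Materials Project identifiers. A minimal stand-in for
# experimentation (an assumption; the real pattern may also cover other id prefixes):
import re
mp_id_pattern = re.compile(r'^mp-\d+$', re.IGNORECASE)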
def run(mpfile, **kwargs):
    from pymatgen import Structure

    reference_project = None
    input_data, input_keys, extra = \
        RecursiveDict(), RecursiveDict(), RecursiveDict()
    #input_urls = mpfile.document['_hdata'].pop('input_urls')
    input_urls = {
        'NUS': {
            "file": "http://www.2dmatpedia.org/static/db.json.gz",
            "detail": "http://www.2dmatpedia.org/2dmaterials/doc/{}"
        },
        'JARVIS': {
            "file": "https://www.ctcms.nist.gov/~knc6/jdft_{}.json.tgz",
            "detail": "https://www.ctcms.nist.gov/~knc6/jsmol/{}.html"
        }
    }

    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d')  # TODO 3d for Jarvis

        #dbfile = os.path.join(os.environ['HOME'], 'work', input_url.rsplit('/')[-1])
        dbfile = input_url.rsplit('/')[-1]
        if not os.path.exists(dbfile):
            print('downloading', dbfile, '...')
            urllib.request.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'source_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'meV/atom')),
            ('op_gap', ('ΔE|optB88vdW', 'meV/atom')),
            ('mbj_gap', ('ΔE|mbj', 'meV/atom')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []

        print(len(input_data[project]), 'materials loaded for', project)

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in set(identifiers):
        print(identifier)
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula)
                data[project]['id'] = input_urls[project]['detail'].format(
                    d[input_keys[project][0]])
                if input_keys[project][1] in d:
                    Ex = d[input_keys[project][1]]
                    if project == reference_project:
                        Ex *= 1000.
                    data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']),
                                     identifier=identifier)

        #r = db.contributions.update_one(
        #    {'identifier': identifier, 'project': 'jarvis_dft'},
        #    {'$set': {'content.data': mpfile.document[identifier]['data']}},
        #    upsert=True
        #)
        #print(r.matched_count, r.modified_count, r.upserted_id)
        doc = db.contributions.find_one(
            {'identifier': identifier, 'project': 'jarvis_dft'},
            {'_id': 1, 'content.structures': 1}
        )
        if 'structures' in doc['content']:
            print('structures already added for', identifier)
            continue
        print(doc['_id'])

        inserted_ids = []
        for project, structure in structures.items():
            try:
                mpfile.add_structure(structure, name=project,
                                     identifier=identifier)
                sdct = mpfile.document[identifier]['structures'][project]
                sdct.pop('@module')
                sdct.pop('@class')
                if sdct['charge'] is None:
                    sdct.pop('charge')
                sdct['identifier'] = identifier
                sdct['project'] = 'jarvis_dft'
                sdct['name'] = project
                sdct['cid'] = doc['_id']
                r = db.structures.insert_one(sdct)
                inserted_ids.append(r.inserted_id)
            except Exception as ex:
                print(str(ex))

        print(inserted_ids)
        r = db.contributions.update_one(
            {'_id': doc['_id']},
            {'$set': {'content.structures': inserted_ids}}
        )
        print(r.matched_count, r.modified_count)
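# `db` above is a module-level handle in the source script. A minimal sketch of how
# such a handle might be created with pymongo (database name and connection URI are
# hypothetical, not taken from the source):
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')  # hypothetical connection string
db = client['mpcontribs']  # exposes the db.contributions / db.structures used above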
def run(mpfile, **kwargs):
    from pymatgen import Structure

    reference_project = None
    input_data, input_keys, extra = \
        RecursiveDict(), RecursiveDict(), RecursiveDict()
    input_urls = mpfile.document['_hdata'].pop('input_urls')

    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d')  # TODO 3d for Jarvis

        dbfile = os.path.join(os.environ['HOME'], 'work',
                              input_url.rsplit('/')[-1])
        if not os.path.exists(dbfile):
            print('downloading', dbfile, '...')
            urllib.request.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'parent_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'meV/atom')),
            ('op_gap', ('ΔE|optB88vdW', 'meV/atom')),
            ('mbj_gap', ('ΔE|mbj', 'meV/atom')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []

        print(len(input_data[project]), 'materials loaded for', project)

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in identifiers:
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula)
                data[project]['id'] = input_urls[project]['detail'].format(
                    d[input_keys[project][0]])
                Ex = d[input_keys[project][1]]
                if project == reference_project:
                    Ex *= 1000.
                data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']),
                                     identifier=identifier)

        for project, structure in structures.items():
            name = '{}_{}'.format(data['formula'], project)
            try:
                mpfile.add_structure(structure, name=name,
                                     identifier=identifier)
            except Exception as ex:
                print(str(ex))
def run(mpfile, **kwargs):
    # TODO clone solar_perovskite if needed, abort if insufficient permissions
    import solar_perovskite
    from solar_perovskite.core import GetExpThermo
    from solar_perovskite.init.find_structures import FindStructures
    from solar_perovskite.init.import_data import Importdata
    from solar_perovskite.modelling.from_theo import EnthTheo

    input_file = mpfile.hdata.general['input_file']
    input_file = os.path.join(os.path.dirname(solar_perovskite.__file__),
                              input_file)
    table = read_csv(open(input_file, 'r').read().replace(';', ','))
    dct = super(Table, table).to_dict(orient='records', into=RecursiveDict)

    shomate = pd.read_csv(os.path.abspath(os.path.join(
        os.path.dirname(solar_perovskite.__file__), "datafiles", "shomate.csv"
    )), index_col=0)
    shomate_dct = RecursiveDict()
    for col in shomate.columns:
        key = col.split('.')[0]
        if key not in shomate_dct:
            shomate_dct[key] = RecursiveDict()
        d = shomate[col].to_dict(into=RecursiveDict)
        subkey = '{}-{}'.format(int(d.pop('low')), int(d.pop('high')))
        shomate_dct[key][subkey] = RecursiveDict(
            (k, clean_value(v, max_dgts=6)) for k, v in d.items()
        )
    mpfile.add_hierarchical_data(nest_dict(shomate_dct, ['shomate']))

    for row in dct:
        sample_number = int(row['sample_number'])
        identifier = row['closest phase MP (oxidized)'].replace('n.a.', '')
        if not identifier.startswith('mp-'):
            continue
        if not identifier:
            identifier = get_composition_from_string(
                row['composition oxidized phase'])
        print(identifier)

        print('add hdata ...')
        d = RecursiveDict()
        d['tolerance_factor'] = row['tolerance_factor']
        d['solid_solution'] = row['type of solid solution']
        d['oxidized_phase'] = RecursiveDict()
        d['oxidized_phase']['composition'] = row['composition oxidized phase']
        d['oxidized_phase']['crystal-structure'] = \
            row['crystal structure (fully oxidized)']
        d['reduced_phase'] = RecursiveDict()
        d['reduced_phase']['composition'] = row['composition reduced phase']
        d['reduced_phase']['closest-MP'] = \
            row['closest phase MP (reduced)'].replace('n.a.', '')
        d = nest_dict(d, ['data'])
        d['pars'] = get_fit_pars(sample_number)
        d['pars']['theo_compstr'] = row['theo_compstr']
        try:
            fs = FindStructures(compstr=row['theo_compstr'])
            theo_redenth = fs.find_theo_redenth()
            imp = Importdata()
            splitcomp = imp.split_comp(row['theo_compstr'])
            conc_act = imp.find_active(mat_comp=splitcomp)[1]
            et = EnthTheo(comp=row['theo_compstr'])
            dh_max, dh_min = et.calc_dh_endm()
            red_enth_mean_endm = (conc_act * dh_min) + ((1 - conc_act) * dh_max)
            difference = theo_redenth - red_enth_mean_endm
            d['pars']['dh_min'] = clean_value(dh_min + difference, max_dgts=8)
            d['pars']['dh_max'] = clean_value(dh_max + difference, max_dgts=8)
        except Exception as ex:
            print('error in dh_min/max!')
            print(str(ex))
        mpfile.add_hierarchical_data(d, identifier=identifier)

        print('add ΔH ...')
        exp_thermo = GetExpThermo(sample_number, plotting=False)
        enthalpy = exp_thermo.exp_dh()
        table = get_table(enthalpy, 'H')
        mpfile.add_data_table(identifier, table, name='enthalpy')

        print('add ΔS ...')
        entropy = exp_thermo.exp_ds()
        table = get_table(entropy, 'S')
        mpfile.add_data_table(identifier, table, name='entropy')

        print('add raw data ...')
        tga_results = os.path.join(os.path.dirname(solar_perovskite.__file__),
                                   'tga_results')
        for path in glob(os.path.join(
                tga_results, 'ExpDat_JV_P_{}_*.csv'.format(sample_number))):
            print(path.split('_{}_'.format(sample_number))[-1].split('.')[0],
                  '...')
            body = open(path, 'r').read()
            cols = ['Time [min]', 'Temperature [C]', 'dm [%]', 'pO2']
            table = read_csv(body, lineterminator=os.linesep, usecols=cols,
                             skiprows=5)
            table = table[cols].iloc[::100, :]
            # scale/shift for better graphs
            T, dm, p = [pd.to_numeric(table[col]) for col in cols[1:]]
            T_min, T_max = T.min(), T.max()
            dm_min, dm_max, p_max = dm.min(), dm.max(), p.max()
            rT, rdm = abs(T_max - T_min), abs(dm_max - dm_min)
            table[cols[2]] = (dm - dm_min) * rT / rdm
            table[cols[3]] = p * rT / p_max
            table.rename(columns={
                'dm [%]': '(dm [%] + {:.4g}) * {:.4g}'.format(-dm_min, rT / rdm),
                'pO2': 'pO₂ * {:.4g}'.format(rT / p_max)
            }, inplace=True)
            mpfile.add_data_table(identifier, table, name='raw')
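# Note on the scale/shift above: dm and pO2 are mapped onto the temperature range so
# all three traces share one y-axis, and the renamed headers record the transform.
# Recovering raw values from a scaled column (a sketch using the loop's variables):
# dm_raw = scaled_dm * rdm / rT + dm_min
# p_raw = scaled_p * p_max / rT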
def to_backgrid_dict(self):
    """Backgrid-conform dict from DataFrame"""
    # shorten global import times by importing django here
    import numpy as np
    from mpcontribs.io.core.utils import get_composition_from_string
    from pandas import MultiIndex
    import pymatgen.util as pmg_util
    from pymatgen.core.composition import CompositionError

    table = dict()
    nrows_max = 260
    nrows = self.shape[0]
    df = Table(self.head(n=nrows_max)) if nrows > nrows_max else self
    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

    if isinstance(df.index, MultiIndex):
        df.reset_index(inplace=True)

    table['columns'] = []
    table['rows'] = super(Table, df).to_dict(orient='records')

    for col_index, col in enumerate(list(df.columns)):
        cell_type = 'number'

        # avoid looping rows to minimize use of `df.iat` (time-consuming in 3d)
        if not col.startswith('level_') and col not in numeric_columns:
            is_url_column, prev_unit, old_col = True, None, col

            for row_index in range(df.shape[0]):
                cell = str(df.iat[row_index, col_index])
                cell_split = cell.split(' ', 1)

                if not cell or len(cell_split) == 1:  # empty cell or no space
                    is_url_column = bool(
                        is_url_column and (not cell or mp_id_pattern.match(cell)))
                    if is_url_column:
                        if cell:
                            value = 'https://materialsproject.org/materials/{}'.format(cell)
                            table['rows'][row_index][col] = value
                    elif cell:
                        try:
                            composition = get_composition_from_string(cell)
                            composition = pmg_util.string.unicodeify(composition)
                            table['rows'][row_index][col] = composition
                        except (CompositionError, ValueError, OverflowError):
                            try:
                                # https://stackoverflow.com/a/38020041
                                result = urlparse(cell)
                                if not all([result.scheme, result.netloc,
                                            result.path]):
                                    break
                                is_url_column = True
                            except:
                                break
                else:
                    value, unit = cell_split  # TODO convert cell_split[0] to float?
                    is_url_column = False
                    try:
                        float(value)  # unit is only a unit if value is number
                    except ValueError:
                        continue
                    table['rows'][row_index].pop(old_col)
                    if prev_unit is None:
                        prev_unit = unit
                        col = '{} [{}]'.format(col, unit)
                    table['rows'][row_index][col] = \
                        cell if prev_unit != unit else value

            cell_type = 'uri' if is_url_column else 'string'

        col_split = col.split('##')
        nesting = [col_split[0]] if len(col_split) > 1 else []
        table['columns'].append({
            'name': col, 'cell': cell_type, 'nesting': nesting, 'editable': 0
        })
        if len(col_split) > 1:
            table['columns'][-1].update({'label': '##'.join(col_split[1:])})
        if len(table['columns']) > 12:
            table['columns'][-1]['renderable'] = 0

    header = RecursiveDict()
    for idx, col in enumerate(table['columns']):
        if 'label' in col:
            k, sk = col['name'].split('##')
            sk_split = sk.split()
            if len(sk_split) == 2:
                d = {'name': sk_split[0], 'unit': sk_split[1], 'idx': idx}
                if k not in header:
                    header[k] = [d]
                else:
                    header[k].append(d)
            elif k in header:
                header.pop(k)

    for k, skl in header.items():
        units = [sk['unit'] for sk in skl]
        if units.count(units[0]) == len(units):
            for sk in skl:
                table['columns'][sk['idx']]['label'] = sk['name']
                table['columns'][sk['idx']]['nesting'][0] = \
                    '{} {}'.format(k, sk['unit'])

    return table
def to_backgrid_dict(self):
    """Backgrid-conform dict from DataFrame"""
    # shorten global import times by importing django here
    from mpcontribs.io.core.utils import get_composition_from_string
    from pandas import MultiIndex
    import pymatgen.util as pmg_util
    from pymatgen.core.composition import CompositionError

    table = dict()
    nrows_max = 260
    nrows = self.shape[0]
    df = Table(self.head(n=nrows_max)) if nrows > nrows_max else self

    if isinstance(df.index, MultiIndex):
        df.reset_index(inplace=True)

    table["columns"] = []
    table["rows"] = super(Table, df).to_dict(orient="records")

    for col_index, col in enumerate(list(df.columns)):
        cell_type = "number"

        # avoid looping rows to minimize use of `df.iat` (time-consuming in 3d)
        if not col.startswith("level_") and col[-1] != "]":
            is_url_column = True

            for row_index in range(df.shape[0]):
                cell = str(df.iat[row_index, col_index])
                is_url_column = bool(
                    is_url_column and (not cell or mp_id_pattern.match(cell)))

                if is_url_column:
                    if cell:
                        value = "https://materialsproject.org/materials/{}".format(cell)
                        table["rows"][row_index][col] = value
                elif cell:
                    try:
                        composition = get_composition_from_string(cell)
                        composition = pmg_util.string.unicodeify(composition)
                        table["rows"][row_index][col] = composition
                    except (CompositionError, ValueError, OverflowError):
                        try:
                            # https://stackoverflow.com/a/38020041
                            result = urlparse(cell)
                            if not all([result.scheme, result.netloc,
                                        result.path]):
                                break
                            is_url_column = True
                        except Exception:
                            break

            cell_type = "uri" if is_url_column else "string"

        col_split = col.split(".")
        nesting = [col_split[0]] if len(col_split) > 1 else []
        table["columns"].append({
            "name": col, "cell": cell_type, "nesting": nesting, "editable": 0
        })
        if len(col_split) > 1:
            table["columns"][-1].update({"label": ".".join(col_split[1:])})

    return table
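# Shape of the dict returned by the newer to_backgrid_dict above, for a table with a
# formula column and one nested, unit-bearing column (a sketch; values illustrative):
# {'columns': [{'name': 'formula', 'cell': 'string', 'nesting': [], 'editable': 0},
#              {'name': 'data.E [eV]', 'cell': 'number', 'nesting': ['data'],
#               'editable': 0, 'label': 'E [eV]'}],
#  'rows': [{'formula': 'GaAs', 'data.E [eV]': 1.42}, ...]}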