# module context (assumed): archieml-python and six below; RecursiveDict,
# nest_dict, normalize_root_level, read_csv, mp_level01_titles and MPFile are
# assumed to come from the surrounding mpcontribs-io package
import archieml
import six


def from_string(data):
    # use archieml-python parse to import data
    rdct = RecursiveDict(archieml.loads(data))
    rdct.rec_update()

    # post-process internal representation of file contents
    # (snapshot the keys since entries are popped and re-inserted below)
    for key in list(rdct.keys()):
        is_general, root_key = normalize_root_level(key)

        if is_general:
            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if mp_level01_titles[0] not in rdct:
                rdct.insert_before(
                    list(rdct.keys())[0],
                    (mp_level01_titles[0], RecursiveDict())
                )
            rdct.rec_update(nest_dict(
                rdct.pop(key), [mp_level01_titles[0], root_key]
            ))
        else:
            # normalize identifier key (pop & insert)
            # using rec_update since we're looping over all entries
            # also: support data in bare tables (marked-up only by
            #       root-level identifier) by nesting under 'data'
            value = rdct.pop(key)
            keys = [root_key]
            if isinstance(value, list):
                keys.append('table')
            rdct.rec_update(nest_dict(value, keys))

            # Note: CSV section is marked with 'data ' prefix during iterate()
            for k, v in rdct[root_key].iterate():
                if isinstance(k, six.string_types) and \
                        k.startswith(mp_level01_titles[1]):
                    # k = table name (incl. data prefix)
                    # v = csv string from ArchieML free-form arrays
                    table_name = k[len(mp_level01_titles[1] + '_'):]
                    pd_obj = read_csv(v)
                    rdct[root_key].pop(table_name)
                    rdct[root_key].rec_update(nest_dict(
                        pd_obj.to_dict(), [k]
                    ))
                    rdct[root_key].insert_default_plot_options(pd_obj, k)

            # convert CIF strings into pymatgen structures
            if mp_level01_titles[3] in rdct[root_key]:
                from pymatgen.io.cif import CifParser
                for name in list(rdct[root_key][mp_level01_titles[3]].keys()):
                    cif = rdct[root_key][mp_level01_titles[3]].pop(name)
                    parser = CifParser.from_string(cif)
                    structure = parser.get_structures(primitive=False)[0]
                    rdct[root_key][mp_level01_titles[3]].rec_update(nest_dict(
                        structure.as_dict(), [name]
                    ))

    return MPFile.from_dict(rdct)
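# Usage sketch (not from the original module; the ArchieML content and the
# `mp-1234` identifier below are hypothetical): free-form key/values under a
# root-level identifier get parsed and nested into the MPFile document.
if __name__ == '__main__':
    example = '\n'.join([
        '{mp-1234}',
        'formula: SiO2',
        'temperature: 300 K',
    ])
    mpf = from_string(example)
    print(mpf.ids)  # expected: ['mp-1234']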
# stdlib/third-party imports needed by this script; RecursiveDict, nest_dict,
# clean_value and get_composition_from_string are assumed to be imported from
# the mpcontribs-io helpers, and `db` to be a pymongo database handle defined
# at module level
import gzip
import json
import os
import tarfile
import urllib.request

from monty.json import MontyDecoder


def run(mpfile, **kwargs):
    from pymatgen import Structure

    reference_project = None
    input_data, input_keys, extra = \
        RecursiveDict(), RecursiveDict(), RecursiveDict()
    #input_urls = mpfile.document['_hdata'].pop('input_urls')
    input_urls = {
        'NUS': {
            "file": "http://www.2dmatpedia.org/static/db.json.gz",
            "detail": "http://www.2dmatpedia.org/2dmaterials/doc/{}"
        },
        'JARVIS': {
            "file": "https://www.ctcms.nist.gov/~knc6/jdft_{}.json.tgz",
            "detail": "https://www.ctcms.nist.gov/~knc6/jsmol/{}.html"
        }
    }

    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d')  # TODO 3d for Jarvis

        #dbfile = os.path.join(os.environ['HOME'], 'work', input_url.rsplit('/')[-1])
        dbfile = input_url.rsplit('/')[-1]
        if not os.path.exists(dbfile):
            print('downloading', dbfile, '...')
            urllib.request.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'source_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'meV/atom')),
            ('op_gap', ('ΔE|optB88vdW', 'meV/atom')),
            ('mbj_gap', ('ΔE|mbj', 'meV/atom')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []

        print(len(input_data[project]), 'materials loaded for', project)

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in set(identifiers):
        print(identifier)
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula
                    )
                data[project]['id'] = input_urls[project]['detail'].format(
                    d[input_keys[project][0]]
                )
                if input_keys[project][1] in d:
                    Ex = d[input_keys[project][1]]
                    if project == reference_project:
                        Ex *= 1000.
                    data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']),
                                     identifier=identifier)
        #r = db.contributions.update_one(
        #    {'identifier': identifier, 'project': 'jarvis_dft'},
        #    {'$set': {'content.data': mpfile.document[identifier]['data']}},
        #    upsert=True
        #)
        #print(r.matched_count, r.modified_count, r.upserted_id)
        doc = db.contributions.find_one(
            {'identifier': identifier, 'project': 'jarvis_dft'},
            {'_id': 1, 'content.structures': 1}
        )
        if 'structures' in doc['content']:
            print('structures already added for', identifier)
            continue
        print(doc['_id'])

        inserted_ids = []
        for project, structure in structures.items():
            try:
                mpfile.add_structure(structure, name=project,
                                     identifier=identifier)
                sdct = mpfile.document[identifier]['structures'][project]
                sdct.pop('@module')
                sdct.pop('@class')
                if sdct['charge'] is None:
                    sdct.pop('charge')
                sdct['identifier'] = identifier
                sdct['project'] = 'jarvis_dft'
                sdct['name'] = project
                sdct['cid'] = doc['_id']
                r = db.structures.insert_one(sdct)
                inserted_ids.append(r.inserted_id)
            except Exception as ex:
                print(str(ex))

        print(inserted_ids)
        r = db.contributions.update_one(
            {'_id': doc['_id']},
            {'$set': {'content.structures': inserted_ids}}
        )
        print(r.matched_count, r.modified_count)
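# For orientation, the per-identifier payload assembled above has roughly this
# shape before being wrapped under 'data' and attached via
# add_hierarchical_data (all values here are made up):
#
#   {'formula': 'MoS2',
#    'NUS':    {'id': 'http://www.2dmatpedia.org/2dmaterials/doc/<source_id>',
#               'Eₓ': '...'},
#    'JARVIS': {'id': 'https://www.ctcms.nist.gov/~knc6/jsmol/<jid>.html',
#               'Eₓ': '...', 'E': '...',
#               'ΔE|optB88vdW': '...', 'ΔE|mbj': '...'}}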
import codecs
import os
from abc import ABCMeta

import six

# RecursiveDict, nest_dict, Table, HierarchicalData, TabularData,
# GraphicalData, StructuralData, replacements, default_mpfile_path and
# mp_level01_titles are assumed to come from the surrounding
# mpcontribs.io.core package


class MPFileCore(six.with_metaclass(ABCMeta, object)):
    """Abstract Base Class for representing a MP Contribution File"""

    def __init__(self, data=RecursiveDict()):
        if isinstance(data, dict):
            self.document = RecursiveDict(data)
        else:
            raise ValueError("Need dict (or inherited class) to init MPFile.")
        # convert (most) OrderedDict's to RecursiveDict's
        self.document.rec_update()
        self.unique_mp_cat_ids = True
        self.max_contribs = 10

    def __getitem__(self, key):
        item = self.from_dict({key: self.document[key]})
        general = self.document.get(mp_level01_titles[0])
        if general:
            item.insert_general_section(
                self.from_dict({mp_level01_titles[0]: general})
            )
        return item

    @property
    def ids(self):
        return [
            k for k in self.document.keys()
            if k.lower() != mp_level01_titles[0]
        ]

    @property
    def hdata(self):
        return HierarchicalData(self.document)

    @property
    def tdata(self):
        return TabularData(self.document)

    @property
    def gdata(self):
        return GraphicalData(self.document)

    @property
    def sdata(self):
        return StructuralData(self.document)

    @classmethod
    def from_file(cls,
                  filename_or_file=default_mpfile_path.replace(".txt", "_in.txt")):
        """Reads a MPFile from a file.

        Args:
            filename_or_file (str or file): name of file or file
                containing contribution data.

        Returns:
            MPFile object.
        """
        f = (open(filename_or_file)
             if isinstance(filename_or_file, six.string_types)
             else filename_or_file)
        return cls.from_string(f.read())

    @classmethod
    def from_dict(cls, data=RecursiveDict()):
        return cls(data=data)

    @classmethod
    def from_contribution(cls, contrib):
        """construct MPFile from contribution (see rest.adapter.submit_contribution)"""
        if "identifier" not in contrib or "content" not in contrib:
            raise ValueError("Dict not in contribution-style format")
        recdict = RecursiveDict({contrib["identifier"]: contrib["content"]})
        return cls.from_dict(recdict)

    def write_file(self,
                   filename=default_mpfile_path.replace(".txt", "_out.txt"),
                   **kwargs):
        """Writes MPFile to a file. The supported kwargs are the same as those
        for the MPFile.get_string method and are passed through directly."""
        with codecs.open(filename, encoding="utf-8", mode="w") as f:
            file_str = self.get_string(**kwargs) + "\n"
            f.write(file_str)
        print("{} ({:.3f}MB) written".format(
            filename, os.path.getsize(filename) / 1024.0 / 1024.0
        ))

    def get_number_of_lines(self, **kwargs):
        return len(self.get_string(**kwargs).split("\n"))

    def split(self):
        general_mpfile = (
            self.pop_first_section()
            if mp_level01_titles[0] in self.document.keys() else None
        )
        if not self.document:
            raise ValueError(
                "No contributions in MPFile! Either the file is"
                " empty or only contains shared (meta-)data not"
                " correlated to core identifier."
            )
        while True:
            try:
                mpfile_single = self.pop_first_section()
                mpid_orig = mpfile_single.ids[0]
                if "--" in mpid_orig:
                    mpid = mpid_orig.split("--")[0]
                    mpfile_single.document.rec_update(
                        nest_dict(mpfile_single.document.pop(mpid_orig), [mpid])
                    )
                if general_mpfile is not None:
                    mpfile_single.insert_general_section(general_mpfile)
                yield mpfile_single
            except KeyError:
                break

    def get_identifiers(self):
        """list of materials/composition identifiers as tuples w/ contribution IDs"""
        return [
            (k, self.document[k].get("cid", None))
            for k in self.document
            if k.lower() != mp_level01_titles[0]
        ]

    def pop_first_section(self):
        item = self.document.popitem(last=False)
        return self.from_dict(RecursiveDict([item]))

    def insert_general_section(self, general_mpfile):
        """insert general section from `general_mpfile` into this MPFile"""
        if general_mpfile is None:
            return
        general_title = mp_level01_titles[0]
        general_data = general_mpfile.document[general_title]
        root_key = list(self.document.keys())[0]
        for key, value in general_data.items():
            if key in self.document[root_key]:
                self.document.rec_update(nest_dict(value, [root_key, key]))
            else:
                self.document[root_key][key] = value
        for key in reversed(general_data.keys()):
            self.document[root_key].move_to_end(key, last=False)

    def get_unique_mp_cat_id(self, mp_cat_id):
        if not self.unique_mp_cat_ids or mp_cat_id in mp_level01_titles:
            return mp_cat_id
        mp_cat_id_idx = len([i for i in self.ids if i.startswith(mp_cat_id)])
        if mp_cat_id_idx == 0:
            return mp_cat_id
        return "{}--{}".format(mp_cat_id, mp_cat_id_idx)

    def concat(self, mpfile):
        """concatenate single-section MPFile with this MPFile"""
        try:
            if len(mpfile.document) > 1:
                raise ValueError(
                    "concatenation only possible with single section files"
                )
        except AttributeError:
            raise ValueError("Provide a MPFile to concatenate")
        mp_cat_id = list(mpfile.document.keys())[0]
        general_title = mp_level01_titles[0]
        if general_title in mpfile.document[mp_cat_id]:
            general_data = mpfile.document[mp_cat_id].pop(general_title)
            if general_title not in self.document:
                self.document.rec_update(
                    nest_dict(general_data, [general_title])
                )
        self.document.rec_update(
            nest_dict(mpfile.document.pop(mp_cat_id),
                      [self.get_unique_mp_cat_id(mp_cat_id)])
        )

    def insert_top(self, mp_cat_id, key, value):
        """insert value for `mp_cat_id` as `key: <value>` at top"""
        self.document[mp_cat_id][key] = str(value)
        self.document[mp_cat_id].move_to_end(key, last=False)

    def add_data_table(self, identifier, dataframe, name, plot_options=None):
        """add a datatable to the root-level section

        Args:
            identifier (str): MP category ID (`mp_cat_id`)
            dataframe (pandas.DataFrame): tabular data as Pandas DataFrame
            name (str): table name, optional if only one table in section
            plot_options (dict): options for according plotly graph
        """
        # TODO: optional table name, required if multiple tables per root-level section
        name = "".join([replacements.get(c, c) for c in name])
        self.document.rec_update(
            nest_dict(Table(dataframe).to_dict(), [identifier, name])
        )
        self.document[identifier].insert_default_plot_options(
            dataframe, name, update_plot_options=plot_options
        )

    def add_hierarchical_data(self, dct, identifier=mp_level01_titles[0]):
        if len(self.ids) >= self.max_contribs:
            raise StopIteration(
                "Reached max. number of contributions in MPFile"
            )
        self.document.rec_update(nest_dict(RecursiveDict(dct), [identifier]))

    def add_structure(self, source, name=None, identifier=None, fmt=None):
        """add a structure to the mpfile"""
        from pymatgen import Structure, MPRester

        if isinstance(source, Structure):
            structure = source
        elif isinstance(source, dict):
            structure = Structure.from_dict(source)
        elif os.path.exists(source):
            structure = Structure.from_file(source, sort=True)
        elif isinstance(source, six.string_types):
            if fmt is None:
                raise ValueError("Need fmt to get structure from string!")
            structure = Structure.from_str(source, fmt, sort=True)
        else:
            raise ValueError(source, "not supported!")

        if name is not None:
            if not isinstance(name, six.string_types):
                raise ValueError("structure name needs to be a string")
            elif "." in name:
                raise ValueError("structure name cannot contain dots (.)")

        mpr = MPRester()
        if not mpr.api_key:
            raise ValueError(
                "API key not set. Run `pmg config --add PMG_MAPI_KEY <USER_API_KEY>`."
            )
        matched_mpids = mpr.find_structure(structure)
        formula = get_composition_from_string(structure.composition.formula)
        if not matched_mpids:
            if identifier is None:
                identifier = formula
                print(
                    "Structure not found in MP! Please submit via MPComplete to "
                    "obtain mp-id or manually choose an anchor mp-id! Continuing "
                    "with {} as identifier!".format(identifier)
                )
            else:
                print("Structure not found in MP! Forcing {} as identifier!"
                      .format(identifier))
        elif identifier is None:
            identifier = matched_mpids[0]
            if len(matched_mpids) > 1:
                print("Multiple matching structures found in MP. Using",
                      identifier)
        elif identifier not in matched_mpids:
            msg = "Structure does not match {} but instead {}!".format(
                identifier, matched_mpids
            )
            raise ValueError(msg)

        idx = len(
            self.document.get(identifier, {}).get(mp_level01_titles[3], {})
        )
        sub_key = formula if name is None else name
        if sub_key in self.document.get(identifier, {}).get(mp_level01_titles[3], {}):
            sub_key += "_{}".format(idx)
        self.document.rec_update(
            nest_dict(structure.as_dict(),
                      [identifier, mp_level01_titles[3], sub_key])
        )
        return identifier

    def __repr__(self):
        return self.get_string(df_head_only=True)

    def __str__(self):
        return self.get_string(df_head_only=True)

    def _ipython_display_(self):
        from IPython.display import display_html
        display_html(self.hdata)
        display_html(self.tdata)
        display_html(self.gdata)
        display_html(self.sdata)

    # ----------------------------
    # Override these in subclasses
    # ----------------------------
    @staticmethod
    def from_string(data):
        """Reads a MPFile from a string containing contribution data."""
        return MPFileCore()

    def get_string(self, df_head_only=False):
        """Returns a string to be written as a file"""
        return repr(self.document)
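# Usage sketch for the MPFileCore API (assumes a concrete subclass `MPFile`
# implementing from_string/get_string; identifier and data are hypothetical):
if __name__ == '__main__':
    mpf = MPFile()
    mpf.add_hierarchical_data(
        {'data': {'formula': 'SiO2', 'band_gap': '5.6 eV'}},
        identifier='mp-1234'
    )
    print(mpf.ids)                # ['mp-1234']
    print(mpf.get_identifiers())  # [('mp-1234', None)]
    mpf.write_file('mp-1234_out.txt')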
# module context (assumed): `loads` is archieml.loads; `Quantity` is propnet's
# Quantity, imported in a try/except and left as None when propnet is
# unavailable (hence the `Quantity is not None` guard below)
def from_string(data):
    # use archieml-python parse to import data
    rdct = RecursiveDict(loads(data))
    rdct.rec_update()

    # post-process internal representation of file contents
    for key in list(rdct.keys()):
        is_general, root_key = normalize_root_level(key)

        if is_general:
            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if mp_level01_titles[0] not in rdct:
                rdct[mp_level01_titles[0]] = RecursiveDict()
                rdct.move_to_end(mp_level01_titles[0], last=False)

        # normalize identifier key (pop & insert)
        # using rec_update since we're looping over all entries
        # also: support data in bare tables (marked-up only by
        #       root-level identifier) by nesting under 'data'
        value = rdct.pop(key)
        keys = [mp_level01_titles[0]] if is_general else []
        keys.append(root_key)
        if isinstance(value, list):
            keys.append("table")
        rdct.rec_update(nest_dict(value, keys))

        # reference to section to iterate or parse as CIF
        section = (
            rdct[mp_level01_titles[0]][root_key]
            if is_general else rdct[root_key]
        )

        # iterate to find CSV sections to parse
        # also parse propnet quantities
        if isinstance(section, dict):
            scope = []
            for k, v in section.iterate():
                level, key = k
                key = "".join([replacements.get(c, c) for c in key])
                level_reduction = bool(level < len(scope))
                if level_reduction:
                    del scope[level:]
                if v is None:
                    scope.append(key)
                elif isinstance(v, list) and isinstance(v[0], dict):
                    table = ""
                    for row_dct in v:
                        table = "\n".join([table, row_dct["value"]])
                    pd_obj = read_csv(table)
                    d = nest_dict(pd_obj.to_dict(), scope + [key])
                    section.rec_update(d, overwrite=True)
                    if not is_general and level == 0:
                        section.insert_default_plot_options(pd_obj, key)
                elif (Quantity is not None
                      and isinstance(v, six.string_types)
                      and " " in v):
                    quantity = Quantity.from_key_value(key, v)
                    # TODO quantity.symbol.name
                    d = nest_dict(quantity.as_dict(), scope + [key])
                    section.rec_update(d, overwrite=True)

        # convert CIF strings into pymatgen structures
        if mp_level01_titles[3] in section:
            from pymatgen.io.cif import CifParser
            for name in list(section[mp_level01_titles[3]].keys()):
                cif = section[mp_level01_titles[3]].pop(name)
                parser = CifParser.from_string(cif)
                structure = parser.get_structures(primitive=False)[0]
                section[mp_level01_titles[3]].rec_update(
                    nest_dict(structure.as_dict(), [name])
                )

    return MPFile.from_dict(rdct)
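# The CSV/quantity parsing above relies on RecursiveDict.iterate() emitting
# ((level, key), value) pairs depth-first. A self-contained sketch of the same
# scope bookkeeping (hypothetical input stream, no mpcontribs imports needed):
def track_scopes(items):
    """items: iterable of ((level, key), value) pairs in depth-first order;
    returns (full_scope_path, value) for every leaf value."""
    scope, leaves = [], []
    for (level, key), value in items:
        if level < len(scope):
            del scope[level:]  # moved back up a level: drop deeper scopes
        if value is None:      # a subsection header opens a new scope
            scope.append(key)
        else:                  # a leaf: record its fully qualified path
            leaves.append((scope + [key], value))
    return leaves

# track_scopes([((0, 'data'), None), ((1, 'T'), '300 K'), ((0, 'notes'), 'x')])
# -> [(['data', 'T'], '300 K'), (['notes'], 'x')]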
# requests, six, pandas and unidecode are needed here; DibbsRester (the
# project-specific MPContribs rester), RecursiveDict and mp_level01_titles
# are assumed to be imported from the surrounding package
import requests
import six
from pandas import read_excel, notnull
from unidecode import unidecode


def run(mpfile, nmax=None, dup_check_test_site=True):
    existing_mpids = {}
    for b in [False, True]:
        with DibbsRester(test_site=b) as mpr:
            for doc in mpr.query_contributions(criteria=mpr.dibbs_query):
                existing_mpids[doc['mp_cat_id']] = doc['_id']
        if not dup_check_test_site:
            break

    general = mpfile.document[mp_level01_titles[0]]
    input_file = general.pop('input_file')
    df = read_excel(input_file)
    columns_map = RecursiveDict([
        (v, k) for k, v in general.pop('columns_map').items()
    ])
    columns = list(columns_map.keys())
    df = df[columns]
    df = df[notnull(df[columns[-1]]) & notnull(df[columns[1]])]

    mpfile.add_hierarchical_data({'title': 'DIBBS - 27Al NMR'})

    count, skipped, update = 0, 0, 0
    for idx, row in df.iterrows():
        url = row[columns[-1]]
        if not url.startswith('http'):
            continue

        # hierarchical data
        d = RecursiveDict()
        for col in columns[:4]:
            d[columns_map[col]] = unidecode(row[col]) \
                if isinstance(row[col], six.string_types) else row[col]
        if d['name'] in [
            'AlPO4 Tridymite (AlPO4-t)', 'Basic aluminum sulfate',
            'Cryolite', 'berlinite(AlPO4-Q)'
        ]:
            continue
        d['data'] = RecursiveDict()
        for col in columns[4:8]:
            if notnull(row[col]):
                value = u'{}'.format(row[col])
                if col == columns[4]:
                    value += ' ppm'
                elif col == columns[6]:
                    value += ' MHz'
                elif col == columns[7]:
                    value = ' '.join([value[:-1], value[-1]])
            else:
                value = u''
            d['data'][columns_map[col]] = value

        # structure
        if url.startswith('https://materialsproject.org'):
            mpid = url.split('/')[-2]
        else:
            #print('retrieve cif and match to MP structure ...')
            d[columns_map[columns[-1]]] = url
            f = requests.get(url)
            try:
                mpid = mpfile.add_structure(f.text, name=d['name'], fmt='cif')
            except ValueError as ex:
                print(d['name'], str(ex))
                continue
            if nmax is not None and mpid in existing_mpids:
                item = mpfile.document.popitem(last=True)
                print('removed duplicate', mpid)

        if nmax is not None and mpid in existing_mpids:
            print('skipping', mpid)
            skipped += 1
            continue  # skip duplicates

        mpfile.add_hierarchical_data(d, identifier=mpid)
        print('added {} ({})'.format(d['name'], mpid))

        if mpid in existing_mpids:
            cid = existing_mpids[mpid]
            mpfile.insert_id(mpid, cid)
            update += 1
        if nmax is not None and count >= nmax - 1:
            break
        count += 1

    print(len(mpfile.ids), 'mp-ids to submit.')
    if nmax is None and update > 0:
        print(update, 'mp-ids to update.')
    if nmax is not None and skipped > 0:
        print(skipped, 'duplicates to skip.')
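# Shape of the 'columns_map' entry this script pops from the MPFile's general
# section (spreadsheet headers are hypothetical): it maps canonical keys to
# Excel column headers, and is inverted above so rows can be looked up by
# header, e.g.
#
#   columns_map = {'name': 'Compound', 'frequency': 'Larmor Frequency', ...}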
# assumed module-level imports (as in the sibling script above): os, gzip,
# json, tarfile, urllib.request, MontyDecoder, plus the mpcontribs-io helpers
# RecursiveDict, nest_dict, clean_value and get_composition_from_string
def run(mpfile, **kwargs):
    from pymatgen import Structure

    reference_project = None
    input_data, input_keys, extra = \
        RecursiveDict(), RecursiveDict(), RecursiveDict()
    input_urls = mpfile.document['_hdata'].pop('input_urls')

    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d')  # TODO 3d for Jarvis

        dbfile = os.path.join(os.environ['HOME'], 'work',
                              input_url.rsplit('/')[-1])
        if not os.path.exists(dbfile):
            print('downloading', dbfile, '...')
            urllib.request.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'parent_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'meV/atom')),
            ('op_gap', ('ΔE|optB88vdW', 'meV/atom')),
            ('mbj_gap', ('ΔE|mbj', 'meV/atom')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []

        print(len(input_data[project]), 'materials loaded for', project)

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in identifiers:
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula
                    )
                data[project]['id'] = input_urls[project]['detail'].format(
                    d[input_keys[project][0]]
                )
                Ex = d[input_keys[project][1]]
                if project == reference_project:
                    Ex *= 1000.
                data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']),
                                     identifier=identifier)

        for project, structure in structures.items():
            name = '{}_{}'.format(data['formula'], project)
            try:
                mpfile.add_structure(structure, name=name,
                                     identifier=identifier)
            except Exception as ex:
                print(str(ex))
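# Sketch of the '_hdata.input_urls' config this variant expects in the MPFile
# (shapes mirror the hard-coded dict in the sibling script above):
#
#   input_urls:
#     NUS:
#       file: http://www.2dmatpedia.org/static/db.json.gz
#       detail: http://www.2dmatpedia.org/2dmaterials/doc/{}
#     JARVIS:
#       file: https://www.ctcms.nist.gov/~knc6/jdft_{}.json.tgz
#       detail: https://www.ctcms.nist.gov/~knc6/jsmol/{}.html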