def run(mpfile, nmax=None):
    # assumes module-level imports: os, copy, tqdm, msp, and the mpcontribs
    # helpers (get_translate, normalize_root_level, nest_dict, treat_xmcd,
    # xas_process)
    # print(json.dumps(mpfile.document, indent=4))
    datasource = mpfile.document['general'].pop('Datasource')
    subdir = os.path.abspath(os.path.join(
        datasource['work_dir'], datasource['directory']
    ))

    # TODO Potentially we have to insert a preprocessing step, probably in msp
    scandata_f = msp.read_scans(subdir, datacounter="Counter 1")
    scan_groups = scandata_f.groupby(datasource['group_by'].split())
    process_template = mpfile.document['general'].pop('process_template')
    translate = get_translate(datasource['work_dir'])
    keys = sorted(scan_groups.groups.keys())  # dict keys aren't sortable in-place on Py3

    for i, g in enumerate(tqdm(keys, leave=True)):
        # TODO: Group information is saved into the output. Rethink?
        comp, sx, sy = translate(g)
        composition = normalize_root_level(comp)[1]
        process_template_copy = copy.deepcopy(process_template)
        process_template_copy['position'] = {'x': sx, 'y': sy}
        mpfile.document.rec_update(nest_dict(
            process_template_copy, [composition, 'process_chain']
        ))
        sg = scan_groups.get_group(g)
        for process_chain_name in process_template.keys():
            scan_params = mpfile.document[composition]['process_chain'][process_chain_name]
            xmcd_frame = treat_xmcd(sg, scan_params, xas_process.process_dict)
            mpfile.add_data_table(
                composition, xmcd_frame[['Energy', 'XAS', 'XMCD']],
                '_'.join(['data', process_chain_name])
            )
        if nmax is not None and i > nmax:
            break
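The scan-group handling above is plain pandas: group a DataFrame by one or more columns, then pull each group back out by its key. A minimal standalone sketch of that pattern (column names hypothetical):

import pandas as pd

df = pd.DataFrame({
    'sample': ['A', 'A', 'B'],       # hypothetical grouping column
    'energy': [770.0, 771.0, 770.0],
    'counts': [1.2, 1.4, 0.9],
})
groups = df.groupby('sample')
for key in sorted(groups.groups.keys()):
    sg = groups.get_group(key)       # sub-frame for one scan group
    print(key, len(sg))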
def run(mpfile, **kwargs):

    url = mpfile.hdata.general['url']
    dbfile = os.path.join(os.environ['HOME'], 'work', url.rsplit('/')[-1])

    if not os.path.exists(dbfile):
        # urllib.URLopener was removed in Python 3; use urlretrieve instead
        urllib.request.urlretrieve(url, dbfile)

    con = ase.db.connect(dbfile)
    nr_mpids = con.count(selection='mpid')

    for idx, row in enumerate(con.select('mpid')):
        if idx and not idx % 10:
            print('added', idx, '/', nr_mpids, 'materials')
        mpid = 'mp-' + str(row.mpid)
        d = RecursiveDict()

        # kohn-sham band gap
        d[u'ΔE-KS'] = RecursiveDict([
            ('indirect', clean_value(row.gllbsc_ind_gap - row.gllbsc_disc,
                                     'eV')),
            ('direct', clean_value(row.gllbsc_dir_gap - row.gllbsc_disc, 'eV'))
        ])

        # derivative discontinuity
        d['C'] = clean_value(row.gllbsc_disc, 'eV')

        # quasi particle band gap
        d[u'ΔE-QP'] = RecursiveDict([
            ('indirect', clean_value(row.gllbsc_ind_gap, 'eV')),
            ('direct', clean_value(row.gllbsc_dir_gap, 'eV'))
        ])

        mpfile.add_hierarchical_data(nest_dict(d, ['data']), identifier=mpid)
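For reference, the bookkeeping above follows the GLLB-SC convention: the quasiparticle gap is stored directly and the Kohn-Sham gap is recovered by subtracting the derivative discontinuity. A minimal sketch of the same row access, assuming the field names used above (database filename hypothetical):

import ase.db

con = ase.db.connect('mp_gllbsc.db')     # hypothetical filename
for row in con.select('mpid'):
    qp_ind = row.gllbsc_ind_gap          # quasiparticle indirect gap [eV]
    ks_ind = qp_ind - row.gllbsc_disc    # Kohn-Sham indirect gap [eV]
    print('mp-' + str(row.mpid), ks_ind)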
Example #4
    def add_structure(self, source, name=None, identifier=None, fmt=None):
        """add a structure to the mpfile"""
        from pymatgen import Structure, MPRester

        if isinstance(source, Structure):
            structure = source
        elif isinstance(source, dict):
            structure = Structure.from_dict(source)
        elif os.path.exists(source):
            structure = Structure.from_file(source, sort=True)
        elif isinstance(source, six.string_types):
            if fmt is None:
                raise ValueError("Need fmt to get structure from string!")
            structure = Structure.from_str(source, fmt, sort=True)
        else:
            raise ValueError(source, "not supported!")

        if name is not None:
            if not isinstance(name, six.string_types):
                raise ValueError("structure name needs to be a string")
            elif "." in name:
                raise ValueError("structure name cannot contain dots (.)")

        mpr = MPRester()
        if not mpr.api_key:
            raise ValueError(
                "API key not set. Run `pmg config --add PMG_MAPI_KEY <USER_API_KEY>`."
            )
        matched_mpids = mpr.find_structure(structure)
        formula = get_composition_from_string(structure.composition.formula)
        if not matched_mpids:
            if identifier is None:
                identifier = formula
                print(
                    "Structure not found in MP! Please submit via MPComplete to "
                    "obtain mp-id or manually choose an anchor mp-id! Continuing "
                    "with {} as identifier!".format(identifier))
            else:
                print("Structure not found in MP! Forcing {} as identifier!".
                      format(identifier))
        elif identifier is None:
            identifier = matched_mpids[0]
            if len(matched_mpids) > 1:
                print("Multiple matching structures found in MP. Using",
                      identifier)
        elif identifier not in matched_mpids:
            msg = "Structure does not match {} but instead {}!".format(
                identifier, matched_mpids)
            raise ValueError(msg)

        idx = len(
            self.document.get(identifier, {}).get(mp_level01_titles[3], {}))
        sub_key = formula if name is None else name
        if sub_key in self.document.get(identifier,
                                        {}).get(mp_level01_titles[3], {}):
            sub_key += "_{}".format(idx)
        self.document.rec_update(
            nest_dict(structure.as_dict(),
                      [identifier, mp_level01_titles[3], sub_key]))
        return identifier
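A hedged usage sketch for add_structure, assuming a configured PMG_MAPI_KEY and that MPFile is importable from the mpcontribs-io package (the import path and the POSCAR file are assumptions):

from mpcontribs.io.archieml.mpfile import MPFile  # assumed import path
from pymatgen import Structure

mpf = MPFile()
structure = Structure.from_file('POSCAR')         # hypothetical input file
identifier = mpf.add_structure(structure, name='my_phase')
print('anchored contribution at', identifier)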
Example #5
    def concat(self, mpfile):
        """concatenate single-section MPFile with this MPFile"""
        try:
            if len(mpfile.document) > 1:
                raise ValueError(
                    'concatenation only possible with single section files')
        except AttributeError:
            raise ValueError('Provide a MPFile to concatenate')
        mp_cat_id = list(mpfile.document.keys())[0]
        general_title = mp_level01_titles[0]
        if general_title in mpfile.document[mp_cat_id]:
            general_data = mpfile.document[mp_cat_id].pop(general_title)
            if general_title not in self.document:
                self.document.rec_update(
                    nest_dict(general_data, [general_title]))
        self.document.rec_update(nest_dict(
            mpfile.document.pop(mp_cat_id),
            [self.get_unique_mp_cat_id(mp_cat_id)]
        ))
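A possible way to use concat, merging single-section MPFiles into one (file names hypothetical; MPFile.from_file assumed available on the same class):

combined = MPFile()
for fname in ['contrib_a.txt', 'contrib_b.txt']:  # hypothetical files
    combined.concat(MPFile.from_file(fname))
print(combined.ids)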
Example #6
    def from_string(data):
        # use archieml-python parse to import data
        rdct = RecursiveDict(archieml.loads(data))
        rdct.rec_update()
        # post-process internal representation of file contents
        for key in list(rdct.keys()):  # list() since keys are popped below
            is_general, root_key = normalize_root_level(key)
            if is_general:
                # make part of shared (meta-)data, i.e. nest under `general` at
                # the beginning of the MPFile
                if mp_level01_titles[0] not in rdct:
                    rdct.insert_before(list(rdct.keys())[0],
                                       (mp_level01_titles[0], RecursiveDict()))
                rdct.rec_update(
                    nest_dict(rdct.pop(key), [mp_level01_titles[0], root_key]))
            else:
                # normalize identifier key (pop & insert)
                # using rec_update since we're looping over all entries
                # also: support data in bare tables (marked-up only by
                #       root-level identifier) by nesting under 'data'
                value = rdct.pop(key)
                keys = [root_key]
                if isinstance(value, list):
                    keys.append('table')
                rdct.rec_update(nest_dict(value, keys))
                # Note: CSV section is marked with 'data ' prefix during iterate()
                for k, v in rdct[root_key].iterate():
                    if isinstance(k, six.string_types) and \
                       k.startswith(mp_level01_titles[1]):
                        # k = table name (incl. data prefix)
                        # v = csv string from ArchieML free-form arrays
                        table_name = k[len(mp_level01_titles[1] + '_'):]
                        pd_obj = read_csv(v)
                        rdct[root_key].pop(table_name)
                        rdct[root_key].rec_update(
                            nest_dict(pd_obj.to_dict(), [k]))
                        rdct[root_key].insert_default_plot_options(pd_obj, k)
                # convert CIF strings into pymatgen structures
                if mp_level01_titles[3] in rdct[root_key]:
                    from pymatgen.io.cif import CifParser
                    for name in list(rdct[root_key][mp_level01_titles[3]].keys()):
                        cif = rdct[root_key][mp_level01_titles[3]].pop(name)
                        parser = CifParser.from_string(cif)
                        structure = parser.get_structures(primitive=False)[0]
                        rdct[root_key][mp_level01_titles[3]].rec_update(
                            nest_dict(structure.as_dict(), [name]))
        return MPFile.from_dict(rdct)
def run(mpfile, **kwargs):

    input_file = mpfile.document['_hdata'].pop('input_file')
    zip_path = os.path.join(os.environ['HOME'], 'work', input_file)
    if not os.path.exists(zip_path):
        return 'Please upload', zip_path
    zip_file = ZipFile(zip_path, 'r')

    composition_table_dict = mpfile.document['_hdata']['composition_table']
    conc_funcs = get_concentration_functions(composition_table_dict)

    for info in zip_file.infolist():
        print(info.filename)
        d = RecursiveDict()

        # positions.x/y from filename, <scan-id>_<meas-element>_<X>_<Y>.csv
        element, x, y = os.path.splitext(info.filename)[0].rsplit('_', 4)
        d['position'] = RecursiveDict(
            (k, clean_value(v, 'mm'))
            for k, v in zip(['x', 'y'], [x, y])
        )

        # composition
        d['composition'] = RecursiveDict(
            (el, clean_value(f(x, y), convert_to_percent=True))
            for el, f in conc_funcs.items()
        )

        # identifier
        identifier = get_composition_from_string(''.join([
            '{}{}'.format(el, int(round(Decimal(comp.split()[0]))))
            for el, comp in d['composition'].items()
        ]))

        # load csv file
        try:
            csv = zip_file.read(info.filename)
        except KeyError:
            print('ERROR: Did not find %s in zip file' % info.filename)
            continue  # skip member; csv would be undefined below

        # read csv to pandas DataFrame and add to MPFile
        df = read_csv(csv)
        df = df[['Energy', 'XAS', 'XMCD']]

        # min and max
        d.rec_update(RecursiveDict(
            (y, RecursiveDict([
                ('min', df[y].min()), ('max', df[y].max())
            ])) for y in ['XAS', 'XMCD']
        ))

        # add data to MPFile
        mpfile.add_hierarchical_data(nest_dict(d, ['data']), identifier=identifier)
        mpfile.add_data_table(identifier, df, name=element)
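The loader above hinges on two small patterns: iterating a ZIP archive's members and parsing metadata out of each member's filename. A standalone illustration (archive name and filename layout are assumptions):

import os
from zipfile import ZipFile

with ZipFile('scans.zip') as zf:             # hypothetical archive
    for info in zf.infolist():
        raw = zf.read(info.filename)         # bytes of one member
        stem = os.path.splitext(info.filename)[0]
        element, x, y = stem.rsplit('_', 2)  # e.g. 'Fe_1.5_2.0.csv'
        print(element, x, y, len(raw))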
Example #10
    def insert_general_section(self, general_mpfile):
        """insert general section from `general_mpfile` into this MPFile"""
        if general_mpfile is None:
            return
        general_title = mp_level01_titles[0]
        general_data = general_mpfile.document[general_title]
        root_key = list(self.document.keys())[0]
        for key, value in general_data.items():
            if key in self.document[root_key]:
                self.document.rec_update(nest_dict(value, [root_key, key]))
            else:
                self.document[root_key][key] = value
        for key in reversed(general_data.keys()):
            self.document[root_key].move_to_end(key, last=False)
Example #11
    def __init__(self, document):
        from pymatgen import Structure
        super(HierarchicalData, self).__init__()
        scope = []
        for key, value in document.iterate():
            if isinstance(value, (Table, Structure)):
                continue
            level, key = key
            level_reduction = bool(level < len(scope))
            if level_reduction:
                del scope[level:]
            if value is None:
                scope.append(key)
            elif mp_level01_titles[2] not in scope:
                self.rec_update(nest_dict({key: value}, scope))
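The constructor above flattens an indented document with a scope stack: each (level, key) pair either deepens the current scope or truncates it back to `level`. The same bookkeeping in isolation, on plain tuples:

# standalone sketch of the scope-stack walk (hypothetical input)
items = [(0, 'data', None), (1, 'gap', '1.2 eV'), (0, 'note', 'ok')]
scope, flat = [], {}
for level, key, value in items:
    if level < len(scope):
        del scope[level:]          # level reduction: drop deeper keys
    if value is None:
        scope.append(key)          # a section header opens a new scope
    else:
        flat['.'.join(scope + [key])] = value
print(flat)                        # {'data.gap': '1.2 eV', 'note': 'ok'}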
Example #12
    def add_data_table(self, identifier, dataframe, name, plot_options=None):
        """add a datatable to the root-level section

        Args:
            identifier (str): MP category ID (`mp_cat_id`)
            dataframe (pandas.DataFrame): tabular data as Pandas DataFrame
            name (str): table name, optional if only one table in section
            plot_options (dict): options for the corresponding plotly graph
        """
        # TODO: optional table name, required if multiple tables per root-level section
        name = "".join([replacements.get(c, c) for c in name])
        self.document.rec_update(
            nest_dict(Table(dataframe).to_dict(), [identifier, name]))
        self.document[identifier].insert_default_plot_options(
            dataframe, name, update_plot_options=plot_options)
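A short usage sketch for add_data_table (identifier and values hypothetical; `mpf` is an MPFile instance as in the sketches above):

import pandas as pd

df = pd.DataFrame({
    'Energy': [770.0, 771.0, 772.0],   # hypothetical XAS scan
    'XAS': [0.10, 0.50, 0.30],
    'XMCD': [0.00, -0.20, 0.10],
})
mpf.add_data_table('mp-1234', df, name='xas_scan')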
Example #14
    def __init__(self, doc):
        super(HierarchicalData, self).__init__()
        document = RecursiveDict(doc)
        scope = []
        for key, value in document.iterate():
            level, key = key
            if key in mp_level01_titles:
                continue
            level_reduction = bool(level < len(scope))
            if level_reduction:
                del scope[level:]
            if value is None:
                scope.append(key)
            else:
                d = nest_dict(value, scope + [key])
                self.rec_update(d, overwrite=True)
Example #15
    def split(self):
        general_mpfile = (self.pop_first_section()
                          if mp_level01_titles[0] in self.document.keys()
                          else None)
        if not self.document:
            raise ValueError("No contributions in MPFile! Either the file is"
                             " empty or only contains shared (meta-)data not"
                             " correlated to core identifier.")
        while True:
            try:
                mpfile_single = self.pop_first_section()
                mpid_orig = mpfile_single.ids[0]
                if "--" in mpid_orig:
                    mpid = mpid_orig.split("--")[0]
                    mpfile_single.document.rec_update(
                        nest_dict(mpfile_single.document.pop(mpid_orig),
                                  [mpid]))
                if general_mpfile is not None:
                    mpfile_single.insert_general_section(general_mpfile)
                yield mpfile_single
            except KeyError:
                break
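Since split is a generator, a multi-contribution MPFile can be streamed piece by piece, with the shared general section re-inserted into each piece. A hedged sketch (`mpf` as in the sketches above):

for single in mpf.split():
    identifier = single.ids[0]
    print('would submit contribution for', identifier)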
def run(mpfile, **kwargs):
    from pymatgen import MPRester, Composition
    import numpy as np
    import pandas as pd

    input_file = mpfile.document['_hdata'].pop('input_file')
    file_path = os.path.join(os.environ['HOME'], 'work', input_file)
    if not os.path.exists(file_path):
        return 'Please upload', file_path
    df_dct = pd.read_excel(file_path)
    columns_units = [
        ('A-Site', ''), ('B-Site', ''), ('a', 'Å'),
        ('Eᶠ|ABO₃', 'eV'), ('Eᶠ|Yᴮ', 'eV'), ('Eᶠ|Vᴼ', 'eV'),
        ('Eᶠ|Hᵢ', 'eV'), ('ΔEᵢ|Yᴮ-Hᵢ', 'eV')
    ]
    columns = df_dct.columns
    mpr = MPRester(endpoint="http://materialsproject.org:8080/rest/v2")

    for row_idx, row in df_dct.iterrows():
        formula = '{}{}O3'.format(row[columns[0]], row[columns[1]])
        comp = Composition(formula)
        crit = {"reduced_cell_formula": comp.to_reduced_dict, "nsites": 5}
        docs = mpr.query(criteria=crit, properties=["task_id", "volume"])
        if len(docs) > 1:
            # pick the MP entry whose volume is closest to a³ from the sheet
            volume = row[columns[2]]**3
            volumes = np.array([r['volume'] for r in docs])
            idx = np.abs(volumes - volume).argmin()
            identifier = docs[idx]['task_id']
        elif not docs:
            print(formula, 'not found on MP')
            continue
        else:
            identifier = docs[0]['task_id']
        print(formula, '->', identifier)
        d = RecursiveDict()
        for col, (key, unit) in zip(columns, columns_units):
            d[key] = clean_value(row[col], unit)
        mpfile.add_hierarchical_data(nest_dict(d, ['data']), identifier=identifier)
Example #17
    def add_hierarchical_data(self, dct, identifier=mp_level01_titles[0]):
        if len(self.ids) >= self.max_contribs:
            raise StopIteration(
                "Reached max. number of contributions in MPFile")
        self.document.rec_update(nest_dict(RecursiveDict(dct), [identifier]))
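Typical call pattern for add_hierarchical_data, mirroring the run scripts above (identifier and values hypothetical; RecursiveDict, nest_dict, and clean_value are assumed importable from the mpcontribs utils module):

d = RecursiveDict()
d['formula'] = 'SiO2'
d['ΔE'] = clean_value(1.23, 'eV')
mpf.add_hierarchical_data(nest_dict(d, ['data']), identifier='mp-1234')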
Example #18
def run(mpfile, hosts=None, download=False, **kwargs):
    #mpfile.unique_mp_cat_ids = False
    from pymatgen import MPRester
    mpr = MPRester()

    fpath = os.path.join(os.environ['HOME'], 'work',
                         'dilute_solute_diffusion.xlsx')

    if download or not os.path.exists(fpath):

        figshare_id = mpfile.hdata.general['info']['figshare_id']
        url = 'https://api.figshare.com/v2/articles/{}'.format(figshare_id)
        print('get figshare article {}'.format(figshare_id))
        r = requests.get(url)
        figshare = json.loads(r.content)
        mpfile.document['_hdata']['version'] = figshare['version']

        print('read excel from figshare into DataFrame')
        df_dct = None
        for d in figshare['files']:
            if 'xlsx' in d['name']:
                # Dict of DataFrames is returned, with keys representing sheets
                df_dct = read_excel(d['download_url'], sheet_name=None)
                break
        if df_dct is None:
            print('no excel sheet found on figshare')
            return

        print('save excel to disk')
        writer = ExcelWriter(fpath)
        for sheet, df in df_dct.items():
            df.to_excel(writer, sheet)
        writer.save()

    else:
        df_dct = read_excel(fpath, sheet_name=None)

    print(len(df_dct), 'sheets loaded.')

    print('looping hosts ...')
    host_info = df_dct['Host Information']
    host_info.set_index(host_info.columns[0], inplace=True)
    host_info.dropna(inplace=True)

    for idx, host in enumerate(host_info):
        if hosts is not None:
            if isinstance(hosts, int) and idx + 1 > hosts:
                break
            elif isinstance(hosts, list) and host not in hosts:
                continue

        print('get mp-id for {}'.format(host))
        mpid = None
        for doc in mpr.query(criteria={'pretty_formula': host},
                             properties={'task_id': 1}):
            if doc['sbxd'][0]['decomposes_to'] is None:
                mpid = doc['task_id']
                break
        if mpid is None:
            print('mp-id for {} not found'.format(host))
            continue

        print('add host info for {}'.format(mpid))
        hdata = host_info[host].to_dict(into=RecursiveDict)
        for k in list(hdata.keys()):  # list() since keys are popped below
            v = hdata.pop(k)
            ks = k.split()
            if ks[0] not in hdata:
                hdata[ks[0]] = RecursiveDict()
            unit = ks[-1][1:-1] if ks[-1].startswith('[') else ''
            subkey = '_'.join(ks[1:-1] if unit else ks[1:]).split(',')[0]
            if subkey == "lattice_constant":
                unit = u'Å'
            try:
                hdata[ks[0]][subkey] = clean_value(
                    v, unit.replace('angstrom', u'Å'))
            except ValueError:
                hdata[ks[0]][subkey] = v
        hdata['formula'] = host
        df = df_dct['{}-X'.format(host)]
        rows = list(isnull(df).any(axis=1).to_numpy().nonzero()[0])
        if rows:
            cells = df.iloc[rows].dropna(how='all').dropna(axis=1)[df.columns[0]]
            note = cells.iloc[0].replace('following', cells.iloc[1])[:-1]
            hdata['note'] = note
            df.drop(rows, inplace=True)
        mpfile.add_hierarchical_data(nest_dict(hdata, ['data']),
                                     identifier=mpid)

        print('add table for D₀/Q data for {}'.format(mpid))
        df.set_index(df['Solute element number'], inplace=True)
        df.drop('Solute element number', axis=1, inplace=True)
        df.columns = df.iloc[0]
        df.index.name = 'index'
        df.drop('Solute element name', inplace=True)
        df = df.T.reset_index()
        if str(host) == 'Fe':
            df_D0_Q = df[[
                'Solute element name', 'Solute D0, paramagnetic [cm^2/s]',
                'Solute Q, paramagnetic [eV]'
            ]]
        elif hdata['Host']['crystal_structure'] == 'HCP':
            df_D0_Q = df[[
                'Solute element name', 'Solute D0 basal [cm^2/s]',
                'Solute Q basal [eV]'
            ]]
        else:
            df_D0_Q = df[[
                'Solute element name', 'Solute D0 [cm^2/s]', 'Solute Q [eV]'
            ]]
        df_D0_Q.columns = ['El.', 'D₀ [cm²/s]', 'Q [eV]']
        mpfile.add_data_table(mpid, df_D0_Q, 'D₀_Q')

        if hdata['Host']['crystal_structure'] == 'BCC':

            print('add table for hop activation barriers for {} (BCC)'.format(
                mpid))
            columns_E = [
                'Hop activation barrier, E_{} [eV]'.format(i)
                for i in range(2, 5)
            ] + [
                "Hop activation barrier, E'_{} [eV]".format(i)
                for i in range(3, 5)
            ] + [
                "Hop activation barrier, E''_{} [eV]".format(i)
                for i in range(3, 5)
            ] + [
                'Hop activation barrier, E_{} [eV]'.format(i)
                for i in range(5, 7)
            ]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + [
                'E{} [eV]'.format(i) for i in ['₂', '₃', '₄']
            ] + ['E`{} [eV]'.format(i) for i in ['₃', '₄']] + [
                'E``{} [eV]'.format(i) for i in ['₃', '₄']
            ] + ['E{} [eV]'.format(i) for i in ['₅', '₆']]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print('add table for hop attempt frequencies for {} (BCC)'.format(
                mpid))
            columns_v = [
                'Hop attempt frequency, v_{} [THz]'.format(i)
                for i in range(2, 5)
            ] + [
                "Hop attempt frequency, v'_{} [THz]".format(i)
                for i in range(3, 5)
            ] + [
                "Hop attempt frequency, v''_{} [THz]".format(i)
                for i in range(3, 5)
            ] + [
                'Hop attempt frequency, v_{} [THz]'.format(i)
                for i in range(5, 7)
            ]
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + [
                'v{} [THz]'.format(i) for i in ['₂', '₃', '₄']
            ] + ['v`{} [THz]'.format(i) for i in ['₃', '₄']] + [
                'v``{} [THz]'.format(i) for i in ['₃', '₄']
            ] + ['v{} [THz]'.format(i) for i in ['₅', '₆']]
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

        elif hdata['Host']['crystal_structure'] == 'FCC':

            print('add table for hop activation barriers for {} (FCC)'.format(
                mpid))
            columns_E = [
                'Hop activation barrier, E_{} [eV]'.format(i) for i in range(5)
            ]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + [
                'E{} [eV]'.format(i) for i in ['₀', '₁', '₂', '₃', '₄']
            ]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print('add table for hop attempt frequencies for {} (FCC)'.format(
                mpid))
            columns_v = [
                'Hop attempt frequency, v_{} [THz]'.format(i) for i in range(5)
            ]
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + [
                'v{} [THz]'.format(i) for i in ['₀', '₁', '₂', '₃', '₄']
            ]
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

        elif hdata['Host']['crystal_structure'] == 'HCP':

            print('add table for hop activation barriers for {} (HCP)'.format(
                mpid))
            columns_E = [
                "Hop activation barrier, E_X [eV]",
                "Hop activation barrier, E'_X [eV]",
                "Hop activation barrier, E_a [eV]",
                "Hop activation barrier, E'_a [eV]",
                "Hop activation barrier, E_b [eV]",
                "Hop activation barrier, E'_b [eV]",
                "Hop activation barrier, E_c [eV]",
                "Hop activation barrier, E'_c [eV]"
            ]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + [
                'Eₓ [eV]', 'E`ₓ [eV]', 'Eₐ [eV]', 'E`ₐ [eV]', 'E_b [eV]',
                'E`_b [eV]', 'E_c [eV]', 'E`_c [eV]'
            ]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print('add table for hop attempt frequencies for {} (HCP)'.format(
                mpid))
            columns_v = ['Hop attempt frequency, v_a [THz]'
                         ] + ['Hop attempt frequency, v_X [THz]']
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + ['vₐ [THz]'] + ['vₓ [THz]']
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

            print(mpfile)
    print('DONE')
def run(mpfile, hosts=None, download=False):
    mpr = MPRester()
    fpath = f"{project}.xlsx"  # `project` is a module-level name in the source

    if download or not os.path.exists(fpath):

        figshare_id = 1546772
        url = "https://api.figshare.com/v2/articles/{}".format(figshare_id)
        print("get figshare article {}".format(figshare_id))
        r = requests.get(url)
        figshare = json.loads(r.content)
        print("version =",
              figshare["version"])  # TODO set manually in "other"?

        print("read excel from figshare into DataFrame")
        df_dct = None
        for d in figshare["files"]:
            if "xlsx" in d["name"]:
                # Dict of DataFrames is returned, with keys representing sheets
                df_dct = read_excel(d["download_url"], sheet_name=None)
                break
        if df_dct is None:
            print("no excel sheet found on figshare")
            return

        print("save excel to disk")
        writer = ExcelWriter(fpath)
        for sheet, df in df_dct.items():
            df.to_excel(writer, sheet)
        writer.save()

    else:
        df_dct = read_excel(fpath, sheet_name=None)

    print(len(df_dct), "sheets loaded.")

    print("looping hosts ...")
    host_info = df_dct["Host Information"]
    host_info.set_index(host_info.columns[0], inplace=True)
    host_info.dropna(inplace=True)

    for idx, host in enumerate(host_info):
        if hosts is not None:
            if isinstance(hosts, int) and idx + 1 > hosts:
                break
            elif isinstance(hosts, list) and not host in hosts:
                continue

        print("get mp-id for {}".format(host))
        mpid = None
        for doc in mpr.query(criteria={"pretty_formula": host},
                             properties={"task_id": 1}):
            if "decomposes_to" not in doc["sbxd"][0]:
                mpid = doc["task_id"]
                break
        if mpid is None:
            print("mp-id for {} not found".format(host))
            continue

        print("add host info for {}".format(mpid))
        hdata = host_info[host].to_dict(into=RecursiveDict)
        for k in list(hdata.keys()):
            v = hdata.pop(k)
            ks = k.split()
            if ks[0] not in hdata:
                hdata[ks[0]] = RecursiveDict()
            unit = ks[-1][1:-1] if ks[-1].startswith("[") else ""
            subkey = "_".join(ks[1:-1] if unit else ks[1:]).split(",")[0]
            if subkey == "lattice_constant":
                unit = "Å"
            try:
                hdata[ks[0]][subkey] = clean_value(
                    v, unit.replace("angstrom", "Å"))
            except ValueError:
                hdata[ks[0]][subkey] = v
        hdata["formula"] = host
        df = df_dct["{}-X".format(host)]
        rows = list(isnull(df).any(axis=1).to_numpy().nonzero()[0])
        if rows:
            cells = df.iloc[rows].dropna(how="all").dropna(
                axis=1)[df.columns[0]]
            note = cells.iloc[0].replace("following", cells.iloc[1])[:-1]
            hdata["note"] = note
            df.drop(rows, inplace=True)
        mpfile.add_hierarchical_data(nest_dict(hdata, ["data"]),
                                     identifier=mpid)

        print("add table for D₀/Q data for {}".format(mpid))
        df.set_index(df["Solute element number"], inplace=True)
        df.drop("Solute element number", axis=1, inplace=True)
        df.columns = df.iloc[0]
        df.index.name = "index"
        df.drop("Solute element name", inplace=True)
        df = df.T.reset_index()
        if str(host) == "Fe":
            df_D0_Q = df[[
                "Solute element name",
                "Solute D0, paramagnetic [cm^2/s]",
                "Solute Q, paramagnetic [eV]",
            ]]
        elif hdata["Host"]["crystal_structure"] == "HCP":
            df_D0_Q = df[[
                "Solute element name",
                "Solute D0 basal [cm^2/s]",
                "Solute Q basal [eV]",
            ]]
        else:
            df_D0_Q = df[[
                "Solute element name", "Solute D0 [cm^2/s]", "Solute Q [eV]"
            ]]
        df_D0_Q.columns = ["Solute", "D₀ [cm²/s]", "Q [eV]"]
        anums = [z[el] for el in df_D0_Q["Solute"]]
        df_D0_Q.insert(0, "Z", Series(anums, index=df_D0_Q.index))
        df_D0_Q.sort_values("Z", inplace=True)
        df_D0_Q.reset_index(drop=True, inplace=True)
        mpfile.add_data_table(mpid, df_D0_Q, "D₀_Q")

        if hdata["Host"]["crystal_structure"] == "BCC":

            print("add table for hop activation barriers for {} (BCC)".format(
                mpid))
            columns_E = ([
                "Hop activation barrier, E_{} [eV]".format(i)
                for i in range(2, 5)
            ] + [
                "Hop activation barrier, E'_{} [eV]".format(i)
                for i in range(3, 5)
            ] + [
                "Hop activation barrier, E''_{} [eV]".format(i)
                for i in range(3, 5)
            ] + [
                "Hop activation barrier, E_{} [eV]".format(i)
                for i in range(5, 7)
            ])
            df_E = df[["Solute element name"] + columns_E]
            df_E.columns = (["Solute"] +
                            ["E{} [eV]".format(i) for i in ["₂", "₃", "₄"]] +
                            ["E`{} [eV]".format(i) for i in ["₃", "₄"]] +
                            ["E``{} [eV]".format(i) for i in ["₃", "₄"]] +
                            ["E{} [eV]".format(i) for i in ["₅", "₆"]])
            mpfile.add_data_table(mpid, df_E, "hop_activation_barriers")

            print("add table for hop attempt frequencies for {} (BCC)".format(
                mpid))
            columns_v = ([
                "Hop attempt frequency, v_{} [THz]".format(i)
                for i in range(2, 5)
            ] + [
                "Hop attempt frequency, v'_{} [THz]".format(i)
                for i in range(3, 5)
            ] + [
                "Hop attempt frequency, v''_{} [THz]".format(i)
                for i in range(3, 5)
            ] + [
                "Hop attempt frequency, v_{} [THz]".format(i)
                for i in range(5, 7)
            ])
            df_v = df[["Solute element name"] + columns_v]
            df_v.columns = (["Solute"] +
                            ["v{} [THz]".format(i) for i in ["₂", "₃", "₄"]] +
                            ["v`{} [THz]".format(i) for i in ["₃", "₄"]] +
                            ["v``{} [THz]".format(i) for i in ["₃", "₄"]] +
                            ["v{} [THz]".format(i) for i in ["₅", "₆"]])
            mpfile.add_data_table(mpid, df_v, "hop_attempt_frequencies")

        elif hdata["Host"]["crystal_structure"] == "FCC":

            print("add table for hop activation barriers for {} (FCC)".format(
                mpid))
            columns_E = [
                "Hop activation barrier, E_{} [eV]".format(i) for i in range(5)
            ]
            df_E = df[["Solute element name"] + columns_E]
            df_E.columns = ["Solute"] + [
                "E{} [eV]".format(i) for i in ["₀", "₁", "₂", "₃", "₄"]
            ]
            mpfile.add_data_table(mpid, df_E, "hop_activation_barriers")

            print("add table for hop attempt frequencies for {} (FCC)".format(
                mpid))
            columns_v = [
                "Hop attempt frequency, v_{} [THz]".format(i) for i in range(5)
            ]
            df_v = df[["Solute element name"] + columns_v]
            df_v.columns = ["Solute"] + [
                "v{} [THz]".format(i) for i in ["₀", "₁", "₂", "₃", "₄"]
            ]
            mpfile.add_data_table(mpid, df_v, "hop_attempt_frequencies")

        elif hdata["Host"]["crystal_structure"] == "HCP":

            print("add table for hop activation barriers for {} (HCP)".format(
                mpid))
            columns_E = [
                "Hop activation barrier, E_X [eV]",
                "Hop activation barrier, E'_X [eV]",
                "Hop activation barrier, E_a [eV]",
                "Hop activation barrier, E'_a [eV]",
                "Hop activation barrier, E_b [eV]",
                "Hop activation barrier, E'_b [eV]",
                "Hop activation barrier, E_c [eV]",
                "Hop activation barrier, E'_c [eV]",
            ]
            df_E = df[["Solute element name"] + columns_E]
            df_E.columns = ["Solute"] + [
                "Eₓ [eV]",
                "E`ₓ [eV]",
                "Eₐ [eV]",
                "E`ₐ [eV]",
                "E_b [eV]",
                "E`_b [eV]",
                "Eꪱ [eV]",
                "E`ꪱ [eV]",
            ]
            mpfile.add_data_table(mpid, df_E, "hop_activation_barriers")

            print("add table for hop attempt frequencies for {} (HCP)".format(
                mpid))
            columns_v = ["Hop attempt frequency, v_a [THz]"
                         ] + ["Hop attempt frequency, v_X [THz]"]
            df_v = df[["Solute element name"] + columns_v]
            df_v.columns = ["Solute"] + ["vₐ [THz]"] + ["vₓ [THz]"]
            mpfile.add_data_table(mpid, df_v, "hop_attempt_frequencies")

    print("DONE")
Example #20
def run(mpfile, **kwargs):

    # extract data from json files
    keys = ['pretty_formula', 'volume']
    input_dir = mpfile.hdata.general['input_dir']
    for idx, obj in enumerate(scandir(input_dir)):
        mpid = obj.name.split('.', 1)[0].rsplit('_', 1)[1]
        print(mpid)
        input_file = gzip.open(obj.path, 'rb')
        try:
            data = json.loads(input_file.read())

            # filter out metals
            if 'GGA' not in data['gap'] or data['gap']['GGA'] < 0.1:
                print('GGA gap < 0.1 -> skip')
                continue

            # add hierarchical data (nested key-values)
            # TODO: extreme values for power factor, zT, effective mass
            # TODO: add a text for the description of each table
            hdata = RecursiveDict((k, data[k]) for k in keys)
            hdata['volume'] = u'{:g} ų'.format(hdata['volume'])
            cond_eff_mass = u'mₑᶜᵒⁿᵈ'
            hdata[cond_eff_mass] = RecursiveDict()
            names = [u'e₁', u'e₂', u'e₃', u'<m>']
            if 'GGA' not in data:
                print('no GGA key for', mpid)
                continue
            for dt, d in data['GGA']['cond_eff_mass'].items():
                eff_mass = d['300']['1e+18']
                eff_mass.append(np.mean(eff_mass))
                hdata[cond_eff_mass][dt] = RecursiveDict(
                    (names[idx], u'{:.2f} mₑ'.format(x))
                    for idx, x in enumerate(eff_mass))
            seebeck_fix_dop_temp = "Seebeck"
            hdata[seebeck_fix_dop_temp] = RecursiveDict()
            cols = [u'e₁', u'e₂', u'e₃', 'temperature', 'doping']
            for doping_type in ['p', 'n']:
                sbk = [
                    float(i) for i in data['GGA']['seebeck_doping']
                    [doping_type]['300']['1e+18']['eigs']
                ]
                vals = [u'{:.2e} μV/K'.format(s) for s in sbk] + [
                    u'{} K'.format('300'), u'{} cm⁻³'.format('1e+18')
                ]
                hdata[seebeck_fix_dop_temp][doping_type] = RecursiveDict(
                    (k, v) for k, v in zip(cols, vals))

            # build data and max values for seebeck, conductivity and kappa
            # max/min values computed using numpy. It may be better to code it in pure python.
            cols = ['value', 'temperature', 'doping']
            for prop_name in ['seebeck_doping', 'cond_doping', 'kappa_doping']:
                # TODO install Symbola font if you see squares here (https://fonts2u.com/symbola.font)
                # and select it as standard font in your browser (leave other fonts as is, esp. fixed width)
                if prop_name[0] == 's':
                    lbl, unit = u"Sₘₐₓ", u"μV/K"
                elif prop_name[0] == 'c':
                    lbl, unit = u"σₘₐₓ", u"(Ωms)⁻¹"
                elif prop_name[0] == 'k':
                    lbl, unit = u"κₑ₋ₘᵢₙ", u"W/(mKs)"
                hdata[lbl] = RecursiveDict()

                for doping_type in ['p', 'n']:
                    prop = data['GGA'][prop_name][doping_type]
                    prop_averages, dopings, columns = [], None, ['T (K)']
                    temps = sorted(map(int, prop.keys()))
                    for temp in temps:
                        row = [temp]
                        if dopings is None:
                            dopings = sorted(map(float,
                                                 prop[str(temp)].keys()))
                        for doping in dopings:
                            doping_str = '%.0e' % doping
                            if len(columns) <= len(dopings):
                                columns.append(doping_str + u' cm⁻³')
                            eigs = prop[str(temp)][doping_str]['eigs']
                            row.append(np.mean(eigs))
                        prop_averages.append(row)

                    arr_prop_avg = np.array(prop_averages)[:, 1:]
                    max_v = np.max(arr_prop_avg)
                    if prop_name[0] == 's' and doping_type == 'n':
                        max_v = np.min(arr_prop_avg)
                    if prop_name[0] == 'k':
                        max_v = np.min(arr_prop_avg)
                    arg_max = np.argwhere(arr_prop_avg == max_v)[0]

                    vals = [
                        u'{:.2e} {}'.format(max_v, unit),
                        u'{:.2e} K'.format(temps[arg_max[0]]),
                        u'{:.2e} cm⁻³'.format(dopings[arg_max[1]])
                    ]
                    hdata[lbl][doping_type] = RecursiveDict(
                        (k, v) for k, v in zip(cols, vals))

            mpfile.add_hierarchical_data(nest_dict(hdata, ['data']),
                                         identifier=data['mp_id'])

        finally:
            input_file.close()
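The extremum bookkeeping above (max for Seebeck and conductivity, min for κₑ) reduces to a small NumPy pattern; in isolation:

import numpy as np

arr = np.array([[1.0, 3.0], [2.0, 0.5]])   # rows: temperatures, cols: dopings
max_v = arr.max()
it, idop = np.argwhere(arr == max_v)[0]    # row/col indices of the extremum
print(max_v, it, idop)                     # 3.0 0 1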
def run(mpfile, **kwargs):
    from pymatgen import Structure

    reference_project = None
    input_data, input_keys, extra = RecursiveDict(), RecursiveDict(), RecursiveDict()
    input_urls = mpfile.document['_hdata'].pop('input_urls')

    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d') # TODO 3d for Jarvis

        dbfile = os.path.join(os.environ['HOME'], 'work', input_url.rsplit('/')[-1])
        if not os.path.exists(dbfile):
            print('downloading', dbfile, '...')
            urllib.request.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'parent_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'eV')),
            ('op_gap', ('ΔE|optB88vdW', 'eV')),
            ('mbj_gap', ('ΔE|mbj', 'eV')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []

        print(len(input_data[project]), 'materials loaded for', project)

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in identifiers:
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula
                    )
                data[project]['id'] = input_urls[project]['detail'].format(d[input_keys[project][0]])
                Ex = d[input_keys[project][1]]
                if project == reference_project:
                    Ex *= 1000.
                data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']), identifier=identifier)
        for project, structure in structures.items():
            name = '{}_{}'.format(data['formula'], project)
            try:
                mpfile.add_structure(structure, name=name, identifier=identifier)
            except Exception as ex:
                print(str(ex))
Example #22
    def from_string(data):
        # use archieml-python parse to import data
        rdct = RecursiveDict(loads(data))
        rdct.rec_update()

        # post-process internal representation of file contents
        for key in list(rdct.keys()):
            is_general, root_key = normalize_root_level(key)

            if is_general:
                # make part of shared (meta-)data, i.e. nest under `general` at
                # the beginning of the MPFile
                if mp_level01_titles[0] not in rdct:
                    rdct[mp_level01_titles[0]] = RecursiveDict()
                    rdct.move_to_end(mp_level01_titles[0], last=False)

            # normalize identifier key (pop & insert)
            # using rec_update since we're looping over all entries
            # also: support data in bare tables (marked-up only by
            #       root-level identifier) by nesting under 'data'
            value = rdct.pop(key)
            keys = [mp_level01_titles[0]] if is_general else []
            keys.append(root_key)
            if isinstance(value, list):
                keys.append("table")
            rdct.rec_update(nest_dict(value, keys))

            # reference to section to iterate or parse as CIF
            section = (rdct[mp_level01_titles[0]][root_key]
                       if is_general else rdct[root_key])

            # iterate to find CSV sections to parse
            # also parse propnet quantities
            if isinstance(section, dict):
                scope = []
                for k, v in section.iterate():
                    level, key = k
                    key = "".join([replacements.get(c, c) for c in key])
                    level_reduction = bool(level < len(scope))
                    if level_reduction:
                        del scope[level:]
                    if v is None:
                        scope.append(key)
                    elif isinstance(v, list) and isinstance(v[0], dict):
                        table = ""
                        for row_dct in v:
                            table = "\n".join([table, row_dct["value"]])
                        pd_obj = read_csv(table)
                        d = nest_dict(pd_obj.to_dict(), scope + [key])
                        section.rec_update(d, overwrite=True)
                        if not is_general and level == 0:
                            section.insert_default_plot_options(pd_obj, key)
                    elif (Quantity is not None
                          and isinstance(v, six.string_types) and " " in v):
                        quantity = Quantity.from_key_value(key, v)
                        d = nest_dict(quantity.as_dict(), scope +
                                      [key])  # TODO quantity.symbol.name
                        section.rec_update(d, overwrite=True)

            # convert CIF strings into pymatgen structures
            if mp_level01_titles[3] in section:
                from pymatgen.io.cif import CifParser

                for name in section[mp_level01_titles[3]].keys():
                    cif = section[mp_level01_titles[3]].pop(name)
                    parser = CifParser.from_string(cif)
                    structure = parser.get_structures(primitive=False)[0]
                    section[mp_level01_titles[3]].rec_update(
                        nest_dict(structure.as_dict(), [name]))

        return MPFile.from_dict(rdct)
Example #23
def run(mpfile, **kwargs):
    from pymatgen import Structure

    reference_project = None
    input_data, input_keys, extra = RecursiveDict(), RecursiveDict(
    ), RecursiveDict()
    input_urls = mpfile.document['_hdata'].pop('input_urls')

    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d')  # TODO 3d for Jarvis

        dbfile = os.path.join(os.environ['HOME'], 'work',
                              input_url.rsplit('/')[-1])
        if not os.path.exists(dbfile):
            print('downloading', dbfile, '...')
            urllib.request.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'parent_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'meV/atom')),
            ('op_gap', ('ΔE|optB88vdW', 'meV/atom')),
            ('mbj_gap', ('ΔE|mbj', 'meV/atom')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []

        print(len(input_data[project]), 'materials loaded for', project)

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in identifiers:
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula)
                data[project]['id'] = input_urls[project]['detail'].format(
                    d[input_keys[project][0]])
                Ex = d[input_keys[project][1]]
                if project == reference_project:
                    Ex *= 1000.
                data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']),
                                     identifier=identifier)
        for project, structure in structures.items():
            name = '{}_{}'.format(data['formula'], project)
            try:
                mpfile.add_structure(structure,
                                     name=name,
                                     identifier=identifier)
            except Exception as ex:
                print(str(ex))
def run(mpfile, **kwargs):

    # extract data from json files
    input_dir = mpfile.hdata.general['input_dir']
    for idx, obj in enumerate(scandir(input_dir)):
        mpid = obj.name.split('.', 1)[0].rsplit('_', 1)[1]
        print(mpid)
        input_file = gzip.open(obj.path, 'rb')
        try:
            data = json.loads(input_file.read())

            # filter out metals
            if 'GGA' not in data or 'GGA' not in data[
                    'gap'] or data['gap']['GGA'] < 0.1:
                print('GGA gap < 0.1 -> skip')
                continue

            # add hierarchical data (nested key-values)
            hdata = RecursiveDict()
            T, lvl, S2 = '300', '1e+18', None
            pf_key = 'S²σ'
            hdata['temperature'] = T + ' K'
            hdata['doping_level'] = lvl + ' cm⁻³'
            variables = [
                {
                    'key': 'cond_eff_mass',
                    'name': 'mₑᶜᵒⁿᵈ',
                    'unit': 'mₑ'
                },
                {
                    'key': 'seebeck_doping',
                    'name': 'S',
                    'unit': 'μV/K'
                },
                {
                    'key': 'cond_doping',
                    'name': 'σ',
                    'unit': '(Ωms)⁻¹'
                },
            ]
            eigs_keys = ['ε₁', 'ε₂', 'ε₃', '<ε>']

            for v in variables:
                hdata[v['name']] = RecursiveDict()
                for doping_type in ['p', 'n']:
                    if doping_type in data['GGA'][v['key']]:
                        d = data['GGA'][v['key']][doping_type][T][lvl]
                        eigs = map(float,
                                   d if isinstance(d, list) else d['eigs'])
                        hdata[v['name']][doping_type] = RecursiveDict(
                            (eigs_keys[neig], clean_value(eig, v['unit']))
                            for neig, eig in enumerate(eigs))
                        hdata[v['name']][doping_type][
                            eigs_keys[-1]] = clean_value(
                                np.mean(eigs), v['unit'])
                        if v['key'] == 'seebeck_doping':
                            S2 = np.dot(d['tensor'], d['tensor'])
                        elif v['key'] == 'cond_doping':
                            pf = np.mean(
                                np.linalg.eigh(np.dot(S2,
                                                      d['tensor']))[0]) * 1e-8
                            if pf_key not in hdata:
                                hdata[pf_key] = RecursiveDict()
                            hdata[pf_key][doping_type] = {
                                eigs_keys[-1]: clean_value(pf, 'μW/(cmK²s)')
                            }

            mpfile_data = nest_dict(hdata, ['data'])

            # build data and max values for seebeck, conductivity and kappa
            # max/min values computed using numpy. It may be better to code it in pure python.
            keys = ['pretty_formula', 'volume']
            hdata = RecursiveDict((k, data[k]) for k in keys)
            hdata['volume'] = clean_value(hdata['volume'], 'ų')
            hdata['bandgap'] = clean_value(data['gap']['GGA'], 'eV')
            cols = ['value', 'temperature', 'doping']
            tables = RecursiveDict()
            props = RecursiveDict()
            props['seebeck_doping'] = ['S', 'μV/K']
            props['cond_doping'] = ['σ', '(Ωms)⁻¹']
            props['kappa_doping'] = ['κₑ', 'W/(mKs)']

            for prop_name, (lbl, unit) in props.iteritems():
                # TODO install Symbola font if you see squares here (https://fonts2u.com/symbola.font)
                # and select it as standard font in your browser (leave other fonts as is, esp. fixed width)
                tables[lbl] = RecursiveDict()
                hlbl = lbl + '₋' if len(lbl) > 1 else lbl
                hlbl += 'ₑₓₜᵣ'
                hdata[hlbl] = RecursiveDict()

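                # one table per property and doping type: rows are temperatures, columns are doping levels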
                for doping_type in ['p', 'n']:
                    prop = data['GGA'][prop_name][doping_type]
                    prop_averages, dopings, columns = [], None, ['T [K]']
                    temps = sorted(map(int, prop.keys()))
                    for temp in temps:
                        row = [temp]
                        if dopings is None:
                            dopings = sorted(map(float,
                                                 prop[str(temp)].keys()))
                        for doping in dopings:
                            doping_str = '%.0e' % doping
                            if len(columns) <= len(dopings):
                                columns.append('{} cm⁻³ [{}]'.format(
                                    doping_str, unit))
                            eigs = prop[str(temp)][doping_str]['eigs']
                            row.append(np.mean(eigs))
                        prop_averages.append((temp, row))

                    tables[lbl][doping_type] = Table.from_items(
                        prop_averages, orient='index', columns=columns)

                    arr_prop_avg = np.array(
                        [item[1] for item in prop_averages])[:, 1:]
                    # the relevant extremum is a minimum for n-type Seebeck and for κₑ
                    if (prop_name[0] == 's' and doping_type == 'n') or prop_name[0] == 'k':
                        max_v = np.min(arr_prop_avg)
                    else:
                        max_v = np.max(arr_prop_avg)
                    arg_max = np.argwhere(arr_prop_avg == max_v)[0]

                    vals = [
                        clean_value(max_v, unit),
                        clean_value(temps[arg_max[0]], 'K'),
                        clean_value(dopings[arg_max[1]], 'cm⁻³')
                    ]
                    hdata[hlbl][doping_type] = RecursiveDict(
                        (k, v) for k, v in zip(cols, vals))

            mpfile_data.rec_update(nest_dict(hdata, ['extra_data']))
            mpfile.add_hierarchical_data(mpfile_data, identifier=data['mp_id'])
            for lbl, dct in tables.iteritems():
                for doping_type, table in dct.iteritems():
                    mpfile.add_data_table(data['mp_id'],
                                          table,
                                          name='{}({})'.format(
                                              lbl, doping_type))

        finally:
            input_file.close()
def run(mpfile, hosts=None, download=False, **kwargs):
    #mpfile.unique_mp_cat_ids = False
    from pymatgen import MPRester
    mpr = MPRester()

    fpath = os.path.join(os.environ['HOME'], 'work', 'dilute_solute_diffusion.xlsx')

    if download or not os.path.exists(fpath):

        figshare_id = mpfile.hdata.general['info']['figshare_id']
        url = 'https://api.figshare.com/v2/articles/{}'.format(figshare_id)
        print 'get figshare article {}'.format(figshare_id)
        r = requests.get(url)
        figshare = json.loads(r.content)
        mpfile.document['_hdata']['version'] = figshare['version']

        print 'read excel from figshare into DataFrame'
        df_dct = None
        for d in figshare['files']:
            if 'xlsx' in d['name']:
                # Dict of DataFrames is returned, with keys representing sheets
                df_dct = read_excel(d['download_url'], sheet_name=None)
                break
        if df_dct is None:
            print 'no excel sheet found on figshare'
            return

        print 'save excel to disk'
        writer = ExcelWriter(fpath)
        for sheet, df in df_dct.items():
            df.to_excel(writer, sheet)
        writer.save()

    else:
        df_dct = read_excel(fpath, sheet_name=None)

    print len(df_dct), 'sheets loaded.'

    print 'looping hosts ...'
    host_info = df_dct['Host Information']
    host_info.set_index(host_info.columns[0], inplace=True)
    host_info.dropna(inplace=True)

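    # the columns of the 'Host Information' sheet are the host elements; iterating the frame yields them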
    for idx, host in enumerate(host_info):
        if hosts is not None:
            if isinstance(hosts, int) and idx + 1 > hosts:
                break
            elif isinstance(hosts, list) and host not in hosts:
                continue

        print 'get mp-id for {}'.format(host)
        mpid = None
        for doc in mpr.query(
            criteria={'pretty_formula': host},
            properties={'task_id': 1, 'sbxd': 1}  # 'sbxd' is read below to check for decomposition
        ):
            if doc['sbxd'][0]['decomposes_to'] is None:
                mpid = doc['task_id']
                break
        if mpid is None:
            print 'mp-id for {} not found'.format(host)
            continue

        print 'add host info for {}'.format(mpid)
        hdata = host_info[host].to_dict(into=RecursiveDict)
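        # flatten spreadsheet headers (e.g. "Host crystal structure" or keys ending in "[angstrom]") into {group: {subkey: value}} with units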
        for k in hdata.keys():
            v = hdata.pop(k)
            ks = k.split()
            if ks[0] not in hdata:
                hdata[ks[0]] = RecursiveDict()
            unit = ks[-1][1:-1] if ks[-1].startswith('[') else ''
            subkey = '_'.join(ks[1:-1] if unit else ks[1:]).split(',')[0]
            if subkey == "lattice_constant":
                unit = u'Å'
            try:
                hdata[ks[0]][subkey] = clean_value(v, unit.replace('angstrom', u'Å'))
            except ValueError:
                hdata[ks[0]][subkey] = v
        hdata['formula'] = host
        df = df_dct['{}-X'.format(host)]
        rows = list(isnull(df).any(1).nonzero()[0])
        if rows:
            cells = df.ix[rows].dropna(how='all').dropna(axis=1)[df.columns[0]]
            note = cells.iloc[0].replace('following', cells.iloc[1])[:-1]
            hdata['note'] = note
            df.drop(rows, inplace=True)
        mpfile.add_hierarchical_data(nest_dict(hdata, ['data']), identifier=mpid)

        print 'add table for D₀/Q data for {}'.format(mpid)
        df.set_index(df['Solute element number'], inplace=True)
        df.drop('Solute element number', axis=1, inplace=True)
        df.columns = df.ix[0]
        df.index.name = 'index'
        df.drop('Solute element name', inplace=True)
        df = df.T.reset_index()
        if str(host) == 'Fe':
            df_D0_Q = df[[
                'Solute element name', 'Solute D0, paramagnetic [cm^2/s]',
                'Solute Q, paramagnetic [eV]'
            ]]
        elif hdata['Host']['crystal_structure'] == 'HCP':
            df_D0_Q = df[['Solute element name', 'Solute D0 basal [cm^2/s]', 'Solute Q basal [eV]']]
        else:
            df_D0_Q = df[['Solute element name', 'Solute D0 [cm^2/s]', 'Solute Q [eV]']]
        df_D0_Q.columns = ['El.', 'D₀ [cm²/s]', 'Q [eV]']
        mpfile.add_data_table(mpid, df_D0_Q, 'D₀_Q')

        if hdata['Host']['crystal_structure'] == 'BCC':

            print 'add table for hop activation barriers for {} (BCC)'.format(mpid)
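            # collect the nine hop-barrier columns (E₂–E₆ plus primed and double-primed variants) from the sheet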
            columns_E = [
                'Hop activation barrier, E_{} [eV]'.format(i) for i in range(2,5)
            ] + [
                "Hop activation barrier, E'_{} [eV]".format(i) for i in range(3,5)
            ] + [
                "Hop activation barrier, E''_{} [eV]".format(i) for i in range(3,5)
            ] + [
                'Hop activation barrier, E_{} [eV]'.format(i) for i in range(5,7)
            ]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + [
                'E{} [eV]'.format(i) for i in ['₂', '₃', '₄']
            ] + [
                'E`{} [eV]'.format(i) for i in ['₃', '₄']
            ] + [
                'E``{} [eV]'.format(i) for i in ['₃', '₄']
            ] + [
                'E{} [eV]'.format(i) for i in ['₅', '₆']
            ]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print 'add table for hop attempt frequencies for {} (BCC)'.format(mpid)
            columns_v = [
                'Hop attempt frequency, v_{} [THz]'.format(i) for i in range(2,5)
            ] + [
                "Hop attempt frequency, v'_{} [THz]".format(i) for i in range(3,5)
            ] + [
                "Hop attempt frequency, v''_{} [THz]".format(i) for i in range(3,5)
            ] + [
                'Hop attempt frequency, v_{} [THz]'.format(i) for i in range(5,7)
            ]
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + [
                'v{} [THz]'.format(i) for i in ['₂', '₃', '₄']
            ] + [
                'v`{} [THz]'.format(i) for i in ['₃', '₄']
            ] + [
                'v``{} [THz]'.format(i) for i in ['₃', '₄']
            ] + [
                'v{} [THz]'.format(i) for i in ['₅', '₆']
            ]
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

        elif hdata['Host']['crystal_structure'] == 'FCC':

            print 'add table for hop activation barriers for {} (FCC)'.format(mpid)
            columns_E = ['Hop activation barrier, E_{} [eV]'.format(i) for i in range(5)]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + ['E{} [eV]'.format(i) for i in ['₀', '₁', '₂', '₃', '₄']]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print 'add table for hop attempt frequencies for {} (FCC)'.format(mpid)
            columns_v = ['Hop attempt frequency, v_{} [THz]'.format(i) for i in range(5)]
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + ['v{} [THz]'.format(i) for i in ['₀', '₁', '₂', '₃', '₄']]
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

        elif hdata['Host']['crystal_structure'] == 'HCP':

            print 'add table for hop activation barriers for {} (HCP)'.format(mpid)
            columns_E = [
                "Hop activation barrier, E_X [eV]", "Hop activation barrier, E'_X [eV]",
                "Hop activation barrier, E_a [eV]", "Hop activation barrier, E'_a [eV]",
                "Hop activation barrier, E_b [eV]", "Hop activation barrier, E'_b [eV]",
                "Hop activation barrier, E_c [eV]", "Hop activation barrier, E'_c [eV]"
            ]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + [
                'Eₓ [eV]', 'E`ₓ [eV]', 'Eₐ [eV]', 'E`ₐ [eV]',
                'E_b [eV]', 'E`_b [eV]', 'E_c [eV]', 'E`_c [eV]'
            ]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print 'add table for hop attempt frequencies for {} (HCP)'.format(mpid)
            columns_v = ['Hop attempt frequency, v_a [THz]'] + ['Hop attempt frequency, v_X [THz]']
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + ['vₐ [THz]'] + ['vₓ [THz]']
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

            print mpfile
    print 'DONE'
def run(mpfile, **kwargs):
    from pymatgen import Structure

    reference_project = None
    input_data, input_keys, extra = RecursiveDict(), RecursiveDict(), RecursiveDict()
    #input_urls = mpfile.document['_hdata'].pop('input_urls')
    input_urls = {
        'NUS': {
            "file": "http://www.2dmatpedia.org/static/db.json.gz",
            "detail": "http://www.2dmatpedia.org/2dmaterials/doc/{}"
        },
        'JARVIS': {
            "file": "https://www.ctcms.nist.gov/~knc6/jdft_{}.json.tgz",
            "detail": "https://www.ctcms.nist.gov/~knc6/jsmol/{}.html"
        }
    }

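    # NUS ships a gzipped JSON-lines dump, JARVIS a .tgz containing a single JSON file — unpacked accordingly below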
    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d')  # TODO 3d for Jarvis

        #dbfile = os.path.join(os.environ['HOME'], 'work', input_url.rsplit('/')[-1])
        dbfile = input_url.rsplit('/')[-1]
        if not os.path.exists(dbfile):
            print('downloading', dbfile, '...')
            urllib.request.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'source_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

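        # per project: [identifier key, exfoliation-energy key, structure key]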
        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'meV/atom')),
            ('op_gap', ('ΔE|optB88vdW', 'meV/atom')),
            ('mbj_gap', ('ΔE|mbj', 'meV/atom')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []

        print(len(input_data[project]), 'materials loaded for', project)

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in set(identifiers):
        print(identifier)
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula)
                data[project]['id'] = input_urls[project]['detail'].format(
                    d[input_keys[project][0]])
                if input_keys[project][1] in d:
                    Ex = d[input_keys[project][1]]
                    if project == reference_project:
                        Ex *= 1000.
                    data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']),
                                     identifier=identifier)
        #r = db.contributions.update_one(
        #    {'identifier': identifier, 'project': 'jarvis_dft'},
        #    {'$set': {'content.data': mpfile.document[identifier]['data']}},
        #    upsert=True
        #)
        #print(r.matched_count, r.modified_count, r.upserted_id)

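        # skip identifiers whose structures were already inserted on a previous run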
        doc = db.contributions.find_one(
            {
                'identifier': identifier,
                'project': 'jarvis_dft'
            }, {
                '_id': 1,
                'content.structures': 1
            })
        if 'structures' in doc['content']:
            print('structures already added for', identifier)
            continue
        print(doc['_id'])

        inserted_ids = []
        for project, structure in structures.items():
            try:
                mpfile.add_structure(structure,
                                     name=project,
                                     identifier=identifier)
                sdct = mpfile.document[identifier]['structures'][project]
                sdct.pop('@module')
                sdct.pop('@class')
                if sdct['charge'] is None:
                    sdct.pop('charge')
                sdct['identifier'] = identifier
                sdct['project'] = 'jarvis_dft'
                sdct['name'] = project
                sdct['cid'] = doc['_id']
                r = db.structures.insert_one(sdct)
                inserted_ids.append(r.inserted_id)
            except Exception as ex:
                print(str(ex))

        print(inserted_ids)
        r = db.contributions.update_one(
            {'_id': doc['_id']},
            {'$set': {
                'content.structures': inserted_ids
            }})
        print(r.matched_count, r.modified_count)
def run(mpfile, **kwargs):
    # TODO clone solar_perovskite if needed, abort if insufficient permissions
    import solar_perovskite
    from solar_perovskite.core import GetExpThermo
    from solar_perovskite.init.find_structures import FindStructures
    from solar_perovskite.init.import_data import Importdata
    from solar_perovskite.modelling.from_theo import EnthTheo

    input_file = mpfile.hdata.general['input_file']
    input_file = os.path.join(os.path.dirname(solar_perovskite.__file__), input_file)
    # assuming pandas' read_csv here: it expects a path or file-like object, so wrap the ';'-cleaned text
    from StringIO import StringIO
    table = read_csv(StringIO(open(input_file, 'r').read().replace(';', ',')))
    dct = super(Table, table).to_dict(orient='records', into=RecursiveDict)

    shomate = pd.read_csv(os.path.abspath(os.path.join(
        os.path.dirname(solar_perovskite.__file__), "datafiles", "shomate.csv"
    )), index_col=0)
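    # Shomate heat-capacity coefficients per species, keyed by temperature range "low-high"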
    shomate_dct = RecursiveDict()
    for col in shomate.columns:
        key = col.split('.')[0]
        if key not in shomate_dct:
            shomate_dct[key] = RecursiveDict()
        d = shomate[col].to_dict(into=RecursiveDict)
        subkey = '{}-{}'.format(int(d.pop('low')), int(d.pop('high')))
        shomate_dct[key][subkey] = RecursiveDict(
            (k, clean_value(v, max_dgts=6)) for k, v in d.items()
        )
    mpfile.add_hierarchical_data(nest_dict(shomate_dct, ['shomate']))

    for row in dct:

        sample_number = int(row['sample_number'])
        identifier = row['closest phase MP (oxidized)'].replace('n.a.', '')
        if not identifier:
            # fall back to the oxidized composition when no MP id is given
            identifier = get_composition_from_string(row['composition oxidized phase'])
        elif not identifier.startswith('mp-'):
            continue
        print identifier

        print 'add hdata ...'
        d = RecursiveDict()
        d['tolerance_factor'] = row['tolerance_factor']
        d['solid_solution'] = row['type of solid solution']
        d['oxidized_phase'] = RecursiveDict()
        d['oxidized_phase']['composition'] = row['composition oxidized phase']
        d['oxidized_phase']['crystal-structure'] = row['crystal structure (fully oxidized)']
        d['reduced_phase'] = RecursiveDict()
        d['reduced_phase']['composition'] = row['composition reduced phase']
        d['reduced_phase']['closest-MP'] = row['closest phase MP (reduced)'].replace('n.a.', '')
        d = nest_dict(d, ['data'])
        d['pars'] = get_fit_pars(sample_number)
        d['pars']['theo_compstr'] = row['theo_compstr']
        try:
            fs = FindStructures(compstr=row['theo_compstr'])
            theo_redenth = fs.find_theo_redenth()
            imp = Importdata()
            splitcomp = imp.split_comp(row['theo_compstr'])
            conc_act = imp.find_active(mat_comp=splitcomp)[1]
            et = EnthTheo(comp=row['theo_compstr'])
            dh_max, dh_min = et.calc_dh_endm()
            red_enth_mean_endm = (conc_act * dh_min) + ((1 - conc_act) * dh_max)
            difference = theo_redenth - red_enth_mean_endm
            d['pars']['dh_min'] = clean_value(dh_min + difference, max_dgts=8)
            d['pars']['dh_max'] = clean_value(dh_max + difference, max_dgts=8)
        except Exception as ex:
            print('error in dh_min/max!')
            print(str(ex))
        mpfile.add_hierarchical_data(d, identifier=identifier)

        print 'add ΔH ...'
        exp_thermo = GetExpThermo(sample_number, plotting=False)
        enthalpy = exp_thermo.exp_dh()
        table = get_table(enthalpy, 'H')
        mpfile.add_data_table(identifier, table, name='enthalpy')

        print 'add ΔS ...'
        entropy = exp_thermo.exp_ds()
        table = get_table(entropy, 'S')
        mpfile.add_data_table(identifier, table, name='entropy')

        print 'add raw data ...'
        tga_results = os.path.join(os.path.dirname(solar_perovskite.__file__), 'tga_results')
        for path in glob(os.path.join(tga_results, 'ExpDat_JV_P_{}_*.csv'.format(sample_number))):
            print path.split('_{}_'.format(sample_number))[-1].split('.')[0], '...'
            body = open(path, 'r').read()
            cols = ['Time [min]', 'Temperature [C]', 'dm [%]', 'pO2']
            table = read_csv(StringIO(body), lineterminator=os.linesep, usecols=cols, skiprows=5)
            table = table[cols].iloc[::100, :]
            # scale/shift for better graphs
            T, dm, p = [pd.to_numeric(table[col]) for col in cols[1:]]
            T_min, T_max, dm_min, dm_max, p_max = T.min(), T.max(), dm.min(), dm.max(), p.max()
            rT, rdm = abs(T_max - T_min), abs(dm_max - dm_min)
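            # rescale dm and pO₂ onto the temperature span so all three traces share one y-axis scale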
            table[cols[2]] = (dm - dm_min) * rT/rdm
            table[cols[3]] = p * rT/p_max
            table.rename(columns={
                'dm [%]': '(dm [%] + {:.4g}) * {:.4g}'.format(-dm_min, rT/rdm),
                'pO2': 'pO₂ * {:.4g}'.format(rT/p_max)
            }, inplace=True)
            mpfile.add_data_table(identifier, table, name='raw')