Ejemplo n.º 1
0
 def from_string(data):
     """Create an MPFile from an ArchieML-formatted string.

     The string is parsed with archieml-python and each root-level key is
     post-processed: keys recognized as general by `normalize_root_level`
     are nested under the first level-01 title, all other keys are
     re-inserted under their normalized identifier. For identifier
     sections, CSV strings are converted via `read_csv` and CIF strings
     are converted into pymatgen structure dicts.

     Args:
         data: file contents in ArchieML format.

     Returns:
         The result of `MPFile.from_dict` for the post-processed dict.
     """
     # use archieml-python parse to import data
     rdct = RecursiveDict(archieml.loads(data))
     rdct.rec_update()
     # post-process internal representation of file contents
     # NOTE: iterate over a snapshot of the keys since entries are popped
     # and re-inserted inside the loop (mutating a dict while iterating its
     # live key view fails on Python 3)
     for key in list(rdct.keys()):
         is_general, root_key = normalize_root_level(key)
         if is_general:
             # make part of shared (meta-)data, i.e. nest under `general` at
             # the beginning of the MPFile
             if mp_level01_titles[0] not in rdct:
                 # keys() is not subscriptable on Python 3 -> materialize
                 first_key = list(rdct.keys())[0]
                 rdct.insert_before(
                     first_key, (mp_level01_titles[0], RecursiveDict()))
             rdct.rec_update(
                 nest_dict(rdct.pop(key), [mp_level01_titles[0], root_key]))
         else:
             # normalize identifier key (pop & insert)
             # using rec_update since we're looping over all entries
             # also: support data in bare tables (marked-up only by
             #       root-level identifier) by nesting under 'data'
             value = rdct.pop(key)
             keys = [root_key]
             if isinstance(value, list):
                 keys.append('table')
             rdct.rec_update(nest_dict(value, keys))
             # reference to the freshly inserted identifier section
             section = rdct[root_key]
             # Note: CSV section is marked with 'data ' prefix during iterate()
             for k, v in section.iterate():
                 if isinstance(k, six.string_types) and \
                    k.startswith(mp_level01_titles[1]):
                     # k = table name (incl. data prefix)
                     # v = csv string from ArchieML free-form arrays
                     table_name = k[len(mp_level01_titles[1] + '_'):]
                     pd_obj = read_csv(v)
                     section.pop(table_name)
                     section.rec_update(nest_dict(pd_obj.to_dict(), [k]))
                     section.insert_default_plot_options(pd_obj, k)
             # convert CIF strings into pymatgen structures
             if mp_level01_titles[3] in section:
                 from pymatgen.io.cif import CifParser
                 structures = section[mp_level01_titles[3]]
                 # snapshot the names: each entry is popped and re-inserted
                 for name in list(structures.keys()):
                     cif = structures.pop(name)
                     parser = CifParser.from_string(cif)
                     structure = parser.get_structures(primitive=False)[0]
                     structures.rec_update(
                         nest_dict(structure.as_dict(), [name]))
     return MPFile.from_dict(rdct)
Ejemplo n.º 2
0
    def from_string(data):
        """Create an MPFile from an ArchieML-formatted string.

        The string is parsed with `loads` and each root-level key is
        re-inserted under its normalized name (nested under the first
        level-01 title if `normalize_root_level` flags it as general).
        Within each dict section, lists of dicts are joined into a CSV
        string and converted via `read_csv`, strings containing a space
        are converted via `Quantity.from_key_value` (if `Quantity` is
        available), and CIF strings are converted into pymatgen structure
        dicts.

        Args:
            data: file contents in ArchieML format.

        Returns:
            The result of `MPFile.from_dict` for the post-processed dict.
        """
        # use archieml-python parse to import data
        rdct = RecursiveDict(loads(data))
        rdct.rec_update()

        general_title = mp_level01_titles[0]
        structures_title = mp_level01_titles[3]

        # post-process internal representation of file contents
        # (snapshot of keys since entries are popped & re-inserted below)
        for key in list(rdct.keys()):
            is_general, root_key = normalize_root_level(key)

            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if is_general and general_title not in rdct:
                rdct[general_title] = RecursiveDict()
                rdct.move_to_end(general_title, last=False)

            # normalize identifier key (pop & insert)
            # using rec_update since we're looping over all entries
            # also: support data in bare tables (marked-up only by
            #       root-level identifier) by nesting under 'data'
            value = rdct.pop(key)
            keys = [general_title] if is_general else []
            keys.append(root_key)
            if isinstance(value, list):
                keys.append("table")
            rdct.rec_update(nest_dict(value, keys))

            # reference to section to iterate or parse as CIF
            section = (rdct[general_title][root_key]
                       if is_general else rdct[root_key])

            # only dict sections can hold tables, quantities or structures;
            # skipping others avoids a substring test / index error on
            # scalar values in the CIF check below
            if not isinstance(section, dict):
                continue

            # iterate to find CSV sections to parse
            # also parse propnet quantities
            scope = []
            for k, v in section.iterate():
                # use a dedicated name to not shadow the root-level `key`
                level, sub_key = k
                sub_key = "".join([replacements.get(c, c) for c in sub_key])
                if level < len(scope):
                    # moved back up in the hierarchy -> truncate scope
                    del scope[level:]
                if v is None:
                    scope.append(sub_key)
                elif isinstance(v, list) and v and isinstance(v[0], dict):
                    # join all rows at once; the leading empty entry keeps
                    # the leading newline of the CSV string
                    table = "\n".join(
                        [""] + [row_dct["value"] for row_dct in v])
                    pd_obj = read_csv(table)
                    d = nest_dict(pd_obj.to_dict(), scope + [sub_key])
                    section.rec_update(d, overwrite=True)
                    if not is_general and level == 0:
                        section.insert_default_plot_options(pd_obj, sub_key)
                elif (Quantity is not None
                      and isinstance(v, six.string_types) and " " in v):
                    quantity = Quantity.from_key_value(sub_key, v)
                    d = nest_dict(quantity.as_dict(), scope +
                                  [sub_key])  # TODO quantity.symbol.name
                    section.rec_update(d, overwrite=True)

            # convert CIF strings into pymatgen structures
            if structures_title in section:
                from pymatgen.io.cif import CifParser

                structures = section[structures_title]
                # snapshot the names: each entry is popped and re-inserted,
                # which is not allowed while iterating the live key view
                for name in list(structures.keys()):
                    cif = structures.pop(name)
                    parser = CifParser.from_string(cif)
                    structure = parser.get_structures(primitive=False)[0]
                    structures.rec_update(
                        nest_dict(structure.as_dict(), [name]))

        return MPFile.from_dict(rdct)
Ejemplo n.º 3
0
    def to_backgrid_dict(self):
        """Backgrid-conform dict from DataFrame.

        Only the first 260 rows are exported. A MultiIndex is reset in
        place; NOTE(review): when the table is not truncated this modifies
        `self` - confirm callers rely on or tolerate that.

        Columns that are neither numeric nor named `level_*` are scanned
        cell by cell:

        - cells without a space matching `mp_id_pattern` become
          materialsproject.org URLs (column type `uri` if all cells match),
        - other cells without a space are tried as compositions, then as
          URLs; scanning of the column stops at the first cell that is
          neither,
        - cells of the form `<number> <unit>` move the unit of the first
          such cell into the column name (`<col> [<unit>]`).

        Column names containing `##` are split into a nesting group (first
        part) and a label (remainder).

        Returns:
            dict with `rows` (list of record dicts) and `columns` (list of
            column specs with `name`, `cell`, `nesting`, `editable` and
            optionally `label`/`renderable`).
        """
        # shorten global import times by importing heavy dependencies here
        import numpy as np
        from mpcontribs.io.core.utils import get_composition_from_string
        from pandas import MultiIndex
        import pymatgen.util as pmg_util
        from pymatgen.core.composition import CompositionError

        table = dict()
        nrows_max = 260
        nrows = self.shape[0]
        df = Table(self.head(n=nrows_max)) if nrows > nrows_max else self
        # determined before reset_index, hence the `level_` check below
        numeric_columns = df.select_dtypes(
            include=[np.number]).columns.tolist()

        if isinstance(df.index, MultiIndex):
            df.reset_index(inplace=True)

        table['columns'] = []
        table['rows'] = super(Table, df).to_dict(orient='records')

        for col_index, col in enumerate(list(df.columns)):
            cell_type = 'number'

            # avoid looping rows to minimize use of `df.iat` (time-consuming in 3d)
            if not col.startswith('level_') and col not in numeric_columns:
                is_url_column, prev_unit, old_col = True, None, col

                for row_index in range(df.shape[0]):
                    cell = str(df.iat[row_index, col_index])
                    cell_split = cell.split(' ', 1)

                    if not cell or len(
                            cell_split) == 1:  # empty cell or no space
                        is_url_column = bool(
                            is_url_column
                            and (not cell or mp_id_pattern.match(cell)))
                        if is_url_column:
                            if cell:
                                value = 'https://materialsproject.org/materials/{}'.format(
                                    cell)
                                table['rows'][row_index][col] = value
                        elif cell:
                            try:
                                composition = get_composition_from_string(cell)
                                composition = pmg_util.string.unicodeify(
                                    composition)
                                table['rows'][row_index][col] = composition
                            except (CompositionError, ValueError,
                                    OverflowError):
                                try:
                                    # https://stackoverflow.com/a/38020041
                                    result = urlparse(cell)
                                    if not all([
                                            result.scheme, result.netloc,
                                            result.path
                                    ]):
                                        break
                                    is_url_column = True
                                except ValueError:
                                    # urlparse signals malformed URLs with
                                    # ValueError; anything else is a real
                                    # error and must not be swallowed
                                    break

                    else:
                        value, unit = cell_split  # TODO convert cell_split[0] to float?
                        is_url_column = False
                        try:
                            float(value
                                  )  # unit is only a unit if value is number
                        except ValueError:
                            continue
                        table['rows'][row_index].pop(old_col)
                        if prev_unit is None:
                            # first unit seen defines the column's unit
                            prev_unit = unit
                            col = '{} [{}]'.format(col, unit)
                        table['rows'][row_index][
                            col] = cell if prev_unit != unit else value

                cell_type = 'uri' if is_url_column else 'string'

            col_split = col.split('##')
            nesting = [col_split[0]] if len(col_split) > 1 else []
            table['columns'].append({
                'name': col,
                'cell': cell_type,
                'nesting': nesting,
                'editable': 0
            })
            if len(col_split) > 1:
                table['columns'][-1].update(
                    {'label': '##'.join(col_split[1:])})
            if len(table['columns']) > 12:
                table['columns'][-1]['renderable'] = 0

        # collect nested columns whose sub-label is of the form `name unit`
        header = RecursiveDict()
        for idx, col in enumerate(table['columns']):
            if 'label' in col:
                # split on the first separator only: the label itself may
                # contain further `##` (see label construction above)
                k, sk = col['name'].split('##', 1)
                sk_split = sk.split()
                if len(sk_split) == 2:
                    d = {'name': sk_split[0], 'unit': sk_split[1], 'idx': idx}
                    if k not in header:
                        header[k] = [d]
                    else:
                        header[k].append(d)
                elif k in header:
                    header.pop(k)

        # if all sub-columns of a group share one unit, move the unit from
        # the sub-column labels to the group label
        for k, skl in header.items():
            units = [sk['unit'] for sk in skl]
            if units.count(units[0]) == len(units):
                for sk in skl:
                    table['columns'][sk['idx']]['label'] = sk['name']
                    table['columns'][sk['idx']]['nesting'][0] = '{} {}'.format(
                        k, sk['unit'])

        return table