Python BasisSet.groupby Examples

Programming Language: Python

Namespace/Package Name: exatomic.core.basis

Class/Type: BasisSet

Method/Function: groupby

Examples at hotexamples.com: 6

Python BasisSet.groupby - 6 examples found. These are the top-rated real-world Python examples of exatomic.core.basis.BasisSet.groupby extracted from open-source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

groupby(5)

BasisSet(4)

functions(3)

functions_by_shell(1)

primitives(1)

primitives_by_shell(1)

shells(1)

Example #1

Show file

File: output.py Project: tjduigna/exatomic

class Output(six.with_metaclass(OutMeta, Editor)):
    """Editor for NWChem calculation output file (stdout)."""

    def parse_atom(self):
        """Parse the atom dataframe.

        Reads every table located between a ``Geometry "`` header and the
        following ``Atomic Mass`` line, derives an element symbol and an
        effective nuclear charge for each atom, converts the coordinates to
        atomic units and stores the result on ``self.atom``.
        """
        _reatom01 = 'Geometry "'
        _reatom02 = 'Atomic Mass'
        _reatom03 = 'ECP       "ecp basis"'
        _reatom04 = 'Output coordinates in'
        found = self.find(_reatom01, _reatom02,
                          _reatom03, _reatom04, keys_only=True)
        # The fourth token of the first "Output coordinates in" line is the unit.
        unit = self[found[_reatom04][0]].split()[3]
        unit = "Angstrom" if unit == "angstroms" else "au"
        starts = np.array(found[_reatom01]) + 7
        stops = np.array(found[_reatom02]) - 1
        # Map the first token (atom tag) of the line two below each ECP header
        # to the integer stored in that line's fourth token.
        ecps = {}
        for ln in np.array(found[_reatom03]) + 2:
            fields = self[ln].split()
            ecps[fields[0]] = int(fields[3])
        columns = ['label', 'tag', 'Z', 'x', 'y', 'z']
        atom = pd.concat([self.pandas_dataframe(s, e, columns)
                          for s, e in zip(starts, stops)])
        # Use an explicit ASCII letter class: the range ``A-z`` also matches
        # the punctuation that sits between ``Z`` and ``a`` (``[ \ ] ^ _`` and
        # the backtick), which would leak into the element symbol.
        atom['symbol'] = atom['tag'].str.extract('([A-Za-z]{1,})([0-9]*)',
                                                 expand=False)[0].str.lower().str.title()
        atom['Z'] = atom['Z'].astype(np.int64)
        # Subtract the ECP value for tags that have one (0 otherwise).
        atom['Zeff'] = (atom['Z'] - atom['tag'].map(ecps).fillna(value=0)).astype(np.int64)
        nf = atom.label.value_counts().max()
        nat = atom.label.max()
        # Frame index: nat consecutive rows per frame.
        atom['frame'] = np.repeat(np.arange(nf), nat)
        atom['label'] -= 1
        atom['x'] *= Length[unit, 'au']
        atom['y'] *= Length[unit, 'au']
        atom['z'] *= Length[unit, 'au']
        if atom['frame'].max() > 0:
            # With more than one frame the last one is discarded.
            # NOTE(review): presumably the final geometry is printed twice -- confirm.
            li = atom['frame'].max()
            # Build a fresh frame instead of mutating a filtered slice in place.
            atom = atom[atom['frame'] != li].reset_index(drop=True)
        del atom['label']
        self.atom = Atom(atom)

    def parse_orbital(self):
        """Parse the :class:`~exatomic.core.orbital.Orbital` dataframe.

        When any "Molecular Orbital Analysis" header line mentions ``Alpha``
        the alpha and beta sections are parsed separately and tagged with
        ``spin`` 0 and 1; otherwise a single set of orbitals with ``spin`` 0
        is parsed.  The result is stored on ``self.orbital``.
        """
        _remo01 = 'Molecular Orbital Analysis'
        _remo02 = 'alpha - beta orbital overlaps'
        _remo03 = 'center of mass'
        check = self.find(_remo01)
        # ``check`` holds (line number, line text) pairs.  The substring test
        # must be applied to the text; testing the pair itself is a tuple
        # membership check that never matches a full header line.
        if any('Alpha' in line for _, line in check):
            alpha_starts = np.array([no for no, line in check if 'Alpha' in line], dtype=np.int64) + 2
            alpha_stops = np.array([no for no, line in check if 'Beta' in line], dtype=np.int64) - 1
            # The beta section begins two lines below its own header line.
            beta_starts = alpha_stops + 3
            beta_stops = np.array(self.find(_remo02, keys_only=True), dtype=np.int64) - 1
            alpha_orbital = self._parse_orbital(alpha_starts, alpha_stops)
            beta_orbital = self._parse_orbital(beta_starts, beta_stops)
            alpha_orbital['spin'] = 0
            beta_orbital['spin'] = 1
            orbital = pd.concat((alpha_orbital, beta_orbital), ignore_index=True)
        else:
            starts = np.array(list(zip(*check))[0], dtype=np.int64) + 2
            stops = np.array(self.find(_remo03, keys_only=True), dtype=np.int64) - 1
            orbital = self._parse_orbital(starts, stops)
            orbital['spin'] = 0
        orbital['group'] = 0
        self.orbital = Orbital(orbital)

    def parse_momatrix(self):
        """
        Parse the :class:`~exatomic.core.orbital.MOMatrix` dataframe.

        The block between the first "Final MO vectors" line and the first
        "center of mass" line is read as a fixed-width table, cut into chunks
        of ``len(self.basis_set_order)`` rows and flattened column by column.
        Nothing is done when no "Final MO vectors" line is present.

        Note:
            Must supply 'print "final vectors" "final vectors analysis"' for momatrix
        """
        header, footer = "Final MO vectors", "center of mass"
        hits = self.find(header, footer)
        if not hits[header]:
            return
        first = hits[header][0][0] + 6
        last = hits[footer][0][0] - 1
        text = "\n".join(self[first:last])
        c = pd.read_fwf(StringIO(text), widths=(6, 12, 12, 12, 12, 12, 12),
                        names=list(range(7)))
        # The raw table is kept on the instance, as before.
        self.c = c
        # Keep only the rows whose first field is populated, then discard
        # that first field.
        table = c[c[0].notnull()].drop(0, axis=1)
        nbas = len(self.basis_set_order)
        nchunk = table.shape[0] // nbas
        # Manual equivalent of splitting the table into ``nchunk`` pieces of
        # ``nbas`` rows each; every piece is flattened in column-major order.
        pieces = [
            table.iloc[k * nbas:(k + 1) * nbas, :]
                 .astype(float).dropna(axis=1).values.ravel("F")
            for k in range(nchunk)
        ]
        coef = np.concatenate(pieces)
        del pieces
        orbital, chi = _square_indices(nbas)
        self.momatrix = MOMatrix.from_dict({'coef': coef, 'chi': chi,
                                            'orbital': orbital, 'frame': 0})



    def _parse_orbital(self, starts, stops):
        '''
        This function actually performs parsing of :class:`~exatomic.orbital.Orbital`

        Args:
            starts: First line number of each orbital-analysis section.
            stops: Line number one past the end of each section, paired with
                ``starts``.

        Returns:
            A :class:`pandas.DataFrame` with one row per ``Vector`` entry and
            columns ``x``, ``y``, ``z``, ``frame``, ``vector`` (zero based),
            ``occupation`` and ``energy``.

        See Also:
            :func:`~exnwchem.output.Output.parse_orbital`
        '''
        joined = '\n'.join(['\n'.join(self[s:e]) for s, e in zip(starts, stops)])
        nvec = joined.count('Vector')
        # NOTE(review): nothing below reads the parsed basis set; the call is
        # kept only so that the side effects of this method stay the same.
        if 'spherical' not in self.meta:
            self.parse_basis_set()
        # Orbital dataframe -- alternatively one could parse the strings
        # into the DataFrame and then use the pd.Series.str methods to
        # perform all the replacements at the same time, eg. 'D' --> 'E'
        # and 'Occ=' --> '', etc.
        orb_no = np.empty((nvec, ), dtype=np.int64)
        occ = np.empty((nvec, ), dtype=np.float64)
        nrg = np.empty((nvec, ), dtype=np.float64)
        x = np.empty((nvec, ), dtype=np.float64)
        y = np.empty((nvec, ), dtype=np.float64)
        z = np.empty((nvec, ), dtype=np.float64)
        frame = np.empty((nvec, ), dtype=np.int64)
        fc = -1   # Frame counter
        oc = 0   # Orbital counter
        for s, e in zip(starts, stops):
            fc += 1
            for line in self[s:e]:
                ls = line.split()
                if 'Vector' in line:
                    orb_no[oc] = ls[1]
                    # Fortran 'D' exponents are rewritten so they parse as floats.
                    occ[oc] = ls[2].replace('Occ=', '').replace('D', 'E')
                    # A negative energy is glued to 'E=' in one token;
                    # otherwise the value is the following token.
                    nrg[oc] = ls[3].replace('E=', '').replace('D', 'E') if 'E=-' in line else ls[4].replace('D', 'E')
                    frame[oc] = fc
                elif 'MO Center' in line:
                    x[oc] = ls[2].replace(',', '').replace('D', 'E')
                    y[oc] = ls[3].replace(',', '').replace('D', 'E')
                    z[oc] = ls[4].replace(',', '').replace('D', 'E')
                    # The 'MO Center' line closes the record for this vector.
                    oc += 1
        orb_no -= 1
        # 'y' must carry the parsed y coordinates (it used to receive z).
        return pd.DataFrame.from_dict({'x': x, 'y': y, 'z': z, 'frame': frame,
                                       'vector': orb_no, 'occupation': occ, 'energy': nrg})

    def parse_basis_set(self):
        """
        Parse the :class:`~exatomic.core.basis.BasisSet` dataframe.

        Reads the first ``Basis "`` section of the output as a fixed-width
        table, assigns every remaining row to an integer basis ``set`` and
        stores the result on ``self.basis_set``.  Also records
        ``self.meta['spherical']`` and adds a ``set`` column to ``self.atom``
        by mapping the atom tags onto the set indices.  ``parse_atom`` is run
        first when ``self.atom`` does not exist yet.
        """
        if not hasattr(self, "atom"):
            self.parse_atom()
        _rebas01 = ' Basis "'
        _rebas02 = ' Summary of "'
        # NOTE(review): _rebas03 is not referenced anywhere in this method.
        _rebas03 = [' s ', ' px ', ' py ', ' pz ',
                    ' d ', ' f ', ' g ', ' h ', ' i ',
                    ' j ', ' k ', ' l ', ' m ', ' p ']
        found = self.find(_rebas01, _rebas02)
        # The text of the first basis header line says whether it is spherical.
        spherical = True if "spherical" in found[_rebas01][0][1] else False
        start = found[_rebas01][0][0] + 2
        # Use the second "Summary of" match when several exist, else the only one.
        idx = 1 if len(found[_rebas02]) > 1 else -1
        stop = found[_rebas02][idx][0] - 1
        # Read in all of the extra lines that contain ---- and tag names
        df = pd.read_fwf(StringIO("\n".join(self[start:stop])),
                         widths=(4, 2, 16, 16),
                         names=("shell", "L", "alpha", "d"))
        # Separator rows become NaN so the dropna() below removes them.
        df.loc[df['shell'] == "--", "shell"] = np.nan
        # Rows whose "shell" field is not a digit string hold the tag names.
        tags = df.loc[(df['shell'].str.isdigit() == False), "shell"]
        idxs = tags.index.tolist()
        idxs.append(len(df))
        df['set'] = ""
        # Label every row from one tag row up to the next one with that tag.
        # The .loc slice is inclusive, so the boundary row is overwritten by
        # the following iteration.
        for i, tag in enumerate(tags):
            df.loc[idxs[i]:idxs[i + 1], "set"] = tag
        df = df.dropna().reset_index(drop=True)
        # tag name -> integer set index, in order of first appearance
        mapper = {v: k for k, v in dict(enumerate(df['set'].unique())).items()}
        df['set'] = df['set'].map(mapper)
        df['L'] = df['L'].str.strip().str.lower().map(lmap)
        df['alpha'] = df['alpha'].astype(float)
        df['d'] = df['d'].astype(float)
        # NO SUPPORT FOR MULTIPLE FRAMES?
        df['frame'] = 0
        self.basis_set = BasisSet(df)
        self.meta['spherical'] = spherical
        self.atom['set'] = self.atom['tag'].map(mapper)

    def parse_basis_set_order(self):
        """Build the basis-function ordering table and store it on
        ``self.basis_set_order``.

        One row is produced per basis function, holding its ``center``,
        ``shell``, ``L`` and either ``ml`` (spherical) or ``l``, ``m``, ``n``
        (cartesian), plus a constant ``frame`` of 0.  ``parse_basis_set`` is
        run first when ``self.meta`` has no ``'spherical'`` entry.
        """
        if 'spherical' not in self.meta:
            self.parse_basis_set()
        spherical = self.meta['spherical']
        dtype = [('center', 'i8'), ('shell', 'i8'), ('L', 'i8')]
        if spherical:
            dtype += [('ml', 'i8')]
        else:
            dtype += [('l', 'i8'), ('m', 'i8'), ('n', 'i8')]
        # Number of functions per basis set, summed over all atoms.
        mapper = self.basis_set.functions(spherical).groupby(level="set").sum()
        nbas = self.atom['set'].map(mapper).sum()
        bso = np.empty((nbas,), dtype=dtype)
        cnt = 0
        bases = self.basis_set.groupby('set')
        for seht, center in zip(self.atom['set'], self.atom.index):
            for shell, grp in bases.get_group(seht).groupby('shell'):
                l = grp['L'].values[0]
                if spherical:
                    for ml in spherical_ordering_function(l):
                        bso[cnt] = (center, shell, l, ml)
                        cnt += 1
                else:
                    for _, ll, m, n in cartesian_ordering_function(l):
                        bso[cnt] = (center, shell, l, ll, m, n)
                        cnt += 1
        bso = pd.DataFrame(bso)
        bso['frame'] = 0
        # New shell definition consistent with basis internals: the shell
        # index of a function is the number of earlier functions with the same
        # center, L and function label.  cumcount() keeps each value aligned
        # with its own row, so shells that are not sorted by L stay correct,
        # and the cartesian label (l, m, n) is used when there is no 'ml'.
        label = ['ml'] if spherical else ['l', 'm', 'n']
        bso['shell'] = bso.groupby(['center', 'L'] + label).cumcount()
        self.basis_set_order = bso

    def parse_frame(self):
        """
        Create a minimal :class:`~exatomic.core.frame.Frame` from the (parsed)
        :class:`~exatomic.core.atom.Atom` object.

        When "Total SCF energy" or "Total DFT energy" lines are present, the
        last token of each line is stored in the ``total_energy`` column.
        DFT values win when both kinds are found.
        """
        scf_key, dft_key = 'Total SCF energy', 'Total DFT energy'
        self.frame = compute_frame_from_atom(self.atom)
        hits = self.find(scf_key, dft_key)
        scf_lines, dft_lines = hits[scf_key], hits[dft_key]
        if not (scf_lines or dft_lines):
            return
        if scf_lines and dft_lines:
            print('Warning: found total energies from scf and dft, using dft')
        chosen = dft_lines if dft_lines else scf_lines
        self.frame['total_energy'] = [float(text.split()[-1])
                                      for _, text in chosen]


    def __init__(self, *args, **kwargs):
        """Forward all positional and keyword arguments to the parent initializer."""
        super(Output, self).__init__(*args, **kwargs)

Example #2

Show file

File: output.py Project: farnoushnouri/exatomic

class Output(six.with_metaclass(OutMeta, Editor)):
    """The ADF output parser."""
    def parse_atom(self):
        """Parse one table of atomic positions into ``self.atom``.

        The table starts two lines below the header and ends at the first
        blank line.  Coordinates are converted from Angstrom to atomic units.
        """
        # TODO : only supports single frame, gets last atomic positions
        # NOTE(review): index [0] selects the first header match -- confirm
        # against the TODO above.
        header = 'Atoms in this Fragment     Cart. coord.s (Angstrom)'
        first = self.find(header, keys_only=True)[0] + 2
        last = first
        while self[last].strip():
            last += 1
        table = self.pandas_dataframe(first, last, 7)
        table.drop([0, 2, 3], axis=1, inplace=True)
        table.columns = ['symbol', 'x', 'y', 'z']
        to_au = Length['Angstrom', 'au']
        for axis in ('x', 'y', 'z'):
            table[axis] *= to_au
        table['Z'] = table['symbol'].map(sym2z)
        table['frame'] = 0
        self.atom = table

    def parse_basis_set(self):
        """Parse the Slater-type basis function listing.

        Stores a :class:`BasisSet` on ``self.basis_set``, sets
        ``self.meta['spherical']`` to ``False`` and adds a ``set`` column to
        ``self.atom`` by mapping atom symbols onto basis set indices.
        """
        # Find the basis set
        _re_bas_00 = '(Slater-type)  F U N C T I O N S'
        _re_bas_01 = 'Atom Type'
        # Work from three lines below the last function-listing header.
        start = self.find(_re_bas_00, keys_only=True)[-1] + 3
        starts = self.find(_re_bas_01, start=start, keys_only=True)
        lines = []
        # NOTE(review): the matches are added to ``start`` below, which implies
        # they are offsets relative to it -- confirm against Editor.find.
        for ext in starts:
            # Always take the four lines beginning at the "Atom Type" line ...
            for i in range(4):
                lines.append(start + ext + i)
            # ... then every following line up to the first blank one.
            stop = start + ext + 4
            while self[stop].strip():
                lines.append(stop)
                stop += 1
        df = pd.read_fwf(StringIO('\n'.join([self[i] for i in lines])),
                         widths=[4, 2, 12, 4],
                         names=['n', 'L', 'alpha', 'symbol'])
        # Where atom types change
        idxs = [0] + df['n'][df['n'] == '---'].index.tolist() + [df.shape[0]]
        sets, shells = [], []
        # ``start``/``stop`` are reused here as row bounds of each chunk.
        # Both counters begin at -1; the ``+ 1`` in ``basmap`` below offsets
        # the set value read from the atom-type rows.
        for i, (start, stop) in enumerate(zip(idxs, idxs[1:])):
            sets.append(np.repeat(i - 1, stop - start))
            shells.append(np.arange(-1, stop - start - 1))
        df['set'] = np.concatenate(sets)
        df['shell'] = np.concatenate(shells)
        # Atom table basis set map
        basmap = df['symbol'].dropna()
        # Keep the entries ending in ')' and strip that parenthesis.
        basmap = basmap[basmap.str.endswith(')')].str.strip(')')
        basmap = {
            val: df['set'][key] + 1
            for key, val in basmap.to_dict().items()
        }
        # Discard the garbage
        # (rows whose 'n' field is not numeric)
        drop = df['n'].str.strip().str.isnumeric().fillna(False)
        df.drop(drop[drop == False].index, inplace=True)
        df.drop('symbol', axis=1, inplace=True)
        # Clean up the series
        df['alpha'] = df['alpha'].astype(np.float64)
        df['n'] = df['n'].astype(np.int64)
        df['L'] = df['L'].str.lower().map(lmap)
        df['d'] = np.sqrt((2 * df['L'] + 1) / (4 * np.pi))
        df['r'] = df['n'] - (df['L'] + 1)
        df['frame'] = 0
        self.basis_set = BasisSet(df)
        self.meta['spherical'] = False
        self.atom['set'] = self.atom['symbol'].map(basmap)

    def parse_basis_set_order(self):
        """Build the basis-function ordering table on ``self.basis_set_order``.

        For each atom, each ``L`` group of its basis set and each cartesian
        triple from ``enum_cartesian[L]``, one row is emitted per
        (``shell``, ``r``) pair.  A ``prefac`` column is then derived from
        ``dfac21`` applied to ``L``, ``l``, ``m`` and ``n``.
        """
        # Column name -> list of values, filled row by row
        data = defaultdict(list)
        by_set = self.basis_set.groupby('set')
        atoms = zip(self.atom.index, self.atom['symbol'], self.atom['set'])
        for center, symbol, seht in atoms:
            # Functions of this atom's basis set, grouped by L
            for L, grp in by_set.get_group(seht).groupby('L'):
                # Cartesian powers first, shells innermost
                for l, m, n in enum_cartesian[L]:
                    for shell, r in zip(grp['shell'], grp['r']):
                        record = (('center', center), ('symbol', symbol),
                                  ('shell', shell), ('seht', seht),
                                  ('L', L), ('l', l), ('m', m), ('n', n),
                                  ('r', r))
                        for column, value in record:
                            data[column].append(value)
        data['set'] = data.pop('seht')
        data['frame'] = 0
        self.basis_set_order = pd.DataFrame.from_dict(data)
        dfac = {column: self.basis_set_order[column].apply(dfac21)
                for column in ('L', 'l', 'm', 'n')}
        ratio = dfac['L'] / (dfac['l'] * dfac['m'] * dfac['n'])
        self.basis_set_order['prefac'] = ratio.apply(np.sqrt)

    def parse_orbital(self):
        """Parse the orbital energy table into ``self.orbital``.

        The "both Spins" table is used when it is present, otherwise the
        "per Irrep and Spin" table.  The table is read from five lines below
        the last matching header up to the first blank line.
        """
        both_spins = 'Orbital Energies, both Spins'
        per_irrep = 'Orbital Energies, per Irrep and Spin'
        hits = self.find(both_spins, per_irrep, keys_only=True)
        # Open shell vs. closed shell
        if hits[both_spins]:
            key = both_spins
            names = ['symmetry', 'vector', 'spin', 'occupation', 'energy', 'eV']
        else:
            key = per_irrep
            names = ['vector', 'occupation', 'energy', 'eV', 'dE']
        first = hits[key][-1] + 5
        last = first
        while self[last].strip():
            last += 1
        table = self.pandas_dataframe(first, last, names)
        table['vector'] -= 1
        if 'spin' in names:
            table['spin'] = table.spin.map({'A': 0, 'B': 1})
            table.sort_values(by=['spin', 'energy'], inplace=True)
        else:
            table.sort_values(by='energy', inplace=True)
            table['spin'] = 0
        table.reset_index(drop=True, inplace=True)
        table['frame'] = 0
        table['group'] = 0
        self.orbital = table

    def parse_contribution(self):
        """Parse the MO contribution tables into ``self.contribution``.

        One fixed-width table is read per header match; the position of the
        match is stored in the ``spin`` column.
        """
        _re_con_00 = ('E(eV)  Occ       MO           %     '
                      'SFO (first member)   E(eV)  Occ   Fragment')
        # MO contribution by percentage
        found = self.find(_re_con_00, keys_only=True)
        # Each table begins three lines below its header.
        starts = [i + 3 for i in found]
        widths = [12, 6, 6, 6, 11, 6, 10, 12, 6, 6, 3]
        names = [
            'eV', 'occupation', 'vector', 'sym', '%', 'SFO', 'angmom',
            'eV(sfo)', 'occ(sfo)', 'atom', 'symbol'
        ]
        dfs = []
        # Prints for both spins
        for i, start in enumerate(starts):
            # The table runs until the first blank line.
            stop = start
            while self[stop].strip():
                stop += 1
            dfs.append(
                pd.read_fwf(StringIO('\n'.join(self[start:stop])),
                            delim_whitespace=True,
                            widths=widths,
                            names=names))
            dfs[-1]['spin'] = i
        dfs = pd.concat(dfs).reset_index(drop=True)
        # Whitespace-only strings become NaN and are then forward filled.
        dfs = dfs.applymap(lambda x: np.nan if (isinstance(
            x, six.string_types) and x.isspace()) else x)
        dfs.fillna(method='ffill', inplace=True)
        # Clean up
        dfs['symbol'] = dfs['symbol'].str.strip()
        dfs['angmom'] = dfs['angmom'].str.strip()
        # A bare 'S' gets a trailing ':' so the split pattern below matches it.
        dfs['angmom'].update(dfs['angmom'].map({'S': 'S:'}))
        dfs[['L', 'ml']] = dfs['angmom'].str.extract('(.*):(.*)', expand=True)
        dfs['%'] = dfs['%'].str.replace('%', '')
        # An all-asterisk field is replaced by infinity.
        dfs['%'].update(dfs['%'].map({"    ******": np.inf}))
        dfs['%'] = dfs['%'].astype(np.float64)
        dfs['occupation'] = dfs['occupation'].astype(np.float64)
        dfs['vector'] = dfs['vector'].astype(np.int64) - 1
        dfs['eV'] = dfs['eV'].astype(np.float64)
        dfs['atom'] -= 1
        self.contribution = dfs

    def parse_excitation(self):
        """Parse the excitation tables into ``self.excitation``.

        Two tables are read: the per-transition table that follows the
        transition dipole moment header, and the energy table that follows
        it.  The columns of the second are mapped onto the first by
        excitation index.  Returns without setting anything when the first
        header is not found.
        """
        # Excitation
        _re_exc_00 = '(sum=1) transition dipole moment'
        _re_exc_01 = ' no.     E/a.u.        E/eV      f           Symmetry'
        found = self.find_next(_re_exc_00, keys_only=True)
        if not found: return
        # First table of interest here
        start = found + 4
        stop = self.find_next(_re_exc_01, keys_only=True) - 3
        # Nine fields on the first row mean an extra 'spin' column is present.
        # NOTE(review): the local name ``os`` shadows the stdlib module name
        # inside this method.
        os = len(self[start].split()) == 9
        todrop = ['occ:', 'virt:']
        cols = [
            'excitation', 'occ', 'drop', 'virt', 'weight', 'TDMx', 'TDMy',
            'TDMz'
        ]
        if os: cols.insert(1, 'spin')
        if os: todrop = ['occ', 'virt']
        adf = self.pandas_dataframe(start, stop, cols)
        adf.drop('drop', axis=1, inplace=True)
        # Drop rows flagged 'NTO' in the second column and rows whose first
        # field is one of the ``todrop`` labels.
        s1 = set(adf[cols[1]][adf[cols[1]] == 'NTO'].index)
        s2 = set(adf['excitation'][adf['excitation'].isin(todrop)].index)
        adf.drop(s1 | s2, axis=0, inplace=True)
        # Strip the trailing character and make the index zero based.
        adf['excitation'] = adf['excitation'].str[:-1].astype(np.int64) - 1
        if os: adf['spin'] = adf['spin'].map({'Alph': 0, 'Beta': 1})
        # Split e.g. "<digits><rest>" into the orbital number and its suffix.
        adf[['occ', 'occsym']] = adf['occ'].str.extract('([0-9]*)(.*)',
                                                        expand=True)
        adf[['virt', 'virtsym']] = adf['virt'].str.extract('([0-9]*)(.*)',
                                                           expand=True)
        adf['occ'] = adf['occ'].astype(np.int64) - 1
        adf['virt'] = adf['virt'].astype(np.int64) - 1
        # Second one here
        start = stop + 5
        stop = start
        while self[stop].strip():
            stop += 1
        cols = _re_exc_01.split()
        df = self.pandas_dataframe(start, stop + 1, cols)
        df.drop(cols[0], axis=1, inplace=True)
        df.columns = ['energy', 'eV', 'osc', 'symmetry']
        # Expand the second table to fit the original
        for col in df.columns:
            adf[col] = adf.excitation.map(df[col])
        adf['frame'] = adf['group'] = 0
        self.excitation = adf

    def parse_momatrix(self):
        """Parse the eigenvectors printed in BAS representation into
        ``self.momatrix``.

        Returns without setting anything when the section header or its
        ``row`` lines are missing.  When no ``nosym`` match exists only a
        message is printed.
        """
        _re_mo_00 = 'Eigenvectors .* in BAS representation'
        _re_mo_01 = 'row '
        _re_mo_02 = 'nosym'
        found = self.regex(_re_mo_00,
                           _re_mo_01,
                           _re_mo_02,
                           flags=re.IGNORECASE,
                           keys_only=True)
        if not found[_re_mo_00] or not found[_re_mo_01]: return
        if found[_re_mo_02]:
            thresh = found[_re_mo_00][0]
            # The header line states whether the vectors are printed as rows.
            rowmajor = 'rows' in self[thresh]
            # Only 'row ' lines after the header belong to this section.
            starts = np.array([i for i in found[_re_mo_01] if i > thresh]) + 1
            # Spacing between consecutive blocks, less three non-data lines.
            nchi = starts[1] - starts[0] - 3
            ncol = len(self[starts[0] + 1].split()) - 1
            # An odd number of blocks cannot be two equal spin halves.
            if len(starts) % 2: os = False
            else:
                # Look for a 'SPIN 2' marker between the two halves.
                anchor = starts[len(starts) // 2 - 1] + nchi
                sail = starts[len(starts) // 2]
                os = True if self.find('SPIN 2', start=anchor,
                                       stop=sail) else False
            blocks = [starts] if not os else [
                starts[:len(starts) // 2], starts[len(starts) // 2:]
            ]
            data = pd.DataFrame()
            for i, block in enumerate(blocks):
                stop = block[-1] + nchi
                # Three lines are skipped ahead of every block after the first.
                skips = [
                    k + j for k in list(block[1:] - block[0] - 3)
                    for j in range(3)
                ]
                # First block -> 'coef', second block -> 'coef1'
                name = 'coef' if not i else 'coef{}'.format(i)
                col = self.pandas_dataframe(
                    block[0], stop, ncol + 1, skiprows=skips).drop(
                        0,
                        axis=1,
                    ).unstack().dropna().reset_index(drop=True)
                data[name] = col
            norb = len(data.index) // nchi
            data['orbital'] = np.concatenate(
                [np.repeat(range(i, norb, ncol), nchi) for i in range(ncol)])
            data['chi'] = np.tile(range(nchi), norb)
            data['frame'] = 0
            if rowmajor:
                # Swap the two index columns for the row-wise layout.
                data.rename(columns={
                    'orbital': 'chi',
                    'chi': 'orbital'
                },
                            inplace=True)
                data.sort_values(by=['orbital', 'chi'], inplace=True)
            self.momatrix = data
        else:
            print('Symmetrized calcs not supported yet.')

    def parse_sphr_momatrix(self, verbose=False):
        """
        Parse the localized momatrix (if present).

        If the ``locorb`` keyword is used in ADF, an additional momatrix is
        printed after localization is performed. Parsing this table allows
        for visualization of these orbitals.

        Args:
            verbose: When true, print a message if no localized momatrix is
                found.

        Note:
            The attr :attr:`~exatomic.adf.output._re_loc_mo` is used for parsing this
            section.
        """
        _re_loc_mo = ("Localized MOs expanded in CFs+SFOs",
                      "SFO contributions (%) per Localized Orbital")
        found = self.find(*_re_loc_mo)
        if len(found[_re_loc_mo[0]]) == 0:
            if verbose:
                print("No localization performed.")
            return  # Nothing to parse
        start = found[_re_loc_mo[0]][0][0] + 8
        stop = found[_re_loc_mo[1]][0][0] - 4
        # Parse the localized momatrix as a whole block of text
        df = pd.read_fwf(StringIO("\n".join(self[start:stop])),
                         widths=(16, 9, 9, 9, 9, 9, 9, 9, 9),
                         header=None)
        # The first fixed-width field is discarded.
        del df[0]
        # Identify the eigenvectors and (un)stack them correctly
        n = df[df[1].isnull()].index[0]  # number of basis functions
        m = np.ceil(df.shape[0] / n).astype(
            int)  # number of printed blocks of text
        # idx - indexes of "lines" (rows) that don't contain coefficients
        idx = [(n + 5) * j + i - 5 for j in range(1, m) for i in range(0, 5)]
        df = df[~df.index.isin(idx)]
        coefs = []
        # For i == 0 the slice bounds are (-n, 0), which selects no rows;
        # the chunks of n rows are taken for i >= 1.
        for i in range(0, df.shape[0] // n + 1):
            d = df.iloc[n * (i - 1):n * i, :]
            coefs.append(d.unstack().dropna().values.astype(float))
        coefs = np.concatenate(coefs)
        m = coefs.shape[0] // n  # Number of localized MOs
        momatrix = pd.DataFrame.from_dict({
            'coef':
            coefs,
            'orbital': [i for i in range(m) for _ in range(n)],
            'chi': [j for _ in range(m) for j in range(n)]
        })
        # Tag with the last distinct frame value of the atom table.
        momatrix['frame'] = self.atom['frame'].unique()[-1]
        self.sphr_momatrix = momatrix

    def __init__(self, *args, **kwargs):
        """Forward all positional and keyword arguments to the parent initializer."""
        super(Output, self).__init__(*args, **kwargs)

Example #3

Show file

File: output.py Project: wgong/exatomic

class Output(six.with_metaclass(OutMeta, Editor)):
    """The ADF output parser."""
    def parse_atom(self):
        """
        Parse the atom table and store it on ``self.atom``.

        Three layouts are tried in order and the first one found wins:

        1. ``Coordinates (Cartesian)`` blocks: one frame per block found,
           coordinates are used as read.
        2. The last ``ATOMS`` block: a single frame, coordinates scaled
           with ``Length['Angstrom', 'au']``.
        3. ``NUCLEAR COORDINATES`` blocks (matched case-insensitively): one
           frame per block, coordinates scaled with
           ``Length['Angstrom', 'au']``.

        In every branch the ``set`` column is shifted to be zero-based and
        ``Z`` is looked up from the element symbol through ``sym2z``.

        Raises:
            NotImplementedError: If none of the three markers is found.
        """
        # TODO : only supports single frame, gets last atomic positions
        #        this will actually get the very first coordinates
        #_re_atom_00 = 'Atoms in this Fragment     Cart. coord.s (Angstrom)'
        _re_atom_00 = 'ATOMS'
        found1 = self.find(_re_atom_00, keys_only=True)
        # Use regex instead of find because a similar search string shows up
        # in NMR and CPL calculations for the nuclear coordinates.
        _reatom = "(?i)NUCLEAR COORDINATES"
        found2 = self.regex(_reatom, keys_only=True)
        # Marker used to locate the optimized frames.
        _reopt = "Coordinates (Cartesian)"
        found_opt = self.find(_reopt, keys_only=True)
        if found_opt:
            starts = np.array(found_opt) + 6
            # Measure the length of the first table only; the same length is
            # then applied to every other table.
            stop = starts[0]
            while '------' not in self[stop]:
                stop += 1
            stops = starts + stop - starts[0]
            dfs = []
            for idx, (start, stop) in enumerate(zip(starts, stops)):
                # Parse all 11 columns as they may be useful in the future.
                df = self.pandas_dataframe(start, stop, ncol=11)
                # Keep only the first five columns.
                df.drop(list(range(5, 11)), axis='columns', inplace=True)
                # The coordinates are read in bohr so no conversion is needed.
                df.columns = ['set', 'symbol', 'x', 'y', 'z']
                df['set'] = df['set'].astype(int)
                df['Z'] = df['symbol'].map(sym2z)
                df['frame'] = idx
                df['set'] -= 1
                dfs.append(df)
            atom = pd.concat(dfs, ignore_index=True)
        elif found1:
            # Only the last match is used; the table ends at the first blank line.
            start = stop = found1[-1] + 4
            while self[stop].strip():
                stop += 1
            atom = self.pandas_dataframe(start, stop, ncol=8)
            atom.drop(list(range(5, 8)), axis='columns', inplace=True)
            atom.columns = ['set', 'symbol', 'x', 'y', 'z']
            for c in ['x', 'y', 'z']:
                atom[c] *= Length['Angstrom', 'au']
            atom['Z'] = atom['symbol'].map(sym2z)
            atom['set'] -= 1
            atom['frame'] = 0
        elif found2:
            #if len(found) > 1:
            #    raise NotImplementedError("We can only parse outputs from a single NMR calculation")
            atom = []
            for idx, val in enumerate(found2):
                start = val + 3
                stop = start
                while self[stop].strip():
                    stop += 1
                # A bit of a hack to make sure the formatting does not change with the
                # number of atoms in the molecule: the index is right justified, so with
                # more than 100 atoms it fills the allotted space for the atom index and
                # changes the delimiter and therefore the number of columns.
                # NOTE: this rewrites the editor's own lines in place.
                self[start:stop] = map(lambda x: x.replace('(', ''),
                                       self[start:stop])
                df = self.pandas_dataframe(start, stop, ncol=5)
                df.columns = ['symbol', 'set', 'x', 'y', 'z']
                for c in ['x', 'y', 'z']:
                    df[c] *= Length['Angstrom', 'au']
                df['Z'] = df['symbol'].map(sym2z)
                df['frame'] = idx
                # Remove the trailing characters from the index.
                df['set'] = list(map(lambda x: x.replace('):', ''), df['set']))
                df['set'] = df['set'].astype(int) - 1
                atom.append(df)
            # NOTE(review): unlike the optimized-frames branch, this concat keeps
            # the per-frame row labels (no ignore_index) - confirm that is intended.
            atom = pd.concat(atom)
        else:
            raise NotImplementedError("We could not find the atom table in this output. Please submit "+ \
                                      "an issue ticket so we can add it in.")
        self.atom = atom

    def parse_basis_set(self):
        """
        Parse the Slater-type basis function listing into ``self.basis_set``.

        Besides building the :class:`BasisSet`, this sets
        ``self.meta['spherical']`` to ``False`` and writes a ``set`` column
        onto ``self.atom`` by mapping each atom symbol through the
        symbol-to-set map collected from the listing.
        """
        # Find the basis set
        _re_bas_00 = '(Slater-type)  F U N C T I O N S'
        _re_bas_01 = 'Atom Type'
        start = self.find(_re_bas_00, keys_only=True)[-1] + 3
        # The hits of this second search are added to ``start`` below, i.e.
        # they are used as offsets relative to ``start``.
        starts = self.find(_re_bas_01, start=start, keys_only=True)
        lines = []
        for ext in starts:
            # Always take the four lines beginning at the 'Atom Type' hit ...
            for i in range(4):
                lines.append(start + ext + i)
            # ... then every following line up to the first blank one.
            stop = start + ext + 4
            while self[stop].strip():
                lines.append(stop)
                stop += 1
        df = pd.read_fwf(StringIO('\n'.join([self[i] for i in lines])),
                         widths=[4, 2, 12, 4],
                         names=['n', 'L', 'alpha', 'symbol'])
        # Where atom types change: rows whose 'n' field reads '---'
        idxs = [0] + df['n'][df['n'] == '---'].index.tolist() + [df.shape[0]]
        sets, shells = [], []
        for i, (start, stop) in enumerate(zip(idxs, idxs[1:])):
            # Rows of segment i are labelled with set i - 1 and with shell
            # numbers counting up from -1.
            sets.append(np.repeat(i - 1, stop - start))
            shells.append(np.arange(-1, stop - start - 1))
        df['set'] = np.concatenate(sets)
        df['shell'] = np.concatenate(shells)
        # Atom table basis set map: 'symbol' entries ending in ')' with the
        # parenthesis stripped, mapped to that row's set label plus one.
        basmap = df['symbol'].dropna()
        basmap = basmap[basmap.str.endswith(')')].str.strip(')')
        basmap = {
            val: df['set'][key] + 1
            for key, val in basmap.to_dict().items()
        }
        # Discard the garbage: every row whose 'n' field is not numeric
        drop = df['n'].str.strip().str.isnumeric().fillna(False)
        df.drop(drop[drop == False].index, inplace=True)
        df.drop('symbol', axis=1, inplace=True)
        # Clean up the series
        df['alpha'] = df['alpha'].astype(np.float64)
        df['n'] = df['n'].astype(np.int64)
        df['L'] = df['L'].str.lower().map(lmap)
        # d = sqrt((2L + 1) / (4 pi)) and r = n - (L + 1)
        df['d'] = np.sqrt((2 * df['L'] + 1) / (4 * np.pi))
        df['r'] = df['n'] - (df['L'] + 1)
        df['frame'] = 0
        self.basis_set = BasisSet(df)
        self.meta['spherical'] = False
        self.atom['set'] = self.atom['symbol'].map(basmap)

    def parse_basis_set_order(self):
        """
        Build ``self.basis_set_order`` by expanding every atom's basis set.

        For each atom, the rows of its basis set are grouped by ``L``; each
        cartesian triple ``(l, m, n)`` from ``enum_cartesian[L]`` is paired
        with every ``(shell, r)`` of that group to produce one output row.
        A ``prefac`` column is added afterwards, computed as
        ``sqrt(dfac21(L) / (dfac21(l) * dfac21(m) * dfac21(n)))``.
        """
        fields = ('center', 'symbol', 'shell', 'seht', 'L', 'l', 'm', 'n', 'r')
        columns = defaultdict(list)
        by_set = self.basis_set.groupby('set')
        atoms = zip(self.atom.index, self.atom['symbol'], self.atom['set'])
        for center, symbol, seht in atoms:
            # Basis functions of this atom's set, one group per L
            for L, shells in by_set.get_group(seht).groupby('L'):
                # Cartesian components for this L
                for l, m, n in enum_cartesian[L]:
                    for shell, r in zip(shells['shell'], shells['r']):
                        row = (center, symbol, shell, seht, L, l, m, n, r)
                        for field, value in zip(fields, row):
                            columns[field].append(value)
        # The temporary 'seht' key becomes the final 'set' column.
        columns['set'] = columns.pop('seht')
        columns['frame'] = 0
        self.basis_set_order = pd.DataFrame.from_dict(columns)
        order = self.basis_set_order
        numerator = order['L'].apply(dfac21)
        denominator = (order['l'].apply(dfac21) *
                       order['m'].apply(dfac21) *
                       order['n'].apply(dfac21))
        self.basis_set_order['prefac'] = (numerator /
                                          denominator).apply(np.sqrt)

    def parse_orbital(self):
        """
        Parse the orbital energy table into ``self.orbital``.

        The 'both Spins' table is used when present, otherwise the
        'per Irrep and Spin' table. Only the last occurrence is read, up to
        the first blank line. Vectors become zero-based, rows are sorted by
        energy (by spin first when a spin column exists), and constant
        ``frame`` and ``group`` columns of 0 are added.
        """
        both_spins = 'Orbital Energies, both Spins'
        per_irrep = 'Orbital Energies, per Irrep and Spin'
        hits = self.find(both_spins, per_irrep, keys_only=True)
        # Open shell vs. closed shell decides the table and its column names
        if hits[both_spins]:
            header = both_spins
            names = ['symmetry', 'vector', 'spin', 'occupation', 'energy', 'eV']
        else:
            header = per_irrep
            names = ['vector', 'occupation', 'energy', 'eV', 'dE']
        first = hits[header][-1] + 5
        last = first
        while self[last].strip():
            last += 1
        orbital = self.pandas_dataframe(first, last, names)
        orbital['vector'] -= 1
        if 'spin' in names:
            orbital['spin'] = orbital.spin.map({'A': 0, 'B': 1})
            orbital = orbital.sort_values(by=['spin', 'energy'])
        else:
            orbital = orbital.sort_values(by='energy')
            orbital['spin'] = 0
        orbital = orbital.reset_index(drop=True)
        orbital['frame'] = 0
        orbital['group'] = 0
        self.orbital = orbital

    def parse_contribution(self):
        """
        Parse the MO contribution-by-percentage tables into
        ``self.contribution``.

        Every table found under the header below is read as fixed-width
        text up to the first blank line and tagged with a ``spin`` column
        equal to its position among the matches. Whitespace-only cells are
        turned into NaN and forward filled before the columns are cleaned.
        """
        _re_con_00 = ('E(eV)  Occ       MO           %     '
                      'SFO (first member)   E(eV)  Occ   Fragment')
        # MO contribution by percentage
        found = self.find(_re_con_00, keys_only=True)
        starts = [i + 3 for i in found]
        widths = [12, 6, 6, 6, 11, 6, 10, 12, 6, 6, 3]
        names = [
            'eV', 'occupation', 'vector', 'sym', '%', 'SFO', 'angmom',
            'eV(sfo)', 'occ(sfo)', 'atom', 'symbol'
        ]
        dfs = []
        # Prints for both spins
        for i, start in enumerate(starts):
            stop = start
            while self[stop].strip():
                stop += 1
            dfs.append(
                pd.read_fwf(StringIO('\n'.join(self[start:stop])),
                            delim_whitespace=True,
                            widths=widths,
                            names=names))
            dfs[-1]['spin'] = i
        dfs = pd.concat(dfs).reset_index(drop=True)
        # Blank string cells become NaN so they can be forward filled from
        # the row above.
        dfs = dfs.applymap(lambda x: np.nan if (isinstance(
            x, six.string_types) and x.isspace()) else x)
        dfs.fillna(method='ffill', inplace=True)
        # Clean up
        dfs['symbol'] = dfs['symbol'].str.strip()
        dfs['angmom'] = dfs['angmom'].str.strip()
        # A bare 'S' gets a trailing colon so the 'L:ml' pattern below matches it.
        dfs['angmom'].update(dfs['angmom'].map({'S': 'S:'}))
        dfs[['L', 'ml']] = dfs['angmom'].str.extract('(.*):(.*)', expand=True)
        dfs['%'] = dfs['%'].str.replace('%', '')
        # Entries equal to the '******' string below are replaced by infinity.
        dfs['%'].update(dfs['%'].map({"    ******": np.inf}))
        dfs['%'] = dfs['%'].astype(np.float64)
        dfs['occupation'] = dfs['occupation'].astype(np.float64)
        # Zero-based vector and atom indices
        dfs['vector'] = dfs['vector'].astype(np.int64) - 1
        dfs['eV'] = dfs['eV'].astype(np.float64)
        dfs['atom'] -= 1
        self.contribution = dfs

    def parse_excitation(self):
        """
        Parse the excitation tables into ``self.excitation``.

        The first table (transition dipole moments) provides one row per
        occupied/virtual contribution; the second table (energies and
        oscillator strengths) is mapped onto those rows through the
        zero-based ``excitation`` index. Returns early, leaving
        ``self.excitation`` untouched, when the first marker is not found.
        """
        dipole_marker = '(sum=1) transition dipole moment'
        summary_header = ' no.     E/a.u.        E/eV      f           Symmetry'
        hit = self.find_next(dipole_marker, keys_only=True)
        if not hit:
            return
        # First table of interest
        first = hit + 4
        last = self.find_next(summary_header, keys_only=True) - 3
        # Nine fields on the first row means an extra spin column is printed.
        open_shell = len(self[first].split()) == 9
        names = [
            'excitation', 'occ', 'drop', 'virt', 'weight', 'TDMx', 'TDMy',
            'TDMz'
        ]
        if open_shell:
            names.insert(1, 'spin')
            junk = ['occ', 'virt']
        else:
            junk = ['occ:', 'virt:']
        adf = self.pandas_dataframe(first, last, names)
        adf.drop('drop', axis=1, inplace=True)
        # Rows flagged 'NTO' in the second column and rows whose first field
        # is one of the junk labels are not excitations.
        second = names[1]
        nto_rows = set(adf[second][adf[second] == 'NTO'].index)
        junk_rows = set(adf['excitation'][adf['excitation'].isin(junk)].index)
        adf.drop(nto_rows | junk_rows, axis=0, inplace=True)
        # Drop the trailing character of the label and make it zero-based.
        adf['excitation'] = adf['excitation'].str[:-1].astype(np.int64) - 1
        if open_shell:
            adf['spin'] = adf['spin'].map({'Alph': 0, 'Beta': 1})
        # Split e.g. '12a1' into the orbital number and its symmetry label.
        for orb in ('occ', 'virt'):
            adf[[orb, orb + 'sym']] = adf[orb].str.extract('([0-9]*)(.*)',
                                                           expand=True)
        for orb in ('occ', 'virt'):
            adf[orb] = adf[orb].astype(np.int64) - 1
        # Second table
        first = last + 5
        last = first
        while self[last].strip():
            last += 1
        summary_cols = summary_header.split()
        summary = self.pandas_dataframe(first, last + 1, summary_cols)
        summary.drop(summary_cols[0], axis=1, inplace=True)
        summary.columns = ['energy', 'eV', 'osc', 'symmetry']
        # Expand the second table to fit the first
        for col in summary.columns:
            adf[col] = adf.excitation.map(summary[col])
        adf['frame'] = 0
        adf['group'] = 0
        self.excitation = adf

    def parse_momatrix(self):
        """
        Parse the eigenvectors printed in BAS representation into
        ``self.momatrix``.

        Returns early if either the section header or any 'row ' line is
        missing. When no 'nosym' match exists, nothing is parsed and a
        message is printed instead.
        """
        _re_mo_00 = 'Eigenvectors .* in BAS representation'
        _re_mo_01 = 'row '
        _re_mo_02 = 'nosym'
        found = self.regex(_re_mo_00,
                           _re_mo_01,
                           _re_mo_02,
                           flags=re.IGNORECASE,
                           keys_only=True)
        if not found[_re_mo_00] or not found[_re_mo_01]: return
        if found[_re_mo_02]:
            thresh = found[_re_mo_00][0]
            # The header line itself says whether vectors are printed as rows.
            rowmajor = 'rows' in self[thresh]
            # Only 'row ' lines after the section header start a block.
            starts = np.array([i for i in found[_re_mo_01] if i > thresh]) + 1
            # Block length from the spacing of the first two block starts.
            nchi = starts[1] - starts[0] - 3
            ncol = len(self[starts[0] + 1].split()) - 1
            # An odd number of blocks cannot be split into two spin halves.
            if len(starts) % 2: os = False
            else:
                # Look for 'SPIN 2' between the end of the first half and
                # the start of the second half.
                anchor = starts[len(starts) // 2 - 1] + nchi
                sail = starts[len(starts) // 2]
                os = True if self.find('SPIN 2', start=anchor,
                                       stop=sail) else False
            blocks = [starts] if not os else [
                starts[:len(starts) // 2], starts[len(starts) // 2:]
            ]
            data = pd.DataFrame()
            for i, block in enumerate(blocks):
                stop = block[-1] + nchi
                # Three lines before every later block start are skipped,
                # given relative to the first block start.
                skips = [
                    k + j for k in list(block[1:] - block[0] - 3)
                    for j in range(3)
                ]
                # First block -> 'coef', later blocks -> 'coef1', ...
                name = 'coef' if not i else 'coef{}'.format(i)
                col = self.pandas_dataframe(
                    block[0], stop, ncol + 1, skiprows=skips).drop(
                        0,
                        axis=1,
                    ).unstack().dropna().reset_index(drop=True)
                data[name] = col
            norb = len(data.index) // nchi
            data['orbital'] = np.concatenate(
                [np.repeat(range(i, norb, ncol), nchi) for i in range(ncol)])
            data['chi'] = np.tile(range(nchi), norb)
            data['frame'] = 0
            if rowmajor:
                # Swap the meaning of the two index columns and re-sort.
                data.rename(columns={
                    'orbital': 'chi',
                    'chi': 'orbital'
                },
                            inplace=True)
                data.sort_values(by=['orbital', 'chi'], inplace=True)
            self.momatrix = data
        else:
            print('Symmetrized calcs not supported yet.')

    def parse_sphr_momatrix(self, verbose=False):
        """
        Parse the localized momatrix (if present).

        If the ``locorb`` keyword is used in ADF, an additional momatrix is
        printed after localization is performed. Parsing this table allows
        for visualization of these orbitals.

        Args:
            verbose (bool): Print a message when no localized momatrix is
                found. Defaults to ``False``.

        Note:
            The section is delimited by the two marker strings held in the
            local ``_re_loc_mo`` tuple. The result is stored on
            ``self.sphr_momatrix``; nothing is stored when the first marker
            is missing.
        """
        _re_loc_mo = ("Localized MOs expanded in CFs+SFOs",
                      "SFO contributions (%) per Localized Orbital")
        found = self.find(*_re_loc_mo)
        if len(found[_re_loc_mo[0]]) == 0:
            if verbose:
                print("No localization performed.")
            return  # Nothing to parse
        # Each hit is indexed as (line number, ...); only the first hit of
        # each marker is used.
        start = found[_re_loc_mo[0]][0][0] + 8
        stop = found[_re_loc_mo[1]][0][0] - 4
        # Parse the localized momatrix as a whole block of text
        df = pd.read_fwf(StringIO("\n".join(self[start:stop])),
                         widths=(16, 9, 9, 9, 9, 9, 9, 9, 9),
                         header=None)
        # The first (16 character wide) column is not a coefficient column.
        del df[0]
        # Identify the eigenvectors and (un)stack them correctly
        n = df[df[1].isnull()].index[0]  # number of basis functions
        m = np.ceil(df.shape[0] / n).astype(
            int)  # number of printed blocks of text
        # idx - indexes of "lines" (rows) that don't contain coefficients
        idx = [(n + 5) * j + i - 5 for j in range(1, m) for i in range(0, 5)]
        df = df[~df.index.isin(idx)]
        coefs = []
        # NOTE(review): for i == 0 the slice is iloc[-n:0], which looks like
        # it selects no rows, so the first pass appears to add an empty
        # array - confirm.
        for i in range(0, df.shape[0] // n + 1):
            d = df.iloc[n * (i - 1):n * i, :]
            coefs.append(d.unstack().dropna().values.astype(float))
        coefs = np.concatenate(coefs)
        m = coefs.shape[0] // n  # Number of localized MOs
        momatrix = pd.DataFrame.from_dict({
            'coef':
            coefs,
            'orbital': [i for i in range(m) for _ in range(n)],
            'chi': [j for _ in range(m) for j in range(n)]
        })
        # Attach the matrix to the last frame present in the atom table.
        momatrix['frame'] = self.atom['frame'].unique()[-1]
        self.sphr_momatrix = momatrix

    def parse_gradient(self):
        """
        Parse the nuclear energy gradient tables into ``self.gradient``.

        One frame is produced per table found. Atom indices are made
        zero-based, ``Z`` is looked up from the symbol through ``sym2z`` and
        the three components are multiplied by
        ``1 / Length['Angstrom', 'au']``. Returns early, leaving
        ``self.gradient`` untouched, when no table is found.
        """
        marker = "Energy gradients wrt nuclear displacements"
        hits = self.find(marker, keys_only=True)
        if not hits:
            return
        firsts = np.array(hits) + 6
        # Measure the first table only; the same length is applied to all.
        end = firsts[0]
        while '----' not in self[end]:
            end += 1
        nrows = end - firsts[0]
        tables = []
        for frame, first in enumerate(firsts):
            table = self.pandas_dataframe(first, first + nrows, ncol=5)
            table.columns = ['atom', 'symbol', 'fx', 'fy', 'fz']
            table['frame'] = frame
            table['atom'] -= 1
            tables.append(table)
        grad = pd.concat(tables, ignore_index=True)
        grad['Z'] = grad['symbol'].map(sym2z)
        grad = grad[['atom', 'Z', 'fx', 'fy', 'fz', 'symbol', 'frame']]
        scale = 1. / Length['Angstrom', 'au']
        for component in ('fx', 'fy', 'fz'):
            grad[component] *= scale
        self.gradient = grad

    def parse_frequency(self):
        """
        Parse the frequencies and normal mode displacements into
        ``self.frequency``.

        Returns early when no frequency list is found.

        Raises:
            NotImplementedError: If more than one frequency list is present
                in the output.
        """
        _renorm = "Vibrations and Normal Modes"
        _refreq = "List of All Frequencies:"
        found = self.find(_refreq, keys_only=True)
        if not found:
            return
        elif len(found) > 1:
            raise NotImplementedError(
                "We cannot parse more than one frequency calculation in a single output"
            )
        found = self.find(_refreq, _renorm, keys_only=True)
        # The frequency list runs until the first empty line.
        start = found[_refreq][0] + 9
        stop = start
        while self[stop]:
            stop += 1
        df = self.pandas_dataframe(start, stop, ncol=3)
        freqs = df[0].values
        # Number of normal mode blocks to read, from the frequency count / 3.
        n = int(np.ceil(freqs.shape[0] / 3))
        # The length of the first normal mode block gives the atom count.
        start = found[_renorm][0] + 9
        stop = start
        while self[stop]:
            stop += 1
        natoms = stop - start
        dfs = []
        fdx = 0  # running index of the first frequency in the current block
        for i in range(n):
            if i == 0:
                start = found[_renorm][0] + 9
            else:
                # Later blocks start four lines after the previous block ends.
                start = stop + 4
            stop = start + natoms
            # The frequencies of this block sit two lines above its first row.
            freqs = list(map(lambda x: float(x), self[start - 2].split()))
            ncol = len(freqs)
            df = self.pandas_dataframe(start, stop, ncol=1 + 3 * ncol)
            # The first column holds '<index>.<symbol>' labels.
            tmp = list(map(lambda x: x.split('.'), df[0]))
            index, symbol = list(map(list, zip(*tmp)))
            # Column positions of the x, y and z displacements, respectively.
            slices = [list(range(1 + i, 1 + 3 * ncol, 3)) for i in range(ncol)]
            dx, dy, dz = [df[i].unstack().values for i in slices]
            freqdx = np.repeat(list(range(fdx, ncol + fdx)), natoms)
            zs = pd.Series(symbol).map(sym2z)
            freqs = np.repeat(freqs, natoms)
            stacked = pd.DataFrame.from_dict({
                'Z': np.tile(zs, ncol),
                'label': np.tile(index, ncol),
                'dx': dx,
                'dy': dy,
                'dz': dz,
                'frequency': freqs,
                'freqdx': freqdx
            })
            # ir_int is set to 0.0 for every row.
            stacked['ir_int'] = 0.0
            stacked['symbol'] = np.tile(symbol, ncol)
            dfs.append(stacked)
            fdx += ncol
        frequency = pd.concat(dfs, ignore_index=True)
        frequency['frame'] = 0
        # TODO: check units of the normal modes
        self.frequency = frequency

    def parse_nmr_shielding(self):
        """
        Parse the total NMR shielding tensors into ``self.nmr_shielding``.

        One row is produced per 'N U C L E U S :' header, holding the nine
        tensor elements, the isotropic value, the zero-based atom index, the
        symbol and the index of the calculation the nucleus belongs to
        (stored as ``frame``). Returns early when no nucleus header is found.

        Raises:
            IndexError: If the calculation a nucleus belongs to cannot be
                determined.
        """
        _reatom = "N U C L E U S :"
        _reshield = "==== total shielding tensor"
        _renatom = "NUCLEAR COORDINATES (ANGSTROMS)"
        found = self.find(_reatom, keys_only=True)
        if not found:
            #raise NotImplementedError("Could not find {} in output".format(_reatom))
            return
        # Each coordinate header opens a new calculation; the end of the file
        # is appended as the closing boundary.
        ncalc = self.find(_renatom, keys_only=True)
        ncalc.append(len(self))
        ndx = 0
        dfs = []
        for start in found:
            try:
                # Stay in the current calculation while the header lies
                # strictly between its boundaries, otherwise move to the next.
                ndx = ndx if start > ncalc[ndx] and start < ncalc[
                    ndx + 1] else ndx + 1
            except IndexError:
                raise IndexError(
                    "It seems that there was an issue with determining which NMR calculation we are in"
                )
            # The search result is added to ``start``, i.e. it is used as an
            # offset relative to the nucleus header.
            start_shield = self.find(_reshield, keys_only=True,
                                     start=start)[0] + start + 2
            end_shield = start_shield + 3
            # The last token of the header line looks like 'Symbol(index)'.
            symbol, index = self[start].split()[-1].split('(')
            index = int(index.replace(')', ''))
            isotropic = float(self[start_shield + 4].split()[-1])
            df = self.pandas_dataframe(start_shield, end_shield, ncol=3)
            cols = ['xx', 'xy', 'xz', 'yx', 'yy', 'yz', 'zx', 'zy', 'zz']
            # Flatten the 3x3 table into a single row.
            df = pd.DataFrame(df.unstack().values.reshape(1, 9), columns=cols)
            df['isotropic'] = isotropic
            df['atom'] = index - 1
            df['symbol'] = symbol
            df['label'] = 'nmr shielding'
            df['frame'] = ndx
            dfs.append(df)
        shielding = pd.concat(dfs, ignore_index=True)
        self.nmr_shielding = shielding

    def parse_j_coupling(self):
        """
        Parse the total spin-spin coupling tensors into ``self.j_coupling``.

        One row is produced per coupling, holding the nine tensor elements,
        the isotropic value and the zero-based indices and symbols of both
        nuclei. Returns early, leaving ``self.j_coupling`` untouched, when
        the atom numbering header is not found.
        """
        coupling_key = "total calculated spin-spin coupling:"
        atom_key = "Internal CPL numbering of atoms:"
        if not self.find(atom_key, keys_only=True):
            return
        hits = self.find(atom_key, coupling_key, keys_only=True)
        # Every other match, starting with the second, is taken: these are
        # the tensors in the principal axis representation (the cartesian
        # axis representation would start the slice at 0 instead).
        tensor_lines = hits[coupling_key][1::2]
        header_lines = np.array(hits[atom_key]) - 3
        components = ['xx', 'xy', 'xz', 'yx', 'yy', 'yz', 'zx', 'zy', 'zz']

        def unwrap(tokens):
            # Remove every parenthesis from each token.
            return [tok.replace('(', '').replace(')', '') for tok in tokens]

        rows = []
        for header, tensor in zip(header_lines, tensor_lines):
            fields = self[header].split()
            perturbed = unwrap(fields[5:])
            responding = unwrap(fields[1:3])
            # Six columns are read; the first three are discarded so only
            # the last three columns of the three rows remain.
            block = self.pandas_dataframe(tensor + 2, tensor + 5, ncol=6)
            block.drop(range(3), axis='columns', inplace=True)
            row = pd.DataFrame(block.unstack().values.reshape(1, 9),
                               columns=components)
            isotropic = self[tensor + 1].split()[-1]
            row['isotropic'] = float(isotropic)
            row['atom'] = int(responding[0])
            row['symbol'] = responding[1]
            row['pt_atom'] = int(perturbed[0])
            row['pt_symbol'] = perturbed[1]
            row['label'] = 'j coupling'
            row['frame'] = 0
            rows.append(row)
        j_coupling = pd.concat(rows, ignore_index=True)
        # Zero-based atom indices
        j_coupling['atom'] -= 1
        j_coupling['pt_atom'] -= 1
        self.j_coupling = j_coupling

    def __init__(self, *args, **kwargs):
        """Forward all positional and keyword arguments to the parent initializer."""
        super(Output, self).__init__(*args, **kwargs)

Example #4

Show file

File: output.py Project: farnoushnouri/exatomic

class Output(six.with_metaclass(OutMeta, Editor)):
    """Editor for NWChem calculation output file (stdout)."""
    def parse_atom(self):
        """
        Parse the atom dataframe and store it on ``self.atom``.

        Every geometry block between a 'Geometry "' header and the following
        'Atomic Mass' line is read. Coordinates are converted to atomic
        units according to the unit named on the 'Output coordinates in'
        line, ``Zeff`` is ``Z`` minus the number of electrons replaced by an
        ECP for that tag (0 when the tag has no ECP), and when more than one
        frame was read the last frame is dropped.

        The element ``symbol`` is taken from the leading ASCII letters of
        the tag. The character class is ``[A-Za-z]`` on purpose: the range
        ``[A-z]`` also matches ``[``, ``\\``, ``]``, ``^``, ``_`` and the
        backtick, which would leak e.g. the underscore of a tag like
        ``C_1`` into the symbol.
        """
        _reatom01 = 'Geometry "'
        _reatom02 = 'Atomic Mass'
        _reatom03 = 'ECP       "ecp basis"'
        _reatom04 = 'Output coordinates in'
        found = self.find(_reatom01,
                          _reatom02,
                          _reatom03,
                          _reatom04,
                          keys_only=True)
        unit = self[found[_reatom04][0]].split()[3]
        unit = "Angstrom" if unit == "angstroms" else "au"
        starts = np.array(found[_reatom01]) + 7
        stops = np.array(found[_reatom02]) - 1
        # Two lines below each ECP header: first field is the tag, fourth
        # field the number of electrons it replaces.
        ecps = np.array(found[_reatom03]) + 2
        ecps = {self[ln].split()[0]: int(self[ln].split()[3]) for ln in ecps}
        columns = ['label', 'tag', 'Z', 'x', 'y', 'z']
        atom = pd.concat([
            self.pandas_dataframe(s, e, columns)
            for s, e in zip(starts, stops)
        ])
        atom['symbol'] = atom['tag'].str.extract(
            r'([A-Za-z]+)([0-9]*)', expand=False)[0].str.lower().str.title()
        atom['Z'] = atom['Z'].astype(np.int64)
        atom['Zeff'] = (atom['Z'] -
                        atom['tag'].map(ecps).fillna(value=0)).astype(np.int64)
        # Number of frames = how often the most common label occurs;
        # atoms per frame = the largest label.
        nf = atom.label.value_counts().max()
        nat = atom.label.max()
        atom['frame'] = [i for i in range(nf) for j in range(nat)]
        atom['label'] -= 1
        atom['x'] *= Length[unit, 'au']
        atom['y'] *= Length[unit, 'au']
        atom['z'] *= Length[unit, 'au']
        if atom['frame'].max() > 0:
            # With several frames the last one is discarded.
            li = atom['frame'].max()
            atom = atom[atom['frame'] != li].reset_index(drop=True)
        del atom['label']
        self.atom = Atom(atom)

    def parse_orbital(self):
        """
        Parse the :class:`~exatomic.core.orbital.Orbital` dataframe.

        Open-shell outputs print separate 'Alpha' and 'Beta' orbital
        analysis sections; both are parsed and tagged with ``spin`` 0 and 1.
        Otherwise a single section is parsed with ``spin`` 0. The result is
        stored on ``self.orbital`` with a constant ``group`` column of 0.

        The open-shell test inspects the text of each matched line. Testing
        ``'Alpha' in hit`` on the ``(line number, text)`` pair itself is a
        tuple membership test that only succeeds if an element equals
        ``'Alpha'`` exactly, so the open-shell branch would never run.
        """
        _remo01 = 'Molecular Orbital Analysis'
        _remo02 = 'alpha - beta orbital overlaps'
        _remo03 = 'center of mass'
        # Each hit is a (line number, line text) pair.
        check = self.find(_remo01)
        if any('Alpha' in line for _, line in check):
            alpha_starts = np.array(
                [no for no, line in check if 'Alpha' in line],
                dtype=np.int64) + 2
            # The alpha section ends just before the beta header ...
            alpha_stops = np.array(
                [no for no, line in check if 'Beta' in line],
                dtype=np.int64) - 1
            # ... and the beta section begins two lines after that header.
            beta_starts = alpha_stops + 3
            beta_stops = np.array(self.find(_remo02, keys_only=True),
                                  dtype=np.int64) - 1
            alpha_orbital = self._parse_orbital(alpha_starts, alpha_stops)
            beta_orbital = self._parse_orbital(beta_starts, beta_stops)
            alpha_orbital['spin'] = 0
            beta_orbital['spin'] = 1
            orbital = pd.concat((alpha_orbital, beta_orbital),
                                ignore_index=True)
        else:
            starts = np.array(list(zip(*check))[0], dtype=np.int64) + 2
            stops = np.array(self.find(_remo03, keys_only=True),
                             dtype=np.int64) - 1
            orbital = self._parse_orbital(starts, stops)
            orbital['spin'] = 0
        orbital['group'] = 0
        self.orbital = Orbital(orbital)

    def parse_momatrix(self):
        """
        Parse the :class:`~exatomic.core.orbital.MOMatrix` dataframe.

        Nothing is parsed or stored when the "Final MO vectors" header is
        not found. The number of basis functions is taken from
        ``len(self.basis_set_order)``.

        Note:
            Must supply 'print "final vectors" "final vectors analysis"' for momatrix
        """
        key0 = "Final MO vectors"
        key1 = "center of mass"
        found = self.find(key0, key1)
        if found[key0]:
            # Hits are (line number, line text) pairs; the first hit of each
            # key delimits the section.
            start = found[key0][0][0] + 6
            end = found[key1][0][0] - 1
            c = pd.read_fwf(StringIO("\n".join(self[start:end])),
                            widths=(6, 12, 12, 12, 12, 12, 12),
                            names=list(range(7)))
            # NOTE(review): the raw table is kept on the instance as
            # ``self.c``; this looks like a debugging leftover - confirm
            # nothing relies on it before removing.
            self.c = c
            # Rows with an empty first field are not coefficient rows.
            idx = c[c[0].isnull()].index.values
            c = c[~c.index.isin(idx)]
            del c[0]
            nbas = len(self.basis_set_order)
            n = c.shape[0] // nbas
            coefs = []
            # The for loop below is like numpy.array_split(df, n); using numpy.array_split
            # with dataframes seemed to have strange results where splits had wrong sizes?
            for i in range(n):
                # Each chunk of nbas rows is flattened column by column
                # after dropping columns that contain NaN.
                coefs.append(c.iloc[i * nbas:(i + 1) *
                                    nbas, :].astype(float).dropna(
                                        axis=1).values.ravel("F"))
            c = np.concatenate(coefs)
            del coefs
            orbital, chi = _square_indices(len(self.basis_set_order))
            self.momatrix = MOMatrix.from_dict({
                'coef': c,
                'chi': chi,
                'orbital': orbital,
                'frame': 0
            })
            # momatrix = pd.DataFrame.from_dict({'coef': c, 'chi': chi, 'orbital': orbital})
            # momatrix['frame'] = 0
            # self.momatrix = momatrix

    def _parse_orbital(self, starts, stops):
        '''
        This function actually performs parsing of :class:`~exatomic.orbital.Orbital`

        See Also:
            :func:`~exnwchem.output.Output.parse_orbital`
        '''
        joined = '\n'.join(
            ['\n'.join(self[s:e]) for s, e in zip(starts, stops)])
        nvec = joined.count('Vector')
        if 'spherical' not in self.meta:
            self.parse_basis_set()
        mapper = self.basis_set.functions(
            self.meta['spherical']).groupby(level="set").sum()
        nbas = self.atom['set'].map(mapper).sum()
        nbas *= nvec
        # Orbital dataframe -- alternatively one could parse the strings
        # into the DataFrame and then use the pd.Series.str methods to
        # perform all the replacements at the same time, eg. 'D' --> 'E'
        # and 'Occ=' --> '', etc.
        orb_no = np.empty((nvec, ), dtype=np.int64)
        occ = np.empty((nvec, ), dtype=np.float64)
        nrg = np.empty((nvec, ), dtype=np.float64)
        x = np.empty((nvec, ), dtype=np.float64)
        y = np.empty((nvec, ), dtype=np.float64)
        z = np.empty((nvec, ), dtype=np.float64)
        frame = np.empty((nvec, ), dtype=np.int64)
        fc = -1  # Frame counter
        oc = 0  # Orbital counter
        for s, e in zip(starts, stops):
            fc += 1
            for line in self[s:e]:
                ls = line.split()
                if 'Vector' in line:
                    orb_no[oc] = ls[1]
                    occ[oc] = ls[2].replace('Occ=', '').replace('D', 'E')
                    nrg[oc] = ls[3].replace('E=', '').replace(
                        'D', 'E') if 'E=-' in line else ls[4].replace(
                            'D', 'E')
                    frame[oc] = fc
                elif 'MO Center' in line:
                    x[oc] = ls[2].replace(',', '').replace('D', 'E')
                    y[oc] = ls[3].replace(',', '').replace('D', 'E')
                    z[oc] = ls[4].replace(',', '').replace('D', 'E')
                    oc += 1
        orb_no -= 1
        return pd.DataFrame.from_dict({
            'x': x,
            'y': z,
            'z': z,
            'frame': frame,
            'vector': orb_no,
            'occupation': occ,
            'energy': nrg
        })

    def parse_basis_set(self):
        """
        Parse the :class:`~exatomic.core.basis.BasisSet` dataframe.

        Calls :meth:`parse_atom` first if no ``atom`` attribute exists yet.
        Besides storing ``self.basis_set``, this records whether the basis
        header line mentions "spherical" in ``self.meta['spherical']`` and
        writes a ``set`` column onto ``self.atom`` by mapping each atom tag
        to its basis set index.
        """
        if not hasattr(self, "atom"):
            self.parse_atom()
        _rebas01 = ' Basis "'
        _rebas02 = ' Summary of "'
        # NOTE: _rebas03 is defined but not used anywhere in this method.
        _rebas03 = [
            ' s ', ' px ', ' py ', ' pz ', ' d ', ' f ', ' g ', ' h ', ' i ',
            ' j ', ' k ', ' l ', ' m ', ' p '
        ]
        found = self.find(_rebas01, _rebas02)
        spherical = True if "spherical" in found[_rebas01][0][1] else False
        start = found[_rebas01][0][0] + 2
        # Use the second summary hit when there are several, else the only one.
        idx = 1 if len(found[_rebas02]) > 1 else -1
        stop = found[_rebas02][idx][0] - 1
        # Read in all of the extra lines that contain ---- and tag names
        df = pd.read_fwf(StringIO("\n".join(self[start:stop])),
                         widths=(4, 2, 16, 16),
                         names=("shell", "L", "alpha", "d"))
        df.loc[df['shell'] == "--", "shell"] = np.nan
        # Non-numeric entries left in the 'shell' field are the tag names.
        tags = df.loc[(df['shell'].str.isdigit() == False), "shell"]
        idxs = tags.index.tolist()
        idxs.append(len(df))
        df['set'] = ""
        for i, tag in enumerate(tags):
            # Label-based slice: every row from this tag row through the
            # next tag row gets the tag; the next pass overwrites its own row.
            df.loc[idxs[i]:idxs[i + 1], "set"] = tag
        df = df.dropna().reset_index(drop=True)
        # Map each tag to its position in order of first appearance.
        mapper = {v: k for k, v in dict(enumerate(df['set'].unique())).items()}
        df['set'] = df['set'].map(mapper)
        df['L'] = df['L'].str.strip().str.lower().map(lmap)
        df['alpha'] = df['alpha'].astype(float)
        df['d'] = df['d'].astype(float)
        # NO SUPPORT FOR MULTIPLE FRAMES?
        df['frame'] = 0
        self.basis_set = BasisSet(df)
        self.meta['spherical'] = spherical
        self.atom['set'] = self.atom['tag'].map(mapper)

    def parse_basis_set_order(self):
        """
        Build ``self.basis_set_order``, one row per basis function.

        For every atom, each shell of its basis set is expanded into its
        components: ``ml`` values from ``spherical_ordering_function`` for
        spherical bases, ``(l, m, n)`` triples from
        ``cartesian_ordering_function`` for cartesian ones. Calls
        :meth:`parse_basis_set` first if ``self.meta`` has no 'spherical'
        entry yet.

        The ``shell`` column is finally renumbered so that, within one
        center and one ``L``, the k-th function sharing the same component
        gets shell index k. The component key is ``ml`` for spherical and
        ``(l, m, n)`` for cartesian bases; reading ``ml`` unconditionally
        raises ``KeyError`` for cartesian bases. ``cumcount`` keeps the new
        values aligned with the rows they were computed from.
        """
        dtype = [('center', 'i8'), ('shell', 'i8'), ('L', 'i8')]
        if 'spherical' not in self.meta:
            self.parse_basis_set()
        spherical = self.meta['spherical']
        if spherical:
            dtype += [('ml', 'i8')]
        else:
            dtype += [('l', 'i8'), ('m', 'i8'), ('n', 'i8')]
        # Number of basis functions per set, summed over all atoms.
        mapper = self.basis_set.functions(spherical).groupby(
            level="set").sum()
        nbas = self.atom['set'].map(mapper).sum()
        bso = np.empty((nbas, ), dtype=dtype)
        cnt = 0
        bases = self.basis_set.groupby('set')
        for seht, center in zip(self.atom['set'], self.atom.index):
            bas = bases.get_group(seht).groupby('shell')
            if spherical:
                for shell, grp in bas:
                    l = grp['L'].values[0]
                    for ml in spherical_ordering_function(l):
                        bso[cnt] = (center, shell, l, ml)
                        cnt += 1
            else:
                for shell, grp in bas:
                    l = grp['L'].values[0]
                    for _, ll, m, n in cartesian_ordering_function(l):
                        bso[cnt] = (center, shell, l, ll, m, n)
                        cnt += 1
        bso = pd.DataFrame(bso)
        bso['frame'] = 0
        # New shell definition consistent with basis internals
        component = ['ml'] if spherical else ['l', 'm', 'n']
        bso['shell'] = bso.groupby(['center', 'L'] + component).cumcount()
        self.basis_set_order = bso

    def parse_frame(self):
        """
        Create a minimal :class:`~exatomic.core.frame.Frame` from the (parsed)
        :class:`~exatomic.core.atom.Atom` object.

        When total energies are printed they are stored in the frame's
        ``total_energy`` column. DFT energies take precedence over SCF
        energies; a warning is printed when both kinds are present.
        """
        scf_key = 'Total SCF energy'
        dft_key = 'Total DFT energy'
        self.frame = compute_frame_from_atom(self.atom)
        hits = self.find(scf_key, dft_key)
        scf_hits = hits[scf_key]
        dft_hits = hits[dft_key]
        if scf_hits and dft_hits:
            print('Warning: found total energies from scf and dft, using dft')
        chosen = dft_hits if dft_hits else scf_hits
        if chosen:
            # The energy is the last field of each matched line.
            energies = [float(text.split()[-1]) for _, text in chosen]
            self.frame['total_energy'] = energies

    def __init__(self, *args, **kwargs):
        """Forward all positional and keyword arguments to the parent initializer."""
        super(Output, self).__init__(*args, **kwargs)

Example #5

Show file

class Output(six.with_metaclass(GauMeta, Editor)):
    def _parse_triangular_matrix(self, regex, column='coef', values_only=False):
        """
        Read one printed triangular matrix located after ``regex``.

        Args:
            regex (str): Text passed to ``find_next`` to locate the matrix
            column (str): Name given to the value column of the result
            values_only (bool): Return only the flat array of values

        Returns:
            ``None`` when ``regex`` is not found, the flat float array when
            ``values_only`` is set, otherwise a DataFrame with the columns
            'chi0', 'chi1', 'frame' and ``column``.
        """
        # Number of basis functions is the first token of this line
        nbas_line = self.find_next(r'basis functions,', keys_only=True)
        nbas = int(self[nbas_line].split()[0])
        header = self.find_next(regex, keys_only=True)
        if not header:
            return None
        # The line after the header is used to count columns per block
        ncol = len(self[header + 1].split())
        first = header + 2
        remainder = nbas % ncol
        steps = np.array(list(reversed(range(remainder,
                                             nbas + max(1, remainder),
                                             ncol))))
        # Rows (relative to `first`) that are skipped when reading the text
        skips = np.cumsum(steps) + np.arange(len(steps))
        last = first + skips[-1]
        table = self.pandas_dataframe(first, last, ncol + 1,
                                      index_col=0, skiprows=skips)
        flat = table.unstack().dropna()
        # Fortran 'D' exponents are rewritten as 'E' before the float cast
        flat = flat.apply(lambda val: val.replace('D', 'E'))
        matrix = flat.astype(np.float64).values
        if values_only:
            return matrix
        idxs = _triangular_indices(ncol, nbas)
        return pd.DataFrame.from_dict({'chi0': idxs[:, 0],
                                       'chi1': idxs[:, 1],
                                       'frame': idxs[:, 2],
                                       column: matrix})

    def parse_atom(self):
        """
        Parse the printed geometries into ``self.atom``.

        'Standard orientation' blocks are used when any are found, otherwise
        'Input orientation' blocks. The resulting table has the columns
        'set', 'Z', 'x', 'y', 'z', 'frame' and 'symbol'; coordinates are
        multiplied by ``Length['Angstrom', 'au']``.
        """
        input_key = 'Input orientation'
        standard_key = 'Standard orientation'
        hits = self.find(input_key, standard_key, keys_only=True)
        # Check if nosymm was specified (no standard orientation printed)
        key = standard_key if hits[standard_key] else input_key
        starts = np.array(hits[key]) + 5
        # The converged geometry is printed twice but is only needed once
        if len(starts) > 1:
            starts = starts[:-1]
        # Walk forward from the first block to the closing dashed line
        end = starts[0]
        while '-------' not in self[end]:
            end += 1
        # Every block is assumed to have the same number of lines
        nlines = end - starts[0]
        frames = []
        for fdx, begin in enumerate(starts):
            table = self.pandas_dataframe(begin, begin + nlines, 6)
            table['frame'] = fdx
            frames.append(table)
        atom = pd.concat(frames).reset_index(drop=True)
        # Drop the column of atomic type
        atom = atom.drop([2], axis=1)
        atom.columns = ['set', 'Z', 'x', 'y', 'z', 'frame']
        # Zero-based indexing
        atom['set'] -= 1
        # Convert to atomic units
        for axis in ('x', 'y', 'z'):
            atom[axis] *= Length['Angstrom', 'au']
        # Map atomic symbols onto Z numbers
        atom['symbol'] = atom['Z'].map(z2sym)
        self.atom = atom

    def parse_basis_set(self):
        """
        Parse the basis set printed in "general basis input" form.

        Sets ``self.basis_set`` and ``self.meta['spherical']`` and remaps
        ``self.atom['set']`` through the mapping returned by
        ``deduplicate_basis_sets``. Returns early, setting nothing, when the
        'AO basis set in the form of general basis input' banner is absent.
        """
        # Basis flags
        _rebas02 = 'AO basis set in the form of general basis input'
        _rebas03 = ' (Standard|General) basis'
        # Find the basis set
        found = self.regex(_rebas02, _rebas03, keys_only=True)
        if not found[_rebas02]: return
        start = stop = found[_rebas02][0] + 1
        # The block ends at the first blank line
        while self[stop].strip(): stop += 1
        # Raw data
        df = self.pandas_dataframe(start, stop, 4)
        def _padx(srs): return [0] + srs.tolist() + [df.shape[0]]
        # Get some indices for appropriate columns
        setdx = _padx(df[0][df[0] == '****'].index)
        shldx = _padx(df[3][~np.isnan(df[3])].index)
        lindx = df[0][df[0].str.lower().isin(lorder + ['sp'])]
        # Populate the df
        df['L'] = lindx.str.lower().map(lmap)
        # Propagate each shell label to its primitive rows; ffill()/bfill()
        # replace the deprecated fillna(method=...) spelling
        df['L'] = df['L'].ffill().bfill().astype(np.int64)
        df['center'] = np.concatenate([np.repeat(i, hi - lo)
                       for i, (lo, hi) in enumerate(zip(setdx, setdx[1:]))])
        df['shell'] = np.concatenate([np.repeat(i-1, hi - lo)
                      for i, (lo, hi) in enumerate(zip(shldx, shldx[1:]))])
        # Complicated way to get shells but it is flat
        maxshl = df.groupby('center').apply(lambda x: x.shell.max() + 1)
        maxshl.index += 1
        maxshl[0] = 0
        df['shell'] = df['shell'] - df['center'].map(maxshl)
        # Drop all the garbage
        todrop = setdx[:-1] + [i+1 for i in setdx[:-2]] + lindx.index.tolist()
        df.drop(todrop, inplace=True)
        # Keep cleaning: Fortran 'D' exponents become 'E' before the cast
        if df[0].dtype == 'object':
            df[0] = df[0].str.replace('D', 'E').astype(np.float64)
        if df[1].dtype == 'object':
            df[1] = df[1].str.replace('D', 'E').astype(np.float64)
        try: sp = np.isnan(df[2]).sum() == df.shape[0]
        except TypeError:
            # Column 2 holds text, so np.isnan fails; convert it instead
            df[2] = df[2].str.replace('D', 'E').astype(np.float64)
            sp = True
        df.rename(columns={0: 'alpha', 1: 'd'}, inplace=True)
        # Deduplicate basis sets and expand 'SP' shells if present
        df, setmap = deduplicate_basis_sets(df, sp=sp)
        spherical = '5D' in self[found[_rebas03][0]]
        # With nothing above p functions the flag is forced to True
        if df['L'].max() < 2:
            spherical = True
        self.basis_set = BasisSet(df)
        self.meta['spherical'] = spherical
        self.atom['set'] = self.atom['set'].map(setmap)


    def parse_orbital(self):
        """
        Parse orbital energies (and symmetry labels when printed) into
        ``self.orbital``.

        Returns early, setting nothing, when no orbital energy lines are
        found. The electron counts are read from the first
        'alpha electrons' line.
        """
        _rebas01 = r'basis functions,'
        # Orbital flags
        _realphaelec = 'alpha electrons'
        _reorb01 = '(?=Alpha|Beta).*(?=occ|virt)'
        _reorb02 = 'Orbital symmetries'
        # Raw strings keep the regex escapes valid Python string literals
        _symrep = {'Occupied': '', 'Virtual': '', 'Alpha Orbitals:': '',
                   'Beta  Orbitals:': '', r'\(': '', r'\)': ''}
        _resympat = re.compile('|'.join(_symrep.keys()))
        # The substitution callback looks matches up by their literal text
        _symrep['('] = ''
        _symrep[')'] = ''
        # Find where our data is
        found = self.regex(_reorb01, _reorb02, _rebas01, _realphaelec)
        # If no orbital energies, quit
        if not found[_reorb01]: return
        # Check if open shell
        open_shell = any('Beta' in ln for _, ln in found[_reorb01])
        # Find number of electrons
        ae, _, _, be, _, _ = found[_realphaelec][0][1].split()
        ae, be = int(ae), int(be)
        # Get orbital energies
        ens = '\n'.join([ln.split('-- ')[1] for _, ln in found[_reorb01]])
        ens = pd.read_fwf(six.StringIO(ens), header=None,
                          widths=np.repeat(10, 5)).stack().values
        # Other arrays
        orbital = Orbital.from_energies(ens, ae, be, os=open_shell)
        # Symmetry labels
        if found[_reorb02]:
            # Gaussian seems to print out a lot of these blocks
            # maybe a better way to deal with this
            allsyms = []
            match = ['(', 'Orbitals']
            for start, _ in found[_reorb02]:
                # Find the start, stop indices for each block
                while match[0] not in self[start]: start += 1
                stop = start + 1
                while any(tok in self[stop] for tok in match): stop += 1
                # Clean up the text block so it is just symmetries
                text = ' '.join([ln.strip() for ln in self[start:stop]])
                syms = _resympat.sub(lambda m: _symrep[m.group(0)],
                                     text).split()
                # cat the syms for each block together
                allsyms += syms
            # Only the trailing labels, one per orbital row, are kept
            orbital['symmetry'] = allsyms[-orbital.shape[0]:]
        self.orbital = orbital


    def parse_momatrix(self):
        """
        Parses the MO matrix if asked for in the input.

        Sets ``self.momatrix`` (columns 'coef', 'coef1', ..., 'chi',
        'orbital', 'frame') and, while reading the first coefficient block,
        ``self.basis_set_order``. Returns early without setting anything when
        a ``_momatrix`` attribute already exists, when the pop keyword is not
        matched, or when no 'Orbital Coefficients' line is found.

        Note:
            Requires specification of pop(full) or pop(no) or the like.
        """
        # Already parsed
        if hasattr(self, '_momatrix'): return
        _rebas01 = r'basis functions,'
        # MOMatrix flags
        _remomat01 = r'pop.*(?=full|no)'
        _remomat02 = 'Orbital Coefficients'
        # Labels such as 'D 0' contain a space; it is removed before parsing
        _basrep = {'D 0': 'D0 ', 'F 0': 'F0 ',
                   'G 0': 'G0 ', 'H 0': 'H0 ', 'I 0': 'I0 '}
        _rebaspat = re.compile('|'.join(_basrep.keys()))
        # Check if a full MO matrix was specified in the input
        # (case-insensitive search limited by stop=1000)
        check = self.regex(_remomat01, stop=1000, flags=re.IGNORECASE)
        if not check: return
        # Find approximately where our data is
        found = self.find(_remomat02, _rebas01)
        # Get some dimensions
        ndim = len(found[_remomat02])
        # If something goes wrong
        if not ndim: return
        # First token of the first 'basis functions,' line
        nbas = int(found[_rebas01][0][1].split()[0])
        # Eigenvectors are read in groups of up to five columns
        nblocks = np.int64(np.ceil(nbas / 5))
        # Allocate a big ol' array
        coefs = np.empty((nbas ** 2, ndim), dtype=np.float64)
        # Dynamic column generation hasn't been worked out yet
        colnames = ['coef'] + ['coef' + str(i) for i in range(1, ndim)]
        # Iterate over where the data was found
        # c counts the column in the resulting momatrix table
        # NOTE(review): _csv_args is filled in below but never passed to a
        # reader in this method
        _csv_args = {'delim_whitespace': True, 'header': None}
        for c, (lno, ln) in enumerate(found[_remomat02]):
            # Advance to the first line mentioning 'eigenvalues'
            gap = 0
            while not 'eigenvalues' in self[lno + gap].lower(): gap += 1
            start = lno + gap + 1
            stop = start + nbas
            # The basis set order is printed with every chunk of eigenvectors
            if not c:
                mapr = self.basis_set.groupby(['set', 'L']).apply(
                        lambda x: x['shell'].unique()).to_dict()
                self.basis_set_order = _basis_set_order(self[start:stop], mapr,
                                                        self.atom['set'])
            # Some fudge factors due to extra lines being printed
            space = start - lno - 1
            fnbas = nbas + space
            span = start + fnbas * nblocks
            # Finally get where our chunks are
            starts = np.arange(start, span, fnbas)
            stops = np.arange(stop, span, fnbas)
            # Running offset into column c of coefs
            stride = 0
            # b counts the blocks of eigenvectors per column in momatrix
            for b, (start, stop) in enumerate(zip(starts, stops)):
                # Number of eigenvectors in this block
                ncol = len(self[start][21:].split())
                step = nbas * ncol
                _csv_args['names'] = range(ncol)
                # Massage the text so that we can read csv
                # (the first 21 characters of every line are discarded)
                block = '\n'.join([ln[21:] for ln in self[start:stop]])
                block = _rebaspat.sub(lambda m: _basrep[m.group(0)], block)
                # Enplacen the resultant unstacked values
                coefs[stride:stride + nbas * ncol, c] = pd.read_fwf(
                        six.StringIO(block), header=None,
                        widths=np.repeat(10, 5)).unstack().dropna().values
                stride += step
        # Index chi, phi
        chis = np.tile(range(nbas), nbas)
        orbs = np.repeat(range(nbas), nbas)
        momatrix = pd.DataFrame(coefs, columns=colnames)
        momatrix['chi'] = chis
        momatrix['orbital'] = orbs
        # Frame not really implemented for momatrix
        momatrix['frame'] = 0
        self.momatrix = momatrix

    def parse_basis_set_order(self):
        """
        Ensure the basis set order is available.

        Nothing is done when a ``_basis_set_order`` attribute already exists;
        otherwise :meth:`parse_momatrix` is run.
        """
        if not hasattr(self, '_basis_set_order'):
            self.parse_momatrix()


    def parse_frame(self):
        """
        Build ``self.frame`` from the atom table and add energy, electron
        count and electronic state columns ('E_tot', 'N_e', 'N_a', 'N_b',
        'state').

        The energy and state columns are best effort: a length mismatch
        (``ValueError``) or, for states, a short line (``IndexError``) leaves
        the column unset instead of raising.
        """
        energy_key = 'SCF Done:'
        elec_key = 'alpha electrons'
        state_key = 'The electronic state'
        # Get the default frame from the atom table
        self.frame = compute_frame_from_atom(self.atom)
        hits = self.find(energy_key, elec_key, state_key)
        # Extract just the total SCF energies
        energies = [float(text.split()[4]) for _, text in hits[energy_key]]
        # If 'SCF Done' prints out more times than frames, keep the trailing ones
        try:
            nframe = len(self.frame)
            if nframe != len(energies):
                energies = energies[-nframe:]
            self.frame['E_tot'] = energies
        except ValueError:
            pass
        # We will assume number of electrons doesn't change per frame
        nalpha, _, _, nbeta, _, _ = hits[elec_key][0][1].split()
        nalpha, nbeta = int(nalpha), int(nbeta)
        self.frame['N_e'] = nalpha + nbeta
        self.frame['N_a'] = nalpha
        self.frame['N_b'] = nbeta
        # Try to get the electronic state but don't try too hard
        try:
            states = [text.split()[4].replace('.', '')
                      for _, text in hits[state_key]
                      if 'initial' not in text]
            self.frame['state'] = states
        except (IndexError, ValueError):
            pass


    def parse_excitation(self):
        """
        Parse excited state summaries and their orbital contributions into
        ``self.excitation``.

        Each 'Excited State' line supplies the summary columns 'kind', 'eV',
        'nm', 'osc' and 's2'; the '->' lines that follow it supply 'occ',
        'virt' and 'cont'. Returns early, setting nothing, when 'TD' is not
        found within the searched header range, when no 'Excited State' line
        exists, or when no '->' contribution lines follow them.
        """
        # TDDFT flags
        _retddft = 'TD'
        _reexcst = 'Excited State'
        chk = self.find(_retddft, stop=1000, keys_only=True)
        if not chk: return
        # Find the data
        found = self.find(_reexcst)
        # Nothing to read; avoids handing an empty buffer to read_csv
        if not found: return
        # keeps: line numbers of contributions, maps: owning state index
        keeps, maps = [], []
        for i, (lno, _) in enumerate(found):
            lno += 1
            while '->' in self[lno]:
                keeps.append(lno)
                maps.append(i)
                lno += 1
        if not keeps: return
        # Integer names mark columns that are read but discarded
        cols = [0, 1, 2, 'kind', 'eV', 3, 'nm', 4, 'osc', 's2']
        summ = pd.read_csv(six.StringIO('\n'.join([ln for _, ln in found])),
                           sep=r'\s+', header=None, names=cols,
                           usecols=[c for c in cols if isinstance(c, str)])
        # Strip the leading label characters before converting to float
        summ['s2'] = summ['s2'].str[7:].astype(np.float64)
        summ['osc'] = summ['osc'].str[2:].astype(np.float64)
        cols = ['occ', 0, 'virt', 'cont']
        conts = pd.read_csv(six.StringIO('\n'.join([self[i] for i in keeps])),
                            sep=r'\s+', header=None, names=cols,
                            usecols=[c for c in cols if isinstance(c, str)])
        conts['map'] = maps
        # Broadcast every summary column onto its contribution rows
        for col in summ.columns:
            conts[col] = conts['map'].map(summ[col])
        conts['energy'] = conts['eV'] * Energy['eV', 'Ha']
        conts['frame'] = conts['group'] = 0
        self.excitation = conts


    def parse_frequency(self):
        """
        Parse normal mode displacements into ``self.frequency``.

        The result has the columns 'Z', 'label', 'dx', 'dy', 'dz',
        'frequency', 'freqdx', 'symbol' and 'frame'. Returns early, setting
        nothing, when 'Freq' is not matched (case-insensitively) in the
        searched header range or when no frequency block lines remain after
        trimming.
        """
        # Frequency flags
        _refreq = 'Freq'
        # Same header check as the other optional parsers in this class
        if not self.regex(_refreq, stop=1000, flags=re.IGNORECASE): return
        # Don't need the input deck or 2 from the summary at the end
        found = self.find(_refreq)[1:-2]
        if not found: return
        # Total lines per block minus the unnecessary ones
        # NOTE(review): needs at least two blocks; a single block raises
        # IndexError here -- confirm how such outputs should be handled
        span = found[1][0] - found[0][0] - 7
        dfs, fdx = [], 0
        # Iterate over what we found
        for lno, ln in found:
            # Get the frequencies first
            # NOTE(review): these stay strings, so the 'frequency' column is
            # not numeric -- confirm whether callers cast it
            freqs = ln[15:].split()
            nfreqs = len(freqs)
            # Get just the atom displacement vectors
            start = lno + 5
            stop = start + span
            cols = range(2 + 3 * nfreqs)
            df = self.pandas_dataframe(start, stop, ncol=cols)
            # One column list per Cartesian component (always three), each
            # holding that component for every frequency in the block
            slices = [list(range(2 + i, 2 + 3 * nfreqs, 3)) for i in range(3)]
            dx, dy, dz = [df[i].unstack().values for i in slices]
            # Generate the appropriate dimensions of other columns
            labels = np.tile(df[0].values, nfreqs)
            zs = np.tile(df[1].values, nfreqs)
            freqdxs = np.repeat(range(fdx, fdx + nfreqs), df.shape[0])
            freqs = np.repeat(freqs, df.shape[0])
            fdx += nfreqs
            # Put it all together
            stacked = pd.DataFrame.from_dict({'Z': zs, 'label': labels,
                                    'dx': dx, 'dy': dy, 'dz': dz,
                                    'frequency': freqs, 'freqdx': freqdxs})
            stacked['symbol'] = stacked['Z'].map(z2sym)
            dfs.append(stacked)
        # Now put all our frequencies together
        frequency = pd.concat(dfs).reset_index(drop=True)
        # Pretty sure displacements are in cartesian angstroms
        # TODO: verify with an external program that vibrational
        #       modes look the same as the ones generated with
        #       this methodology.
        for axis in ('dx', 'dy', 'dz'):
            frequency[axis] *= Length['Angstrom', 'au']
        # Frame not really implemented here either
        frequency['frame'] = 0
        self.frequency = frequency

    # Below are triangular matrices -- One electron integrals

    def parse_overlap(self):
        """
        Parse the matrix printed after '*** Overlap ***' into
        ``self.overlap``; nothing is set when it is not found.
        """
        overlap = self._parse_triangular_matrix('*** Overlap ***', 'coef')
        if overlap is None:
            return
        self.overlap = overlap

    def parse_multipole(self):
        """
        Parse the three 'IX=' matrices into ``self.multipole`` with the value
        columns 'ix1', 'ix2' and 'ix3'; nothing is set when the first one is
        not found.
        """
        label = 'IX=    {}'
        mltpl = self._parse_triangular_matrix(label.format(1), 'ix1')
        if mltpl is None:
            return
        # The remaining components only contribute their value arrays
        for comp in (2, 3):
            name = 'ix{}'.format(comp)
            mltpl[name] = self._parse_triangular_matrix(label.format(comp),
                                                        name, True)
        self.multipole = mltpl

    def __init__(self, *args, **kwargs):
        """Forward all positional and keyword arguments to the parent initializer."""
        # Explicit two-argument super() is kept as written in the file.
        super(Output, self).__init__(*args, **kwargs)

Example #6

Show file

File: output.py Project: chrinide/exatomic

class Output(six.with_metaclass(OutMeta, Editor)):
    """Editor for NWChem calculation output file (stdout)."""
    def parse_atom(self):
        """
        Parse the atom dataframe.

        Geometry blocks are read between the 'Geometry "' and 'Atomic Mass'
        markers, coordinates are scaled by ``Length[unit, 'au']`` and, when
        more than one frame was read, the last frame is discarded.
        """
        geom_key = 'Geometry "'
        mass_key = 'Atomic Mass'
        ecp_key = 'ECP       "ecp basis"'
        unit_key = 'Output coordinates in'
        hits = self.find(geom_key, mass_key, ecp_key, unit_key,
                         keys_only=True)
        unit = self[hits[unit_key][0]].split()[3]
        unit = "Angstrom" if unit == "angstroms" else "au"
        starts = np.array(hits[geom_key]) + 7
        stops = np.array(hits[mass_key]) - 1
        # Tag -> integer read from the fourth field of each ECP line
        ecps = {}
        for ln in np.array(hits[ecp_key]) + 2:
            fields = self[ln].split()
            ecps[fields[0]] = int(fields[3])
        columns = ['label', 'tag', 'Z', 'x', 'y', 'z']
        pieces = [self.pandas_dataframe(begin, end, columns)
                  for begin, end in zip(starts, stops)]
        atom = pd.concat(pieces)
        letters = atom['tag'].str.extract('([A-z]{1,})([0-9]*)',
                                          expand=False)[0]
        atom['symbol'] = letters.str.lower().str.title()
        atom['Z'] = atom['Z'].astype(np.int64)
        ecp_vals = atom['tag'].map(ecps).fillna(value=0)
        atom['Zeff'] = (atom['Z'] - ecp_vals).astype(np.int64)
        # Frame count from the most repeated label, atom count from the
        # largest label
        nf = atom.label.value_counts().max()
        nat = atom.label.max()
        atom['frame'] = [fdx for fdx in range(nf) for _ in range(nat)]
        atom['label'] -= 1
        for axis in ('x', 'y', 'z'):
            atom[axis] *= Length[unit, 'au']
        last = atom['frame'].max()
        if last > 0:
            # Discard the final frame
            atom = atom[~(atom['frame'] == last)].reset_index(drop=True)
        del atom['label']
        self.atom = Atom(atom)

    def parse_orbital(self):
        """
        Parse the :class:`~exatomic.core.orbital.Orbital` dataframe.

        When any 'Molecular Orbital Analysis' header line mentions 'Alpha',
        alpha and beta blocks are parsed separately and given ``spin`` 0 and
        1; otherwise every block gets ``spin`` 0. Returns early, setting
        nothing, when no header line is found.
        """
        _remo01 = 'Molecular Orbital Analysis'
        _remo02 = 'alpha - beta orbital overlaps'
        _remo03 = 'center of mass'
        check = self.find(_remo01)
        # No analysis printed; there is nothing to index below
        if not check:
            return
        # Each hit is a (line number, text) pair, so test the text itself
        if any('Alpha' in line for _, line in check):
            alpha_starts = np.array(
                [no for no, line in check if 'Alpha' in line],
                dtype=np.int64) + 2
            alpha_stops = np.array(
                [no for no, line in check if 'Beta' in line],
                dtype=np.int64) - 1
            beta_starts = alpha_stops + 3
            beta_stops = np.array(self.find(_remo02, keys_only=True),
                                  dtype=np.int64) - 1
            alpha_orbital = self._parse_orbital(alpha_starts, alpha_stops)
            beta_orbital = self._parse_orbital(beta_starts, beta_stops)
            alpha_orbital['spin'] = 0
            beta_orbital['spin'] = 1
            orbital = pd.concat((alpha_orbital, beta_orbital),
                                ignore_index=True)
        else:
            starts = np.array([no for no, _ in check], dtype=np.int64) + 2
            stops = np.array(self.find(_remo03, keys_only=True),
                             dtype=np.int64) - 1
            orbital = self._parse_orbital(starts, stops)
            orbital['spin'] = 0
        orbital['group'] = 0
        self.orbital = Orbital(orbital)

    def parse_momatrix(self):
        """
        Parse the :class:`~exatomic.core.orbital.MOMatrix` dataframe.

        Nothing is set when no 'Final MO vectors' line is found. The raw
        fixed-width table is also kept on ``self.c``.

        Note:
            Must supply 'print "final vectors" "final vectors analysis"' for momatrix
        """
        vec_key = "Final MO vectors"
        com_key = "center of mass"
        hits = self.find(vec_key, com_key)
        if not hits[vec_key]:
            return
        first = hits[vec_key][0][0] + 6
        last = hits[com_key][0][0] - 1
        raw = pd.read_fwf(StringIO("\n".join(self[first:last])),
                          widths=(6, 12, 12, 12, 12, 12, 12),
                          names=list(range(7)))
        self.c = raw
        # Rows whose first field is empty are dropped, then the first
        # column itself is removed
        blank = raw[raw[0].isnull()].index.values
        c = raw[~raw.index.isin(blank)]
        del c[0]
        nbas = len(self.basis_set_order)
        nchunk = c.shape[0] // nbas
        # Manual equivalent of numpy.array_split(df, n); array_split on
        # dataframes seemed to give splits of the wrong size
        chunks = [
            c.iloc[i * nbas:(i + 1) * nbas, :]
             .astype(float).dropna(axis=1).values.ravel("F")
            for i in range(nchunk)
        ]
        coef = np.concatenate(chunks)
        orbital, chi = _square_indices(nbas)
        self.momatrix = MOMatrix.from_dict({
            'coef': coef,
            'chi': chi,
            'orbital': orbital,
            'frame': 0
        })

    def _parse_orbital(self, starts, stops):
        '''
        This function actually performs parsing of :class:`~exatomic.orbital.Orbital`

        Args:
            starts: First line number of each block to read
            stops: Line number ending each block (exclusive)

        Returns:
            DataFrame with the columns 'x', 'y', 'z', 'frame', 'vector',
            'occupation' and 'energy', one row per 'Vector' line. The frame
            value is the zero-based index of the (start, stop) pair the
            vector was read from.

        See Also:
            :func:`~exnwchem.output.Output.parse_orbital`
        '''
        joined = '\n'.join(
            ['\n'.join(self[s:e]) for s, e in zip(starts, stops)])
        nvec = joined.count('Vector')
        if 'spherical' not in self.meta:
            self.parse_basis_set()
        # NOTE(review): nbas is computed but not used below
        mapper = self.basis_set.functions(
            self.meta['spherical']).groupby(level="set").sum()
        nbas = self.atom['set'].map(mapper).sum()
        nbas *= nvec
        # Orbital dataframe -- alternatively one could parse the strings
        # into the DataFrame and then use the pd.Series.str methods to
        # perform all the replacements at the same time, eg. 'D' --> 'E'
        # and 'Occ=' --> '', etc.
        orb_no = np.empty((nvec, ), dtype=np.int64)
        occ = np.empty((nvec, ), dtype=np.float64)
        nrg = np.empty((nvec, ), dtype=np.float64)
        x = np.empty((nvec, ), dtype=np.float64)
        y = np.empty((nvec, ), dtype=np.float64)
        z = np.empty((nvec, ), dtype=np.float64)
        frame = np.empty((nvec, ), dtype=np.int64)
        oc = 0  # Orbital counter
        for fc, (s, e) in enumerate(zip(starts, stops)):
            for line in self[s:e]:
                ls = line.split()
                if 'Vector' in line:
                    orb_no[oc] = ls[1]
                    occ[oc] = ls[2].replace('Occ=', '').replace('D', 'E')
                    # A negative energy is fused to its label ('E=-...') and
                    # sits in field 3; otherwise the value is field 4
                    if 'E=-' in line:
                        energy = ls[3].replace('E=', '')
                    else:
                        energy = ls[4]
                    nrg[oc] = energy.replace('D', 'E')
                    frame[oc] = fc
                elif 'MO Center' in line:
                    x[oc] = ls[2].replace(',', '').replace('D', 'E')
                    y[oc] = ls[3].replace(',', '').replace('D', 'E')
                    z[oc] = ls[4].replace(',', '').replace('D', 'E')
                    # The 'MO Center' line closes the record for this vector
                    oc += 1
        # Zero-based indexing
        orb_no -= 1
        return pd.DataFrame.from_dict({
            'x': x,
            'y': y,
            'z': z,
            'frame': frame,
            'vector': orb_no,
            'occupation': occ,
            'energy': nrg
        })

    def parse_basis_set(self):
        """
        Parse the :class:`~exatomic.core.basis.BasisSet` dataframe.

        Also records ``self.meta['spherical']`` (whether the first
        ' Basis "' line contains "spherical") and fills ``self.atom['set']``
        by mapping atom tags onto integer set indices. The atom table is
        parsed first when it is not available yet.
        """
        if not hasattr(self, "atom"):
            self.parse_atom()
        basis_key = ' Basis "'
        summary_key = ' Summary of "'
        hits = self.find(basis_key, summary_key)
        spherical = "spherical" in hits[basis_key][0][1]
        first = hits[basis_key][0][0] + 2
        summaries = hits[summary_key]
        # Second summary when several are printed, otherwise the last one
        which = 1 if len(summaries) > 1 else -1
        last = summaries[which][0] - 1
        # Read in all of the extra lines that contain ---- and tag names
        df = pd.read_fwf(StringIO("\n".join(self[first:last])),
                         widths=(4, 2, 16, 16),
                         names=("shell", "L", "alpha", "d"))
        df.loc[df['shell'] == "--", "shell"] = np.nan
        # Rows whose first field is not a number carry the tag names
        tags = df.loc[(df['shell'].str.isdigit() == False), "shell"]
        bounds = tags.index.tolist() + [len(df)]
        df['set'] = ""
        # Label every row from one tag line up to the next with that tag
        for lo, hi, tag in zip(bounds, bounds[1:], tags):
            df.loc[lo:hi, "set"] = tag
        df = df.dropna().reset_index(drop=True)
        mapper = {tag: num for num, tag in enumerate(df['set'].unique())}
        df['set'] = df['set'].map(mapper)
        df['L'] = df['L'].str.strip().str.lower().map(lmap)
        for col in ('alpha', 'd'):
            df[col] = df[col].astype(float)
        # NO SUPPORT FOR MULTIPLE FRAMES?
        df['frame'] = 0
        self.basis_set = BasisSet(df)
        self.meta['spherical'] = spherical
        self.atom['set'] = self.atom['tag'].map(mapper)

    def parse_basis_set_order(self):
        """
        Build ``self.basis_set_order``, one row per basis function.

        Rows carry 'center', 'shell', 'L', 'frame' and either 'ml'
        (spherical) or 'l', 'm', 'n' (Cartesian). The final 'shell' value of
        a row is the number of earlier rows sharing its center, L and
        function label, so it is assigned row by row and stays aligned with
        the table whatever the shell ordering is. The basis set is parsed
        first when ``self.meta`` has no 'spherical' entry yet.
        """
        if 'spherical' not in self.meta:
            self.parse_basis_set()
        spherical = self.meta['spherical']
        # Columns that identify a function within its (center, L) shell
        func_cols = ['ml'] if spherical else ['l', 'm', 'n']
        dtype = [('center', 'i8'), ('shell', 'i8'), ('L', 'i8')]
        dtype += [(col, 'i8') for col in func_cols]
        mapper = self.basis_set.functions(spherical).groupby(level="set").sum()
        nbas = self.atom['set'].map(mapper).sum()
        bso = np.empty((nbas, ), dtype=dtype)
        cnt = 0
        bases = self.basis_set.groupby('set')
        for seht, center in zip(self.atom['set'], self.atom.index):
            for shell, grp in bases.get_group(seht).groupby('shell'):
                ang = grp['L'].values[0]
                if spherical:
                    for ml in spherical_ordering_function(ang):
                        bso[cnt] = (center, shell, ang, ml)
                        cnt += 1
                else:
                    for _, ll, m, n in cartesian_ordering_function(ang):
                        bso[cnt] = (center, shell, ang, ll, m, n)
                        cnt += 1
        bso = pd.DataFrame(bso)
        bso['frame'] = 0
        # New shell definition consistent with basis internals: count how
        # often each (center, L, function label) has been seen so far
        seen = defaultdict(int)
        shls = []
        keys = zip(bso['center'], bso['L'], *[bso[col] for col in func_cols])
        for key in keys:
            shls.append(seen[key])
            seen[key] += 1
        bso['shell'] = shls
        self.basis_set_order = bso

    def parse_roa(self):
        """
        Parse the :class:`~exatomic.core.tensor.Polarizability` dataframe. This will parse the
        output from the Raman Optical Activity outputs.

        Returns early, setting nothing, when no 'roa begin' line is found.
        Otherwise ``self.roa`` is the concatenation of the four 2D tensors
        and the six rows built from the dipole-quadrupole blocks, with the
        columns 'xx' ... 'zz', 'label', 'type' and 'frame'.

        Note:
            We generate a 3D tensor with the 2D tensor code. 3D tensors will have 3 rows labeled
            with the same name.
        """
        _reroa = 'roa begin'
        _reare = 'alpha real'
        _reaim = 'alpha im'
        #        _reombre = 'beta real'
        #        _reombim = 'beta im'
        _reombre = 'omega beta(real)'
        _reombim = 'omega beta(imag)'
        _redqre = 'dipole-quadrupole real (Cartesian)'
        _redqim = 'dipole-quadrupole imag (Cartesian)'

        if not self.find(_reroa):
            return
        found_2d = self.find(_reare,
                             _reaim,
                             _reombre,
                             _reombim,
                             keys_only=True)
        found_3d = self.find(_redqre, _redqim, keys_only=True)
        # NOTE(review): this dict is replaced by a list a few lines below
        data = {}
        # The reshape to (4,) requires exactly one hit per 2D key; nine
        # lines are read after each hit
        start = np.array(list(found_2d.values())).reshape(4, ) + 1
        end = np.array(list(found_2d.values())).reshape(4, ) + 10
        columns = ['x', 'val']
        data = [
            self.pandas_dataframe(s, e, columns) for s, e in zip(start, end)
        ]
        df = pd.concat([dat for dat in data]).reset_index(drop=True)
        # Tag each run of nine rows with the index of its block
        df['grp'] = [i for i in range(4) for j in range(9)]
        df = df[['val', 'grp']]
        # One row of nine values per block; the trailing nine unstacked
        # entries (the 'grp' column) are cut off
        df = pd.DataFrame(
            df.groupby('grp').apply(
                lambda x: x.unstack().values[:-9]).values.tolist(),
            columns=['xx', 'xy', 'xz', 'yx', 'yy', 'yz', 'zx', 'zy', 'zz'])
        # find the electric dipole-quadrupole polarizability
        # NWChem gives this as a list of 18 values assuming the matrix to be symmetric
        # for our implementation we need to extend it to 27 elements
        # TODO: check that NWChem does assume that the 3D tensors are symmetric
        start = np.sort(np.array(list(found_3d.values())).reshape(2, )) + 1
        end = np.sort(np.array(list(found_3d.values())).reshape(2, )) + 19
        data = [
            self.pandas_dataframe(s, e, columns) for s, e in zip(start, end)
        ]
        df3 = pd.concat([dat for dat in data]).reset_index(drop=True)
        vals = df3['val'].values.reshape(2, 3, 6)
        adx = np.triu_indices(3)
        mat = np.zeros((2, 3, 3, 3))
        for i in range(2):
            for j in range(3):
                # Fill the upper triangle, then mirror it; the diagonal is
                # subtracted once so it is not counted twice
                mat[i][j][adx] = vals[i][j]
                mat[i][j] = mat[i][j] + np.transpose(
                    mat[i][j]) - np.identity(3) * mat[i][j]
        mat = mat.reshape(18, 3)
        df3 = pd.DataFrame(mat, columns=['x', 'y', 'z'])
        df3['grp1'] = [i for i in range(2) for j in range(9)]
        df3['grp2'] = [j for i in range(2) for j in range(3) for n in range(3)]
        # Flatten each (grp1, grp2) group into one row of nine values; the
        # trailing six unstacked entries (the two group columns) are cut off
        df3 = pd.DataFrame(
            df3.groupby([
                'grp1', 'grp2'
            ]).apply(lambda x: x.unstack().values[:-6]).values.tolist(),
            columns=['xx', 'xy', 'xz', 'yx', 'yy', 'yz', 'zx', 'zy', 'zz'],
            index=[
                'Ax_real', 'Ay_real', 'Az_real', 'Ax_imag', 'Ay_imag',
                'Az_imag'
            ])
        # Split the index names into the label and the real/imag part
        split_label = np.transpose([i.split('_') for i in df3.index.values])
        label = split_label[0]
        types = split_label[1]
        # NOTE(review): assumes the key order of found_2d matches the row
        # order of df -- confirm against self.find
        df['label'] = found_2d.keys()
        df['label'].replace(
            [_reare, _reombre, _reaim, _reombim],
            ['alpha-real', 'g_prime-real', 'alpha-imag', 'g_prime-imag'],
            inplace=True)
        df['type'] = [i.split('-')[-1] for i in df['label'].values]
        df['label'] = [i.split('-')[0] for i in df['label'].values]
        df['frame'] = np.repeat([0], len(df.index))
        df3['label'] = label
        df3['type'] = types
        df3['frame'] = np.repeat([0], len(df3.index))
        self.roa = pd.concat([df, df3], ignore_index=True)

    def parse_frequency(self):
        """
        Parse the :class:`~exatomic.core.atom.Frequency` dataframe.

        The table below "Atom information" supplies the atom symbols and the
        atom indices (shifted down by one). The normal mode displacements are
        read from the "Frequency" blocks located between the first two
        occurrences of "NORMAL MODE EIGENVECTORS IN CARTESIAN COORDINATES".
        The assembled table is stored on ``self.frequency``. Nothing is set
        when either header is missing or no "Frequency" line is found.

        Note:
            This code removes all negative frequencies.
        """
        _remeth = "NORMAL MODE EIGENVECTORS IN CARTESIAN COORDINATES"
        _refreq = "Frequency"
        _renat = "Atom information"

        found = self.find(_remeth)
        fnat = self.find(_renat)
        # both results are indexed below, so give up if either one is missing
        if not found or not fnat:
            return
        # get atom information
        # we assume that there is only one instance of where _renat is found
        start = fnat[0][0] + 3
        stop = start
        while '----' not in self[stop]:
            stop += 1
        columns = ['symbol', 'atom', 'x', 'y', 'z', 'mass']
        atom = self.pandas_dataframe(start, stop, columns)
        atom['atom'] -= 1
        nat = len(atom)
        # find bounds where the calculated frequencies are: the region between
        # the first and the second occurrence of _remeth
        start = found[0][0]
        stop = found[1][0]
        # get the data
        found = self.find(_refreq, start=start, stop=stop)
        dfs = []
        # running index of the first frequency in the current block
        fdx = 0
        for lno, ln in found:
            # get the frequency values (first token is the row label)
            freq = np.asarray([float(i) for i in ln.split()[1:]])
            ## TODO: here we remove all negative frequencies
            ##       need to find out if this is ok to do
            # set start and end points for the calculated normal modes;
            # lno is added to start to address the line in the whole file
            staf = lno + start + 1
            stof = lno + start + nat * 3 + 2
            nm = self.pandas_dataframe(staf, stof,
                                       ncol=len(freq)).reset_index(drop=True)
            # boolean mask that is False for negative frequencies
            keep = [not f < 0 for f in freq]
            # remove the columns belonging to negative frequencies
            nm.drop(columns=[idx for idx, val in enumerate(keep) if not val],
                    inplace=True)
            freq = freq[keep]
            # get normal modes in the x, y, z directions
            # NOTE(review): DataFrame.stack() flattens row by row; confirm that
            # taking every third value lines up with the frequency-major
            # ordering of symbol/atom/freq built below
            nm = nm.stack().values
            nfreq = len(freq)
            dx = nm[::3]
            dy = nm[1::3]
            dz = nm[2::3]
            # assemble dataframe: one row per (frequency, atom) pair
            symbol = np.tile(atom['symbol'], nfreq)
            adx = np.tile(atom['atom'], nfreq)
            freq = np.repeat(freq, nat)
            # each frequency index spans nat rows, exactly like freq above
            freqdx = np.repeat(np.arange(fdx, fdx + nfreq), nat)
            frames = np.repeat([0], nfreq * nat)
            fdx += nfreq
            stacked = pd.DataFrame.from_dict({
                'symbol': symbol,
                'atom': adx,
                'dx': dx,
                'dy': dy,
                'dz': dz,
                'freq': freq,
                'freqdx': freqdx,
                # NOTE(review): other parsers in this class name this column
                # 'frame' -- confirm that 'frames' is intended
                'frames': frames
            })
            dfs.append(stacked)
        # pd.concat raises on an empty list, so stop if nothing was parsed
        if not dfs:
            return
        frequency = pd.concat(dfs).reset_index(drop=True)
        self.frequency = frequency

    def parse_gradient(self):
        """
        Parse :class:`exatomic.core.gradient.Gradient` dataframe.

        Every table headed by "DFT ENERGY GRADIENTS" becomes one frame and the
        concatenated result is stored on ``self.gradient``. Nothing is set
        when the header does not occur.
        """
        _regrad = "DFT ENERGY GRADIENTS"

        if not self.find(_regrad):
            return
        # the table body begins four lines below each header
        first_lines = np.array(self.find(_regrad, keys_only=True)) + 4
        # measure the first table by walking forward to its dashed rule
        cursor = first_lines[0]
        while '----' not in self[cursor]:
            cursor += 1
        # the line right before the rule is empty, so leave it out
        cursor -= 1
        # every table is taken to be as long as the first one
        last_lines = first_lines + (cursor - first_lines[0])
        read_cols = ['atom', 'symbol', 'x', 'y', 'z', 'fx', 'fy', 'fz']
        keep_cols = ['atom', 'symbol', 'fx', 'fy', 'fz', 'frame']
        blocks = []
        for frame, (lo, hi) in enumerate(zip(first_lines, last_lines)):
            block = self.pandas_dataframe(lo, hi, read_cols)
            block['frame'] = frame
            blocks.append(block[keep_cols])
        # stitch the per-frame tables together under a fresh index
        merged = pd.concat(blocks).reset_index(drop=True)
        merged['Z'] = merged['symbol'].map(sym2z)
        # keep the column order roughly in line with the other dataframes
        ordered = ['Z', 'atom', 'fx', 'fy', 'fz', 'symbol', 'frame']
        self.gradient = merged[ordered]

    def parse_frame(self):
        """
        Create a minimal :class:`~exatomic.core.frame.Frame` from the (parsed)
        :class:`~exatomic.core.atom.Atom` object.

        When 'Total SCF energy' or 'Total DFT energy' lines are present, the
        last token of each matching line is stored as ``total_energy``. The
        DFT values are used if both kinds are found.
        """
        _rescfen = 'Total SCF energy'
        _redften = 'Total DFT energy'
        self.frame = compute_frame_from_atom(self.atom)
        matches = self.find(_rescfen, _redften)
        scf_hits = matches[_rescfen]
        dft_hits = matches[_redften]
        if scf_hits and dft_hits:
            print('Warning: found total energies from scf and dft, using dft')
        # DFT results take precedence; SCF is only the fallback
        chosen = dft_hits or scf_hits
        if chosen:
            self.frame['total_energy'] = [
                float(line.split()[-1]) for _, line in chosen
            ]

    def __init__(self, *args, **kwargs):
        """Pass every argument through to the parent class initializer."""
        # explicit two-argument super() keeps this valid on Python 2 as well
        super(Output, self).__init__(*args, **kwargs)