class Output(six.with_metaclass(OutMeta, Editor)):
    """Editor for NWChem calculation output file (stdout)."""

    def parse_atom(self):
        """
        Parse the atom dataframe and store it as ``self.atom``.

        Every geometry block found between a 'Geometry "' header and the
        following 'Atomic Mass' header is read. Coordinates are scaled with
        ``Length[unit, 'au']`` where ``unit`` is taken from the
        'Output coordinates in' line. When more than one frame is found the
        last frame is dropped.
        """
        _reatom01 = 'Geometry "'
        _reatom02 = 'Atomic Mass'
        _reatom03 = 'ECP "ecp basis"'
        _reatom04 = 'Output coordinates in'
        found = self.find(_reatom01, _reatom02, _reatom03, _reatom04,
                          keys_only=True)
        unit = self[found[_reatom04][0]].split()[3]
        unit = "Angstrom" if unit == "angstroms" else "au"
        starts = np.array(found[_reatom01]) + 7
        stops = np.array(found[_reatom02]) - 1
        # Map atom tag -> integer read from the 4th field of the ECP line
        # (subtracted from Z below to obtain Zeff).
        ecp_lines = np.array(found[_reatom03]) + 2
        ecps = {self[ln].split()[0]: int(self[ln].split()[3])
                for ln in ecp_lines}
        columns = ['label', 'tag', 'Z', 'x', 'y', 'z']
        atom = pd.concat([self.pandas_dataframe(s, e, columns)
                          for s, e in zip(starts, stops)])
        # The original pattern used the range [A-z], which also matches the
        # punctuation characters between 'Z' and 'a' ("[", "\\", "]", "^",
        # "_", "`"); only letters belong to the element symbol.
        atom['symbol'] = atom['tag'].str.extract(
            r'([A-Za-z]+)([0-9]*)', expand=False)[0].str.lower().str.title()
        atom['Z'] = atom['Z'].astype(np.int64)
        atom['Zeff'] = (atom['Z']
                        - atom['tag'].map(ecps).fillna(value=0)).astype(np.int64)
        # Number of frames = how often the most common label occurs;
        # number of atoms per frame = the largest (1-based) label.
        nf = atom.label.value_counts().max()
        nat = atom.label.max()
        atom['frame'] = [i for i in range(nf) for _ in range(nat)]
        atom['label'] -= 1
        for axis in ('x', 'y', 'z'):
            atom[axis] *= Length[unit, 'au']
        if atom['frame'].max() > 0:
            # Discard the last frame when several geometries were printed.
            li = atom['frame'].max()
            atom = atom[~(atom['frame'] == li)]
            atom.reset_index(drop=True, inplace=True)
        del atom['label']
        self.atom = Atom(atom)

    def parse_orbital(self):
        """
        Parse the :class:`~exatomic.core.orbital.Orbital` dataframe.

        If any 'Molecular Orbital Analysis' header line mentions 'Alpha' the
        output is treated as open shell and alpha (spin 0) and beta (spin 1)
        blocks are parsed separately; otherwise a single set with spin 0 is
        parsed. The result is stored as ``self.orbital``.
        """
        orbital = None
        _remo01 = 'Molecular Orbital Analysis'
        _remo02 = 'alpha - beta orbital overlaps'
        _remo03 = 'center of mass'
        check = self.find(_remo01)
        # ``check`` holds (line number, line text) pairs. The original test
        # ``'Alpha' in value`` was applied to the pair itself (tuple
        # membership) and therefore never matched a substring of the line.
        if any('Alpha' in line for _, line in check):
            alpha_starts = np.array(
                [no for no, line in check if 'Alpha' in line],
                dtype=np.int64) + 2
            alpha_stops = np.array(
                [no for no, line in check if 'Beta' in line],
                dtype=np.int64) - 1
            beta_starts = alpha_stops + 3
            beta_stops = np.array(self.find(_remo02, keys_only=True),
                                  dtype=np.int64) - 1
            alpha_orbital = self._parse_orbital(alpha_starts, alpha_stops)
            beta_orbital = self._parse_orbital(beta_starts, beta_stops)
            alpha_orbital['spin'] = 0
            beta_orbital['spin'] = 1
            orbital = pd.concat((alpha_orbital, beta_orbital),
                                ignore_index=True)
        else:
            starts = np.array(list(zip(*check))[0], dtype=np.int64) + 2
            stops = np.array(self.find(_remo03, keys_only=True),
                             dtype=np.int64) - 1
            orbital = self._parse_orbital(starts, stops)
            orbital['spin'] = 0
        orbital['group'] = 0
        self.orbital = Orbital(orbital)

    def parse_momatrix(self):
        """
        Parse the :class:`~exatomic.core.orbital.MOMatrix` dataframe.

        Does nothing when no 'Final MO vectors' section is present.

        Note:
            Must supply 'print "final vectors" "final vectors analysis"' for momatrix
        """
        key0 = "Final MO vectors"
        key1 = "center of mass"
        found = self.find(key0, key1)
        if not found[key0]:
            return
        start = found[key0][0][0] + 6
        end = found[key1][0][0] - 1
        c = pd.read_fwf(StringIO("\n".join(self[start:end])),
                        widths=(6, 12, 12, 12, 12, 12, 12),
                        names=list(range(7)))
        # NOTE(review): the raw table is kept on the editor as ``self.c``;
        # this looks like a debugging leftover but is preserved in case
        # callers rely on it.
        self.c = c
        # Rows whose first column is empty carry no coefficients.
        idx = c[c[0].isnull()].index.values
        c = c[~c.index.isin(idx)]
        del c[0]
        nbas = len(self.basis_set_order)
        n = c.shape[0] // nbas
        # Manual equivalent of numpy.array_split(df, n); using
        # numpy.array_split with dataframes seemed to produce splits of the
        # wrong size.
        coefs = [
            c.iloc[i * nbas:(i + 1) * nbas, :]
             .astype(float).dropna(axis=1).values.ravel("F")
            for i in range(n)
        ]
        c = np.concatenate(coefs)
        del coefs
        orbital, chi = _square_indices(len(self.basis_set_order))
        self.momatrix = MOMatrix.from_dict({'coef': c, 'chi': chi,
                                            'orbital': orbital, 'frame': 0})

    def _parse_orbital(self, starts, stops):
        """
        This function actually performs parsing of :class:`~exatomic.orbital.Orbital`

        Args:
            starts: first line number of each orbital analysis block
            stops: line number one past the end of each block

        Returns:
            pandas.DataFrame with columns x, y, z, frame, vector, occupation
            and energy; one row per 'Vector' entry.

        See Also:
            :func:`~exnwchem.output.Output.parse_orbital`
        """
        joined = '\n'.join(['\n'.join(self[s:e])
                            for s, e in zip(starts, stops)])
        nvec = joined.count('Vector')
        if 'spherical' not in self.meta:
            self.parse_basis_set()
        # NOTE(review): ``nbas`` is not used below; the computation is kept
        # because accessing basis_set / atom may trigger parsing - confirm
        # before removing.
        mapper = self.basis_set.functions(
            self.meta['spherical']).groupby(level="set").sum()
        nbas = self.atom['set'].map(mapper).sum()
        nbas *= nvec
        # Orbital dataframe -- alternatively one could parse the strings
        # into the DataFrame and then use the pd.Series.str methods to
        # perform all the replacements at the same time, eg. 'D' --> 'E'
        # and 'Occ=' --> '', etc.
        orb_no = np.empty((nvec, ), dtype=np.int64)
        occ = np.empty((nvec, ), dtype=np.float64)
        nrg = np.empty((nvec, ), dtype=np.float64)
        x = np.empty((nvec, ), dtype=np.float64)
        y = np.empty((nvec, ), dtype=np.float64)
        z = np.empty((nvec, ), dtype=np.float64)
        frame = np.empty((nvec, ), dtype=np.int64)
        fc = -1  # Frame counter
        oc = 0   # Orbital counter
        for s, e in zip(starts, stops):
            fc += 1
            for line in self[s:e]:
                ls = line.split()
                if 'Vector' in line:
                    orb_no[oc] = ls[1]
                    occ[oc] = ls[2].replace('Occ=', '').replace('D', 'E')
                    # A negative energy is glued to its label ('E=-...') and
                    # sits in field 3; otherwise the value is in field 4.
                    if 'E=-' in line:
                        nrg[oc] = ls[3].replace('E=', '').replace('D', 'E')
                    else:
                        nrg[oc] = ls[4].replace('D', 'E')
                    frame[oc] = fc
                elif 'MO Center' in line:
                    x[oc] = ls[2].replace(',', '').replace('D', 'E')
                    y[oc] = ls[3].replace(',', '').replace('D', 'E')
                    z[oc] = ls[4].replace(',', '').replace('D', 'E')
                    # The 'MO Center' line closes the current vector entry.
                    oc += 1
        orb_no -= 1
        # The y column previously received the z array, so the parsed y
        # coordinate was lost.
        return pd.DataFrame.from_dict({'x': x, 'y': y, 'z': z,
                                       'frame': frame, 'vector': orb_no,
                                       'occupation': occ, 'energy': nrg})

    def parse_basis_set(self):
        """
        Parse the :class:`~exatomic.core.basis.BasisSet` dataframe.

        Also records ``self.meta['spherical']`` and adds a 'set' column to
        ``self.atom`` mapping each atom tag to its basis set index.
        """
        if not hasattr(self, "atom"):
            self.parse_atom()
        _rebas01 = ' Basis "'
        _rebas02 = ' Summary of "'
        found = self.find(_rebas01, _rebas02)
        spherical = "spherical" in found[_rebas01][0][1]
        start = found[_rebas01][0][0] + 2
        idx = 1 if len(found[_rebas02]) > 1 else -1
        stop = found[_rebas02][idx][0] - 1
        # Read in all of the extra lines that contain ---- and tag names
        df = pd.read_fwf(StringIO("\n".join(self[start:stop])),
                         widths=(4, 2, 16, 16),
                         names=("shell", "L", "alpha", "d"))
        df.loc[df['shell'] == "--", "shell"] = np.nan
        # ``== False`` is deliberate: rows where isdigit() is NaN compare
        # unequal to False and are therefore not treated as tag rows.
        tags = df.loc[(df['shell'].str.isdigit() == False), "shell"]
        idxs = tags.index.tolist()
        idxs.append(len(df))
        df['set'] = ""
        for i, tag in enumerate(tags):
            df.loc[idxs[i]:idxs[i + 1], "set"] = tag
        df = df.dropna().reset_index(drop=True)
        # Tag -> integer set index, in order of first appearance.
        mapper = {tag: i for i, tag in enumerate(df['set'].unique())}
        df['set'] = df['set'].map(mapper)
        df['L'] = df['L'].str.strip().str.lower().map(lmap)
        df['alpha'] = df['alpha'].astype(float)
        df['d'] = df['d'].astype(float)
        # NO SUPPORT FOR MULTIPLE FRAMES?
        df['frame'] = 0
        self.basis_set = BasisSet(df)
        self.meta['spherical'] = spherical
        self.atom['set'] = self.atom['tag'].map(mapper)

    def parse_basis_set_order(self):
        """
        Build the basis set order table and store it as
        ``self.basis_set_order``.

        One row is generated per basis function: for every atom, every shell
        of its basis set and every component returned by the spherical or
        cartesian ordering function.
        """
        dtype = [('center', 'i8'), ('shell', 'i8'), ('L', 'i8')]
        if 'spherical' not in self.meta:
            self.parse_basis_set()
        if self.meta['spherical']:
            dtype += [('ml', 'i8')]
        else:
            dtype += [('l', 'i8'), ('m', 'i8'), ('n', 'i8')]
        mapper = self.basis_set.functions(
            self.meta['spherical']).groupby(level="set").sum()
        nbas = self.atom['set'].map(mapper).sum()
        bso = np.empty((nbas,), dtype=dtype)
        cnt = 0
        bases = self.basis_set.groupby('set')
        for seht, center in zip(self.atom['set'], self.atom.index):
            bas = bases.get_group(seht).groupby('shell')
            if self.meta['spherical']:
                for shell, grp in bas:
                    l = grp['L'].values[0]
                    for ml in spherical_ordering_function(l):
                        bso[cnt] = (center, shell, l, ml)
                        cnt += 1
            else:
                for shell, grp in bas:
                    l = grp['L'].values[0]
                    for _, ll, m, n in cartesian_ordering_function(l):
                        bso[cnt] = (center, shell, l, ll, m, n)
                        cnt += 1
        bso = pd.DataFrame(bso)
        bso['frame'] = 0
        # New shell definition consistent with basis internals
        # NOTE(review): this step reads the 'ml' column, which is only
        # created in the spherical case above - confirm cartesian handling.
        shls = []
        grps = bso.groupby(['center', 'L'])
        cache = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
        for (cen, L), grp in grps:
            for ml in grp['ml']:
                shls.append(cache[cen][L][ml])
                cache[cen][L][ml] += 1
        bso['shell'] = shls
        self.basis_set_order = bso

    def parse_frame(self):
        """
        Create a minimal :class:`~exatomic.core.frame.Frame` from the (parsed)
        :class:`~exatomic.core.atom.Atom` object.

        A 'total_energy' column is added from the 'Total DFT energy' lines
        when present, otherwise from the 'Total SCF energy' lines. When both
        are present a warning is printed and the DFT values are used.
        """
        _rescfen = 'Total SCF energy'
        _redften = 'Total DFT energy'
        self.frame = compute_frame_from_atom(self.atom)
        found = self.find(_rescfen, _redften)
        scfs = found[_rescfen]
        dfts = found[_redften]
        if scfs and dfts:
            print('Warning: found total energies from scf and dft, using dft')
        # DFT energies take precedence; fall back to SCF energies.
        chosen = dfts or scfs
        if chosen:
            self.frame['total_energy'] = [float(val.split()[-1])
                                          for _, val in chosen]

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to the parent editor."""
        super(Output, self).__init__(*args, **kwargs)
class Output(six.with_metaclass(OutMeta, Editor)):
    """The ADF output parser."""

    def parse_atom(self):
        """
        Parse the atom table and store it as ``self.atom``.

        The first occurrence of the fragment coordinate header is used and
        the x, y, z columns are multiplied by ``Length['Angstrom', 'au']``.
        """
        # TODO : only supports single frame; note that index [0] below
        #        selects the first match of the header, not the last one
        _re_atom_00 = 'Atoms in this Fragment Cart. coord.s (Angstrom)'
        start = stop = self.find(_re_atom_00, keys_only=True)[0] + 2
        # The table ends at the first blank line.
        while self[stop].strip():
            stop += 1
        atom = self.pandas_dataframe(start, stop, 7)
        # Keep only columns 1, 4, 5 and 6 of the seven that were read.
        atom.drop([0, 2, 3], axis=1, inplace=True)
        atom.columns = ['symbol', 'x', 'y', 'z']
        for c in ['x', 'y', 'z']:
            atom[c] *= Length['Angstrom', 'au']
        atom['Z'] = atom['symbol'].map(sym2z)
        atom['frame'] = 0
        self.atom = atom

    def parse_basis_set(self):
        """
        Parse the Slater-type basis functions and store them as
        ``self.basis_set``.

        Also sets ``self.meta['spherical']`` to False and adds a 'set' column
        to ``self.atom`` mapping each symbol to its basis set index.
        """
        # Find the basis set
        _re_bas_00 = '(Slater-type) F U N C T I O N S'
        _re_bas_01 = 'Atom Type'
        start = self.find(_re_bas_00, keys_only=True)[-1] + 3
        starts = self.find(_re_bas_01, start=start, keys_only=True)
        # Collect the line numbers to read: four lines from each
        # 'Atom Type' match, then every following line up to a blank one.
        # The matches in ``starts`` are added to ``start``, i.e. they are
        # used as offsets relative to it.
        lines = []
        for ext in starts:
            for i in range(4):
                lines.append(start + ext + i)
            stop = start + ext + 4
            while self[stop].strip():
                lines.append(stop)
                stop += 1
        df = pd.read_fwf(StringIO('\n'.join([self[i] for i in lines])),
                         widths=[4, 2, 12, 4],
                         names=['n', 'L', 'alpha', 'symbol'])
        # Where atom types change
        idxs = [0] + df['n'][df['n'] == '---'].index.tolist() + [df.shape[0]]
        sets, shells = [], []
        for i, (start, stop) in enumerate(zip(idxs, idxs[1:])):
            # Rows before the first '---' get set -1; within each chunk the
            # shell counter starts at -1 (on the '---' row itself).
            sets.append(np.repeat(i - 1, stop - start))
            shells.append(np.arange(-1, stop - start - 1))
        df['set'] = np.concatenate(sets)
        df['shell'] = np.concatenate(shells)
        # Atom table basis set map
        basmap = df['symbol'].dropna()
        basmap = basmap[basmap.str.endswith(')')].str.strip(')')
        basmap = {
            val: df['set'][key] + 1
            for key, val in basmap.to_dict().items()
        }
        # Discard the garbage: keep only rows whose 'n' field is numeric
        drop = df['n'].str.strip().str.isnumeric().fillna(False)
        df.drop(drop[drop == False].index, inplace=True)
        df.drop('symbol', axis=1, inplace=True)
        # Clean up the series
        df['alpha'] = df['alpha'].astype(np.float64)
        df['n'] = df['n'].astype(np.int64)
        df['L'] = df['L'].str.lower().map(lmap)
        df['d'] = np.sqrt((2 * df['L'] + 1) / (4 * np.pi))
        df['r'] = df['n'] - (df['L'] + 1)
        df['frame'] = 0
        self.basis_set = BasisSet(df)
        self.meta['spherical'] = False
        self.atom['set'] = self.atom['symbol'].map(basmap)

    def parse_basis_set_order(self):
        """
        Build the basis set order table and store it as
        ``self.basis_set_order``.

        One row is produced per atom, per L group of its basis set, per
        (l, m, n) entry of ``enum_cartesian[L]`` and per shell in the group.
        A 'prefac' column is computed from ``dfac21`` of L, l, m and n.
        """
        # All the columns we need
        data = defaultdict(list)
        sets = self.basis_set.groupby('set')
        # Iterate over atoms
        for center, symbol, seht in zip(self.atom.index, self.atom['symbol'],
                                        self.atom['set']):
            # Per basis set
            bas = sets.get_group(seht).groupby('L')
            for L, grp in bas:
                # Iterate over cartesians
                for l, m, n in enum_cartesian[L]:
                    for shell, r in zip(grp['shell'], grp['r']):
                        data['center'].append(center)
                        data['symbol'].append(symbol)
                        data['shell'].append(shell)
                        data['seht'].append(seht)
                        data['L'].append(L)
                        data['l'].append(l)
                        data['m'].append(m)
                        data['n'].append(n)
                        data['r'].append(r)
        # Rename the temporary 'seht' key to the final column name 'set'.
        data['set'] = data.pop('seht')
        data['frame'] = 0
        self.basis_set_order = pd.DataFrame.from_dict(data)
        # prefac = sqrt(dfac21(L) / (dfac21(l) * dfac21(m) * dfac21(n)))
        self.basis_set_order['prefac'] = (
            self.basis_set_order['L'].apply(dfac21) /
            (self.basis_set_order['l'].apply(dfac21) *
             self.basis_set_order['m'].apply(dfac21) *
             self.basis_set_order['n'].apply(dfac21))).apply(np.sqrt)

    def parse_orbital(self):
        """
        Parse the orbital energy table and store it as ``self.orbital``.

        The 'both Spins' table is used when present, otherwise the
        'per Irrep and Spin' table; in both cases the last occurrence of the
        header is read.
        """
        _re_orb_00 = 'Orbital Energies, both Spins'
        _re_orb_01 = 'Orbital Energies, per Irrep and Spin'
        found = self.find(_re_orb_00, _re_orb_01, keys_only=True)
        # Open shell vs. closed shell
        cols = {
            _re_orb_00:
            ['symmetry', 'vector', 'spin', 'occupation', 'energy', 'eV'],
            _re_orb_01: ['vector', 'occupation', 'energy', 'eV', 'dE']
        }
        key = _re_orb_00 if found[_re_orb_00] else _re_orb_01
        start = stop = found[key][-1] + 5
        # The table ends at the first blank line.
        while self[stop].strip():
            stop += 1
        df = self.pandas_dataframe(start, stop, cols[key])
        # Shift the vector index down by one (zero-based).
        df['vector'] -= 1
        if 'spin' in cols[key]:
            df['spin'] = df.spin.map({'A': 0, 'B': 1})
            df.sort_values(by=['spin', 'energy'], inplace=True)
        else:
            df.sort_values(by='energy', inplace=True)
            df['spin'] = 0
        df.reset_index(drop=True, inplace=True)
        df['frame'] = df['group'] = 0
        self.orbital = df

    def parse_contribution(self):
        """
        Parse the MO contribution tables and store the concatenated result
        as ``self.contribution``.

        Each table found gets a 'spin' column equal to its position among
        the matches.
        """
        _re_con_00 = ('E(eV) Occ MO % '
                      'SFO (first member) E(eV) Occ Fragment')
        # MO contribution by percentage
        found = self.find(_re_con_00, keys_only=True)
        starts = [i + 3 for i in found]
        widths = [12, 6, 6, 6, 11, 6, 10, 12, 6, 6, 3]
        names = [
            'eV', 'occupation', 'vector', 'sym', '%', 'SFO', 'angmom',
            'eV(sfo)', 'occ(sfo)', 'atom', 'symbol'
        ]
        dfs = []
        # Prints for both spins
        for i, start in enumerate(starts):
            stop = start
            while self[stop].strip():
                stop += 1
            # NOTE(review): delim_whitespace is passed together with widths;
            # the clean-up below tests for whitespace-only cells and strips
            # values, so cells presumably keep their padding - confirm
            # before changing these arguments.
            dfs.append(
                pd.read_fwf(StringIO('\n'.join(self[start:stop])),
                            delim_whitespace=True,
                            widths=widths,
                            names=names))
            dfs[-1]['spin'] = i
        dfs = pd.concat(dfs).reset_index(drop=True)
        # Whitespace-only cells become NaN so they can be forward filled.
        dfs = dfs.applymap(lambda x: np.nan if (isinstance(
            x, six.string_types) and x.isspace()) else x)
        dfs.fillna(method='ffill', inplace=True)
        # Clean up
        dfs['symbol'] = dfs['symbol'].str.strip()
        dfs['angmom'] = dfs['angmom'].str.strip()
        # A bare 'S' gets a trailing ':' so the L:ml split below matches it.
        dfs['angmom'].update(dfs['angmom'].map({'S': 'S:'}))
        dfs[['L', 'ml']] = dfs['angmom'].str.extract('(.*):(.*)', expand=True)
        dfs['%'] = dfs['%'].str.replace('%', '')
        # Overflowed percentage fields are replaced by infinity.
        dfs['%'].update(dfs['%'].map({" ******": np.inf}))
        dfs['%'] = dfs['%'].astype(np.float64)
        dfs['occupation'] = dfs['occupation'].astype(np.float64)
        dfs['vector'] = dfs['vector'].astype(np.int64) - 1
        dfs['eV'] = dfs['eV'].astype(np.float64)
        dfs['atom'] -= 1
        self.contribution = dfs

    def parse_excitation(self):
        """
        Parse the excitation tables and store the result as
        ``self.excitation``.

        Returns without setting anything when the transition dipole moment
        header is not found.
        """
        # Excitation
        _re_exc_00 = '(sum=1) transition dipole moment'
        _re_exc_01 = ' no. E/a.u. E/eV f Symmetry'
        found = self.find_next(_re_exc_00, keys_only=True)
        if not found:
            return
        # First table of interest here
        start = found + 4
        stop = self.find_next(_re_exc_01, keys_only=True) - 3
        # Nine whitespace-separated fields on the first row are taken to
        # indicate the layout with an extra spin column.
        os = len(self[start].split()) == 9
        todrop = ['occ:', 'virt:']
        cols = [
            'excitation', 'occ', 'drop', 'virt', 'weight', 'TDMx', 'TDMy',
            'TDMz'
        ]
        if os:
            cols.insert(1, 'spin')
        if os:
            todrop = ['occ', 'virt']
        adf = self.pandas_dataframe(start, stop, cols)
        adf.drop('drop', axis=1, inplace=True)
        # Rows to discard: 'NTO' in the second column or a label listed in
        # ``todrop`` in the first column.
        s1 = set(adf[cols[1]][adf[cols[1]] == 'NTO'].index)
        s2 = set(adf['excitation'][adf['excitation'].isin(todrop)].index)
        adf.drop(s1 | s2, axis=0, inplace=True)
        # Strip the last character of the label and shift to zero-based.
        adf['excitation'] = adf['excitation'].str[:-1].astype(np.int64) - 1
        if os:
            adf['spin'] = adf['spin'].map({'Alph': 0, 'Beta': 1})
        # Split e.g. leading digits from the trailing symmetry label.
        adf[['occ', 'occsym']] = adf['occ'].str.extract('([0-9]*)(.*)',
                                                        expand=True)
        adf[['virt', 'virtsym']] = adf['virt'].str.extract('([0-9]*)(.*)',
                                                           expand=True)
        adf['occ'] = adf['occ'].astype(np.int64) - 1
        adf['virt'] = adf['virt'].astype(np.int64) - 1
        # Second one here
        start = stop + 5
        stop = start
        while self[stop].strip():
            stop += 1
        cols = _re_exc_01.split()
        df = self.pandas_dataframe(start, stop + 1, cols)
        df.drop(cols[0], axis=1, inplace=True)
        df.columns = ['energy', 'eV', 'osc', 'symmetry']
        # Expand the second table to fit the original
        for col in df.columns:
            adf[col] = adf.excitation.map(df[col])
        adf['frame'] = adf['group'] = 0
        self.excitation = adf

    def parse_momatrix(self):
        """
        Parse the eigenvectors printed in the BAS representation and store
        them as ``self.momatrix``.

        Returns without setting anything when the eigenvector header or the
        'row ' lines are missing. When 'nosym' is not found a message is
        printed instead of parsing.
        """
        _re_mo_00 = 'Eigenvectors .* in BAS representation'
        _re_mo_01 = 'row '
        _re_mo_02 = 'nosym'
        found = self.regex(_re_mo_00,
                           _re_mo_01,
                           _re_mo_02,
                           flags=re.IGNORECASE,
                           keys_only=True)
        if not found[_re_mo_00] or not found[_re_mo_01]:
            return
        if found[_re_mo_02]:
            thresh = found[_re_mo_00][0]
            rowmajor = 'rows' in self[thresh]
            # Only 'row ' lines after the eigenvector header are relevant.
            starts = np.array([i for i in found[_re_mo_01] if i > thresh]) + 1
            nchi = starts[1] - starts[0] - 3
            ncol = len(self[starts[0] + 1].split()) - 1
            if len(starts) % 2:
                os = False
            else:
                # With an even number of blocks, look for 'SPIN 2' between
                # the two halves to decide whether two spin sets are printed.
                anchor = starts[len(starts) // 2 - 1] + nchi
                sail = starts[len(starts) // 2]
                os = True if self.find('SPIN 2', start=anchor,
                                       stop=sail) else False
            blocks = [starts] if not os else [
                starts[:len(starts) // 2], starts[len(starts) // 2:]
            ]
            data = pd.DataFrame()
            for i, block in enumerate(blocks):
                stop = block[-1] + nchi
                # Three lines are skipped in front of every block after the
                # first one.
                skips = [
                    k + j for k in list(block[1:] - block[0] - 3)
                    for j in range(3)
                ]
                name = 'coef' if not i else 'coef{}'.format(i)
                col = self.pandas_dataframe(
                    block[0], stop, ncol + 1, skiprows=skips).drop(
                        0, axis=1,
                    ).unstack().dropna().reset_index(drop=True)
                data[name] = col
            norb = len(data.index) // nchi
            data['orbital'] = np.concatenate(
                [np.repeat(range(i, norb, ncol), nchi) for i in range(ncol)])
            data['chi'] = np.tile(range(nchi), norb)
            data['frame'] = 0
            if rowmajor:
                # Swap the roles of the two index columns and re-sort.
                data.rename(columns={
                    'orbital': 'chi',
                    'chi': 'orbital'
                },
                            inplace=True)
                data.sort_values(by=['orbital', 'chi'], inplace=True)
            self.momatrix = data
        else:
            print('Symmetrized calcs not supported yet.')

    def parse_sphr_momatrix(self, verbose=False):
        """
        Parse the localized momatrix (if present) and store it as
        ``self.sphr_momatrix``.

        If the ``locorb`` keyword is used in ADF, an additional momatrix is
        printed after localization is performed. Parsing this table allows
        for visualization of these orbitals.

        Args:
            verbose (bool): print a message when no localized section exists

        Note:
            The local tuple ``_re_loc_mo`` holds the two header strings used
            for locating this section.
        """
        _re_loc_mo = ("Localized MOs expanded in CFs+SFOs",
                      "SFO contributions (%) per Localized Orbital")
        found = self.find(*_re_loc_mo)
        if len(found[_re_loc_mo[0]]) == 0:
            if verbose:
                print("No localization performed.")
            return  # Nothing to parse
        start = found[_re_loc_mo[0]][0][0] + 8
        stop = found[_re_loc_mo[1]][0][0] - 4
        # Parse the localized momatrix as a whole block of text
        df = pd.read_fwf(StringIO("\n".join(self[start:stop])),
                         widths=(16, 9, 9, 9, 9, 9, 9, 9, 9),
                         header=None)
        del df[0]
        # Identify the eigenvectors and (un)stack them correctly
        n = df[df[1].isnull()].index[0]  # number of basis functions
        m = np.ceil(df.shape[0] / n).astype(
            int)  # number of printed blocks of text
        # idx - indexes of "lines" (rows) that don't contain coefficients
        idx = [(n + 5) * j + i - 5 for j in range(1, m) for i in range(0, 5)]
        df = df[~df.index.isin(idx)]
        coefs = []
        # For i == 0 the slice below is iloc[-n:0], which is empty, so the
        # first pass contributes nothing.
        for i in range(0, df.shape[0] // n + 1):
            d = df.iloc[n * (i - 1):n * i, :]
            coefs.append(d.unstack().dropna().values.astype(float))
        coefs = np.concatenate(coefs)
        m = coefs.shape[0] // n  # Number of localized MOs
        momatrix = pd.DataFrame.from_dict({
            'coef':
            coefs,
            'orbital': [i for i in range(m) for _ in range(n)],
            'chi': [j for _ in range(m) for j in range(n)]
        })
        # Attach the table to the last frame value present in self.atom.
        momatrix['frame'] = self.atom['frame'].unique()[-1]
        self.sphr_momatrix = momatrix

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to the parent editor."""
        super(Output, self).__init__(*args, **kwargs)
class Output(six.with_metaclass(OutMeta, Editor)):
    """The ADF output parser."""

    def parse_atom(self):
        """
        Parse the atom table and store it as ``self.atom``.

        Three layouts are tried in order:

        1. 'Coordinates (Cartesian)' blocks: one frame per block; the
           coordinates are used as read (no unit scaling).
        2. The last 'ATOMS' table: a single frame, coordinates multiplied by
           ``Length['Angstrom', 'au']``.
        3. 'NUCLEAR COORDINATES' tables (case-insensitive): one frame per
           table, coordinates multiplied by ``Length['Angstrom', 'au']``.

        Raises:
            NotImplementedError: when none of the tables can be found
        """
        # TODO : the 'ATOMS' branch only supports a single frame
        _re_atom_00 = 'ATOMS'
        found1 = self.find(_re_atom_00, keys_only=True)
        # use the regex instead of find because we have a similar search
        # string in an nmr and cpl calculation for the nuclear coordinates
        _reatom = "(?i)NUCLEAR COORDINATES"
        found2 = self.regex(_reatom, keys_only=True)
        # to find the optimized frames
        _reopt = "Coordinates (Cartesian)"
        found_opt = self.find(_reopt, keys_only=True)
        if found_opt:
            starts = np.array(found_opt) + 6
            # The length of the first table (up to the dashed line) is
            # applied to every frame.
            stop = starts[0]
            while '------' not in self[stop]:
                stop += 1
            stops = starts + stop - starts[0]
            dfs = []
            for idx, (start, stop) in enumerate(zip(starts, stops)):
                # parse everything as they may be useful in the future
                df = self.pandas_dataframe(start, stop, ncol=11)
                # drop everything except the first five columns
                df.drop(list(range(5, 11)), axis='columns', inplace=True)
                # we read the coordinates in bohr so no need to convert
                df.columns = ['set', 'symbol', 'x', 'y', 'z']
                df['set'] = df['set'].astype(int)
                df['Z'] = df['symbol'].map(sym2z)
                df['frame'] = idx
                df['set'] -= 1
                dfs.append(df)
            atom = pd.concat(dfs, ignore_index=True)
        elif found1:
            start = stop = found1[-1] + 4
            while self[stop].strip():
                stop += 1
            atom = self.pandas_dataframe(start, stop, ncol=8)
            atom.drop(list(range(5, 8)), axis='columns', inplace=True)
            atom.columns = ['set', 'symbol', 'x', 'y', 'z']
            for c in ['x', 'y', 'z']:
                atom[c] *= Length['Angstrom', 'au']
            atom['Z'] = atom['symbol'].map(sym2z)
            atom['set'] -= 1
            atom['frame'] = 0
        elif found2:
            atom = []
            for idx, val in enumerate(found2):
                start = val + 3
                stop = start
                while self[stop].strip():
                    stop += 1
                # a bit of a hack to make sure that there is no formatting
                # change depending on the number of atoms in the molecule:
                # the index is right justified, so with more than 100 atoms
                # it fills the allotted space for the atom index and changes
                # the delimiter and therefore the number of columns.
                # NOTE: this rewrites the editor lines in place.
                self[start:stop] = map(lambda x: x.replace('(', ''),
                                       self[start:stop])
                df = self.pandas_dataframe(start, stop, ncol=5)
                df.columns = ['symbol', 'set', 'x', 'y', 'z']
                for c in ['x', 'y', 'z']:
                    df[c] *= Length['Angstrom', 'au']
                df['Z'] = df['symbol'].map(sym2z)
                df['frame'] = idx
                # remove the trailing characters from the index
                df['set'] = list(map(lambda x: x.replace('):', ''),
                                     df['set']))
                df['set'] = df['set'].astype(int) - 1
                atom.append(df)
            atom = pd.concat(atom)
        else:
            raise NotImplementedError(
                "We could not find the atom table in this output. Please submit "
                "an issue ticket so we can add it in.")
        self.atom = atom

    def parse_basis_set(self):
        """
        Parse the Slater-type basis functions and store them as
        ``self.basis_set``.

        Also sets ``self.meta['spherical']`` to False and overwrites the
        'set' column of ``self.atom`` with the basis set index of each
        symbol.
        """
        # Find the basis set
        _re_bas_00 = '(Slater-type) F U N C T I O N S'
        _re_bas_01 = 'Atom Type'
        start = self.find(_re_bas_00, keys_only=True)[-1] + 3
        starts = self.find(_re_bas_01, start=start, keys_only=True)
        # Collect the line numbers to read: four lines from each
        # 'Atom Type' match, then every following line up to a blank one.
        # The matches in ``starts`` are added to ``start``, i.e. they are
        # used as offsets relative to it.
        lines = []
        for ext in starts:
            for i in range(4):
                lines.append(start + ext + i)
            stop = start + ext + 4
            while self[stop].strip():
                lines.append(stop)
                stop += 1
        df = pd.read_fwf(StringIO('\n'.join([self[i] for i in lines])),
                         widths=[4, 2, 12, 4],
                         names=['n', 'L', 'alpha', 'symbol'])
        # Where atom types change
        idxs = [0] + df['n'][df['n'] == '---'].index.tolist() + [df.shape[0]]
        sets, shells = [], []
        for i, (start, stop) in enumerate(zip(idxs, idxs[1:])):
            # Rows before the first '---' get set -1; within each chunk the
            # shell counter starts at -1 (on the '---' row itself).
            sets.append(np.repeat(i - 1, stop - start))
            shells.append(np.arange(-1, stop - start - 1))
        df['set'] = np.concatenate(sets)
        df['shell'] = np.concatenate(shells)
        # Atom table basis set map
        basmap = df['symbol'].dropna()
        basmap = basmap[basmap.str.endswith(')')].str.strip(')')
        basmap = {
            val: df['set'][key] + 1
            for key, val in basmap.to_dict().items()
        }
        # Discard the garbage: keep only rows whose 'n' field is numeric
        drop = df['n'].str.strip().str.isnumeric().fillna(False)
        df.drop(drop[drop == False].index, inplace=True)
        df.drop('symbol', axis=1, inplace=True)
        # Clean up the series
        df['alpha'] = df['alpha'].astype(np.float64)
        df['n'] = df['n'].astype(np.int64)
        df['L'] = df['L'].str.lower().map(lmap)
        df['d'] = np.sqrt((2 * df['L'] + 1) / (4 * np.pi))
        df['r'] = df['n'] - (df['L'] + 1)
        df['frame'] = 0
        self.basis_set = BasisSet(df)
        self.meta['spherical'] = False
        self.atom['set'] = self.atom['symbol'].map(basmap)

    def parse_basis_set_order(self):
        """
        Build the basis set order table and store it as
        ``self.basis_set_order``.

        One row is produced per atom, per L group of its basis set, per
        (l, m, n) entry of ``enum_cartesian[L]`` and per shell in the group.
        A 'prefac' column is computed from ``dfac21`` of L, l, m and n.
        """
        # All the columns we need
        data = defaultdict(list)
        sets = self.basis_set.groupby('set')
        # Iterate over atoms
        for center, symbol, seht in zip(self.atom.index, self.atom['symbol'],
                                        self.atom['set']):
            # Per basis set
            bas = sets.get_group(seht).groupby('L')
            for L, grp in bas:
                # Iterate over cartesians
                for l, m, n in enum_cartesian[L]:
                    for shell, r in zip(grp['shell'], grp['r']):
                        data['center'].append(center)
                        data['symbol'].append(symbol)
                        data['shell'].append(shell)
                        data['seht'].append(seht)
                        data['L'].append(L)
                        data['l'].append(l)
                        data['m'].append(m)
                        data['n'].append(n)
                        data['r'].append(r)
        # Rename the temporary 'seht' key to the final column name 'set'.
        data['set'] = data.pop('seht')
        data['frame'] = 0
        self.basis_set_order = pd.DataFrame.from_dict(data)
        # prefac = sqrt(dfac21(L) / (dfac21(l) * dfac21(m) * dfac21(n)))
        self.basis_set_order['prefac'] = (
            self.basis_set_order['L'].apply(dfac21) /
            (self.basis_set_order['l'].apply(dfac21) *
             self.basis_set_order['m'].apply(dfac21) *
             self.basis_set_order['n'].apply(dfac21))).apply(np.sqrt)

    def parse_orbital(self):
        """
        Parse the orbital energy table and store it as ``self.orbital``.

        The 'both Spins' table is used when present, otherwise the
        'per Irrep and Spin' table; in both cases the last occurrence of the
        header is read.
        """
        _re_orb_00 = 'Orbital Energies, both Spins'
        _re_orb_01 = 'Orbital Energies, per Irrep and Spin'
        found = self.find(_re_orb_00, _re_orb_01, keys_only=True)
        # Open shell vs. closed shell
        cols = {
            _re_orb_00:
            ['symmetry', 'vector', 'spin', 'occupation', 'energy', 'eV'],
            _re_orb_01: ['vector', 'occupation', 'energy', 'eV', 'dE']
        }
        key = _re_orb_00 if found[_re_orb_00] else _re_orb_01
        start = stop = found[key][-1] + 5
        # The table ends at the first blank line.
        while self[stop].strip():
            stop += 1
        df = self.pandas_dataframe(start, stop, cols[key])
        # Shift the vector index down by one (zero-based).
        df['vector'] -= 1
        if 'spin' in cols[key]:
            df['spin'] = df.spin.map({'A': 0, 'B': 1})
            df.sort_values(by=['spin', 'energy'], inplace=True)
        else:
            df.sort_values(by='energy', inplace=True)
            df['spin'] = 0
        df.reset_index(drop=True, inplace=True)
        df['frame'] = df['group'] = 0
        self.orbital = df

    def parse_contribution(self):
        """
        Parse the MO contribution tables and store the concatenated result
        as ``self.contribution``.

        Each table found gets a 'spin' column equal to its position among
        the matches.
        """
        _re_con_00 = ('E(eV) Occ MO % '
                      'SFO (first member) E(eV) Occ Fragment')
        # MO contribution by percentage
        found = self.find(_re_con_00, keys_only=True)
        starts = [i + 3 for i in found]
        widths = [12, 6, 6, 6, 11, 6, 10, 12, 6, 6, 3]
        names = [
            'eV', 'occupation', 'vector', 'sym', '%', 'SFO', 'angmom',
            'eV(sfo)', 'occ(sfo)', 'atom', 'symbol'
        ]
        dfs = []
        # Prints for both spins
        for i, start in enumerate(starts):
            stop = start
            while self[stop].strip():
                stop += 1
            # NOTE(review): delim_whitespace is passed together with widths;
            # the clean-up below tests for whitespace-only cells and strips
            # values, so cells presumably keep their padding - confirm
            # before changing these arguments.
            dfs.append(
                pd.read_fwf(StringIO('\n'.join(self[start:stop])),
                            delim_whitespace=True,
                            widths=widths,
                            names=names))
            dfs[-1]['spin'] = i
        dfs = pd.concat(dfs).reset_index(drop=True)
        # Whitespace-only cells become NaN so they can be forward filled.
        dfs = dfs.applymap(lambda x: np.nan if (isinstance(
            x, six.string_types) and x.isspace()) else x)
        dfs.fillna(method='ffill', inplace=True)
        # Clean up
        dfs['symbol'] = dfs['symbol'].str.strip()
        dfs['angmom'] = dfs['angmom'].str.strip()
        # A bare 'S' gets a trailing ':' so the L:ml split below matches it.
        dfs['angmom'].update(dfs['angmom'].map({'S': 'S:'}))
        dfs[['L', 'ml']] = dfs['angmom'].str.extract('(.*):(.*)', expand=True)
        dfs['%'] = dfs['%'].str.replace('%', '')
        # Overflowed percentage fields are replaced by infinity.
        dfs['%'].update(dfs['%'].map({" ******": np.inf}))
        dfs['%'] = dfs['%'].astype(np.float64)
        dfs['occupation'] = dfs['occupation'].astype(np.float64)
        dfs['vector'] = dfs['vector'].astype(np.int64) - 1
        dfs['eV'] = dfs['eV'].astype(np.float64)
        dfs['atom'] -= 1
        self.contribution = dfs

    def parse_excitation(self):
        """
        Parse the excitation tables and store the result as
        ``self.excitation``.

        Returns without setting anything when the transition dipole moment
        header is not found.
        """
        # Excitation
        _re_exc_00 = '(sum=1) transition dipole moment'
        _re_exc_01 = ' no. E/a.u. E/eV f Symmetry'
        found = self.find_next(_re_exc_00, keys_only=True)
        if not found:
            return
        # First table of interest here
        start = found + 4
        stop = self.find_next(_re_exc_01, keys_only=True) - 3
        # Nine whitespace-separated fields on the first row are taken to
        # indicate the layout with an extra spin column. (The flag was
        # previously called ``os``, shadowing the stdlib module name.)
        open_shell = len(self[start].split()) == 9
        todrop = ['occ:', 'virt:']
        cols = [
            'excitation', 'occ', 'drop', 'virt', 'weight', 'TDMx', 'TDMy',
            'TDMz'
        ]
        if open_shell:
            cols.insert(1, 'spin')
            todrop = ['occ', 'virt']
        adf = self.pandas_dataframe(start, stop, cols)
        adf.drop('drop', axis=1, inplace=True)
        # Rows to discard: 'NTO' in the second column or a label listed in
        # ``todrop`` in the first column.
        s1 = set(adf[cols[1]][adf[cols[1]] == 'NTO'].index)
        s2 = set(adf['excitation'][adf['excitation'].isin(todrop)].index)
        adf.drop(s1 | s2, axis=0, inplace=True)
        # Strip the last character of the label and shift to zero-based.
        adf['excitation'] = adf['excitation'].str[:-1].astype(np.int64) - 1
        if open_shell:
            adf['spin'] = adf['spin'].map({'Alph': 0, 'Beta': 1})
        # Split leading digits from the trailing symmetry label.
        adf[['occ', 'occsym']] = adf['occ'].str.extract('([0-9]*)(.*)',
                                                        expand=True)
        adf[['virt', 'virtsym']] = adf['virt'].str.extract('([0-9]*)(.*)',
                                                           expand=True)
        adf['occ'] = adf['occ'].astype(np.int64) - 1
        adf['virt'] = adf['virt'].astype(np.int64) - 1
        # Second one here
        start = stop + 5
        stop = start
        while self[stop].strip():
            stop += 1
        cols = _re_exc_01.split()
        df = self.pandas_dataframe(start, stop + 1, cols)
        df.drop(cols[0], axis=1, inplace=True)
        df.columns = ['energy', 'eV', 'osc', 'symmetry']
        # Expand the second table to fit the original
        for col in df.columns:
            adf[col] = adf.excitation.map(df[col])
        adf['frame'] = adf['group'] = 0
        self.excitation = adf

    def parse_momatrix(self):
        """
        Parse the eigenvectors printed in the BAS representation and store
        them as ``self.momatrix``.

        Returns without setting anything when the eigenvector header or the
        'row ' lines are missing. When 'nosym' is not found a message is
        printed instead of parsing.
        """
        _re_mo_00 = 'Eigenvectors .* in BAS representation'
        _re_mo_01 = 'row '
        _re_mo_02 = 'nosym'
        found = self.regex(_re_mo_00,
                           _re_mo_01,
                           _re_mo_02,
                           flags=re.IGNORECASE,
                           keys_only=True)
        if not found[_re_mo_00] or not found[_re_mo_01]:
            return
        if found[_re_mo_02]:
            thresh = found[_re_mo_00][0]
            rowmajor = 'rows' in self[thresh]
            # Only 'row ' lines after the eigenvector header are relevant.
            starts = np.array([i for i in found[_re_mo_01] if i > thresh]) + 1
            nchi = starts[1] - starts[0] - 3
            ncol = len(self[starts[0] + 1].split()) - 1
            half = len(starts) // 2
            if len(starts) % 2:
                open_shell = False
            else:
                # With an even number of blocks, look for 'SPIN 2' between
                # the two halves to decide whether two spin sets are printed.
                anchor = starts[half - 1] + nchi
                sail = starts[half]
                open_shell = bool(
                    self.find('SPIN 2', start=anchor, stop=sail))
            blocks = [starts] if not open_shell else [
                starts[:half], starts[half:]
            ]
            data = pd.DataFrame()
            for i, block in enumerate(blocks):
                stop = block[-1] + nchi
                # Three lines are skipped in front of every block after the
                # first one.
                skips = [
                    k + j for k in list(block[1:] - block[0] - 3)
                    for j in range(3)
                ]
                name = 'coef' if not i else 'coef{}'.format(i)
                col = self.pandas_dataframe(
                    block[0], stop, ncol + 1, skiprows=skips).drop(
                        0, axis=1,
                    ).unstack().dropna().reset_index(drop=True)
                data[name] = col
            norb = len(data.index) // nchi
            data['orbital'] = np.concatenate(
                [np.repeat(range(i, norb, ncol), nchi) for i in range(ncol)])
            data['chi'] = np.tile(range(nchi), norb)
            data['frame'] = 0
            if rowmajor:
                # Swap the roles of the two index columns and re-sort.
                data.rename(columns={
                    'orbital': 'chi',
                    'chi': 'orbital'
                },
                            inplace=True)
                data.sort_values(by=['orbital', 'chi'], inplace=True)
            self.momatrix = data
        else:
            print('Symmetrized calcs not supported yet.')

    def parse_sphr_momatrix(self, verbose=False):
        """
        Parse the localized momatrix (if present) and store it as
        ``self.sphr_momatrix``.

        If the ``locorb`` keyword is used in ADF, an additional momatrix is
        printed after localization is performed. Parsing this table allows
        for visualization of these orbitals.

        Args:
            verbose (bool): print a message when no localized section exists

        Note:
            The local tuple ``_re_loc_mo`` holds the two header strings used
            for locating this section.
        """
        _re_loc_mo = ("Localized MOs expanded in CFs+SFOs",
                      "SFO contributions (%) per Localized Orbital")
        found = self.find(*_re_loc_mo)
        if len(found[_re_loc_mo[0]]) == 0:
            if verbose:
                print("No localization performed.")
            return  # Nothing to parse
        start = found[_re_loc_mo[0]][0][0] + 8
        stop = found[_re_loc_mo[1]][0][0] - 4
        # Parse the localized momatrix as a whole block of text
        df = pd.read_fwf(StringIO("\n".join(self[start:stop])),
                         widths=(16, 9, 9, 9, 9, 9, 9, 9, 9),
                         header=None)
        del df[0]
        # Identify the eigenvectors and (un)stack them correctly
        n = df[df[1].isnull()].index[0]  # number of basis functions
        m = np.ceil(df.shape[0] / n).astype(
            int)  # number of printed blocks of text
        # idx - indexes of "lines" (rows) that don't contain coefficients
        idx = [(n + 5) * j + i - 5 for j in range(1, m) for i in range(0, 5)]
        df = df[~df.index.isin(idx)]
        coefs = []
        # For i == 0 the slice below is iloc[-n:0], which is empty, so the
        # first pass contributes nothing.
        for i in range(0, df.shape[0] // n + 1):
            d = df.iloc[n * (i - 1):n * i, :]
            coefs.append(d.unstack().dropna().values.astype(float))
        coefs = np.concatenate(coefs)
        m = coefs.shape[0] // n  # Number of localized MOs
        momatrix = pd.DataFrame.from_dict({
            'coef':
            coefs,
            'orbital': [i for i in range(m) for _ in range(n)],
            'chi': [j for _ in range(m) for j in range(n)]
        })
        # Attach the table to the last frame value present in self.atom.
        momatrix['frame'] = self.atom['frame'].unique()[-1]
        self.sphr_momatrix = momatrix

    def parse_gradient(self):
        """
        Parse the energy gradient tables and store them as ``self.gradient``.

        One frame is produced per table found; the fx, fy, fz columns are
        multiplied by ``1 / Length['Angstrom', 'au']``. Returns without
        setting anything when no gradient table is present.
        """
        _regrad = "Energy gradients wrt nuclear displacements"
        found = self.find(_regrad, keys_only=True)
        if not found:
            return
        starts = np.array(found) + 6
        # The length of the first table (up to the dashed line) is applied
        # to every frame.
        stop = starts[0]
        while '----' not in self[stop]:
            stop += 1
        stops = starts + (stop - starts[0])
        dfs = []
        for i, (start, stop) in enumerate(zip(starts, stops)):
            df = self.pandas_dataframe(start, stop, ncol=5)
            df.columns = ['atom', 'symbol', 'fx', 'fy', 'fz']
            df['frame'] = i
            df['atom'] -= 1
            dfs.append(df)
        grad = pd.concat(dfs, ignore_index=True)
        grad['Z'] = grad['symbol'].map(sym2z)
        grad = grad[['atom', 'Z', 'fx', 'fy', 'fz', 'symbol', 'frame']]
        for u in ['fx', 'fy', 'fz']:
            grad[u] *= 1. / Length['Angstrom', 'au']
        self.gradient = grad

    def parse_frequency(self):
        """
        Parse the normal modes and store them as ``self.frequency``.

        The frequency list is used to determine how many normal-mode blocks
        (three modes per block, the last one possibly shorter) have to be
        read. Returns without setting anything when no frequency list is
        present.

        Raises:
            NotImplementedError: when more than one frequency list is found
        """
        _renorm = "Vibrations and Normal Modes"
        _refreq = "List of All Frequencies:"
        found = self.find(_refreq, keys_only=True)
        if not found:
            return
        elif len(found) > 1:
            raise NotImplementedError(
                "We cannot parse more than one frequency calculation in a single output"
            )
        found = self.find(_refreq, _renorm, keys_only=True)
        start = found[_refreq][0] + 9
        stop = start
        while self[stop]:
            stop += 1
        df = self.pandas_dataframe(start, stop, ncol=3)
        freqs = df[0].values
        # Integer ceil division: number of printed blocks of up to three
        # modes each (independent of true-division semantics).
        n = int((freqs.shape[0] + 2) // 3)
        start = found[_renorm][0] + 9
        stop = start
        while self[stop]:
            stop += 1
        natoms = stop - start
        dfs = []
        fdx = 0
        for i in range(n):
            if i == 0:
                start = found[_renorm][0] + 9
            else:
                start = stop + 4
            stop = start + natoms
            # The frequencies of this block are printed two lines above it.
            freqs = [float(val) for val in self[start - 2].split()]
            ncol = len(freqs)
            df = self.pandas_dataframe(start, stop, ncol=1 + 3 * ncol)
            # The first column holds '<index>.<symbol>' labels.
            tmp = [label.split('.') for label in df[0]]
            index, symbol = list(map(list, zip(*tmp)))
            # One column list per Cartesian component (x, y, z). There are
            # always exactly three components, whereas ``ncol`` (modes in
            # this block) can be 1 or 2 for the last block; iterating over
            # ``range(ncol)`` made the three-way unpacking below fail there.
            slices = [list(range(1 + comp, 1 + 3 * ncol, 3))
                      for comp in range(3)]
            dx, dy, dz = [df[cols].unstack().values for cols in slices]
            freqdx = np.repeat(list(range(fdx, ncol + fdx)), natoms)
            zs = pd.Series(symbol).map(sym2z)
            freqs = np.repeat(freqs, natoms)
            stacked = pd.DataFrame.from_dict({
                'Z': np.tile(zs, ncol),
                'label': np.tile(index, ncol),
                'dx': dx,
                'dy': dy,
                'dz': dz,
                'frequency': freqs,
                'freqdx': freqdx
            })
            stacked['ir_int'] = 0.0
            stacked['symbol'] = np.tile(symbol, ncol)
            dfs.append(stacked)
            fdx += ncol
        frequency = pd.concat(dfs, ignore_index=True)
        frequency['frame'] = 0
        # TODO: check units of the normal modes
        self.frequency = frequency

    def parse_nmr_shielding(self):
        """
        Parse the total NMR shielding tensors and store them as
        ``self.nmr_shielding``.

        One row is produced per 'N U C L E U S :' section. The 'frame'
        column counts which 'NUCLEAR COORDINATES (ANGSTROMS)' section the
        nucleus belongs to. Returns without setting anything when no nucleus
        section is found.

        Raises:
            IndexError: when the enclosing NMR calculation cannot be
                determined
        """
        _reatom = "N U C L E U S :"
        _reshield = "==== total shielding tensor"
        _renatom = "NUCLEAR COORDINATES (ANGSTROMS)"
        found = self.find(_reatom, keys_only=True)
        if not found:
            return
        # Section boundaries of the individual calculations, closed off by
        # the end of the file.
        ncalc = self.find(_renatom, keys_only=True)
        ncalc.append(len(self))
        ndx = 0
        dfs = []
        for start in found:
            try:
                # Stay in the current calculation while the nucleus lies
                # inside its boundaries, otherwise advance to the next one.
                ndx = ndx if start > ncalc[ndx] and start < ncalc[
                    ndx + 1] else ndx + 1
            except IndexError:
                raise IndexError(
                    "It seems that there was an issue with determining which NMR calculation we are in"
                )
            start_shield = self.find(_reshield, keys_only=True,
                                     start=start)[0] + start + 2
            end_shield = start_shield + 3
            symbol, index = self[start].split()[-1].split('(')
            index = int(index.replace(')', ''))
            isotropic = float(self[start_shield + 4].split()[-1])
            df = self.pandas_dataframe(start_shield, end_shield, ncol=3)
            cols = ['xx', 'xy', 'xz', 'yx', 'yy', 'yz', 'zx', 'zy', 'zz']
            # NOTE(review): unstack() walks the 3x3 table column by column
            # while ``cols`` is listed row by row - confirm this labelling
            # is intended for non-symmetric tensors.
            df = pd.DataFrame(df.unstack().values.reshape(1, 9), columns=cols)
            df['isotropic'] = isotropic
            df['atom'] = index - 1
            df['symbol'] = symbol
            df['label'] = 'nmr shielding'
            df['frame'] = ndx
            dfs.append(df)
        shielding = pd.concat(dfs, ignore_index=True)
        self.nmr_shielding = shielding

    def parse_j_coupling(self):
        """
        Parse the total spin-spin coupling tensors and store them as
        ``self.j_coupling``.

        Returns without setting anything when the CPL atom numbering header
        is not found.
        """
        _recoupl = "total calculated spin-spin coupling:"
        _reatom = "Internal CPL numbering of atoms:"
        found = self.find(_reatom, keys_only=True)
        if not found:
            return
        found = self.find(_reatom, _recoupl, keys_only=True)
        # we grab the tensors inside the principal axis representation
        # for the cartesian axis representation we start the list at 0 and
        # grab every other instance
        start_coupl = found[_recoupl][1::2]
        start_pert = np.array(found[_reatom]) - 3
        dfs = []
        # grab atoms
        cols = ['xx', 'xy', 'xz', 'yx', 'yy', 'yz', 'zx', 'zy', 'zz']
        for ln, start in zip(start_pert, start_coupl):
            line = self[ln].split()
            # we just replace all of the () in the strings
            pert_nucl = list(
                map(lambda x: x.replace('(', '').replace(')', ''), line[5:]))
            nucl = list(
                map(lambda x: x.replace('(', '').replace(')', ''), line[1:3]))
            # grab both tensors
            df = self.pandas_dataframe(start + 2, start + 5, ncol=6)
            # this will grab the iso value and tensor elements for the
            # j coupling in hz (the last three of the six columns)
            df.drop(range(3), axis='columns', inplace=True)
            df = pd.DataFrame(df.unstack().values.reshape(1, 9), columns=cols)
            iso = self[start + 1].split()[-1]
            # place all of the dataframe columns
            df['isotropic'] = float(iso)
            df['atom'] = int(nucl[0])
            df['symbol'] = nucl[1]
            df['pt_atom'] = int(pert_nucl[0])
            df['pt_symbol'] = pert_nucl[1]
            df['label'] = 'j coupling'
            df['frame'] = 0
            dfs.append(df)
        # put everything together
        j_coupling = pd.concat(dfs, ignore_index=True)
        j_coupling['atom'] -= 1
        j_coupling['pt_atom'] -= 1
        self.j_coupling = j_coupling

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to the parent editor."""
        super(Output, self).__init__(*args, **kwargs)
class Output(six.with_metaclass(OutMeta, Editor)):
    """Editor for NWChem calculation output file (stdout)."""

    def parse_atom(self):
        """
        Parse the atom dataframe into ``self.atom``.

        Every geometry block becomes one frame. When more than one frame is
        found, the last one is dropped. Coordinates are scaled by
        ``Length[unit, 'au']`` where ``unit`` is read from the
        "Output coordinates in" line.
        """
        _reatom01 = 'Geometry "'
        _reatom02 = 'Atomic Mass'
        _reatom03 = 'ECP "ecp basis"'
        _reatom04 = 'Output coordinates in'
        found = self.find(_reatom01, _reatom02, _reatom03, _reatom04,
                          keys_only=True)
        unit = self[found[_reatom04][0]].split()[3]
        unit = "Angstrom" if unit == "angstroms" else "au"
        starts = np.array(found[_reatom01]) + 7
        stops = np.array(found[_reatom02]) - 1
        # tag -> integer read from the 4th field of the ECP line
        # (presumably the number of replaced core electrons; it is
        # subtracted from Z below).
        ecps = {}
        for ln in np.array(found[_reatom03]) + 2:
            fields = self[ln].split()
            ecps[fields[0]] = int(fields[3])
        columns = ['label', 'tag', 'Z', 'x', 'y', 'z']
        atom = pd.concat([
            self.pandas_dataframe(s, e, columns) for s, e in zip(starts, stops)
        ])
        # Leading letters of the tag form the element symbol. The original
        # character class was ``[A-z]``, which also matches ``[ \ ] ^ _`` and
        # the backtick because they sit between 'Z' and 'a' in ASCII.
        atom['symbol'] = atom['tag'].str.extract(
            '([A-Za-z]{1,})([0-9]*)', expand=False)[0].str.lower().str.title()
        atom['Z'] = atom['Z'].astype(np.int64)
        atom['Zeff'] = (atom['Z'] -
                        atom['tag'].map(ecps).fillna(value=0)).astype(np.int64)
        nf = atom.label.value_counts().max()  # number of frames
        nat = atom.label.max()  # atoms per frame
        atom['frame'] = [i for i in range(nf) for j in range(nat)]
        atom['label'] -= 1
        for axis in ('x', 'y', 'z'):
            atom[axis] *= Length[unit, 'au']
        last = atom['frame'].max()
        if last > 0:
            # Drop the final frame when several geometries were printed.
            atom = atom[atom['frame'] != last]
        atom = atom.reset_index(drop=True)
        del atom['label']
        self.atom = Atom(atom)

    def parse_orbital(self):
        """Parse the :class:`~exatomic.core.orbital.Orbital` dataframe."""
        orbital = None
        _remo01 = 'Molecular Orbital Analysis'
        _remo02 = 'alpha - beta orbital overlaps'
        _remo03 = 'center of mass'
        check = self.find(_remo01)
        # ``check`` holds (line number, line text) pairs. The original tested
        # ``'Alpha' in value`` on the pair itself, which compares against the
        # whole tuple elements and therefore never detected an open-shell
        # output.
        if any('Alpha' in line for _, line in check):
            alpha_starts = np.array(
                [no for no, line in check if 'Alpha' in line],
                dtype=np.int64) + 2
            alpha_stops = np.array(
                [no for no, line in check if 'Beta' in line],
                dtype=np.int64) - 1
            beta_starts = alpha_stops + 3
            beta_stops = np.array(self.find(_remo02, keys_only=True),
                                  dtype=np.int64) - 1
            alpha_orbital = self._parse_orbital(alpha_starts, alpha_stops)
            beta_orbital = self._parse_orbital(beta_starts, beta_stops)
            alpha_orbital['spin'] = 0
            beta_orbital['spin'] = 1
            orbital = pd.concat((alpha_orbital, beta_orbital),
                                ignore_index=True)
        else:
            starts = np.array(list(zip(*check))[0], dtype=np.int64) + 2
            stops = np.array(self.find(_remo03, keys_only=True),
                             dtype=np.int64) - 1
            orbital = self._parse_orbital(starts, stops)
            orbital['spin'] = 0
        orbital['group'] = 0
        self.orbital = Orbital(orbital)

    def parse_momatrix(self):
        """
        Parse the :class:`~exatomic.core.orbital.MOMatrix` dataframe.

        Note:
            Must supply 'print "final vectors" "final vectors analysis"' for momatrix
        """
        key0 = "Final MO vectors"
        key1 = "center of mass"
        found = self.find(key0, key1)
        if not found[key0]:
            return
        start = found[key0][0][0] + 6
        end = found[key1][0][0] - 1
        c = pd.read_fwf(StringIO("\n".join(self[start:end])),
                        widths=(6, 12, 12, 12, 12, 12, 12),
                        names=list(range(7)))
        # The raw table is also kept on the instance, as in the original.
        self.c = c
        # Rows with an empty first column carry no coefficients.
        idx = c[c[0].isnull()].index.values
        c = c[~c.index.isin(idx)]
        del c[0]
        nbas = len(self.basis_set_order)
        n = c.shape[0] // nbas
        # The for loop below is like numpy.array_split(df, n); using numpy.array_split
        # with dataframes seemed to have strange results where splits had wrong sizes?
        coefs = [
            c.iloc[i * nbas:(i + 1) * nbas, :].astype(float).dropna(
                axis=1).values.ravel("F") for i in range(n)
        ]
        c = np.concatenate(coefs)
        del coefs
        orbital, chi = _square_indices(len(self.basis_set_order))
        self.momatrix = MOMatrix.from_dict({
            'coef': c,
            'chi': chi,
            'orbital': orbital,
            'frame': 0
        })

    def _parse_orbital(self, starts, stops):
        '''
        This function actually performs parsing of :class:`~exatomic.orbital.Orbital`

        Args:
            starts: first line number of every orbital-analysis section
            stops: line number one past the end of every section

        Returns:
            pandas.DataFrame with columns x, y, z, frame, vector, occupation
            and energy; one row per "Vector" entry, ``frame`` counting the
            sections.

        See Also:
            :func:`~exnwchem.output.Output.parse_orbital`
        '''
        joined = '\n'.join(
            ['\n'.join(self[s:e]) for s, e in zip(starts, stops)])
        nvec = joined.count('Vector')
        if 'spherical' not in self.meta:
            self.parse_basis_set()
        # NOTE(review): ``nbas`` is computed but not used further down; the
        # statements are kept so that the basis set is still required here.
        mapper = self.basis_set.functions(
            self.meta['spherical']).groupby(level="set").sum()
        nbas = self.atom['set'].map(mapper).sum()
        nbas *= nvec
        # Orbital dataframe -- alternatively one could parse the strings
        # into the DataFrame and then use the pd.Series.str methods to
        # perform all the replacements at the same time, eg. 'D' --> 'E'
        # and 'Occ=' --> '', etc.
        orb_no = np.empty((nvec, ), dtype=np.int64)
        occ = np.empty((nvec, ), dtype=np.float64)
        nrg = np.empty((nvec, ), dtype=np.float64)
        x = np.empty((nvec, ), dtype=np.float64)
        y = np.empty((nvec, ), dtype=np.float64)
        z = np.empty((nvec, ), dtype=np.float64)
        frame = np.empty((nvec, ), dtype=np.int64)
        fc = -1  # Frame counter
        oc = 0  # Orbital counter
        for s, e in zip(starts, stops):
            fc += 1
            for line in self[s:e]:
                ls = line.split()
                if 'Vector' in line:
                    orb_no[oc] = ls[1]
                    occ[oc] = ls[2].replace('Occ=', '').replace('D', 'E')
                    # A negative energy is glued to "E=" ("E=-1.0D+00") and
                    # is therefore one field earlier than a positive one.
                    if 'E=-' in line:
                        nrg[oc] = ls[3].replace('E=', '').replace('D', 'E')
                    else:
                        nrg[oc] = ls[4].replace('D', 'E')
                    frame[oc] = fc
                elif 'MO Center' in line:
                    x[oc] = ls[2].replace(',', '').replace('D', 'E')
                    y[oc] = ls[3].replace(',', '').replace('D', 'E')
                    z[oc] = ls[4].replace(',', '').replace('D', 'E')
                    # The "MO Center" line closes the current vector entry.
                    oc += 1
        orb_no -= 1  # zero-based vector index
        return pd.DataFrame.from_dict({
            'x': x,
            'y': y,  # was ``z`` in the original, discarding the parsed y
            'z': z,
            'frame': frame,
            'vector': orb_no,
            'occupation': occ,
            'energy': nrg
        })

    def parse_basis_set(self):
        """
        Parse the :class:`~exatomic.core.basis.BasisSet` dataframe.

        Also records ``self.meta['spherical']`` and adds a ``set`` column to
        ``self.atom`` (parsing the atom table first when it is missing).
        """
        if not hasattr(self, "atom"):
            self.parse_atom()
        _rebas01 = ' Basis "'
        _rebas02 = ' Summary of "'
        found = self.find(_rebas01, _rebas02)
        spherical = "spherical" in found[_rebas01][0][1]
        start = found[_rebas01][0][0] + 2
        idx = 1 if len(found[_rebas02]) > 1 else -1
        stop = found[_rebas02][idx][0] - 1
        # Read in all of the extra lines that contain ---- and tag names
        df = pd.read_fwf(StringIO("\n".join(self[start:stop])),
                         widths=(4, 2, 16, 16),
                         names=("shell", "L", "alpha", "d"))
        df.loc[df['shell'] == "--", "shell"] = np.nan
        # Rows whose first column is not a shell number carry the tag name.
        # ``== False`` is deliberate: missing values must not be selected.
        tags = df.loc[(df['shell'].str.isdigit() == False), "shell"]  # noqa: E712
        idxs = tags.index.tolist()
        idxs.append(len(df))
        df['set'] = ""
        for i, tag in enumerate(tags):
            df.loc[idxs[i]:idxs[i + 1], "set"] = tag
        df = df.dropna().reset_index(drop=True)
        # tag -> consecutive integer, in order of first appearance
        mapper = {tag: num for num, tag in enumerate(df['set'].unique())}
        df['set'] = df['set'].map(mapper)
        df['L'] = df['L'].str.strip().str.lower().map(lmap)
        df['alpha'] = df['alpha'].astype(float)
        df['d'] = df['d'].astype(float)
        # NO SUPPORT FOR MULTIPLE FRAMES?
        df['frame'] = 0
        self.basis_set = BasisSet(df)
        self.meta['spherical'] = spherical
        self.atom['set'] = self.atom['tag'].map(mapper)

    def parse_basis_set_order(self):
        """
        Build the basis function ordering table into ``self.basis_set_order``.

        Parses the basis set first when ``self.meta`` has no 'spherical'
        entry yet.
        """
        dtype = [('center', 'i8'), ('shell', 'i8'), ('L', 'i8')]
        if 'spherical' not in self.meta:
            self.parse_basis_set()
        spherical = self.meta['spherical']
        if spherical:
            dtype += [('ml', 'i8')]
        else:
            dtype += [('l', 'i8'), ('m', 'i8'), ('n', 'i8')]
        mapper = self.basis_set.functions(spherical).groupby(
            level="set").sum()
        nbas = self.atom['set'].map(mapper).sum()
        bso = np.empty((nbas, ), dtype=dtype)
        cnt = 0
        bases = self.basis_set.groupby('set')
        for seht, center in zip(self.atom['set'], self.atom.index):
            bas = bases.get_group(seht).groupby('shell')
            for shell, grp in bas:
                l = grp['L'].values[0]
                if spherical:
                    for ml in spherical_ordering_function(l):
                        bso[cnt] = (center, shell, l, ml)
                        cnt += 1
                else:
                    for _, ll, m, n in cartesian_ordering_function(l):
                        bso[cnt] = (center, shell, l, ll, m, n)
                        cnt += 1
        bso = pd.DataFrame(bso)
        bso['frame'] = 0
        # New shell definition consistent with basis internals
        # NOTE(review): this reads the 'ml' column, which only exists for
        # spherical basis sets (see ``dtype`` above) -- the Cartesian case
        # looks unsupported here; confirm the intended key.
        shls = []
        grps = bso.groupby(['center', 'L'])
        cache = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
        for (cen, L), grp in grps:
            for ml in grp['ml']:
                shls.append(cache[cen][L][ml])
                cache[cen][L][ml] += 1
        bso['shell'] = shls
        self.basis_set_order = bso

    def parse_frame(self):
        """
        Create a minimal :class:`~exatomic.core.frame.Frame` from the (parsed)
        :class:`~exatomic.core.atom.Atom` object.

        A ``total_energy`` column is added when total SCF or DFT energies are
        printed; DFT energies win when both are present.
        """
        _rescfen = 'Total SCF energy'
        _redften = 'Total DFT energy'
        self.frame = compute_frame_from_atom(self.atom)
        found = self.find(_rescfen, _redften)
        scfs = found[_rescfen]
        dfts = found[_redften]
        if scfs and dfts:
            print('Warning: found total energies from scf and dft, using dft')
        chosen = dfts or scfs
        if chosen:
            # The energy is the last field of every matching line.
            self.frame['total_energy'] = [
                float(val.split()[-1]) for key, val in chosen
            ]

    def __init__(self, *args, **kwargs):
        super(Output, self).__init__(*args, **kwargs)
class Output(six.with_metaclass(GauMeta, Editor)):
    """Editor whose ``parse_*`` methods extract tables from a calculation output.

    NOTE(review): the flags searched for ('Standard orientation', 'SCF Done:',
    ...) and the ``GauMeta`` metaclass suggest Gaussian output -- confirm.
    """

    def _parse_triangular_matrix(self, regex, column='coef', values_only=False):
        """
        Parse a matrix printed in lower-triangular blocks.

        Args:
            regex: text marking the header line of the matrix
            column: name of the value column in the returned dataframe
            values_only: return only the flat array of values

        Returns:
            None when ``regex`` is not found, a flat float array when
            ``values_only`` is set, otherwise a DataFrame with the columns
            chi0, chi1, frame and ``column``.
        """
        _rebas01 = r'basis functions,'
        found = self.find_next(_rebas01, keys_only=True)
        nbas = int(self[found].split()[0])
        found = self.find_next(regex, keys_only=True)
        if not found:
            return
        ncol = len(self[found + 1].split())
        start = found + 2
        rmdr = nbas % ncol
        # Positions (relative to ``start``) of the column-header lines that
        # separate the printed blocks; they are skipped while reading.
        skips = np.array(list(reversed(range(rmdr, nbas + max(1, rmdr), ncol))))
        skips = np.cumsum(skips) + np.arange(len(skips))
        stop = start + skips[-1]
        raw = self.pandas_dataframe(start, stop, ncol + 1,
                                    index_col=0, skiprows=skips)
        matrix = raw.unstack().dropna().apply(
            lambda x: x.replace('D', 'E')
        ).astype(np.float64).values
        if values_only:
            return matrix
        idxs = _triangular_indices(ncol, nbas)
        return pd.DataFrame.from_dict({'chi0': idxs[:, 0],
                                       'chi1': idxs[:, 1],
                                       'frame': idxs[:, 2],
                                       column: matrix})

    def parse_atom(self):
        """Parse the atomic coordinates of every frame into ``self.atom``."""
        # Atom flags
        _regeom01 = 'Input orientation'
        _regeom02 = 'Standard orientation'
        # Find our data
        found = self.find(_regeom01, _regeom02, keys_only=True)
        # Check if nosymm was specified
        key = _regeom02 if found[_regeom02] else _regeom01
        starts = np.array(found[key]) + 5
        # Prints converged geometry twice but only need it once
        starts = starts[:-1] if len(starts) > 1 else starts
        first_stop = starts[0]
        # Find where the data stops
        while '-------' not in self[first_stop]:
            first_stop += 1
        # But it should be same sized array each time
        stops = starts + (first_stop - starts[0])
        dfs = []
        # Iterate over frames
        for frame, (start, stop) in enumerate(zip(starts, stops)):
            atom = self.pandas_dataframe(start, stop, 6)
            atom['frame'] = frame
            dfs.append(atom)
        atom = pd.concat(dfs).reset_index(drop=True)
        # Drop the column of atomic type (whatever that is)
        atom.drop([2], axis=1, inplace=True)
        # Name the data
        atom.columns = ['set', 'Z', 'x', 'y', 'z', 'frame']
        # Zero-based indexing
        atom['set'] -= 1
        # Convert to atomic units
        for axis in ('x', 'y', 'z'):
            atom[axis] *= Length['Angstrom', 'au']
        # Map atomic symbols onto Z numbers
        atom['symbol'] = atom['Z'].map(z2sym)
        self.atom = atom

    def parse_basis_set(self):
        """
        Parse the basis set into ``self.basis_set``.

        Also records ``self.meta['spherical']`` and remaps ``self.atom['set']``
        onto the deduplicated basis sets. Returns without setting anything
        when the basis set was not printed.
        """
        # Basis flags
        _rebas02 = 'AO basis set in the form of general basis input'
        _rebas03 = ' (Standard|General) basis'
        # Find the basis set
        found = self.regex(_rebas02, _rebas03, keys_only=True)
        if not found[_rebas02]:
            return
        start = stop = found[_rebas02][0] + 1
        # The block ends at the first blank line.
        while self[stop].strip():
            stop += 1
        # Raw data
        df = self.pandas_dataframe(start, stop, 4)

        def _padx(srs):
            # Pad an index with the first and one-past-last row numbers.
            return [0] + srs.tolist() + [df.shape[0]]

        # Get some indices for appropriate columns
        setdx = _padx(df[0][df[0] == '****'].index)
        shldx = _padx(df[3][~np.isnan(df[3])].index)
        lindx = df[0][df[0].str.lower().isin(lorder + ['sp'])]
        # Populate the df
        df['L'] = lindx.str.lower().map(lmap)
        # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
        df['L'] = df['L'].ffill().bfill().astype(np.int64)
        df['center'] = np.concatenate([
            np.repeat(i, hi - lo)
            for i, (lo, hi) in enumerate(zip(setdx, setdx[1:]))])
        df['shell'] = np.concatenate([
            np.repeat(i - 1, hi - lo)
            for i, (lo, hi) in enumerate(zip(shldx, shldx[1:]))])
        # Complicated way to get shells but it is flat
        maxshl = df.groupby('center').apply(lambda x: x.shell.max() + 1)
        maxshl.index += 1
        maxshl[0] = 0
        df['shell'] = df['shell'] - df['center'].map(maxshl)
        # Drop all the garbage
        todrop = setdx[:-1] + [i + 1 for i in setdx[:-2]] + lindx.index.tolist()
        df.drop(todrop, inplace=True)
        # Keep cleaning
        if df[0].dtype == 'object':
            df[0] = df[0].str.replace('D', 'E').astype(np.float64)
        if df[1].dtype == 'object':
            df[1] = df[1].str.replace('D', 'E').astype(np.float64)
        try:
            # No 'SP' shells when the third column is empty throughout.
            sp = np.isnan(df[2]).sum() == df.shape[0]
        except TypeError:
            df[2] = df[2].str.replace('D', 'E').astype(np.float64)
            sp = True
        df.rename(columns={0: 'alpha', 1: 'd'}, inplace=True)
        # Deduplicate basis sets and expand 'SP' shells if present
        df, setmap = deduplicate_basis_sets(df, sp=sp)
        spherical = '5D' in self[found[_rebas03][0]]
        if df['L'].max() < 2:
            spherical = True
        self.basis_set = BasisSet(df)
        self.meta['spherical'] = spherical
        self.atom['set'] = self.atom['set'].map(setmap)

    def parse_orbital(self):
        """
        Parse orbital energies (and symmetry labels, when printed) into
        ``self.orbital``. Returns without setting anything when no orbital
        energies are found.
        """
        _rebas01 = r'basis functions,'
        # Orbital flags
        _realphaelec = 'alpha electrons'
        _reorb01 = '(?=Alpha|Beta).*(?=occ|virt)'
        _reorb02 = 'Orbital symmetries'
        # Everything in a symmetry block that is not a symmetry label.
        # re.escape replaces the former '\(' / '\)' literals, which are
        # invalid escape sequences in a non-raw string.
        _symjunk = ('Occupied', 'Virtual', 'Alpha Orbitals:',
                    'Beta Orbitals:', '(', ')')
        _resympat = re.compile('|'.join(re.escape(tok) for tok in _symjunk))
        # Find where our data is
        found = self.regex(_reorb01, _reorb02, _rebas01, _realphaelec)
        # If no orbital energies, quit
        if not found[_reorb01]:
            return
        # Check if open shell
        open_shell = any('Beta' in ln for lno, ln in found[_reorb01])
        # Find number of electrons
        ae, _, _, be, _, _ = found[_realphaelec][0][1].split()
        ae, be = int(ae), int(be)
        # Get orbital energies
        ens = '\n'.join([ln.split('-- ')[1] for i, ln in found[_reorb01]])
        ens = pd.read_fwf(six.StringIO(ens), header=None,
                          widths=np.repeat(10, 5)).stack().values
        # Other arrays
        orbital = Orbital.from_energies(ens, ae, be, os=open_shell)
        # Symmetry labels
        if found[_reorb02]:
            # Gaussian seems to print out a lot of these blocks
            # maybe a better way to deal with this
            allsyms = []
            match = ['(', 'Orbitals']
            for start, _ in found[_reorb02]:
                # Find the start, stop indices for each block
                while match[0] not in self[start]:
                    start += 1
                stop = start + 1
                while any(tok in self[stop] for tok in match):
                    stop += 1
                # Clean up the text block so it is just symmetries
                text = ' '.join([ln.strip() for ln in self[start:stop]])
                # cat the syms for each block together
                allsyms += _resympat.sub('', text).split()
            # Add it to our dataframe
            orbital['symmetry'] = allsyms[-orbital.shape[0]:]
        self.orbital = orbital

    def parse_momatrix(self):
        """
        Parses the MO matrix if asked for in the input.

        Also sets ``self.basis_set_order`` from the first coefficient block.

        Note:
            Requires specification of pop(full) or pop(no) or the like.
        """
        if hasattr(self, '_momatrix'):
            return
        _rebas01 = r'basis functions,'
        # MOMatrix flags
        _remomat01 = r'pop.*(?=full|no)'
        _remomat02 = 'Orbital Coefficients'
        _basrep = {'D 0': 'D0 ', 'F 0': 'F0 ', 'G 0': 'G0 ',
                   'H 0': 'H0 ', 'I 0': 'I0 '}
        _rebaspat = re.compile('|'.join(_basrep.keys()))
        # Check if a full MO matrix was specified in the input
        check = self.regex(_remomat01, stop=1000, flags=re.IGNORECASE)
        if not check:
            return
        # Find approximately where our data is
        found = self.find(_remomat02, _rebas01)
        # Get some dimensions
        ndim = len(found[_remomat02])
        # If something goes wrong
        if not ndim:
            return
        nbas = int(found[_rebas01][0][1].split()[0])
        nblocks = np.int64(np.ceil(nbas / 5))
        # Allocate a big ol' array
        coefs = np.empty((nbas ** 2, ndim), dtype=np.float64)
        # Dynamic column generation hasn't been worked out yet
        colnames = ['coef'] + ['coef' + str(i) for i in range(1, ndim)]
        # Iterate over where the data was found
        # c counts the column in the resulting momatrix table
        for c, (lno, _) in enumerate(found[_remomat02]):
            gap = 0
            while 'eigenvalues' not in self[lno + gap].lower():
                gap += 1
            start = lno + gap + 1
            stop = start + nbas
            # The basis set order is printed with every chunk of eigenvectors
            if not c:
                mapr = self.basis_set.groupby(['set', 'L']).apply(
                    lambda x: x['shell'].unique()).to_dict()
                self.basis_set_order = _basis_set_order(self[start:stop], mapr,
                                                        self.atom['set'])
            # Some fudge factors due to extra lines being printed
            space = start - lno - 1
            fnbas = nbas + space
            span = start + fnbas * nblocks
            # Finally get where our chunks are
            starts = np.arange(start, span, fnbas)
            stops = np.arange(stop, span, fnbas)
            stride = 0
            # Each chunk holds the coefficients of up to five eigenvectors.
            for lo, hi in zip(starts, stops):
                # Number of eigenvectors in this block
                ncol = len(self[lo][21:].split())
                step = nbas * ncol
                # Massage the text so that we can read csv
                block = '\n'.join([row[21:] for row in self[lo:hi]])
                block = _rebaspat.sub(lambda m: _basrep[m.group(0)], block)
                # Enplacen the resultant unstacked values
                coefs[stride:stride + step, c] = pd.read_fwf(
                    six.StringIO(block), header=None,
                    widths=np.repeat(10, 5)).unstack().dropna().values
                stride += step
        # Index chi, phi
        chis = np.tile(range(nbas), nbas)
        orbs = np.repeat(range(nbas), nbas)
        momatrix = pd.DataFrame(coefs, columns=colnames)
        momatrix['chi'] = chis
        momatrix['orbital'] = orbs
        # Frame not really implemented for momatrix
        momatrix['frame'] = 0
        self.momatrix = momatrix

    def parse_basis_set_order(self):
        """Parse the basis set order (a by-product of :meth:`parse_momatrix`)."""
        if hasattr(self, '_basis_set_order'):
            return
        self.parse_momatrix()

    def parse_frame(self):
        """
        Build ``self.frame`` from the atom table and attach total energies,
        electron counts and (when printed) electronic states.
        """
        # Frame flags
        _retoten = 'SCF Done:'
        _realphaelec = 'alpha electrons'
        _reelecstate = 'The electronic state'
        # Get the default frame from the atom table
        self.frame = compute_frame_from_atom(self.atom)
        # Find our data
        found = self.find(_retoten, _realphaelec, _reelecstate)
        # Extract just the total SCF energies
        ens = [float(ln.split()[4]) for lno, ln in found[_retoten]]
        # If 'SCF Done' prints out more times than frames
        nframes = len(self.frame)
        if len(ens) != nframes:
            ens = ens[-nframes:]
        try:
            self.frame['E_tot'] = ens
        except ValueError:
            # Best effort: skip the column when the counts cannot be matched.
            pass
        # We will assume number of electrons doesn't change per frame
        ae, _, _, be, _, _ = found[_realphaelec][0][1].split()
        self.frame['N_e'] = int(ae) + int(be)
        self.frame['N_a'] = int(ae)
        self.frame['N_b'] = int(be)
        # Try to get the electronic state but don't try too hard
        try:
            states = [ln.split()[4].replace('.', '')
                      for _, ln in found[_reelecstate]
                      if 'initial' not in ln]
            self.frame['state'] = states
        except (IndexError, ValueError):
            pass

    def parse_excitation(self):
        """
        Parse excited states and their orbital contributions into
        ``self.excitation``. Returns without setting anything when 'TD' does
        not occur in the first 1000 lines or no excited state is printed.
        """
        # TDDFT flags
        _retddft = 'TD'
        _reexcst = 'Excited State'
        chk = self.find(_retddft, stop=1000, keys_only=True)
        if not chk:
            return
        # Find the data
        found = self.find(_reexcst)
        if not found:
            # Nothing to read; read_csv would fail on an empty buffer.
            return
        keeps, maps, summ = [], [], []
        for i, (lno, ln) in enumerate(found):
            summ.append(ln)
            lno += 1
            # Contribution lines ("occ -> virt  coef") follow the summary.
            while '->' in self[lno]:
                keeps.append(lno)
                maps.append(i)
                lno += 1
        # Integer names mark throw-away columns; only named ones are kept.
        cols = [0, 1, 2, 'kind', 'eV', 3, 'nm', 4, 'osc', 's2']
        summ = pd.read_csv(six.StringIO('\n'.join(summ)),
                           sep=r'\s+', header=None, names=cols,
                           usecols=[c for c in cols if isinstance(c, str)])
        summ['s2'] = summ['s2'].str[7:].astype(np.float64)
        summ['osc'] = summ['osc'].str[2:].astype(np.float64)
        cols = ['occ', 0, 'virt', 'cont']
        conts = pd.read_csv(six.StringIO('\n'.join([self[i] for i in keeps])),
                            sep=r'\s+', header=None, names=cols,
                            usecols=[c for c in cols if isinstance(c, str)])
        conts['map'] = maps
        # Broadcast the per-state summary onto every contribution row.
        for col in summ.columns:
            conts[col] = conts['map'].map(summ[col])
        conts['energy'] = conts['eV'] * Energy['eV', 'Ha']
        conts['frame'] = conts['group'] = 0
        self.excitation = conts

    def parse_frequency(self):
        """
        Parse normal-mode displacements into ``self.frequency``. Returns
        without setting anything when no frequency block is found.
        """
        # Frequency flags
        _refreq = 'Freq'
        # Don't need the input deck or 2 from the summary at the end
        found = self.find(_refreq)[1:-2]
        if not found:
            return
        # Total lines per block minus the unnecessary ones
        # NOTE(review): needs at least two blocks; a single block raises
        # IndexError here -- confirm how that case should be measured.
        span = found[1][0] - found[0][0] - 7
        dfs, fdx = [], 0
        # Iterate over what we found
        for lno, ln in found:
            # Get the frequencies first
            freqs = ln[15:].split()
            nfreqs = len(freqs)
            # Get just the atom displacement vectors
            start = lno + 5
            stop = start + span
            cols = range(2 + 3 * nfreqs)
            df = self.pandas_dataframe(start, stop, ncol=cols)
            # Split up the df and unstack it: one column slice per Cartesian
            # component (always three), each covering every mode in the
            # block. The original iterated over ``range(nfreqs)``, which
            # breaks the three-way unpacking for blocks with fewer modes.
            slices = [list(range(2 + k, 2 + 3 * nfreqs, 3)) for k in range(3)]
            dx, dy, dz = [df[s].unstack().values for s in slices]
            # Generate the appropriate dimensions of other columns
            labels = np.tile(df[0].values, nfreqs)
            zs = np.tile(df[1].values, nfreqs)
            freqdxs = np.repeat(range(fdx, fdx + nfreqs), df.shape[0])
            # NOTE(review): the frequencies stay strings as split from the
            # text line -- confirm whether consumers expect floats.
            freqs = np.repeat(freqs, df.shape[0])
            fdx += nfreqs
            # Put it all together
            stacked = pd.DataFrame.from_dict({'Z': zs, 'label': labels,
                                              'dx': dx, 'dy': dy, 'dz': dz,
                                              'frequency': freqs,
                                              'freqdx': freqdxs})
            stacked['symbol'] = stacked['Z'].map(z2sym)
            dfs.append(stacked)
        # Now put all our frequencies together
        frequency = pd.concat(dfs).reset_index(drop=True)
        # Pretty sure displacements are in cartesian angstroms
        # TODO: verify with an external program that vibrational
        #       modes look the same as the ones generated with
        #       this methodology.
        for axis in ('dx', 'dy', 'dz'):
            frequency[axis] *= Length['Angstrom', 'au']
        # Frame not really implemented here either
        frequency['frame'] = 0
        self.frequency = frequency

    # Below are triangular matrices -- One electron integrals

    def parse_overlap(self):
        """Parse the overlap matrix into ``self.overlap`` when it is printed."""
        _reovl01 = '*** Overlap ***'
        overlap = self._parse_triangular_matrix(_reovl01, 'coef')
        if overlap is not None:
            self.overlap = overlap

    def parse_multipole(self):
        """
        Parse the 'IX= 1' .. 'IX= 3' matrices into ``self.multipole`` (columns
        ix1, ix2, ix3) when the first of them is printed.
        """
        _reixn = 'IX=    {}'
        mltpl = self._parse_triangular_matrix(_reixn.format(1), 'ix1')
        if mltpl is not None:
            mltpl['ix2'] = self._parse_triangular_matrix(_reixn.format(2), 'ix2', True)
            mltpl['ix3'] = self._parse_triangular_matrix(_reixn.format(3), 'ix3', True)
            self.multipole = mltpl

    def __init__(self, *args, **kwargs):
        super(Output, self).__init__(*args, **kwargs)
class Output(six.with_metaclass(OutMeta, Editor)): """Editor for NWChem calculation output file (stdout).""" def parse_atom(self): """Parse the atom dataframe.""" _reatom01 = 'Geometry "' _reatom02 = 'Atomic Mass' _reatom03 = 'ECP "ecp basis"' _reatom04 = 'Output coordinates in' found = self.find(_reatom01, _reatom02, _reatom03, _reatom04, keys_only=True) unit = self[found[_reatom04][0]].split()[3] unit = "Angstrom" if unit == "angstroms" else "au" starts = np.array(found[_reatom01]) + 7 stops = np.array(found[_reatom02]) - 1 ecps = np.array(found[_reatom03]) + 2 ecps = {self[ln].split()[0]: int(self[ln].split()[3]) for ln in ecps} columns = ['label', 'tag', 'Z', 'x', 'y', 'z'] atom = pd.concat([ self.pandas_dataframe(s, e, columns) for s, e in zip(starts, stops) ]) atom['symbol'] = atom['tag'].str.extract( '([A-z]{1,})([0-9]*)', expand=False)[0].str.lower().str.title() atom['Z'] = atom['Z'].astype(np.int64) atom['Zeff'] = (atom['Z'] - atom['tag'].map(ecps).fillna(value=0)).astype(np.int64) #n = len(atom) nf = atom.label.value_counts().max() nat = atom.label.max() atom['frame'] = [i for i in range(nf) for j in range(nat)] atom['label'] -= 1 atom['x'] *= Length[unit, 'au'] atom['y'] *= Length[unit, 'au'] atom['z'] *= Length[unit, 'au'] if atom['frame'].max() > 0: li = atom['frame'].max() atom = atom[~(atom['frame'] == li)] atom.reset_index(drop=True, inplace=True) del atom['label'] self.atom = Atom(atom) def parse_orbital(self): """Parse the :class:`~exatomic.core.orbital.Orbital` dataframe.""" orbital = None _remo01 = 'Molecular Orbital Analysis' _remo02 = 'alpha - beta orbital overlaps' _remo03 = 'center of mass' check = self.find(_remo01) if any(['Alpha' in value for value in check]): alpha_starts = np.array( [no for no, line in check if 'Alpha' in line], dtype=np.int64) + 2 alpha_stops = np.array( [no for no, line in check if 'Beta' in line], dtype=np.int64) - 1 beta_starts = alpha_stops + 3 beta_stops = np.array(self.find(_remo02, keys_only=True), 
dtype=np.int64) - 1 alpha_orbital = self._parse_orbital(alpha_starts, alpha_stops) beta_orbital = self._parse_orbital(beta_starts, beta_stops) alpha_orbital['spin'] = 0 beta_orbital['spin'] = 1 orbital = pd.concat((alpha_orbital, beta_orbital), ignore_index=True) else: starts = np.array(list(zip(*check))[0], dtype=np.int64) + 2 stops = np.array(self.find(_remo03, keys_only=True), dtype=np.int64) - 1 orbital = self._parse_orbital(starts, stops) orbital['spin'] = 0 orbital['group'] = 0 self.orbital = Orbital(orbital) def parse_momatrix(self): """ Parse the :class:`~exatomic.core.orbital.MOMatrix` dataframe. Note: Must supply 'print "final vectors" "final vectors analysis"' for momatrix """ key0 = "Final MO vectors" key1 = "center of mass" found = self.find(key0, key1) if found[key0]: start = found[key0][0][0] + 6 end = found[key1][0][0] - 1 c = pd.read_fwf(StringIO("\n".join(self[start:end])), widths=(6, 12, 12, 12, 12, 12, 12), names=list(range(7))) self.c = c idx = c[c[0].isnull()].index.values c = c[~c.index.isin(idx)] del c[0] nbas = len(self.basis_set_order) n = c.shape[0] // nbas coefs = [] # The for loop below is like numpy.array_split(df, n); using numpy.array_split # with dataframes seemed to have strange results where splits had wrong sizes? 
for i in range(n): coefs.append(c.iloc[i * nbas:(i + 1) * nbas, :].astype(float).dropna( axis=1).values.ravel("F")) c = np.concatenate(coefs) del coefs orbital, chi = _square_indices(len(self.basis_set_order)) self.momatrix = MOMatrix.from_dict({ 'coef': c, 'chi': chi, 'orbital': orbital, 'frame': 0 }) # momatrix = pd.DataFrame.from_dict({'coef': c, 'chi': chi, 'orbital': orbital}) # momatrix['frame'] = 0 # self.momatrix = momatrix def _parse_orbital(self, starts, stops): ''' This function actually performs parsing of :class:`~exatomic.orbital.Orbital` See Also: :func:`~exnwchem.output.Output.parse_orbital` ''' joined = '\n'.join( ['\n'.join(self[s:e]) for s, e in zip(starts, stops)]) nvec = joined.count('Vector') if 'spherical' not in self.meta: self.parse_basis_set() mapper = self.basis_set.functions( self.meta['spherical']).groupby(level="set").sum() nbas = self.atom['set'].map(mapper).sum() nbas *= nvec # Orbital dataframe -- alternatively one could parse the strings # into the DataFrame and then use the pd.Series.str methods to # perform all the replacements at the same time, eg. 'D' --> 'E' # and 'Occ=' --> '', etc. 
orb_no = np.empty((nvec, ), dtype=np.int64) occ = np.empty((nvec, ), dtype=np.float64) nrg = np.empty((nvec, ), dtype=np.float64) x = np.empty((nvec, ), dtype=np.float64) y = np.empty((nvec, ), dtype=np.float64) z = np.empty((nvec, ), dtype=np.float64) frame = np.empty((nvec, ), dtype=np.int64) fc = -1 # Frame counter oc = 0 # Orbital counter for s, e in zip(starts, stops): fc += 1 for line in self[s:e]: ls = line.split() if 'Vector' in line: orb_no[oc] = ls[1] occ[oc] = ls[2].replace('Occ=', '').replace('D', 'E') nrg[oc] = ls[3].replace('E=', '').replace( 'D', 'E') if 'E=-' in line else ls[4].replace( 'D', 'E') frame[oc] = fc elif 'MO Center' in line: x[oc] = ls[2].replace(',', '').replace('D', 'E') y[oc] = ls[3].replace(',', '').replace('D', 'E') z[oc] = ls[4].replace(',', '').replace('D', 'E') oc += 1 orb_no -= 1 return pd.DataFrame.from_dict({ 'x': x, 'y': z, 'z': z, 'frame': frame, 'vector': orb_no, 'occupation': occ, 'energy': nrg }) def parse_basis_set(self): """ Parse the :class:`~exatomic.core.basis.BasisSet` dataframe. 
""" if not hasattr(self, "atom"): self.parse_atom() _rebas01 = ' Basis "' _rebas02 = ' Summary of "' _rebas03 = [ ' s ', ' px ', ' py ', ' pz ', ' d ', ' f ', ' g ', ' h ', ' i ', ' j ', ' k ', ' l ', ' m ', ' p ' ] found = self.find(_rebas01, _rebas02) spherical = True if "spherical" in found[_rebas01][0][1] else False start = found[_rebas01][0][0] + 2 idx = 1 if len(found[_rebas02]) > 1 else -1 stop = found[_rebas02][idx][0] - 1 # Read in all of the extra lines that contain ---- and tag names df = pd.read_fwf(StringIO("\n".join(self[start:stop])), widths=(4, 2, 16, 16), names=("shell", "L", "alpha", "d")) df.loc[df['shell'] == "--", "shell"] = np.nan tags = df.loc[(df['shell'].str.isdigit() == False), "shell"] idxs = tags.index.tolist() idxs.append(len(df)) df['set'] = "" for i, tag in enumerate(tags): df.loc[idxs[i]:idxs[i + 1], "set"] = tag df = df.dropna().reset_index(drop=True) mapper = {v: k for k, v in dict(enumerate(df['set'].unique())).items()} df['set'] = df['set'].map(mapper) df['L'] = df['L'].str.strip().str.lower().map(lmap) df['alpha'] = df['alpha'].astype(float) df['d'] = df['d'].astype(float) # NO SUPPORT FOR MULTIPLE FRAMES? 
df['frame'] = 0 self.basis_set = BasisSet(df) self.meta['spherical'] = spherical self.atom['set'] = self.atom['tag'].map(mapper) def parse_basis_set_order(self): dtype = [('center', 'i8'), ('shell', 'i8'), ('L', 'i8')] if 'spherical' not in self.meta: self.parse_basis_set() if self.meta['spherical']: dtype += [('ml', 'i8')] else: dtype += [('l', 'i8'), ('m', 'i8'), ('n', 'i8')] mapper = self.basis_set.functions( self.meta['spherical']).groupby(level="set").sum() nbas = self.atom['set'].map(mapper).sum() bso = np.empty((nbas, ), dtype=dtype) cnt = 0 bases = self.basis_set.groupby('set') for seht, center in zip(self.atom['set'], self.atom.index): bas = bases.get_group(seht).groupby('shell') if self.meta['spherical']: for shell, grp in bas: l = grp['L'].values[0] for ml in spherical_ordering_function(l): bso[cnt] = (center, shell, l, ml) cnt += 1 else: for shell, grp in bas: l = grp['L'].values[0] for _, ll, m, n in cartesian_ordering_function(l): bso[cnt] = (center, shell, l, ll, m, n) cnt += 1 bso = pd.DataFrame(bso) bso['frame'] = 0 # New shell definition consistent with basis internals shls = [] grps = bso.groupby(['center', 'L']) cache = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) for (cen, L), grp in grps: for ml in grp['ml']: shls.append(cache[cen][L][ml]) cache[cen][L][ml] += 1 bso['shell'] = shls self.basis_set_order = bso def parse_roa(self): """ Parse the :class:`~exatomic.core.tensor.Polarizability` dataframe. This will parse the output from the Raman Optical Activity outputs. Note: We generate a 3D tensor with the 2D tensor code. 3D tensors will have 3 rows labeled with the same name. 
""" _reroa = 'roa begin' _reare = 'alpha real' _reaim = 'alpha im' # _reombre = 'beta real' # _reombim = 'beta im' _reombre = 'omega beta(real)' _reombim = 'omega beta(imag)' _redqre = 'dipole-quadrupole real (Cartesian)' _redqim = 'dipole-quadrupole imag (Cartesian)' if not self.find(_reroa): return found_2d = self.find(_reare, _reaim, _reombre, _reombim, keys_only=True) found_3d = self.find(_redqre, _redqim, keys_only=True) data = {} start = np.array(list(found_2d.values())).reshape(4, ) + 1 end = np.array(list(found_2d.values())).reshape(4, ) + 10 columns = ['x', 'val'] data = [ self.pandas_dataframe(s, e, columns) for s, e in zip(start, end) ] df = pd.concat([dat for dat in data]).reset_index(drop=True) df['grp'] = [i for i in range(4) for j in range(9)] df = df[['val', 'grp']] df = pd.DataFrame( df.groupby('grp').apply( lambda x: x.unstack().values[:-9]).values.tolist(), columns=['xx', 'xy', 'xz', 'yx', 'yy', 'yz', 'zx', 'zy', 'zz']) # find the electric dipole-quadrupole polarizability # NWChem gives this as a list of 18 values assuming the matrix to be symmetric # for our implementation we need to extend it to 27 elements # TODO: check that NWChem does assume that the 3D tensors are symmetric start = np.sort(np.array(list(found_3d.values())).reshape(2, )) + 1 end = np.sort(np.array(list(found_3d.values())).reshape(2, )) + 19 data = [ self.pandas_dataframe(s, e, columns) for s, e in zip(start, end) ] df3 = pd.concat([dat for dat in data]).reset_index(drop=True) vals = df3['val'].values.reshape(2, 3, 6) adx = np.triu_indices(3) mat = np.zeros((2, 3, 3, 3)) for i in range(2): for j in range(3): mat[i][j][adx] = vals[i][j] mat[i][j] = mat[i][j] + np.transpose( mat[i][j]) - np.identity(3) * mat[i][j] mat = mat.reshape(18, 3) df3 = pd.DataFrame(mat, columns=['x', 'y', 'z']) df3['grp1'] = [i for i in range(2) for j in range(9)] df3['grp2'] = [j for i in range(2) for j in range(3) for n in range(3)] df3 = pd.DataFrame( df3.groupby([ 'grp1', 'grp2' ]).apply(lambda x: 
x.unstack().values[:-6]).values.tolist(), columns=['xx', 'xy', 'xz', 'yx', 'yy', 'yz', 'zx', 'zy', 'zz'], index=[ 'Ax_real', 'Ay_real', 'Az_real', 'Ax_imag', 'Ay_imag', 'Az_imag' ]) split_label = np.transpose([i.split('_') for i in df3.index.values]) label = split_label[0] types = split_label[1] df['label'] = found_2d.keys() df['label'].replace( [_reare, _reombre, _reaim, _reombim], ['alpha-real', 'g_prime-real', 'alpha-imag', 'g_prime-imag'], inplace=True) df['type'] = [i.split('-')[-1] for i in df['label'].values] df['label'] = [i.split('-')[0] for i in df['label'].values] df['frame'] = np.repeat([0], len(df.index)) df3['label'] = label df3['type'] = types df3['frame'] = np.repeat([0], len(df3.index)) self.roa = pd.concat([df, df3], ignore_index=True) def parse_frequency(self): """ Parse the :class:`~exatomic.core.atom.Frequency` dataframe. Note: This code removes all negative frequencies. """ _remeth = "NORMAL MODE EIGENVECTORS IN CARTESIAN COORDINATES" _refreq = "Frequency" _renat = "Atom information" found = self.find(_remeth) fnat = self.find(_renat) if not found and not fnat: return # get atom information start = fnat[0][0] + 3 stop = start while '----' not in self[stop]: stop += 1 # we assume that there is only one instance of where _renat is found columns = ['symbol', 'atom', 'x', 'y', 'z', 'mass'] atom = self.pandas_dataframe(start, stop, columns) atom['atom'] -= 1 nat = len(atom) # find bounds where the calculated frequencies are start = found[0][0] stop = found[1][0] # get the data found = self.find(_refreq, start=start, stop=stop) dfs = [] fdx = 0 # get frequencies for lno, ln in found: # get the frequency values tmp = ln.split()[1:] freq = np.asarray([float(i) for i in tmp]) ## TODO: here we remove all negative frequencies ## need to find out if this is ok to do # set start and end points for the calculated normal modes staf = lno + start + 1 stof = lno + start + nat * 3 + 2 nm = self.pandas_dataframe(staf, stof, ncol=len(freq)).reset_index(drop=True) 
# generate boolean array that shows False for negative frequencies neg = [not f < 0 for f in freq] # remove negative frequencies nm.drop(columns=[idx for idx, val in enumerate(neg) if not val], inplace=True) freq = freq[neg] # get normal modes in the x, y, z directions nm = nm.stack().values nfreq = len(freq) dx = nm[::3] dy = nm[1::3] dz = nm[2::3] # assemble dataframe symbol = np.tile(atom['symbol'], nfreq) adx = np.tile(atom['atom'], nfreq) freq = np.repeat(freq, nat) freqdx = np.repeat([i for i in range(fdx, fdx + nfreq)], nfreq) frames = np.repeat([0], nfreq * nat) fdx += nfreq stacked = pd.DataFrame.from_dict({ 'symbol': symbol, 'atom': adx, 'dx': dx, 'dy': dy, 'dz': dz, 'freq': freq, 'freqdx': freqdx, 'frames': frames }) dfs.append(stacked) frequency = pd.concat(dfs).reset_index(drop=True) self.frequency = frequency def parse_gradient(self): """ Parse :class:`exatomic.core.gradient.Gradient` dataframe. """ _regrad = "DFT ENERGY GRADIENTS" found = self.find(_regrad) if not found: return found = self.find(_regrad, keys_only=True) # find start and stop points starts = np.array(found) + 4 stop = starts[0] while '----' not in self[stop]: stop += 1 # backtrack one line as the line after the needed info is empty stop -= 1 stops = starts + (stop - starts[0]) dfs = [] # generate dataframe array columns = ['atom', 'symbol', 'x', 'y', 'z', 'fx', 'fy', 'fz'] for i, (start, stop) in enumerate(zip(starts, stops)): gradient = self.pandas_dataframe(start, stop, columns) gradient['frame'] = i dfs.append(gradient[['atom', 'symbol', 'fx', 'fy', 'fz', 'frame']]) # construct the dataframe gradient = pd.concat(dfs).reset_index(drop=True) gradient['Z'] = gradient['symbol'].map(sym2z) # want to keep more or less the same order across dataframes # or at least try self.gradient = gradient[[ 'Z', 'atom', 'fx', 'fy', 'fz', 'symbol', 'frame' ]] def parse_frame(self): """ Create a minimal :class:`~exatomic.core.frame.Frame` from the (parsed) :class:`~exatomic.core.atom.Atom` object. 
""" _rescfen = 'Total SCF energy' _redften = 'Total DFT energy' self.frame = compute_frame_from_atom(self.atom) found = self.find(_rescfen, _redften) scfs = found[_rescfen] dfts = found[_redften] if scfs and dfts: print('Warning: found total energies from scf and dft, using dft') dfts = [float(val.split()[-1]) for key, val in dfts] self.frame['total_energy'] = dfts elif scfs: scfs = [float(val.split()[-1]) for key, val in scfs] self.frame['total_energy'] = scfs elif dfts: dfts = [float(val.split()[-1]) for key, val in dfts] self.frame['total_energy'] = dfts def __init__(self, *args, **kwargs): super(Output, self).__init__(*args, **kwargs)