def parse_extxyz(dbpath, xyzpath, env, cache=False):
    r"""Parse a file in XYZ format and write its content to an SQLite (ASE) db.

    Args:
        dbpath (str): path to sqlite database.
        xyzpath (str): path to file with xyz file format.
        env: environment provider forwarded to ``neighbor_gen`` when caching.
        cache (bool): if True, precompute neighbor data for each structure
            and store it alongside the energies/forces.
    """
    with connect(dbpath, use_lock_file=False) as conn:
        with open(xyzpath) as f:
            atoms = []
            energies = []
            forces = []
            energiesperatom = []
            eform = []
            eformperatom = []
            ehull = []
            ebin = []
            for at in read_xyz(f, index=slice(None)):
                # len(at) replaces the deprecated Atoms.get_number_of_atoms().
                nat = len(at)
                energies.append(at.get_total_energy())
                forces.append(at.get_forces())
                atoms.append(at)
                energiesperatom.append(energies[-1] / nat)
                eform.append(energies[-1])  # - nat*-19.0329202806)
                eformperatom.append(eform[-1] / nat)
                # Placeholders: hull/bin energies are not computed here.
                ehull.append(0)
                ebin.append(0)
            energies = np.array(energies)
            m = np.mean(energies)
            # energies -= m
            for i in range(len(atoms)):
                # atoms[i].energy = energies[i]
                atoms[i]._calc.results['energy'] = energies[i]
                r_ij, f_ij = None, None
                if cache:
                    # BUGFIX: the original passed the loop-leftover ``at``
                    # (always the *last* parsed structure) instead of the
                    # structure being written, atoms[i].
                    r_ij, f_ij = neighbor_gen(atoms[i],
                                              distance_expansion=None,
                                              cutoff=5.0,
                                              n_gaussians=25,
                                              trainable_gaussians=False,
                                              environment_provider=env,
                                              collect_triples=False,
                                              pair_provider=None,
                                              center_positions=True)
                conn.write(atoms[i],
                           data={
                               ExtXYZ.E: energies[i],
                               ExtXYZ.F: forces[i],
                               ExtXYZ.E + 'peratom': energiesperatom[i],
                               'Eform': eform[i],
                               'Eformperatom': eformperatom[i],
                               'Ehull': ehull[i],
                               'Ebin': ebin[i],
                               'mean': m,
                               'r_ij': r_ij,
                               'f_ij': f_ij
                           })
def load_data(dbpath):
    """Download the QM9 dataset, parse the xyz files and write each molecule
    with unit-converted properties into an ASE database.

    Args:
        dbpath (str): path of the ASE db file to create/extend.

    Returns:
        bool: True on success, False if the download failed.
    """
    logging.info('Downloading QM9 data...')
    tmpdir = tempfile.mkdtemp('qm9')
    tar_path = os.path.join(tmpdir, 'qm9.tar.gz')
    raw_path = os.path.join(tmpdir, 'qm9_xyz')
    url = 'https://ndownloader.figshare.com/files/3195398'
    try:
        urllib.request.urlretrieve(url, tar_path)
        logging.info("Done.")
    except HTTPError as e:
        # BUGFIX: logging uses lazy %-formatting; passing extra positional
        # args to a message with no placeholders is a formatting error.
        logging.error("HTTP Error: %s %s", e.code, url)
        return False
    except URLError as e:
        logging.error("URL Error: %s %s", e.reason, url)
        return False
    tar = tarfile.open(tar_path)
    tar.extractall(raw_path)
    tar.close()
    prop_names = [
        'rcA', 'rcB', 'rcC', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'r2',
        'zpve', 'energy_U0', 'energy_U', 'enthalpy_H', 'free_G', 'Cv'
    ]
    # Conversion factors into ASE default units (Angstrom / eV).
    conversions = [
        1., 1., 1., 1., Bohr**3 / Ang**3, Hartree / eV, Hartree / eV,
        Hartree / eV, Bohr**2 / Ang**2, Hartree / eV, Hartree / eV,
        Hartree / eV, Hartree / eV, Hartree / eV, 1.
    ]
    logging.info('Parse xyz files...')
    # Hoisted: the original re-ran os.listdir() for every progress message.
    xyz_files = os.listdir(raw_path)
    total = len(xyz_files)
    with connect(dbpath) as con:
        for i, xyzfile in enumerate(xyz_files):
            xyzfile = os.path.join(raw_path, xyzfile)
            if i % 10000 == 0:
                logging.info('Parsed: ' + str(i) + ' / ' + str(total))
            properties = {}
            tmp = os.path.join(tmpdir, 'tmp.xyz')
            with open(xyzfile, 'r') as f:
                lines = f.readlines()
                # Line 2 of a QM9 file carries the scalar properties.
                l = lines[1].split()[2:]
                for pn, p, c in zip(prop_names, l, conversions):
                    properties[pn] = float(p) * c
            with open(tmp, "wt") as fout:
                for line in lines:
                    # QM9 writes floats with Fortran-style '*^' exponents.
                    fout.write(line.replace('*^', 'e'))
            with open(tmp, 'r') as f:
                ats = list(read_xyz(f, 0))[0]
            con.write(ats, key_value_pairs=properties)
    logging.info('Done.')
    return True
def _load_data(self):
    """Download GDB-9, parse all xyz files and add them to this dataset.

    Returns:
        bool: True on success, False if the download failed.
    """
    logging.info('Downloading GDB-9 data...')
    tmpdir = tempfile.mkdtemp('gdb9')
    tar_path = os.path.join(tmpdir, 'gdb9.tar.gz')
    raw_path = os.path.join(tmpdir, 'gdb9_xyz')
    url = 'https://ndownloader.figshare.com/files/3195389'
    try:
        request.urlretrieve(url, tar_path)
        logging.info("Done.")
    except HTTPError as e:
        # BUGFIX: logging.error("msg:", a, b) is invalid — the message has
        # no %-placeholders for the extra arguments.
        logging.error("HTTP Error: %s %s", e.code, url)
        return False
    except URLError as e:
        logging.error("URL Error: %s %s", e.reason, url)
        return False
    logging.info("Extracting files...")
    tar = tarfile.open(tar_path)
    tar.extractall(raw_path)
    tar.close()
    logging.info("Done.")
    logging.info('Parse xyz files...')
    # Sort numerically by the index embedded in the file name.
    # Raw string avoids the invalid-escape warning for '\D'.
    ordered_files = sorted(os.listdir(raw_path),
                           key=lambda x: (int(re.sub(r'\D', '', x)), x))
    all_atoms = []
    all_properties = []
    for i, xyzfile in enumerate(ordered_files):
        xyzfile = os.path.join(raw_path, xyzfile)
        if (i + 1) % 10000 == 0:
            logging.info('Parsed: {:6d} / 133885'.format(i + 1))
        properties = {}
        tmp = os.path.join(tmpdir, 'tmp.xyz')
        with open(xyzfile, 'r') as f:
            lines = f.readlines()
            # Line 2 carries the scalar properties; convert per self.units.
            l = lines[1].split()[2:]
            for pn, p in zip(self.properties, l):
                properties[pn] = np.array([float(p) * self.units[pn]])
        with open(tmp, "wt") as fout:
            for line in lines:
                # Normalize Fortran-style '*^' float exponents.
                fout.write(line.replace('*^', 'e'))
        with open(tmp, 'r') as f:
            ats = list(read_xyz(f, 0))[0]
        all_atoms.append(ats)
        all_properties.append(properties)
    logging.info('Write atoms to db...')
    self.add_systems(all_atoms, all_properties)
    logging.info('Done.')
    shutil.rmtree(tmpdir)
    return True
def _load_data(self, evilmols=None):
    """Download GDB-9, parse the xyz files and add them to the dataset.

    Args:
        evilmols (np.ndarray, optional): 1-based indices of molecules to
            skip (e.g. the uncharacterized ones).

    Returns:
        bool: True when parsing finished.
    """
    logging.info("Downloading GDB-9 data...")
    tmpdir = tempfile.mkdtemp("gdb9")
    tar_path = os.path.join(tmpdir, "gdb9.tar.gz")
    raw_path = os.path.join(tmpdir, "gdb9_xyz")
    url = "https://ndownloader.figshare.com/files/3195389"
    request.urlretrieve(url, tar_path)
    logging.info("Done.")
    logging.info("Extracting files...")
    tar = tarfile.open(tar_path)
    tar.extractall(raw_path)
    tar.close()
    logging.info("Done.")
    logging.info("Parse xyz files...")
    # Sort numerically by the index embedded in the file name.
    ordered_files = sorted(
        os.listdir(raw_path), key=lambda x: (int(re.sub(r"\D", "", x)), x)
    )
    all_atoms = []
    all_properties = []
    # BUGFIX: np.int was removed in NumPy 1.24; the builtin int is the
    # documented replacement.
    irange = np.arange(len(ordered_files), dtype=int)
    if evilmols is not None:
        # evilmols is 1-based, irange is 0-based.
        irange = np.setdiff1d(irange, evilmols - 1)
    for i in irange:
        xyzfile = os.path.join(raw_path, ordered_files[i])
        if (i + 1) % 10000 == 0:
            logging.info("Parsed: {:6d} / 133885".format(i + 1))
        properties = {}
        tmp = os.path.join(tmpdir, "tmp.xyz")
        with open(xyzfile, "r") as f:
            lines = f.readlines()
            # Line 2 carries the scalar properties; convert per self.units.
            l = lines[1].split()[2:]
            for pn, p in zip(self.available_properties, l):
                properties[pn] = np.array([float(p) * self.units[pn]])
        with open(tmp, "wt") as fout:
            for line in lines:
                # Normalize Fortran-style '*^' float exponents.
                fout.write(line.replace("*^", "e"))
        with open(tmp, "r") as f:
            ats = list(read_xyz(f, 0))[0]
        all_atoms.append(ats)
        all_properties.append(properties)
    logging.info("Write atoms to db...")
    self.add_systems(all_atoms, all_properties)
    logging.info("Done.")
    shutil.rmtree(tmpdir)
    return True
def _load_data(self):
    """Download GDB-9, parse the xyz files and write them into ``qm9.db``
    under ``self.path``.

    Returns:
        bool: True on success, False if the download failed.
    """
    logging.info('Downloading GDB-9 data...')
    tmpdir = tempfile.mkdtemp('gdb9')
    tar_path = os.path.join(tmpdir, 'gdb9.tar.gz')
    raw_path = os.path.join(tmpdir, 'gdb9_xyz')
    url = 'https://ndownloader.figshare.com/files/3195389'
    try:
        request.urlretrieve(url, tar_path)
        logging.info('Done.')
    except HTTPError as e:
        # BUGFIX: logging uses lazy %-formatting; the original passed extra
        # args to a message with no placeholders.
        logging.error('HTTP Error: %s %s', e.code, url)
        return False
    except URLError as e:
        logging.error('URL Error: %s %s', e.reason, url)
        return False
    logging.info('Extracting data from tar file...')
    tar = tarfile.open(tar_path)
    tar.extractall(raw_path)
    tar.close()
    logging.info('Done.')
    logging.info('Parsing xyz files...')
    with connect(os.path.join(self.path, 'qm9.db')) as con:
        # Sort numerically by the index embedded in the file name.
        ordered_files = sorted(os.listdir(raw_path),
                               key=lambda x: (int(re.sub(r'\D', '', x)), x))
        for i, xyzfile in enumerate(ordered_files):
            xyzfile = os.path.join(raw_path, xyzfile)
            if (i + 1) % 10000 == 0:
                logging.info('Parsed: {:6d} / 133885'.format(i + 1))
            properties = {}
            tmp = os.path.join(tmpdir, 'tmp.xyz')
            with open(xyzfile, 'r') as f:
                lines = f.readlines()
                # Line 2 carries the scalar properties.
                l = lines[1].split()[2:]
                for pn, p in zip(self.properties, l):
                    properties[pn] = float(p) * self.units[pn]
            with open(tmp, "wt") as fout:
                for line in lines:
                    # Normalize Fortran-style '*^' float exponents.
                    fout.write(line.replace('*^', 'e'))
            with open(tmp, 'r') as f:
                ats = list(read_xyz(f, 0))[0]
            con.write(ats, data=properties)
    logging.info('Done.')
    shutil.rmtree(tmpdir)
    return True
def parse_extxyz(dbpath, xyzpath):
    r"""Parses file in XYZ format and writes content to sqllite database.

    Args:
        dbpath (str): path to sqllite database.
        xyzpath (str): path to file with xyz file format.
    """
    with connect(dbpath, use_lock_file=False) as conn:
        with open(xyzpath) as f:
            for at in read_xyz(f, index=slice(None)):
                energy = at.get_total_energy()
                # BUGFIX: the original rebound ``f`` (the open file handle)
                # to the forces array inside the loop — a shadowing bug
                # that would break any later use of the handle.
                forces = at.get_forces()
                conn.write(at, data={ExtXYZ.E: energy, ExtXYZ.F: forces})
def _load_data(self, evilmols=None):
    """Parse QM9 xyz files from a fixed local directory and add them to the
    dataset, tagging each atom with its 1-based index.

    Args:
        evilmols (np.ndarray, optional): 1-based indices of molecules to
            skip.

    Returns:
        bool: True when done.
    """
    tmpdir = tempfile.mkdtemp('gdb9')
    raw_path = os.path.join(r'../../database/qm9')
    logging.info('Parse xyz files...')
    # Sort numerically by the index embedded in the file name.
    ordered_files = sorted(os.listdir(raw_path),
                           key=lambda x: (int(re.sub(r'\D', '', x)), x))
    all_atoms = []
    all_properties = []
    # BUGFIX: np.int was removed in NumPy 1.24; builtin int replaces it.
    irange = np.arange(len(ordered_files), dtype=int)
    if evilmols is not None:
        # evilmols is 1-based, irange is 0-based.
        irange = np.setdiff1d(irange, evilmols - 1)
    for i in irange:
        xyzfile = os.path.join(raw_path, ordered_files[i])
        if (i + 1) % 10000 == 0:
            logging.info('Parsed: {:6d}'.format(i + 1))
        properties = {}
        tmp = os.path.join(tmpdir, 'tmp.xyz')
        with open(xyzfile, 'r') as f:
            lines = f.readlines()
            # Line 2 carries the scalar properties (stored unconverted).
            l = lines[1].split()[2:]
            for pn, p in zip(QM9.available_properties, l):
                properties[pn] = np.array([float(p)])  # * self.units[pn]])
        with open(tmp, "wt") as fout:
            have_tag = False
            for line in lines:
                # Normalize Fortran-style '*^' float exponents.
                fout.write(line.replace('*^', 'e'))
                if not have_tag:
                    # The first line holds the atom count; tag atoms 1..N.
                    tags = list(range(1, int(line) + 1))
                    have_tag = True
        with open(tmp, 'r') as f:
            ats = list(read_xyz(f, 0))[0]
        ats.set_tags(tags=tags)
        all_atoms.append(ats)
        all_properties.append(properties)
    logging.info('Write atoms to db...')
    self.add_systems(all_atoms, all_properties)
    logging.info('Done.')
    shutil.rmtree(tmpdir)
    return True
def extxyz_to_db(extxyz_path, db_path):
    r"""Convert an extxyz file into an ASE sqlite database.

    Args:
        extxyz_path (str): path to the extxyz file.
        db_path (str): path to the sqlite database to write.
    """
    with connect(db_path, use_lock_file=False) as conn, open(extxyz_path) as f:
        for at in tqdm(read_xyz(f, index=slice(None)), "creating ase db"):
            # Store forces (when present) plus everything from the
            # structure's info dictionary.
            payload = {}
            if at.has("forces"):
                payload["forces"] = at.get_forces()
            payload.update(at.info)
            conn.write(at, data=payload)
def read_xyz(fileobj, property_names=None, idx=None):
    """Read all frames of an (ext)xyz file and flatten them into arrays.

    Args:
        fileobj (str): path to the xyz file — it is open()'ed internally,
            so a path (not a file handle) is expected here.
        property_names (list, optional): info keys to collect per frame;
            'a' or 'all' collects every key found in a frame's info dict.
        idx (sequence, optional): subset of frame indices to keep.

    Returns:
        tuple: ``(nas, zs, coords, nsheav, props)`` — per-frame atom
        counts, flattened atomic numbers, flattened positions, per-frame
        heavy-atom counts, and the collected property lists.
    """
    props = {}
    zs = []
    coords = []
    nas = []
    nsheav = []
    # Each frame starts with a line containing only the atom count; count
    # those lines to learn the number of molecules.
    # BUGFIX: the counting handle was never closed (resource leak); also
    # use a raw-string regex and the equivalent '\d+'.
    with open(fileobj) as fh:
        nm = len(re.findall(r'^\s*\d+$', fh.read(), re.MULTILINE))
    index = slice(0, nm)
    _ms = []
    for i, mi in enumerate(
            rx.read_xyz(fileobj, index=index,
                        properties_parser=rx.key_val_str_to_dict_regex)):
        _ms.append(mi)
    if idx is not None:
        ms = [_ms[im] for im in idx]
    else:
        ms = _ms
    for mi in ms:
        nas.append(len(mi))
        # Heavy atoms = anything other than hydrogen (Z > 1).
        nsheav.append((mi.numbers > 1).sum())
        zs += list(mi.numbers)
        coords += list(mi.positions)
        if property_names:
            if ('a' in property_names) or ('all' in property_names):
                property_names = list(mi.info.keys())
            for key in property_names:
                # Accumulate one value per frame for each requested key.
                props.setdefault(key, []).append(mi.info[key])
    return np.array(nas, int), np.array(zs, int), np.array(coords), np.array(
        nsheav, int), props
def parse_xyz(tmp_dir):#, dbpath):
    """Parse QM9 xyz files from *tmp_dir*, convert their properties and
    collect neighbor data for a MongoDB collection.

    NOTE(review): ``data`` is built but never written to ``conn`` in the
    visible code — the insert appears to be missing or lives outside this
    view.  Also note that ``tmp.xyz`` is created inside the directory being
    listed, so a re-run may pick it up as input; verify intent.

    Args:
        tmp_dir (str): directory containing the extracted QM9 xyz files.
    """
    client = MongoClient()
    db = client.mydb
    conn = db.my_collection
    prop_names = ['rcA', 'rcB', 'rcC', 'mu', 'alpha', 'h**o', 'lumo',
                  'gap', 'r2', 'zpve', 'energy_U0', 'energy_U',
                  'enthalpy_H', 'free_G', 'Cv']
    # Conversion factors into ASE default units (Angstrom / eV).
    conversions = [1., 1., 1., 1., Bohr ** 3 / Ang ** 3, Hartree / eV,
                   Hartree / eV, Hartree / eV, Bohr ** 2 / Ang ** 2,
                   Hartree / eV, Hartree / eV, Hartree / eV, Hartree / eV,
                   Hartree / eV, 1.]
    for i, xyzfile in enumerate(os.listdir(tmp_dir)):
        xyzfile = os.path.join(tmp_dir, xyzfile)
        if i % 10000 == 0:
            log.info(str(i) + "/133885 parsed.")
        #if i == 500:
        #    break
        properties = {}
        tmp = os.path.join(tmp_dir, 'tmp.xyz')
        with open(xyzfile, 'r') as f:
            lines = f.readlines()
            # Line 2 of a QM9 file carries the scalar properties.
            l = lines[1].split()[2:]
            for pn, p, c in zip(prop_names, l, conversions):
                properties[pn] = float(p) * c
        with open(tmp, 'wt') as fout:
            for line in lines:
                # QM9 writes floats with Fortran-style '*^' exponents.
                fout.write(line.replace('*^', 'e'))
        with open(tmp, 'r') as f:
            atoms = list(read_xyz(f, 0))[0]
            # Neighbor lists within a 20.0 cutoff (units per collect_neighbors).
            idx_ik, seg_i, idx_j, idx_jk, seg_j, offset, ratio_j = collect_neighbors(atoms, 20.)
            data = {'_idx_ik': idx_ik, '_idx_jk': idx_jk, '_idx_j': idx_j,
                    '_seg_i': seg_i, '_seg_j': seg_j, '_offset': offset,
                    '_ratio_j': ratio_j}
def read(filename, index=None, format=None):
    """Read Atoms object(s) from file.

    filename: str
        Name of the file to read from.
    index: int or slice
        If the file contains several configurations, the last configuration
        will be returned by default.  Use index=n to get configuration
        number n (counting from zero).
    format: str
        Used to specify the file-format.  If not given, the
        file-format will be guessed by the *filetype* function.

    Known formats:

    =========================  =============
    format                     short name
    =========================  =============
    GPAW restart-file          gpw
    Dacapo netCDF output file  dacapo
    Old ASE netCDF trajectory  nc
    Virtual Nano Lab file      vnl
    ASE pickle trajectory      traj
    ASE bundle trajectory      bundle
    GPAW text output           gpaw-text
    CUBE file                  cube
    XCrySDen Structure File    xsf
    Dacapo text output         dacapo-text
    XYZ-file                   xyz
    VASP POSCAR/CONTCAR file   vasp
    VASP OUTCAR file           vasp_out
    SIESTA STRUCT file         struct_out
    ABINIT input file          abinit
    V_Sim ascii file           v_sim
    Protein Data Bank          pdb
    CIF-file                   cif
    FHI-aims geometry file     aims
    FHI-aims output file       aims_out
    VTK XML Image Data         vti
    VTK XML Structured Grid    vts
    VTK XML Unstructured Grid  vtu
    TURBOMOLE coord file       tmol
    TURBOMOLE gradient file    tmol-gradient
    exciting input             exi
    AtomEye configuration      cfg
    WIEN2k structure file      struct
    DftbPlus input file        dftb
    CASTEP geom file           cell
    CASTEP output file         castep
    CASTEP trajectory file     geom
    ETSF format                etsf.nc
    DFTBPlus GEN format        gen
    CMR db/cmr-file            db
    CMR db/cmr-file            cmr
    LAMMPS dump file           lammps
    EON reactant.con file      eon
    Gromacs coordinates        gro
    Gaussian com (input) file  gaussian
    Gaussian output file       gaussian_out
    Quantum espresso in file   esp_in
    Quantum espresso out file  esp_out
    Extended XYZ file          extxyz
    NWChem input file          nw
    =========================  =============
    """
    # An explicit '@index' suffix on the filename overrides the index arg.
    if isinstance(filename, str) and ('.json@' in filename or
                                      '.db@' in filename or
                                      filename.startswith('pg://') and
                                      '@' in filename):
        filename, index = filename.rsplit('@', 1)
        if index.isdigit():
            index = int(index)
    else:
        if isinstance(filename, str):
            p = filename.rfind('@')
            if p != -1:
                try:
                    index = string2index(filename[p + 1:])
                except ValueError:
                    pass
                else:
                    filename = filename[:p]
    if isinstance(index, str):
        index = string2index(index)
    if format is None:
        # Guess the format from the file content / extension.
        format = filetype(filename)
    if format.startswith('gpw'):
        # GPAW restart files carry positions, cell and calculator results;
        # reconstruct the Atoms plus a single-point calculator from them.
        import gpaw
        r = gpaw.io.open(filename, 'r')
        positions = r.get('CartesianPositions') * Bohr
        numbers = r.get('AtomicNumbers')
        cell = r.get('UnitCell') * Bohr
        pbc = r.get('BoundaryConditions')
        tags = r.get('Tags')
        magmoms = r.get('MagneticMoments')
        energy = r.get('PotentialEnergy') * Hartree
        if r.has_array('CartesianForces'):
            forces = r.get('CartesianForces') * Hartree / Bohr
        else:
            forces = None
        atoms = Atoms(positions=positions, numbers=numbers, cell=cell,
                      pbc=pbc)
        if tags.any():
            atoms.set_tags(tags)
        if magmoms.any():
            atoms.set_initial_magnetic_moments(magmoms)
        else:
            magmoms = None
        atoms.calc = SinglePointDFTCalculator(atoms, energy=energy,
                                              forces=forces,
                                              magmoms=magmoms)
        kpts = []
        if r.has_array('IBZKPoints'):
            for w, kpt, eps_n, f_n in zip(r.get('IBZKPointWeights'),
                                          r.get('IBZKPoints'),
                                          r.get('Eigenvalues'),
                                          r.get('OccupationNumbers')):
                kpts.append(
                    SinglePointKPoint(w, kpt[0], kpt[1], eps_n[0], f_n[0]))
        atoms.calc.kpts = kpts
        return atoms
    if format in ['json', 'db', 'postgresql']:
        from ase.db.core import connect, dict2atoms
        if index == slice(None, None):
            index = None
        images = [
            dict2atoms(d) for d in connect(filename, format).select(index)
        ]
        if len(images) == 1:
            return images[0]
        else:
            return images
    if index is None:
        index = -1
    # Per-format dispatch; readers are imported lazily to keep import cheap.
    if format == 'castep':
        from ase.io.castep import read_castep
        return read_castep(filename, index)
    if format == 'castep_cell':
        import ase.io.castep
        return ase.io.castep.read_cell(filename, index)
    if format == 'castep_geom':
        import ase.io.castep
        return ase.io.castep.read_geom(filename, index)
    if format == 'exi':
        from ase.io.exciting import read_exciting
        return read_exciting(filename, index)
    if format in ['xyz', 'extxyz']:
        from ase.io.extxyz import read_xyz
        return read_xyz(filename, index)
    if format == 'traj':
        from ase.io.trajectory import read_trajectory
        return read_trajectory(filename, index)
    if format == 'bundle':
        from ase.io.bundletrajectory import read_bundletrajectory
        return read_bundletrajectory(filename, index)
    if format == 'cube':
        from ase.io.cube import read_cube
        return read_cube(filename, index)
    if format == 'nc':
        from ase.io.netcdf import read_netcdf
        return read_netcdf(filename, index)
    if format == 'gpaw-text':
        from ase.io.gpawtext import read_gpaw_text
        return read_gpaw_text(filename, index)
    if format == 'dacapo-text':
        from ase.io.dacapo import read_dacapo_text
        return read_dacapo_text(filename)
    if format == 'dacapo':
        from ase.io.dacapo import read_dacapo
        return read_dacapo(filename)
    if format == 'xsf':
        from ase.io.xsf import read_xsf
        return read_xsf(filename, index)
    if format == 'vasp':
        from ase.io.vasp import read_vasp
        return read_vasp(filename)
    if format == 'vasp_out':
        from ase.io.vasp import read_vasp_out
        return read_vasp_out(filename, index)
    if format == 'abinit':
        from ase.io.abinit import read_abinit
        return read_abinit(filename)
    if format == 'v_sim':
        from ase.io.v_sim import read_v_sim
        return read_v_sim(filename)
    if format == 'mol':
        from ase.io.mol import read_mol
        return read_mol(filename)
    if format == 'pdb':
        from ase.io.pdb import read_pdb
        return read_pdb(filename, index)
    if format == 'cif':
        from ase.io.cif import read_cif
        return read_cif(filename, index)
    if format == 'struct':
        from ase.io.wien2k import read_struct
        return read_struct(filename)
    if format == 'struct_out':
        from ase.io.siesta import read_struct
        return read_struct(filename)
    if format == 'vti':
        from ase.io.vtkxml import read_vti
        return read_vti(filename)
    if format == 'vts':
        from ase.io.vtkxml import read_vts
        return read_vts(filename)
    if format == 'vtu':
        from ase.io.vtkxml import read_vtu
        return read_vtu(filename)
    if format == 'aims':
        from ase.io.aims import read_aims
        return read_aims(filename)
    if format == 'aims_out':
        from ase.io.aims import read_aims_output
        return read_aims_output(filename, index)
    if format == 'iwm':
        from ase.io.iwm import read_iwm
        return read_iwm(filename)
    if format == 'Cmdft':
        from ase.io.cmdft import read_I_info
        return read_I_info(filename)
    if format == 'tmol':
        from ase.io.turbomole import read_turbomole
        return read_turbomole(filename)
    if format == 'tmol-gradient':
        from ase.io.turbomole import read_turbomole_gradient
        return read_turbomole_gradient(filename)
    if format == 'cfg':
        from ase.io.cfg import read_cfg
        return read_cfg(filename)
    if format == 'dftb':
        from ase.io.dftb import read_dftb
        return read_dftb(filename)
    if format == 'sdf':
        from ase.io.sdf import read_sdf
        return read_sdf(filename)
    if format == 'etsf':
        from ase.io.etsf import ETSFReader
        return ETSFReader(filename).read_atoms()
    if format == 'gen':
        from ase.io.gen import read_gen
        return read_gen(filename)
    if format == 'cmr':
        from ase.io.cmr_io import read_db
        return read_db(filename, index)
    if format == 'lammps':
        from ase.io.lammpsrun import read_lammps_dump
        return read_lammps_dump(filename, index)
    if format == 'eon':
        from ase.io.eon import read_reactant_con
        return read_reactant_con(filename)
    if format == 'gromacs':
        from ase.io.gromacs import read_gromacs
        return read_gromacs(filename)
    if format == 'gaussian':
        from ase.io.gaussian import read_gaussian
        return read_gaussian(filename)
    if format == 'gaussian_out':
        from ase.io.gaussian import read_gaussian_out
        return read_gaussian_out(filename, index)
    if format == 'esp_in':
        from ase.io.espresso import read_espresso_in
        return read_espresso_in(filename)
    if format == 'esp_out':
        from ase.io.espresso import read_espresso_out
        return read_espresso_out(filename, index)
    if format == 'nw':
        from ase.io.nwchem import read_nwchem_input
        return read_nwchem_input(filename)
    raise RuntimeError('File format descriptor ' + format +
                       ' not recognized!')
def read(filename, index=None, format=None):
    """Read Atoms object(s) from file.

    filename: str
        Name of the file to read from.
    index: int or slice
        If the file contains several configurations, the last configuration
        will be returned by default.  Use index=n to get configuration
        number n (counting from zero).
    format: str
        Used to specify the file-format.  If not given, the
        file-format will be guessed by the *filetype* function.

    Known formats:

    =========================  =============
    format                     short name
    =========================  =============
    GPAW restart-file          gpw
    Dacapo netCDF output file  dacapo
    Old ASE netCDF trajectory  nc
    Virtual Nano Lab file      vnl
    ASE pickle trajectory      traj
    ASE bundle trajectory      bundle
    GPAW text output           gpaw-text
    CUBE file                  cube
    XCrySDen Structure File    xsf
    Dacapo text output         dacapo-text
    XYZ-file                   xyz
    VASP POSCAR/CONTCAR file   vasp
    VASP OUTCAR file           vasp_out
    VASP XDATCAR file          vasp_xdatcar
    SIESTA STRUCT file         struct_out
    ABINIT input file          abinit
    V_Sim ascii file           v_sim
    Protein Data Bank          pdb
    CIF-file                   cif
    FHI-aims geometry file     aims
    FHI-aims output file       aims_out
    VTK XML Image Data         vti
    VTK XML Structured Grid    vts
    VTK XML Unstructured Grid  vtu
    TURBOMOLE coord file       tmol
    TURBOMOLE gradient file    tmol-gradient
    exciting input             exi
    AtomEye configuration      cfg
    WIEN2k structure file      struct
    DftbPlus input file        dftb
    CASTEP geom file           cell
    CASTEP output file         castep
    CASTEP trajectory file     geom
    ETSF format                etsf.nc
    DFTBPlus GEN format        gen
    CMR db/cmr-file            db
    CMR db/cmr-file            cmr
    LAMMPS dump file           lammps
    EON reactant.con file      eon
    Gromacs coordinates        gro
    Gaussian com (input) file  gaussian
    Gaussian output file       gaussian_out
    Quantum espresso in file   esp_in
    Quantum espresso out file  esp_out
    Extended XYZ file          extxyz
    NWChem input file          nw
    Materials Studio file      xsd
    =========================  =============

    Many formats allow on open file-like object to be passed instead
    of ``filename``. In this case the format cannot be auto-decected,
    so the ``format`` argument should be explicitly given.
    """
    # An explicit '@index' suffix on the filename overrides the index arg.
    if isinstance(filename, str) and (
            '.json@' in filename or
            '.db@' in filename or
            filename.startswith('pg://') and
            '@' in filename):
        filename, index = filename.rsplit('@', 1)
        if index.isdigit():
            index = int(index)
    else:
        if isinstance(filename, str):
            p = filename.rfind('@')
            if p != -1:
                try:
                    index = string2index(filename[p + 1:])
                except ValueError:
                    pass
                else:
                    filename = filename[:p]
    if isinstance(index, str):
        index = string2index(index)
    if format is None:
        # Guess the format from the file content / extension.
        format = filetype(filename)
    if format.startswith('gpw'):
        # GPAW restart files carry positions, cell and calculator results;
        # reconstruct the Atoms plus a single-point calculator from them.
        import gpaw
        r = gpaw.io.open(filename, 'r')
        positions = r.get('CartesianPositions') * Bohr
        numbers = r.get('AtomicNumbers')
        cell = r.get('UnitCell') * Bohr
        pbc = r.get('BoundaryConditions')
        tags = r.get('Tags')
        magmoms = r.get('MagneticMoments')
        energy = r.get('PotentialEnergy') * Hartree
        if r.has_array('CartesianForces'):
            forces = r.get('CartesianForces') * Hartree / Bohr
        else:
            forces = None
        atoms = Atoms(positions=positions, numbers=numbers, cell=cell,
                      pbc=pbc)
        if tags.any():
            atoms.set_tags(tags)
        if magmoms.any():
            atoms.set_initial_magnetic_moments(magmoms)
        else:
            magmoms = None
        atoms.calc = SinglePointDFTCalculator(atoms, energy=energy,
                                              forces=forces,
                                              magmoms=magmoms)
        kpts = []
        if r.has_array('IBZKPoints'):
            for w, kpt, eps_n, f_n in zip(r.get('IBZKPointWeights'),
                                          r.get('IBZKPoints'),
                                          r.get('Eigenvalues'),
                                          r.get('OccupationNumbers')):
                kpts.append(SinglePointKPoint(w, kpt[0], kpt[1],
                                              eps_n[0], f_n[0]))
        atoms.calc.kpts = kpts
        return atoms
    if format in ['json', 'db', 'postgresql']:
        if index == slice(None, None):
            index = None
        from ase.db.core import connect
        images = [row.toatoms()
                  for row in connect(filename, format).select(index)]
        if len(images) == 1:
            return images[0]
        else:
            return images
    if index is None:
        index = -1
    # Per-format dispatch; readers are imported lazily to keep import cheap.
    if format == 'castep':
        from ase.io.castep import read_castep
        return read_castep(filename, index)
    if format == 'castep_cell':
        import ase.io.castep
        return ase.io.castep.read_cell(filename, index)
    if format == 'castep_geom':
        import ase.io.castep
        return ase.io.castep.read_geom(filename, index)
    if format == 'exi':
        from ase.io.exciting import read_exciting
        return read_exciting(filename, index)
    if format in ['xyz', 'extxyz']:
        from ase.io.extxyz import read_xyz
        return read_xyz(filename, index)
    if format == 'traj':
        from ase.io.trajectory import read_trajectory
        return read_trajectory(filename, index)
    if format == 'trj':
        from ase.io.pickletrajectory import read_trajectory
        return read_trajectory(filename, index)
    if format == 'bundle':
        from ase.io.bundletrajectory import read_bundletrajectory
        return read_bundletrajectory(filename, index)
    if format == 'cube':
        from ase.io.cube import read_cube
        return read_cube(filename, index)
    if format == 'nc':
        from ase.io.netcdf import read_netcdf
        return read_netcdf(filename, index)
    if format == 'gpaw-text':
        from ase.io.gpawtext import read_gpaw_text
        return read_gpaw_text(filename, index)
    if format == 'dacapo-text':
        from ase.io.dacapo import read_dacapo_text
        return read_dacapo_text(filename)
    if format == 'dacapo':
        from ase.io.dacapo import read_dacapo
        return read_dacapo(filename)
    if format == 'xsf':
        from ase.io.xsf import read_xsf
        return read_xsf(filename, index)
    if format == 'vasp':
        from ase.io.vasp import read_vasp
        return read_vasp(filename)
    if format == 'vasp_out':
        from ase.io.vasp import read_vasp_out
        return read_vasp_out(filename, index)
    if format == 'vasp_xdatcar':
        from ase.io.vasp import read_vasp_xdatcar
        return read_vasp_xdatcar(filename, index)
    if format == 'abinit':
        from ase.io.abinit import read_abinit
        return read_abinit(filename)
    if format == 'v_sim':
        from ase.io.v_sim import read_v_sim
        return read_v_sim(filename)
    if format == 'mol':
        from ase.io.mol import read_mol
        return read_mol(filename)
    if format == 'pdb':
        from ase.io.pdb import read_pdb
        return read_pdb(filename, index)
    if format == 'cif':
        from ase.io.cif import read_cif
        return read_cif(filename, index)
    if format == 'struct':
        from ase.io.wien2k import read_struct
        return read_struct(filename)
    if format == 'struct_out':
        from ase.io.siesta import read_struct
        return read_struct(filename)
    if format == 'vti':
        from ase.io.vtkxml import read_vti
        return read_vti(filename)
    if format == 'vts':
        from ase.io.vtkxml import read_vts
        return read_vts(filename)
    if format == 'vtu':
        from ase.io.vtkxml import read_vtu
        return read_vtu(filename)
    if format == 'aims':
        from ase.io.aims import read_aims
        return read_aims(filename)
    if format == 'aims_out':
        from ase.io.aims import read_aims_output
        return read_aims_output(filename, index)
    if format == 'iwm':
        from ase.io.iwm import read_iwm
        return read_iwm(filename)
    if format == 'Cmdft':
        from ase.io.cmdft import read_I_info
        return read_I_info(filename)
    if format == 'tmol':
        from ase.io.turbomole import read_turbomole
        return read_turbomole(filename)
    if format == 'tmol-gradient':
        from ase.io.turbomole import read_turbomole_gradient
        return read_turbomole_gradient(filename)
    if format == 'cfg':
        from ase.io.cfg import read_cfg
        return read_cfg(filename)
    if format == 'dftb':
        from ase.io.dftb import read_dftb
        return read_dftb(filename)
    if format == 'sdf':
        from ase.io.sdf import read_sdf
        return read_sdf(filename)
    if format == 'etsf':
        from ase.io.etsf import ETSFReader
        return ETSFReader(filename).read_atoms()
    if format == 'gen':
        from ase.io.gen import read_gen
        return read_gen(filename)
    if format == 'cmr':
        from ase.io.cmr_io import read_db
        return read_db(filename, index)
    if format == 'lammps':
        from ase.io.lammpsrun import read_lammps_dump
        return read_lammps_dump(filename, index)
    if format == 'eon':
        from ase.io.eon import read_reactant_con
        return read_reactant_con(filename)
    if format == 'gromacs':
        from ase.io.gromacs import read_gromacs
        return read_gromacs(filename)
    if format == 'gaussian':
        from ase.io.gaussian import read_gaussian
        return read_gaussian(filename)
    if format == 'gaussian_out':
        from ase.io.gaussian import read_gaussian_out
        return read_gaussian_out(filename, index)
    if format == 'esp_in':
        from ase.io.espresso import read_espresso_in
        return read_espresso_in(filename)
    if format == 'esp_out':
        from ase.io.espresso import read_espresso_out
        return read_espresso_out(filename, index)
    if format == 'nw':
        from ase.io.nwchem import read_nwchem_input
        return read_nwchem_input(filename)
    if format == 'xsd':
        from ase.io.xsd import read_xsd
        return read_xsd(filename)
    raise RuntimeError('File format descriptor ' + format +
                       ' not recognized!')
# Echo the parsed configuration sections for the run log.
print("Parameters for main part")
pprint.pprint(input_main)
print("Parameters for ase part")
pprint.pprint(input_ase)
print("Parameters for solver part")
pprint.pprint(input_sol)
print("Pseudopotentials")
pprint.pprint(pseudopotentials)

### Read/Write file names ###
file_name = input_main["input_xyz_file"]
wfile_name = input_main["output_file_head"]

from ase.io.extxyz import read_xyz, write_xyz

# NOTE(review): the handle passed to read_xyz is never closed — consider a
# `with open(...)` block; left as-is in this doc-only pass.
atoms = read_xyz(open(file_name), index=0)
# Concatenate everything read_xyz yields into a single Atoms object.
atoms_info = Atoms()
for atom in atoms:
    atoms_info += atom
ratoms = atoms_info.get_positions()   # (natoms, 3) Cartesian positions
satoms = atoms_info.symbols
natoms = ratoms.shape[0]

# Slab extent along z (assumes the slab normal is the z axis — confirm).
z_max = np.max(ratoms, axis=0)[2]
z_min = np.min(ratoms, axis=0)[2]
slab_size = z_max - z_min
print('slab_size=', slab_size)

z_margin = input_main["param"]["z_margin"]
z_atoms = ratoms[:, 2]
# Window of width 2*z_margin around the bottom-most atomic layer.
z_bottom_most = [z_min - z_margin, z_min + z_margin]
def unpack_xyz_str_to_results(data):
    """Parse a single-molecule XYZ string and return ``(label, atoms)``.

    Args:
        data (str): xyz-format text for one molecule; its comment line is
            expected to populate ``Atoms.info['label']``.

    Returns:
        tuple: ``(label, at)`` where ``at`` is the parsed structure.
    """
    # BUGFIX: StringIO.StringIO is Python-2 only; io.StringIO is the
    # Python-3 replacement (the rest of this file already uses Python 3
    # APIs such as urllib.request).
    import io
    buffer = io.StringIO(data)
    at = read_xyz(buffer)
    buffer.close()
    label = at.info['label']
    return (label, at)
def _load_data(
        self,
        xyzpath,
        evilmols=None,
):
    """Recursively parse symmetry-labelled xyz files under *xyzpath* and
    add them to the dataset.

    Args:
        xyzpath (str): root directory walked recursively for xyz files.
        evilmols (np.ndarray, optional): 1-based indices of files to skip.

    Returns:
        bool: True when done.
    """
    tmpdir = tempfile.mkdtemp('sym')
    raw_path = os.path.join(xyzpath)
    ordered_files = []
    logging.info('Parse xyz files...')
    for fpathe, dirs, fs in os.walk(raw_path):
        for f in fs:
            ordered_files.append(os.path.join(fpathe, f))
    ordered_files = sorted(ordered_files)
    all_atoms = []
    all_properties = []
    # BUGFIX: np.int was removed in NumPy 1.24; builtin int replaces it.
    irange = np.arange(len(ordered_files), dtype=int)
    if evilmols is not None:
        # evilmols is 1-based, irange is 0-based.
        irange = np.setdiff1d(irange, evilmols - 1)
    # Map symmetry labels to integer codes.
    sym_dict = dict((c, i) for i, c in enumerate(
        ['BU', 'BG', 'AU', 'AG', 'EU', 'EG', 'A"', 'E"', "A'", "E'"]))
    # parse XYZ file
    for i in irange:
        xyzfile = ordered_files[i]
        if (i + 1) % 1000 == 0:
            logging.info('Parsed: {:6d}'.format(i + 1))
        properties = {}
        # Pre-bind so the error report below cannot hit an unbound name.
        info = None
        try:
            with open(xyzfile, 'r') as f:
                lines = f.readlines()
                # Comment line: '|'-separated property fields.
                info = lines[1].strip().split('|')
                for j in range(1, 20):
                    properties[self.required_properties[j - 1]] = np.array(
                        float(info[j])).reshape(1)
                for j in range(20, 32):  # degeneracy
                    properties[self.required_properties[j - 1]] = np.array(
                        int(info[j])).reshape(1)
                for j in range(32, 44):  # symmetry
                    properties[self.required_properties[j - 1]] = np.array(
                        sym_dict[info[j]]).reshape(1)

                def to_idx(idx_str):
                    return int(idx_str[1:])

                # The tag for each atom indicates the index of the atom
                # in its primitive cell.
                tags = list(map(to_idx, info[44].strip().split(' ')))
            tmp = os.path.join(tmpdir, 'tmp.xyz')
            with open(tmp, 'wt') as fout:
                fout.write(lines[0])
                fout.write('**\n')
                for line in lines[2:]:
                    fout.write(line[:line.rfind(' ')] + '\n')  # remove charge
            with open(tmp, 'r') as ftmp:
                ats = list(read_xyz(ftmp, 0))[0]
            ats.set_tags(tags=tags)
            all_atoms.append(ats)
            all_properties.append(properties)
        except Exception:
            # BUGFIX: a bare ``except:`` also swallowed SystemExit /
            # KeyboardInterrupt, and printing ``info`` raised NameError
            # whenever open() itself failed.
            print(xyzfile, info)
    logging.info('Write atoms to db...')
    self.add_systems(all_atoms, all_properties)
    logging.info('Done.')
    shutil.rmtree(tmpdir)
    return True
def load_data(dbpath):
    """Parse locally extracted GDB-9 xyz files (plus per-element charge
    sums) and write them into an ASE database.

    The download/untar steps are intentionally disabled; the data is
    expected to be already extracted under ``..\\data\\gdb9_xyz``.

    Args:
        dbpath (str): path of the ASE db to write.

    Returns:
        bool: True when done.
    """
    print('Downloading GDB-9 data...')
    tmpdir = '..\\data'
    tar_path = os.path.join(tmpdir, 'dsgdb9nsd.xyz.tar.bz2')
    raw_path = os.path.join(tmpdir, 'gdb9_xyz')
    # Download/extraction disabled — data already on disk:
    # url = 'https://ndownloader.figshare.com/files/3195389'
    # urllib.request.urlretrieve(url, tar_path)
    # tarfile.open(tar_path).extractall(raw_path)
    basic_atoms = [
        'Atom_H', 'Atom_C', 'Atom_N', 'Atom_O', 'Atom_F', 'Atom_P',
        'Atom_S', 'Atom_Cl', 'Atom_Br', 'Atom_I'
    ]
    prop_names = [
        'rcA', 'rcB', 'rcC', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'r2',
        'zpve', 'energy_U0', 'energy_U', 'enthalpy_H', 'free_G', 'Cv'
    ]
    # Conversion factors into ASE default units (Angstrom / eV).
    conversions = [
        1., 1., 1., 1., Bohr**3 / Ang**3, Hartree / eV, Hartree / eV,
        Hartree / eV, Bohr**2 / Ang**2, Hartree / eV, Hartree / eV,
        Hartree / eV, Hartree / eV, Hartree / eV, 1.
    ]
    print('Parse xyz files...')
    with connect(dbpath) as con:
        for i, xyzfile in enumerate(os.listdir(raw_path)):
            # File layout: line 0 = atom count, line 1 = properties, then
            # one "symbol x y z charge" line per atom.
            xyzfile = os.path.join(raw_path, xyzfile)
            if i % 10000 == 0:
                print('Parsed: ' + str(i) + ' / 133885')
            properties = {}
            charges = {a: 0 for a in basic_atoms}
            tmp = os.path.join(tmpdir, 'tmp.xyz')
            with open(xyzfile, 'r') as f:
                lines = f.readlines()
                # Line 2 carries the scalar properties.
                l = lines[1].split()[2:]
                for pn, p, c in zip(prop_names, l, conversions):
                    properties[pn] = float(p) * c
            with open(tmp, "wt") as fout:
                for line in lines:
                    # Normalize Fortran-style '*^' float exponents.
                    fout.write(line.replace('*^', 'e'))
            with open(tmp, 'r') as f:
                lines = f.readlines()
                cnt = int(lines[0])
                # Sum the partial charges per element.
                atoms = lines[2:cnt + 2]
                for atom in atoms:
                    a, _, _, _, c = atom.split()
                    a = 'Atom_' + a
                    charges[a] += float(c)
                properties.update(charges)
                # BUGFIX: readlines() left the handle at EOF, so read_xyz
                # saw an empty stream and list(...)[0] raised IndexError;
                # rewind before handing the handle over.
                f.seek(0)
                ats = list(read_xyz(f, 0))[0]
            con.write(ats, key_value_pairs=properties)
    print('Done.')
    return True