def mset_to_smiles(max_atoms, filename):
    """Attach a SMILES identifier to the MSet stored in ``filename``.

    The MSet is loaded, its ``get_min_geom`` geometry is handed to
    ``xyz2mol`` together with a charge derived from the per-atom charge
    labels, and the resulting molecule is written out as non-isomeric
    SMILES.  The SMILES is stored under ``identifiers["smiles"]`` and the
    MSet is saved back to ``filename``.

    Args:
        max_atoms: MSets with more atoms than this are skipped.
        filename: Path the MSet is loaded from and saved back to.

    Returns:
        ``(filename, smiles)`` on success, ``(None, None)`` when the file
        cannot be decoded as JSON, the MSet is larger than ``max_atoms``,
        or RDKit reports an atom valence problem.
    """
    skipped = (None, None)

    molecule_set = MSet()
    try:
        molecule_set.load(filename)
    except json.JSONDecodeError:
        print("json decode error")
        return skipped

    if molecule_set.n_atoms > max_atoms:
        return skipped

    # NOTE(review): get_min_geom is read as an attribute, not called --
    # presumably a property on MSet; confirm.
    min_geom = molecule_set.get_min_geom

    # Prefer the first label; fall back to the Mulliken label when any atom
    # lacks it.
    try:
        atomic_charges = [
            float(atom.labels['wB97X-D.6-311g**.charges'])
            for atom in min_geom.atoms
        ]
    except KeyError:
        atomic_charges = [
            float(atom.labels['wb97x-d.6-311gss.mulliken_charges'])
            for atom in min_geom.atoms
        ]

    # NOTE(review): this is the *mean* per-atom charge, not the sum --
    # confirm that is what xyz2mol should receive as the charge.
    mean_charge = sum(atomic_charges) / len(atomic_charges)

    try:
        mol = xyz2mol(
            min_geom.at_nums,
            [atom.xyz for atom in min_geom.atoms],
            mean_charge,
        )
    except Chem.AtomValenceException:
        print("atom valence exception")
        return skipped

    # Round-trip through SMILES once so the stored string is the one RDKit
    # produces from a freshly parsed molecule.
    first_pass = Chem.MolToSmiles(mol, isomericSmiles=False)
    reparsed = Chem.MolFromSmiles(first_pass)
    smiles = Chem.MolToSmiles(reparsed, isomericSmiles=False)

    molecule_set.identifiers.update({"smiles": smiles})
    molecule_set.save(filename)
    return filename, smiles
def mset_to_smiles(filename):
    """Attach a SMILES identifier to the MSet stored in ``filename``.

    The MSet is loaded, ``get_min_geom`` picks a geometry from
    ``mset.geometries``, and ``geom_to_smiles`` is called on it with a
    charge of 0.  The result is stored under ``identifiers["smiles"]`` and
    the MSet is saved back to ``filename``.

    Args:
        filename: Path the MSet is loaded from and saved back to.

    Returns:
        ``filename`` on success, ``None`` when the file cannot be decoded
        as JSON or the SMILES conversion fails.
    """
    mset = MSet()
    try:
        mset.load(filename)
    except json.JSONDecodeError:
        print("json decode error")
        return None

    geom = get_min_geom(mset.geometries)
    charge = 0
    try:
        smiles = geom_to_smiles(geom, charge)
    except Exception as err:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit and hid the reason for the failure.
        print("smiles conversion failed for ", filename, ":", err)
        return None

    mset.identifiers.update({"smiles": smiles})
    mset.save(filename)
    return filename
def mset_to_smiles(filename):
    """Attach a SMILES identifier to the MSet stored in ``filename``.

    The charge passed to ``geom_to_smiles`` is the average of the mean CM5
    and mean Hirshfeld per-atom charge labels of the ``get_min_geom``
    geometry; if those labels cannot be read or averaged the charge falls
    back to 0.  The SMILES is stored under ``identifiers["smiles"]`` and
    the MSet is saved back to ``filename``.

    Args:
        filename: Path the MSet is loaded from and saved back to.

    Returns:
        ``filename`` on success, ``None`` when the SMILES conversion fails.
    """
    mset = MSet()
    mset.load(filename)
    # NOTE(review): get_min_geom is read as an attribute, not called --
    # presumably a property on MSet; confirm.
    geom = mset.get_min_geom

    try:
        cm5_charges = [
            float(atom.labels['wb97x_dz.cm5_charges']) for atom in geom.atoms
        ]
        hirshfeld_charges = [
            float(atom.labels['wb97x_dz.hirshfeld_charges'])
            for atom in geom.atoms
        ]
        # NOTE(review): means of per-atom charges, not sums -- confirm this
        # is the charge geom_to_smiles expects.
        charge = ((sum(cm5_charges) / len(cm5_charges)) +
                  (sum(hirshfeld_charges) / len(hirshfeld_charges))) / 2
    except Exception:
        # Deliberate best-effort fallback (missing label, non-numeric value,
        # no atoms).  Narrowed from a bare ``except:`` so that
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        charge = 0

    try:
        smiles = geom_to_smiles(geom, charge)
    except Exception as err:
        # Was a bare ``except:`` that silently dropped the failure.
        print("smiles conversion failed for ", filename, ":", err)
        return None

    mset.identifiers.update({"smiles": smiles})
    mset.save(filename)
    return filename
def load_ani1x(path_to_h5file,
               data_keys=None,
               output_dir="/mnt/sdb1/adriscoll/ani1x-data/ani1x-msets"):
    """Convert each bucket of an ANI-1x HDF5 file into a saved MoleculeSet.

    For every bucket yielded by ``iter_data_buckets`` a MoleculeSet is built
    from ``data['atomic_numbers']``.  The remaining keys are split by name:
    keys containing 'energy' or 'dipole' become molecule labels, keys
    containing 'force' or 'charge' become atom labels, anything else is
    ignored.  One geometry is created from the last entry (index ``-1``) of
    ``data['coordinates']`` and of each label array, stored as the
    ``'ani.data'`` trajectory, and the MoleculeSet is saved.

    Args:
        path_to_h5file: Path forwarded to ``iter_data_buckets``.
        data_keys: Keys forwarded to ``iter_data_buckets`` as ``keys``.
            Defaults to an empty list.
        output_dir: Directory prefix for the ``ani1x-mol<i>.mset`` filename
            assigned to each MoleculeSet.  The default reproduces the path
            that used to be hard-coded.

    Returns:
        None.
    """
    # ``data_keys=[]`` in the signature would share one list across calls.
    if data_keys is None:
        data_keys = []

    # Example for extracting DFT/DZ energies and forces
    for i, data in enumerate(
            iter_data_buckets(path_to_h5file, keys=data_keys)):
        atoms = [Atom(at_num) for at_num in data['atomic_numbers'].tolist()]
        mset = MoleculeSet(atoms)
        mset.filename = output_dir + "/ani1x-mol" + str(i) + ".mset"

        mol_keys, atom_keys = [], []
        for key in data:
            if key in ('atomic_numbers', 'coordinates'):
                continue
            elif 'energy' in key or 'dipole' in key:
                mol_keys.append(key)
            elif 'force' in key or 'charge' in key:
                atom_keys.append(key)

        # NOTE(review): only the last entry of each array is kept, so every
        # MoleculeSet ends up with a single geometry -- confirm intended.
        mol_labels = {key: data[key][-1].tolist() for key in mol_keys}
        atom_labels = {key: data[key][-1].tolist() for key in atom_keys}
        geoms = [
            mset.new_geometry(data['coordinates'][-1].tolist(), mol_labels,
                              atom_labels)
        ]
        mset.trajectories['ani.data'] = geoms
        mset.save()
opt_natoms[len(opt_mol_data['atoms'])].append(opt_mol) else: opt_natoms[len(opt_mol_data['atoms'])] = [opt_mol] with open( "/mnt/sdb1/jeherr/chemspider_data/chno_msets/chno_opt_natoms.txt", "w") as f: json.dump(opt_natoms, f) opt_matches = {} meta_matches = {} for n_atoms, meta_mols in meta_natoms.items(): opt_mols = opt_natoms[n_atoms] opt_msets = [] meta_msets = [] for opt_mol in opt_mols: opt_mset = MoleculeSet() opt_mset.load(opt_mol) opt_mset.filename = opt_mol opt_msets.append(opt_mset) for meta_mol in meta_mols: meta_mset = MoleculeSet() meta_mset.load(meta_mol) meta_mset.filename = meta_mol meta_msets.append(meta_mset) for meta_mset in meta_msets: matches = [] for opt_mset in opt_msets: if meta_mset.compare_hash(opt_mset): matches.append(opt_mset.filename) meta_matches[meta_mset.filename] = matches for opt_mset in opt_msets:
else: opt_natoms[len(opt_mol_data['atoms'])] = [opt_mol] with open( "/mnt/sdb1/adriscoll/chemspider_data/expanded_msets/opt_smiles_natoms.txt", "w") as file: json.dump(opt_natoms, file) opt_matches = {} meta_matches = {} for n_atoms, meta_mols in meta_natoms.items(): if n_atoms in opt_natoms.keys(): opt_mols = opt_natoms[n_atoms] opt_msets = [] meta_msets = [] for opt_mol in opt_mols: opt_mset = MoleculeSet() opt_mset.load(opt_mol) opt_mset.filename = opt_mol opt_msets.append(opt_mset) for meta_mol in meta_mols: meta_mset = MoleculeSet() meta_mset.load(meta_mol) meta_mset.filename = meta_mol meta_msets.append(meta_mset) for meta_mset in meta_msets: matches = [] for opt_mset in opt_msets: if meta_mset.identifiers['smiles'] == opt_mset.identifiers[ 'smiles']: matches.append(opt_mset.filename) meta_matches[meta_mset.filename] = matches
def read_opt_data(filename):
    """Parse a geometry-optimization output file into a MoleculeSet.

    The file is scanned line by line.  Marker strings select which helper
    parser consumes the following lines from the same file iterator
    (``parse_sp_input``, ``parse_atoms_coords``, ``parse_energy``,
    ``parse_forces``, ``parse_dipole``, ``parse_quadrupole``,
    ``parse_charges``).  On each "Optimization Cycle" line where all
    property lists have the same length, a geometry is built with
    ``build_new_geom`` and appended to the trajectory.  The MoleculeSet
    itself is created on the cycle whose last token is "1".

    Args:
        filename: Path of the output file to read.

    Returns:
        The MoleculeSet with its ``"<method>.<basis>.opt"`` trajectory set,
        either when "OPTIMIZATION CONVERGED" is seen or at end of file.
        ``None`` when atoms cannot be parsed, a "Convergence failure" line
        is found, the MoleculeSet cannot be constructed, or no MoleculeSet
        was built before the file ended or converged.
    """
    n_atoms = None
    mset = None
    # Bound up front so "never seen" is tested explicitly instead of by
    # catching UnboundLocalError.
    method = None
    basis = None
    opt_trajectory = None
    atomic_nums = []
    coords = []
    energies = []
    forces = []
    dipoles = []
    quadrupoles = []
    charges = []
    print("Reading ", filename)
    with open(filename, "r") as f:
        try:
            while True:
                # next() rather than a for-loop: the helpers advance the same
                # iterator, and a StopIteration raised inside them must end
                # the scan too.
                line = next(f)
                if "User input:" in line:
                    n_atoms, _, method, basis = parse_sp_input(f)
                elif "Standard Nuclear Orientation" in line:
                    atom_nums, coord = parse_atoms_coords(f, n_atoms)
                    if atom_nums is None:
                        return None
                    atomic_nums.append(atom_nums)
                    coords.append(coord)
                elif "Convergence failure" in line:
                    return None
                # energy
                elif "Cycle" in line and "Energy" in line:
                    energies.append(parse_energy(f))
                # forces
                elif "Gradient of SCF Energy" in line:
                    force = parse_forces(f, n_atoms)
                    if force is not None:
                        forces.append(force)
                    else:
                        print(filename, " contains unparsed forces")
                # dipoles
                elif "Dipole Moment (Debye)" in line:
                    dipoles.append(parse_dipole(f))
                # quadrupoles
                elif "Quadrupole Moments (Debye-Ang)" in line:
                    quadrupoles.append(parse_quadrupole(f))
                # charges
                elif "Ground-State Mulliken Net Atomic Charges" in line:
                    charges.append(parse_charges(f, n_atoms))
                elif "Optimization Cycle" in line:
                    # Only record a step when every property was parsed for it.
                    if not (len(atomic_nums) == len(energies) == len(forces)
                            == len(coords) == len(dipoles)
                            == len(quadrupoles) == len(charges)):
                        continue
                    if line.split()[-1] == "1":
                        try:
                            atoms = [Atom(at_num) for at_num in atomic_nums[0]]
                            mset = MoleculeSet(atoms)
                        except Exception:
                            # Was a bare ``except:``.
                            print("Error making MSet for ", filename)
                            return None
                        opt_trajectory = []
                    if opt_trajectory is None:
                        # A later cycle arrived without cycle 1 having built
                        # the MSet; this used to raise UnboundLocalError.
                        print("No MSet built for ", filename)
                        return None
                    if len(energies) > len(opt_trajectory):
                        opt_trajectory.append(
                            build_new_geom(atomic_nums, coords, energies,
                                           forces, dipoles, quadrupoles,
                                           charges, method, basis))
                elif "OPTIMIZATION CONVERGED" in line:
                    break
        except StopIteration:
            print("Hit EOF on ", filename)

    # Shared exit for both "converged" and "hit EOF".
    if mset is None or method is None or basis is None:
        print("No MSet built for ", filename)
        return None
    mset.trajectories[".".join((method, basis, "opt"))] = opt_trajectory
    return mset
def _truncate_to_shortest(*records):
    """Return copies of ``records`` cut down to the length of the shortest."""
    min_len = min(len(record) for record in records)
    return tuple(record[:min_len] for record in records)


def read_multi_sp_data(filenames):
    """Collect single-point outputs from several files into one MoleculeSet.

    Each file is scanned line by line; marker strings select which helper
    parser consumes the following lines from the same file iterator.  The
    property lists accumulate across all files.  When the
    "Thank you very much for using Q-Chem." line is reached, the MoleculeSet
    is created if it does not exist yet, and then either a geometry is
    appended via ``build_new_geom`` (all property lists equally long and
    longer than the trajectory) or the lists are truncated to their common
    length.  The same append-or-truncate step runs when a file ends or
    raises ``UnicodeDecodeError``, provided a MoleculeSet already exists.

    Args:
        filenames: Sequence of output file paths, read in order.

    Returns:
        The MoleculeSet with its ``"<method>.<basis>.meta"`` trajectory set,
        or ``None`` when no MoleculeSet could be built (including when
        ``filenames`` is empty) or constructing it failed.
    """
    n_atoms = None
    mset = None
    atomic_nums = []
    coords = []
    energies = []
    forces = []
    dipoles = []
    quadrupoles = []
    charges = []
    for filename in filenames:
        with open(filename, "r") as f:
            try:
                while True:
                    # next() rather than a for-loop: the helpers advance the
                    # same iterator.
                    line = next(f)
                    if "User input:" in line:
                        n_atoms, _, method, basis = parse_sp_input(f)
                    elif "Standard Nuclear Orientation" in line:
                        # NOTE(review): a None atom_nums is appended as-is
                        # here -- confirm whether it should abort instead.
                        atom_nums, coord = parse_atoms_coords(f, n_atoms)
                        atomic_nums.append(atom_nums)
                        coords.append(coord)
                    # energy
                    elif "Cycle" in line and "Energy" in line:
                        energies.append(parse_energy(f))
                    # forces
                    elif "Gradient of SCF Energy" in line:
                        force = parse_forces(f, n_atoms)
                        if force is not None:
                            forces.append(force)
                        else:
                            print(filename, " contains unparsed forces")
                    # dipoles
                    elif "Dipole Moment (Debye)" in line:
                        dipoles.append(parse_dipole(f))
                    # quadrupoles
                    elif "Quadrupole Moments (Debye-Ang)" in line:
                        quadrupoles.append(parse_quadrupole(f))
                    # charges
                    elif "Ground-State Mulliken Net Atomic Charges" in line:
                        charges.append(parse_charges(f, n_atoms))
                    elif "Thank you very much for using Q-Chem." in line:
                        if mset is None:
                            print("Attempting to make initial MSet for ",
                                  filename)
                            try:
                                atoms = [
                                    Atom(at_num) for at_num in atomic_nums[0]
                                ]
                                mset = MoleculeSet(atoms)
                                meta_trajectory = []
                            except Exception:
                                # Was a bare ``except:``.
                                print("Error making MSet for ", filename)
                                return None
                        if (len(atomic_nums) == len(energies) == len(forces)
                                == len(coords) == len(dipoles)
                                == len(quadrupoles) == len(charges)):
                            if len(energies) > len(meta_trajectory):
                                meta_trajectory.append(
                                    build_new_geom(atomic_nums, coords,
                                                   energies, forces, dipoles,
                                                   quadrupoles, charges,
                                                   method, basis))
                        else:
                            # Drop the partially parsed record so the lists
                            # line up again for the next one.
                            (atomic_nums, energies, forces, coords, dipoles,
                             quadrupoles, charges) = _truncate_to_shortest(
                                 atomic_nums, energies, forces, coords,
                                 dipoles, quadrupoles, charges)
            except (StopIteration, UnicodeDecodeError):
                # End of file (or undecodable bytes): flush whatever complete
                # record is pending, then move on to the next file.
                if mset is not None:
                    if (len(atomic_nums) == len(energies) == len(forces)
                            == len(coords) == len(dipoles)
                            == len(quadrupoles) == len(charges)):
                        if len(energies) > len(meta_trajectory):
                            meta_trajectory.append(
                                build_new_geom(atomic_nums, coords, energies,
                                               forces, dipoles, quadrupoles,
                                               charges, method, basis))
                    else:
                        (atomic_nums, energies, forces, coords, dipoles,
                         quadrupoles, charges) = _truncate_to_shortest(
                             atomic_nums, energies, forces, coords, dipoles,
                             quadrupoles, charges)
    if mset is None:
        # ``filenames`` may be empty; indexing it unconditionally raised
        # IndexError.
        label = filenames[0] if filenames else "<no input files>"
        print("No geometries collected for ", label)
        return None
    mset.trajectories[".".join((method, basis, "meta"))] = meta_trajectory
    return mset
def read_aimd_data(filename):
    """Parse an AIMD output file into a MoleculeSet with one trajectory.

    The file is scanned in two phases that share one file iterator with the
    helper parsers.  Before the "AB INITIO MOLECULAR DYNAMICS" line, marker
    strings trigger the helpers and fill the property lists.  At that line a
    MoleculeSet is created from ``atomic_nums[0]`` and the trajectory is
    seeded with one ``build_new_geom`` geometry.  From then on an inner loop
    keeps filling the lists and appends a geometry on each "TIME STEP" line,
    until "TIME STEPS COMPLETED", a length mismatch, or end of file.

    Args:
        filename: Path of the output file to read.

    Returns:
        The MoleculeSet with its ``"<method>.<basis>.aimd"`` trajectory set,
        or ``None`` when the file ended before a MoleculeSet was built.
    """
    atomic_nums = []
    coords = []
    energies = []
    forces = []
    dipoles = []
    quadrupoles = []
    charges = []
    print("Reading ", filename)
    with open(filename, "r") as f:
        try:
            while True:
                # next() raises StopIteration at EOF; handled at the bottom.
                line = next(f)
                if "User input:" in line:
                    # Only n_atoms, method and basis are used below.
                    n_atoms, at_sym, method, basis, time_steps, aimd_steps, aimd_temp = parse_aimd_input(
                        f)
                # These next sections **SHOULD** only grab the first instance of these properties (i.e. before the AIMD
                # section starts)
                elif "Standard Nuclear Orientation" in line:
                    atom_nums, coord = parse_atoms_coords(f, n_atoms)
                    atomic_nums.append(atom_nums)
                    coords.append(coord)
                # energy
                elif "Cycle" in line and "Energy" in line:
                    energies.append(parse_energy(f))
                # forces
                elif "Gradient of SCF Energy" in line:
                    forces.append(parse_forces(f, n_atoms))
                # dipoles
                elif "Dipole Moment (Debye)" in line:
                    dipoles.append(parse_dipole(f))
                # quadrupoles
                elif "Quadrupole Moments (Debye-Ang)" in line:
                    quadrupoles.append(parse_quadrupole(f))
                # charges
                elif "Ground-State Mulliken Net Atomic Charges" in line:
                    charges.append(parse_charges(f, n_atoms))
                elif "AB INITIO MOLECULAR DYNAMICS" in line:
                    # Start of the dynamics section: build the MSet and seed
                    # the trajectory from what was parsed so far.
                    atoms = [Atom(at_num) for at_num in atomic_nums[0]]
                    mset = MoleculeSet(atoms)
                    aimd_trajectory = [
                        build_new_geom(atomic_nums, coords, energies, forces,
                                       dipoles, quadrupoles, charges, method,
                                       basis)
                    ]
                    # Inner loop: this function never returns to the outer
                    # scan; it leaves via return or StopIteration.
                    while True:
                        line = next(f)
                        # Tested before "TIME STEP", which is a substring of
                        # this marker.
                        if "TIME STEPS COMPLETED" in line:
                            if len(atomic_nums) == len(energies) == len(
                                    forces) == len(coords) == len(
                                        dipoles) == len(quadrupoles) == len(
                                            charges):
                                if len(energies) > len(aimd_trajectory):
                                    aimd_trajectory.append(
                                        build_new_geom(atomic_nums, coords,
                                                       energies, forces,
                                                       dipoles, quadrupoles,
                                                       charges, method, basis))
                            # NOTE(review): nesting reconstructed from
                            # whitespace-collapsed source; this else is taken
                            # to pair with the length check -- confirm.
                            else:
                                print(
                                    "Error reading AIMD trajectory at last step",
                                    filename,
                                    "Returning MSet before this step")
                            mset.trajectories[".".join(
                                (method, basis, "aimd"))] = aimd_trajectory
                            return mset
                        elif "TIME STEP" in line:
                            # time_step is used only in the error message;
                            # time_au and time_fs are not used again here.
                            time_step = int(line.split()[2].lstrip("#"))
                            time_au = float(line.split()[5])
                            time_fs = float(line.split()[8])
                            if len(atomic_nums) == len(energies) == len(
                                    forces) == len(coords) == len(
                                        dipoles) == len(quadrupoles) == len(
                                            charges):
                                aimd_trajectory.append(
                                    build_new_geom(atomic_nums, coords,
                                                   energies, forces, dipoles,
                                                   quadrupoles, charges,
                                                   method, basis))
                            else:
                                # Lists out of step: stop and return what
                                # has been collected so far.
                                print("Error reading AIMD trajectory at step",
                                      time_step, filename,
                                      "Returning MSet before "
                                      "this step")
                                mset.trajectories[".".join(
                                    (method, basis, "aimd"))] = aimd_trajectory
                                return mset
                        elif "Standard Nuclear Orientation" in line:
                            atom_nums, coord = parse_atoms_coords(f, n_atoms)
                            atomic_nums.append(atom_nums)
                            coords.append(coord)
                        # energy
                        elif "Cycle" in line and "Energy" in line:
                            energies.append(parse_energy(f))
                        # forces
                        elif "Gradient of SCF Energy" in line:
                            forces.append(parse_forces(f, n_atoms))
                        # dipoles
                        elif "Dipole Moment (Debye)" in line:
                            dipoles.append(parse_dipole(f))
                        # quadrupoles
                        elif "Quadrupole Moments (Debye-Ang)" in line:
                            quadrupoles.append(parse_quadrupole(f))
                        # charges
                        elif "Ground-State Mulliken Net Atomic Charges" in line:
                            charges.append(parse_charges(f, n_atoms))
        except StopIteration:
            print("Hit EOF on ", filename)
            try:
                # aimd_trajectory is unbound if the dynamics section was
                # never reached; that surfaces as UnboundLocalError below.
                mset.trajectories[".".join(
                    (method, basis, "aimd"))] = aimd_trajectory
                return mset
            except UnboundLocalError:
                print("No MSet built for ", filename)
                return None