def test_thiane_ethynyl(self):
    """The two thiane_ethynyl SDF structures must not be reported as a fit."""
    matcher = MoleculeMatcher(tolerance=0.05, mapper=InchiMolAtomMapper())
    first = Molecule.from_file(os.path.join(test_dir, "thiane_ethynyl1.sdf"))
    second = Molecule.from_file(os.path.join(test_dir, "thiane_ethynyl2.sdf"))
    is_match = matcher.fit(first, second)
    self.assertFalse(is_match)
def test_get_rmsd(self):
    """The RMSD between t3.xyz and t4.xyz must format (spec '7.3') to 0.00488."""
    mm = MoleculeMatcher()
    # Both files are read through the OpenBabel adaptor and converted to
    # pymatgen molecules before comparison.
    adaptor_a = BabelMolAdaptor.from_file(os.path.join(test_dir, "t3.xyz"))
    mol_a = adaptor_a.pymatgen_mol
    adaptor_b = BabelMolAdaptor.from_file(os.path.join(test_dir, "t4.xyz"))
    mol_b = adaptor_b.pymatgen_mol
    rmsd = mm.get_rmsd(mol_a, mol_b)
    self.assertEqual(format(rmsd, '7.3'), "0.00488")
def test_random_seed(self, water, ethanol):
    """
    Confirm that seed = -1 generates random structures
    while seed = 1 is deterministic
    """

    def run_twice(seed, scratch_dir):
        # Build the input set for the given seed, write it once, then run it
        # twice, loading output.xyz after each run.
        pw = PackmolBoxGen(
            seed=seed,
            inputfile="input.in",
            outputfile="output.xyz",
        ).get_input_set(
            molecules=[
                {"name": "water", "number": 10, "coords": water},
                {"name": "ethanol", "number": 20, "coords": ethanol},
            ],
        )
        pw.write_input(scratch_dir)
        loaded = []
        for _ in range(2):
            pw.run(scratch_dir)
            loaded.append(Molecule.from_file(os.path.join(scratch_dir, "output.xyz")))
        return loaded

    mm = MoleculeMatcher()

    # deterministic output
    with tempfile.TemporaryDirectory() as scratch_dir:
        out1, out2 = run_twice(1, scratch_dir)
        assert mm.fit(out1, out2)

    # randomly generated structures
    with tempfile.TemporaryDirectory() as scratch_dir:
        out1, out2 = run_twice(-1, scratch_dir)
        assert not mm.fit(out1, out2)
def test_to_and_from_dict(self):
    """A matcher rebuilt from its dict form must serialize to the same dict."""

    def check_roundtrip(mapper):
        # Serialize, rebuild from the dict, and compare the two dict forms.
        original = MoleculeMatcher(tolerance=0.5, mapper=mapper)
        serialized = original.to_dict
        rebuilt = MoleculeMatcher.from_dict(serialized)
        self.assertEqual(serialized, rebuilt.to_dict)

    check_roundtrip(InchiMolAtomMapper(angle_tolerance=50.0))
    check_roundtrip(IsomorphismMolAtomMapper())
def test_group_molecules(self):
    """Grouping the molecules in mol_list.txt must match grouped_mol_list.txt."""
    matcher = MoleculeMatcher(tolerance=0.001)

    with open(os.path.join(test_dir, "mol_list.txt")) as listing:
        filename_list = [entry.strip() for entry in listing]

    mol_list = [read_mol(os.path.join(test_dir, name)) for name in filename_list]

    # Translate every grouped molecule back to the file name it was read from.
    filename_groups = []
    for group in matcher.group_molecules(mol_list):
        filename_groups.append(
            [filename_list[mol_list.index(mol)] for mol in group])

    with open(os.path.join(test_dir, "grouped_mol_list.txt")) as expected_file:
        grouped_text = expected_file.read().strip()

    self.assertEqual(str(filename_groups), grouped_text)
def test_group_molecules(self):
    """Check group_molecules against the reference text in grouped_mol_list.txt."""
    list_path = os.path.join(test_dir, "mol_list.txt")
    expected_path = os.path.join(test_dir, "grouped_mol_list.txt")

    mm = MoleculeMatcher(tolerance=0.001)
    with open(list_path) as handle:
        filename_list = [raw.strip() for raw in handle]

    mol_list = []
    for fname in filename_list:
        mol_list.append(Molecule.from_file(os.path.join(test_dir, fname)))

    def name_of(mol):
        # A molecule's file name sits at the same index in filename_list.
        return filename_list[mol_list.index(mol)]

    filename_groups = [[name_of(mol) for mol in group]
                       for group in mm.group_molecules(mol_list)]

    with open(expected_path) as handle:
        grouped_text = handle.read().strip()
    self.assertEqual(str(filename_groups), grouped_text)
def test_group_molecules(self):
    """Group molecules read via BabelMolAdaptor and compare to the reference."""
    mm = MoleculeMatcher(tolerance=0.001)

    with open(os.path.join(test_dir, "mol_list.txt")) as src:
        filename_list = [row.strip() for row in src]

    # Each file goes through the OpenBabel adaptor and is converted to a
    # pymatgen molecule.
    mol_list = [
        BabelMolAdaptor.from_file(os.path.join(test_dir, fname)).pymatgen_mol
        for fname in filename_list
    ]

    mol_groups = mm.group_molecules(mol_list)
    filename_groups = [
        [filename_list[mol_list.index(member)] for member in group]
        for group in mol_groups
    ]

    with open(os.path.join(test_dir, "grouped_mol_list.txt")) as src:
        grouped_text = src.read().strip()

    self.assertEqual(str(filename_groups), grouped_text)
def test_confab_conformers(self):
    """
    Build a 3D structure from the SMILES string "CCCC", generate confab
    conformers, and check the rotor count and the conformer list.

    Uses ``assertEqual`` rather than the ``assertEquals`` alias, which was
    deprecated and then removed from ``unittest.TestCase`` in Python 3.12.
    """
    mol = pb.readstring("smi", "CCCC").OBMol
    adaptor = BabelMolAdaptor(mol)
    adaptor.make3d()
    conformers = adaptor.confab_conformers()
    self.assertEqual(adaptor.openbabel_mol.NumRotors(), 1)
    self.assertGreaterEqual(len(conformers), 1)
    # The RMSD check needs two conformers; with only one there is nothing
    # to compare.
    if len(conformers) > 1:
        self.assertNotAlmostEqual(
            MoleculeMatcher().get_rmsd(conformers[0], conformers[1]), 0)
def add_if_belongs(self, cand_snl, exact_match=True):
    """
    Add cand_snl to this group if it belongs here.

    Args:
        cand_snl: candidate whose snlgroup_key, snl_id and (when
            exact_match is set) structure are inspected.
        exact_match: when True, the candidate structure must also fit the
            canonical structure under a MoleculeMatcher (tolerance 0.01,
            InChI mapper with angle tolerance 5.0).

    Returns:
        True if the candidate's id was appended to all_snl_ids (updated_at
        is refreshed as well), False otherwise.
    """
    # no need to compare if the structure is different
    keys_agree = cand_snl.snlgroup_key == self.canonical_snl.snlgroup_key
    if not keys_agree:
        return False

    # make sure the structure is not already in all_structures
    if cand_snl.snl_id in self.all_snl_ids:
        print('WARNING: add_if_belongs() has detected that you are trying to add the same SNL id twice!')
        return False

    if exact_match:
        matcher = MoleculeMatcher(
            tolerance=0.01,
            mapper=InchiMolAtomMapper(angle_tolerance=5.0))
        fits = matcher.fit(cand_snl.structure, self.canonical_structure)
        if not fits:
            return False

    # everything checks out, add to the group
    self.all_snl_ids.append(cand_snl.snl_id)
    self.updated_at = datetime.datetime.utcnow()
    return True
def test_get_rmsd(self):
    """The RMSD between t3.xyz and t4.xyz must format (spec '7.3') to 0.00488."""
    mm = MoleculeMatcher()
    reference = Molecule.from_file(os.path.join(test_dir, "t3.xyz"))
    candidate = Molecule.from_file(os.path.join(test_dir, "t4.xyz"))
    rmsd_text = format(mm.get_rmsd(reference, candidate), '7.3')
    self.assertEqual(rmsd_text, "0.00488")
def test_cdi_23(self):
    """cdi_23_1.xyz and cdi_23_2.xyz must not fit under the InChI mapper."""
    matcher = MoleculeMatcher(tolerance=0.05, mapper=InchiMolAtomMapper())
    path_a = os.path.join(test_dir, "cdi_23_1.xyz")
    mol_a = Molecule.from_file(path_a)
    path_b = os.path.join(test_dir, "cdi_23_2.xyz")
    mol_b = Molecule.from_file(path_b)
    self.assertFalse(matcher.fit(mol_a, mol_b))
def test_strange_inchi(self):
    """k1.sdf and k2.sdf must fit under the InChI mapper at tolerance 0.05."""
    matcher = MoleculeMatcher(tolerance=0.05, mapper=InchiMolAtomMapper())
    left = Molecule.from_file(os.path.join(test_dir, "k1.sdf"))
    right = Molecule.from_file(os.path.join(test_dir, "k2.sdf"))
    fits = matcher.fit(left, right)
    self.assertTrue(fits)
def test_thiane(self):
    """thiane1.sdf and thiane2.sdf must not fit under the InChI mapper."""
    matcher = MoleculeMatcher(tolerance=0.05, mapper=InchiMolAtomMapper())
    thiane_a = read_mol(os.path.join(test_dir, "thiane1.sdf"))
    thiane_b = read_mol(os.path.join(test_dir, "thiane2.sdf"))
    outcome = matcher.fit(thiane_a, thiane_b)
    self.assertFalse(outcome)
def fit_with_mapper(self, mapper):
    """
    Run the shared fit checks with the given atom mapper.

    First a five-atom C/H molecule is compared with a rotated copy of
    itself, then a series of reference file pairs is checked with the
    default tolerance, and finally t3/t4 is re-checked with a tolerance of
    0.001, where it must no longer fit.
    """
    coords = [[0.000000, 0.000000, 0.000000],
              [0.000000, 0.000000, 1.089000],
              [1.026719, 0.000000, -0.363000],
              [-0.513360, -0.889165, -0.363000],
              [-0.513360, 0.889165, -0.363000]]
    mol1 = Molecule(["C", "H", "H", "H", "H"], coords)
    rotation = SymmOp.from_origin_axis_angle([0, 0, 0], [0.1, 0.2, 0.3], 60)
    rotated = [rotation.operate(point) for point in coords]
    mol2 = Molecule(["C", "H", "H", "H", "H"], rotated)
    mm = MoleculeMatcher(mapper=mapper)
    self.assertTrue(mm.fit(mol1, mol2))

    def load(filename):
        # Read one reference structure from the test directory.
        return Molecule.from_file(os.path.join(test_dir, filename))

    # (first file, second file, whether the pair is expected to fit)
    default_tolerance_cases = (
        ("benzene1.xyz", "benzene2.xyz", True),
        ("benzene1.xyz", "t2.xyz", False),
        ("c1.xyz", "c2.xyz", True),
        ("t3.xyz", "t4.xyz", True),
        ("j1.xyz", "j2.xyz", True),
        ("ethene1.xyz", "ethene2.xyz", True),
        ("toluene1.xyz", "toluene2.xyz", True),
        ("cyclohexane1.xyz", "cyclohexane2.xyz", True),
        ("oxygen1.xyz", "oxygen2.xyz", True),
    )
    for first_name, second_name, should_fit in default_tolerance_cases:
        mol1 = load(first_name)
        mol2 = load(second_name)
        verify = self.assertTrue if should_fit else self.assertFalse
        verify(mm.fit(mol1, mol2))

    # With the tighter tolerance the t3/t4 pair must be rejected.
    mm = MoleculeMatcher(tolerance=0.001, mapper=mapper)
    mol1 = load("t3.xyz")
    mol2 = load("t4.xyz")
    self.assertFalse(mm.fit(mol1, mol2))
def __init__(self, redundancy_parameters, geometry):
    '''
    Makes a RedundancyGuard, and sets default parameter values if
    necessary.

    TODO: currently using pymatgen's structure matcher for comparing bulk
        and sheet structures, both pymatgen's structure matcher and
        molecule matcher for comparing wires, and only the molecule matcher
        for clusters. The sheet and wire cases aren't ideal, since the
        structure matcher assumes periodicity in all three dimensions, and
        the molecule matcher assumes no periodicity.

    Args:
        redundancy parameters: a dictionary of parameters, or None or
            'default' to use the default for every parameter

        geometry: the Geometry object
    '''

    # defaults
    #
    # lattice length tolerance, in fractional coordinates
    self.default_lattice_length_tol = 0.05
    # lattice angle tolerance, in degrees
    self.default_lattice_angle_tol = 2
    # site tolerance, in fraction of average free length per atom
    self.default_site_tol = 0.1
    # whether to transform to primitive cells before comparing
    self.default_use_primitive_cell = True
    # whether to check if structures are equal to supercells of each other
    self.default_attempt_supercell = True
    # RMSD tolerance for comparing clusters
    self.default_rmsd_tol = 0.1
    # the epa difference interval
    self.default_epa_diff = 0.0

    if redundancy_parameters in (None, 'default'):
        # nothing was specified, so every parameter takes its default
        self.set_all_to_defaults()
    else:
        # Each parameter follows the same rule: a missing key, None or
        # 'default' selects the matching default_<name> attribute, and
        # any other value is used as given.
        parameter_names = (
            'lattice_length_tol',
            'lattice_angle_tol',
            'site_tol',
            'use_primitive_cell',
            'attempt_supercell',
            'rmsd_tol',
            'epa_diff',
        )
        for name in parameter_names:
            fallback = getattr(self, 'default_' + name)
            if name not in redundancy_parameters:
                chosen = fallback
            else:
                supplied = redundancy_parameters[name]
                if supplied in (None, 'default'):
                    chosen = fallback
                else:
                    chosen = supplied
            setattr(self, name, chosen)

    # make the StructureMatcher object
    #
    # the first False is to prevent the matcher from scaling the volumes,
    # and the second False is to prevent subset matching
    self.structure_matcher = StructureMatcher(
        self.lattice_length_tol, self.site_tol, self.lattice_angle_tol,
        self.use_primitive_cell, False, self.attempt_supercell, False,
        ElementComparator())

    # make the MoleculeMatcher object (only for cluster and wire searches)
    if geometry.shape in ('cluster', 'wire'):
        atom_mapper = IsomorphismMolAtomMapper()
        self.molecule_matcher = MoleculeMatcher(self.rmsd_tol, atom_mapper)
        ob.obErrorLog.SetOutputLevel(0)  # to suppress openbabel warnings
class RedundancyGuard(object):
    '''
    A RedundancyGuard object is used to check if an Organism is redundant with
    other organisms already seen by the algorithm.
    '''

    # Names of the tunable parameters; each one has a matching
    # default_<name> attribute that is set in __init__.
    _PARAMETER_NAMES = (
        'lattice_length_tol',
        'lattice_angle_tol',
        'site_tol',
        'use_primitive_cell',
        'attempt_supercell',
        'rmsd_tol',
        'epa_diff',
    )

    def __init__(self, redundancy_parameters, geometry):
        '''
        Makes a RedundancyGuard, and sets default parameter values if
        necessary.

        TODO: currently using pymatgen's structure matcher for comparing bulk
            and sheet structures, both pymatgen's structure matcher and
            molecule matcher for comparing wires, and only the molecule
            matcher for clusters. The sheet and wire cases aren't ideal, since
            the structure matcher assumes periodicity in all three dimensions,
            and the molecule matcher assumes no periodicity.

        Args:
            redundancy parameters: a dictionary of parameters, or None or
                'default' to use the default for every parameter

            geometry: the Geometry object
        '''

        # defaults
        #
        # lattice length tolerance, in fractional coordinates
        self.default_lattice_length_tol = 0.05
        # lattice angle tolerance, in degrees
        self.default_lattice_angle_tol = 2
        # site tolerance, in fraction of average free length per atom
        self.default_site_tol = 0.1
        # whether to transform to primitive cells before comparing
        self.default_use_primitive_cell = True
        # whether to check if structures are equal to supercells of each other
        self.default_attempt_supercell = True
        # RMSD tolerance for comparing clusters
        self.default_rmsd_tol = 0.1
        # the epa difference interval
        self.default_epa_diff = 0.0

        if redundancy_parameters in (None, 'default'):
            # nothing was specified, so every parameter takes its default
            self.set_all_to_defaults()
        else:
            # A missing key, None or 'default' selects the default value;
            # any other value is used as given.
            for name in self._PARAMETER_NAMES:
                fallback = getattr(self, 'default_' + name)
                if name not in redundancy_parameters:
                    chosen = fallback
                else:
                    supplied = redundancy_parameters[name]
                    if supplied in (None, 'default'):
                        chosen = fallback
                    else:
                        chosen = supplied
                setattr(self, name, chosen)

        # make the StructureMatcher object
        #
        # the first False is to prevent the matcher from scaling the volumes,
        # and the second False is to prevent subset matching
        self.structure_matcher = StructureMatcher(
            self.lattice_length_tol, self.site_tol, self.lattice_angle_tol,
            self.use_primitive_cell, False, self.attempt_supercell, False,
            ElementComparator())

        # make the MoleculeMatcher object (only for cluster and wire searches)
        if geometry.shape in ('cluster', 'wire'):
            atom_mapper = IsomorphismMolAtomMapper()
            self.molecule_matcher = MoleculeMatcher(self.rmsd_tol, atom_mapper)
            ob.obErrorLog.SetOutputLevel(0)  # to suppress openbabel warnings

    def set_all_to_defaults(self):
        '''
        Sets all the redundancy parameters to default values.
        '''

        for name in self._PARAMETER_NAMES:
            setattr(self, name, getattr(self, 'default_' + name))

    def check_redundancy(self, new_organism, orgs_list, geometry):
        '''
        Checks for redundancy, both structural and if specified, epa (d-value).

        Returns the organism with which new_organism is redundant, or None if
        no redundancy.

        Args:
            new_organism: the Organism to check for redundancy

            orgs_list: the list containing all Organisms to check against

            geometry: the Geometry of the search
        '''

        # an organism without an epa has not been relaxed
        is_relaxed = new_organism.epa is not None

        for other in orgs_list:
            # never compare an organism against itself
            if new_organism.id == other.id:
                continue
            # a relaxed organism is only checked against relaxed organisms
            if is_relaxed and other.epa is None:
                continue

            # check if their structures match
            if self.check_structures(new_organism, other, geometry):
                print('Organism {} failed structural redundancy - looks like organism {} '.format(
                    new_organism.id, other.id))
                return other

            # check how close their epa's are (relaxed organisms only)
            if is_relaxed:
                epa_gap = abs(new_organism.epa - other.epa)
                if epa_gap < self.epa_diff:
                    print('Organism {} failed energy per atom redundancy - looks like organism {} '.format(
                        new_organism.id, other.id))
                    return other

        return None

    def check_structures(self, org1, org2, geometry):
        '''
        Compares the structures of two organisms to determine if they are
        redundant.

        Returns a boolean indicating whether the structures of the two
        organisms are redundant.

        Args:
            org1: the first Organism

            org2: the second Organism

            geometry: the Geometry of the search
        '''

        shape = geometry.shape

        # use only the molecule matcher for cluster searches
        if shape == 'cluster':
            return self.match_molecules(org1.cell, org2.cell)

        # every other shape uses the structure matcher
        if shape != 'wire':
            return self.structure_matcher.fit(org1.cell, org2.cell)

        # wires: both matchers are always run, and either one matching is
        # enough to call the pair redundant
        by_molecule = self.match_molecules(org1.cell, org2.cell)
        by_structure = self.structure_matcher.fit(org1.cell, org2.cell)
        return by_molecule or by_structure

    def match_molecules(self, cell1, cell2):
        '''
        Compares two cells to determine if they are redundant using pymatgen's
        comparison algorithm that assumes no periodicity in any direction.

        Returns a boolean indicating whether the cells are redundant.

        Args:
            cell1: the first Cell

            cell2: the second Cell
        '''

        # build a non-periodic Molecule from each cell's species and
        # cartesian coordinates
        first, second = [Molecule(cell.species, cell.cart_coords)
                         for cell in (cell1, cell2)]
        return self.molecule_matcher.fit(first, second)