def test_single_fragment_3_atoms_1_bonds(self):
    """Three carbons joined by a single bond leave one atom unattached."""
    topology_text = """
      atoms: ATOM_C
      atoms: ATOM_C
      atoms: ATOM_C
      bonds {
        atom_a: 0
        atom_b: 1
        bond_type: BOND_SINGLE
      }
    """
    topology = text_format.Parse(topology_text, dataset_pb2.BondTopology())
    # Atom 2 takes part in no bond, so the topology is not one fragment.
    connected = utilities.is_single_fragment(topology)
    self.assertFalse(connected)
def hydrogen_to_nearest_atom(bond_topology, distances, bond_lengths):
    """Build a BondTopology that bonds each hydrogen to its nearest heavy atom.

    For every hydrogen in `bond_topology`, the non-hydrogen atom with the
    smallest distance strictly below the module-level THRESHOLD is chosen and
    a single bond is recorded between the two. When `bond_lengths` is truthy,
    the chosen distance must also have a non-zero pdf under the
    (heavy atom, hydrogen) single-bond length distribution.

    Args:
      bond_topology: BondTopology whose `atoms` field supplies the atom types.
      distances: matrix of interatomic distances, indexed `distances[i, j]`.
      bond_lengths: None or AllAtomPairLengthDistributions.

    Returns:
      dataset_pb2.BondTopology holding the input atoms plus one single bond
      per hydrogen, or None when some hydrogen has no heavy atom closer than
      THRESHOLD or fails the bond length check.
    """
    hydrogen = dataset_pb2.BondTopology.AtomType.ATOM_H
    atom_types = bond_topology.atoms
    # Hydrogens only ever attach to non-hydrogen atoms; find those once.
    heavy_indices = [
        idx for idx, kind in enumerate(atom_types) if kind != hydrogen
    ]

    result = dataset_pb2.BondTopology()
    result.atoms[:] = atom_types

    for h_idx, kind in enumerate(atom_types):
        if kind != hydrogen:
            continue

        best_distance = 1.0e+30
        best_heavy = -1
        for heavy_idx in heavy_indices:
            candidate = distances[h_idx, heavy_idx]
            if candidate >= THRESHOLD:
                continue
            if candidate < best_distance:
                best_distance = candidate
                best_heavy = heavy_idx

        # A hydrogen with nothing in range invalidates the whole topology.
        if best_heavy < 0:
            return None

        if bond_lengths:
            pair_key = (atom_types[best_heavy],
                        dataset_pb2.BondTopology.ATOM_H)
            single_bond_lengths = bond_lengths[pair_key][
                dataset_pb2.BondTopology.BOND_SINGLE]
            if single_bond_lengths.pdf(best_distance) == 0.0:
                return None

        result.bonds.append(
            dataset_pb2.BondTopology.Bond(
                atom_a=h_idx,
                atom_b=best_heavy,
                bond_type=dataset_pb2.BondTopology.BondType.BOND_SINGLE))

    return result
def test_canonical(self):
    """Canonicalizing in place yields ascending bonds and atom indices."""
    scrambled_text = """
      atoms: ATOM_C
      atoms: ATOM_C
      atoms: ATOM_C
      bonds {
        atom_a: 2
        atom_b: 1
        bond_type: BOND_SINGLE
      },
      bonds {
        atom_a: 1
        atom_b: 0
        bond_type: BOND_SINGLE
      }
    """
    canonical_text = """
      atoms: ATOM_C
      atoms: ATOM_C
      atoms: ATOM_C
      bonds {
        atom_a: 0
        atom_b: 1
        bond_type: BOND_SINGLE
      },
      bonds {
        atom_a: 1
        atom_b: 2
        bond_type: BOND_SINGLE
      }
    """
    topology = text_format.Parse(scrambled_text, dataset_pb2.BondTopology())
    expected = text_format.Parse(canonical_text, dataset_pb2.BondTopology())

    # The call mutates its argument; the return value is not used.
    utilities.canonicalize_bond_topology(topology)

    actual_str = text_format.MessageToString(topology)
    expected_str = text_format.MessageToString(expected)
    self.assertEqual(actual_str, expected_str)
def test_equality(self):
    """same_bond_topology only matches once the bond ordering agrees."""
    scrambled_text = """
      atoms: ATOM_C
      atoms: ATOM_C
      atoms: ATOM_C
      bonds {
        atom_a: 2
        atom_b: 1
        bond_type: BOND_SINGLE
      },
      bonds {
        atom_a: 1
        atom_b: 0
        bond_type: BOND_SINGLE
      }
    """
    ordered_text = """
      atoms: ATOM_C
      atoms: ATOM_C
      atoms: ATOM_C
      bonds {
        atom_a: 0
        atom_b: 1
        bond_type: BOND_SINGLE
      },
      bonds {
        atom_a: 1
        atom_b: 2
        bond_type: BOND_SINGLE
      }
    """
    scrambled = text_format.Parse(scrambled_text, dataset_pb2.BondTopology())
    ordered = text_format.Parse(ordered_text, dataset_pb2.BondTopology())

    # Before canonicalization the two descriptions compare as different.
    self.assertFalse(utilities.same_bond_topology(scrambled, ordered))

    # After canonicalizing the scrambled one they compare as the same.
    utilities.canonicalize_bond_topology(scrambled)
    self.assertTrue(utilities.same_bond_topology(scrambled, ordered))
def hydrogen_to_nearest_atom(bond_topology, distances):
    """Build a BondTopology that joins each hydrogen to its nearest heavy atom.

    For every hydrogen in `bond_topology`, the non-hydrogen atom with the
    smallest distance strictly below the module-level THRESHOLD is chosen and
    a single bond is recorded between the two.

    Args:
      bond_topology: BondTopology whose `atoms` field supplies the atom types.
      distances: interatomic distances, indexed `distances[i, j]`.

    Returns:
      dataset_pb2.BondTopology holding the input atoms plus one single bond
      per hydrogen, or None when some hydrogen has no heavy atom closer than
      THRESHOLD.
    """
    hydrogen = dataset_pb2.BondTopology.AtomType.ATOM_H
    atom_types = bond_topology.atoms
    # Candidate bonding partners are exactly the non-hydrogen atoms.
    heavy_indices = [
        idx for idx, kind in enumerate(atom_types) if kind != hydrogen
    ]

    result = dataset_pb2.BondTopology()
    result.atoms[:] = atom_types

    for h_idx, kind in enumerate(atom_types):
        if kind != hydrogen:
            continue

        best_distance = 1.0e+30
        best_heavy = -1
        for heavy_idx in heavy_indices:
            candidate = distances[h_idx, heavy_idx]
            if candidate >= THRESHOLD:
                continue
            if candidate < best_distance:
                best_distance = candidate
                best_heavy = heavy_idx

        # A hydrogen with nothing in range invalidates the whole topology.
        if best_heavy < 0:
            return None

        result.bonds.append(
            dataset_pb2.BondTopology.Bond(
                atom_a=h_idx,
                atom_b=best_heavy,
                bond_type=dataset_pb2.BondTopology.BondType.BOND_SINGLE))

    return result
def test_ethane_all(self, btype, expected_bond):
    """Score exactly one bond type between two carbons and place bonds.

    Args:
      btype: index of the bond type that receives a score of 1.0.
      expected_bond: bond type expected on the single placed bond.
    """
    two_carbons = text_format.Parse(
        """
      atoms: ATOM_C
      atoms: ATOM_C
    """, dataset_pb2.BondTopology())

    pair_scores = np.zeros(4, dtype=np.float32)
    pair_scores[btype] = 1.0
    bonds_to_scores = {(0, 1): pair_scores}

    params = smu_molecule.MatchingParameters()
    params.must_match_all_bonds = False

    mol = smu_molecule.SmuMolecule(two_carbons, bonds_to_scores, params)
    search_state = mol.generate_search_state()
    for candidate in itertools.product(*search_state):
        res = mol.place_bonds(candidate, params)
        # Bond type 0 is expected to produce no topology at all.
        if btype == 0:
            self.assertIsNone(res)
            continue
        self.assertIsNotNone(res)
        self.assertLen(res.bonds, 1)
        self.assertEqual(res.bonds[0].bond_type, expected_bond)
def place_bonds_inner(self, state):
    """Place the bonds described by `state` onto the starting topology.

    No validity checking is done here; the caller is responsible for that.

    Args:
      state: for each pair of atoms, the kind of bond to be placed.

    Returns:
      A BondTopology carrying the accumulated score, or None when a bond
      cannot be placed or some tracked atom ends up with no bond at all.
    """
    # Reset the per-atom bond bookkeeping to the hydrogens-only baseline.
    self._current_bonds_attached = np.copy(
        self._bonds_with_hydrogens_attached)

    result = dataset_pb2.BondTopology()
    result.CopyFrom(self._starting_bond_topology)  # only Hydrogens attached.
    result.score = self._initial_score

    # One flag per atom, set once that atom has received a bond.
    bonded = np.zeros(self._heavy_atoms)
    undefined = dataset_pb2.BondTopology.BOND_UNDEFINED

    for pair_idx, btype in enumerate(state):
        if btype == undefined:
            continue

        pair = self._bonds[pair_idx]
        first, second = pair[0], pair[1]
        if not self._place_bond(first, second, btype):
            return None

        add_bond(first, second, btype, result)
        bonded[first] = 1
        bonded[second] = 1
        result.score = self._accumulate_score(
            result.score, self._scores[pair_idx][btype])

    return result if np.all(bonded) else None
def molecule_to_bond_topology(mol):
    """Convert a molecule into a BondTopology.

    Args:
      mol: molecule object exposing GetAtoms() and GetBonds().

    Returns:
      BondTopology with one atom entry per atom and one bond entry per bond,
      using the molecule's own atom indices.
    """
    topology = dataset_pb2.BondTopology()

    topology.atoms.extend(
        rdkit_atom_to_atom_type(atom) for atom in mol.GetAtoms())

    for rd_bond in mol.GetBonds():
        converted_type = rdkit_bond_type_to_btype(rd_bond.GetBondType())
        topology.bonds.append(
            dataset_pb2.BondTopology.Bond(
                atom_a=rd_bond.GetBeginAtom().GetIdx(),
                atom_b=rd_bond.GetEndAtom().GetIdx(),
                bond_type=converted_type))

    return topology
def get_molecule(self, oc_dist, cn_dist):
    """Build a four-atom O=C=N-H test molecule laid out along the z axis.

    Args:
      oc_dist: O-C separation, divided by BOHR_TO_ANGSTROMS before storing.
      cn_dist: C-N separation, divided by BOHR_TO_ANGSTROMS before storing.

    Returns:
      dataset_pb2.Molecule with id 12345, one bond topology with smiles
      'N=C=O', and an optimized geometry; the hydrogen sits a further 1
      (same units as the inputs) beyond the nitrogen.
    """
    topology_cls = dataset_pb2.BondTopology
    bohr_to_angstroms = smu_utils_lib.BOHR_TO_ANGSTROMS

    molecule = dataset_pb2.Molecule(molecule_id=12345)
    molecule.bond_topologies.append(topology_cls(smiles='N=C=O'))
    topology = molecule.bond_topologies[0]

    topology.atoms.extend([
        topology_cls.ATOM_O,
        topology_cls.ATOM_C,
        topology_cls.ATOM_N,
        topology_cls.ATOM_H,
    ])

    bond_specs = (
        (0, 1, topology_cls.BondType.BOND_DOUBLE),
        (1, 2, topology_cls.BondType.BOND_DOUBLE),
        (2, 3, topology_cls.BondType.BOND_SINGLE),
    )
    for atom_a, atom_b, bond_type in bond_specs:
        topology.bonds.append(
            topology_cls.Bond(
                atom_a=atom_a, atom_b=atom_b, bond_type=bond_type))

    # All atoms lie on the z axis; the oxygen is at the origin.
    z_coords = (
        0,
        oc_dist / bohr_to_angstroms,
        (oc_dist + cn_dist) / bohr_to_angstroms,
        (oc_dist + cn_dist + 1) / bohr_to_angstroms,
    )
    positions = molecule.optimized_geometry.atom_positions
    for z_coord in z_coords:
        positions.append(dataset_pb2.Geometry.AtomPos(x=0, y=0, z=z_coord))

    return molecule
def test_single_fragment_4_atoms_3_bonds_no_ring(self):
    """A four-carbon chain 0-1-2-3 is one connected fragment."""
    chain_text = """
      atoms: ATOM_C
      atoms: ATOM_C
      atoms: ATOM_C
      atoms: ATOM_C
      bonds {
        atom_a: 0
        atom_b: 1
        bond_type: BOND_SINGLE
      }
      bonds {
        atom_a: 1
        atom_b: 2
        bond_type: BOND_SINGLE
      }
      bonds {
        atom_a: 2
        atom_b: 3
        bond_type: BOND_SINGLE
      }
    """
    chain = text_format.Parse(chain_text, dataset_pb2.BondTopology())
    connected = utilities.is_single_fragment(chain)
    self.assertTrue(connected)
def test_operators(self):
    """A multiplicative incrementer yields the product of the bond scores.

    Three carbons get a single candidate bond type on each of the pairs
    (0, 1) and (1, 2). With an initial score of 1.0 and `operator.mul` as
    the incrementer, every placed topology must score the product of the
    two per-bond scores.
    """
    cc = text_format.Parse(
        """
      atoms: ATOM_C
      atoms: ATOM_C
      atoms: ATOM_C
    """, dataset_pb2.BondTopology())

    bonds_to_scores = {
        (0, 1): np.zeros(4, dtype=np.float32),
        (1, 2): np.zeros(4, dtype=np.float32),
    }
    scores = np.array([1.0, 3.0], dtype=np.float32)
    bonds_to_scores[(0, 1)][1] = scores[0]
    bonds_to_scores[(1, 2)][1] = scores[1]

    matching_parameters = smu_molecule.MatchingParameters()
    matching_parameters.must_match_all_bonds = False

    mol = smu_molecule.SmuMolecule(cc, bonds_to_scores, matching_parameters)
    mol.set_initial_score_and_incrementer(1.0, operator.mul)

    # np.prod replaces the deprecated alias np.product, which was removed
    # in NumPy 2.0. The value does not depend on the loop, so compute once.
    expected_score = np.prod(scores)

    state = mol.generate_search_state()
    for s in itertools.product(*state):
        res = mol.place_bonds(s, matching_parameters)
        self.assertAlmostEqual(res.score, expected_score)
def create_bond_topology(atoms, connectivity_matrix_string, hydrogens_string):
    """Creates a BondTopology from a compact string representation.

    Any hydrogens in the atoms string will be ignored. The appropriate number
    will be added based on what is in the hydrogens string.

    Args:
      atoms: a string like 'CCCCOON' (case insensitive) for the heavy atoms
      connectivity_matrix_string: a string for the upper triangular
        connectivity matrix with bond orders, like '010210'
      hydrogens_string: a string for the number of hydrogens connected to
        each heavy atom

    Returns:
      BondTopology

    Raises:
      ValueError: on an unknown atom letter, a bond order other than
        '0'-'3', or a hydrogen count that disagrees with the expected count
        in a way that the N+ / O- adjustments below do not explain.
    """
    atom_type_enum = dataset_pb2.BondTopology.AtomType
    bond_type_enum = dataset_pb2.BondTopology.BondType

    heavy_atom_types = {
        'c': atom_type_enum.ATOM_C,
        'n': atom_type_enum.ATOM_N,
        'o': atom_type_enum.ATOM_O,
        'f': atom_type_enum.ATOM_F,
    }
    bond_types = {
        '1': bond_type_enum.BOND_SINGLE,
        '2': bond_type_enum.BOND_DOUBLE,
        '3': bond_type_enum.BOND_TRIPLE,
    }

    bond_topology = dataset_pb2.BondTopology()

    # Add the heavy atoms
    for atom_type in atoms.lower():
        if atom_type == 'h':
            # Hydrogens are added later from hydrogens_string.
            continue
        if atom_type not in heavy_atom_types:
            raise ValueError('Unknown atom type: {}'.format(atom_type))
        bond_topology.atoms.append(heavy_atom_types[atom_type])

    num_heavy_atoms = len(bond_topology.atoms)

    # Now add the bonds between the heavy atoms
    # NOTE(review): the guard is kept from the original, presumably because
    # np.nditer rejects the zero-sized index arrays produced for a single
    # heavy atom - confirm before removing.
    if num_heavy_atoms > 1:
        for (i, j), bond_order in zip(
                np.nditer(np.triu_indices(num_heavy_atoms, k=1)),
                connectivity_matrix_string):
            if bond_order == '0':
                continue
            bond = bond_topology.bonds.add()
            bond.atom_a = int(i)
            bond.atom_b = int(j)
            if bond_order not in bond_types:
                raise ValueError('Bad bond order {}'.format(bond_order))
            bond.bond_type = bond_types[bond_order]

    # Now add the hydrogens, and adjust charged atoms if the total bond
    # counts indicate that.
    expected_hydrogens = compute_bonded_hydrogens(
        bond_topology, compute_adjacency_matrix(bond_topology))
    for atom_idx, (actual_h, expected_h) in enumerate(
            zip(hydrogens_string, expected_hydrogens)):
        actual_h = int(actual_h)
        diff = expected_h - actual_h
        atom_type = bond_topology.atoms[atom_idx]
        if diff == -1 and atom_type == atom_type_enum.ATOM_N:
            # One hydrogen more than expected on nitrogen: mark it as N+.
            bond_topology.atoms[atom_idx] = atom_type_enum.ATOM_NPOS
        elif diff == 1 and atom_type == atom_type_enum.ATOM_O:
            # One hydrogen fewer than expected on oxygen: mark it as O-.
            bond_topology.atoms[atom_idx] = atom_type_enum.ATOM_ONEG
        elif diff:
            # Both halves must be f-strings; the original left the second
            # half plain, so {atom_type} and {atom_idx} were never filled in.
            raise ValueError(
                f'Bad hydrogen count (actual={actual_h}, '
                f'expected={expected_h}) '
                f'for {atom_type}, index {atom_idx}')

        for _ in range(actual_h):
            bond_topology.atoms.append(atom_type_enum.ATOM_H)
            h_idx = len(bond_topology.atoms) - 1
            bond = bond_topology.bonds.add()
            bond.atom_a = atom_idx
            bond.atom_b = h_idx
            bond.bond_type = bond_type_enum.BOND_SINGLE

    return bond_topology
def str_to_bond_topology(s):
    """Parse a text-format proto string into a new BondTopology.

    Args:
      s: text-format representation of a BondTopology.

    Returns:
      The populated dataset_pb2.BondTopology.
    """
    # text_format.Parse returns the message it was handed.
    return text_format.Parse(s, dataset_pb2.BondTopology())
def test_single_fragment_two_disconnected_atoms(self):
    """Two carbons with no bond between them are not one fragment."""
    atoms_only_text = """
      atoms: ATOM_C
      atoms: ATOM_C
    """
    topology = text_format.Parse(atoms_only_text, dataset_pb2.BondTopology())
    connected = utilities.is_single_fragment(topology)
    self.assertFalse(connected)
def test_single_fragment_single_atom(self):
    """A lone carbon counts as a single fragment."""
    lone_atom_text = """
      atoms: ATOM_C
    """
    topology = text_format.Parse(lone_atom_text, dataset_pb2.BondTopology())
    connected = utilities.is_single_fragment(topology)
    self.assertTrue(connected)
def test_scores(self):
    """Single and double C-C topologies are both found and scored.

    Two carbons are placed 1.4 apart (converted via BOHR_TO_ANGSTROMS). The
    single-bond distribution comes from triangular_distribution(1.0, 1.4,
    2.0) and the double-bond one from triangular_distribution(1.0, 1.5, 2.0).
    The single-bond topology must rank first, the exponentiated topology
    scores must sum to 1, and each geometry score must equal the log pdf of
    its distribution at 1.4.
    """
    carbon = dataset_pb2.BondTopology.ATOM_C
    single_bond = dataset_pb2.BondTopology.BondType.BOND_SINGLE
    double_bond = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

    # For testing, turn off the need for complete matching.
    smu_molecule.default_must_match_all_bonds = False

    all_distributions = (
        bond_length_distribution.AllAtomPairLengthDistributions())

    # Register one empirical distribution per bond type, single bond first.
    fitted = {}
    for bond_type, middle in ((single_bond, 1.4), (double_bond, 1.5)):
        x, y = triangular_distribution(1.0, middle, 2.0)
        df = pd.DataFrame({"length": x, "count": y})
        fitted[bond_type] = (
            bond_length_distribution.EmpiricalLengthDistribution(df, 0.0))
        all_distributions.add(carbon, carbon, bond_type, fitted[bond_type])
    bldc1c = fitted[single_bond]
    bldc2c = fitted[double_bond]

    bond_topology = text_format.Parse(
        """
      atoms: ATOM_C
      atoms: ATOM_C
      bonds: {
        atom_a: 0
        atom_b: 1
        bond_type: BOND_SINGLE
      }
    """, dataset_pb2.BondTopology())

    geometry = text_format.Parse(
        """
      atom_positions {
        x: 0.0
        y: 0.0
        z: 0.0
      },
      atom_positions {
        x: 0.0
        y: 0.0
        z: 0.0
      }
    """, dataset_pb2.Geometry())
    geometry.atom_positions[1].x = 1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS

    matching_parameters = smu_molecule.MatchingParameters()
    matching_parameters.must_match_all_bonds = False

    fate = dataset_pb2.Conformer.FATE_SUCCESS
    conformer_id = 1001
    result = topology_from_geom.bond_topologies_from_geom(
        all_distributions, conformer_id, fate, bond_topology, geometry,
        matching_parameters)

    self.assertIsNotNone(result)
    self.assertLen(result.bond_topology, 2)
    best, runner_up = result.bond_topology[0], result.bond_topology[1]

    self.assertLen(best.bonds, 1)
    self.assertLen(runner_up.bonds, 1)
    self.assertEqual(best.bonds[0].bond_type, single_bond)
    self.assertEqual(runner_up.bonds[0].bond_type, double_bond)
    self.assertGreater(best.topology_score, runner_up.topology_score)

    topology_scores = [bt.topology_score for bt in result.bond_topology]
    self.assertAlmostEqual(np.sum(np.exp(topology_scores)), 1.0)

    self.assertAlmostEqual(best.geometry_score, np.log(bldc1c.pdf(1.4)))
    self.assertAlmostEqual(runner_up.geometry_score,
                           np.log(bldc2c.pdf(1.4)))
def test_scores(self):
    """Single and double C-C topologies are both found and scored.

    A Molecule with two carbons placed 1.4 apart (converted via
    BOHR_TO_ANGSTROMS) is matched against a single-bond distribution from
    triangular_distribution(1.0, 1.4, 2.0) and a double-bond distribution
    from triangular_distribution(1.0, 1.5, 2.0). The single-bond topology
    must rank first, the exponentiated topology scores must sum to 1, and
    each geometry score must equal the log pdf of its distribution at 1.4.
    """
    carbon = dataset_pb2.BondTopology.ATOM_C
    single_bond = dataset_pb2.BondTopology.BondType.BOND_SINGLE
    double_bond = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

    # For testing, turn off the need for complete matching.
    topology_molecule.default_must_match_all_bonds = False

    all_distributions = (
        bond_length_distribution.AllAtomPairLengthDistributions())

    # Register one distribution per bond type, single bond first.
    fitted = {}
    for bond_type, middle in ((single_bond, 1.4), (double_bond, 1.5)):
        fitted[bond_type] = triangular_distribution(1.0, middle, 2.0)
        all_distributions.add(carbon, carbon, bond_type, fitted[bond_type])
    bldc1c = fitted[single_bond]
    bldc2c = fitted[double_bond]

    starting_topology = text_format.Parse(
        """
      atoms: ATOM_C
      atoms: ATOM_C
      bonds: {
        atom_a: 0
        atom_b: 1
        bond_type: BOND_SINGLE
      }
    """, dataset_pb2.BondTopology())
    starting_geometry = text_format.Parse(
        """
      atom_positions {
        x: 0.0
        y: 0.0
        z: 0.0
      },
      atom_positions {
        x: 0.0
        y: 0.0
        z: 0.0
      }
    """, dataset_pb2.Geometry())

    molecule = dataset_pb2.Molecule()
    molecule.bond_topologies.append(starting_topology)
    molecule.optimized_geometry.MergeFrom(starting_geometry)
    molecule.optimized_geometry.atom_positions[1].x = (
        1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS)

    matching_parameters = topology_molecule.MatchingParameters()
    matching_parameters.must_match_all_bonds = False

    molecule.properties.errors.fate = dataset_pb2.Properties.FATE_SUCCESS
    molecule.molecule_id = 1001

    result = topology_from_geom.bond_topologies_from_geom(
        molecule, all_distributions, matching_parameters)

    self.assertIsNotNone(result)
    self.assertLen(result.bond_topology, 2)
    best, runner_up = result.bond_topology[0], result.bond_topology[1]

    self.assertLen(best.bonds, 1)
    self.assertLen(runner_up.bonds, 1)
    self.assertEqual(best.bonds[0].bond_type, single_bond)
    self.assertEqual(runner_up.bonds[0].bond_type, double_bond)
    self.assertGreater(best.topology_score, runner_up.topology_score)

    topology_scores = [bt.topology_score for bt in result.bond_topology]
    self.assertAlmostEqual(np.sum(np.exp(topology_scores)), 1.0)

    self.assertAlmostEqual(best.geometry_score, np.log(bldc1c.pdf(1.4)))
    self.assertAlmostEqual(runner_up.geometry_score,
                           np.log(bldc2c.pdf(1.4)))