def output(self, conformer):
    """Re-detects bond topologies for a Conformer and passes it downstream.

    Topologies are recomputed from the optimized geometry with the bond
    lengths and matching parameters held by this object. If any topology is
    found, the Conformer's existing topologies are replaced by the detected
    ones and each gets its id looked up by SMILES (a failed lookup is only
    logged). If nothing is found, the Conformer is left as it was and an error
    is logged. Either way the Conformer is handed to the wrapped outputter.

    Args:
      conformer: dataset_pb2.Conformer
    """
    geometry_data = self._geometry_data
    detected = topology_from_geom.bond_topologies_from_geom(
        bond_lengths=geometry_data.bond_lengths,
        conformer_id=conformer.conformer_id,
        fate=conformer.fate,
        bond_topology=conformer.bond_topologies[0],
        geometry=conformer.optimized_geometry,
        matching_parameters=self._matching_parameters)

    if detected.bond_topology:
        # Swap the stored topologies for the freshly detected ones.
        del conformer.bond_topologies[:]
        conformer.bond_topologies.extend(detected.bond_topology)
        smiles_to_id = geometry_data.smiles_id_dict
        for topology in conformer.bond_topologies:
            try:
                topology.bond_topology_id = smiles_to_id[topology.smiles]
            except KeyError:
                logging.error('Did not find bond topology id for smiles %s',
                              topology.smiles)
    else:
        logging.error('No bond topology matched for %s',
                      conformer.conformer_id)

    self._wrapped_outputter.output(conformer)
def _add_alternative_bond_topologies(self, conformer, smiles_id_dict):
    """Replaces a conformer's bond topologies with geometry-detected ones.

    Detection runs against the cached bond lengths with a fixed set of
    matching parameters. Beam counters record every attempt, every conformer
    for which nothing matched, and every detected topology whose SMILES has
    no entry in `smiles_id_dict`. When nothing matches, the conformer is left
    unchanged.

    Args:
      conformer: conformer whose `bond_topologies` are rewritten in place
      smiles_id_dict: mapping from SMILES string to bond topology id
    """

    def _bump(counter_name):
        # All counters for this step live in the same metrics namespace.
        beam.metrics.Metrics.counter(_METRICS_NAMESPACE, counter_name).inc()

    _bump('attempted_topology_matches')

    params = smu_molecule.MatchingParameters()
    params.must_match_all_bonds = True
    params.smiles_with_h = False
    params.smiles_with_labels = False
    params.neutral_forms_during_bond_matching = True
    params.consider_not_bonded = True
    params.ring_atom_count_cannot_decrease = False

    detected = topology_from_geom.bond_topologies_from_geom(
        bond_lengths=self._cached_bond_lengths,
        conformer_id=conformer.conformer_id,
        fate=conformer.fate,
        bond_topology=conformer.bond_topologies[0],
        geometry=conformer.optimized_geometry,
        matching_parameters=params)

    if not detected.bond_topology:
        _bump('no_topology_matches')
        return

    del conformer.bond_topologies[:]
    conformer.bond_topologies.extend(detected.bond_topology)
    for topology in conformer.bond_topologies:
        try:
            topology.bond_topology_id = smiles_id_dict[topology.smiles]
        except KeyError:
            _bump('topology_match_smiles_failure')
def test_scores(self):
    """Checks the ordering of scored topologies for a two-carbon system.

    Two carbons are placed 1.4 / BOHR_TO_ANGSTROMS apart along x. Both a
    single and a double bond topology must be returned, with the single bond
    first and carrying the higher score.
    """
    carbon = dataset_pb2.BondTopology.AtomType.ATOM_C
    single_bond = dataset_pb2.BondTopology.BondType.BOND_SINGLE
    double_bond = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

    # For testing, turn off the need for complete matching.
    smu_molecule.default_must_match_all_bonds = False

    all_distributions = (
        bond_length_distribution.AllAtomPairLengthDistributions())

    def register_carbon_pair(bond_type, middle):
        # Registers an empirical C-C distribution built from a triangular
        # shape; `middle` is the second argument of triangular_distribution.
        lengths, counts = triangular_distribution(1.0, middle, 2.0)
        frame = pd.DataFrame({"length": lengths, "count": counts})
        all_distributions.add(
            carbon, carbon, bond_type,
            bond_length_distribution.EmpiricalLengthDistribution(frame, 0.0))

    register_carbon_pair(single_bond, 1.4)
    register_carbon_pair(double_bond, 1.5)

    bond_topology = text_format.Parse(
        """ atoms: ATOM_C atoms: ATOM_C bonds: { atom_a: 0 atom_b: 1 bond_type: BOND_SINGLE } """,
        dataset_pb2.BondTopology())
    geometry = text_format.Parse(
        """ atom_positions { x: 0.0 y: 0.0 z: 0.0 }, atom_positions { x: 0.0 y: 0.0 z: 0.0 } """,
        dataset_pb2.Geometry())
    # Move the second atom away from the origin along x.
    geometry.atom_positions[1].x = 1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS

    matching_parameters = smu_molecule.MatchingParameters()
    matching_parameters.must_match_all_bonds = False

    result = topology_from_geom.bond_topologies_from_geom(
        all_distributions, bond_topology, geometry, matching_parameters)

    self.assertIsNotNone(result)
    self.assertEqual(len(result.bond_topology), 2)
    best = result.bond_topology[0]
    runner_up = result.bond_topology[1]
    self.assertEqual(len(best.bonds), 1)
    self.assertEqual(len(runner_up.bonds), 1)
    self.assertGreater(best.score, runner_up.score)
    self.assertEqual(best.bonds[0].bond_type, single_bond)
    self.assertEqual(runner_up.bonds[0].bond_type, double_bond)
def output(self, molecule):
    """Re-detects bond topologies for a Molecule and passes it downstream.

    When detection finds at least one topology, the Molecule's topologies are
    replaced by the detected ones; each is marked SOURCE_CUSTOM and gets its
    id looked up from the database by SMILES (a failed lookup is only
    logged). When nothing is found, the Molecule is left as it was and an
    error is logged. Either way the Molecule is handed to the wrapped
    outputter.

    Args:
      molecule: dataset_pb2.Molecule
    """
    detected = topology_from_geom.bond_topologies_from_geom(
        molecule,
        bond_lengths=self._geometry_data.bond_lengths,
        matching_parameters=self._matching_parameters)

    if detected.bond_topology:
        # Swap the stored topologies for the freshly detected ones.
        del molecule.bond_topologies[:]
        molecule.bond_topologies.extend(detected.bond_topology)
        for topology in molecule.bond_topologies:
            topology.source = dataset_pb2.BondTopology.SOURCE_CUSTOM
            try:
                topology.bond_topology_id = (
                    self._db.find_bond_topology_id_for_smiles(
                        topology.smiles))
            except KeyError:
                logging.error('Did not find bond topology id for smiles %s',
                              topology.smiles)
    else:
        logging.error('No bond topology matched for %s', molecule.molecule_id)

    self._wrapped_outputter.output(molecule)
def test_multi_topology_detection(self):
    """Tests that we can find multiple versions of the same topology."""
    nitrogen = dataset_pb2.BondTopology.ATOM_N
    single = dataset_pb2.BondTopology.BondType.BOND_SINGLE
    double = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

    all_dist = bond_length_distribution.AllAtomPairLengthDistributions()
    for bond_type, middle in ((single, 1.5), (double, 1.4)):
        all_dist.add(nitrogen, nitrogen, bond_type,
                     triangular_distribution(1.0, middle, 2.0))

    # Four nitrogens in a flat ring with alternating single/double bonds.
    # Shifting every bond by one position gives the same topology with the
    # single and double bonds swapped. The rectangle's side lengths are
    # chosen so that one of the two arrangements is preferred.
    molecule = dataset_pb2.Molecule(molecule_id=123)
    molecule.properties.errors.fate = dataset_pb2.Properties.FATE_SUCCESS
    molecule.bond_topologies.add(bond_topology_id=123, smiles='N1=NN=N1')
    start_bt = molecule.bond_topologies[0]
    start_bt.atoms.extend([nitrogen] * 4)

    starting_ring = [
        (0, 1, single),
        (1, 2, double),
        (2, 3, single),
        (3, 0, double),
    ]
    start_bt.bonds.extend([
        dataset_pb2.BondTopology.Bond(atom_a=a, atom_b=b, bond_type=kind)
        for a, b, kind in starting_ring
    ])

    dist15a = 1.5 / smu_utils_lib.BOHR_TO_ANGSTROMS
    dist14a = 1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS
    corners = [(0, 0), (0, dist15a), (dist14a, dist15a), (dist14a, 0)]
    molecule.optimized_geometry.atom_positions.extend([
        dataset_pb2.Geometry.AtomPos(x=x, y=y, z=0) for x, y in corners
    ])

    matching_parameters = topology_molecule.MatchingParameters()
    result = topology_from_geom.bond_topologies_from_geom(
        molecule, all_dist, matching_parameters)

    self.assertLen(result.bond_topology, 2)

    # The first result must be the starting arrangement.
    first = result.bond_topology[0]
    for a, b, kind in starting_ring:
        self.assertEqual(smu_utils_lib.get_bond_type(first, a, b), kind)

    # The second result has every single/double bond swapped.
    second = result.bond_topology[1]
    swapped_ring = [
        (0, 1, double),
        (1, 2, single),
        (2, 3, double),
        (3, 0, single),
    ]
    for a, b, kind in swapped_ring:
        self.assertEqual(smu_utils_lib.get_bond_type(second, a, b), kind)
def find_by_topology(self, smiles, bond_lengths, matching_parameters=None):
    """Find all molecules which have a detected bond topology.

    Note that this *redoes* the detection. If you want the default detected
    versions, you can just query by SMILES string. This is only useful if you
    adjust the distance thresholds for what a matching bond is. To adjust
    those, you probably want to use
    AllAtomPairLengthDistributions.add_from_string_spec

    Each yielded molecule has had its bond topologies replaced by the newly
    detected ones, marked SOURCE_CUSTOM, with ids looked up by SMILES (a
    failed lookup is only logged).

    Args:
      smiles: smiles string for the target bond topology
      bond_lengths: AllAtomPairLengthDistributions
      matching_parameters: controls the algorithm for matching topologies.
        Generally should not need to be modified. When None (the default), a
        fresh topology_molecule.MatchingParameters() is created for the call.

    Yields:
      dataset_pb2.Molecule
    """
    # Build the default per call. A default instance created in the signature
    # would be shared by every call, so a mutation made through one call
    # would silently affect all later ones.
    if matching_parameters is None:
        matching_parameters = topology_molecule.MatchingParameters()

    query_bt = smu_utils_lib.rdkit_molecule_to_bond_topology(
        smu_utils_lib.smiles_to_rdkit_molecule(smiles))
    expanded_stoich = smu_utils_lib.expanded_stoichiometry_from_topology(
        query_bt)

    cnt_matched_molecule = 0
    cnt_molecule = 0
    logging.info('Starting query for %s with stoich %s', smiles,
                 expanded_stoich)

    # Candidates are narrowed by stoichiometry first; detection is then rerun
    # on each eligible candidate.
    for molecule in self.find_by_expanded_stoichiometry_list(
            [expanded_stoich]):
        if not smu_utils_lib.molecule_eligible_for_topology_detection(
                molecule):
            continue
        cnt_molecule += 1

        matches = topology_from_geom.bond_topologies_from_geom(
            molecule,
            bond_lengths=bond_lengths,
            matching_parameters=matching_parameters)
        if not any(bt.smiles == smiles for bt in matches.bond_topology):
            continue
        cnt_matched_molecule += 1

        del molecule.bond_topologies[:]
        molecule.bond_topologies.extend(matches.bond_topology)
        for bt in molecule.bond_topologies:
            bt.source = dataset_pb2.BondTopology.SOURCE_CUSTOM
            # Only the id lookup is expected to raise KeyError.
            try:
                bt.bond_topology_id = self.find_bond_topology_id_for_smiles(
                    bt.smiles)
            except KeyError:
                logging.error('Did not find bond topology id for smiles %s',
                              bt.smiles)
        yield molecule

    logging.info('Topology query for %s matched %d / %d', smiles,
                 cnt_matched_molecule, cnt_molecule)
def topology_query(db, smiles):
    """Find all conformers which have a detected bond topology.

    Note that this *redoes* the detection. If you want to use the default
    detected versions, you can just query by SMILES string. This is only
    useful if you adjust the distance thresholds for what a matching bond is.

    Each yielded conformer has had its bond topologies replaced by the newly
    detected ones, with ids looked up by SMILES (a failed lookup is only
    logged).

    Args:
      db: smu_sqlite.SMUSQLite
      smiles: smiles string for the target bond topology

    Yields:
      dataset_pb2.Conformer
    """
    # Turn the query SMILES into a bond topology with explicit hydrogens.
    rdkit_mol = Chem.MolFromSmiles(smiles, sanitize=False)
    Chem.SanitizeMol(rdkit_mol,
                     Chem.rdmolops.SanitizeFlags.SANITIZE_ADJUSTHS)
    rdkit_mol = Chem.AddHs(rdkit_mol)
    target_topology = utilities.molecule_to_bond_topology(rdkit_mol)
    expanded_stoich = smu_utils_lib.expanded_stoichiometry_from_topology(
        target_topology)

    matching_parameters = _get_geometry_matching_parameters()
    geometry_data = GeometryData.get_singleton()

    seen = 0
    matched = 0
    logging.info('Starting query for %s with stoich %s', smiles,
                 expanded_stoich)

    for conformer in db.find_by_expanded_stoichiometry(expanded_stoich):
        if not smu_utils_lib.conformer_eligible_for_topology_detection(
                conformer):
            continue
        seen += 1

        detected = topology_from_geom.bond_topologies_from_geom(
            bond_lengths=geometry_data.bond_lengths,
            conformer_id=conformer.conformer_id,
            fate=conformer.fate,
            bond_topology=conformer.bond_topologies[0],
            geometry=conformer.optimized_geometry,
            matching_parameters=matching_parameters)
        if not any(bt.smiles == smiles for bt in detected.bond_topology):
            continue
        matched += 1

        del conformer.bond_topologies[:]
        conformer.bond_topologies.extend(detected.bond_topology)
        for topology in conformer.bond_topologies:
            try:
                topology.bond_topology_id = geometry_data.smiles_id_dict[
                    topology.smiles]
            except KeyError:
                logging.error('Did not find bond topology id for smiles %s',
                              topology.smiles)
        yield conformer

    logging.info('Topology query for %s matched %d / %d', smiles, matched,
                 seen)
def process(self, molecule):
    """Called by Beam.

    Yields a TopologyMatches for the plausible BondTopology's in `molecule`.

    Args:
      molecule: the input element whose topologies are detected

    Yields:
      dataset_pb2.TopologyMatches
    """
    # Adjust as needed: to process only successful molecules, return early
    # here when molecule.properties.errors.fate is not
    # dataset_pb2.Properties.FATE_SUCCESS.
    params = topology_molecule.MatchingParameters()
    params.neutral_forms_during_bond_matching = True
    params.must_match_all_bonds = True
    params.consider_not_bonded = True
    params.ring_atom_count_cannot_decrease = False

    topology_matches = topology_from_geom.bond_topologies_from_geom(
        molecule, self._bond_lengths, params)
    yield topology_matches
def process(self, conformer):
    """Called by Beam.

    Yields a TopologyMatches for the plausible BondTopology's in `conformer`.

    Args:
      conformer: the input element whose topologies are detected

    Yields:
      dataset_pb2.TopologyMatches
    """
    # Adjust as needed: to process only successful conformers, return early
    # here when conformer.fate is not dataset_pb2.Conformer.FATE_SUCCESS.
    params = smu_molecule.MatchingParameters()
    params.neutral_forms_during_bond_matching = True
    params.must_match_all_bonds = True
    params.consider_not_bonded = True
    params.ring_atom_count_cannot_decrease = False

    # Detection starts from the conformer's first stored topology.
    starting_topology = conformer.bond_topologies[0]
    topology_matches = topology_from_geom.bond_topologies_from_geom(
        self._bond_lengths,
        conformer.conformer_id,
        conformer.fate,
        starting_topology,
        conformer.optimized_geometry,
        params)
    yield topology_matches
def test_multi_topology_detection(self):
    """Tests that we can find multiple versions of the same topology."""
    nitrogen = dataset_pb2.BondTopology.ATOM_N
    single = dataset_pb2.BondTopology.BondType.BOND_SINGLE
    double = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

    # Single and double N-N bonds get identical fixed-window distributions.
    all_dist = bond_length_distribution.AllAtomPairLengthDistributions()
    for bond_type in (single, double):
        window = bond_length_distribution.FixedWindowLengthDistribution(
            1.0, 2.0, None)
        all_dist.add(nitrogen, nitrogen, bond_type, window)

    # Four nitrogens in a flat square with alternating single/double bonds.
    # Shifting every bond by one position gives the same topology with the
    # single and double bonds swapped.
    conformer = dataset_pb2.Conformer()
    conformer.bond_topologies.add(bond_topology_id=123, smiles="N1=NN=N1")
    start_bt = conformer.bond_topologies[0]
    start_bt.atoms.extend([nitrogen] * 4)

    starting_ring = [
        (0, 1, single),
        (1, 2, double),
        (2, 3, single),
        (3, 0, double),
    ]
    start_bt.bonds.extend([
        dataset_pb2.BondTopology.Bond(atom_a=a, atom_b=b, bond_type=kind)
        for a, b, kind in starting_ring
    ])

    dist15a = 1.5 / smu_utils_lib.BOHR_TO_ANGSTROMS
    corners = [(0, 0), (0, dist15a), (dist15a, dist15a), (dist15a, 0)]
    conformer.optimized_geometry.atom_positions.extend([
        dataset_pb2.Geometry.AtomPos(x=x, y=y, z=0) for x, y in corners
    ])

    matching_parameters = smu_molecule.MatchingParameters()
    result = topology_from_geom.bond_topologies_from_geom(
        bond_lengths=all_dist,
        conformer_id=123,
        fate=dataset_pb2.Conformer.FATE_SUCCESS,
        bond_topology=start_bt,
        geometry=conformer.optimized_geometry,
        matching_parameters=matching_parameters)

    self.assertLen(result.bond_topology, 2)

    # The returned order is arbitrary, so find which result is marked as the
    # starting topology; the remaining index is the alternative.
    starting_idx = min(
        idx for idx, candidate in enumerate(result.bond_topology)
        if candidate.is_starting_topology)
    other_idx = 1 - starting_idx

    starting = result.bond_topology[starting_idx]
    self.assertTrue(starting.is_starting_topology)
    for a, b, kind in starting_ring:
        self.assertEqual(smu_utils_lib.get_bond_type(starting, a, b), kind)

    other = result.bond_topology[other_idx]
    self.assertFalse(other.is_starting_topology)
    swapped_ring = [
        (0, 1, double),
        (1, 2, single),
        (2, 3, double),
        (3, 0, single),
    ]
    for a, b, kind in swapped_ring:
        self.assertEqual(smu_utils_lib.get_bond_type(other, a, b), kind)
def test_scores(self):
    """Checks ordering and score values for a two-carbon molecule.

    Two carbons are placed 1.4 / BOHR_TO_ANGSTROMS apart along x. Both a
    single and a double bond topology must be returned, single first. The
    topology scores must exponentiate to a total of 1, and each geometry
    score must equal the log of that bond type's pdf at 1.4.
    """
    carbon = dataset_pb2.BondTopology.ATOM_C
    single_bond = dataset_pb2.BondTopology.BondType.BOND_SINGLE
    double_bond = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

    # For testing, turn off the need for complete matching.
    topology_molecule.default_must_match_all_bonds = False

    all_distributions = (
        bond_length_distribution.AllAtomPairLengthDistributions())
    single_dist = triangular_distribution(1.0, 1.4, 2.0)
    all_distributions.add(carbon, carbon, single_bond, single_dist)
    double_dist = triangular_distribution(1.0, 1.5, 2.0)
    all_distributions.add(carbon, carbon, double_bond, double_dist)

    molecule = dataset_pb2.Molecule()
    starting_topology = text_format.Parse(
        """ atoms: ATOM_C atoms: ATOM_C bonds: { atom_a: 0 atom_b: 1 bond_type: BOND_SINGLE } """,
        dataset_pb2.BondTopology())
    molecule.bond_topologies.append(starting_topology)
    parsed_geometry = text_format.Parse(
        """ atom_positions { x: 0.0 y: 0.0 z: 0.0 }, atom_positions { x: 0.0 y: 0.0 z: 0.0 } """,
        dataset_pb2.Geometry())
    molecule.optimized_geometry.MergeFrom(parsed_geometry)
    # Move the second atom away from the origin along x.
    molecule.optimized_geometry.atom_positions[1].x = (
        1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS)

    matching_parameters = topology_molecule.MatchingParameters()
    matching_parameters.must_match_all_bonds = False

    molecule.properties.errors.fate = dataset_pb2.Properties.FATE_SUCCESS
    molecule.molecule_id = 1001

    result = topology_from_geom.bond_topologies_from_geom(
        molecule, all_distributions, matching_parameters)

    self.assertIsNotNone(result)
    self.assertLen(result.bond_topology, 2)
    best = result.bond_topology[0]
    runner_up = result.bond_topology[1]
    self.assertLen(best.bonds, 1)
    self.assertLen(runner_up.bonds, 1)
    self.assertEqual(best.bonds[0].bond_type, single_bond)
    self.assertEqual(runner_up.bonds[0].bond_type, double_bond)
    self.assertGreater(best.topology_score, runner_up.topology_score)

    # Exponentiated topology scores must add up to 1 across all results.
    exponentiated = np.exp(
        [candidate.topology_score for candidate in result.bond_topology])
    self.assertAlmostEqual(np.sum(exponentiated), 1.0)

    self.assertAlmostEqual(best.geometry_score,
                           np.log(single_dist.pdf(1.4)))
    self.assertAlmostEqual(runner_up.geometry_score,
                           np.log(double_dist.pdf(1.4)))