Esempio n. 1
0
    def output(self, conformer):
        """Re-detects bond topologies for a Conformer and forwards it.

        The topologies are recomputed from the conformer's optimized geometry.
        If at least one topology matches, the conformer's bond_topologies are
        replaced by the matches and each one gets its id looked up by SMILES;
        otherwise an error is logged and the conformer is left untouched. In
        both cases the conformer is handed to the wrapped outputter.

        Args:
          conformer: dataset_pb2.Conformer
        """
        detected = topology_from_geom.bond_topologies_from_geom(
            bond_lengths=self._geometry_data.bond_lengths,
            conformer_id=conformer.conformer_id,
            fate=conformer.fate,
            bond_topology=conformer.bond_topologies[0],
            geometry=conformer.optimized_geometry,
            matching_parameters=self._matching_parameters)

        if detected.bond_topology:
            # Swap the stored topologies for the freshly detected ones.
            del conformer.bond_topologies[:]
            conformer.bond_topologies.extend(detected.bond_topology)
            for topology in conformer.bond_topologies:
                try:
                    topology.bond_topology_id = (
                        self._geometry_data.smiles_id_dict[topology.smiles])
                except KeyError:
                    # Unknown SMILES: keep the topology, just without an id.
                    logging.error(
                        'Did not find bond topology id for smiles %s',
                        topology.smiles)
        else:
            logging.error('No bond topology matched for %s',
                          conformer.conformer_id)

        self._wrapped_outputter.output(conformer)
Esempio n. 2
0
    def _add_alternative_bond_topologies(self, conformer, smiles_id_dict):
        """Replaces the conformer's bond topologies with geometry-detected ones.

        Beam counters record every attempt, every conformer for which no
        topology matched, and every matched topology whose SMILES is missing
        from `smiles_id_dict`. When nothing matches the conformer is left
        unchanged.

        Args:
          conformer: presumably a dataset_pb2.Conformer (verify against the
            caller); modified in place.
          smiles_id_dict: mapping indexed by SMILES string whose values are
            stored as bond_topology_id.
        """

        def bump(counter_name):
            # All counters of this method live in the same metrics namespace.
            beam.metrics.Metrics.counter(_METRICS_NAMESPACE,
                                         counter_name).inc()

        bump('attempted_topology_matches')

        params = smu_molecule.MatchingParameters()
        for option, value in (
            ('must_match_all_bonds', True),
            ('smiles_with_h', False),
            ('smiles_with_labels', False),
            ('neutral_forms_during_bond_matching', True),
            ('consider_not_bonded', True),
            ('ring_atom_count_cannot_decrease', False),
        ):
            setattr(params, option, value)

        detected = topology_from_geom.bond_topologies_from_geom(
            bond_lengths=self._cached_bond_lengths,
            conformer_id=conformer.conformer_id,
            fate=conformer.fate,
            bond_topology=conformer.bond_topologies[0],
            geometry=conformer.optimized_geometry,
            matching_parameters=params)

        if not detected.bond_topology:
            bump('no_topology_matches')
            return

        del conformer.bond_topologies[:]
        conformer.bond_topologies.extend(detected.bond_topology)
        for topology in conformer.bond_topologies:
            try:
                topology.bond_topology_id = smiles_id_dict[topology.smiles]
            except KeyError:
                bump('topology_match_smiles_failure')
    def test_scores(self):
        carbon = dataset_pb2.BondTopology.AtomType.ATOM_C
        single_bond = dataset_pb2.BondTopology.BondType.BOND_SINGLE
        double_bond = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

        # For testing, turn off the need for complete matching.
        smu_molecule.default_must_match_all_bonds = False

        all_distributions = bond_length_distribution.AllAtomPairLengthDistributions(
        )
        x, y = triangular_distribution(1.0, 1.4, 2.0)
        df = pd.DataFrame({"length": x, "count": y})
        bldc1c = bond_length_distribution.EmpiricalLengthDistribution(df, 0.0)
        all_distributions.add(carbon, carbon, single_bond, bldc1c)

        x, y = triangular_distribution(1.0, 1.5, 2.0)
        df = pd.DataFrame({"length": x, "count": y})
        bldc2c = bond_length_distribution.EmpiricalLengthDistribution(df, 0.0)
        all_distributions.add(carbon, carbon, double_bond, bldc2c)

        bond_topology = text_format.Parse(
            """
atoms: ATOM_C
atoms: ATOM_C
bonds: {
  atom_a: 0
  atom_b: 1
  bond_type: BOND_SINGLE
}
""", dataset_pb2.BondTopology())

        geometry = text_format.Parse(
            """
atom_positions {
  x: 0.0
  y: 0.0
  z: 0.0
},
atom_positions {
  x: 0.0
  y: 0.0
  z: 0.0
}
""", dataset_pb2.Geometry())
        geometry.atom_positions[1].x = 1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS

        matching_parameters = smu_molecule.MatchingParameters()
        matching_parameters.must_match_all_bonds = False
        result = topology_from_geom.bond_topologies_from_geom(
            all_distributions, bond_topology, geometry, matching_parameters)
        self.assertIsNotNone(result)
        self.assertEqual(len(result.bond_topology), 2)
        self.assertEqual(len(result.bond_topology[0].bonds), 1)
        self.assertEqual(len(result.bond_topology[1].bonds), 1)
        self.assertGreater(result.bond_topology[0].score,
                           result.bond_topology[1].score)
        self.assertEqual(result.bond_topology[0].bonds[0].bond_type,
                         single_bond)
        self.assertEqual(result.bond_topology[1].bonds[0].bond_type,
                         double_bond)
Esempio n. 4
0
    def output(self, molecule):
        """Re-detects bond topologies for a Molecule and forwards it.

        If at least one topology matches, the molecule's bond_topologies are
        replaced by the matches; each is marked SOURCE_CUSTOM and gets its id
        looked up by SMILES in the database. Otherwise an error is logged and
        the molecule is left untouched. In both cases the molecule is handed to
        the wrapped outputter.

        Args:
          molecule: dataset_pb2.Molecule
        """
        detected = topology_from_geom.bond_topologies_from_geom(
            molecule,
            bond_lengths=self._geometry_data.bond_lengths,
            matching_parameters=self._matching_parameters)

        if detected.bond_topology:
            # Swap the stored topologies for the freshly detected ones.
            del molecule.bond_topologies[:]
            molecule.bond_topologies.extend(detected.bond_topology)
            for topology in molecule.bond_topologies:
                topology.source = dataset_pb2.BondTopology.SOURCE_CUSTOM
                try:
                    topology.bond_topology_id = (
                        self._db.find_bond_topology_id_for_smiles(
                            topology.smiles))
                except KeyError:
                    # Unknown SMILES: keep the topology, just without an id.
                    logging.error(
                        'Did not find bond topology id for smiles %s',
                        topology.smiles)
        else:
            logging.error('No bond topology matched for %s',
                          molecule.molecule_id)

        self._wrapped_outputter.output(molecule)
Esempio n. 5
0
  def test_multi_topology_detection(self):
    """Tests that we can find multiple versions of the same topology."""
    single = dataset_pb2.BondTopology.BondType.BOND_SINGLE
    double = dataset_pb2.BondTopology.BondType.BOND_DOUBLE
    nitrogen = dataset_pb2.BondTopology.ATOM_N

    all_dist = bond_length_distribution.AllAtomPairLengthDistributions()
    # Single bonds are built around 1.5, double bonds around 1.4.
    for bond_type, middle in ((single, 1.5), (double, 1.4)):
      all_dist.add(nitrogen, nitrogen, bond_type,
                   triangular_distribution(1.0, middle, 2.0))

    # The molecule is a flat aromatic ring of four nitrogens. Rotating the
    # single and double bonds gives the same topology with the individual
    # bond types swapped. The side lengths below are chosen so that one of the
    # two arrangements is favored.
    molecule = dataset_pb2.Molecule(molecule_id=123)
    molecule.properties.errors.fate = dataset_pb2.Properties.FATE_SUCCESS

    molecule.bond_topologies.add(bond_topology_id=123, smiles='N1=NN=N1')
    start_bt = molecule.bond_topologies[0]
    start_bt.atoms.extend([nitrogen] * 4)
    start_bt.bonds.extend([
        dataset_pb2.BondTopology.Bond(atom_a=a, atom_b=b, bond_type=kind)
        for a, b, kind in (
            (0, 1, single),
            (1, 2, double),
            (2, 3, single),
            (3, 0, double),
        )
    ])

    long_side = 1.5 / smu_utils_lib.BOHR_TO_ANGSTROMS
    short_side = 1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS
    corners = [
        (0, 0),
        (0, long_side),
        (short_side, long_side),
        (short_side, 0),
    ]
    molecule.optimized_geometry.atom_positions.extend(
        [dataset_pb2.Geometry.AtomPos(x=x, y=y, z=0) for x, y in corners])

    params = topology_molecule.MatchingParameters()
    result = topology_from_geom.bond_topologies_from_geom(
        molecule, all_dist, params)

    self.assertLen(result.bond_topology, 2)

    # Expected bond types per returned topology, in result order.
    expected_by_rank = [
        [(0, 1, single), (1, 2, double), (2, 3, single), (3, 0, double)],
        [(0, 1, double), (1, 2, single), (2, 3, double), (3, 0, single)],
    ]
    for rank, expected_bonds in enumerate(expected_by_rank):
      topology = result.bond_topology[rank]
      for atom_a, atom_b, expected_type in expected_bonds:
        self.assertEqual(
            smu_utils_lib.get_bond_type(topology, atom_a, atom_b),
            expected_type)
Esempio n. 6
0
    def find_by_topology(self,
                         smiles,
                         bond_lengths,
                         matching_parameters=None):
        """Find all molecules which have a detected bond topology.

        Note that this *redoes* the detection. If you want the default detected
        versions, you can just query by SMILES string. This is only useful if
        you adjust the distance thresholds for what a matching bond is.
        To adjust those, you probably want to use
        AllAtomPairLengthDistributions.add_from_string_spec

        Each yielded molecule has its bond_topologies replaced by the newly
        detected ones, marked SOURCE_CUSTOM and, where the SMILES is known,
        given their bond topology id.

        Args:
          smiles: smiles string for the target bond topology
          bond_lengths: AllAtomPairLengthDistributions
          matching_parameters: controls the algorithm for matching topologies.
            Generally should not need to be modified. When None (the default)
            a fresh topology_molecule.MatchingParameters() is created for this
            call.

        Yields:
          dataset_pb2.Molecule
        """
        # The default is built here rather than in the signature: a default
        # expression is evaluated once, when the function is defined, so every
        # call would otherwise share (and could mutate) one instance.
        if matching_parameters is None:
            matching_parameters = topology_molecule.MatchingParameters()

        query_bt = smu_utils_lib.rdkit_molecule_to_bond_topology(
            smu_utils_lib.smiles_to_rdkit_molecule(smiles))
        expanded_stoich = smu_utils_lib.expanded_stoichiometry_from_topology(
            query_bt)
        cnt_matched_molecule = 0
        cnt_molecule = 0
        logging.info('Starting query for %s with stoich %s', smiles,
                     expanded_stoich)
        for molecule in self.find_by_expanded_stoichiometry_list(
            [expanded_stoich]):
            if not smu_utils_lib.molecule_eligible_for_topology_detection(
                    molecule):
                continue
            cnt_molecule += 1
            matches = topology_from_geom.bond_topologies_from_geom(
                molecule,
                bond_lengths=bond_lengths,
                matching_parameters=matching_parameters)
            if not any(bt.smiles == smiles for bt in matches.bond_topology):
                continue
            cnt_matched_molecule += 1
            del molecule.bond_topologies[:]
            molecule.bond_topologies.extend(matches.bond_topology)
            for bt in molecule.bond_topologies:
                bt.source = dataset_pb2.BondTopology.SOURCE_CUSTOM
                # Only the id lookup is expected to fail, so it alone is
                # guarded.
                try:
                    bt.bond_topology_id = (
                        self.find_bond_topology_id_for_smiles(bt.smiles))
                except KeyError:
                    logging.error(
                        'Did not find bond topology id for smiles %s',
                        bt.smiles)
            yield molecule
        logging.info('Topology query for %s matched %d / %d', smiles,
                     cnt_matched_molecule, cnt_molecule)
Esempio n. 7
0
def topology_query(db, smiles):
    """Find all conformers which have a detected bond topology.

    Note that this *redoes* the detection. If you want to use the default
    detected versions, you can just query by SMILES string. This is only
    useful if you adjust the distance thresholds for what a matching bond is.

    Each yielded conformer has its bond_topologies replaced by the newly
    detected ones, with ids filled in where the SMILES is known.

    Args:
      db: smu_sqlite.SMUSQLite
      smiles: smiles string for the target bond topology

    Yields:
      dataset_pb2.Conformer
    """
    parsed = Chem.MolFromSmiles(smiles, sanitize=False)
    Chem.SanitizeMol(parsed, Chem.rdmolops.SanitizeFlags.SANITIZE_ADJUSTHS)
    query_bt = utilities.molecule_to_bond_topology(Chem.AddHs(parsed))
    expanded_stoich = smu_utils_lib.expanded_stoichiometry_from_topology(
        query_bt)
    params = _get_geometry_matching_parameters()
    geometry_data = GeometryData.get_singleton()
    num_examined = 0
    num_matched = 0
    logging.info('Starting query for %s with stoich %s', smiles,
                 expanded_stoich)
    for conformer in db.find_by_expanded_stoichiometry(expanded_stoich):
        if not smu_utils_lib.conformer_eligible_for_topology_detection(
                conformer):
            continue
        num_examined += 1
        detected = topology_from_geom.bond_topologies_from_geom(
            bond_lengths=geometry_data.bond_lengths,
            conformer_id=conformer.conformer_id,
            fate=conformer.fate,
            bond_topology=conformer.bond_topologies[0],
            geometry=conformer.optimized_geometry,
            matching_parameters=params)
        # Skip conformers for which the target SMILES was not detected.
        if not any(smiles == bt.smiles for bt in detected.bond_topology):
            continue
        num_matched += 1
        del conformer.bond_topologies[:]
        conformer.bond_topologies.extend(detected.bond_topology)
        for topology in conformer.bond_topologies:
            try:
                topology.bond_topology_id = (
                    geometry_data.smiles_id_dict[topology.smiles])
            except KeyError:
                logging.error('Did not find bond topology id for smiles %s',
                              topology.smiles)
        yield conformer
    logging.info('Topology query for %s matched %d / %d', smiles,
                 num_matched, num_examined)
    def process(self, molecule):
        """Called by Beam.

        Runs topology detection on `molecule` with this instance's bond
        lengths and yields the single result.

        Args:
          molecule: passed unchanged to
            topology_from_geom.bond_topologies_from_geom.

        Yields:
          dataset_pb2.TopologyMatches
        """
        # Adjust as needed, e.g. to skip unsuccessful calculations:
        # if molecule.properties.errors.fate != dataset_pb2.Properties.FATE_SUCCESS:
        #   return
        params = topology_molecule.MatchingParameters()
        params.neutral_forms_during_bond_matching = True
        params.must_match_all_bonds = True
        params.consider_not_bonded = True
        params.ring_atom_count_cannot_decrease = False

        topology_matches = topology_from_geom.bond_topologies_from_geom(
            molecule, self._bond_lengths, params)
        yield topology_matches
Esempio n. 9
0
    def process(self, conformer):
        """Called by Beam.

        Runs topology detection on the first bond topology and the optimized
        geometry of `conformer`, using this instance's bond lengths, and
        yields the single result.

        Args:
          conformer: object providing conformer_id, fate, bond_topologies and
            optimized_geometry (presumably a dataset_pb2.Conformer - confirm).

        Yields:
          dataset_pb2.TopologyMatches
        """
        # Adjust as needed, e.g. to skip unsuccessful calculations:
        #   if conformer.fate != dataset_pb2.Conformer.FATE_SUCCESS:
        #     return
        params = smu_molecule.MatchingParameters()
        for option, value in (
            ('neutral_forms_during_bond_matching', True),
            ('must_match_all_bonds', True),
            ('consider_not_bonded', True),
            ('ring_atom_count_cannot_decrease', False),
        ):
            setattr(params, option, value)

        topology_matches = topology_from_geom.bond_topologies_from_geom(
            self._bond_lengths, conformer.conformer_id, conformer.fate,
            conformer.bond_topologies[0], conformer.optimized_geometry,
            params)
        yield topology_matches
Esempio n. 10
0
    def test_multi_topology_detection(self):
        """Tests that we can find multiple versions of the same topology."""
        single = dataset_pb2.BondTopology.BondType.BOND_SINGLE
        double = dataset_pb2.BondTopology.BondType.BOND_DOUBLE
        nitrogen = dataset_pb2.BondTopology.ATOM_N

        all_dist = bond_length_distribution.AllAtomPairLengthDistributions()
        # Both bond types get their own identically configured fixed window.
        for bond_type in (single, double):
            window = bond_length_distribution.FixedWindowLengthDistribution(
                1.0, 2.0, None)
            all_dist.add(nitrogen, nitrogen, bond_type, window)

        # The conformer is a flat aromatic square of four nitrogens. Rotating
        # the single and double bonds gives the same topology with the
        # individual bond types swapped.
        conformer = dataset_pb2.Conformer()

        conformer.bond_topologies.add(bond_topology_id=123, smiles="N1=NN=N1")
        start_bt = conformer.bond_topologies[0]
        start_bt.atoms.extend([nitrogen] * 4)
        start_bt.bonds.extend([
            dataset_pb2.BondTopology.Bond(atom_a=a, atom_b=b, bond_type=kind)
            for a, b, kind in (
                (0, 1, single),
                (1, 2, double),
                (2, 3, single),
                (3, 0, double),
            )
        ])

        side = 1.5 / smu_utils_lib.BOHR_TO_ANGSTROMS
        corners = [(0, 0), (0, side), (side, side), (side, 0)]
        conformer.optimized_geometry.atom_positions.extend(
            [dataset_pb2.Geometry.AtomPos(x=x, y=y, z=0) for x, y in corners])

        params = smu_molecule.MatchingParameters()
        result = topology_from_geom.bond_topologies_from_geom(
            bond_lengths=all_dist,
            conformer_id=123,
            fate=dataset_pb2.Conformer.FATE_SUCCESS,
            bond_topology=conformer.bond_topologies[0],
            geometry=conformer.optimized_geometry,
            matching_parameters=params)

        self.assertLen(result.bond_topology, 2)

        # The returned order is arbitrary, so we figure out which one is
        # marked as the starting topology.
        starting_idx = min(
            idx for idx, bt in enumerate(result.bond_topology)
            if bt.is_starting_topology)
        # With exactly two results the other index is the complement.
        other_idx = 1 - starting_idx

        starting = result.bond_topology[starting_idx]
        self.assertTrue(starting.is_starting_topology)
        for atom_a, atom_b, expected_type in (
            (0, 1, single),
            (1, 2, double),
            (2, 3, single),
            (3, 0, double),
        ):
            self.assertEqual(
                smu_utils_lib.get_bond_type(starting, atom_a, atom_b),
                expected_type)

        other = result.bond_topology[other_idx]
        self.assertFalse(other.is_starting_topology)
        for atom_a, atom_b, expected_type in (
            (0, 1, double),
            (1, 2, single),
            (2, 3, double),
            (3, 0, single),
        ):
            self.assertEqual(
                smu_utils_lib.get_bond_type(other, atom_a, atom_b),
                expected_type)
Esempio n. 11
0
  def test_scores(self):
    carbon = dataset_pb2.BondTopology.ATOM_C
    single_bond = dataset_pb2.BondTopology.BondType.BOND_SINGLE
    double_bond = dataset_pb2.BondTopology.BondType.BOND_DOUBLE

    # For testing, turn off the need for complete matching.
    topology_molecule.default_must_match_all_bonds = False

    all_distributions = bond_length_distribution.AllAtomPairLengthDistributions(
    )
    bldc1c = triangular_distribution(1.0, 1.4, 2.0)
    all_distributions.add(carbon, carbon, single_bond, bldc1c)
    bldc2c = triangular_distribution(1.0, 1.5, 2.0)
    all_distributions.add(carbon, carbon, double_bond, bldc2c)

    molecule = dataset_pb2.Molecule()

    molecule.bond_topologies.append(
        text_format.Parse(
            """
atoms: ATOM_C
atoms: ATOM_C
bonds: {
  atom_a: 0
  atom_b: 1
  bond_type: BOND_SINGLE
}
""", dataset_pb2.BondTopology()))

    molecule.optimized_geometry.MergeFrom(
        text_format.Parse(
            """
atom_positions {
  x: 0.0
  y: 0.0
  z: 0.0
},
atom_positions {
  x: 0.0
  y: 0.0
  z: 0.0
}
""", dataset_pb2.Geometry()))
    molecule.optimized_geometry.atom_positions[1].x = (
        1.4 / smu_utils_lib.BOHR_TO_ANGSTROMS)

    matching_parameters = topology_molecule.MatchingParameters()
    matching_parameters.must_match_all_bonds = False
    molecule.properties.errors.fate = dataset_pb2.Properties.FATE_SUCCESS
    molecule.molecule_id = 1001
    result = topology_from_geom.bond_topologies_from_geom(
        molecule, all_distributions, matching_parameters)
    self.assertIsNotNone(result)
    self.assertLen(result.bond_topology, 2)
    self.assertLen(result.bond_topology[0].bonds, 1)
    self.assertLen(result.bond_topology[1].bonds, 1)
    self.assertEqual(result.bond_topology[0].bonds[0].bond_type, single_bond)
    self.assertEqual(result.bond_topology[1].bonds[0].bond_type, double_bond)
    self.assertGreater(result.bond_topology[0].topology_score,
                       result.bond_topology[1].topology_score)
    self.assertAlmostEqual(
        np.sum(np.exp([bt.topology_score for bt in result.bond_topology])), 1.0)
    self.assertAlmostEqual(result.bond_topology[0].geometry_score,
                           np.log(bldc1c.pdf(1.4)))
    self.assertAlmostEqual(result.bond_topology[1].geometry_score,
                           np.log(bldc2c.pdf(1.4)))