Exemple #1
0
 def test_fully_saturated(self):
     """Checks the expanded stoichiometry of single-atom topologies.

     Each case builds a one-atom bond topology with an empty second argument
     and compares the canonical stoichiometry string against the expected
     value. The third argument to create_bond_topology is presumably the
     hydrogen count for the atom (the expected strings agree with that
     reading) -- confirm against smu_utils_lib.
     """
     cases = (
         ('C', '4', '(ch4)'),
         ('N', '3', '(nh3)'),
         ('O', '2', '(oh2)'),
         ('F', '1', '(fh)'),
     )
     # Cases run in order and the first mismatch fails the test.
     for atom, hydrogens, expected in cases:
         topology = smu_utils_lib.create_bond_topology(atom, '', hydrogens)
         got = smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(
             topology)
         self.assertEqual(got, expected)
Exemple #2
0
def topology_query(db, smiles):
  """Yields conformers whose re-detected bond topologies include `smiles`.

  Topology detection is *rerun* for every candidate conformer rather than
  relying on what is stored. If the stored detection results are good enough,
  querying by SMILES string directly is simpler; this function is only useful
  when the distance thresholds that define a matching bond have been adjusted.

  Candidates are looked up in the database by the expanded stoichiometry of
  the query molecule. Each yielded conformer has had its bond_topologies
  replaced by the freshly detected ones.

  Args:
    db: smu_sqlite.SMUSQLite
    smiles: smiles string for the target bond topology

  Yields:
    dataset_pb2.Conformer
  """
  query_mol = Chem.MolFromSmiles(smiles, sanitize=False)
  Chem.SanitizeMol(query_mol, Chem.rdmolops.SanitizeFlags.SANITIZE_ADJUSTHS)
  query_mol = Chem.AddHs(query_mol)
  expanded_stoich = smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(
      utilities.molecule_to_bond_topology(query_mol))
  matching_parameters = _get_geometry_matching_parameters()
  geometry_data = GeometryData.get_singleton()

  num_examined = 0
  num_matched = 0
  logging.info('Starting query for %s with stoich %s', smiles, expanded_stoich)
  for conformer in db.find_by_expanded_stoichiometry(expanded_stoich):
    if not smu_utils_lib.conformer_eligible_for_topology_detection(conformer):
      continue
    num_examined += 1

    detected = topology_from_geom.bond_topologies_from_geom(
        bond_lengths=geometry_data.bond_lengths,
        conformer_id=conformer.conformer_id,
        fate=conformer.fate,
        bond_topology=conformer.bond_topologies[0],
        geometry=conformer.optimized_geometry,
        matching_parameters=matching_parameters)
    if not any(found.smiles == smiles for found in detected.bond_topology):
      continue
    num_matched += 1

    # Swap the stored topologies for the ones detected just now.
    del conformer.bond_topologies[:]
    conformer.bond_topologies.extend(detected.bond_topology)
    for topology in conformer.bond_topologies:
      try:
        known_id = geometry_data.smiles_id_dict[topology.smiles]
      except KeyError:
        # An unknown smiles is logged; the conformer is still yielded.
        logging.error('Did not find bond topology id for smiles %s',
                      topology.smiles)
      else:
        topology.bond_topology_id = known_id
    yield conformer

  logging.info('Topology query for %s matched %d / %d', smiles,
               num_matched, num_examined)
Exemple #3
0
 def test_ethylene(self):
   """Two carbons with two hydrogens each collapse to '(ch2)2'."""
   topology = smu_utils_lib.create_bond_topology('CC', '2', '22')
   got = smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(topology)
   self.assertEqual(got, '(ch2)2')
Exemple #4
0
 def test_cyclobutane(self):
   """Four carbons with two hydrogens each collapse to '(ch2)4'."""
   topology = smu_utils_lib.create_bond_topology('CCCC', '110011', '2222')
   got = smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(topology)
   self.assertEqual(got, '(ch2)4')
Exemple #5
0
 def test_nplus_oneg(self):
   """An N/O pair built with hydrogens '30' is reported as '(nh3)(o)'."""
   topology = smu_utils_lib.create_bond_topology('NO', '1', '30')
   got = smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(topology)
   self.assertEqual(got, '(nh3)(o)')
Exemple #6
0
 def test_fluorine(self):
   """Hydrogen-free O and F atoms are reported as '(o)(f)2'."""
   topology = smu_utils_lib.create_bond_topology('OFF', '110', '000')
   got = smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(topology)
   self.assertEqual(got, '(o)(f)2')
Exemple #7
0
 def test_acrylic_acid(self):
   """Atoms with differing hydrogen counts are listed as separate groups."""
   topology = smu_utils_lib.create_bond_topology('CCCOO', '2000100210',
                                                 '21001')
   got = smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(topology)
   self.assertEqual(got, '(c)(ch)(ch2)(o)(oh)')
  def bulk_insert(self, encoded_conformers, batch_size=10000, limit=None):
    """Inserts conformers into the database.

    Every record is parsed to read its conformer id and bond topologies, and
    is stored snappy-compressed. Rows are queued for the conformer, bond
    topology id and smiles tables and written in transactions of `batch_size`
    records, followed by one final commit for whatever is still pending.

    Args:
      encoded_conformers: iterable for encoded dataset_pb2.Conformer
      batch_size: insert performance is greatly improved by putting multiple
        insert into one transaction. 10k was a reasonable default from some
        early exploration.
      limit: maximum number of records to insert

    Raises:
      ReadOnlyError: if mode is 'r'
      ValueError: If encoded_conformers is empty. This includes lazy iterables
        (generators, iterators) that produce no records.
    """
    # Function-scope import so this method does not depend on the module's
    # import block.
    import itertools  # pylint: disable=g-import-not-at-top

    if self._read_only:
      raise ReadOnlyError()
    if not encoded_conformers:
      raise ValueError()

    # Generators and iterators are always truthy, so the check above cannot
    # see that they are empty. Peek at the first record so that an empty
    # stream raises the documented ValueError here instead of a TypeError in
    # the per-record timing computed at the end.
    remaining = iter(encoded_conformers)
    try:
      first_record = next(remaining)
    except StopIteration:
      raise ValueError('encoded_conformers is empty') from None

    insert_conformer = (f'INSERT INTO {_CONFORMER_TABLE_NAME} '
                        'VALUES (?, ?, ?)')
    insert_btid = f'INSERT INTO {_BTID_TABLE_NAME} VALUES (?, ?)'
    # The same smiles can appear on many conformers; keep the first row only.
    insert_smiles = (f'INSERT INTO {_SMILES_TABLE_NAME} VALUES (?, ?) '
                     f'ON CONFLICT(smiles) DO NOTHING')

    cur = self._conn.cursor()

    start_time = datetime.datetime.now()

    pending_conformer_args = []
    pending_btid_args = []
    pending_smiles_args = []

    def commit_pending():
      """Writes all queued rows in one transaction and empties the queues."""
      cur.executemany(insert_conformer, pending_conformer_args)
      cur.executemany(insert_btid, pending_btid_args)
      cur.executemany(insert_smiles, pending_smiles_args)
      pending_conformer_args.clear()
      pending_btid_args.clear()
      pending_smiles_args.clear()
      self._conn.commit()

    # At least one record exists, so idx is always >= 1 after the loop.
    idx = 0
    records = itertools.chain([first_record], remaining)
    for idx, encoded_conformer in enumerate(records, 1):
      conformer = dataset_pb2.Conformer.FromString(encoded_conformer)
      # A small efficiency hack: the expanded stoich is only intended for use
      # with topology_detection, so we only put a real value for those so that
      # we don't even have to return the entries we don't want.
      if smu_utils_lib.conformer_eligible_for_topology_detection(conformer):
        expanded_stoich = (
            smu_utils_lib.get_canonical_stoichiometry_with_hydrogens(
                conformer.bond_topologies[0]))
      else:
        expanded_stoich = ''
      pending_conformer_args.append((conformer.conformer_id, expanded_stoich,
                                     snappy.compress(encoded_conformer)))
      for bond_topology in conformer.bond_topologies:
        pending_btid_args.append(
            (bond_topology.bond_topology_id, conformer.conformer_id))
        pending_smiles_args.append(
            (bond_topology.smiles, bond_topology.bond_topology_id))
      if batch_size and idx % batch_size == 0:
        commit_pending()
        elapsed = datetime.datetime.now() - start_time
        logging.info(
            'bulk_insert: committed at index %d, %f s total, %.6f s/record',
            idx, elapsed.total_seconds(),
            elapsed.total_seconds() / idx)

      if limit and idx >= limit:
        break

    # Commit a final time
    commit_pending()
    elapsed = datetime.datetime.now() - start_time
    logging.info('bulk_insert: Total records %d, %f s, %.6f s/record', idx,
                 elapsed.total_seconds(),
                 elapsed.total_seconds() / idx)