Example #1
def write_bond_lengths(records, filename):
  """DoFn for writing the bond lengths.

  We write directly to filename because the entire pcollection
  should have been combined to a single entry.

  Args:
    records: records as expected by
      bond_length_distribution.sparse_dataframe_from_records
    filename: file to write to
  """
  with gfile.GFile(filename, 'w') as f:
    df = bond_length_distribution.sparse_dataframe_from_records(records)
    df.to_csv(f, index=False)
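As a reference for the expected input, here is a minimal sketch of calling write_bond_lengths directly. The record shape ((atom_char_0, atom_char_1, bond_type, length_str), count) is taken from the test in Example #3; the import paths and the record values are assumptions, not from the original file.

# A minimal sketch, not from the original pipeline. The module paths are
# assumed and the record values are invented for illustration.
from tensorflow.io import gfile  # assumed source of gfile in the snippet above

from smu.geometry import bond_length_distribution  # assumed module path

records = [
    (('c', 'c', 1, '1.234'), 10),  # hypothetical count of C-C single bonds
    (('c', 'n', 2, '1.339'), 5),   # hypothetical count of C=N double bonds
]
write_bond_lengths(records, '/tmp/bond_lengths.csv')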
Example #2
  def process(self, conformer, bond_length_records, smiles_id_dict):
    """Per conformer updates.

    Args:
      conformer: dataset_pb2.Conformer
      bond_length_records: tuples to go to
        bond_length_distribution.AllAtomPairLengthDistributions
      smiles_id_dict: dict from SMILES to bond topology id

    Yields:
      Conformer.
    """
    # There is probably a better way to do this.
    # We get the side input with each call to process. We'll assume that it's
    # always the same input, so we set our cache value and never update it.
    # We only do this with bond_length_records because there is a reasonable
    # amount of processing in creating AllAtomPairLengthDistributions.
    # The smiles_id_dict is used directly.
    if not self._cached_bond_lengths:
      self._cached_bond_lengths = (
          bond_length_distribution.AllAtomPairLengthDistributions())
      try:
        self._cached_bond_lengths.add_from_sparse_dataframe(
            bond_length_distribution.sparse_dataframe_from_records(
                bond_length_records), _BOND_LENGTHS_UNBONDED_RIGHT_TAIL_MASS,
            _BOND_LENGTHS_SIG_DIGITS)
      except ValueError as err:
        raise ValueError(
            'Invalid sparse dataframe for conformer {0} orig. ValueError: {1}'
            .format(str(conformer.conformer_id), err)) from err

    conformer = copy.deepcopy(conformer)

    conformer.fate = smu_utils_lib.determine_fate(conformer)

    yield from self._compare_smiles(conformer)

    if (conformer.duplicated_by == 0 and
        conformer.properties.errors.status < 512):
      # The duplicate records do not need topology extraction, and anything
      # with this high an error is pretty messed up, so we won't bother
      # trying to match the topology.
      self._add_alternative_bond_topologies(conformer, smiles_id_dict)
    else:
      beam.metrics.Metrics.counter(_METRICS_NAMESPACE,
                                   'skipped_topology_matches').inc()

    yield conformer
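The comment block above describes a general Beam idiom: a side input is handed to every process() call, so the DoFn converts it once and caches the result on the instance. A stripped-down sketch of just that idiom, with hypothetical names and none of the SMU-specific logic:

import apache_beam as beam


class CachingSideInputDoFn(beam.DoFn):
  """Illustrative only; mirrors the caching pattern in Example #2."""

  def __init__(self):
    super().__init__()
    self._cached = None

  def process(self, element, side_records):
    # side_records arrives with every call; we assume it is identical each
    # time, so the derived structure is built exactly once per instance.
    if self._cached is None:
      # Stand-in for the "reasonable amount of processing" that builds
      # AllAtomPairLengthDistributions in the real code.
      self._cached = dict(side_records)
    yield (element, self._cached.get(element))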
Example #3
  def test_simple(self):
    input_list = [
        (('n', 'o', 1, '3.456'), 30),
        (('c', 'c', 2, '2.345'), 20),
        (('c', 'c', 1, '1.234'), 10),
    ]
    got = bond_length_distribution.sparse_dataframe_from_records(input_list)
    self.assertCountEqual(
        got.columns,
        ['atom_char_0', 'atom_char_1', 'bond_type', 'length_str', 'count'])
    np.testing.assert_array_equal(got['atom_char_0'], ['c', 'c', 'n'])
    np.testing.assert_array_equal(got['atom_char_1'], ['c', 'c', 'o'])
    np.testing.assert_array_equal(got['bond_type'], [1, 2, 1])
    np.testing.assert_array_equal(got['length_str'],
                                  ['1.234', '2.345', '3.456'])
    np.testing.assert_array_equal(got['count'], [10, 20, 30])
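Note how the assertions pin down the output ordering: the input tuples are deliberately out of order, and the test expects the rows back sorted by atom pair, bond type, and length.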
Example #4
  def process(self, molecule, bond_length_records, smiles_id_dict):
    """Per molecule updates.

    Args:
      molecule: dataset_pb2.Molecule
      bond_length_records: tuples to go to
        bond_length_distribution.AllAtomPairLengthDistributions
      smiles_id_dict: dict from SMILES to bond topology id

    Yields:
      Molecule.
    """
    # There is probably a better way to do this.
    # We get the side input with each call to process. We'll assume that it's
    # always the same input, so we set our cache value and never update it.
    # We only do this with bond_length_records because there is a reasonable
    # amount of processing in creating AllAtomPairLengthDistributions.
    # The smiles_id_dict is used directly.
    if not self._cached_bond_lengths:
      self._cached_bond_lengths = (
          bond_length_distribution.AllAtomPairLengthDistributions())
      try:
        self._cached_bond_lengths.add_from_sparse_dataframe(
            bond_length_distribution.sparse_dataframe_from_records(
                bond_length_records),
            bond_length_distribution.STANDARD_UNBONDED_RIGHT_TAIL_MASS,
            bond_length_distribution.STANDARD_SIG_DIGITS)
      except ValueError as err:
        raise ValueError(
            'Invalid sparse dataframe for molecule {0} orig. ValueError: {1}'
            .format(str(molecule.molecule_id), err)) from err

    molecule = copy.deepcopy(molecule)

    molecule.properties.errors.fate = smu_utils_lib.determine_fate(molecule)

    yield from self._compare_smiles(molecule)

    if smu_utils_lib.molecule_eligible_for_topology_detection(molecule):
      self._add_alternative_bond_topologies(molecule, smiles_id_dict)
    else:
      molecule.bond_topologies[
          0].source = dataset_pb2.BondTopology.SOURCE_STARTING
      beam.metrics.Metrics.counter(_METRICS_NAMESPACE,
                                   'skipped_topology_matches').inc()

    yield molecule
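Neither example shows how the side inputs are attached to process(). Below is a self-contained sketch of plausible wiring using standard Beam side-input wrappers; the DoFn, sources, and labels are invented stand-ins, and the real SMU pipeline may differ:

import apache_beam as beam


class _DemoUpdateFn(beam.DoFn):
  """Stand-in for the DoFn in Example #4; only shows the call signature."""

  def process(self, molecule, bond_length_records, smiles_id_dict):
    yield molecule


with beam.Pipeline() as p:
  molecules = p | 'Molecules' >> beam.Create(['placeholder_molecule'])
  length_records = p | 'Lengths' >> beam.Create([(('c', 'c', 1, '1.234'), 10)])
  smiles_ids = p | 'SmilesToId' >> beam.Create([('C', 1)])

  _ = molecules | 'Update' >> beam.ParDo(
      _DemoUpdateFn(),
      beam.pvalue.AsList(length_records),  # becomes bond_length_records
      beam.pvalue.AsDict(smiles_ids))      # becomes smiles_id_dict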