Beispiel #1
0
def write_all_dataset_files(inchikey_dict,
                            inchikey_list,
                            base_name,
                            output_dir,
                            max_atoms,
                            max_mass_spec_peak_loc,
                            make_library_array=False):
    """Write a TFRecord together with its companion files for one dataset.

    Args:
      inchikey_dict : Full dictionary keyed by inchikey containing lists of
                      rdkit.Mol objects
      inchikey_list : Inchikeys selecting which entries of inchikey_dict go
                      into this dataset
      base_name : Prefix shared by every file name produced here
      output_dir : Directory that receives all of the output files
      max_atoms : Maximum number of atoms to include for a given molecule
      max_mass_spec_peak_loc : Largest m/z peak to include in a spectra.
      make_library_array : When true, an additional library array path is
                      handed to parse_sdf_utils.write_dicts_to_example
    Returns:
      Nothing; the results are files written under output_dir:
       base_name + TFRECORD_FILENAME_END : the TFRecord file,
       the inchikey listing produced by write_list_of_inchikeys,
       the info file produced by parse_sdf_utils.write_info_file for the
           TFRecord path (one line describing the length of the record).
      And, only if make_library_array is set:
       base_name + NP_LIBRARY_ARRAY_END : see
           parse_sdf_utils.write_dicts_to_example
    """
    record_path = os.path.join(output_dir, base_name + TFRECORD_FILENAME_END)

    molecules = train_test_split_utils.make_mol_list_from_inchikey_dict(
        inchikey_dict, inchikey_list)

    # The library array path is an optional trailing positional argument, so
    # it is appended only when requested rather than duplicating the call.
    example_args = [molecules, record_path, max_atoms, max_mass_spec_peak_loc]
    if make_library_array:
        example_args.append(
            os.path.join(output_dir, base_name + NP_LIBRARY_ARRAY_END))
    parse_sdf_utils.write_dicts_to_example(*example_args)

    write_list_of_inchikeys(inchikey_list, base_name, output_dir)
    parse_sdf_utils.write_info_file(molecules, record_path)
Beispiel #2
0
  def test_save_true_spectra_array(self):
    """Validates the spectra array saved next to the written records.

    Writes the molecules of the short test file as records, asking
    write_dicts_to_example to also save the true library array, writes the
    matching info file, and then delegates the comparison to
    parse_sdf_utils.validate_spectra_array_contents.
    """
    molecules = parse_sdf_utils.get_sdf_to_mol(self.test_file_short)

    # Both outputs live directly inside the test's temporary directory.
    out_dir = self.temp_dir
    record_file = os.path.join(out_dir, 'test_record.gz')
    spectra_array_file = os.path.join(out_dir, 'true_spectra_array.npy')

    parse_sdf_utils.write_dicts_to_example(
        molecules,
        record_file,
        self.hparams.max_atoms,
        self.hparams.max_mass_spec_peak_loc,
        true_library_array_path_name=spectra_array_file)
    parse_sdf_utils.write_info_file(molecules, record_file)

    parse_sdf_utils.validate_spectra_array_contents(
        record_file, self.hparams, spectra_array_file)
Beispiel #3
0
  def test_dict_tfexample(self):
    """Checks that a TFRecord round trip reproduces the input molecule info.

    Writes the molecules of the short test file as tf.Examples to a temporary
    record file, reads that file back through the dataset pipeline in EVAL
    mode, and compares each parsed feature against self.expected_mol_dicts.
    """
    mol_list = parse_sdf_utils.get_sdf_to_mol(self.test_file_short)

    # mkstemp hands back an open descriptor; only the path is needed here, so
    # the descriptor is closed immediately.
    fd, fpath = tempfile.mkstemp(dir=self.temp_dir)
    os.close(fd)

    parse_sdf_utils.write_dicts_to_example(mol_list, fpath,
                                           self.hparams.max_atoms,
                                           self.hparams.max_mass_spec_peak_loc)
    parse_sdf_utils.write_info_file(mol_list, fpath)
    self._validate_info_file(mol_list, fpath)

    dataset = parse_sdf_utils.get_dataset_from_record(
        [fpath], self.hparams, mode=tf.estimator.ModeKeys.EVAL)

    feature_names = [
        fmap_constants.ATOM_WEIGHTS,
        fmap_constants.MOLECULE_WEIGHT,
        fmap_constants.DENSE_MASS_SPEC,
        fmap_constants.INCHIKEY,
        fmap_constants.NAME,
        fmap_constants.MOLECULAR_FORMULA,
        fmap_constants.ADJACENCY_MATRIX,
        fmap_constants.ATOM_IDS,
        fmap_constants.SMILES,
    ]
    label_names = [fmap_constants.INCHIKEY]

    features, _ = parse_sdf_utils.make_features_and_labels(
        dataset, feature_names, label_names, mode=tf.estimator.ModeKeys.EVAL)

    with tf.Session() as sess:
      feature_values = sess.run(features)

      # The first run is expected to consume the whole dataset, so a second
      # run has to hit the end of the input. Anything else is reported as a
      # test failure (not an unrelated error) via self.fail.
      try:
        sess.run(features)
      except tf.errors.OutOfRangeError:
        pass  # expected behavior
      else:
        self.fail('Dataset parsing using batch size of length of the '
                  'dataset resulted in more than one batch.')

    for i, expected in enumerate(self.expected_mol_dicts):
      self.assertAlmostEqual(
          feature_values[fmap_constants.MOLECULE_WEIGHT][i],
          expected[fmap_constants.MOLECULE_WEIGHT])
      # The parsed adjacency matrix is flattened before it is compared with
      # the expected sequence.
      self.assertSequenceAlmostEqual(
          feature_values[fmap_constants.ADJACENCY_MATRIX][i].flatten(),
          expected[fmap_constants.ADJACENCY_MATRIX],
          delta=0.0001)
      self.assertSequenceAlmostEqual(
          feature_values[fmap_constants.DENSE_MASS_SPEC][i],
          expected[fmap_constants.DENSE_MASS_SPEC],
          delta=0.0001)
      self.assertSequenceAlmostEqual(
          feature_values[fmap_constants.ATOM_WEIGHTS][i],
          expected[fmap_constants.ATOM_WEIGHTS],
          delta=0.0001)
      self.assertSequenceAlmostEqual(
          feature_values[fmap_constants.ATOM_IDS][i],
          expected[fmap_constants.ATOM_IDS],
          delta=0.0001)
      # String features are compared against the expected values after they
      # pass through self.encode.
      self.assertEqual(
          feature_values[fmap_constants.NAME][i],
          self.encode(expected[fmap_constants.NAME]))
      self.assertEqual(
          feature_values[fmap_constants.INCHIKEY][i],
          self.encode(expected[fmap_constants.INCHIKEY]))
      self.assertEqual(
          feature_values[fmap_constants.MOLECULAR_FORMULA][i],
          self.encode(expected[fmap_constants.MOLECULAR_FORMULA]))
      # The expected SMILES tokens are stored under the 'parsed_smiles' key.
      self.assertAllEqual(feature_values[fmap_constants.SMILES][i],
                          expected['parsed_smiles'])
      # NOTE(review): SMILES_TOKEN_LIST_LENGTH is not in feature_names above;
      # presumably make_features_and_labels adds it alongside SMILES - confirm.
      self.assertAllEqual(
          feature_values[fmap_constants.SMILES_TOKEN_LIST_LENGTH][i],
          expected[fmap_constants.SMILES_TOKEN_LIST_LENGTH])