Ejemplo n.º 1
0
def main(_):
    """Build the split datasets and the experiment json files.

    Parses the main-library and replicates SDF files named by FLAGS, splits
    their inchikeys into train/validation/test components, writes the
    per-component datasets below FLAGS.output_master_dir and, for every entry
    of ds_constants.EXPERIMENT_SETUPS_LIST, validates the setup and writes
    its json file.

    Args:
      _: Ignored positional argument.
    """
    output_dir = FLAGS.output_master_dir
    max_atoms = FLAGS.max_atoms
    max_peak_loc = FLAGS.max_mass_spec_peak_loc

    tf.gfile.MkDir(output_dir)

    # The fraction flags are converted element-wise to float before being
    # unpacked into the TrainValTestFractions constructor.
    mainlib_fractions = train_test_split_utils.TrainValTestFractions(
        *map(float, FLAGS.main_train_val_test_fractions))
    replicates_fractions = train_test_split_utils.TrainValTestFractions(
        *map(float, FLAGS.replicates_train_val_test_fractions))

    mainlib_mols = parse_sdf_utils.get_sdf_to_mol(
        FLAGS.main_sdf_name, max_atoms=max_atoms)
    replicates_mols = parse_sdf_utils.get_sdf_to_mol(
        FLAGS.replicates_sdf_name, max_atoms=max_atoms)

    # Break the inchikey lists into train/validation/test splits.
    split_result = make_mainlib_replicates_train_test_split(
        mainlib_mols,
        replicates_mols,
        FLAGS.splitting_type,
        mainlib_fractions,
        replicates_fractions,
        mainlib_maximum_num_molecules_to_use=(
            FLAGS.mainlib_maximum_num_molecules_to_use),
        replicates_maximum_num_molecules_to_use=(
            FLAGS.replicates_maximum_num_molecules_to_use))
    mainlib_inchikeys, replicates_inchikeys, component_inchikeys = split_result

    # Write TFRecords for each component, first from the main library file
    # and then from the replicates file.
    write_mainlib_split_datasets(
        component_inchikeys, mainlib_inchikeys, output_dir, max_atoms,
        max_peak_loc)
    write_replicates_split_datasets(
        component_inchikeys, replicates_inchikeys, output_dir, max_atoms,
        max_peak_loc)

    for setup in ds_constants.EXPERIMENT_SETUPS_LIST:
        # A setup is validated against the components before its json file,
        # which points to local files, is written.
        check_experiment_setup(setup.experiment_setup_dataset_dict,
                               component_inchikeys)
        write_json_for_experiment(setup, output_dir)
def main():
    """Print the get_similarities output for the spectra of one InChIKey.

    Parses a hard-coded SDF path, builds an inchikey-keyed dict from the
    molecules, and prints what get_similarities returns for the spectra
    array of the entry stored under PDACHFOTOFNHBT-UHFFFAOYSA-N.
    """
    replib_mols = parse_sdf_utils.get_sdf_to_mol(
        '/mnt/storage/NIST_zipped/NIST17/replib_mend.sdf')
    mols_by_inchikey = train_test_split_utils.make_inchikey_dict(replib_mols)
    target_entry = mols_by_inchikey['PDACHFOTOFNHBT-UHFFFAOYSA-N']

    similarities = get_similarities(make_spectra_array(target_entry))
    print('distance for spectra in PDACHFOTOFNHBT-UHFFFAOYSA-N', similarities)
Ejemplo n.º 3
0
 def test_make_mol_dict(self):
   """Test generation of molecule dictionaries.

   Builds a dict for every molecule parsed from the short test file and
   compares it, entry by entry, against self.expected_mol_dicts.
   """
   parsed_mols = parse_sdf_utils.get_sdf_to_mol(self.test_file_short)
   actual_dicts = [
       parse_sdf_utils.make_mol_dict(
           mol, self.hparams.max_atoms, self.hparams.max_mass_spec_peak_loc)
       for mol in parsed_mols
   ]

   # Entries compared with assertEqual.
   exact_keys = (
       fmap_constants.NAME,
       fmap_constants.INCHIKEY,
       fmap_constants.SMILES,
       fmap_constants.MOLECULAR_FORMULA,
   )
   # Sequence-valued entries compared with assertSequenceAlmostEqual.
   approximate_sequence_keys = (
       fmap_constants.ATOM_WEIGHTS,
       fmap_constants.ADJACENCY_MATRIX,
       fmap_constants.DENSE_MASS_SPEC,
   )

   for index, expected in enumerate(self.expected_mol_dicts):
     # Index (rather than zip) so a too-short result list still raises.
     actual = actual_dicts[index]
     for key in exact_keys:
       self.assertEqual(expected[key], actual[key])
     self.assertAlmostEqual(
         expected[fmap_constants.MOLECULE_WEIGHT],
         actual[fmap_constants.MOLECULE_WEIGHT])
     for key in approximate_sequence_keys:
       self.assertSequenceAlmostEqual(expected[key], actual[key])
Ejemplo n.º 4
0
 def test_find_largest_number_of_atoms_and_largest_peak(self):
   """Test finding largest number of atoms and largest mass/charge ratio.

   The three values returned for the long test file are compared with
   the constants 28, 35 and 77.
   """
   molecules = parse_sdf_utils.get_sdf_to_mol(self.test_file_long)
   largest_values = (
       parse_sdf_utils.find_largest_number_of_atoms_atomic_number_and_ms_peak(
           molecules))
   max_atoms, max_atomic_number, max_peak_loc = largest_values

   self.assertEqual(max_atoms, 28)
   self.assertEqual(max_atomic_number, 35)
   self.assertEqual(max_peak_loc, 77)
Ejemplo n.º 5
0
 def test_get_sdf_to_mol(self):
   """Check the contents of the molecules parsed by rdkit.

   Verifies the number of molecules read from the long test file and
   inspects the type, SMILES string and mass-spec tag of the first one.
   """
   parsed_mols = parse_sdf_utils.get_sdf_to_mol(
       self.test_file_long, max_atoms=self.hparams.max_atoms)
   self.assertLen(parsed_mols, 12)

   first_mol = parsed_mols[0]
   self.assertIsInstance(first_mol, Chem.rdchem.Mol)
   self.assertIsInstance(Chem.MolToSmiles(first_mol), str)

   isomeric_smiles = Chem.MolToSmiles(first_mol, isomericSmiles=True)
   self.assertEqual(isomeric_smiles, '[H][H]')
   self.assertTrue(first_mol.HasProp(ms_constants.SDF_TAG_MASS_SPEC_PEAKS))
    def setUp(self):
        """Parse the test SDF files and build the inchikey fixtures.

        Creates a temporary directory below the default absltest tmpdir and
        stores, for both the large (test_14_mend.sdf) and the small
        (test_2_mend.sdf) test file: the parsed molecule list, the dict
        returned by make_inchikey_dict, and a list of that dict's keys.
        """
        test_data_directory = test_utils.test_dir('testdata/')
        self.temp_dir = tempfile.mkdtemp(
            dir=absltest.get_default_test_tmpdir())
        test_sdf_file_large = os.path.join(test_data_directory,
                                           'test_14_mend.sdf')
        test_sdf_file_small = os.path.join(test_data_directory,
                                           'test_2_mend.sdf')

        max_atoms = ms_constants.MAX_ATOMS
        self.mol_list_large = parse_sdf_utils.get_sdf_to_mol(
            test_sdf_file_large, max_atoms=max_atoms)
        self.mol_list_small = parse_sdf_utils.get_sdf_to_mol(
            test_sdf_file_small, max_atoms=max_atoms)
        self.inchikey_dict_large = train_test_split_utils.make_inchikey_dict(
            self.mol_list_large)
        self.inchikey_dict_small = train_test_split_utils.make_inchikey_dict(
            self.mol_list_small)
        # On Python 3 dict.keys() returns a view, which can neither be indexed
        # nor shuffled in place. Materialize real lists so these attributes
        # behave like the lists their names promise.
        self.inchikey_list_large = list(self.inchikey_dict_large.keys())
        self.inchikey_list_small = list(self.inchikey_dict_small.keys())
Ejemplo n.º 7
0
def main():
    """Print the get_similarities output for the spectra of the test SDF.

    Parses testdata/test_14_mend.sdf, passes the molecules through
    make_spectra_array and prints what get_similarities returns for the
    resulting array.
    """
    # Alternative kept for reference: restrict the comparison to the entry
    # stored under a single InChIKey.
    #
    # mol_list = parse_sdf_utils.get_sdf_to_mol('testdata/test_14_mend.sdf')
    # inchikey_dict = train_test_split_utils.make_inchikey_dict(mol_list)
    #
    # spectra_for_one_mol = make_spectra_array(inchikey_dict['UFHFLCQGNIYNRP-UHFFFAOYSA-N'])
    # distance_matrix = get_similarities(spectra_for_one_mol)
    # print('distance for spectra in UFHFLCQGNIYNRP-UHFFFAOYSA-N', distance_matrix)

    test_mols = parse_sdf_utils.get_sdf_to_mol('testdata/test_14_mend.sdf')
    similarities = get_similarities(make_spectra_array(test_mols))
    print('distance for spectra in test_14_mend.sdf', similarities)
Ejemplo n.º 8
0
  def test_save_true_spectra_array(self):
    """Checks contents of true spectra array written by write_dicts_to_example.

    Writes a record file, its info file and the spectra array for the short
    test file into self.temp_dir, then hands the written paths to
    validate_spectra_array_contents.
    """
    molecules = parse_sdf_utils.get_sdf_to_mol(self.test_file_short)
    hparams = self.hparams

    output_dir = self.temp_dir
    record_path = os.path.join(output_dir, 'test_record.gz')
    spectra_array_path = os.path.join(output_dir, 'true_spectra_array.npy')

    parse_sdf_utils.write_dicts_to_example(
        molecules,
        record_path,
        hparams.max_atoms,
        hparams.max_mass_spec_peak_loc,
        true_library_array_path_name=spectra_array_path)
    parse_sdf_utils.write_info_file(molecules, record_path)

    parse_sdf_utils.validate_spectra_array_contents(
        record_path, hparams, spectra_array_path)
Ejemplo n.º 9
0
  def test_record_contents(self):
    """Test the contents of the stored record file to ensure features match.

    The features read back from test_14_record.gz are compared, molecule by
    molecule, with the dicts built directly from the long test SDF file.
    """
    molecules = parse_sdf_utils.get_sdf_to_mol(self.test_file_long)
    expected_dicts = [parse_sdf_utils.make_mol_dict(mol) for mol in molecules]

    # Tokenize each SMILES string, remember the unpadded token counts, then
    # pad every token array at its end up to MAX_TOKEN_LIST_LENGTH.
    raw_tokens = [
        feature_utils.tokenize_smiles(
            np.array([expected[fmap_constants.SMILES]]))
        for expected in expected_dicts
    ]
    token_lengths = [np.shape(tokens)[0] for tokens in raw_tokens]
    padded_tokens = [
        np.pad(tokens,
               (0, ms_constants.MAX_TOKEN_LIST_LENGTH - length),
               'constant')
        for tokens, length in zip(raw_tokens, token_lengths)
    ]

    # The batch size equals the number of molecules, so a single batch is
    # expected to hold the whole dataset.
    hparams_main = tf.contrib.training.HParams(
        max_atoms=ms_constants.MAX_ATOMS,
        max_mass_spec_peak_loc=ms_constants.MAX_PEAK_LOC,
        eval_batch_size=len(molecules),
        intensity_power=1.0)

    record_path = os.path.join(self.test_data_directory, 'test_14_record.gz')
    dataset = parse_sdf_utils.get_dataset_from_record(
        [record_path], hparams_main, mode=tf.estimator.ModeKeys.EVAL)

    # One key per (bit length, radius, fingerprint type) combination, in the
    # same nesting order for the requested features and for the checks below.
    fingerprint_keys = [
        ms_constants.CircularFingerprintKey(fp_type, fp_len, rad)
        for fp_len in ms_constants.NUM_CIRCULAR_FP_BITS_LIST
        for rad in ms_constants.CIRCULAR_FP_RADII_LIST
        for fp_type in fmap_constants.FP_TYPE_LIST
    ]
    feature_names = [
        fmap_constants.ATOM_WEIGHTS,
        fmap_constants.MOLECULE_WEIGHT,
        fmap_constants.DENSE_MASS_SPEC,
        fmap_constants.INCHIKEY,
        fmap_constants.NAME,
        fmap_constants.MOLECULAR_FORMULA,
        fmap_constants.ADJACENCY_MATRIX,
        fmap_constants.ATOM_IDS,
        fmap_constants.SMILES,
    ]
    feature_names.extend(str(fp_key) for fp_key in fingerprint_keys)
    label_names = [fmap_constants.INCHIKEY]

    features, _ = parse_sdf_utils.make_features_and_labels(
        dataset, feature_names, label_names, mode=tf.estimator.ModeKeys.EVAL)

    with tf.Session() as sess:
      feature_values = sess.run(features)

      # Check that the dataset was consumed: a second run must hit the end.
      try:
        sess.run(features)
      except tf.errors.OutOfRangeError:  # expected behavior
        pass
      else:
        raise ValueError('Dataset parsing using batch size of length of the'
                         ' dataset resulted in more than one batch.')

    # String features are compared against the encoded expected value.
    string_keys = (
        fmap_constants.NAME,
        fmap_constants.INCHIKEY,
        fmap_constants.MOLECULAR_FORMULA,
    )
    # Sequence features are compared with a tolerance.
    sequence_keys = (
        fmap_constants.DENSE_MASS_SPEC,
        fmap_constants.ATOM_WEIGHTS,
        fmap_constants.ATOM_IDS,
    )

    for i, expected in enumerate(expected_dicts):
      self.assertAlmostEqual(
          feature_values[fmap_constants.MOLECULE_WEIGHT][i],
          expected[fmap_constants.MOLECULE_WEIGHT])
      # The parsed adjacency matrix is flattened before the comparison.
      self.assertSequenceAlmostEqual(
          feature_values[fmap_constants.ADJACENCY_MATRIX][i].flatten(),
          expected[fmap_constants.ADJACENCY_MATRIX],
          delta=0.0001)
      for key in string_keys:
        self.assertEqual(feature_values[key][i], self.encode(expected[key]))
      for key in sequence_keys:
        self.assertSequenceAlmostEqual(
            feature_values[key][i], expected[key], delta=0.0001)
      self.assertAllEqual(feature_values[fmap_constants.SMILES][i],
                          padded_tokens[i])
      self.assertAllEqual(
          feature_values[fmap_constants.SMILES_TOKEN_LIST_LENGTH][i],
          token_lengths[i])
      # Parsed features are keyed by str(fp_key), expected dicts by fp_key.
      for fp_key in fingerprint_keys:
        self.assertSequenceAlmostEqual(
            feature_values[str(fp_key)][i], expected[fp_key], delta=0.0001)
Ejemplo n.º 10
0
  def test_dict_tfexample(self):
    """Check if the contents of tf.Records is the same as input molecule info.

    Writes tf.example as tf.record to disk, then reads from disk and compares
    the parsed features with self.expected_mol_dicts.

    Raises:
      ValueError: If a second read of the dataset yields another batch
        instead of tf.errors.OutOfRangeError.
    """
    mol_list = parse_sdf_utils.get_sdf_to_mol(self.test_file_short)

    # mkstemp returns an open descriptor; only the path is needed here.
    fd, fpath = tempfile.mkstemp(dir=self.temp_dir)
    os.close(fd)

    parse_sdf_utils.write_dicts_to_example(mol_list, fpath,
                                           self.hparams.max_atoms,
                                           self.hparams.max_mass_spec_peak_loc)
    parse_sdf_utils.write_info_file(mol_list, fpath)
    self._validate_info_file(mol_list, fpath)

    dataset = parse_sdf_utils.get_dataset_from_record(
        [fpath], self.hparams, mode=tf.estimator.ModeKeys.EVAL)

    feature_names = [
        fmap_constants.ATOM_WEIGHTS,
        fmap_constants.MOLECULE_WEIGHT,
        fmap_constants.DENSE_MASS_SPEC,
        fmap_constants.INCHIKEY, fmap_constants.NAME,
        fmap_constants.MOLECULAR_FORMULA,
        fmap_constants.ADJACENCY_MATRIX,
        fmap_constants.ATOM_IDS, fmap_constants.SMILES
    ]
    label_names = [fmap_constants.INCHIKEY]

    features, _ = parse_sdf_utils.make_features_and_labels(
        dataset, feature_names, label_names, mode=tf.estimator.ModeKeys.EVAL)

    with tf.Session() as sess:
      feature_values = sess.run(features)

      # Check that the dataset was consumed: a second run must hit the end.
      try:
        sess.run(features)
      except tf.errors.OutOfRangeError:  # expected behavior
        pass
      else:
        # The second literal starts with a space; without it the adjacent
        # literals were concatenated into "thedataset".
        raise ValueError('Dataset parsing using batch size of length of the'
                         ' dataset resulted in more than one batch.')

    # Sequence features are compared with a tolerance.
    sequence_keys = (
        fmap_constants.DENSE_MASS_SPEC,
        fmap_constants.ATOM_WEIGHTS,
        fmap_constants.ATOM_IDS,
    )
    # String features are compared against the encoded expected value.
    string_keys = (
        fmap_constants.NAME,
        fmap_constants.INCHIKEY,
        fmap_constants.MOLECULAR_FORMULA,
    )

    for i, expected in enumerate(self.expected_mol_dicts):
      self.assertAlmostEqual(
          feature_values[fmap_constants.MOLECULE_WEIGHT][i],
          expected[fmap_constants.MOLECULE_WEIGHT])
      # The parsed adjacency matrix is flattened before the comparison.
      self.assertSequenceAlmostEqual(
          feature_values[fmap_constants.ADJACENCY_MATRIX][i].flatten(),
          expected[fmap_constants.ADJACENCY_MATRIX],
          delta=0.0001)
      for key in sequence_keys:
        self.assertSequenceAlmostEqual(
            feature_values[key][i], expected[key], delta=0.0001)
      for key in string_keys:
        self.assertEqual(feature_values[key][i], self.encode(expected[key]))
      self.assertAllEqual(feature_values[fmap_constants.SMILES][i],
                          expected['parsed_smiles'])
      self.assertAllEqual(
          feature_values[fmap_constants.SMILES_TOKEN_LIST_LENGTH][i],
          expected[fmap_constants.SMILES_TOKEN_LIST_LENGTH])
Ejemplo n.º 11
0
 def test_find_inchikey_duplicates(self):
   """Test finding duplicate inchi keys in list of molecules.

   The result computed for the long test file must have exactly one entry.
   """
   molecules = parse_sdf_utils.get_sdf_to_mol(self.test_file_long)
   duplicates = parse_sdf_utils.find_inchikey_duplicates(molecules)
   self.assertLen(duplicates, 1)
Ejemplo n.º 12
0
 def test_filter_mol_list_by_prop(self):
   """Test filtering rdkit.Mol list by contents of tags.

   Filtering the long test file on the CONTRIBUTOR tag with the value
   'Moscow' and wanted=True must leave nine molecules.
   """
   molecules = parse_sdf_utils.get_sdf_to_mol(self.test_file_long)
   moscow_mols = parse_sdf_utils.filter_mol_list_by_prop(
       molecules, 'CONTRIBUTOR', 'Moscow', wanted=True)
   self.assertLen(moscow_mols, 9)