def main(_):
  """Build train/val/test TFRecord datasets from the mainlib and replicates SDFs.

  Reads all configuration from FLAGS, splits the parsed molecule lists into
  train/validation/test components, writes one TFRecord set per component for
  each source library, and emits a JSON description of every experiment setup.
  """
  tf.gfile.MkDir(FLAGS.output_master_dir)

  # FLAGS supply the fractions as strings; convert before building the tuples.
  mainlib_fractions = train_test_split_utils.TrainValTestFractions(
      *tuple(float(frac) for frac in FLAGS.main_train_val_test_fractions))
  replicates_fractions = train_test_split_utils.TrainValTestFractions(
      *tuple(float(frac) for frac in FLAGS.replicates_train_val_test_fractions))

  mainlib_mols = parse_sdf_utils.get_sdf_to_mol(
      FLAGS.main_sdf_name, max_atoms=FLAGS.max_atoms)
  replicates_mols = parse_sdf_utils.get_sdf_to_mol(
      FLAGS.replicates_sdf_name, max_atoms=FLAGS.max_atoms)

  # Breaks the inchikeys lists into train/validation/test splits.
  split_result = make_mainlib_replicates_train_test_split(
      mainlib_mols,
      replicates_mols,
      FLAGS.splitting_type,
      mainlib_fractions,
      replicates_fractions,
      mainlib_maximum_num_molecules_to_use=(
          FLAGS.mainlib_maximum_num_molecules_to_use),
      replicates_maximum_num_molecules_to_use=(
          FLAGS.replicates_maximum_num_molecules_to_use))
  mainlib_inchikey_dict, replicates_inchikey_dict, component_inchikey_dict = (
      split_result)

  # Writes TFRecords for each component using info from the main library file.
  write_mainlib_split_datasets(component_inchikey_dict, mainlib_inchikey_dict,
                               FLAGS.output_master_dir, FLAGS.max_atoms,
                               FLAGS.max_mass_spec_peak_loc)

  # Writes TFRecords for each component using info from the replicates file.
  write_replicates_split_datasets(component_inchikey_dict,
                                  replicates_inchikey_dict,
                                  FLAGS.output_master_dir, FLAGS.max_atoms,
                                  FLAGS.max_mass_spec_peak_loc)

  for experiment_setup in ds_constants.EXPERIMENT_SETUPS_LIST:
    # Check that experiment setup is valid.
    check_experiment_setup(experiment_setup.experiment_setup_dataset_dict,
                           component_inchikey_dict)
    # Write a json for the experiment setups, pointing to local files.
    write_json_for_experiment(experiment_setup, FLAGS.output_master_dir)
def main(sdf_path='/mnt/storage/NIST_zipped/NIST17/replib_mend.sdf',
         inchikey='PDACHFOTOFNHBT-UHFFFAOYSA-N'):
  """Print pairwise similarities between the replicate spectra of one molecule.

  Parses an SDF of replicate spectra, groups the molecules by InChIKey, and
  prints the similarity (distance) matrix for every spectrum recorded for the
  requested molecule.

  Args:
    sdf_path: Path of the SDF file to parse. Defaults to the previously
        hard-coded replicates library location.
    inchikey: InChIKey of the molecule whose replicate spectra are compared.
        Defaults to the previously hard-coded key.
  """
  mol_list = parse_sdf_utils.get_sdf_to_mol(sdf_path)
  inchikey_dict = train_test_split_utils.make_inchikey_dict(mol_list)
  spectra_for_one_mol = make_spectra_array(inchikey_dict[inchikey])
  distance_matrix = get_similarities(spectra_for_one_mol)
  print('distance for spectra in %s' % inchikey, distance_matrix)
def test_make_mol_dict(self):
  """Verify make_mol_dict reproduces the expected molecule dictionaries."""
  mols = parse_sdf_utils.get_sdf_to_mol(self.test_file_short)
  mol_dicts = [
      parse_sdf_utils.make_mol_dict(mol, self.hparams.max_atoms,
                                    self.hparams.max_mass_spec_peak_loc)
      for mol in mols
  ]
  # String-valued keys compared exactly; numeric sequences compared with
  # floating-point tolerance.
  string_keys = (fmap_constants.NAME, fmap_constants.INCHIKEY,
                 fmap_constants.SMILES, fmap_constants.MOLECULAR_FORMULA)
  sequence_keys = (fmap_constants.ATOM_WEIGHTS,
                   fmap_constants.ADJACENCY_MATRIX,
                   fmap_constants.DENSE_MASS_SPEC)
  for i, expected in enumerate(self.expected_mol_dicts):
    actual = mol_dicts[i]
    for key in string_keys:
      self.assertEqual(expected[key], actual[key])
    self.assertAlmostEqual(expected[fmap_constants.MOLECULE_WEIGHT],
                           actual[fmap_constants.MOLECULE_WEIGHT])
    for key in sequence_keys:
      self.assertSequenceAlmostEqual(expected[key], actual[key])
def test_find_largest_number_of_atoms_and_largest_peak(self):
  """Largest atom count, atomic number, and m/z peak location are found."""
  parsed_mols = parse_sdf_utils.get_sdf_to_mol(self.test_file_long)
  max_atoms, max_atomic_number, max_peak_location = (
      parse_sdf_utils.find_largest_number_of_atoms_atomic_number_and_ms_peak(
          parsed_mols))
  self.assertEqual(max_atoms, 28)
  self.assertEqual(max_atomic_number, 35)
  self.assertEqual(max_peak_location, 77)
def test_get_sdf_to_mol(self):
  """Sanity-check the molecules rdkit parses from the long test SDF."""
  parsed_mols = parse_sdf_utils.get_sdf_to_mol(
      self.test_file_long, max_atoms=self.hparams.max_atoms)
  self.assertLen(parsed_mols, 12)
  first_mol = parsed_mols[0]
  self.assertIsInstance(first_mol, Chem.rdchem.Mol)
  self.assertIsInstance(Chem.MolToSmiles(first_mol), str)
  # The first record in the test file is molecular hydrogen.
  self.assertEqual(Chem.MolToSmiles(first_mol, isomericSmiles=True), '[H][H]')
  # Parsed molecules must carry the mass-spec peaks SDF tag.
  self.assertTrue(first_mol.HasProp(ms_constants.SDF_TAG_MASS_SPEC_PEAKS))
def setUp(self):
  """Parse the small and large test SDF files and index molecules by InChIKey."""
  test_data_directory = test_utils.test_dir('testdata/')
  self.temp_dir = tempfile.mkdtemp(dir=absltest.get_default_test_tmpdir())
  large_sdf_path = os.path.join(test_data_directory, 'test_14_mend.sdf')
  small_sdf_path = os.path.join(test_data_directory, 'test_2_mend.sdf')

  atom_limit = ms_constants.MAX_ATOMS
  self.mol_list_large = parse_sdf_utils.get_sdf_to_mol(
      large_sdf_path, max_atoms=atom_limit)
  self.mol_list_small = parse_sdf_utils.get_sdf_to_mol(
      small_sdf_path, max_atoms=atom_limit)

  # Dictionaries keyed by InChIKey, plus their key views for convenience.
  self.inchikey_dict_large = train_test_split_utils.make_inchikey_dict(
      self.mol_list_large)
  self.inchikey_dict_small = train_test_split_utils.make_inchikey_dict(
      self.mol_list_small)
  self.inchikey_list_large = self.inchikey_dict_large.keys()
  self.inchikey_list_small = self.inchikey_dict_small.keys()
def main():
  """Print the pairwise similarity matrix for all spectra in a test SDF file.

  Parses the 14-molecule test SDF, stacks every spectrum into one array, and
  prints the similarity (distance) matrix between all spectra.
  """
  # NOTE(review): removed a stale commented-out variant of this routine that
  # computed similarities for a single InChIKey; see the per-molecule main()
  # elsewhere in the project for that behavior.
  mol_list = parse_sdf_utils.get_sdf_to_mol('testdata/test_14_mend.sdf')
  spectra_array = make_spectra_array(mol_list)
  distance_matrix = get_similarities(spectra_array)
  print('distance for spectra in test_14_mend.sdf', distance_matrix)
def test_save_true_spectra_array(self):
  """Validate the true-spectra array written by write_dicts_to_example."""
  mol_list = parse_sdf_utils.get_sdf_to_mol(self.test_file_short)
  records_path_name = os.path.join(self.temp_dir, 'test_record.gz')
  array_path_name = os.path.join(self.temp_dir, 'true_spectra_array.npy')
  # Write the record file plus the side-channel spectra array, then check
  # that the array contents agree with the records.
  parse_sdf_utils.write_dicts_to_example(
      mol_list,
      records_path_name,
      self.hparams.max_atoms,
      self.hparams.max_mass_spec_peak_loc,
      true_library_array_path_name=array_path_name)
  parse_sdf_utils.write_info_file(mol_list, records_path_name)
  parse_sdf_utils.validate_spectra_array_contents(records_path_name,
                                                  self.hparams,
                                                  array_path_name)
def test_record_contents(self):
  """Test the contents of the stored record file to ensure features match."""
  mol_list = parse_sdf_utils.get_sdf_to_mol(self.test_file_long)
  # NOTE(review): make_mol_dict is called here without the max_atoms /
  # max_mass_spec_peak_loc arguments other tests pass — presumably it has
  # matching defaults; confirm against parse_sdf_utils.
  mol_dicts = [parse_sdf_utils.make_mol_dict(mol) for mol in mol_list]
  # Tokenize each molecule's SMILES string, record the unpadded token counts,
  # then right-pad every token array to the fixed MAX_TOKEN_LIST_LENGTH.
  parsed_smiles_tokens = [
      feature_utils.tokenize_smiles(
          np.array([mol_dict[fmap_constants.SMILES]]))
      for mol_dict in mol_dicts
  ]
  token_lengths = [
      np.shape(token_arr)[0] for token_arr in parsed_smiles_tokens
  ]
  parsed_smiles_tokens = [
      np.pad(token_arr, (0, ms_constants.MAX_TOKEN_LIST_LENGTH - token_length),
             'constant')
      for token_arr, token_length in zip(parsed_smiles_tokens, token_lengths)
  ]
  # eval_batch_size equals the dataset size so a single batch consumes
  # the entire record file.
  hparams_main = tf.contrib.training.HParams(
      max_atoms=ms_constants.MAX_ATOMS,
      max_mass_spec_peak_loc=ms_constants.MAX_PEAK_LOC,
      eval_batch_size=len(mol_list),
      intensity_power=1.0)
  dataset = parse_sdf_utils.get_dataset_from_record(
      [os.path.join(self.test_data_directory, 'test_14_record.gz')],
      hparams_main,
      mode=tf.estimator.ModeKeys.EVAL)
  # Base feature set, extended below with every circular-fingerprint key
  # combination (bit length x radius x fingerprint type).
  feature_names = [
      fmap_constants.ATOM_WEIGHTS, fmap_constants.MOLECULE_WEIGHT,
      fmap_constants.DENSE_MASS_SPEC, fmap_constants.INCHIKEY,
      fmap_constants.NAME, fmap_constants.MOLECULAR_FORMULA,
      fmap_constants.ADJACENCY_MATRIX, fmap_constants.ATOM_IDS,
      fmap_constants.SMILES
  ]
  for fp_len in ms_constants.NUM_CIRCULAR_FP_BITS_LIST:
    for rad in ms_constants.CIRCULAR_FP_RADII_LIST:
      for fp_type in fmap_constants.FP_TYPE_LIST:
        feature_names.append(
            str(ms_constants.CircularFingerprintKey(fp_type, fp_len, rad)))
  label_names = [fmap_constants.INCHIKEY]
  features, _ = parse_sdf_utils.make_features_and_labels(
      dataset, feature_names, label_names, mode=tf.estimator.ModeKeys.EVAL)
  with tf.Session() as sess:
    feature_values = sess.run(features)
    # Check that the dataset was consumed: a second run must raise
    # OutOfRangeError, otherwise the single-batch assumption is broken.
    try:
      sess.run(features)
      raise ValueError('Dataset parsing using batch size of length of the'
                       ' dataset resulted in more than one batch.')
    except tf.errors.OutOfRangeError:  # expected behavior
      pass
  # Compare every stored feature against the dict built directly from the SDF;
  # float-valued features use a small delta, strings are encoded first.
  for i in range(len(mol_list)):
    self.assertAlmostEqual(
        feature_values[fmap_constants.MOLECULE_WEIGHT][i],
        mol_dicts[i][fmap_constants.MOLECULE_WEIGHT])
    self.assertSequenceAlmostEqual(
        feature_values[fmap_constants.ADJACENCY_MATRIX][i].flatten(),
        mol_dicts[i][fmap_constants.ADJACENCY_MATRIX],
        delta=0.0001)
    self.assertEqual(feature_values[fmap_constants.NAME][i],
                     self.encode(mol_dicts[i][fmap_constants.NAME]))
    self.assertEqual(feature_values[fmap_constants.INCHIKEY][i],
                     self.encode(mol_dicts[i][fmap_constants.INCHIKEY]))
    self.assertEqual(
        feature_values[fmap_constants.MOLECULAR_FORMULA][i],
        self.encode(mol_dicts[i][fmap_constants.MOLECULAR_FORMULA]))
    self.assertSequenceAlmostEqual(
        feature_values[fmap_constants.DENSE_MASS_SPEC][i],
        mol_dicts[i][fmap_constants.DENSE_MASS_SPEC],
        delta=0.0001)
    self.assertSequenceAlmostEqual(
        feature_values[fmap_constants.ATOM_WEIGHTS][i],
        mol_dicts[i][fmap_constants.ATOM_WEIGHTS],
        delta=0.0001)
    self.assertSequenceAlmostEqual(
        feature_values[fmap_constants.ATOM_IDS][i],
        mol_dicts[i][fmap_constants.ATOM_IDS],
        delta=0.0001)
    # SMILES tokens were padded above, so exact array equality applies here.
    self.assertAllEqual(feature_values[fmap_constants.SMILES][i],
                        parsed_smiles_tokens[i])
    self.assertAllEqual(
        feature_values[fmap_constants.SMILES_TOKEN_LIST_LENGTH][i],
        token_lengths[i])
    for fp_len in ms_constants.NUM_CIRCULAR_FP_BITS_LIST:
      for rad in ms_constants.CIRCULAR_FP_RADII_LIST:
        for fp_type in fmap_constants.FP_TYPE_LIST:
          fp_key = ms_constants.CircularFingerprintKey(fp_type, fp_len, rad)
          self.assertSequenceAlmostEqual(
              feature_values[str(fp_key)][i], mol_dicts[i][fp_key],
              delta=0.0001)
def test_dict_tfexample(self):
  """Check if the contents of tf.Records is the same as input molecule info.

  Writes tf.example as tf.record to disk, then reads from disk, and compares
  every stored feature against the expected molecule dictionaries.
  """
  mol_list = parse_sdf_utils.get_sdf_to_mol(self.test_file_short)
  fd, fpath = tempfile.mkstemp(dir=self.temp_dir)
  os.close(fd)
  parse_sdf_utils.write_dicts_to_example(mol_list, fpath,
                                         self.hparams.max_atoms,
                                         self.hparams.max_mass_spec_peak_loc)
  parse_sdf_utils.write_info_file(mol_list, fpath)
  self._validate_info_file(mol_list, fpath)
  dataset = parse_sdf_utils.get_dataset_from_record(
      [fpath], self.hparams, mode=tf.estimator.ModeKeys.EVAL)
  feature_names = [
      fmap_constants.ATOM_WEIGHTS, fmap_constants.MOLECULE_WEIGHT,
      fmap_constants.DENSE_MASS_SPEC, fmap_constants.INCHIKEY,
      fmap_constants.NAME, fmap_constants.MOLECULAR_FORMULA,
      fmap_constants.ADJACENCY_MATRIX, fmap_constants.ATOM_IDS,
      fmap_constants.SMILES
  ]
  label_names = [fmap_constants.INCHIKEY]
  features, _ = parse_sdf_utils.make_features_and_labels(
      dataset, feature_names, label_names, mode=tf.estimator.ModeKeys.EVAL)
  with tf.Session() as sess:
    feature_values = sess.run(features)
    # Check that the dataset was consumed: a second run must raise
    # OutOfRangeError since the batch size equals the dataset length.
    try:
      sess.run(features)
      # Fixed: the two string fragments previously concatenated without a
      # separating space ("thedataset"); now matches the sibling test's text.
      raise ValueError('Dataset parsing using batch size of length of the'
                       ' dataset resulted in more than one batch.')
    except tf.errors.OutOfRangeError:  # expected behavior
      pass
  # Float-valued features are compared with a small delta; string features
  # are encoded before the comparison.
  for i in range(len(self.expected_mol_dicts)):
    self.assertAlmostEqual(
        feature_values[fmap_constants.MOLECULE_WEIGHT][i],
        self.expected_mol_dicts[i][fmap_constants.MOLECULE_WEIGHT])
    self.assertSequenceAlmostEqual(
        feature_values[fmap_constants.ADJACENCY_MATRIX][i].flatten(),
        self.expected_mol_dicts[i][fmap_constants.ADJACENCY_MATRIX],
        delta=0.0001)
    self.assertSequenceAlmostEqual(
        feature_values[fmap_constants.DENSE_MASS_SPEC][i],
        self.expected_mol_dicts[i][fmap_constants.DENSE_MASS_SPEC],
        delta=0.0001)
    self.assertSequenceAlmostEqual(
        feature_values[fmap_constants.ATOM_WEIGHTS][i],
        self.expected_mol_dicts[i][fmap_constants.ATOM_WEIGHTS],
        delta=0.0001)
    self.assertSequenceAlmostEqual(
        feature_values[fmap_constants.ATOM_IDS][i],
        self.expected_mol_dicts[i][fmap_constants.ATOM_IDS],
        delta=0.0001)
    self.assertEqual(
        feature_values[fmap_constants.NAME][i],
        self.encode(self.expected_mol_dicts[i][fmap_constants.NAME]))
    self.assertEqual(
        feature_values[fmap_constants.INCHIKEY][i],
        self.encode(
            self.expected_mol_dicts[i][fmap_constants.INCHIKEY]))
    self.assertEqual(
        feature_values[fmap_constants.MOLECULAR_FORMULA][i],
        self.encode(
            self.expected_mol_dicts[i][fmap_constants.MOLECULAR_FORMULA]))
    self.assertAllEqual(feature_values[fmap_constants.SMILES][i],
                        self.expected_mol_dicts[i]['parsed_smiles'])
    self.assertAllEqual(
        feature_values[fmap_constants.SMILES_TOKEN_LIST_LENGTH][i],
        self.expected_mol_dicts[i][fmap_constants.SMILES_TOKEN_LIST_LENGTH])
def test_find_inchikey_duplicates(self):
  """Exactly one duplicated InChIKey exists in the long test SDF."""
  parsed_mols = parse_sdf_utils.get_sdf_to_mol(self.test_file_long)
  duplicates = parse_sdf_utils.find_inchikey_duplicates(parsed_mols)
  self.assertLen(duplicates, 1)
def test_filter_mol_list_by_prop(self):
  """Filtering by the CONTRIBUTOR SDF tag keeps the expected molecules."""
  parsed_mols = parse_sdf_utils.get_sdf_to_mol(self.test_file_long)
  kept_mols = parse_sdf_utils.filter_mol_list_by_prop(
      parsed_mols, 'CONTRIBUTOR', 'Moscow', wanted=True)
  self.assertLen(kept_mols, 9)