def test_calculate_scores_for_metadata(file_names, test_spectra):
    """Test _calculate_features_for_random_forest_model of MS2Library

    A ResultsTable is built from stored MS2Deepscore scores and the first
    test spectrum. The library adds its features to this table and the
    outcome is compared with a pickled reference ResultsTable.
    """
    (sqlite_file_loc, spec2vec_model_file_loc, s2v_pickled_embeddings_file,
     ms2ds_model_file_name, ms2ds_embeddings_file_name,
     spectrum_id_column_name, ms2q_model_file_name) = file_names
    # Directory one level above the directory containing this test file
    base_dir = os.path.dirname(os.path.dirname(__file__))

    library = MS2Library(
        sqlite_file_loc,
        spec2vec_model_file_loc,
        ms2ds_model_file_name,
        s2v_pickled_embeddings_file,
        ms2ds_embeddings_file_name,
        ms2q_model_file_name,
        spectrum_id_column_name=spectrum_id_column_name)

    scores_file = os.path.join(
        base_dir,
        'tests/test_files/test_files_ms2library/expected_ms2ds_scores.pickle')
    ms2ds_scores: pd.DataFrame = load_pickled_file(scores_file)

    # Only the first column of the stored scores is handed to the table
    table_without_features = ResultsTable(
        preselection_cut_off=20,
        ms2deepscores=ms2ds_scores.iloc[:, 0],
        query_spectrum=test_spectra[0],
        sqlite_file_name=sqlite_file_loc)
    table_with_features = \
        library._calculate_features_for_random_forest_model(
            table_without_features)

    reference_file = os.path.join(
        base_dir,
        "tests/test_files/test_files_ms2library/expected_results_table_with_scores.pickle")
    reference_table = load_pickled_file(reference_file)
    table_with_features.assert_results_table_equal(reference_table)
def test_get_matches_info_and_tanimoto():
    """Test get_matches_info_and_tanimoto of DataCollectorForTraining

    The method is run on the pickled training spectra and its two returned
    dataframes are compared with the first two entries of the pickled
    reference result.
    """
    (sqlite_file_loc, spec2vec_model_file_loc, s2v_pickled_embeddings_file,
     ms2ds_model_file_name, ms2ds_embeddings_file_name,
     spectrum_id_column_name, training_spectra_file_name,
     validation_spectra_file_name, tanimoto_scores_file_name) = \
        get_test_file_names()

    data_collector = DataCollectorForTraining(
        sqlite_file_loc,
        spec2vec_model_file_loc,
        ms2ds_model_file_name,
        s2v_pickled_embeddings_file,
        ms2ds_embeddings_file_name,
        training_spectra_file_name,
        validation_spectra_file_name,
        tanimoto_scores_file_name,
        spectrum_id_column_name=spectrum_id_column_name)

    training_spectra = load_pickled_file(training_spectra_file_name)
    result = data_collector.get_matches_info_and_tanimoto(training_spectra)

    # Directory one level above the directory containing this test file
    base_dir = os.path.dirname(os.path.dirname(__file__))
    reference_file = os.path.join(
        base_dir,
        "tests/test_files/test_files_train_ms2query_nn",
        "expected_train_and_val_data.pickle")
    # Only the first two entries of the reference are compared
    expected_result = load_pickled_file(reference_file)[:2]

    assert isinstance(result, tuple), "Expected tuple to be returned"
    assert len(result) == 2, "Expected tuple to be returned"
    for position in (0, 1):
        pd.testing.assert_frame_equal(result[position],
                                      expected_result[position])
def test_get_tanimoto_for_spectrum_ids():
    """Test get_tanimoto_for_spectrum_ids of DataCollectorForTraining

    The tanimoto scores between the first training spectrum and three
    library spectrum ids are compared with hard coded reference values.
    """
    (sqlite_file_loc, spec2vec_model_file_loc, s2v_pickled_embeddings_file,
     ms2ds_model_file_name, ms2ds_embeddings_file_name,
     spectrum_id_column_name, training_spectra_file_name,
     validation_spectra_file_name, tanimoto_scores_file_name) = \
        get_test_file_names()

    data_collector = DataCollectorForTraining(
        sqlite_file_loc,
        spec2vec_model_file_loc,
        ms2ds_model_file_name,
        s2v_pickled_embeddings_file,
        ms2ds_embeddings_file_name,
        training_spectra_file_name,
        validation_spectra_file_name,
        tanimoto_scores_file_name,
        spectrum_id_column_name=spectrum_id_column_name)

    training_spectra = load_pickled_file(training_spectra_file_name)
    query_spectrum = training_spectra[0]
    spectra_ids_list = ['CCMSLIB00000001603',
                        'CCMSLIB00000001652',
                        'CCMSLIB00000001640']

    result = data_collector.get_tanimoto_for_spectrum_ids(query_spectrum,
                                                          spectra_ids_list)

    reference_scores = [0.199695, 0.177669, 0.192504]
    expected_result = pd.DataFrame(reference_scores,
                                   index=spectra_ids_list,
                                   columns=["Tanimoto_score"])
    assert isinstance(result, pd.DataFrame), "Expected a pd.Dataframe"
    # The dtype is deliberately not compared
    pd.testing.assert_frame_equal(result, expected_result, check_dtype=False)
def test_making_sqlite_file(tmp_path):
    """Creates a sqlite file in a temporary folder and checks its content

    The new file is built from the 100 test spectra and their tanimoto
    scores and compared with the stored reference sqlite file.
    """
    # Directory one level above the directory containing this test file
    base_dir = os.path.dirname(os.path.dirname(__file__))
    path_to_general_test_files = os.path.join(
        base_dir, 'tests/test_files/general_test_files')

    def general_test_file(file_name):
        return os.path.join(path_to_general_test_files, file_name)

    # tmp_path is a pytest fixture providing a temporary directory
    new_sqlite_file_name = os.path.join(tmp_path,
                                        "test_spectra_database.sqlite")
    reference_sqlite_file = general_test_file("100_test_spectra.sqlite")
    tanimoto_scores_file_name = general_test_file(
        "100_test_spectra_tanimoto_scores.pickle")

    raw_spectra = load_pickled_file(
        general_test_file("100_test_spectra.pickle"))
    list_of_spectra = minimal_processing_multiple_spectra(raw_spectra)

    # Write the new sqlite file from the processed spectra and the scores
    make_sqlfile_wrapper(new_sqlite_file_name,
                         tanimoto_scores_file_name,
                         list_of_spectra,
                         columns_dict={"precursor_mz": "REAL"},
                         spectrum_id_column_name="spectrumid")

    check_sqlite_files_are_equal(new_sqlite_file_name, reference_sqlite_file)
def test_get_spectra_from_sqlite_all_spectra():
    """Tests if all spectra are returned from a sqlite file

    get_spectra_from_sqlite is called with get_all_spectra=True and an empty
    id list. The returned list is checked for type and length and every
    spectrum in the pickled reference file has to be present in it.
    """
    path_to_test_files_sqlite_dir = os.path.join(
        os.path.split(os.path.dirname(__file__))[0],
        'tests/test_files')
    sqlite_file_name = os.path.join(path_to_test_files_sqlite_dir,
                                    "test_spectra_database.sqlite")

    spectra_list = get_spectra_from_sqlite(
        sqlite_file_name,
        [],
        spectrum_id_storage_name="spectrum_id",
        get_all_spectra=True)

    # Test if the output is of the right type
    assert isinstance(spectra_list, list), "Expected a list"
    assert isinstance(spectra_list[0], Spectrum), \
        "Expected a list with matchms.Spectrum.Spectrum objects"
    # Test if the right number of spectra are returned
    assert len(spectra_list) == 10, "Expected 10 spectra"

    # Test if the correct spectra are loaded
    pickled_file_name = os.path.join(path_to_test_files_sqlite_dir,
                                     "first_10_spectra.pickle")
    expected_spectra = load_pickled_file(pickled_file_name)
    for expected_spectrum in expected_spectra:
        # Use == instead of calling __eq__ directly: a direct call may
        # return NotImplemented, which is truthy and would let the check
        # pass without a real comparison.
        spectrum_returned = any(expected_spectrum == spectrum
                                for spectrum in spectra_list)
        assert spectrum_returned, \
            f"Expected spectrum with spectrumid: " \
            f"{expected_spectrum.get('spectrum_id')} to be returned as well"
def test_get_spectrum_data():
    """Tests if the correct spectrum data is returned from a sqlite file

    Two spectrum ids are requested with get_spectra_from_sqlite and the
    returned spectra are compared with the entries at position 0 and 2 of
    the pickled reference spectra.
    """
    path_to_test_files_sqlite_dir = os.path.join(
        os.path.split(os.path.dirname(__file__))[0],
        'tests/test_files')
    sqlite_file_name = os.path.join(path_to_test_files_sqlite_dir,
                                    "test_spectra_database.sqlite")
    spectra_id_list = ['CCMSLIB00000001547', 'CCMSLIB00000001549']

    spectra_list = get_spectra_from_sqlite(
        sqlite_file_name,
        spectra_id_list,
        spectrum_id_storage_name="spectrum_id")

    # Test if the output is of the right type
    assert isinstance(spectra_list, list), "Expected a list"
    assert isinstance(spectra_list[0], Spectrum), \
        "Expected a list with matchms.Spectrum.Spectrum objects"
    # Test if the right number of spectra are returned
    assert len(spectra_list) == 2, "Expected only 2 spectra"

    # Test if the correct spectra are loaded
    pickled_file_name = os.path.join(path_to_test_files_sqlite_dir,
                                     "first_10_spectra.pickle")
    original_spectra = load_pickled_file(pickled_file_name)
    # Compare with == instead of calling __eq__ directly: a direct call may
    # return NotImplemented, which is truthy and would make the assert pass
    # without a real comparison.
    assert original_spectra[0] == spectra_list[0], \
        "Expected different spectrum to be loaded"
    assert original_spectra[2] == spectra_list[1], \
        "Expected different spectrum to be loaded"
def test_analog_search(file_names, test_spectra):
    """Test analog_search_return_results_tables of MS2Library

    The results tables returned for the test spectra are compared one by one
    with the pickled reference results tables.
    """
    sqlite_file_loc, spec2vec_model_file_loc, s2v_pickled_embeddings_file, \
        ms2ds_model_file_name, ms2ds_embeddings_file_name, \
        spectrum_id_column_name, ms2q_model_file_name = file_names

    test_library = MS2Library(sqlite_file_loc,
                              spec2vec_model_file_loc,
                              ms2ds_model_file_name,
                              s2v_pickled_embeddings_file,
                              ms2ds_embeddings_file_name,
                              ms2q_model_file_name,
                              spectrum_id_column_name=spectrum_id_column_name)

    cutoff = 20
    results = test_library.analog_search_return_results_tables(
        test_spectra, cutoff)

    expected_result = load_pickled_file(
        os.path.join(
            os.path.split(os.path.dirname(__file__))[0],
            "tests/test_files/test_files_ms2library/expected_analog_search_results.pickle"
        ))

    # Without this check additional, unexpected results tables would go
    # unnoticed, since only the expected number of tables is compared.
    assert len(results) == len(expected_result), \
        "Expected the same number of results tables as in the reference"
    for result, expected in zip(results, expected_result):
        result.assert_results_table_equal(expected)
def test_create_train_and_val_data_with_saving(tmp_path):
    """Test create_train_and_val_data with saving the result to a file

    Checks that the file is created and that both the returned result and
    the result stored in the file are equal to the pickled reference data.
    """
    sqlite_file_loc, spec2vec_model_file_loc, s2v_pickled_embeddings_file, \
        ms2ds_model_file_name, ms2ds_embeddings_file_name, \
        spectrum_id_column_name, training_spectra_file_name, \
        validation_spectra_file_name, tanimoto_scores_file_name = \
        get_test_file_names()
    # tmp_path is a pytest fixture providing a temporary directory
    save_file_name = os.path.join(
        tmp_path, "test_training_and_validation_set_and_labels")

    select_data_for_training = DataCollectorForTraining(
        sqlite_file_loc,
        spec2vec_model_file_loc,
        ms2ds_model_file_name,
        s2v_pickled_embeddings_file,
        ms2ds_embeddings_file_name,
        training_spectra_file_name,
        validation_spectra_file_name,
        tanimoto_scores_file_name,
        spectrum_id_column_name=spectrum_id_column_name)

    returned_results = \
        select_data_for_training.create_train_and_val_data(
            save_file_name=save_file_name)
    assert os.path.exists(save_file_name), "Expected file to be created"

    expected_result = load_pickled_file(
        os.path.join(
            os.path.split(os.path.dirname(__file__))[0],
            "tests/test_files/test_files_train_ms2query_nn",
            "expected_train_and_val_data.pickle"))
    result_in_file = load_pickled_file(save_file_name)

    # Test if the right result is returned
    assert isinstance(returned_results, tuple), \
        "Expected a tuple to be returned"
    assert len(returned_results) == 4, "Expected a tuple with length 4"
    for i, result in enumerate(returned_results):
        assert isinstance(result, pd.DataFrame)
        pd.testing.assert_frame_equal(result, expected_result[i])

    # Test if right information is stored in file
    assert isinstance(result_in_file, tuple), \
        "Expected a tuple to be returned"
    assert len(result_in_file) == 4, "Expected a tuple with length 4"
    # Iterate over the content of the file; looping over returned_results
    # here would leave the stored data unchecked.
    for i, result in enumerate(result_in_file):
        assert isinstance(result, pd.DataFrame)
        pd.testing.assert_frame_equal(result, expected_result[i])
def test_store_s2v_embeddings(tmp_path, path_to_general_test_files):
    """Tests store_s2v_embeddings of LibraryFilesCreator

    The embeddings are written to a temporary folder and compared with the
    stored reference embeddings.
    """
    def general_test_file(file_name):
        return os.path.join(path_to_general_test_files, file_name)

    base_file_name = os.path.join(tmp_path, '100_test_spectra')
    library_files_creator = LibraryFilesCreator(
        general_test_file('100_test_spectra.pickle'),
        base_file_name)

    s2v_model_file = general_test_file("100_test_spectra_s2v_model.model")
    library_files_creator.store_s2v_embeddings(s2v_model_file)

    # The embeddings file is expected next to the given base file name
    new_embeddings_file_name = base_file_name + "_s2v_embeddings.pickle"
    assert os.path.isfile(new_embeddings_file_name), \
        "Expected file to be created"

    embeddings = load_pickled_file(new_embeddings_file_name)
    expected_embeddings = load_pickled_file(
        general_test_file("100_test_spectra_s2v_embeddings.pickle"))
    pd.testing.assert_frame_equal(embeddings, expected_embeddings)
def test_create_all_library_files(tmp_path, path_to_general_test_files):
    """Tests create_all_library_files of LibraryFilesCreator

    Checks that the ms2ds embeddings, s2v embeddings and sqlite file are
    created in a temporary folder and that their content matches the stored
    reference files.
    """
    def general_test_file(file_name):
        return os.path.join(path_to_general_test_files, file_name)

    base_file_name = os.path.join(tmp_path, '100_test_spectra')
    library_files_creator = LibraryFilesCreator(
        general_test_file('100_test_spectra.pickle'),
        base_file_name)
    library_files_creator.create_all_library_files(
        general_test_file('100_test_spectra_tanimoto_scores.pickle'),
        general_test_file('ms2ds_siamese_210301_5000_500_400.hdf5'),
        general_test_file('100_test_spectra_s2v_model.model'))

    # Files expected to be written next to the given base file name
    expected_ms2ds_emb_file_name = base_file_name + "_ms2ds_embeddings.pickle"
    expected_s2v_emb_file_name = base_file_name + "_s2v_embeddings.pickle"
    expected_sqlite_file_name = base_file_name + ".sqlite"
    assert os.path.isfile(expected_ms2ds_emb_file_name), \
        "Expected ms2ds embeddings file to be created"
    assert os.path.isfile(expected_s2v_emb_file_name), \
        "Expected s2v file to be created"
    assert os.path.isfile(expected_sqlite_file_name), \
        "Expected sqlite file to be created"

    # Test if correct embeddings are stored
    ms2ds_embeddings = load_pickled_file(expected_ms2ds_emb_file_name)
    s2v_embeddings = load_pickled_file(expected_s2v_emb_file_name)
    expected_s2v_embeddings = load_pickled_file(
        general_test_file("100_test_spectra_s2v_embeddings.pickle"))
    expected_ms2ds_embeddings = load_pickled_file(
        general_test_file("100_test_spectra_ms2ds_embeddings.pickle"))
    # Embeddings are compared with an absolute tolerance instead of exactly
    comparison_settings = {"check_exact": False, "atol": 1e-5}
    pd.testing.assert_frame_equal(ms2ds_embeddings,
                                  expected_ms2ds_embeddings,
                                  **comparison_settings)
    pd.testing.assert_frame_equal(s2v_embeddings,
                                  expected_s2v_embeddings,
                                  **comparison_settings)

    # Check if sqlite file is stored correctly
    reference_sqlite_file = general_test_file("100_test_spectra.sqlite")
    check_sqlite_files_are_equal(expected_sqlite_file_name,
                                 reference_sqlite_file)
def test_get_ms2query_model_prediction_single_spectrum():
    """Tests get_ms2query_model_prediction_single_spectrum

    A pickled results table and a stored ms2query model are loaded, the
    prediction is made and the outcome is compared with the first pickled
    analog search results table.
    """
    # Directory one level above the directory containing this test file
    base_dir = os.path.dirname(os.path.dirname(__file__))

    results_table_file = os.path.join(
        base_dir,
        "tests/test_files/test_files_ms2library/expected_results_table_with_scores.pickle")
    results_table = load_pickled_file(results_table_file)

    ms2q_model_file_name = os.path.join(
        base_dir,
        'tests/test_files/general_test_files/test_ms2q_rf_model.pickle')
    ms2query_nn_model = load_ms2query_model(ms2q_model_file_name)

    results = get_ms2query_model_prediction_single_spectrum(
        results_table, ms2query_nn_model)

    reference_file = os.path.join(
        base_dir,
        "tests/test_files/test_files_ms2library/expected_analog_search_results.pickle")
    # Only the first reference results table is used
    expected_result = load_pickled_file(reference_file)[0]
    expected_result.assert_results_table_equal(results)
def test_store_ms2ds_embeddings(tmp_path, path_to_general_test_files):
    """Tests store_ms2ds_embeddings of LibraryFilesCreator

    The embeddings are written to a temporary folder and compared with the
    stored reference embeddings, using an absolute tolerance.
    """
    def general_test_file(file_name):
        return os.path.join(path_to_general_test_files, file_name)

    base_file_name = os.path.join(tmp_path, '100_test_spectra')
    library_files_creator = LibraryFilesCreator(
        general_test_file('100_test_spectra.pickle'),
        base_file_name)

    ms2ds_model_file = general_test_file(
        'ms2ds_siamese_210301_5000_500_400.hdf5')
    library_files_creator.store_ms2ds_embeddings(ms2ds_model_file)

    # The embeddings file is expected next to the given base file name
    new_embeddings_file_name = base_file_name + "_ms2ds_embeddings.pickle"
    assert os.path.isfile(new_embeddings_file_name), \
        "Expected file to be created"

    # Test if correct embeddings are stored
    embeddings = load_pickled_file(new_embeddings_file_name)
    expected_embeddings = load_pickled_file(
        general_test_file("100_test_spectra_ms2ds_embeddings.pickle"))
    pd.testing.assert_frame_equal(embeddings,
                                  expected_embeddings,
                                  check_exact=False,
                                  atol=1e-5)
def test_get_tanimoto_scores():
    """Tests if the correct tanimoto scores are retrieved from sqlite file

    The scores between three inchikeys are read from the sqlite file and
    compared with the matching selection of the pickled reference scores.
    """
    # Directory one level above the directory containing this test file
    base_dir = os.path.dirname(os.path.dirname(__file__))
    path_to_test_files_sqlite_dir = os.path.join(base_dir,
                                                 'tests/test_files')
    sqlite_file_name = os.path.join(path_to_test_files_sqlite_dir,
                                    "test_spectra_database.sqlite")
    reference_file_name = os.path.join(path_to_test_files_sqlite_dir,
                                       "test_tanimoto_scores.pickle")

    test_inchikeys = ['TXZUPPVCNIMVHW',
                      'WIOKWEJDRXNVSH',
                      'VBFKEZGCUWHGSK']
    tanimoto_score_dataframe = get_tanimoto_score_for_inchikey14s(
        test_inchikeys, test_inchikeys, sqlite_file_name)

    reference_tanimoto_scores = load_pickled_file(reference_file_name)
    # Select the rows and columns belonging to the test inchikeys
    selected_rows = reference_tanimoto_scores.loc[test_inchikeys]
    expected_result = selected_rows[test_inchikeys]

    # check_like=True makes the comparison ignore the order of index and
    # columns
    pd.testing.assert_frame_equal(tanimoto_score_dataframe,
                                  expected_result,
                                  check_like=True)
def test_add_unknown_charges_to_spectra():
    """Tests add_unknown_charges_to_spectra

    The charges of the test spectra are set to None, 1, -1 or 2. After
    calling add_unknown_charges_to_spectra the spectra without a charge are
    expected to have charge 1 and all other charges to be unchanged.
    """
    spectra_file = os.path.join(
        os.path.dirname(os.path.dirname(__file__)),
        'tests/test_files/general_test_files/100_test_spectra.pickle')
    spectra = load_pickled_file(spectra_file)

    # Set charges to predefined values
    charge_per_selection = (
        (slice(None, 10), None),
        (slice(10, 20), 1),
        (slice(20, 30), -1),
        (slice(30, None), 2),
    )
    for selection, charge in charge_per_selection:
        for spectrum in spectra[selection]:
            spectrum.set("charge", charge)

    spectra_with_charge = add_unknown_charges_to_spectra(spectra)

    # Test if charges are set correctly
    expected_charge_per_selection = (
        (slice(None, 20), 1, "The charge is expected to be 1"),
        (slice(20, 30), -1, "The charge is expected to be -1"),
        (slice(30, None), 2, "The charge is expected to be 2"),
    )
    for selection, expected_charge, message in expected_charge_per_selection:
        for spectrum in spectra_with_charge[selection]:
            assert spectrum.get("charge") == expected_charge, message
def _load_spectra_and_minimal_processing(
        self,
        pickled_spectra_file_name: str) -> List[Spectrum]:
    """Loads spectra from pickled file and does minimal processing

    Args:
    ------
    pickled_spectra_file_name:
        The file name of a pickled file containing a list of spectra.

    Returns the spectra as returned by minimal_processing_multiple_spectra.
    An AssertionError is raised if the first spectrum has no value stored
    under the key given in self.settings["spectrum_id_column_name"].
    """
    # Loads the spectra from a pickled file
    list_of_spectra = load_pickled_file(pickled_spectra_file_name)
    # Only the first spectrum is checked for the spectrum id key
    spectrum_id_column_name = self.settings["spectrum_id_column_name"]
    assert list_of_spectra[0].get(spectrum_id_column_name), \
        f"Expected spectra to have '" \
        f"{spectrum_id_column_name}' in " \
        f"metadata, to solve specify the correct spectrum_id_column_name"
    # Does normalization and filtering of spectra
    list_of_spectra = \
        minimal_processing_multiple_spectra(
            list_of_spectra,
            progress_bar=self.settings["progress_bars"])
    return list_of_spectra
def test_get_all_ms2ds_scores(file_names, test_spectra):
    """Test _get_all_ms2ds_scores method of MS2Library

    The scores calculated for the test spectra are compared with the pickled
    reference dataframe.
    """
    sqlite_file_loc, spec2vec_model_file_loc, s2v_pickled_embeddings_file, \
        ms2ds_model_file_name, ms2ds_embeddings_file_name, \
        spectrum_id_column_name, ms2q_model_file_name = file_names

    test_library = MS2Library(sqlite_file_loc,
                              spec2vec_model_file_loc,
                              ms2ds_model_file_name,
                              s2v_pickled_embeddings_file,
                              ms2ds_embeddings_file_name,
                              ms2q_model_file_name,
                              spectrum_id_column_name=spectrum_id_column_name)

    result = test_library._get_all_ms2ds_scores(test_spectra)

    expected_result: pd.DataFrame = load_pickled_file(
        os.path.join(
            os.path.split(os.path.dirname(__file__))[0],
            'tests/test_files/test_files_ms2library/expected_ms2ds_scores.pickle'
        ))
    # The message now names the type that is actually checked
    assert isinstance(result, pd.DataFrame), "Expected a pd.DataFrame"
    assert_frame_equal(result, expected_result)
def test_export_to_dataframe_with_additional_columns(dummy_data, tmp_path):
    """Tests export_to_dataframe of ResultsTable with additional columns

    A pickled results table is pointed to the test sqlite file and a
    classifier csv file created in a temporary folder. The exported
    dataframe is checked for its column names, number of rows and the
    content of the first row.
    """
    create_test_classifier_csv_file(tmp_path)
    # Directory one level above the directory containing this test file
    base_dir = os.path.dirname(os.path.dirname(__file__))

    results_tables_file = os.path.join(
        base_dir,
        'tests/test_files/test_files_ms2library/expected_analog_search_results.pickle')
    test_table: ResultsTable = load_pickled_file(results_tables_file)[0]
    test_table.sqlite_file_name = os.path.join(
        base_dir,
        "tests/test_files/general_test_files/100_test_spectra.sqlite")
    test_table.classifier_csv_file_name = os.path.join(tmp_path,
                                                       "test_csv_file")

    returned_dataframe = test_table.export_to_dataframe(
        5,
        additional_metadata_columns=["charge"],
        additional_ms2query_score_columns=["s2v_score", "ms2ds_score"])

    assert isinstance(returned_dataframe, pd.DataFrame)
    expected_columns = column_names_for_output(
        True, True, ["charge"], ["s2v_score", "ms2ds_score"])
    assert list(returned_dataframe.columns) == expected_columns
    # Check if one of the classifiers is filled in
    assert returned_dataframe["npc_pathway_results"][0] == \
        "Amino acids and Peptides"
    assert len(returned_dataframe.index) == 5

    # Test if first row is correct, numeric columns first
    numeric_positions = [0, 1, 2, 3, 7, 8, 9]
    expected_numbers = [0.56453, 33.25000, 907.0, 940.250, 1,
                        0.99965, 0.92317]
    np.testing.assert_array_almost_equal(
        list(returned_dataframe.iloc[0, numeric_positions]),
        expected_numbers,
        5)

    # The remaining columns of the first row contain text
    text_positions = [4, 5, 6, 10, 11, 12, 13]
    expected_texts = [
        'KNGPFNUOXXLKCN',
        'CCMSLIB00000001548',
        'Hoiamide B',
        'CCC[C@@H](C)[C@@H]([C@H](C)[C@@H]1[C@H]([C@H](Cc2nc(cs2)C3=N[C@](CS3)(C4=N[C@](CS4)(C(=O)N[C@H]([C@H]([C@H](C(=O)O[C@H](C(=O)N[C@H](C(=O)O1)[C@@H](C)O)[C@@H](C)CC)C)O)[C@@H](C)CC)C)C)OC)C)O',
        'Organic compounds',
        'Organic acids and derivatives',
        'Peptidomimetics',
    ]
    returned_texts = list(returned_dataframe.iloc[0, text_positions])
    assert np.all(returned_texts == expected_texts)
def __init__(self,
             sqlite_file_location: str,
             s2v_model_file_name: str,
             ms2ds_model_file_name: str,
             pickled_s2v_embeddings_file_name: str,
             pickled_ms2ds_embeddings_file_name: str,
             training_spectra_file: str,
             validation_spectra_file: str,
             tanimoto_scores_df_file_name: str,
             preselection_cut_off: int = 2000,
             **settings):
    """Sets up the library and loads the training and validation data

    The library files are passed on to the parent class, with None as
    ms2query model file name. The tanimoto scores and the minimally
    processed training and validation spectra are stored on the instance.

    Parameters
    ----------
    sqlite_file_location:
        Location of the sqlite file. The file is expected to contain the 3
        tables tanimoto_scores, inchikeys and spectra_data.
    s2v_model_file_name:
        File location of a spec2vec model. Two more files with the same
        name, but with the extensions .trainables.syn1neg.npy and
        .wv.vectors.npy, are expected in the same folder.
    ms2ds_model_file_name:
        File location of a trained ms2ds model.
    pickled_s2v_embeddings_file_name:
        File location of a pickled pd.Dataframe with Spec2Vec embeddings,
        indexed by spectrum id.
    pickled_ms2ds_embeddings_file_name:
        File location of a pickled pd.Dataframe with ms2ds embeddings,
        indexed by spectrum id.
    training_spectra_file:
        Pickled file with training spectra.
    validation_spectra_file:
        Pickled file with validation spectra.
    tanimoto_scores_df_file_name:
        Pickled file with a dataframe holding the tanimoto scores between
        all inchikeys. The tanimoto scores in the sqlite file cannot be
        used, since these do not contain the inchikeys of the training
        spectra.
    preselection_cut_off:
        Stored as self.preselection_cut_off. Default = 2000


    **settings:
        Predefined settings that can be changed with additional parameters.
    spectrum_id_column_name:
        Name of the column or dictionary key under which the spectrum id is
        stored. Default = "spectrumid"
    cosine_score_tolerance:
        Tolerance used for calculating the cosine score. Two peaks that
        fall within this tolerance are considered a match. Default = 0.1
    base_nr_mass_similarity:
        The base nr used for normalizing the mass similarity. Default = 0.8
    max_precursor_mz:
        The precursor m/z is normalized by dividing it by this value.
        Default = 13428.370894192036
    progress_bars:
        If True progress bars will be shown. Default = True"""
    # pylint: disable=too-many-arguments
    # No ms2query model file is passed to the parent class
    super().__init__(sqlite_file_location,
                     s2v_model_file_name,
                     ms2ds_model_file_name,
                     pickled_s2v_embeddings_file_name,
                     pickled_ms2ds_embeddings_file_name,
                     None,
                     **settings)

    self.tanimoto_scores: pd.DataFrame = load_pickled_file(
        tanimoto_scores_df_file_name)

    # Both spectra sets get the same minimal processing
    raw_training_spectra = load_pickled_file(training_spectra_file)
    self.training_spectra = minimal_processing_multiple_spectra(
        raw_training_spectra)
    raw_validation_spectra = load_pickled_file(validation_spectra_file)
    self.validation_spectra = minimal_processing_multiple_spectra(
        raw_validation_spectra)

    self.preselection_cut_off = preselection_cut_off
def add_tanimoto_scores_to_sqlite(
        sqlite_file_name: str,
        tanimoto_scores_pickled_dataframe_file: str,
        list_of_spectra: List[Spectrum],
        temporary_tanimoto_file_name: str = "temporary_tanimoto_scores",
        progress_bars: bool = True):
    """Adds tanimoto scores and inchikey14s to sqlite table

    sqlite_file_name:
        Name of sqlite_file that should be created, if it already exists the
        tables are added. If the tables in this sqlite file already exist,
        they will be overwritten.
    tanimoto_scores_pickled_dataframe_file:
        A pickled file with tanimoto scores. The column names and indexes
        are inchikey14s.
    list_of_spectra:
        List of spectrum objects
    temporary_tanimoto_file_name:
        The file name of a temporary .npy file that is created to memory
        efficiently read out the tanimoto scores. The file is created in the
        current working directory and is deleted after finishing, also when
        adding the scores to the sqlite file fails.
    progress_bars:
        If True progress bars will show the progress of the different steps
        in the process.

    An AssertionError is raised if the temporary file already exists or if
    the selected tanimoto scores contain NaN values.
    """
    # todo instead of creating a real file and than deleting make a temporary
    #  file, not yet implemented since numpy memmap is not able to access the
    #  file in this case.
    # The full file name, including extension, is built once. Since it always
    # ends with .npy, np.save writes to exactly this file name.
    temporary_npy_file_name = os.path.join(
        os.getcwd(), temporary_tanimoto_file_name) + ".npy"
    assert not os.path.exists(temporary_npy_file_name), \
        "A file already exists with the temporary file name you want to create"

    tanimoto_df = load_pickled_file(tanimoto_scores_pickled_dataframe_file)

    # Get spectra belonging to each inchikey14
    spectra_belonging_to_inchikey14 = \
        get_spectra_belonging_to_inchikey14(list_of_spectra)

    inchikeys_with_spectra = sorted(spectra_belonging_to_inchikey14)
    # Remove all inchikeys that do not have any matching spectra
    filtered_tanimoto_df = tanimoto_df.loc[inchikeys_with_spectra][
        inchikeys_with_spectra]
    inchikeys_order = filtered_tanimoto_df.index

    # Get closest related inchikey14s for each inchikey14
    closest_related_inchikey14s = \
        get_closest_related_inchikey14s(filtered_tanimoto_df, inchikeys_order)

    # Creates a sqlite table containing all tanimoto scores
    initialize_tanimoto_score_table(sqlite_file_name)

    assert not filtered_tanimoto_df.isnull().values.any(), \
        "No NaN values were expected in tanimoto scores"

    if progress_bars:
        print("Saving tanimoto scores to temporary .npy file")
    try:
        np.save(temporary_npy_file_name, filtered_tanimoto_df.to_numpy())
        add_tanimoto_scores_to_sqlite_table(sqlite_file_name,
                                            temporary_npy_file_name,
                                            progress_bar=progress_bars)
    finally:
        # Always clean up the temporary file; a left over file would make
        # the next call fail on the assertion above.
        if os.path.exists(temporary_npy_file_name):
            os.remove(temporary_npy_file_name)

    # Creates a table containing the identifiers belonging to each inchikey14
    # These identifiers correspond to the identifiers in tanimoto_scores
    create_inchikey_sqlite_table(sqlite_file_name,
                                 inchikeys_order,
                                 spectra_belonging_to_inchikey14,
                                 closest_related_inchikey14s,
                                 progress_bar=progress_bars)
def __init__(self,
             sqlite_file_name: str,
             s2v_model_file_name: str,
             ms2ds_model_file_name: str,
             pickled_s2v_embeddings_file_name: str,
             pickled_ms2ds_embeddings_file_name: str,
             ms2query_model_file_name: Union[str, None],
             classifier_csv_file_name: Union[str, None] = None,
             **settings):
    """Loads the models, embeddings and sqlite information into memory

    Parameters
    ----------
    sqlite_file_name:
        The location at which the sqlite_file_is_stored. The file is
        expected to have 3 tables: tanimoto_scores, inchikeys and
        spectra_data.
    s2v_model_file_name:
        File location of a spec2vec model. In addition two more files in
        the same folder are expected with the same name but with extensions
        .trainables.syn1neg.npy and .wv.vectors.npy.
    ms2ds_model_file_name:
        File location of a trained ms2ds model.
    pickled_s2v_embeddings_file_name:
        File location of a pickled file with Spec2Vec embeddings in a
        pd.Dataframe with as index the spectrum id.
    pickled_ms2ds_embeddings_file_name:
        File location of a pickled file with ms2ds embeddings in a
        pd.Dataframe with as index the spectrum id.
    ms2query_model_file_name:
        File location of the ms2query model, loaded with
        load_ms2query_model. If None no model is loaded and
        self.ms2query_model is set to None.
    classifier_csv_file_name:
        Csv file location containing classifier annotations per inchikey

    **settings:
        As additional parameters predefined settings can be changed.
    spectrum_id_column_name:
        The name of the column or key in dictionaries under which the
        spectrum id is stored. Default = "spectrumid"
    progress_bars:
        If True progress bars will be shown of all methods. Default = True

    An AssertionError is raised if the sqlite file does not exist or if the
    dimension of the ms2ds embeddings does not match the ms2ds model.
    """
    # pylint: disable=too-many-arguments
    # Change default settings to values given in **settings
    self.settings = self._set_settings(settings)

    # Load models and set file locations
    self.classifier_file_name = classifier_csv_file_name
    assert os.path.isfile(sqlite_file_name), \
        f"The given sqlite file does not exist: {sqlite_file_name}"
    self.sqlite_file_name = sqlite_file_name

    # The attribute is always defined, so that it can be inspected without
    # an AttributeError when no model file was given.
    self.ms2query_model = None
    if ms2query_model_file_name is not None:
        self.ms2query_model = load_ms2query_model(ms2query_model_file_name)

    self.s2v_model = Word2Vec.load(s2v_model_file_name)
    self.ms2ds_model = load_ms2ds_model(ms2ds_model_file_name)

    # loads the library embeddings into memory
    self.s2v_embeddings: pd.DataFrame = load_pickled_file(
        pickled_s2v_embeddings_file_name)
    self.ms2ds_embeddings: pd.DataFrame = load_pickled_file(
        pickled_ms2ds_embeddings_file_name)

    assert self.ms2ds_model.base.output_shape[1] == \
        self.ms2ds_embeddings.shape[1], \
        "Dimension of pre-computed MS2DeepScore embeddings does not fit given model."

    # load precursor mz's
    self.precursors_library = get_precursor_mz(
        self.sqlite_file_name,
        self.settings["spectrum_id_column_name"])

    # Load inchikey information into memory
    self.spectra_of_inchikey14s, \
        self.closely_related_inchikey14s = \
        get_inchikey_information(self.sqlite_file_name)
    # Reverse lookup from spectrum id to the inchikey14 it belongs to
    self.inchikey14s_of_spectra = {
        spectrum_id: inchikey
        for inchikey, list_of_spectrum_ids
        in self.spectra_of_inchikey14s.items()
        for spectrum_id in list_of_spectrum_ids}
processing pipeline.""" s = default_filters(s) s = add_precursor_mz(s) s = normalize_intensities(s) s = reduce_to_number_of_peaks(s, n_required=5, ratio_desired=0.5, n_max=500) s = select_by_mz(s, mz_from=0, mz_to=1000) s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0) s = require_minimum_number_of_peaks(s, n_required=5) return s # path_root = os.path.dirname(os.getcwd()) # path_data = os.path.join(os.path.dirname(path_root), "data/gnps_15_12_2021/") path_data = "C:\\HSD\\OneDrive - Hochschule Düsseldorf\\Data\\ms2query" training_spectra_annotated = load_pickled_file(os.path.join(path_data, "GNPS_15_12_2021_pos_train.pickle")) training_spectra_not_annotated = load_pickled_file(os.path.join(path_data, "ALL_GNPS_15_12_2021_positive_not_annotated.pickle")) all_spectra = training_spectra_annotated + training_spectra_not_annotated # Load data from pickled file and apply filters cleaned_spectra = [spectrum_processing(s) for s in all_spectra] # Omit spectrums that didn't qualify for analysis cleaned_spectra = [s for s in cleaned_spectra if s is not None] # Create spectrum documents reference_documents = [SpectrumDocument(s, n_decimals=2) for s in cleaned_spectra] model_file = os.path.join(path_data, "trained_models", "spec2vec_model_GNPS_15_12_2021.model") model = train_new_word2vec_model(reference_documents,