Beispiel #1
0
def test_calculate_scores_for_metadata(file_names, test_spectra):
    """Test _calculate_features_for_random_forest_model of MS2Library

    A ResultsTable is built from the first column of the pickled expected
    MS2Deepscore scores and the first test spectrum. After the features are
    calculated the table is compared with the pickled expected results table.
    """
    (sqlite_file_loc, spec2vec_model_file_loc, s2v_pickled_embeddings_file,
     ms2ds_model_file_name, ms2ds_embeddings_file_name,
     spectrum_id_column_name, ms2q_model_file_name) = file_names

    library = MS2Library(sqlite_file_loc,
                         spec2vec_model_file_loc,
                         ms2ds_model_file_name,
                         s2v_pickled_embeddings_file,
                         ms2ds_embeddings_file_name,
                         ms2q_model_file_name,
                         spectrum_id_column_name=spectrum_id_column_name)

    # Parent directory of the directory that contains this test module
    root_dir = os.path.split(os.path.dirname(__file__))[0]

    scores_file = os.path.join(
        root_dir,
        'tests/test_files/test_files_ms2library/expected_ms2ds_scores.pickle')
    all_ms2ds_scores: pd.DataFrame = load_pickled_file(scores_file)
    table_without_features = ResultsTable(
        preselection_cut_off=20,
        ms2deepscores=all_ms2ds_scores.iloc[:, 0],
        query_spectrum=test_spectra[0],
        sqlite_file_name=sqlite_file_loc)

    table_with_features = \
        library._calculate_features_for_random_forest_model(
            table_without_features)

    expected_file = os.path.join(
        root_dir,
        "tests/test_files/test_files_ms2library/expected_results_table_with_scores.pickle")
    expected_table = load_pickled_file(expected_file)

    table_with_features.assert_results_table_equal(expected_table)
Beispiel #2
0
def test_get_matches_info_and_tanimoto():
    """Test get_matches_info_and_tanimoto of DataCollectorForTraining

    The training spectra are used as query spectra. The returned tuple of two
    dataframes is compared with the first two entries of the pickled expected
    training and validation data.
    """
    (sqlite_file_loc, spec2vec_model_file_loc, s2v_pickled_embeddings_file,
     ms2ds_model_file_name, ms2ds_embeddings_file_name,
     spectrum_id_column_name, training_spectra_file_name,
     validation_spectra_file_name, tanimoto_scores_file_name) = \
        get_test_file_names()

    collector = DataCollectorForTraining(
        sqlite_file_loc,
        spec2vec_model_file_loc,
        ms2ds_model_file_name,
        s2v_pickled_embeddings_file,
        ms2ds_embeddings_file_name,
        training_spectra_file_name,
        validation_spectra_file_name,
        tanimoto_scores_file_name,
        spectrum_id_column_name=spectrum_id_column_name)

    training_spectra = load_pickled_file(training_spectra_file_name)
    returned = collector.get_matches_info_and_tanimoto(training_spectra)

    expected_file = os.path.join(
        os.path.split(os.path.dirname(__file__))[0],
        "tests/test_files/test_files_train_ms2query_nn",
        "expected_train_and_val_data.pickle")
    # Only the first two entries of the stored data are compared
    expected_pair = load_pickled_file(expected_file)[:2]

    assert isinstance(returned, tuple), "Expected tuple to be returned"
    assert len(returned) == 2, "Expected tuple to be returned"
    for position in (0, 1):
        pd.testing.assert_frame_equal(returned[position],
                                      expected_pair[position])
Beispiel #3
0
def test_get_tanimoto_for_spectrum_ids():
    """Test get_tanimoto_for_spectrum_ids of DataCollectorForTraining

    The first training spectrum is used as query spectrum. The result is
    expected to be a dataframe with a "Tanimoto_score" column that has the
    requested spectrum ids as index.
    """
    (sqlite_file_loc, spec2vec_model_file_loc, s2v_pickled_embeddings_file,
     ms2ds_model_file_name, ms2ds_embeddings_file_name,
     spectrum_id_column_name, training_spectra_file_name,
     validation_spectra_file_name, tanimoto_scores_file_name) = \
        get_test_file_names()

    collector = DataCollectorForTraining(
        sqlite_file_loc,
        spec2vec_model_file_loc,
        ms2ds_model_file_name,
        s2v_pickled_embeddings_file,
        ms2ds_embeddings_file_name,
        training_spectra_file_name,
        validation_spectra_file_name,
        tanimoto_scores_file_name,
        spectrum_id_column_name=spectrum_id_column_name)

    first_training_spectrum = \
        load_pickled_file(training_spectra_file_name)[0]
    library_ids = [
        'CCMSLIB00000001603',
        'CCMSLIB00000001652',
        'CCMSLIB00000001640',
    ]
    returned_scores = collector.get_tanimoto_for_spectrum_ids(
        first_training_spectrum, library_ids)

    expected_scores = pd.DataFrame(
        [0.199695, 0.177669, 0.192504],
        index=library_ids,
        columns=["Tanimoto_score"])
    assert isinstance(returned_scores, pd.DataFrame), \
        "Expected a pd.Dataframe"
    # The dtypes of the two dataframes are not compared
    pd.testing.assert_frame_equal(returned_scores,
                                  expected_scores,
                                  check_dtype=False)
Beispiel #4
0
def test_making_sqlite_file(tmp_path):
    """Creates a sqlite file from the 100 test spectra and compares it with
    the reference sqlite file
    """
    general_test_files_dir = os.path.join(
        os.path.split(os.path.dirname(__file__))[0],
        'tests/test_files/general_test_files')

    # tmp_path is a fixture, so the new file ends up in a temporary folder
    created_sqlite_file = os.path.join(tmp_path,
                                       "test_spectra_database.sqlite")
    reference_sqlite_file = os.path.join(general_test_files_dir,
                                         "100_test_spectra.sqlite")
    tanimoto_scores_file = os.path.join(
        general_test_files_dir, "100_test_spectra_tanimoto_scores.pickle")
    pickled_spectra_file = os.path.join(general_test_files_dir,
                                        "100_test_spectra.pickle")

    # The spectra get the minimal processing before they are stored
    processed_spectra = minimal_processing_multiple_spectra(
        load_pickled_file(pickled_spectra_file))

    # Create sqlite file, with 3 tables
    make_sqlfile_wrapper(created_sqlite_file,
                         tanimoto_scores_file,
                         processed_spectra,
                         columns_dict={"precursor_mz": "REAL"},
                         spectrum_id_column_name="spectrumid")
    check_sqlite_files_are_equal(created_sqlite_file, reference_sqlite_file)
Beispiel #5
0
def test_get_spectra_from_sqlite_all_spectra():
    """Tests if all spectra are returned from a sqlite file when
    get_all_spectra is True

    The returned list is expected to contain 10 spectra, each equal to one of
    the spectra stored in first_10_spectra.pickle.
    """
    path_to_test_files_sqlite_dir = os.path.join(
        os.path.split(os.path.dirname(__file__))[0], 'tests/test_files')
    sqlite_file_name = os.path.join(path_to_test_files_sqlite_dir,
                                    "test_spectra_database.sqlite")

    spectra_list = get_spectra_from_sqlite(
        sqlite_file_name, [],
        spectrum_id_storage_name="spectrum_id",
        get_all_spectra=True)

    # Test if the output is of the right type
    assert isinstance(spectra_list, list), "Expected a list"

    # Test if the right number of spectra are returned. This is checked
    # before indexing, so an empty result fails with a clear message.
    assert len(spectra_list) == 10, "Expected 10 spectra"
    assert isinstance(spectra_list[0], Spectrum), \
        "Expected a list with matchms.Spectrum.Spectrum objects"

    # Test if the correct spectra are loaded
    pickled_file_name = os.path.join(path_to_test_files_sqlite_dir,
                                     "first_10_spectra.pickle")
    expected_spectra = load_pickled_file(pickled_file_name)
    for expected_spectrum in expected_spectra:
        # Use == instead of calling __eq__ directly: a direct call can return
        # NotImplemented, which is truthy and would let the test pass wrongly.
        assert any(expected_spectrum == spectrum
                   for spectrum in spectra_list), \
            f"Expected spectrum with spectrumid: " \
            f"{expected_spectrum.get('spectrum_id')} to be returned as well"
Beispiel #6
0
def test_get_spectrum_data():
    """Tests if the correct spectra are returned from a sqlite file when two
    spectrum ids are requested

    The two returned spectra are expected to equal the spectra at position 0
    and 2 of first_10_spectra.pickle.
    """
    path_to_test_files_sqlite_dir = os.path.join(
        os.path.split(os.path.dirname(__file__))[0], 'tests/test_files')
    sqlite_file_name = os.path.join(path_to_test_files_sqlite_dir,
                                    "test_spectra_database.sqlite")

    spectra_id_list = ['CCMSLIB00000001547', 'CCMSLIB00000001549']
    spectra_list = get_spectra_from_sqlite(
        sqlite_file_name,
        spectra_id_list,
        spectrum_id_storage_name="spectrum_id")

    # Test if the output is of the right type
    assert isinstance(spectra_list, list), "Expected a list"

    # Test if the right number of spectra are returned. This is checked
    # before indexing, so an empty result fails with a clear message.
    assert len(spectra_list) == 2, "Expected only 2 spectra"
    assert isinstance(spectra_list[0], Spectrum), \
        "Expected a list with matchms.Spectrum.Spectrum objects"

    # Test if the correct spectra are loaded
    pickled_file_name = os.path.join(path_to_test_files_sqlite_dir,
                                     "first_10_spectra.pickle")
    original_spectra = load_pickled_file(pickled_file_name)
    # Use == instead of calling __eq__ directly: a direct call can return
    # NotImplemented, which is truthy and would let the test pass wrongly.
    assert original_spectra[0] == spectra_list[0], \
        "Expected different spectrum to be loaded"
    assert original_spectra[2] == spectra_list[1], \
        "Expected different spectrum to be loaded"
Beispiel #7
0
def test_analog_search(file_names, test_spectra):
    """Test analog_search_return_results_tables of MS2Library

    The results tables returned for the test spectra (with a preselection
    cut off of 20) are compared one by one with the pickled expected results.
    """
    sqlite_file_loc, spec2vec_model_file_loc, s2v_pickled_embeddings_file, \
        ms2ds_model_file_name, ms2ds_embeddings_file_name, \
        spectrum_id_column_name, ms2q_model_file_name = file_names

    test_library = MS2Library(sqlite_file_loc,
                              spec2vec_model_file_loc,
                              ms2ds_model_file_name,
                              s2v_pickled_embeddings_file,
                              ms2ds_embeddings_file_name,
                              ms2q_model_file_name,
                              spectrum_id_column_name=spectrum_id_column_name)

    cutoff = 20
    results = test_library.analog_search_return_results_tables(
        test_spectra, cutoff)

    expected_result = load_pickled_file(
        os.path.join(
            os.path.split(os.path.dirname(__file__))[0],
            "tests/test_files/test_files_ms2library/expected_analog_search_results.pickle"
        ))

    # Without this check additional (unexpected) results tables would pass
    assert len(results) == len(expected_result), \
        "Expected the same number of results tables as expected results"
    for result, expected in zip(results, expected_result):
        result.assert_results_table_equal(expected)
Beispiel #8
0
def test_create_train_and_val_data_with_saving(tmp_path):
    """Test create_train_and_val_data with saving the results to a file

    Both the returned tuple and the tuple stored in the saved file are
    expected to consist of 4 dataframes equal to the pickled expected data.
    """
    sqlite_file_loc, spec2vec_model_file_loc, s2v_pickled_embeddings_file, \
        ms2ds_model_file_name, ms2ds_embeddings_file_name, \
        spectrum_id_column_name, training_spectra_file_name, \
        validation_spectra_file_name, tanimoto_scores_file_name = \
        get_test_file_names()
    # tmp_path is a fixture, so the file is saved in a temporary folder
    save_file_name = os.path.join(
        tmp_path, "test_training_and_validation_set_and_labels")

    select_data_for_training = DataCollectorForTraining(
        sqlite_file_loc,
        spec2vec_model_file_loc,
        ms2ds_model_file_name,
        s2v_pickled_embeddings_file,
        ms2ds_embeddings_file_name,
        training_spectra_file_name,
        validation_spectra_file_name,
        tanimoto_scores_file_name,
        spectrum_id_column_name=spectrum_id_column_name)
    returned_results = \
        select_data_for_training.create_train_and_val_data(
            save_file_name=save_file_name)
    assert os.path.exists(save_file_name), "Expected file to be created"

    expected_result = load_pickled_file(
        os.path.join(
            os.path.split(os.path.dirname(__file__))[0],
            "tests/test_files/test_files_train_ms2query_nn",
            "expected_train_and_val_data.pickle"))
    result_in_file = load_pickled_file(save_file_name)

    # Test if the right result is returned
    assert isinstance(returned_results, tuple), \
        "Expected a tuple to be returned"
    assert len(returned_results) == 4, "Expected a tuple with length 4"
    for i, result in enumerate(returned_results):
        assert isinstance(result, pd.DataFrame)
        pd.testing.assert_frame_equal(result, expected_result[i])

    # Test if right information is stored in file. The stored tuple itself is
    # iterated here; previously the returned results were checked a second
    # time, so the content of the file was never verified.
    assert isinstance(result_in_file, tuple), \
        "Expected a tuple to be stored in the file"
    assert len(result_in_file) == 4, "Expected a tuple with length 4"
    for i, stored_result in enumerate(result_in_file):
        assert isinstance(stored_result, pd.DataFrame)
        pd.testing.assert_frame_equal(stored_result, expected_result[i])
Beispiel #9
0
def test_store_s2v_embeddings(tmp_path, path_to_general_test_files):
    """Tests store_s2v_embeddings

    The embeddings file created in the temporary folder is compared with the
    stored expected s2v embeddings of the 100 test spectra.
    """
    output_base_name = os.path.join(tmp_path, '100_test_spectra')
    pickled_spectra_file = os.path.join(path_to_general_test_files,
                                        '100_test_spectra.pickle')
    s2v_model_file = os.path.join(path_to_general_test_files,
                                  "100_test_spectra_s2v_model.model")
    expected_embeddings_file = os.path.join(
        path_to_general_test_files,
        "100_test_spectra_s2v_embeddings.pickle")

    files_creator = LibraryFilesCreator(pickled_spectra_file,
                                        output_base_name)
    files_creator.store_s2v_embeddings(s2v_model_file)

    created_embeddings_file = output_base_name + "_s2v_embeddings.pickle"
    assert os.path.isfile(created_embeddings_file), \
        "Expected file to be created"
    pd.testing.assert_frame_equal(
        load_pickled_file(created_embeddings_file),
        load_pickled_file(expected_embeddings_file))
Beispiel #10
0
def test_create_all_library_files(tmp_path, path_to_general_test_files):
    """Tests create_all_library_files

    Checks that the ms2ds embeddings, s2v embeddings and sqlite file are
    created in the temporary folder and match the stored reference files.
    """
    def reference_file(file_name):
        # Location of a file in the folder with the general test files
        return os.path.join(path_to_general_test_files, file_name)

    output_base_name = os.path.join(tmp_path, '100_test_spectra')
    files_creator = LibraryFilesCreator(
        reference_file('100_test_spectra.pickle'), output_base_name)
    files_creator.create_all_library_files(
        reference_file('100_test_spectra_tanimoto_scores.pickle'),
        reference_file('ms2ds_siamese_210301_5000_500_400.hdf5'),
        reference_file('100_test_spectra_s2v_model.model'))

    created_ms2ds_file = output_base_name + "_ms2ds_embeddings.pickle"
    created_s2v_file = output_base_name + "_s2v_embeddings.pickle"
    created_sqlite_file = output_base_name + ".sqlite"
    assert os.path.isfile(created_ms2ds_file), \
        "Expected ms2ds embeddings file to be created"
    assert os.path.isfile(created_s2v_file), \
        "Expected s2v file to be created"
    assert os.path.isfile(created_sqlite_file), \
        "Expected sqlite file to be created"

    # Test if correct embeddings are stored
    created_ms2ds_embeddings = load_pickled_file(created_ms2ds_file)
    created_s2v_embeddings = load_pickled_file(created_s2v_file)
    reference_s2v_embeddings = load_pickled_file(
        reference_file("100_test_spectra_s2v_embeddings.pickle"))
    reference_ms2ds_embeddings = load_pickled_file(
        reference_file("100_test_spectra_ms2ds_embeddings.pickle"))
    # The embeddings are compared with an absolute tolerance of 1e-5
    pd.testing.assert_frame_equal(created_ms2ds_embeddings,
                                  reference_ms2ds_embeddings,
                                  check_exact=False,
                                  atol=1e-5)
    pd.testing.assert_frame_equal(created_s2v_embeddings,
                                  reference_s2v_embeddings,
                                  check_exact=False,
                                  atol=1e-5)

    # Check if sqlite file is stored correctly
    check_sqlite_files_are_equal(created_sqlite_file,
                                 reference_file("100_test_spectra.sqlite"))
Beispiel #11
0
def test_get_ms2query_model_prediction_single_spectrum():
    """Tests get_ms2query_model_prediction_single_spectrum

    The pickled results table with scores is passed to the function together
    with the test ms2query model. The result is compared with the first
    pickled expected analog search result.
    """
    root_dir = os.path.split(os.path.dirname(__file__))[0]

    table_with_scores = load_pickled_file(os.path.join(
        root_dir,
        "tests/test_files/test_files_ms2library/expected_results_table_with_scores.pickle"))
    model_file = os.path.join(
        root_dir,
        'tests/test_files/general_test_files/test_ms2q_rf_model.pickle')
    ms2query_model = load_ms2query_model(model_file)

    table_with_prediction = get_ms2query_model_prediction_single_spectrum(
        table_with_scores, ms2query_model)

    expected_tables = load_pickled_file(os.path.join(
        root_dir,
        "tests/test_files/test_files_ms2library/expected_analog_search_results.pickle"))
    expected_table = expected_tables[0]
    expected_table.assert_results_table_equal(table_with_prediction)
Beispiel #12
0
def test_store_ms2ds_embeddings(tmp_path, path_to_general_test_files):
    """Tests store_ms2ds_embeddings

    The embeddings file created in the temporary folder is compared with the
    stored expected ms2ds embeddings of the 100 test spectra.
    """
    output_base_name = os.path.join(tmp_path, '100_test_spectra')
    pickled_spectra_file = os.path.join(path_to_general_test_files,
                                        '100_test_spectra.pickle')
    ms2ds_model_file = os.path.join(
        path_to_general_test_files,
        'ms2ds_siamese_210301_5000_500_400.hdf5')
    expected_embeddings_file = os.path.join(
        path_to_general_test_files,
        "100_test_spectra_ms2ds_embeddings.pickle")

    files_creator = LibraryFilesCreator(pickled_spectra_file,
                                        output_base_name)
    files_creator.store_ms2ds_embeddings(ms2ds_model_file)

    created_embeddings_file = output_base_name + "_ms2ds_embeddings.pickle"
    assert os.path.isfile(created_embeddings_file), \
        "Expected file to be created"

    # Test if correct embeddings are stored, with absolute tolerance 1e-5
    created_embeddings = load_pickled_file(created_embeddings_file)
    reference_embeddings = load_pickled_file(expected_embeddings_file)
    pd.testing.assert_frame_equal(created_embeddings,
                                  reference_embeddings,
                                  check_exact=False,
                                  atol=1e-5)
Beispiel #13
0
def test_get_tanimoto_scores():
    """Tests if the tanimoto scores between three inchikey14s retrieved from
    the sqlite file match the pickled reference scores
    """
    test_files_dir = os.path.join(
        os.path.split(os.path.dirname(__file__))[0], 'tests/test_files')
    sqlite_file = os.path.join(test_files_dir,
                               "test_spectra_database.sqlite")
    reference_scores_file = os.path.join(test_files_dir,
                                         "test_tanimoto_scores.pickle")

    inchikey14s = ['TXZUPPVCNIMVHW', 'WIOKWEJDRXNVSH', 'VBFKEZGCUWHGSK']
    # The same inchikey14s are used for both arguments
    scores_from_sqlite = get_tanimoto_score_for_inchikey14s(
        inchikey14s, inchikey14s, sqlite_file)

    all_reference_scores = load_pickled_file(reference_scores_file)
    # Select the rows first and then the columns of the test inchikey14s
    reference_rows = all_reference_scores.loc[inchikey14s]
    expected_scores = reference_rows[inchikey14s]

    # check_like=True makes the comparison ignore index and column order
    pd.testing.assert_frame_equal(scores_from_sqlite,
                                  expected_scores,
                                  check_like=True)
Beispiel #14
0
def test_add_unknown_charges_to_spectra():
    """Tests add_unknown_charges_to_spectra

    The charges of the test spectra are set to None, 1, -1 and 2. Afterwards
    the spectra that had charge None are expected to have charge 1, while the
    other charges are expected to be unchanged.
    """
    spectra_file = os.path.join(
        os.path.split(os.path.dirname(__file__))[0],
        'tests/test_files/general_test_files/100_test_spectra.pickle')
    spectra = load_pickled_file(spectra_file)

    # Set charges to predefined values, one charge per slice of the list
    charge_per_slice = (
        (slice(None, 10), None),
        (slice(10, 20), 1),
        (slice(20, 30), -1),
        (slice(30, None), 2),
    )
    for selection, charge in charge_per_slice:
        for spectrum in spectra[selection]:
            spectrum.set("charge", charge)

    spectra_with_charge = add_unknown_charges_to_spectra(spectra)

    # Test if charges are set correctly
    for checked_spectrum in spectra_with_charge[:20]:
        assert checked_spectrum.get("charge") == 1, \
            "The charge is expected to be 1"
    for checked_spectrum in spectra_with_charge[20:30]:
        assert checked_spectrum.get("charge") == -1, \
            "The charge is expected to be -1"
    for checked_spectrum in spectra_with_charge[30:]:
        assert checked_spectrum.get("charge") == 2, \
            "The charge is expected to be 2"
Beispiel #15
0
    def _load_spectra_and_minimal_processing(
            self, pickled_spectra_file_name: str) -> List[Spectrum]:
        """Loads spectra from pickled file and does minimal processing

        Args:
        ------
        pickled_spectra_file_name:
            The file name of a pickled file containing a list of spectra.

        Returns:
        ------
        The list of spectra returned by minimal_processing_multiple_spectra.

        Raises:
        ------
        AssertionError:
            If the first spectrum has no (truthy) metadata entry for the
            spectrum id column name given in the settings.
        """
        # Loads the spectra from a pickled file
        list_of_spectra = load_pickled_file(pickled_spectra_file_name)

        # Only the first spectrum is checked for the spectrum id entry
        spectrum_id_column_name = self.settings["spectrum_id_column_name"]
        assert list_of_spectra[0].get(spectrum_id_column_name), \
            f"Expected spectra to have '" \
            f"{spectrum_id_column_name}' in " \
            f"metadata, to solve specify the correct spectrum_id_column_name"

        # Does normalization and filtering of spectra
        return minimal_processing_multiple_spectra(
            list_of_spectra,
            progress_bar=self.settings["progress_bars"])
Beispiel #16
0
def test_get_all_ms2ds_scores(file_names, test_spectra):
    """Test get_all_ms2ds_scores method of ms2library

    The dataframe returned for the test spectra is compared with the pickled
    expected MS2Deepscore scores.
    """
    sqlite_file_loc, spec2vec_model_file_loc, s2v_pickled_embeddings_file, \
        ms2ds_model_file_name, ms2ds_embeddings_file_name, \
        spectrum_id_column_name, ms2q_model_file_name = file_names

    test_library = MS2Library(sqlite_file_loc,
                              spec2vec_model_file_loc,
                              ms2ds_model_file_name,
                              s2v_pickled_embeddings_file,
                              ms2ds_embeddings_file_name,
                              ms2q_model_file_name,
                              spectrum_id_column_name=spectrum_id_column_name)

    result = test_library._get_all_ms2ds_scores(test_spectra)

    expected_result: pd.DataFrame = load_pickled_file(
        os.path.join(
            os.path.split(os.path.dirname(__file__))[0],
            'tests/test_files/test_files_ms2library/expected_ms2ds_scores.pickle'
        ))
    # The message used to say "Expected dictionary", which contradicted the
    # type that is actually checked.
    assert isinstance(result, pd.DataFrame), "Expected a pd.DataFrame"
    assert_frame_equal(result, expected_result)
Beispiel #17
0
def test_export_to_dataframe_with_additional_columns(dummy_data, tmp_path):
    """Tests export_to_dataframe with additional metadata and score columns

    The first pickled expected analog search result is exported with 5 rows.
    The column names, the number of rows and the values of the first row are
    checked.
    """
    create_test_classifier_csv_file(tmp_path)
    root_dir = os.path.split(os.path.dirname(__file__))[0]

    stored_tables = load_pickled_file(os.path.join(
        root_dir,
        'tests/test_files/test_files_ms2library/expected_analog_search_results.pickle'))
    test_table: ResultsTable = stored_tables[0]
    # Point the table to the sqlite file and the classifier csv file
    test_table.sqlite_file_name = os.path.join(
        root_dir,
        "tests/test_files/general_test_files/100_test_spectra.sqlite")
    test_table.classifier_csv_file_name = os.path.join(tmp_path,
                                                       "test_csv_file")

    exported = test_table.export_to_dataframe(
        5,
        additional_metadata_columns=["charge"],
        additional_ms2query_score_columns=["s2v_score", "ms2ds_score"])
    assert isinstance(exported, pd.DataFrame)

    expected_column_names = column_names_for_output(
        True, True, ["charge"], ["s2v_score", "ms2ds_score"])
    assert list(exported.columns) == expected_column_names

    # Check if one of the classifiers is filled in
    assert exported["npc_pathway_results"][0] == "Amino acids and Peptides"
    assert len(exported.index) == 5

    # Test if first row is correct, numbers are compared up to 5 decimals
    number_positions = [0, 1, 2, 3, 7, 8, 9]
    text_positions = [4, 5, 6, 10, 11, 12, 13]
    np.testing.assert_array_almost_equal(
        list(exported.iloc[0, number_positions]),
        [0.56453, 33.25000, 907.0, 940.250, 1, 0.99965, 0.92317],
        5)
    expected_texts = [
        'KNGPFNUOXXLKCN',
        'CCMSLIB00000001548',
        'Hoiamide B',
        'CCC[C@@H](C)[C@@H]([C@H](C)[C@@H]1[C@H]([C@H](Cc2nc(cs2)C3=N[C@](CS3)(C4=N[C@](CS4)(C(=O)N[C@H]([C@H]([C@H](C(=O)O[C@H](C(=O)N[C@H](C(=O)O1)[C@@H](C)O)[C@@H](C)CC)C)O)[C@@H](C)CC)C)C)OC)C)O',
        'Organic compounds',
        'Organic acids and derivatives',
        'Peptidomimetics',
    ]
    assert np.all(list(exported.iloc[0, text_positions]) == expected_texts)
Beispiel #18
0
    def __init__(self,
                 sqlite_file_location: str,
                 s2v_model_file_name: str,
                 ms2ds_model_file_name: str,
                 pickled_s2v_embeddings_file_name: str,
                 pickled_ms2ds_embeddings_file_name: str,
                 training_spectra_file: str,
                 validation_spectra_file: str,
                 tanimoto_scores_df_file_name: str,
                 preselection_cut_off: int = 2000,
                 **settings):
        """Initializes the parent class and loads the training data

        Parameters
        ----------
        sqlite_file_location:
            The location at which the sqlite_file_is_stored. The file is
            expected to have 3 tables: tanimoto_scores, inchikeys and
            spectra_data.
        s2v_model_file_name:
            File location of a spec2vec model. In addition two more files in
            the same folder are expected with the same name but with extensions
            .trainables.syn1neg.npy and .wv.vectors.npy.
        ms2ds_model_file_name:
            File location of a trained ms2ds model.
        pickled_s2v_embeddings_file_name:
            File location of a pickled file with Spec2Vec embeddings in a
            pd.Dataframe with as index the spectrum id.
        pickled_ms2ds_embeddings_file_name:
            File location of a pickled file with ms2ds embeddings in a
            pd.Dataframe with as index the spectrum id.
        training_spectra_file:
            Pickled file with training spectra. The spectra get the minimal
            processing after loading.
        validation_spectra_file:
            Pickled file with validation spectra. The spectra get the minimal
            processing after loading.
        tanimoto_scores_df_file_name:
            A pickled file containing a dataframe with the tanimoto scores
            between all inchikeys. The tanimoto scores in SQLite cannot be
            used, since these do not contain the inchikeys for the training
            spectra.
        preselection_cut_off:
            Stored as attribute preselection_cut_off. Default = 2000


        **settings:
            As additional parameters predefined settings can be changed.
        spectrum_id_column_name:
            The name of the column or key in dictionaries under which the
            spectrum id is stored. Default = "spectrumid"
        cosine_score_tolerance:
            Setting for calculating the cosine score. If two peaks fall within
            the cosine_score tolerance the peaks are considered a match.
            Default = 0.1
        base_nr_mass_similarity:
            The base nr used for normalizing the mass similarity. Default = 0.8
        max_precursor_mz:
            The value used to normalize the precursor m/z by dividing it by the
            max_precursor_mz. Default = 13428.370894192036
        progress_bars:
            If True progress bars will be shown. Default = True"""
        # pylint: disable=too-many-arguments
        # None is passed as sixth positional argument, presumably the
        # ms2query model file name of the parent class (to be confirmed)
        super().__init__(sqlite_file_location,
                         s2v_model_file_name,
                         ms2ds_model_file_name,
                         pickled_s2v_embeddings_file_name,
                         pickled_ms2ds_embeddings_file_name,
                         None,
                         **settings)

        self.tanimoto_scores: pd.DataFrame = load_pickled_file(
            tanimoto_scores_df_file_name)

        unprocessed_training_spectra = load_pickled_file(
            training_spectra_file)
        self.training_spectra = minimal_processing_multiple_spectra(
            unprocessed_training_spectra)

        unprocessed_validation_spectra = load_pickled_file(
            validation_spectra_file)
        self.validation_spectra = minimal_processing_multiple_spectra(
            unprocessed_validation_spectra)

        self.preselection_cut_off = preselection_cut_off
def add_tanimoto_scores_to_sqlite(
        sqlite_file_name: str,
        tanimoto_scores_pickled_dataframe_file: str,
        list_of_spectra: List[Spectrum],
        temporary_tanimoto_file_name: str = "temporary_tanimoto_scores",
        progress_bars: bool = True):
    """Adds tanimoto scores and inchikey14s to sqlite table

    sqlite_file_name:
        Name of sqlite_file that should be created, if it already exists the
        tables are added. If the tables in this sqlite file already exist, they
        will be overwritten.
    tanimoto_scores_pickled_dataframe_file:
        A pickled file with tanimoto scores. The column names and indexes are
        inchikey14s.
    list_of_spectra:
        List of spectrum objects
    temporary_tanimoto_file_name:
        The file name of a temporary .npy file that is created to memory
        efficiently read out the tanimoto scores. The file is created in the
        current working directory (unless an absolute path is given) and is
        deleted after finishing, also when writing to sqlite fails. The
        ".npy" extension is added when it is missing.
    progress_bars:
        If True progress bars will show the progress of the different steps
        in the process.

    Raises an AssertionError if the temporary file already exists or if the
    selected tanimoto scores contain NaN values.
    """
    # todo instead of creating a real file and than deleting make a temporary
    #  file, not yet implemented since numpy memmap is not able to access the
    #  file in this case.

    temporary_npy_file = os.path.join(os.getcwd(),
                                      temporary_tanimoto_file_name)
    # np.save only appends ".npy" when the name does not end with it already,
    # so the same rule is applied here to always know the real file name.
    if not temporary_npy_file.endswith(".npy"):
        temporary_npy_file += ".npy"
    assert not os.path.exists(temporary_npy_file), \
        "A file already exists with the temporary file name you want to create"

    tanimoto_df = load_pickled_file(tanimoto_scores_pickled_dataframe_file)

    # Get spectra belonging to each inchikey14
    spectra_belonging_to_inchikey14 = \
        get_spectra_belonging_to_inchikey14(list_of_spectra)

    inchikeys_with_spectra = sorted(spectra_belonging_to_inchikey14.keys())
    # Remove all inchikeys that do not have any matching spectra
    filtered_tanimoto_df = tanimoto_df.loc[inchikeys_with_spectra][
        inchikeys_with_spectra]

    # Validate the scores before anything is written to the sqlite file
    assert not filtered_tanimoto_df.isnull().values.any(), \
        "No NaN values were expected in tanimoto scores"

    inchikeys_order = filtered_tanimoto_df.index

    # Get closest related inchikey14s for each inchikey14
    closest_related_inchikey14s = \
        get_closest_related_inchikey14s(filtered_tanimoto_df, inchikeys_order)
    # Creates a sqlite table containing all tanimoto scores
    initialize_tanimoto_score_table(sqlite_file_name)

    if progress_bars:
        print("Saving tanimoto scores to temporary .npy file")
    try:
        np.save(temporary_npy_file, filtered_tanimoto_df.to_numpy())
        add_tanimoto_scores_to_sqlite_table(sqlite_file_name,
                                            temporary_npy_file,
                                            progress_bar=progress_bars)
    finally:
        # Never leave the temporary file behind; a left over file would make
        # the assertion at the start fail on the next call.
        if os.path.exists(temporary_npy_file):
            os.remove(temporary_npy_file)

    # Creates a table containing the identifiers belonging to each inchikey14
    # These identifiers correspond to the identifiers in tanimoto_scores
    create_inchikey_sqlite_table(sqlite_file_name,
                                 inchikeys_order,
                                 spectra_belonging_to_inchikey14,
                                 closest_related_inchikey14s,
                                 progress_bar=progress_bars)
Beispiel #20
0
    def __init__(self,
                 sqlite_file_name: str,
                 s2v_model_file_name: str,
                 ms2ds_model_file_name: str,
                 pickled_s2v_embeddings_file_name: str,
                 pickled_ms2ds_embeddings_file_name: str,
                 ms2query_model_file_name: Union[str, None],
                 classifier_csv_file_name: Union[str, None] = None,
                 **settings):
        """Loads the models, embeddings and library information into memory

        Parameters
        ----------
        sqlite_file_name:
            The location at which the sqlite_file_is_stored. The file is
            expected to have 3 tables: tanimoto_scores, inchikeys and
            spectra_data.
        s2v_model_file_name:
            File location of a spec2vec model. In addition two more files in
            the same folder are expected with the same name but with extensions
            .trainables.syn1neg.npy and .wv.vectors.npy.
        ms2ds_model_file_name:
            File location of a trained ms2ds model.
        pickled_s2v_embeddings_file_name:
            File location of a pickled file with Spec2Vec embeddings in a
            pd.Dataframe with as index the spectrum id.
        pickled_ms2ds_embeddings_file_name:
            File location of a pickled file with ms2ds embeddings in a
            pd.Dataframe with as index the spectrum id.
        ms2query_model_file_name:
            File location of ms2query model with .hdf5 extension. If None, no
            ms2query model is loaded.
        classifier_csv_file_name:
            Csv file location containing classifier annotations per inchikey

        **settings:
            As additional parameters predefined settings can be changed.
        spectrum_id_column_name:
            The name of the column or key in dictionaries under which the
            spectrum id is stored. Default = "spectrumid"
        progress_bars:
            If True progress bars will be shown of all methods. Default = True
        """
        # pylint: disable=too-many-arguments

        # Replace default settings by the values given in **settings
        self.settings = self._set_settings(settings)

        # Set file locations
        self.classifier_file_name = classifier_csv_file_name
        sqlite_file_exists = os.path.isfile(sqlite_file_name)
        assert sqlite_file_exists, \
            f"The given sqlite file does not exist: {sqlite_file_name}"
        self.sqlite_file_name = sqlite_file_name

        # Load models, the attribute ms2query_model is only set when a file
        # name is given
        if ms2query_model_file_name is not None:
            self.ms2query_model = load_ms2query_model(ms2query_model_file_name)
        self.s2v_model = Word2Vec.load(s2v_model_file_name)
        self.ms2ds_model = load_ms2ds_model(ms2ds_model_file_name)

        # Load the library embeddings into memory
        self.s2v_embeddings: pd.DataFrame = load_pickled_file(
            pickled_s2v_embeddings_file_name)
        self.ms2ds_embeddings: pd.DataFrame = load_pickled_file(
            pickled_ms2ds_embeddings_file_name)

        # The embeddings must have as many columns as the model has outputs
        model_output_size = self.ms2ds_model.base.output_shape[1]
        embeddings_size = self.ms2ds_embeddings.shape[1]
        assert model_output_size == embeddings_size, \
            "Dimension of pre-computed MS2DeepScore embeddings does not fit given model."

        # Load precursor mz's
        self.precursors_library = get_precursor_mz(
            self.sqlite_file_name, self.settings["spectrum_id_column_name"])

        # Load inchikey information into memory
        inchikey_information = get_inchikey_information(self.sqlite_file_name)
        self.spectra_of_inchikey14s, self.closely_related_inchikey14s = \
            inchikey_information
        # Reverse lookup: the inchikey14 belonging to each spectrum id
        self.inchikey14s_of_spectra = {
            spectrum_id: inchikey14
            for inchikey14, spectrum_ids
            in self.spectra_of_inchikey14s.items()
            for spectrum_id in spectrum_ids}
    processing pipeline."""
    s = default_filters(s)
    s = add_precursor_mz(s)
    s = normalize_intensities(s)
    s = reduce_to_number_of_peaks(s, n_required=5, ratio_desired=0.5, n_max=500)
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
    s = require_minimum_number_of_peaks(s, n_required=5)
    return s


# Script part: loads the pickled training spectra, applies spectrum_processing
# and creates the spectrum documents used for training a Word2Vec model.
# NOTE(review): path_data below is a hard-coded absolute path of one machine,
# the commented-out lines show a location relative to the working directory.
# path_root = os.path.dirname(os.getcwd())
# path_data = os.path.join(os.path.dirname(path_root), "data/gnps_15_12_2021/")
path_data = "C:\\HSD\\OneDrive - Hochschule Düsseldorf\\Data\\ms2query"

# Load the two pickled spectra files and combine them
training_spectra_annotated = load_pickled_file(os.path.join(path_data,
                                                            "GNPS_15_12_2021_pos_train.pickle"))
training_spectra_not_annotated = load_pickled_file(os.path.join(path_data,
                                                                "ALL_GNPS_15_12_2021_positive_not_annotated.pickle"))
all_spectra = training_spectra_annotated + training_spectra_not_annotated
# Apply the processing filters to every loaded spectrum
cleaned_spectra = [spectrum_processing(s) for s in all_spectra]

# Omit spectra that didn't qualify for analysis (these are None)
cleaned_spectra = [s for s in cleaned_spectra if s is not None]

# Create spectrum documents
reference_documents = [SpectrumDocument(s, n_decimals=2) for s in cleaned_spectra]

# File name of the model inside the "trained_models" folder of path_data
model_file = os.path.join(path_data, "trained_models",
                          "spec2vec_model_GNPS_15_12_2021.model")
model = train_new_word2vec_model(reference_documents,