Exemple #1
0
def test_save_load_mgf_consistency(tmpdir, charge, ionmode, parent_mass):
    """Round-trip spectra through an .mgf file and check the reloaded metadata.

    Two spectra are built from the same builder and metadata, written to
    ``test.mgf`` inside ``tmpdir`` and loaded again. Only the first reloaded
    spectrum is inspected; ``parent_mass`` is compared against its string form.
    """
    peak_mz = numpy.array([100.1, 200.02, 300.003], dtype="float")
    peak_intensities = numpy.array([0.01, 0.02, 1.0], dtype="float")
    metadata = dict(precursor_mz=200.5,
                    charge=charge,
                    ionmode=ionmode,
                    parent_mass=parent_mass)

    builder = SpectrumBuilder().with_mz(peak_mz).with_intensities(peak_intensities)
    spectra = [
        builder.with_metadata(metadata, metadata_harmonization=True).build()
        for _ in range(2)
    ]

    # Write both spectra to the test file and make sure it was created
    mgf_path = os.path.join(tmpdir, "test.mgf")
    save_as_mgf(spectra, mgf_path)
    assert os.path.isfile(mgf_path)

    # Import the spectra again and compare the metadata of the first one
    first_reloaded = list(load_from_mgf(mgf_path))[0]
    expected_metadata = (
        ("precursor_mz", 200.5),
        ("charge", charge),
        ("ionmode", ionmode),
        ("parent_mass", str(parent_mass)),
    )
    for key, expected_value in expected_metadata:
        assert first_reloaded.get(key) == expected_value
Exemple #2
0
    def predict_similarity_of_pair(
        self,
        mgf_path: str,
    ) -> float:
        """
        Predict the molecular structural similarity (Tanimoto score) of a pair
        of mass spectrometry spectra stored in a local mgf file, using the
        MS2DeepScore algorithm.

        The spectra are read with ``load_from_mgf``, wrapped into a request
        payload, sent, and the response is formatted into the returned value.

        Parameters
        ----------
        mgf_path: str
            Local path to mgf file with two spectra

        Returns
        -------
        Molecular structural similarities (Tanimoto scores)

        """
        payload = self._build_payload(load_from_mgf(mgf_path))
        return self._format_results(self._send_request(payload))
Exemple #3
0
def convert_files_to_matchms_spectrum_objects(
        file_name) -> Union[List[Spectrum], None]:
    """Load the spectra of a spectrum file into memory as matchms Spectrum objects.

    The loader is selected from the file extension, compared case-insensitively.
    The following file extensions can be loaded in with this function:
    "mzML", "json", "mgf", "msp", "mzxml", "usi" and "pickle".
    A pickled file is expected to directly contain a list of matchms spectrum objects.

    Args:
    -----
    file_name:
        Path to file containing spectra, with file extension "mzML", "json", "mgf", "msp",
        "mzxml", "usi" or "pickle"

    Returns:
    --------
    The list of loaded spectra, or None (after printing a message) when the
    file extension is not recognized.

    Raises:
    -------
    AssertionError
        If file_name does not exist.
    """
    # Raised explicitly rather than through an `assert` statement so the check
    # is not stripped under `python -O`; the exception type is kept so existing
    # callers that catch AssertionError keep working.
    if not os.path.exists(file_name):
        raise AssertionError(
            f"The specified file: {file_name} does not exist")

    file_extension = os.path.splitext(file_name)[1].lower()
    if file_extension == ".mzml":
        return list(importing.load_from_mzml(file_name))
    if file_extension == ".json":
        return list(importing.load_from_json(file_name))
    if file_extension == ".mgf":
        return list(importing.load_from_mgf(file_name))
    if file_extension == ".msp":
        return list(importing.load_from_msp(file_name))
    if file_extension == ".mzxml":
        return list(importing.load_from_mzxml(file_name))
    if file_extension == ".usi":
        return list(importing.load_from_usi(file_name))
    if file_extension == ".pickle":
        # NOTE(review): presumably unpickles the file; unpickling can execute
        # arbitrary code, so only pass pickle files from trusted sources.
        return load_pickled_file(file_name)
    print(f"File extension of file: {file_name} is not recognized")
    return None
def library_match(spectra_list,
                  lib_mgf,
                  precursor_tol=1.0,
                  cosine=0.7,
                  n_peaks=3):
    """Score the given spectra against the spectra of a library mgf file.

    Every library spectrum read from ``lib_mgf`` has its intensities
    normalized, then all library/query pairs are scored with CosineHungarian.
    Each score entry is printed, collected and sorted by its third element in
    descending order.

    NOTE(review): in the code visible here ``precursor_tol``, ``cosine`` and
    ``n_peaks`` are not used and nothing is returned - confirm whether the
    remainder of this function is missing.
    """
    # Scale peak intensities of every library spectrum to a maximum of 1
    reference_spectra = [
        normalize_intensities(entry) for entry in load_from_mgf(lib_mgf)
    ]

    all_scores = calculate_scores(references=reference_spectra,
                                  queries=spectra_list,
                                  similarity_function=CosineHungarian())

    ranked = []
    for entry in all_scores:
        print(entry)
        ranked.append(entry)

    # Best match first
    ranked.sort(key=lambda item: item[2], reverse=True)
Exemple #5
0
    def match_spectra_from_path(
        self,
        mgf_path: str,
        n_best: int,
        include_metadata: List[str] = None,
        ion_mode: str = "positive",
    ) -> List[pd.DataFrame]:
        """
        Finds the N best matches for spectra in a local mgf file using spec2vec algorithm.

        The spectra are sent in batches of at most SPECTRA_LIMIT_PER_REQUEST
        spectra per request; all requests are issued before any response is
        formatted.

        Parameters
        ----------
        mgf_path: str
            Local path to mgf file
        n_best: int
            Number of best matches to select
        include_metadata: List[str]
            Metadata keys to include in the response. Will make response slower. Please
            check the documentation for a list of valid keys.
        ion_mode: str
            Selects which model will be used for the predictions: Either a model trained with
            positive or negative ion mode spectra data. Defaults to positive.

        Returns
        -------
        A list of pandas dataframes containing the best matches and optionally metadata
        for these matches.

        Raises
        ------
        ValueError
            If ion_mode is neither 'positive' nor 'negative'.

        """
        # validates input before any request is built or sent
        if ion_mode not in ("positive", "negative"):
            raise ValueError(
                "Parameter ion_mode should be either set to 'positive' or 'negative'. Defaults to 'positive'."
            )

        parameters = self._build_parameters(n_best, include_metadata)
        # loads spectra
        spectra_generator = load_from_mgf(mgf_path)

        # defines endpoint based on user choice of spectra ion mode
        endpoint = self._PREDICT_ENDPOINT_BASE.format(ion_mode=ion_mode)

        # issue requests respecting the spectra limit per request
        batch = []
        responses = []
        for spectrum in spectra_generator:
            batch.append(spectrum)
            if len(batch) == SPECTRA_LIMIT_PER_REQUEST:
                payload = self._build_payload(batch, parameters)
                responses.append(self._send_request(payload, endpoint))
                batch = []
        # send whatever is left over after the last full batch
        if batch:
            payload = self._build_payload(batch, parameters)
            responses.append(self._send_request(payload, endpoint))

        predictions = []
        for response in responses:
            predictions.extend(self._format_results(response))
        return predictions
def test_load_from_mgf_using_filepath():
    """Load pesticides.mgf from a file path and expect Spectrum objects back."""
    tests_dir = os.path.join(os.path.dirname(__file__), "..", "tests")
    mgf_path = os.path.join(tests_dir, "pesticides.mgf")

    loaded = list(load_from_mgf(mgf_path))

    assert len(loaded) > 0
    assert isinstance(loaded[0], Spectrum)
def test_load_from_mgf_using_file():
    """Load pesticides.mgf from an already opened text file object."""
    tests_dir = os.path.join(os.path.dirname(__file__), "..", "tests")
    mgf_path = os.path.join(tests_dir, "pesticides.mgf")

    with open(mgf_path, "r", encoding="utf-8") as handle:
        loaded = list(load_from_mgf(handle))

        assert len(loaded) > 0
        assert isinstance(loaded[0], Spectrum)
def test_user_workflow():
    """Run a filter -> CosineGreedy(tolerance=0.3) workflow on pesticides.mgf.

    All spectra that survive the filters are scored against each other and
    the ten best non-self pairs with at least 20 matching peaks are compared
    against fixed reference values.
    """

    def apply_my_filters(s):
        processed = normalize_intensities(add_parent_mass(default_filters(s)))
        processed = select_by_relative_intensity(processed,
                                                 intensity_from=0.0,
                                                 intensity_to=1.0)
        processed = select_by_mz(processed, mz_from=0, mz_to=1000)
        return require_minimum_number_of_peaks(processed, n_required=5)

    pesticides_path = os.path.join(os.path.dirname(__file__), "..",
                                   "tests", "pesticides.mgf")

    # apply the filters and drop spectrums that didn't qualify for analysis
    spectrums = []
    for raw_spectrum in load_from_mgf(pesticides_path):
        cleaned = apply_my_filters(raw_spectrum)
        if cleaned is not None:
            spectrums.append(cleaned)

    # library grouping analysis: the same spectrums act as queries and references
    queries = list(spectrums)
    references = list(spectrums)

    cosine_greedy = CosineGreedy(tolerance=0.3)
    scores = list(calculate_scores(references, queries, cosine_greedy))

    # keep non-self comparisons with at least 20 matching peaks
    candidates = []
    for reference, query, score, n_matching in scores:
        if reference != query and n_matching >= 20:
            candidates.append((reference, query, score, n_matching))

    candidates.sort(key=lambda row: row[2], reverse=True)
    actual_top10 = candidates[:10]

    # (reference index, query index, expected score, expected matching peaks)
    expected_rows = [
        (48, 50, 0.9994783627790965, 25),
        (50, 48, 0.9994783627790965, 25),
        (46, 48, 0.9990141860269471, 27),
        (48, 46, 0.9990141860269471, 27),
        (46, 50, 0.9988793406908719, 22),
        (50, 46, 0.9988793406908719, 22),
        (57, 59, 0.9982171275552505, 46),
        (59, 57, 0.9982171275552505, 46),
        (73, 74, 0.9973823244169199, 23),
        (74, 73, 0.9973823244169199, 23),
    ]
    expected_top10 = [
        (references[ref_idx], queries[query_idx],
         pytest.approx(value, rel=1e-9), n_peaks)
        for ref_idx, query_idx, value, n_peaks in expected_rows
    ]
    assert actual_top10 == expected_top10
def test_user_workflow_spec2vec():
    """Train a small Word2Vec model on pesticides.mgf and score with Spec2Vec.

    The best non-self similarity between the first 26 documents (references)
    and the documents from index 25 onwards (queries) must exceed 0.99.
    """

    def apply_my_filters(s):
        steps = (
            default_filters,
            add_parent_mass,
            add_losses,
            normalize_intensities,
            lambda x: select_by_relative_intensity(x,
                                                   intensity_from=0.01,
                                                   intensity_to=1.0),
            lambda x: select_by_mz(x, mz_from=0, mz_to=1000),
            lambda x: require_minimum_number_of_peaks(x, n_required=5),
        )
        for step in steps:
            s = step(s)
        return s

    pesticides_path = os.path.join(os.path.dirname(__file__), '..',
                                   'tests', 'pesticides.mgf')

    # apply the filters, then drop spectrums that didn't qualify for analysis
    processed = [apply_my_filters(s) for s in load_from_mgf(pesticides_path)]
    documents = [SpectrumDocument(s) for s in processed if s is not None]

    def sentences():
        return [d.words for d in documents]

    # create and train model
    model = gensim.models.Word2Vec(sentences(), size=5, min_count=1)
    model.train(sentences(), total_examples=len(documents), epochs=20)

    # define similarity_function
    spec2vec = Spec2Vec(model=model, documents=documents)

    references = documents[:26]
    queries = documents[25:]

    # calculate scores on all combinations of references and queries
    scores = list(calculate_scores(references, queries, spec2vec))

    # drop self-comparisons and rank the remainder by score, best first
    non_self = []
    for reference, query, score in scores:
        if reference != query:
            non_self.append((reference, query, score))
    non_self.sort(key=lambda row: row[2], reverse=True)

    actual_scores = [score for _, _, score in non_self[:10]]

    assert max(actual_scores) > 0.99
Exemple #10
0
def main(argv):
    """Command-line entry point: compute similarity scores between spectra.

    Loads query spectra (and, unless ``-s`` is given, reference spectra) from
    msp or mgf files, builds the requested similarity metric, calculates the
    scores and hands them to ``write_outputs``.

    ``argv`` is not used: ``parse_args()`` is called without arguments and
    therefore reads ``sys.argv``.

    Returns 0 on success and -1 when the similarity metric name is not one of
    'CosineGreedy', 'CosineHungarian' or 'ModifiedCosine'. Raises ValueError
    for an unsupported query or reference file format.
    """
    cli = argparse.ArgumentParser(description="Compute MSP similarity scores")
    cli.add_argument("-s",
                     dest="symmetric",
                     action="store_true",
                     help="Computation is symmetric.")
    cli.add_argument("--ref",
                     dest="references_filename",
                     type=str,
                     help="Path to reference spectra library.")
    cli.add_argument("--ref_format",
                     dest="references_format",
                     type=str,
                     help="Reference spectra library file format.")

    # Positional arguments, registered in the order they must be given
    positionals = (
        ("queries_filename", str, "Path to query spectra."),
        ("queries_format", str, "Query spectra file format."),
        ("similarity_metric", str, "Metric to use for matching."),
        ("tolerance", float, "Tolerance to use for peak matching."),
        ("mz_power", float,
         "The power to raise mz to in the cosine function."),
        ("intensity_power", float,
         "The power to raise intensity to in the cosine function."),
        ("output_filename_scores", str,
         "Path where to store the output .tsv scores."),
        ("output_filename_matches", str,
         "Path where to store the output .tsv matches."),
    )
    for arg_name, arg_type, arg_help in positionals:
        cli.add_argument(arg_name, type=arg_type, help=arg_help)
    args = cli.parse_args()

    # Query spectra
    if args.queries_format not in ('msp', 'mgf'):
        raise ValueError(
            f'File format {args.queries_format} not supported for query spectra.'
        )
    load_queries = load_from_msp if args.queries_format == 'msp' else load_from_mgf
    queries_spectra = list(load_queries(args.queries_filename))

    # Reference spectra are only read for a non-symmetric computation
    reference_spectra = []
    if not args.symmetric:
        if args.references_format == 'msp':
            reference_spectra = list(load_from_msp(args.references_filename))
        elif args.references_format == 'mgf':
            reference_spectra = list(load_from_mgf(args.references_filename))
        else:
            raise ValueError(
                f'File format {args.references_format} not supported for reference spectra library.'
            )

    # Similarity metric
    metric_name = args.similarity_metric
    if metric_name == 'CosineGreedy':
        metric_class = CosineGreedy
    elif metric_name == 'CosineHungarian':
        metric_class = CosineHungarian
    elif metric_name == 'ModifiedCosine':
        metric_class = ModifiedCosine
    else:
        return -1
    similarity_metric = metric_class(args.tolerance, args.mz_power,
                                     args.intensity_power)
    if metric_name == 'ModifiedCosine':
        # ModifiedCosine runs on spectra passed through convert_precursor_mz
        reference_spectra = list(map(convert_precursor_mz, reference_spectra))
        queries_spectra = list(map(convert_precursor_mz, queries_spectra))

    print("Calculating scores...")
    if args.symmetric:
        references = queries_spectra
    else:
        references = reference_spectra
    scores = calculate_scores(references=references,
                              queries=queries_spectra,
                              similarity_function=similarity_metric,
                              is_symmetric=args.symmetric)

    write_outputs(args, scores)
    return 0
Exemple #11
0
def main(argv):
    """Command-line entry point: filter a mass spectra file and save the result.

    Parses the command line, loads the spectra from ``--spectra`` (format
    ``msp`` or ``mgf``), applies every filter whose flag was given to each
    spectrum in a fixed order and writes the filtered spectra to ``--output``
    (as msp when the input format is msp, as mgf otherwise).

    Parameters
    ----------
    argv:
        Not used: ``parse_args()`` is called without arguments and therefore
        reads ``sys.argv``.

    Returns
    -------
    int
        0 on success.

    Raises
    ------
    ValueError
        If no filter flag is given, or if ``--spectra_format`` is neither
        'msp' nor 'mgf'.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("--spectra",
                        type=str,
                        required=True,
                        help="Mass spectra file to be filtered.")
    parser.add_argument("--spectra_format",
                        type=str,
                        required=True,
                        help="Format of spectra file.")
    parser.add_argument("--output",
                        type=str,
                        required=True,
                        help="Filtered mass spectra file.")
    parser.add_argument(
        "-normalise_intensities",
        action='store_true',
        help="Normalize intensities of peaks (and losses) to unit height.")
    parser.add_argument(
        "-default_filters",
        action='store_true',
        help=
        "Collection of filters that are considered default and that do no require any (factory) arguments."
    )
    parser.add_argument(
        "-clean_metadata",
        action='store_true',
        help=
        "Apply all adding and cleaning filters if possible, so that the spectra have canonical metadata."
    )
    parser.add_argument(
        "-relative_intensity",
        action='store_true',
        help=
        "Keep only peaks within set relative intensity range (keep if to_intensity >= intensity >= from_intensity)."
    )
    parser.add_argument("--from_intensity",
                        type=float,
                        help="Lower bound for intensity filter")
    parser.add_argument("--to_intensity",
                        type=float,
                        help="Upper bound for intensity filter")
    parser.add_argument(
        "-mz_range",
        action='store_true',
        help=
        "Keep only peaks between set m/z range (keep if to_mz >= m/z >= from_mz)."
    )
    parser.add_argument("--from_mz",
                        type=float,
                        help="Lower bound for m/z  filter")
    parser.add_argument("--to_mz",
                        type=float,
                        help="Upper bound for m/z  filter")
    args = parser.parse_args()

    if not (args.normalise_intensities or args.default_filters or
            args.clean_metadata or args.relative_intensity or args.mz_range):
        raise ValueError('No filter selected.')

    # The format is given by --spectra_format; this parser defines no
    # `queries_format` argument, so reading it raised AttributeError for every
    # non-msp input.
    if args.spectra_format == 'msp':
        spectra = list(load_from_msp(args.spectra))
    elif args.spectra_format == 'mgf':
        spectra = list(load_from_mgf(args.spectra))
    else:
        raise ValueError(
            f'File format {args.spectra_format} not supported for mass spectra file.'
        )

    filtered_spectra = []
    for spectrum in spectra:
        if args.normalise_intensities:
            spectrum = normalize_intensities(spectrum)

        if args.default_filters:
            spectrum = default_filters(spectrum)

        if args.clean_metadata:
            # Metadata adding/cleaning filters, applied in this order
            filters = [
                add_compound_name, add_precursor_mz, add_fingerprint,
                add_losses, add_parent_mass, add_retention_index,
                add_retention_time, clean_compound_name
            ]
            for metadata_filter in filters:
                spectrum = metadata_filter(spectrum)

        if args.relative_intensity:
            spectrum = select_by_relative_intensity(spectrum,
                                                    args.from_intensity,
                                                    args.to_intensity)

        if args.mz_range:
            spectrum = select_by_mz(spectrum, args.from_mz, args.to_mz)

        filtered_spectra.append(spectrum)

    # The output is written in the same format as the input
    if args.spectra_format == 'msp':
        save_as_msp(filtered_spectra, args.output)
    else:
        save_as_mgf(filtered_spectra, args.output)

    return 0
Exemple #12
0
def mgf_generator(mgf_path):
    """Return the first 20 spectra of an mgf file as a list.

    The whole file is read into a list before the slice is taken.
    """
    all_spectra = list(load_from_mgf(mgf_path))
    return all_spectra[:20]
def test_user_workflow():
    """Run a filter -> CosineGreedy (default settings) workflow on pesticides.mgf.

    All spectra that survive the filters are scored against each other and
    the ten best non-self pairs with at least 20 matching peaks are compared
    against fixed reference values.
    """
    def apply_my_filters(s):
        steps = (
            default_filters,
            add_parent_mass,
            normalize_intensities,
            lambda x: select_by_relative_intensity(x,
                                                   intensity_from=0.0,
                                                   intensity_to=1.0),
            lambda x: select_by_mz(x, mz_from=0, mz_to=1000),
            lambda x: require_minimum_number_of_peaks(x, n_required=5),
        )
        for step in steps:
            s = step(s)
        return s

    pesticides_path = os.path.join(os.path.dirname(__file__), '..',
                                   'tests', 'pesticides.mgf')

    # apply the filters, then drop spectrums that didn't qualify for analysis
    processed = [apply_my_filters(s) for s in load_from_mgf(pesticides_path)]
    spectrums = [s for s in processed if s is not None]

    # library grouping analysis: the same spectrums act as queries and references
    queries = list(spectrums)
    references = list(spectrums)

    cosine_greedy = CosineGreedy()
    scores = list(calculate_scores(references, queries, cosine_greedy))

    # keep non-self comparisons with at least 20 matching peaks
    candidates = []
    for reference, query, score, n_matching in scores:
        if reference != query and n_matching >= 20:
            candidates.append((reference, query, score, n_matching))

    candidates.sort(key=lambda row: row[2], reverse=True)
    actual_top10 = candidates[:10]

    # (reference index, query index, expected score, expected matching peaks)
    expected_rows = [
        (48, 50, 0.9994510368270997, 25),
        (50, 48, 0.9994510368270997, 25),
        (46, 48, 0.9981252309590571, 27),
        (48, 46, 0.9981252309590571, 27),
        (46, 50, 0.9979632203390496, 22),
        (50, 46, 0.9979632203390496, 22),
        (73, 74, 0.9956795920716246, 23),
        (74, 73, 0.9956795920716246, 23),
        (57, 59, 0.9886557001269415, 46),
        (59, 57, 0.9886557001269415, 46),
    ]
    expected_top10 = [
        (references[ref_idx], queries[query_idx],
         pytest.approx(value, rel=1e-9), n_peaks)
        for ref_idx, query_idx, value, n_peaks in expected_rows
    ]
    assert actual_top10 == expected_top10
def main():
    """Command-line entry point: write Spec2Vec similarity pairs for an mgf file.

    Loads the spectra of ``input_mgf``, runs ``post_process`` on each one and
    drops those for which it returned None, scores all remaining spectra
    against each other with Spec2Vec using the Word2Vec model stored in
    ``model_file``, and writes every pair whose score is at least
    ``--min_score`` to ``output_pairs`` as a tab-separated table.
    Each unordered pair is reported once; self-pairs are skipped.
    """
    parser = argparse.ArgumentParser(description='Creating Spec2Vec Pairs')
    parser.add_argument('input_mgf', help='input_mgf')
    parser.add_argument('output_pairs', help='output_pairs')
    parser.add_argument('model_file', help='model_file')
    parser.add_argument(
        '--min_score',
        type=float,
        default=0.7,
        help='Minimum similarity score for a pair to be written (default: 0.7)')
    args = parser.parse_args()

    spectra = load_from_mgf(args.input_mgf)

    # Post-process every spectrum and omit those that didn't qualify for analysis
    filtered_spectra = [post_process(s) for s in spectra]
    filtered_spectra = [s for s in filtered_spectra if s is not None]

    # Create spectrum documents
    query_documents = [
        SpectrumDocument(s, n_decimals=2) for s in filtered_spectra
    ]

    # Loading the model
    model = gensim.models.Word2Vec.load(args.model_file)

    # Define similarity_function
    spec2vec = Spec2Vec(model=model,
                        intensity_weighting_power=0.5,
                        allowed_missing_percentage=80.0)

    print("total documents", len(query_documents))
    scores = calculate_scores(query_documents, query_documents,
                              spec2vec).scores

    output_scores_list = []
    # Walk the strict lower triangle (j < i) so each pair is visited once;
    # filtered_spectra and query_documents have the same length and order.
    for i, i_spectrum in enumerate(filtered_spectra):
        for j in range(i):
            sim = scores[i][j]
            if sim < args.min_score:
                continue

            j_spectrum = filtered_spectra[j]
            mz1 = i_spectrum.metadata["pepmass"][0]
            mz2 = j_spectrum.metadata["pepmass"][0]
            output_scores_list.append({
                "filename": args.input_mgf,
                "CLUSTERID1": i_spectrum.metadata["scans"],
                "CLUSTERID2": j_spectrum.metadata["scans"],
                "Cosine": sim,
                "mz1": mz1,
                "mz2": mz2,
                "DeltaMZ": mz2 - mz1,
                "EdgeAnnotation": "Spec2Vec",
            })

    # Saving Data Out
    pd.DataFrame(output_scores_list).to_csv(args.output_pairs,
                                            sep="\t",
                                            index=False)
Exemple #15
0
def test_user_workflow_spec2vec_parallel():
    """Check the spec2vec similarities of a typical workflow against fixed values.

    The workflow goes from the spectra in pesticides.mgf to Spec2VecParallel
    scores. A word2vec model saved under integration-tests is loaded when the
    file exists; otherwise a model is trained and saved there, so later runs
    start from the same model and should reproduce the same scores.
    """
    def apply_my_filters(s):
        """A pre- and post-processing pipeline as a user would typically
        design it."""
        steps = (
            default_filters,
            add_parent_mass,
            normalize_intensities,
            lambda x: reduce_to_number_of_peaks(x,
                                                n_required=10,
                                                ratio_desired=0.5),
            lambda x: select_by_mz(x, mz_from=0, mz_to=1000),
            lambda x: add_losses(x, loss_mz_from=10.0, loss_mz_to=200.0),
            lambda x: require_minimum_number_of_peaks(x, n_required=5),
        )
        for step in steps:
            s = step(s)
        return s

    repository_root = os.path.join(os.path.dirname(__file__), "..")
    pesticides_path = os.path.join(repository_root, "tests", "pesticides.mgf")

    # apply the filters, then drop spectrums that didn't qualify for analysis
    processed = [apply_my_filters(s) for s in load_from_mgf(pesticides_path)]
    documents = [SpectrumDocument(s) for s in processed if s is not None]

    def sentences():
        return [d.words for d in documents]

    model_file = os.path.join(repository_root, "integration-tests",
                              "test_user_workflow_spec2vec.model")
    if not os.path.isfile(model_file):
        # no stored model yet: create, train and store one
        model = gensim.models.Word2Vec(sentences(), size=10, min_count=1)
        model.train(sentences(), total_examples=len(documents), epochs=20)
        model.save(model_file)
    else:
        model = gensim.models.Word2Vec.load(model_file)

    # define similarity_function
    spec2vec = Spec2VecParallel(model=model, intensity_weighting_power=0.5)

    references = documents[:26]
    queries = documents[25:]

    # calculate scores on all combinations of references and queries
    scores = list(calculate_scores_parallel(references, queries, spec2vec))

    # drop self-comparisons and rank the remainder by score, best first
    non_self = []
    for reference, query, score in scores:
        if reference != query:
            non_self.append((reference, query, score))
    non_self.sort(key=lambda row: row[2], reverse=True)

    actual_top10 = non_self[:10]

    # (reference document index, query document index, expected score)
    expected_rows = [
        (19, 25, 0.9999121928249473),
        (20, 25, 0.9998846890269892),
        (20, 45, 0.9998756073673759),
        (25, 45, 0.9998750427994474),
        (19, 27, 0.9998722768460854),
        (22, 27, 0.9998633023352553),
        (18, 27, 0.9998616961532616),
        (19, 45, 0.9998528723697396),
        (14, 71, 0.9998404364805897),
        (20, 27, 0.9998336807761137),
    ]
    expected_top10 = [
        (documents[ref_idx], documents[query_idx],
         pytest.approx(value, rel=1e-9))
        for ref_idx, query_idx, value in expected_rows
    ]

    assert actual_top10 == expected_top10, "Expected different top 10 table."
print(args)

if (args.matchms):
    print('use matchms')
    import matchms
    from matchms.importing import load_from_mgf
    from matchms.filtering import default_filters
    from matchms.filtering import normalize_intensities
    from matchms.filtering import add_precursor_mz
    from matchms import calculate_scores
    from matchms.similarity import ModifiedCosine
    from msmolnet.use_matchms import convert_matches as convert

    input_mgf = f'{args.input}.mgf'
    print(f"reading file {input_mgf}")
    file = load_from_mgf(input_mgf)
    print(file)

    print("normalising intensities")

    # Apply filters to clean and enhance each spectrum
    spectrums = []
    for spectrum in file:
        spectrum = default_filters(spectrum)
        # Scale peak intensities to maximum of 1
        spectrum = normalize_intensities(spectrum)
        print(spectrum.get('precursor_mz'))
        spectrums.append(spectrum)

    scores = calculate_scores(
        references=spectrums,